-
Notifications
You must be signed in to change notification settings - Fork 2
/
wordlist2entries.py
94 lines (88 loc) · 3.21 KB
/
wordlist2entries.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# wordlist2entries :
# converts KSK entries into LEXC entries
import argparse
import os
import re
import sys

import hfst
# Command-line interface.
# The four file arguments are declared with nargs="?" so that the defaults
# given below are actually used when an argument is omitted; argparse ignores
# `default` on a required positional argument.
argparser = argparse.ArgumentParser(
    "python3 wordlist2entries.py",
    description="Converts a word list of entries into entries in "
    "LEXC format according to converter")
argparser.add_argument(
    "wordlist", nargs="?",
    # argparse also applies `type` to a string default, so the leading "~"
    # of the default path is expanded before the file is opened.
    type=os.path.expanduser,
    default="~/Dropbox/lang/fin/ksk/ksk-v.dic",
    help="list of base forms and attached inflection codes")
argparser.add_argument(
    "converter", nargs="?", default="finv-pattern-conv.fst",
    help="a converter FST made out of patterns")
argparser.add_argument(
    "codes", nargs="?", default="finv-codes.text",
    help="a space-separated list of inflection codes found in the patterns. "
    "Entries in wordlist with other inflection codes are ignored.")
argparser.add_argument(
    "lexentries", nargs="?", default="finv-words.lexc",
    help="entries converted into LEXC format with appropriate morphophonemes and continuations")
argparser.add_argument(
    "-v", "--verbosity", default=0, type=int,
    help="level of diagnostic output")
args = argparser.parse_args()
# Load the converter transducer from the file named on the command line and
# prepare it for lookup.  The input stream is closed as soon as the
# transducer has been read.
converter_file = hfst.HfstInputStream(args.converter)
converter_fst = converter_file.read()
converter_file.close()
converter_fst.lookup_optimize()

# Inflection codes accepted by the conversion: the whitespace-separated
# tokens of the codes file.  The file is read as UTF-8 and closed right away.
with open(args.codes, encoding="utf-8") as codes_file:
    infl_set = set(codes_file.read().split())

# Converted entries, in the order they are produced.
entrylist = []
# Brace-delimited symbols collected by find_multichars().  The name is
# spelled without the second "c"; it is kept because the rest of the script
# refers to it under this spelling.
multiharacters = set()
def find_multichars(str):
    """Record every brace-delimited symbol that occurs in *str*.

    Each substring of the form ``{...}`` whose content is one or more of
    the letters a-z, å, ä, ö, š, ž, Ø or an apostrophe is added to the
    module-level set ``multiharacters``.  Nothing is returned.
    """
    # NOTE: the parameter name shadows the builtin ``str``; it is left
    # unchanged so that the function's signature stays the same.
    multiharacters.update(re.findall(r"\{[a-zåäöšžØ']+\}", str))
# Convert the word list line by line.  Every accepted line is looked up in
# the converter FST and the best-scoring results are appended to `entrylist`.

# The patterns are applied to every line, so they are compiled once here.
skip_line_re = re.compile(r"[/!Y]")
trailing_digits_re = re.compile(r"[0-9]+$")
word_ok_re = re.compile(r"^[a-zšžåäö']+$")
infl_ok_re = re.compile(r"^[VS][0-9][0-9][*]?")
field_sep_re = re.compile(r" +")
verbose = args.verbosity >= 10

# The file is read as UTF-8 and closed when the loop finishes.
with open(args.wordlist, "r", encoding="utf-8") as wordlist_file:
    for linenum, linenl in enumerate(wordlist_file, start=1):
        line = linenl.strip()
        if verbose:
            print("line:", line)
        # Lines containing "/", "!" or "Y" anywhere are skipped silently.
        if skip_line_re.search(line):
            continue
        lst = line.split(" ")
        if len(lst) < 2:
            print("LINE", linenum,
                  "HAS NOT ENOUGH FIELDS:", '"' + line + '"')
            continue
        # First field: the word, with any trailing digits removed.
        word = trailing_digits_re.sub(r"", lst[0]).strip()
        if verbose:
            print("word: <" + word + ">")
        # Second field: the inflection code.  A third field that starts with
        # "*" adds a "*" to the code.  startswith() is used instead of
        # indexing so that an empty third field (two consecutive spaces in
        # the line) does not raise IndexError.
        if len(lst) > 2 and lst[2].startswith("*"):
            infl = lst[1] + "*"
        else:
            infl = lst[1]
        if verbose:
            print("infl: <" + infl + ">")
        # Entries whose code is not in the codes file are ignored silently.
        if infl not in infl_set:
            continue
        if not word_ok_re.match(word):
            print(linenum, "word not ok:", '"' + line + '"')
            continue
        if not infl_ok_re.match(infl):
            print(linenum, "infl not ok:", '"' + line + '"')
            continue
        # Disabled special cases, kept for reference:
        #if infl == "V41" and re.match(r"^[hjklmnprstv]*[äöye].*t[aä]$", word):
        #    infl = "V41ä"
        #elif infl == "V42" and re.match(r"^.*nt(aa|ää)$", word):
        #    infl = "V42n"
        # Lookup input: one symbol per character of the word, followed by
        # the inflection code as a single symbol.
        symtup = tuple(word) + (infl,)
        if verbose:
            print("symtup: ", symtup)
        res = converter_fst.lookup(symtup)
        if not res:
            # No conversion found: report the line and move on.
            print(linenum, ':', line)
            continue
        # Each result is a pair; the second component (presumably the HFST
        # path weight -- confirm against the HFST documentation) is used as
        # a score and only the results with the smallest score are kept.
        best_w = min(w for _, w in res)
        for r, w in res:
            if w > best_w:
                continue
            # A result must consist of exactly two space-separated parts;
            # only the first part is scanned for brace-delimited symbols.
            mf, _cont = field_sep_re.split(r)
            find_multichars(mf)
            entrylist.append(r)
# Write the result file: a "Multichar_Symbols" header, the collected
# brace-delimited symbols sorted and joined by spaces on one line, a
# "LEXICON Root" header, and then one line per entry ending in " ;".
# The file is written as UTF-8, and the `with` block guarantees it is flushed
# and closed even if writing fails part-way.
with open(args.lexentries, "w", encoding="utf-8") as outf:
    print("Multichar_Symbols", file=outf)
    print(" ".join(sorted(multiharacters)), file=outf)
    print("LEXICON Root", file=outf)
    for entry in entrylist:
        print(entry, ';', file=outf)