-
Notifications
You must be signed in to change notification settings - Fork 2
/
twexamp.py
136 lines (122 loc) · 5.14 KB
/
twexamp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
"""A module for reading two-level examples
The examples are assumed to be in a space-separated one-level
representation, and they are compiled into a single automaton.
At the same time, the alphabet used in the examples is
collected in several forms, among them:
cfg.examples_fst -- the transducer which accepts exactly the examples
cfg.symbol_pair_set -- a set of (input, output) symbol string pairs,
suitable for e.g. hfst.rules.restriction
"""
import re
import hfst
import cfg
import twbt
def pairs_to_fst(pair_set):
    """Return a FST that accepts any single symbol pair of pair_set.

    Every pair is added as a separate one-step path to a basic
    transducer, which is then converted into an ordinary transducer,
    freed from epsilons and minimized.
    """
    basic = hfst.HfstBasicTransducer()
    for sym_pair in pair_set:
        # disjunct() expects a tokenized path, i.e. a tuple of pairs;
        # here the path consists of just this one pair.
        one_pair_path = (sym_pair,)
        basic.disjunct(one_pair_path, 0)
    result = hfst.HfstTransducer(basic)
    result.remove_epsilons()
    result.minimize()
    return result
def read_fst(filename="examples.fst"):
    """Read a previously stored example FST and rebuild the alphabets.

    The transducer read from 'filename' is stored in cfg.examples_fst.
    Its "x-pair_symbols" property is split at spaces into pair symbols
    and each of them is recorded in cfg.pair_symbol_set,
    cfg.symbol_pair_set, cfg.input_symbol_set and cfg.output_symbol_set.
    Finally cfg.all_pairs_fst is built out of cfg.symbol_pair_set.
    """
    exfile = hfst.HfstInputStream(filename)
    try:
        cfg.examples_fst = exfile.read()
    finally:
        # Release the input stream even if reading fails.
        exfile.close()
    pair_symbols = cfg.examples_fst.get_property("x-pair_symbols")
    # An empty property or leading/trailing spaces make re.split() yield
    # empty strings; they are not pair symbols, so they are skipped.
    pair_symbol_lst = [pair
                       for pair in re.split(r" +", pair_symbols)
                       if pair]
    for pair in pair_symbol_lst:
        cfg.pair_symbol_set.add(pair)
        insym, outsym = cfg.pairsym2sympair(pair)
        cfg.symbol_pair_set.add((insym, outsym))
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)
    cfg.all_pairs_fst = pairs_to_fst(cfg.symbol_pair_set)
    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.all_pairs_fst, title="cfg.all_pairs_fst")
    return
def read_examples(filename="test.pstr", build_fsts=True):
    """Reads the examples from the file whose name is 'filename'.

    The file must contain one example per line and each line consists of
    a space separated sequence of pair-symbols.  Empty lines and lines
    starting with "!" are ignored, and anything after a "!" on a line is
    dropped as a comment.  A line with a pair symbol whose input or
    output side is empty is reported and skipped.

    The accepted examples are stored in cfg.example_lst and
    cfg.example_set, and the alphabet is collected into
    cfg.symbol_pair_set, cfg.input_symbol_set, cfg.output_symbol_set and
    cfg.pair_symbol_set.

    If 'build_fsts' is true, the examples are also processed to a FST
    which is a union of all examples (cfg.examples_fst), and
    cfg.all_pairs_fst is built.  The alphabet is stored in the
    "x-pair_symbols" property of cfg.examples_fst.
    """
    if build_fsts:
        # hfst is only needed when transducers are actually built
        import hfst
        examples_bfst = hfst.HfstBasicTransducer()
    # The with statement closes the file even if a line fails to process.
    with open(filename, "r") as exfile:
        for line_nl in exfile:
            line = line_nl.strip()
            if not line or line.startswith("!"):
                continue
            # drop a trailing comment, if any
            line = line.split("!", maxsplit=1)[0].strip()
            pairsym_lst = re.split(r"\s+", line)
            symbol_pair_lst = [cfg.pairsym2sympair(pairsym)
                               for pairsym in pairsym_lst]
            if not all(insym and outsym
                       for insym, outsym in symbol_pair_lst):
                print("*** example contains an invalid pair symbol")
                print(line)
                continue
            if cfg.verbosity >= 30:
                print("symbol_pair_lst:", symbol_pair_lst)
            pair_symbol_str = " ".join(cfg.sympair2pairsym(insym, outsym)
                                       for insym, outsym
                                       in symbol_pair_lst)
            if cfg.verbosity >= 30:
                print("pair_symbol_str:", pair_symbol_str)
            cfg.example_lst.append(pair_symbol_str)
            cfg.example_set.add(pair_symbol_str)  # spaces normalized
            if build_fsts:
                examples_bfst.disjunct(symbol_pair_lst, 0)
            for insym, outsym in symbol_pair_lst:
                cfg.symbol_pair_set.add((insym, outsym))
    if cfg.verbosity >= 30:
        print("List of examples:", cfg.example_lst)
        print("List of alphabet symbol pairs:", sorted(cfg.symbol_pair_set))
    if build_fsts:
        cfg.all_pairs_fst = pairs_to_fst(cfg.symbol_pair_set)
        cfg.examples_fst = hfst.HfstTransducer(examples_bfst)
        cfg.examples_fst.set_name(filename)
        cfg.examples_fst.minimize()
        if cfg.verbosity >= 30:
            twbt.ppfst(cfg.examples_fst, False, title="Example file as FST")
    for insym, outsym in cfg.symbol_pair_set:
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)
        cfg.pair_symbol_set.add(cfg.sympair2pairsym(insym, outsym))
    if build_fsts:
        # The property always uses the explicit "input:output" form
        # rather than cfg.sympair2pairsym().
        pair_symbol_lst = [insym + ':' + outsym
                           for insym, outsym in cfg.symbol_pair_set]
        pair_symbol_str = " ".join(sorted(pair_symbol_lst))
        cfg.examples_fst.set_property("x-pair_symbols", pair_symbol_str)
    return
if __name__ == "__main__":
    # Command line use: compile an example file into a FST and
    # optionally store the result in a file.
    import hfst
    import argparse
    arpar = argparse.ArgumentParser("python3 twexamp.py")
    # nargs="?" makes the positional arguments optional; without it
    # argparse requires them and the declared defaults are never used.
    arpar.add_argument("examples", help="example pair strings file",
                       nargs="?", default="examples.pstr")
    arpar.add_argument("output", help="file to which write the example FST",
                       nargs="?", default="")
    arpar.add_argument("-v", "--verbosity",
                       help="level of diagnostic output",
                       type=int, default=0)
    args = arpar.parse_args()
    cfg.verbosity = args.verbosity
    read_examples(args.examples, build_fsts=True)
    if args.output:
        exfile = hfst.HfstOutputStream(filename=args.output)
        try:
            exfile.write(cfg.examples_fst)
            exfile.flush()
        finally:
            # close the output stream even if writing fails
            exfile.close()
        print("--- example fst written to ", args.output ," ---")