forked from koskenni/pytwolc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
raw2named.py
85 lines (78 loc) · 2.83 KB
/
raw2named.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
""" Forms morphophonemic representations out of zero-filled example word forms
This program is free software under GPL 3 license
Copyright Kimmo Koskenniemi 2017-2018
"""
import re
import csv
import cfg
import argparse
# Command-line interface.
#
# The three file names are positional arguments.  They are declared with
# nargs="?" so that the defaults given below actually take effect when an
# argument is left out: argparse ignores ``default`` on a positional argument
# that is required.  Command lines that supply all three names parse exactly
# as before.
argparser = argparse.ArgumentParser(
    "python3 raw2named.py",
    description="joins and renames raw morphophonemes")
argparser.add_argument(
    "input",
    nargs="?",
    default="demo-raw.csv",
    help="aligned examples as a CSV file")
argparser.add_argument(
    "output",
    nargs="?",
    default="demo-renamed.pstr",
    help="renamed examples as a space separated pair symbol strings")
argparser.add_argument(
    "names",
    nargs="?",
    default="demo-renaming.csv",
    help="mapping from raw to neat morphophonemes as a CSV file")
# The same delimiter is used for every CSV file this script reads.
argparser.add_argument(
    "-d", "--delimiter",
    default=",",
    help="delimiter between raw name and new name fields")
argparser.add_argument(
    "-F", "--add-features",
    default=False, action="store_true",
    help="add affix morpheme names to the pairstring representation")
argparser.add_argument(
    "-v", "--verbosity",
    default=0,
    type=int,
    help="level of diagnostic and debugging output")
args = argparser.parse_args()

# Store the requested verbosity on the cfg module.
cfg.verbosity = args.verbosity
# Mapping from the raw name of a morphophoneme to its neat (new) name.
mphon_name = {}

# The name file is a CSV file whose rows contain three fields:
#   1. the raw (old) name of the morphophoneme
#   2. a neat (new) name for the morphophoneme
#   3. comments documenting typical occurrences of the morphophoneme
# Only the first two fields are read.  Rows with an empty raw name, or with
# no new name, add nothing to the mapping.
#
# newline="" is what the csv module asks for when it is given a file object.
with open(args.names, newline="") as namefile:
    reader = csv.reader(namefile, delimiter=args.delimiter)
    for row in reader:
        if len(row) < 2:
            # Blank line, or a row that has no new-name field at all;
            # indexing row[1] here would raise IndexError.
            continue
        raw_name = row[0].strip()
        new_name = row[1].strip()
        if raw_name and new_name:
            mphon_name[raw_name] = new_name
# Convert every aligned example row into one line of space-separated pair
# symbols.
#
# Both files are managed by a single ``with`` statement so that the output is
# flushed and closed even when a row raises an exception.  The input is
# opened first so that a missing input file does not truncate an existing
# output file.  newline="" is what the csv module asks for on its input.
with open(args.input, newline="") as csvfile, \
        open(args.output, "w") as outfil:
    reader = csv.DictReader(csvfile, delimiter=args.delimiter)
    for row in reader:
        # Periods are removed from the zero-filled word form, so each
        # remaining character lines up with one raw morphophoneme.
        zero_filled_str = row["ZEROFILLED"].strip().replace(".", "")
        raw_lst = row["RAW"].strip().split(" ")
        if cfg.verbosity >= 20:
            print(row)
            print("raw_lst:", raw_lst)
        # A row whose two representations differ in length cannot be
        # paired symbol by symbol; report it and move on.
        if len(raw_lst) != len(zero_filled_str):
            print("** LENGTHS DISAGREE **", raw_lst, zero_filled_str)
            continue
        pairsym_lst = []
        for raw_insym, outsym in zip(raw_lst, zero_filled_str):
            if raw_insym == outsym:
                # Identical input and output: write the plain symbol.
                psym = raw_insym
            else:
                # Use the neat name when one is known, else the raw name.
                clean_insym = mphon_name.get(raw_insym, raw_insym)
                psym = clean_insym + ":" + outsym
            pairsym_lst.append(psym)
        if args.add_features:
            # Every morpheme name except the first is appended as a pair
            # with "Ø" on the output side.
            # NOTE(review): the first name is presumably the stem, which
            # is why it is skipped -- confirm against the input format.
            morpheme_lst = row["MORPHEMES"].strip().split(" ")
            for morpheme in morpheme_lst[1:]:
                pairsym_lst.append(morpheme + ":Ø")
        print(" ".join(pairsym_lst), file=outfil)