-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
297 lines (265 loc) · 10.5 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
# -*- coding: utf-8 -*-
# Katja Konermann
# 802658
"""
Main file - Implements command line arguments with argparse.
Extract - Class for the command to extract terminology
Evaluate - Class for the command to evaluate extracted terms.
Candidates - Class for the command to generate candidates.
"""
import argparse
import os
import sys
from evaluation import Evaluation
from preprocess import Preprocess
from terminology import Terminology
class Extract:
    """Extract terminology from a domain corpus and write it to a file.

    The command line arguments handed to the constructor are parsed with
    argparse and stored on the instance.

    Attributes:
        args (argparse.Namespace):
            The parsed command line arguments.
        corpus (str):
            Corpus with domain specific content.
            Name of a directory with txt files.
        candidates (set):
            Candidate terms read from the candidates file, stored as
            tuples of two words. Lines of that file should have the
            format: <word_i> <word_j>
        out (str):
            Name of the file where the output will be stored.
        alpha (float):
            Value for alpha, weights relevance and consensus.
        theta (float):
            Value for theta, threshold for terminology.

    Class Attributes:
        REF:
            The NLTK Reuters corpus, handed to Terminology together
            with the domain corpus.

    Methods:
        read_from_file(file, n=2):
            Read in terms from a file.
        run():
            Extract terminology from the domain corpus
            and write the results to the output file.
    """
    from nltk.corpus import reuters
    REF = reuters

    def __init__(self, sysargs):
        """Instantiate an Extract object.

        Args:
            sysargs (list):
                A list of command line arguments (without the
                program name and the command itself).

        Raises:
            OSError: If the candidates file cannot be opened.
        """
        self.args = self._parser(sysargs)
        self.corpus = self.args.corpus
        self.candidates = self.read_from_file(self.args.candidates)
        self.out = self.args.out
        self.theta = self.args.theta
        self.alpha = self.args.alpha

    def _parser(self, sysargs):
        """Parse the command line arguments of the 'extract' command.

        Args:
            sysargs (list): Command line arguments to parse.

        Returns:
            argparse.Namespace: Parsed arguments with the attributes
                corpus, candidates, out, alpha and theta.
        """
        # The text has to be passed as ``description``: the first
        # positional parameter of ArgumentParser is ``prog`` and would
        # replace the program name in the usage message.
        parser = argparse.ArgumentParser(
            description="Extract terminology for a domain")
        parser.add_argument("corpus",
                            help="Directory of domain corpus with txt files")
        parser.add_argument("candidates",
                            help="File with candidates.")
        parser.add_argument("-a", "--alpha", type=float,
                            default=0.5,
                            help="Value for weighing consensus "
                                 "and relevance")
        parser.add_argument("-t", "--theta", type=float,
                            default=2,
                            help="Threshold when extracting terminology")
        parser.add_argument("out", help="Name for output file")
        return parser.parse_args(sysargs)

    @staticmethod
    def read_from_file(file, n=2):
        """Read terms that consist of exactly n words from a file.

        Only the text in front of the first tab of a line is used.
        Lines whose first column does not split into exactly n
        whitespace separated words are skipped.

        Args:
            file (str): Name of the file to read, expected in UTF-8.
            n (int): Number of words a term has to consist of.

        Returns:
            set: The terms. Plain strings if n is 1, otherwise
                tuples of n strings.

        Raises:
            OSError: If the file cannot be opened.
        """
        terms = set()
        # Use a separate name for the handle so that the ``file``
        # parameter is not overwritten.
        with open(file, encoding="utf-8") as handle:
            for line in handle:
                words = line.rstrip().split("\t")[0].split()
                if len(words) != n:
                    continue
                # Single words are stored as str, longer terms as tuple.
                terms.add(words[0] if n == 1 else tuple(words))
        return terms

    def run(self):
        """Extract terminology and write it to the output file.

        Returns: None
        """
        out = os.path.join(self.out)
        # Extract terminology.
        print("Processing domain and reference corpus...")
        term_obj = Terminology(self.corpus,
                               self.REF,
                               self.candidates)
        print("Extracting Terminology...")
        term_obj.write_csv(self.alpha, self.theta, out)
class Evaluate:
    """Evaluate extracted terms against a gold standard.

    Attributes:
        gold (str):
            Name of a file with gold standard bigrams.
            Line format <word> <word>.
        extracted (str):
            Name of a file with extracted terms and values
            of the decision function.
            Line format <word> <word>\\t<value>
        high (int):
            Indicates how many of the highest scored terms will be
            printed. If None, no terms will be printed.
        low (int):
            Indicates how many of the lowest scored terms will be
            printed. If None, no terms will be printed.
    """

    def __init__(self, sysargs):
        """Instantiate an Evaluate object.

        Args:
            sysargs (list): Command line arguments.

        Returns:
            None
        """
        parsed = self._parser(sysargs)
        self._args = parsed
        self.gold = parsed.gold
        self.extracted = parsed.extracted
        self.high = parsed.high
        self.low = parsed.low

    def _parser(self, sysargs):
        """Parse the command line arguments of the 'evaluate' command."""
        parser = argparse.ArgumentParser(
            description="Evaluate extracted terminology")
        add = parser.add_argument
        add("--extracted", required=True,
            help="Name of file with extracted terms.")
        add("--gold", required=True,
            help="Name of file with gold standard terms.")
        add("--high", type=int, help="Print n highest scored terms")
        add("--low", type=int, help="Print n lowest scored terms")
        return parser.parse_args(sysargs)

    def run(self):
        """Print the evaluation metrics and the requested ranked terms."""
        evaluation = Evaluation.from_file(self.gold, self.extracted)
        # Print evaluation metrics.
        print("Recall: {:.3f}".format(evaluation.recall()))
        print("Precision: {:.3f}".format(evaluation.precision()))
        print("F1-Score: {:.3f}".format(evaluation.f1()))
        # Header, requested amount and selector for both rankings.
        rankings = (
            ("{} highest scored terms:", self.high,
             evaluation.highest_scored),
            ("{} lowest scored terms:", self.low,
             evaluation.lowest_scored),
        )
        for header, amount, select in rankings:
            if amount is None:
                # This ranking was not requested on the command line.
                continue
            print(header.format(amount))
            for first, second in select(amount):
                print(first, second)
class Candidates(Extract):
    """Create a file with possible candidates for a domain.

    Attributes:
        args (argparse.Namespace):
            The parsed command line arguments.
        corpus (str):
            Directory with text files.
        min_count (int):
            Minimum frequency for a term to be considered a candidate.
        output (str):
            Name of the file where candidates should be stored.
        stops (str):
            Name of a file with stopwords that should be ignored.
            If not defined, None.
        tags (list):
            List of Penn Treebank Tags that are considered relevant for
            a candidate, can be empty.
    """

    def __init__(self, sysargs):
        """Instantiate a Candidates object.

        Args:
            sysargs (list): Command line arguments.
        """
        parsed = self._parser(sysargs)
        self.args = parsed
        self.corpus = parsed.corpus
        self.stops = parsed.stops
        self.min_count = parsed.min
        self.output = parsed.output
        self.tags = parsed.tags

    def _parser(self, sysargs):
        """Parse the command line arguments of the 'candidates' command."""
        parser = argparse.ArgumentParser(
            description="Generate possible candidates for a domain")
        add = parser.add_argument
        add("corpus",
            help="Directory with txt files to extract candidates from")
        add("output", help="Name for the output file.")
        add("--stops", help="File with stopwords")
        add("--min", default=1, type=int,
            help="Minimum count for terms to be considered candidate")
        add("tags", nargs="*", default=[],
            help="Relevant tags for candidates, use Penn Treebank Tags")
        return parser.parse_args(sysargs)

    def run(self):
        """Generate candidates and write them to the output file.

        Returns: None.
        """
        # Without a stopword file nothing is filtered out; otherwise
        # the stopwords are read as single-word terms.
        stopwords = ([] if self.stops is None
                     else self.read_from_file(self.stops, n=1))
        target = os.path.join(self.output)
        print("Processing corpus...")
        preprocessed = Preprocess(self.corpus)
        print("Generating candidates...")
        preprocessed.write_candidates_file(min_count=self.min_count,
                                           stops=stopwords,
                                           tags=self.tags,
                                           filename=target)
def _run_demo():
    """Run the commands 'candidates', 'extract' and 'evaluate' on demo data."""
    divider = "=" * 30

    candidates_args = ["--stops", "demo/demo_stops.txt",
                       "--min", "1",
                       "demo/domain/",
                       "demo/demo_candidates_out.txt",
                       "NN"]
    print("Demo for command 'candidates'\n"
          "\tCall:\n"
          "candidates {}\n"
          "\tOutput:".format(" ".join(candidates_args)))
    Candidates(candidates_args).run()

    extract_args = ["-a", "0.5", "-t", "0.6",
                    "demo/domain/",
                    "demo/demo_candidates.txt",
                    "demo/demo_out.csv"]
    print(divider)
    print("Demo for command 'extract'\n"
          "\tCall:\n"
          "extract {} \n"
          "\tOutput:".format(" ".join(extract_args)))
    Extract(extract_args).run()

    evaluate_args = ["--extracted", "demo/demo_out.csv",
                     "--gold", "demo/demo_gold.txt",
                     "--high", "1", "--low", "1"]
    print(divider)
    print("Demo for command 'evaluate'\n"
          "\tCall:\n"
          "evaluate {}\n"
          "\tOutput:".format(" ".join(evaluate_args)))
    Evaluate(evaluate_args).run()


def main():
    """Run the command named by the first command line argument.

    Known commands are 'extract', 'evaluate', 'candidates' and 'demo'.
    All arguments after the command are passed on to the class that
    implements the command.

    Raises:
        ValueError: If no command or an unknown command is given.
    """
    argv = sys.argv
    if len(argv) < 2:
        raise ValueError("Enter a valid command.")
    command = argv[1]
    options = argv[2:]
    if command == "extract":
        Extract(options).run()
        return
    if command == "evaluate":
        Evaluate(options).run()
        return
    if command == "candidates":
        Candidates(options).run()
        return
    if command == "demo":
        _run_demo()
        return
    raise ValueError("Invalid command '{}'".format(command))
if __name__ == "__main__":
    try:
        main()
    except (OSError, ValueError) as error:
        # Report the failure together with a short usage hint instead
        # of showing a traceback.
        print("Failure: {}".format(error),
              "Type 'evaluate -h', 'extract -h' "
              "or 'candidates -h' for information about commands\n"
              "Type 'demo' for a demo of commands",
              sep="\n")