-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_terminology.py
165 lines (139 loc) · 6.52 KB
/
test_terminology.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# -*- coding: utf-8 -*-
# Katja Konermann
# 802658
"""
Unittests for the Terminology class
"""
import math
import os
import unittest
from terminology import Terminology
class TestCaseTerminology(unittest.TestCase):
    """Unit tests for the ``Terminology`` class, run on the demo corpora.

    A single ``Terminology`` object is built once from ``demo/domain/``
    and ``demo/reference/`` and shared by every test. The candidate
    bigrams are chosen so that their distribution over the demo files is
    known (see the comments in ``setUpClass``).
    """

    @classmethod
    def setUpClass(cls):
        """Create the candidate bigrams and the shared Terminology object."""
        # Once in all files in domain and only in domain.
        cls.bigr_equally_only_domain = ("computational", "linguistics")
        # Once in one file in domain corpus.
        cls.bigr_one_domain = ("speech", "recognition")
        # Twice in one file in domain, once in another file of domain.
        cls.bigr_not_equally = ("text", "mining")
        # Twice in domain, once in reference.
        cls.bigr_more_domain = ("machine", "learning")
        # Three times in reference, once in domain.
        cls.bigr_less_domain = ("language", "learning")
        cls.term_obj = Terminology(
            domain="demo/domain/",
            reference="demo/reference/",
            candidates={
                cls.bigr_equally_only_domain,
                cls.bigr_one_domain,
                cls.bigr_not_equally,
                cls.bigr_more_domain,
                cls.bigr_less_domain,
            },
        )

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _remove_file(path):
        """Delete *path*; do nothing if the file does not exist."""
        try:
            os.remove(path)
        except FileNotFoundError:
            # write_csv failed before creating the file: nothing to clean.
            pass

    def _write_csv(self, filename):
        """Write a CSV with alpha=0.5 and theta=1 and schedule its removal.

        The cleanup is registered *before* writing so that the file is
        removed even if ``write_csv`` or a later assertion fails.
        """
        self.addCleanup(self._remove_file, filename)
        self.term_obj.write_csv(alpha=0.5, theta=1, filename=filename)

    # ------------------------------------------------------------------
    # General properties of relevance and consensus
    # ------------------------------------------------------------------
    def test_domain_relevance_between_0_and_1(self):
        """Every domain relevance value lies in the interval [0, 1]."""
        relevance = self.term_obj.domain_relevance
        for term in relevance:
            with self.subTest(term=term):
                self.assertGreaterEqual(relevance[term], 0)
                self.assertLessEqual(relevance[term], 1)

    def test_domain_consensus_positive(self):
        """No domain consensus value is negative."""
        consensus = self.term_obj.domain_consensus
        for term in consensus:
            with self.subTest(term=term):
                self.assertGreaterEqual(consensus[term], 0)

    def test_domain_consensus_upper_limit(self):
        """Consensus never exceeds the natural log of the domain file count."""
        consensus = self.term_obj.domain_consensus
        file_number = len(self.term_obj.domain.corpus.fileids())
        upper_limit = math.log(file_number)
        for term in consensus:
            with self.subTest(term=term):
                self.assertLessEqual(consensus[term], upper_limit)

    # ------------------------------------------------------------------
    # Domain relevance for specific candidates
    # ------------------------------------------------------------------
    def test_domain_relevance_only_in_domain(self):
        """A bigram that occurs only in the domain has relevance 1."""
        relevance = self.term_obj.domain_relevance
        self.assertEqual(relevance[self.bigr_equally_only_domain], 1)

    def test_domain_relevance_more_in_domain(self):
        """A bigram more frequent in the domain has relevance 6/11."""
        relevance = self.term_obj.domain_relevance
        self.assertAlmostEqual(relevance[self.bigr_more_domain],
                               6/11,
                               places=5)

    def test_domain_relevance_less_in_domain(self):
        """A bigram less frequent in the domain has relevance 1/6."""
        relevance = self.term_obj.domain_relevance
        self.assertAlmostEqual(relevance[self.bigr_less_domain],
                               1/6,
                               places=5)

    # ------------------------------------------------------------------
    # Domain consensus for specific candidates
    # ------------------------------------------------------------------
    def test_domain_consensus_equally_distributed(self):
        """A bigram found once in every domain file has consensus ~1.0986."""
        consensus = self.term_obj.domain_consensus
        self.assertAlmostEqual(consensus[self.bigr_equally_only_domain],
                               1.098612289,
                               places=5)

    def test_domain_consensus_in_one_file(self):
        """A bigram found in a single domain file has consensus 0."""
        consensus = self.term_obj.domain_consensus
        self.assertEqual(consensus[self.bigr_one_domain], 0)

    def test_domain_consensus_not_equally_distributed(self):
        """A bigram split 2:1 over two domain files has consensus ~0.6365."""
        consensus = self.term_obj.domain_consensus
        self.assertAlmostEqual(consensus[self.bigr_not_equally],
                               0.6365141683,
                               places=5)

    # ------------------------------------------------------------------
    # weigh_candidates
    # ------------------------------------------------------------------
    def test_weigh_candidates_error_alpha_above_one(self):
        """weigh_candidates raises ValueError for alpha=2."""
        with self.assertRaises(ValueError):
            self.term_obj.weigh_candidates(alpha=2)

    def test_weigh_candidates_error_alpha_negative(self):
        """weigh_candidates raises ValueError for alpha=-2."""
        with self.assertRaises(ValueError):
            self.term_obj.weigh_candidates(alpha=-2)

    def test_weighted_candidates_alpha_05(self):
        """With alpha=0.5 the weight is ~1.0493.

        The value is the mean of the relevance (1) and the consensus
        (~1.0986) that the other tests expect for this bigram.
        """
        weighted = self.term_obj.weigh_candidates(alpha=0.5)
        self.assertAlmostEqual(weighted[self.bigr_equally_only_domain],
                               1.049305,
                               places=3)

    def test_weighted_candidates_alpha_1(self):
        """With alpha=1 the weight equals the expected relevance (1)."""
        weighted = self.term_obj.weigh_candidates(alpha=1)
        self.assertAlmostEqual(weighted[self.bigr_equally_only_domain],
                               1,
                               places=3)

    def test_weighted_candidates_alpha_0(self):
        """With alpha=0 the weight equals the expected consensus (~1.0986)."""
        weighted = self.term_obj.weigh_candidates(alpha=0)
        self.assertAlmostEqual(weighted[self.bigr_equally_only_domain],
                               1.0986,
                               places=3)

    # ------------------------------------------------------------------
    # extract_terminology
    # ------------------------------------------------------------------
    def test_extract_terminology_raises_error(self):
        """extract_terminology raises ValueError for theta=-1."""
        weighted = {("computational", "linguistics"): 1,
                    ("language", "learning"): 0.4}
        with self.assertRaises(ValueError):
            self.term_obj.extract_terminology(theta=-1,
                                              weighted_candidates=weighted)

    def test_extract_terminology_treshold(self):
        """With theta=0.5, weight 1 is extracted and weight 0.4 is not."""
        weighted = {("computational", "linguistics"): 1,
                    ("language", "learning"): 0.4}
        terms = self.term_obj.extract_terminology(
            theta=0.5,
            weighted_candidates=weighted
        )
        self.assertNotIn(("language", "learning"), terms)
        self.assertIn(("computational", "linguistics"), terms)

    # ------------------------------------------------------------------
    # write_csv
    # ------------------------------------------------------------------
    def test_write_csv_file_exists(self):
        """write_csv creates a file under the given name."""
        testfile = "test.csv"
        self._write_csv(testfile)
        self.assertTrue(os.path.isfile(testfile))

    def test_write_csv_file_first_line(self):
        """The first line of the CSV records alpha."""
        testfile = "test_firstline.csv"
        self._write_csv(testfile)
        with open(testfile, encoding="utf-8") as file:
            first = file.readline().rstrip()
        self.assertEqual(first, "alpha;0.5")

    def test_write_csv_second_line(self):
        """The second line of the CSV records theta."""
        testfile = "test_secondline.csv"
        self._write_csv(testfile)
        with open(testfile, encoding="utf-8") as file:
            file.readline()
            second = file.readline().rstrip()
        self.assertEqual(second, "theta;1")

    def test_write_csv_three_columns(self):
        """Every line after the two header lines has three columns."""
        testfile = "test_columns.csv"
        self._write_csv(testfile)
        with open(testfile, encoding="utf-8") as file:
            # Skip the alpha and theta header lines.
            for _ in range(2):
                file.readline()
            for line in file:
                self.assertEqual(len(line.split(";")), 3)
if __name__ == "__main__":
    # Run all tests in this module when executed as a script; buffer=True
    # captures stdout/stderr during each test and only shows it on failure.
    unittest.main(buffer=True)