# correlation_count.py
from avro.datafile import DataFileReader
from avro.io import DatumReader
import time
import argparse
import sqlite3
print("currently not working, it's incredibly slow and rows are duplicated, could be better to use Redis or Postgres or some other tool")
exit()
# parameters from the CLI
parser = argparse.ArgumentParser(description="calculate a tag/token co-occurrence table from a corpus file")
parser.add_argument("-input_file", type=str, required=True, help="the input Avro file")
parser.add_argument("-tokenizer", type=str, help="the method used to split text into tokens", default='split',
                    choices=['split', 'char', 'nltk'])
parser.add_argument("-case_insensitive", type=str, help="lowercase the tokens before counting", default='yes',
                    choices=['yes', 'no'])
parser.add_argument('-sqlite_db_file', default='tag_token_counters.db', help='name of the SQLite file to create or append to')
args = parser.parse_args()
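# Example invocation (corpus.avro is a hypothetical file name):
#   python correlation_count.py -input_file corpus.avro -tokenizer nltk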
# pick the joiner and splitter functions, mapping plain text to token sequences
splitter = None
joiner = None
if args.tokenizer == 'char':
    def splitter(x): return list(x)
    def joiner(x): return ''.join(x)
elif args.tokenizer == 'split':
    def splitter(x): return x.split(' ')
    def joiner(x): return ' '.join(x)
elif args.tokenizer == 'nltk':
    from nltk.tokenize import RegexpTokenizer
    # raw string: '\w' and '\$' are invalid escape sequences in a plain string
    tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
    def splitter(x): return tokenizer.tokenize(x)
    def joiner(x): return ' '.join(x)
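# e.g. (illustrative): with 'split' or 'nltk', splitter('price is $5.99')
# returns ['price', 'is', '$5.99']; with 'char' it returns single characters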
conn = sqlite3.connect(args.sqlite_db_file)
c = conn.cursor()
# table to count single tokens ignoring their tags; the PRIMARY KEY lets
# INSERT OR REPLACE update an existing row instead of adding a duplicate
c.execute('CREATE TABLE IF NOT EXISTS token_count(token TEXT PRIMARY KEY, count INTEGER)')
# table to count the token and tag co-occurrences. Note that an utterance can
# have 0, 1 or many tags
c.execute('CREATE TABLE IF NOT EXISTS token_tag_count(token TEXT, tag TEXT, count INTEGER, PRIMARY KEY (token, tag))')
# dictionaries containing the data to be inserted/updated in small batches
pending_token_count = {}
pending_token_tag_count = {}
# how much pending data can we accumulate before flushing to the database?
PENDING_MAX_SIZE = 3000


def flush_pending():
    """Write both pending counter dictionaries to SQLite and clear them."""
    rows = [{'tag': tag, 'token': token, 'count': count}
            for (tag, token), count in pending_token_tag_count.items()]
    c.executemany('INSERT OR REPLACE INTO token_tag_count(token, tag, count) '
                  'VALUES(:token, :tag, COALESCE((SELECT count + :count FROM token_tag_count '
                  'WHERE token = :token AND tag = :tag), :count))', rows)
    rows = [{'token': token, 'count': count} for token, count in pending_token_count.items()]
    c.executemany('INSERT OR REPLACE INTO token_count(token, count) '
                  'VALUES(:token, COALESCE((SELECT count + :count FROM token_count '
                  'WHERE token = :token), :count))', rows)
    conn.commit()
    pending_token_count.clear()
    pending_token_tag_count.clear()
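# e.g. (illustrative): flushing {'hello': 2} twice leaves a single row
# ('hello', 4) in token_count; the PRIMARY KEY makes the second
# INSERT OR REPLACE overwrite the first row with the summed count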
input_file = args.input_file
case_insensitive = (args.case_insensitive == 'yes')
if case_insensitive:
    # str.lower() is Unicode-aware and not locale-dependent
    print('the tokens will be lowercased before counting')
else:
    print('the tokens will be counted as they are (case sensitive)')
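# each Avro record is expected to expose a 'text' string and a 'tags' list,
# matching the field accesses in the loop below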
reader = DataFileReader(open(input_file, "rb"), DatumReader())
start_time = time.time()
utterances_count = 0
total_length_tokens = 0
for utterance in reader:
    utterances_count += 1
    tokens = splitter(utterance['text'])
    tags = utterance['tags']
    total_length_tokens += len(tokens)
    if case_insensitive:
        # lowercase before deduplicating, so that 'The' and 'the' are counted
        # once per utterance instead of twice
        tokens = [t.lower() for t in tokens]
    # count each distinct token once per utterance
    for t in set(tokens):
        pending_token_count[t] = pending_token_count.get(t, 0) + 1
        for tag in tags:
            key = (tag, t)
            pending_token_tag_count[key] = pending_token_tag_count.get(key, 0) + 1
    # flush to the database in small batches to bound memory usage
    if len(pending_token_count) + len(pending_token_tag_count) > PENDING_MAX_SIZE:
        print('writing to database, processed {0} utterances...'.format(utterances_count))
        flush_pending()
# final flush: without it, the last partial batch would be silently dropped
flush_pending()
reader.close()
elapsed = time.time() - start_time
print('processed {0} utterances ({1} tokens), it took {2:.1f} seconds [{3:.1f} utterances per second]'.format(
    utterances_count, total_length_tokens, elapsed, utterances_count / elapsed))
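# To inspect the output afterwards (sketch, e.g. from the sqlite3 shell):
#   SELECT tag, token, count FROM token_tag_count ORDER BY count DESC LIMIT 20;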