-
Notifications
You must be signed in to change notification settings - Fork 0
/
train_model.py
93 lines (74 loc) · 2.83 KB
/
train_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import re
import pickle
import logging
import warnings
import pandas as pd
import gensim
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Ensure the tokenizer model and stopword corpus are present locally.
nltk.download('punkt')
nltk.download('stopwords')

# English stopwords, extended with punctuation/URL artifacts common in tweets.
stop_words = stopwords.words('english')
custom = ['?', '(', ')', '.', '[', ']', '!', '...', '-', '@', '->', 'https', 'http',
          ';', "`", "'", '"', ',', '``', "''", ':', '*', '~', '/', '//', '\\', '&', 'n', ':\\']
stop_words.extend(custom)
def clean_status(data):
    """Normalize one tweet and return its content tokens.

    Strips @mentions, http(s) URLs, bit.ly shortener links, and non-ASCII
    characters, lowercases the remainder, tokenizes it, and drops stopwords
    and punctuation noise.

    Parameters:
        data (str): raw tweet text.

    Returns:
        list[str]: lowercase tokens with stopwords/punctuation removed.
    """
    remove_mentions = re.sub(r'@[A-Za-z0-9]+', '', data)
    remove_links = re.sub(r'https?://[A-Za-z0-9./]+', '', remove_mentions, flags=re.MULTILINE)
    # Dot escaped: the old pattern r'bit.ly/...' matched any character there.
    remove_bitly_links = re.sub(r'bit\.ly/\S+', '', remove_links)
    remove_non_ascii = re.sub(r'[^\x00-\x7F]+', '', remove_bitly_links)
    set_lowercase = remove_non_ascii.lower()
    tokens = word_tokenize(set_lowercase)
    # Set membership is O(1) per token vs an O(n) scan of the stopword list.
    stop_set = set(stop_words)
    return [word for word in tokens if word not in stop_set]
# Convert words into bigrams.
def get_bigram(words, bi_min=15, tri_min=10):
    """Fit a bigram phrase detector over tokenized documents.

    Parameters:
        words: iterable of token lists, one per document.
        bi_min: minimum collocation count for a bigram to be kept.
        tri_min: accepted for compatibility; not used by this function.

    Returns:
        A gensim Phraser that merges detected bigrams in a token list.
    """
    phrase_model = gensim.models.Phrases(words, min_count=bi_min)
    return gensim.models.phrases.Phraser(phrase_model)
def _save_pickle(obj, path):
    """Pickle *obj* to *path*; the context manager closes the file even on error."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


def main():
    """Train an LDA topic model on tweets from data/train_data.csv.

    Side effects: reads the training CSV, trains a 10-topic LdaMulticore
    model, prints the discovered topics, and pickles the bigram documents,
    dictionary, bag-of-words corpus, and model to the working directory.
    """
    # Set vars, logging, and open csv file.
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    print('Opening file...')
    train_data = pd.read_csv('data/train_data.csv', encoding='ISO-8859-1')
    status_list = [clean_status(row) for row in train_data['tweets']]
    bigrams = get_bigram(status_list)
    bigram = [bigrams[entry] for entry in status_list]
    id2word = gensim.corpora.Dictionary(bigram)
    id2word.compactify()
    corpus = [id2word.doc2bow(tweets) for tweets in bigram]
    print('Training model...')
    print('\n')
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        lda_model = gensim.models.ldamulticore.LdaMulticore(
            corpus=corpus, num_topics=10, id2word=id2word, chunksize=100,
            workers=2, passes=50, eval_every=1, per_word_topics=True)
    # Print list of topics. The original computed print_topics(...) and
    # discarded the result, so nothing was shown; actually print each topic.
    print('\n')
    print('Geting topics...')
    for topic in lda_model.print_topics(10, num_words=10):
        print(topic)
    print('\n')
    # Save everything via context managers so handles close on error too.
    print('Saving bigrams...')
    _save_pickle(bigram, 'bigram.pkl')
    print('Saving id2word...')
    _save_pickle(id2word, 'id2word.pkl')
    print('Saving corpus...')
    _save_pickle(corpus, 'corpus.pkl')
    print('Saving model...')
    _save_pickle(lda_model, 'lda_model2.model')
    print('Done')


if __name__ == "__main__":
    main()