prepare.py
"""
Text data preparation for SPON news corpus.
This uses tmtoolkit (see https://tmtoolkit.readthedocs.io/) for preprocessing the large text corpus and generating
a document-term matrix, which is then used as input for topic modeling (see `tm_evaluation.py` and `tm_final.py`).
Note that this requires quite a large amount of computer memory to run (> 8GB).
December 2020, Markus Konrad <markus.konrad@wzb.eu>
"""
import re
import json
import logging

import numpy as np
from tmtoolkit.preprocess import TMPreproc
from tmtoolkit.bow.bow_stats import doc_lengths
from tmtoolkit.utils import pickle_data
# enable logging for tmtoolkit
logging.basicConfig(level=logging.INFO)
tmtoolkit_log = logging.getLogger('tmtoolkit')
tmtoolkit_log.setLevel(logging.INFO)
tmtoolkit_log.propagate = True
#%% configuration and constants
INPUT_DATA = 'fetch_news/data/spon.json' # fetched SPON corpus raw data
OUTPUT_DTM = 'data/dtm_nov20.pickle' # document-term matrix output
OUTPUT_META = 'data/meta_nov20.pickle' # corpus metadata
OUTPUT_CORPUS = 'data/corpus_nov20.pickle' # raw text corpus
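# documents whose preprocessed length falls outside the following token range are
# dropped before the DTM is written to disk (see the filtering step at the end)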
MIN_TOKENS_PER_DOC = 50
MAX_TOKENS_PER_DOC = 3000
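# matches URLs ending in ".htm" or ".html"; used to strip the file extension when
# deriving a document label from an article URL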
pttrn_urlend = re.compile(r'\.html?$')
#%% loading raw data from JSON file
print(f'loading articles from {INPUT_DATA}')

with open(INPUT_DATA) as f:
    sponraw = json.load(f)

print(f'loaded {len(sponraw)} articles')
#%% generating the corpus of raw text and corpus metadata
corpus = {} # maps document label to raw article text (including headline, abstract and main text)
meta = {} # maps document label to document metadata such as category, publication date, author
print('generating corpus')
# iterate through scraped article data
for art in sponraw:
    if 'error_message' in art:   # skip articles with errors
        print('error for article', art['url'], ':', art['error_message'])
        continue

    # generate document label from end of article URL
    if pttrn_urlend.search(art['url']):
        urlend = art['url'].rindex('.')
    else:
        urlend = None

    doclabel = art['url'][art['url'].rindex('/')+1:urlend]
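    # e.g. a URL like ".../politik/some-article-a-1234567.html" (hypothetical example)
    # yields the document label "some-article-a-1234567"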
    # generate document full text from headline, abstract and paragraphs, all separated by double linebreaks
    doctext = '\n\n'.join([art['archive_headline'], art['intro'] or '', '\n\n'.join(art['paragraphs'])])

    # store to corpus and metadata dicts
    if doclabel in corpus:
        print(f'> ignoring duplicate: {doclabel}')
    else:
        corpus[doclabel] = doctext
        assert doclabel not in meta
        meta[doclabel] = {k: v for k, v in art.items() if k in {'categ', 'pub_date', 'author'}}
print(f'generated corpus with {len(corpus)} documents')
del sponraw # remove unused objects
# store corpus and metadata to disk
print(f'storing corpus to {OUTPUT_CORPUS}')
pickle_data(corpus, OUTPUT_CORPUS)
print(f'storing corpus metadata to {OUTPUT_META}')
pickle_data(meta, OUTPUT_META)
del meta # remove unused objects
#%% process text data and form document-term matrix using parallel text processing via TMPreproc
print('tokenizing documents')
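# TMPreproc tokenizes the documents on construction (German language model) and runs
# the subsequent processing steps in parallel worker processes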
preproc = TMPreproc(corpus, language='de')
del corpus # remove unused objects
#preproc.print_summary()
print('processing documents')
# run preprocessing pipeline
# last two steps remove tokens that appear in more than 95% or less than 1% of all documents
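# steps: POS tagging, lemmatization, lowercasing, removal of special characters
# within tokens, removal of tokens shorter than two characters and of numbers,
# then the document-frequency filters mentioned above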
preproc.pos_tag() \
    .lemmatize() \
    .tokens_to_lowercase() \
    .remove_special_chars_in_tokens() \
    .clean_tokens(remove_shorter_than=2, remove_numbers=True) \
    .remove_common_tokens(df_threshold=0.95) \
    .remove_uncommon_tokens(df_threshold=0.01)
#preproc.print_summary()
print('generating DTM')
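# preproc.dtm builds the document-term matrix: a sparse matrix with one row per
# document and one column per vocabulary term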
dtm = preproc.dtm
print(f'DTM shape: {dtm.shape}')
print(f'filtering DTM for range of tokens per document: {MIN_TOKENS_PER_DOC} - {MAX_TOKENS_PER_DOC}')
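# doc_lengths() returns the number of tokens per document, i.e. the row sums of the
# DTM; the boolean mask keeps only documents within the configured length range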
dlengths = doc_lengths(dtm)
doc_mask = (dlengths >= MIN_TOKENS_PER_DOC) & (dlengths <= MAX_TOKENS_PER_DOC)
print(f'retaining {doc_mask.sum()} out of {dtm.shape[0]} documents')
#%% store document-term matrix along with document labels and vocabulary to disk
print(f'storing output DTM to {OUTPUT_DTM}')
pickle_data((np.array(preproc.doc_labels)[doc_mask],
             preproc.vocabulary,
             dtm[doc_mask, :]),
            OUTPUT_DTM)
print('done.')
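
# A sketch of how the stored DTM might be read back in the downstream scripts
# (assumption: `tm_evaluation.py` / `tm_final.py` use tmtoolkit's unpickle_file helper,
# the counterpart to pickle_data):
#
#     from tmtoolkit.utils import unpickle_file
#     doc_labels, vocab, dtm = unpickle_file(OUTPUT_DTM)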