# NLP Dendrogram Topics
# Sentence-level topic analysis (LSI / LDA) with hierarchical clustering plots.
from gensim import corpora, models, similarities
from collections import defaultdict
import csv
from nltk import sent_tokenize, word_tokenize, pos_tag
import nltk
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import re
import gensim
from sklearn import manifold, datasets, decomposition, ensemble,discriminant_analysis, random_projection
from sklearn.cluster import KMeans
from matplotlib.colors import LinearSegmentedColormap
from string import punctuation
import logging
from pprint import pprint # pretty-printer
# Input corpus: a list of raw documents (here a single abstract).
text=["The purpose of the current study was to examine the painful elbow, and in particular enthesitis, in psoriatic arthritis PsA and rheumatoid arthritis RA using clinical examination, ultrasonography US and magnetic resonance imaging MRI. Patients with elbow pain 11 with PsA and 9 with RA were recruited. Clinical examination, US and MRI studies were performed on the same day. For enthesitis, the common extensor and flexor insertions and the triceps insertion were imaged 20 patients, giving a total of 60 sites with comparative data. Imaging was performed with the radiologists blinded to the diagnosis and clinical findings. US was used to assess inflammatory activity Power Doppler signal, oedema, tendon thickening and bursal swelling and damage erosions, cortical roughening and enthesophytes. MRI was used to assess inflammation fluid in paratenon, peri-entheseal soft-tissue oedema, entheseal enhancement with gadolinium, entheseal oedema and bone oedema and damage erosion, cortical roughening and enthesophyte. Complete scan data were not available for all patients as one patient could not tolerate the MRI examination. No significant differences in imaging scores were found between PsA an d RA. Analysis of damage scores revealed complete agreement between US and MRI data in 43/55 78% comparisons; in 10/55 18% cases the US data were abnormal but the MRI data normal; in 2/55 4% cases, the MRI data were abnormal and the US data normal. Analysis of the inflammation scores revealed complete agreement between US and MRI data in 33/55 60% comparisons; in 3/55 5% cases US data were abnormal but MRI data normal; in 19/55 35% cases the MRI data were abnormal and the US data normal. There was a poor relationship between assessments based on clinical examination and imaging studies. Readers could not accurately identify the disease from imaging findings."]
# Split the corpus into sentences.  The documents are joined with a space
# instead of being passed through str(): str() of a list yields its repr
# ("['...']"), which leaks the surrounding brackets and quotes (and any
# escape sequences) into the first and last sentences.
txt = sent_tokenize(' '.join(text))
def strip_punctuation(s):
    """Return ``s`` with every ASCII punctuation character removed.

    Args:
        s: the text to clean; it is iterated character by character.

    Returns:
        A new string made of the characters of ``s`` that are not listed in
        ``string.punctuation``.  Whitespace, letters and digits are kept.
    """
    return ''.join(c for c in s if c not in punctuation)
# Remove URL-like substrings and then punctuation from every sentence; keep at
# most the first 110 cleaned sentences.
txt = [strip_punctuation(re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', sentence))
       for sentence in txt][0:110]

# Tokenise the cleaned sentences.  They are joined with spaces rather than
# passed through str(): the repr of a list would inject brackets, quotes and
# commas into the token stream and distort the word lengths counted below.
tokens = word_tokenize(' '.join(txt))

# Keep only the words that are exactly eight characters long.
long_words1 = [w for w in tokens if 7 < len(w) < 9]
fdist01 = nltk.FreqDist(long_words1)
a1 = fdist01.most_common(20)

# Skip the five most frequent words and chart the remaining ones.  The lists
# are reversed so that the bars run from the least to the most frequent word.
names0 = [word for word, _ in a1[5:]]
value0 = [count for _, count in a1[5:]]
names0.reverse()
value0.reverse()
val = value0  # the bar lengths
pos = np.arange(len(names0)) + .5  # the bar centers on the y axis

plt.figure(figsize=(9, 4))
# An explicit list of colour letters is required here: a plain string such as
# 'rgbcmyk' is no longer accepted by matplotlib as a sequence of colours.
plt.barh(pos, val, align='center', alpha=0.7, color=list('rgbcmyk'))
plt.yticks(pos, names0)
plt.xlabel('Mentions')
plt.title('WIKIPEDIA ANALYSIS\n')
sentences = txt
##### Dictionary and bag-of-words corpus shared by the model steps below
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
documents = sentences
# remove common words and tokenize
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]
# remove words that appear only once
# (the loop variable is deliberately not called `text`, so the module-level
# input list `text` is not overwritten)
frequency = defaultdict(int)
for doc_tokens in texts:
    for token in doc_tokens:
        frequency[token] += 1
texts = [[token for token in doc_tokens if frequency[token] > 1]
         for doc_tokens in texts]
pprint(texts)
# Map every remaining token to an integer id and persist the mapping.
dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/deerwester4.dict')
print(dictionary.token2id)
## Sentence vectors: one bag-of-words vector per sentence
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in texts]
corpora.MmCorpus.serialize('/tmp/deerwester4.mm', corpus)  # store to disk, for later use
print(corpus)
from gensim import corpora, models, similarities
import logging
import matplotlib.pyplot as plt
from gensim import corpora
from collections import defaultdict
from pprint import pprint # pretty-printer
tfidf = models.TfidfModel(corpus)  # step 1 -- initialize a model
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)
# Number of LSI topics; also the number of rows of the dense matrix below.
n_topics = 2
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=n_topics)  # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
# Project every sentence into LSI space.  The LSI model was trained on TF-IDF
# vectors, so each query goes through the same bow -> tfidf -> lsi chain
# instead of feeding raw bag-of-words counts straight into the LSI model.
p = []
for doc1 in documents:
    vec_bow2 = dictionary.doc2bow(doc1.lower().split())
    vec_lsi2 = lsi[tfidf[vec_bow2]]  # convert the query to LSI space
    p.append(vec_lsi2)
# corpus2dense returns one column per sentence; transpose to one row per sentence.
matrix1 = gensim.matutils.corpus2dense(p, num_terms=n_topics)
matrix3 = matrix1.T
def norm(x):
    """Min-max scale ``x`` so that its smallest value maps to 0 and its largest to 1.

    The minimum and maximum are taken over all elements of ``x``.

    Args:
        x: array-like of numbers (a list or a NumPy array of any shape).

    Returns:
        A NumPy array with the shape of ``x``.  When every element of ``x`` is
        equal the range is zero; an all-zero array is returned in that case
        instead of the NaN values a 0/0 division would produce.

    Raises:
        ValueError: if ``x`` is empty (raised by ``np.min``).
    """
    values = np.asarray(x)
    low = np.min(values)
    span = np.max(values) - low
    if span == 0:
        # Constant input: avoid dividing by zero.
        return np.zeros(values.shape)
    return (values - low) / span
# Scale the sentence/topic matrix before embedding and clustering.
X = norm(matrix3)
# t-SNE needs a perplexity smaller than the number of samples.  The original
# value of 50 is kept whenever there are enough sentences and is lowered
# otherwise, so short texts do not make the embedding fail.
perplexity = min(50, max(1, len(X) - 1))
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` --
# confirm against the installed version.
tsne = manifold.TSNE(n_components=2, init='pca', random_state=0,
                     perplexity=perplexity, verbose=1, n_iter=1500)
X_tsne = tsne.fit_transform(X)
# Two-cluster k-means on the scaled (not the embedded) coordinates.
model3 = KMeans(n_clusters=2, random_state=0)
model3.fit(X)
cc = model3.predict(X)
fig = plt.figure(figsize=(9, 4))
plt.title('TEXT RATIONALITY - Structure of Arguments (0,1)\n'+'WIKIPEDIA webpage Statistical Inference\n'+'Gensim Only 1 Topic', fontweight="bold")
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=cc, marker='.', s=2, label='Topic {0})'.format(cc))
# Write each point's cluster number at its embedded position, coloured by cluster.
for i in range(0, len(cc)):
    plt.annotate('{}'.format(cc[i]), (X_tsne[i, 0], X_tsne[i, 1]), fontweight="bold", color='rgbck'[cc[i]])
plt.show()
# Ask the LSI model for a single topic with up to 100 words.
ag = lsi.show_topics(num_topics=1, num_words=100)
# Stringify the first returned entry and throw away every digit character.
cd = [''.join(filter(lambda ch: not ch.isdigit(), str(ag[0])))]
# Remove URL-like substrings and then punctuation from that string; what is
# left is used as the topic's word list.
bc = [strip_punctuation(re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', str(cd[0])))]
# Lower-cased copy of every cleaned sentence.
txt2 = [str(sentence).lower() for sentence in txt]
# Word tokens of the topic string and of each sentence.
tokens_lsi = list(map(word_tokenize, bc))
tokens_txt = list(map(word_tokenize, txt2))
def norm(x):
    """Min-max scale ``x`` so that its smallest value maps to 0 and its largest to 1.

    The minimum and maximum are taken over all elements of ``x``.

    Args:
        x: array-like of numbers (a list or a NumPy array of any shape).

    Returns:
        A NumPy array with the shape of ``x``.  When every element of ``x`` is
        equal the range is zero; an all-zero array is returned in that case
        instead of the NaN values a 0/0 division would produce.

    Raises:
        ValueError: if ``x`` is empty (raised by ``np.min``).
    """
    values = np.asarray(x)
    low = np.min(values)
    span = np.max(values) - low
    if span == 0:
        # Constant input: avoid dividing by zero.
        return np.zeros(values.shape)
    return (values - low) / span
# TOPIC 0: score every sentence by how many times the topic's words occur in it.
a0 = []
for sentence_tokens in tokens_txt:
    # Count the sentence's tokens once, then look every topic word up in the
    # counts (a missing word counts as 0).
    counts = Counter(sentence_tokens)
    a0.append(np.sum([counts[word] for word in tokens_lsi[0]]))
# Rescale the scores with norm() and print the sentences above the threshold.
topic1 = norm(a0)
threshold = 0.3
for i in np.where(topic1 > threshold)[0]:
    print(topic1[i], documents[i])
lsi.print_topics(1)
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

# Hierarchical clustering of the sentence vectors using the 'ward' method.
P = linkage(matrix3, method='ward')

# Cophenetic correlation of the linkage against the pairwise distances of the
# same vectors, together with the cophenetic distances themselves.
corr, coph_distances = cophenet(P, pdist(matrix3))

# First dendrogram: no custom leaf labels, horizontal leaf text.
plt.figure(figsize=(9, 4))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('PARAGRAPH')
plt.ylabel('DISTANCE')
dendrogram(P, leaf_font_size=12., leaf_rotation=0.)
plt.show()
from gensim import corpora, models, similarities

# Fit a one-topic LDA model for every sentence and collect the printed topic.
#
# Each sentence is reduced to its words longer than six characters, and every
# such word becomes its own one-word document for the LDA model.
#
# The original loop also trained a TF-IDF and an LSI model per sentence whose
# results were never read, wrote each sentence's corpus over
# '/tmp/deerwester4.mm', and reloaded the unchanged dictionary from disk on
# every pass.  That work overwrote the module-level `lsi`, `tfidf`, `corpus`,
# `texts` and `tokens` objects and the stored corpus file, so it is dropped:
# the LDA model is trained directly on the in-memory vectors with the
# existing `dictionary`.
model00 = []
for sentence in sentences:
    sentence_tokens = word_tokenize(str(sentence))
    long_words = [w for w in sentence_tokens if len(w) > 6]
    word_docs = [[word for word in long_word.lower().split() if word not in stoplist]
                 for long_word in long_words]
    # Words that are not in the dictionary yield empty vectors.
    word_corpus = [dictionary.doc2bow(word_doc) for word_doc in word_docs]
    model = models.LdaModel(word_corpus, id2word=dictionary, num_topics=1)
    model00.append(model.print_topics(1))
import re

# One keyword per sentence: the first alphabetic token found in the printed
# LDA topic of that sentence.
sp = []
for topic_rows in model00:
    word1 = " ".join(re.findall("[a-zA-Z]+", str(topic_rows)))
    keyword_tokens = word_tokenize(word1)
    # Fall back to an empty label rather than failing when no word is found.
    sp.append(keyword_tokens[0] if keyword_tokens else '')

x = np.array(range(len(sp)))
plt.figure(figsize=(8, 4))
plt.title('Hierarchical Clustering Dendrogram\n'+'Obtained via LDA of Paragraphs in Text Summarization')
plt.xlabel('\nPARAGRAPHS KEYWORDS')
plt.ylabel('DISTANCE - SUMMARIZATION')
tree = dendrogram(P,
                  leaf_rotation=40.,
                  leaf_font_size=10., color_threshold=1,)
# The dendrogram draws its leaves in clustering order, not in sentence order:
# tree['leaves'][k] is the index of the sentence shown at the k-th leaf from
# the left.  The keywords are reordered accordingly so that every leaf gets
# the keyword of its own sentence.
leaf_keywords = [sp[leaf] for leaf in tree['leaves']]
plt.xticks(3 + x * 10, leaf_keywords)
plt.show()
# Second rendering of the same linkage: no keyword labels, horizontal leaf
# text and a colour threshold of 12.
plt.figure(figsize=(8, 4))
plt.title(
    'Hierarchical Clustering Dendrogram\n'
    + 'Obtained via LDA of Paragraphs in Text Summarization'
)
plt.xlabel('\nPARAGRAPHS KEYWORDS')
plt.ylabel('DISTANCE - SUMMARIZATION')
dendrogram(P, color_threshold=12, leaf_font_size=10., leaf_rotation=0.)
plt.show()
#clusters
from scipy.cluster.hierarchy import fcluster

# Cut the linkage at distance `max_d` to obtain one flat cluster id per sentence.
max_d = 1
clusters = fcluster(P, max_d, criterion='distance')

# Scatter the t-SNE coordinates coloured by cluster id and write each
# sentence's keyword next to its point.
plt.figure(figsize=(9, 3.5))
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=clusters, cmap='prism', s=400)
for i in range(0, len(sentences)):
    plt.annotate(sp[i], xy=(X_tsne[i, 0], X_tsne[i, 1]),
                 xytext=(X_tsne[i, 0] + 8, X_tsne[i, 1] + 20))
plt.title('Cluster of topics via Dendrogram + LDA')
plt.show()

# Sentence indices grouped by cluster id, for ids 1 .. max(clusters).
sa = [np.where(clusters == cluster_id)[0]
      for cluster_id in range(1, max(clusters) + 1)]
# Print the sentences of every cluster.  The printed number is the position
# in `sa` (cluster id minus one).  Plain loops are used so that no throw-away
# list is built and the tick-index array `x` is not overwritten.
for cluster_no, members in enumerate(sa):
    for member in members:
        print('Cluster {0}:'.format(cluster_no), sentences[member])