main.py · 243 lines (169 loc) · 7.14 KB
'''
Authors: Ashwani Kashyap, Anshul Pardhi
'''

import random as rd
import re
import math
import string

def pre_process_tweets(url):
    f = open(url, "r", encoding="utf8")
    tweets = list(f)
    list_of_tweets = []

    for i in range(len(tweets)):

        # remove \n from the end of every sentence
        tweets[i] = tweets[i].strip('\n')

        # remove the tweet id and timestamp
        tweets[i] = tweets[i][50:]

        # remove any word that starts with the symbol @
        tweets[i] = " ".join(filter(lambda x: x[0] != '@', tweets[i].split()))

        # remove any URL
        tweets[i] = re.sub(r"http\S+", "", tweets[i])
        tweets[i] = re.sub(r"www\S+", "", tweets[i])

        # remove a colon from the end of the sentence (if any) left over after removing the URL
        tweets[i] = tweets[i].strip()
        tweet_len = len(tweets[i])
        if tweet_len > 0:
            if tweets[i][len(tweets[i]) - 1] == ':':
                tweets[i] = tweets[i][:len(tweets[i]) - 1]

        # remove any hash-tag symbols
        tweets[i] = tweets[i].replace('#', '')

        # convert every word to lowercase
        tweets[i] = tweets[i].lower()

        # remove punctuation
        tweets[i] = tweets[i].translate(str.maketrans('', '', string.punctuation))

        # trim extra spaces
        tweets[i] = " ".join(tweets[i].split())

        # convert each tweet from a string to a list of words, using " " as the delimiter
        list_of_tweets.append(tweets[i].split(' '))

    f.close()
    return list_of_tweets

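# Illustrative sketch of what pre_process_tweets produces. The input line below is
# hypothetical (the real files carry their own tweet id and timestamp in the first
# 50 characters); it only shows the intended before/after shape:
#   raw line : "<50 chars of id + timestamp>Breast cancer test devised http://bbc.in/example #health"
#   result   : ['breast', 'cancer', 'test', 'devised', 'health']
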
def k_means(tweets, k=4, max_iterations=50):

    centroids = []

    # initialization: assign k distinct random tweets as the initial centroids
    count = 0
    hash_map = dict()
    while count < k:
        random_tweet_idx = rd.randint(0, len(tweets) - 1)
        if random_tweet_idx not in hash_map:
            count += 1
            hash_map[random_tweet_idx] = True
            centroids.append(tweets[random_tweet_idx])

    iter_count = 0
    prev_centroids = []

    # run the iterations until the centroids converge or the max iteration count is reached
    while not is_converged(prev_centroids, centroids) and iter_count < max_iterations:

        print("running iteration " + str(iter_count))

        # assignment: assign tweets to the closest centroids
        clusters = assign_cluster(tweets, centroids)

        # to check if k-means converges, keep track of prev_centroids
        prev_centroids = centroids

        # update: recompute each centroid from the cluster it now represents
        centroids = update_centroids(clusters)
        iter_count = iter_count + 1

    if iter_count == max_iterations:
        print("max iterations reached, K-means did not converge")
    else:
        print("converged")

    sse = compute_SSE(clusters)

    return clusters, sse

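# Usage sketch (k and max_iterations shown with their defaults):
#   clusters, sse = k_means(list_of_tweets, k=4, max_iterations=50)
# clusters maps each cluster index to its member tweets, and sse is the sum of
# squared Jaccard distances of every tweet to its assigned centroid.
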
def is_converged(prev_centroid, new_centroids):

    # false if the lengths are not equal
    if len(prev_centroid) != len(new_centroids):
        return False

    # iterate over each pair of centroids and check whether they are the same
    for c in range(len(new_centroids)):
        if " ".join(new_centroids[c]) != " ".join(prev_centroid[c]):
            return False

    return True

def assign_cluster(tweets, centroids):

    clusters = dict()

    # for every tweet, iterate over each centroid and assign the closest one to it
    for t in range(len(tweets)):
        min_dis = math.inf
        cluster_idx = -1
        for c in range(len(centroids)):
            dis = getDistance(centroids[c], tweets[t])
            # look for the closest centroid for the tweet
            if centroids[c] == tweets[t]:
                # print("tweet and centroid are equal with c: " + str(c) + ", t" + str(t))
                cluster_idx = c
                min_dis = 0
                break

            if dis < min_dis:
                cluster_idx = c
                min_dis = dis

        # randomise the centroid assignment of a tweet if it has no words in common with any centroid
        if min_dis == 1:
            cluster_idx = rd.randint(0, len(centroids) - 1)

        # assign the closest centroid to the tweet
        clusters.setdefault(cluster_idx, []).append([tweets[t]])
        # print("tweet t: " + str(t) + " is assigned to cluster c: " + str(cluster_idx))

        # store the tweet's distance from its closest centroid, used to compute SSE at the end
        last_tweet_idx = len(clusters.setdefault(cluster_idx, [])) - 1
        clusters.setdefault(cluster_idx, [])[last_tweet_idx].append(min_dis)

    return clusters

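# Shape of the returned structure (values here are illustrative):
#   clusters = {0: [[['flu', 'jab'], 0.5], ...], 1: [...], ...}
# i.e. each cluster index maps to a list of [tweet_tokens, distance_to_centroid]
# pairs, which is the layout update_centroids and compute_SSE expect.
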
def update_centroids(clusters):

    centroids = []

    # iterate over each cluster and find the tweet with the minimum sum of distances
    # to all other tweets in the same cluster; select that tweet as the cluster's centroid
    for c in range(len(clusters)):
        min_dis_sum = math.inf
        centroid_idx = -1

        # cache pairwise distances to avoid redundant calculations
        min_dis_dp = []

        for t1 in range(len(clusters[c])):
            min_dis_dp.append([])
            dis_sum = 0
            # get the sum of distances of tweet t1 to every other tweet t2 in the same cluster
            for t2 in range(len(clusters[c])):
                if t1 != t2:
                    if t2 < t1:
                        dis = min_dis_dp[t2][t1]
                    else:
                        dis = getDistance(clusters[c][t1][0], clusters[c][t2][0])

                    min_dis_dp[t1].append(dis)
                    dis_sum += dis
                else:
                    min_dis_dp[t1].append(0)

            # select the tweet with the minimum sum of distances as the centroid for the cluster
            if dis_sum < min_dis_sum:
                min_dis_sum = dis_sum
                centroid_idx = t1

        # append the selected tweet to the centroid list
        centroids.append(clusters[c][centroid_idx][0])

    return centroids

def getDistance(tweet1, tweet2):
    # get the intersection
    intersection = set(tweet1).intersection(tweet2)

    # get the union
    union = set().union(tweet1, tweet2)

    # return the jaccard distance
    return 1 - (len(intersection) / len(union))

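# Worked example of the Jaccard distance (illustrative tokens, not from the dataset):
#   tweet1 = ['flu', 'vaccine', 'news'], tweet2 = ['flu', 'outbreak']
#   intersection = {'flu'}                          -> size 1
#   union = {'flu', 'vaccine', 'news', 'outbreak'}  -> size 4
#   distance = 1 - 1/4 = 0.75
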
def compute_SSE(clusters):

    sse = 0
    # for every cluster 'c', add the square of the distance of each tweet from its centroid
    for c in range(len(clusters)):
        for t in range(len(clusters[c])):
            sse = sse + (clusters[c][t][1] * clusters[c][t][1])

    return sse

if __name__ == '__main__':

    data_url = 'Health_Tweets/bbchealth.txt'

    tweets = pre_process_tweets(data_url)

    # default number of experiments to be performed
    experiments = 5

    # default value of K for K-means
    k = 3

    # for every experiment 'e', run K-means
    for e in range(experiments):

        print("------ Running K-means for experiment no. " + str(e + 1) + " for k = " + str(k))

        clusters, sse = k_means(tweets, k)

        # for every cluster 'c', print the size of the cluster
        for c in range(len(clusters)):
            print(str(c + 1) + ": " + str(len(clusters[c])) + " tweets")
            # # to print the tweets in a cluster
            # for t in range(len(clusters[c])):
            #     print("t" + str(t) + ", " + (" ".join(clusters[c][t][0])))

        print("--> SSE : " + str(sse))
        print('\n')

        # increment k after every experiment
        k = k + 1