naivebayes.py
# Naive Bayes approach: Gaussian Naive Bayes implemented from scratch (sklearn is used only for the evaluation metrics)
import numpy as np
import math
from sklearn.metrics import precision_score, recall_score, f1_score
def separate_by_class(x):
    # Group rows by class; the last column of each row is the class label.
    separated = dict()
    for i in range(len(x)):
        row = list(x[i])
        cv = row.pop(-1)  # pop class to store as key
        if cv not in separated:
            separated[cv] = list()
        separated[cv].append(row)
    return separated
def summarise(separated):
    # For each class, compute per-feature (mean, std, count) over that class's rows.
    summary = dict()
    for label, value in separated.items():
        summary[label] = [(np.mean(c), np.std(c), len(c)) for c in zip(*value)]
    return summary
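# For illustration (hypothetical toy rows; the last column is the class label):
#     toy = np.array([[1.0, 2.0, 0.0], [1.2, 1.8, 0.0], [5.0, 6.0, 1.0]])
#     summarise(separate_by_class(toy))
#     -> approximately {0.0: [(1.1, 0.1, 2), (1.9, 0.1, 2)], 1.0: [(5.0, 0.0, 1), (6.0, 0.0, 1)]}
# i.e. a dict mapping each class to a list of per-feature (mean, std, count) tuples.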
# Predicting the score of each class for a given row.
# Plain-probability version, kept for reference (the active version below uses the log trick):
# def calcCprob(summaries, row):
#     total_rows = sum([summaries[label][0][2] for label in summaries])
#     probabilities = dict()
#     for cv, cs in summaries.items():
#         # calculating prior for the class
#         probabilities[cv] = summaries[cv][0][2] / float(total_rows)
#         for i in range(len(cs)):
#             mean, std, _ = cs[i]
#             probabilities[cv] *= (1 / (np.sqrt(2 * math.pi) * std)) * np.exp(-((row[i] - mean) ** 2 / (2 * std ** 2)))
#     return probabilities
# Active version, with the log trick: sum log-probabilities instead of multiplying raw
# probabilities, which avoids floating-point underflow when there are many features.
def calcCprob(summaries, row):
    total_rows = sum([summaries[label][0][2] for label in summaries])
    probabilities = dict()
    for cv, cs in summaries.items():
        # calculating prior for the class
        probabilities[cv] = np.log(summaries[cv][0][2] / float(total_rows))
        for i in range(len(cs)):
            mean, std, _ = cs[i]
            if std > 0.2:
                prob = (1 / (np.sqrt(2 * math.pi) * std)) * np.exp(-((row[i] - mean) ** 2 / (2 * std ** 2)))
            else:
                # Near-constant feature: skip it instead of dividing by a tiny std.
                prob = 0
            if prob != 0:  # also guards against log(0) when the density underflows
                probabilities[cv] += np.log(prob)
    return probabilities
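# What this computes, per class y (all statistics come from `summaries`):
#     score(y) = log P(y) + sum_i log N(row[i]; mean_{y,i}, std_{y,i})
# where N(x; m, s) = exp(-(x - m)^2 / (2 * s^2)) / (sqrt(2 * pi) * s) is the Gaussian density.
# The returned dict maps each class value to this unnormalised log-posterior score, e.g.
# calcCprob(model, valx[0]) (using names defined further down) gives one score per class.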
def predict(summaries, row):
    probabilities = calcCprob(summaries, row)
    bestLabel, bestProb = None, -1
    for cv, prob in probabilities.items():
        if bestLabel is None or prob > bestProb:
            bestProb = prob
            bestLabel = cv
    return bestLabel
def accuracy(y, yhat):
    # Percentage of rows where the predicted label matches the true one.
    ate = 0
    for i in range(yhat.shape[0]):
        if y[i, 0] == yhat[i, 0]:
            ate += 1
    ate = ate * 100 / yhat.shape[0]
    return ate
np.random.seed(0)
# LOADING DATA: the first column of data.csv is the class label, the remaining columns are features.
X = np.loadtxt(open("data.csv", "rb"), delimiter=",", skiprows=1)[:, 1:]
Y = np.loadtxt(open("data.csv", "rb"), delimiter=",", skiprows=1)[:, 0]
Y = Y.reshape(Y.shape[0], 1)
# Z-SCORE the features (columns with zero spread are left unscaled to avoid division by zero).
meanX = np.mean(X, axis=0)
stdX = np.std(X, ddof=1, axis=0)
stdX[stdX == 0] = 1
zscored = (X - meanX) / stdX
X = zscored
# print(X.shape, Y.shape)
X = np.concatenate((X, Y), axis=1)  # re-attach the label as the last column
# -------------------------------------------------------------------------------------------------------
# SHUFFLE and split: roughly the first 2/3 of the permuted rows for training, the rest for testing.
p = np.random.permutation(len(X))
X_train, X_test = X[p][:math.ceil(2 * len(X) / 3), :], X[p][math.ceil(2 * len(X) / 3):, :]
# print(X_train.shape, X_test.shape)
sep = separate_by_class(X_train)
model = summarise(sep)
# print(len(model[0.0]))
# print(model)
# Validation split into x and y.
valx = X_test[:, :-1]
valy = X_test[:, -1]
valy = valy.reshape(valy.shape[0], 1)
# print(valx.shape, valy.shape)
yhat = []
for row in valx:
    y_hat = predict(model, row)
    yhat.append(y_hat)
yhat = np.array(yhat)
yhat = yhat.reshape(valy.shape)
# Note: the active calcCprob above already applies the log trick; to run without it,
# comment it out and uncomment the plain-probability version kept earlier in this file.
print(f'Precision: {precision_score(valy, yhat)}')
print(f'Recall: {recall_score(valy, yhat)}')
print(f'F-measure: {f1_score(valy, yhat)}')
print(f'Accuracy: {accuracy(valy, yhat)} %')
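# Optional sanity check (a sketch, not part of the original pipeline): scikit-learn's
# GaussianNB fits the same Gaussian Naive Bayes model, so its predictions on this split
# should roughly agree with the classifier above (differences come mainly from the
# std > 0.2 feature-skipping rule, which sklearn does not use).
# from sklearn.naive_bayes import GaussianNB
# gnb = GaussianNB().fit(X_train[:, :-1], X_train[:, -1])
# sk_yhat = gnb.predict(valx).reshape(valy.shape)
# print(f'sklearn GaussianNB accuracy: {accuracy(valy, sk_yhat)} %')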