-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
probabilistic_learning.py
154 lines (120 loc) · 5.21 KB
/
probabilistic_learning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
"""Learning probabilistic models. (Chapters 20)"""
import heapq
from utils import weighted_sampler, product, gaussian
class CountingProbDist:
    """
    A probability distribution formed by observing and counting examples.

    If p is an instance of this class and o is an observed value, then
    there are 3 main operations:
    p.add(o) increments the count for observation o by 1.
    p.sample() returns a random element from the distribution.
    p[o] returns the probability for o (as in a regular ProbDist).
    """

    def __init__(self, observations=None, default=0):
        """
        Create a distribution, and optionally add in some observations.

        By default this is an unsmoothed distribution, but saying default=1,
        for example, gives you add-one (Laplace) smoothing.
        """
        if observations is None:
            observations = []
        self.dictionary = {}   # observation -> count (pseudo-counts included)
        self.n_obs = 0         # total (pseudo-)count mass across all observations
        self.default = default
        self.sampler = None    # built lazily by sample(); invalidated when counts change
        for o in observations:
            self.add(o)

    def add(self, o):
        """Add an observation o to the distribution."""
        self.smooth_for(o)
        self.dictionary[o] += 1
        self.n_obs += 1
        self.sampler = None  # counts changed, so any cached sampler is stale

    def smooth_for(self, o):
        """
        Include o among the possible observations, whether or not
        it's been observed yet, seeding it with the default pseudo-count.
        """
        if o not in self.dictionary:
            self.dictionary[o] = self.default
            self.n_obs += self.default
            self.sampler = None

    def __getitem__(self, item):
        """Return an estimate of the probability of item."""
        self.smooth_for(item)
        if self.n_obs == 0:
            # Empty, unsmoothed distribution: avoid a 0/0 ZeroDivisionError;
            # with no mass observed, every probability estimate is 0.
            return 0
        return self.dictionary[item] / self.n_obs

    # (top() and sample() are not used in this module, but elsewhere.)

    def top(self, n):
        """Return (count, obs) tuples for the n most frequent observations."""
        return heapq.nlargest(n, [(v, k) for (k, v) in self.dictionary.items()])

    def sample(self):
        """Return a random sample from the distribution."""
        if self.sampler is None:
            self.sampler = weighted_sampler(list(self.dictionary.keys()), list(self.dictionary.values()))
        return self.sampler()
def NaiveBayesLearner(dataset, continuous=True, simple=False):
    """
    Dispatch to the appropriate naive Bayes variant: the simple
    precomputed-distribution model when simple is true, otherwise the
    Gaussian (continuous) model or the counting (discrete) model.
    """
    if simple:
        return NaiveBayesSimple(dataset)
    return NaiveBayesContinuous(dataset) if continuous else NaiveBayesDiscrete(dataset)
def NaiveBayesSimple(distribution):
    """
    A simple naive bayes classifier that takes as input a dictionary of
    CountingProbDist objects and classifies items according to these distributions.
    The input dictionary is in the following form:
    (ClassName, ClassProb): CountingProbDist
    """
    # Split the (class, prior) keys into a prior table and a table of
    # per-class attribute distributions, in a single pass.
    priors = {}
    conditionals = {}
    for (c_name, prob), count_dist in distribution.items():
        priors[c_name] = prob
        conditionals[c_name] = count_dist

    def predict(example):
        """Predict the target value for example. Calculate probabilities for each
        class and pick the max."""
        def score(c_name):
            likelihood = product(conditionals[c_name][a] for a in example)
            return priors[c_name] * likelihood
        return max(priors.keys(), key=score)

    return predict
def NaiveBayesDiscrete(dataset):
    """
    Just count how many times each value of each input attribute
    occurs, conditional on the target value. Count the different
    target values too.
    """
    target_vals = dataset.values[dataset.target]
    target_dist = CountingProbDist(target_vals)
    # One conditional distribution per (class value, input attribute) pair.
    attr_dists = {}
    for gv in target_vals:
        for attr in dataset.inputs:
            attr_dists[gv, attr] = CountingProbDist(dataset.values[attr])
    # Tally every training example into the prior and the conditionals.
    for example in dataset.examples:
        gv = example[dataset.target]
        target_dist.add(gv)
        for attr in dataset.inputs:
            attr_dists[gv, attr].add(example[attr])

    def predict(example):
        """
        Predict the target value for example. Consider each possible value,
        and pick the most likely by looking at each attribute independently.
        """
        def class_probability(gv):
            prior = target_dist[gv]
            likelihood = product(attr_dists[gv, attr][example[attr]] for attr in dataset.inputs)
            return prior * likelihood
        return max(target_vals, key=class_probability)

    return predict
def NaiveBayesContinuous(dataset):
    """
    Count how many times each target value occurs.
    Also, find the means and deviations of input attribute values for each target value.
    """
    means, deviations = dataset.find_means_and_deviations()
    target_vals = dataset.values[dataset.target]
    target_dist = CountingProbDist(target_vals)

    def predict(example):
        """Predict the target value for example. Consider each possible value,
        and pick the most likely by looking at each attribute independently."""
        def score(tv):
            # Prior times the product of per-attribute Gaussian likelihoods.
            p = target_dist[tv]
            for attr in dataset.inputs:
                p = p * gaussian(means[tv][attr], deviations[tv][attr], example[attr])
            return p
        return max(target_vals, key=score)

    return predict