-
Notifications
You must be signed in to change notification settings - Fork 0
/
bloom_filter.py
154 lines (123 loc) · 5.77 KB
/
bloom_filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/bin/env python
import math
from abc import ABC, abstractmethod
from bitarray import bitarray
'''
A Bloom filter is a space-efficient probabilistic data structure
that is used to test whether an element is a member of a set.
A query returns either "possibly in set" or "definitely not in set".
Elements can be added to the set, but not removed.
'''
__author__ = "Damian Stygar"
class BloomFilter(ABC):
def __init__(self, probability_of_false_positives, expected_number_of_elements):
"""
Size of Bloom Filter is estimated from:
m = (-n*ln(p))/(ln(2))^2,
where m is size of Bloom Filter, n is number of expected elements, p is probability of false positives.
:param probability_of_false_positives: probability of false positives.
:param expected_number_of_elements: expected number of elements to be inserted to Bloom Filter.
"""
if expected_number_of_elements <= 0:
raise ValueError("Expected number of elements should be greater than 0!")
self.size = math.ceil((-expected_number_of_elements * math.log(probability_of_false_positives)) / pow(math.log(2), 2))
if self.size <= 0:
raise ValueError("Size of Bloom Filter should be greater than 0!")
self.expected_number_of_elements = expected_number_of_elements
self.number_of_hash = math.ceil((self.size / self.expected_number_of_elements) * math.log(2))
self.bits_per_element = self.size / self.expected_number_of_elements
self.bit_set = bitarray(self.size)
self.bit_set.setall(0)
self.number_of_elements = 0
def add(self, element):
"""
The add method enables you to insert element to Bloom Filter.
:param element: an element to be inserted to Bloom Filter
"""
hashes = self.create_hashes(str(element).encode('utf-8'), self.number_of_hash)
for h in hashes:
self.bit_set[h] = 1
self.number_of_elements += 1
def add_all(self, collection):
"""
The add_all method enables you to insert each element from collection to Bloom Filter.
:param collection: a collection with elements to be inserted to Bloom Filter.
"""
for item in collection:
self.add(item)
def might_contains(self, element):
"""
The might_contains method enables you to check if Bloom Filter may contains element.
:param element: an element to be checked
:return: True if Bloom Filter can contains element (Remember that can be false positive result).
False if Bloom Filter cannot contains element.
"""
hashes = self.create_hashes(str(element).encode('utf-8'), self.number_of_hash)
for h in hashes:
if self.bit_set[h] == 0:
return False
return True
def might_contains_all(self, collection):
"""
The might_contains_all method enables you to check if Bloom Filter may contains each element from collection.
:param collection: a collection with elements to be checked.
:return: True if Bloom Filter can contains each element (Remember that can be false positive result).
False if Bloom Filter cannot contains each element.
"""
for item in collection:
if not self.might_contains(item):
return False
return True
def get_expected_probability_of_false_positives(self):
"""
The get_expected_probability_of_false_positives method enables you to get expected probability of false positives.
:return: expected probability of false positives.
"""
return self.get_probability_of_false_positives(self.expected_number_of_elements)
def get_current_probability_of_false_positives(self):
"""
The get_current_probability_of_false_positives method enables you to get actual probability of false positives.
:return: actual probability of false positives.
"""
return self.get_probability_of_false_positives(self.number_of_elements)
def get_probability_of_false_positives(self, number_of_elements):
"""
The get_probability_of_false_positives method enables you to get probability of false positives based on parameter.
:param number_of_elements: a number of elements in Bloom Filter.
:return: probability of false positives based on parameter.
"""
return pow(1 - math.exp(-self.number_of_hash * number_of_elements / self.size), self.number_of_hash)
def clear(self):
"""
The clear method enables you to delete all elements from Bloom Filter.
"""
self.number_of_elements = 0
self.bit_set.setall(0)
def is_empty(self):
"""
The is_empty method enables you to check if Bloom Filter is empty.
:return: True, if Bloom Filter is empty.
False, if Bloom Filter is not empty.
"""
return self.number_of_elements == 0
def get_bits_per_element(self):
"""
The get_bits_per_element method enables you to get actual bits per element.
:return: actual bits per element.
:raise ValueError: when actual number of inserted element = 0.
"""
if self.number_of_elements <= 0:
raise ValueError('Bloom Filter is empty!')
return self.size / self.number_of_elements
@abstractmethod
def create_hashes(self, data, number_of_hash):
return []
def get_value_from_generated_hash(self, data, hash_func):
"""
The get_value_from_generated_hash method enables you to get value from created hash.
:param data: data to hash.
:param hash_func: hash function.
:return: value from hash.
"""
h = hash_func(data)
return int(h.hexdigest(), 16)