Skip to content

Commit 9bb7346

Browse files
committed
Implement a Bloom Filter (Miscellaneous) Data Structure
1 parent f34b7d6 commit 9bb7346

File tree

4 files changed

+258
-0
lines changed

4 files changed

+258
-0
lines changed

AUTHORS

+1
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,4 @@ Pratik Goyal <[email protected]>
1111
Jay Thorat <[email protected]>
1212
Rajveer Singh Bharadwaj <[email protected]>
1313
Kishan Ved <[email protected]>
14+
Akshat Shukla <[email protected]>

pydatastructs/miscellaneous_data_structures/__init__.py

+6
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
queue,
77
disjoint_set,
88
sparse_table,
9+
bloom_filter,
910
_extensions,
1011
)
1112

@@ -50,3 +51,8 @@
5051
Multiset
5152
)
5253
__all__.extend(multiset.__all__)
54+
55+
from .bloom_filter import (
56+
BloomFilter
57+
)
58+
__all__.extend(bloom_filter.__all__)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
from pydatastructs.linear_data_structures import OneDimensionalArray # TODO: Use a C++ Backend for the BitArray class
2+
from pydatastructs.utils.misc_util import (
3+
Backend, raise_if_backend_is_not_python)
4+
import hashlib
5+
import math
6+
7+
8+
__all__ = ['BloomFilter']
9+
10+
class BloomFilter(object):
    """
    Represents a Bloom Filter for Probabilistic Membership testing

    Parameters
    ==========

    capacity: int
        The capacity of the Bloom Filter, i.e., the maximum number of
        elements the filter can hold without exceeding the error rate.
        Must be greater than 0.
        Optional, by default, 10**5.

    error_rate: float
        The error rate of the Bloom Filter, i.e., the probability of
        false positives. Must lie strictly between 0 and 1.
        Optional, by default, 0.005.

    array_size: int
        The number of bits in the underlying bit array.
        Optional, by default, it is computed as
        ceil(capacity * |ln(error_rate)| / ln(2)**2).

    num_hashes: int
        The number of bit positions derived from each key.
        Must be greater than 0 when given.
        Optional, by default, it is computed as
        ceil((array_size / capacity) * ln(2)).

    hash_name: str
        The name of the algorithm passed to ``hashlib.new``.
        Optional, by default, 'sha512'.

    init_elements: iterable
        Keys which are added to the filter right after construction.
        Optional, by default, no keys are added.

    backend: pydatastructs.Backend
        The backend to be used.
        Optional, by default, the Python backend is used.
        NOTE(review): presumably any other backend is rejected by
        ``raise_if_backend_is_not_python`` - confirm against that helper.

    Examples
    ========

    >>> from pydatastructs import BloomFilter
    >>> bf = BloomFilter(capacity=10**5, error_rate=0.005)
    >>> bf.add(1)
    >>> 1 in bf
    True
    >>> bf.add("Hello")
    >>> "Hello" in bf
    True
    >>> "hello" in bf
    False
    >>> len(bf)
    2

    References
    ==========

    .. [1] https://en.wikipedia.org/wiki/Bloom_filter
    """

    # Number of bits packed into every integer slot of the bit array.
    BITS_PER_SLICE = 32

    __slots__ = ['barray', 'array_size', 'capacity', 'num_hashes', 'hash_name', 'n_ele']

    def __new__(cls, capacity=10**5, error_rate=0.005, array_size=None, num_hashes=None, hash_name='sha512', init_elements=None, **kwargs):
        raise_if_backend_is_not_python(
            cls, kwargs.get('backend', Backend.PYTHON))

        if not (1 > error_rate > 0):
            raise ValueError("Error Rate must be between 0 and 1.")
        if not capacity > 0:
            raise ValueError("Capacity must be greater than 0")
        # With no hash positions every membership test would trivially
        # succeed, so reject such a configuration up front.
        if num_hashes is not None and num_hashes <= 0:
            raise ValueError("Number of hashes must be greater than 0")

        obj = object.__new__(cls)
        if array_size is None:
            # Bit count for the requested capacity and error rate:
            # m = ceil(n * |ln(p)| / ln(2)**2)
            array_size = math.ceil((capacity * abs(math.log(error_rate))) / (math.log(2) ** 2))
        obj.array_size = array_size
        obj.barray = BitArray(obj.array_size, cls.BITS_PER_SLICE)
        obj.hash_name = hash_name
        obj.capacity = capacity
        obj.n_ele = 0
        if num_hashes is None:
            # Hash count derived from the bits-per-element ratio:
            # k = ceil((m / n) * ln(2))
            num_hashes = math.ceil((obj.array_size / capacity) * math.log(2))
        obj.num_hashes = num_hashes

        if init_elements is not None:
            for elem in init_elements:
                obj.add(elem)

        return obj

    @classmethod
    def methods(cls):
        return ['add', '__new__', 'contains', '__contains__']

    def add(self, key):
        """
        Adds the key to the Bloom Filter

        Every call increments the element count, even when the same
        key is added more than once.

        Parameters
        ==========

        key: str | bytes | int | float | bool
            The key to be added to the Bloom Filter

        Raises
        ======

        ValueError
            When the filter already holds ``capacity`` elements.

        TypeError
            When the key is of an unsupported type.
        """

        if self.n_ele >= self.capacity:
            raise ValueError("Bloom Filter is full")

        key = self._serialize(key)
        for h in self._hashes(key):
            self.barray[h] = 1
        self.n_ele += 1

    def contains(self, key):
        """
        Checks if the Bloom Filter contains the key

        Parameters
        ==========

        key: str | bytes | int | float | bool
            The key to be checked for membership

        Returns
        =======

        bool
            False when at least one bit position of the key is unset,
            True when all of them are set.

        Raises
        ======

        TypeError
            When the key is of an unsupported type.
        """

        key = self._serialize(key)
        for h in self._hashes(key):
            if self.barray[h] == 0:
                return False
        return True

    def _serialize(self, data):
        """
        Converts a key to the bytes which are fed to the hash function.

        bytes are used unchanged and str is UTF-8 encoded. int, float
        and bool go through ``str()`` first, so 1, 1.0 and True are
        three different keys (b'1', b'1.0' and b'True').
        """
        if isinstance(data, bytes):
            return data
        elif isinstance(data, str):
            return data.encode('utf-8')
        elif isinstance(data, (int, float, bool)):
            return str(data).encode('utf-8')
        else:
            raise TypeError(f"Data type {type(data)} not supported")

    def _hashes(self, data: bytes):
        """
        Returns the ``num_hashes`` bit positions for the serialized key.

        The key is hashed together with an increasing salt prefix. Every
        digest is cut into 4-byte big-endian chunks and each chunk is
        reduced modulo ``array_size``. Further salts are used until
        enough positions have been collected.
        """
        result = []
        salt = 0

        while len(result) < self.num_hashes:
            hasher = hashlib.new(self.hash_name)
            # Salts up to 255 are encoded as a single byte; larger salts
            # grow to as many bytes as needed instead of overflowing.
            salt_width = max(1, (salt.bit_length() + 7) // 8)
            hasher.update(salt.to_bytes(salt_width, "big"))
            hasher.update(data)
            digest = hasher.digest()
            salt += 1

            for i in range(0, len(digest), 4):
                if len(result) >= self.num_hashes:
                    break
                h = int.from_bytes(digest[i:i+4], byteorder="big", signed=False) % self.array_size
                result.append(h)

        return result

    def __len__(self):
        # Number of successful add() calls, not the number of distinct keys.
        return self.n_ele

    def __contains__(self, data):
        return self.contains(data)
155+
156+
157+
158+
class BitArray():
    """
    A fixed-size array of bits packed into integer slots.

    Parameters
    ==========

    size: int
        The number of addressable bits. Must be greater than 0.

    bits_per_slice: int
        The number of bits stored in each integer slot.
        Must be greater than 0.
        Optional, by default, 32.

    Raises
    ======

    ValueError
        When size or bits_per_slice is not greater than 0.
    """

    def __init__(self, size, bits_per_slice=32):
        if bits_per_slice <= 0:
            raise ValueError("Bits per slice must be greater than 0")
        if size <= 0:
            raise ValueError("Size must be greater than 0")

        self.size = size
        # Number of integer slots needed to hold `size` bits (rounded up).
        self.byte_size = (size + bits_per_slice - 1) // bits_per_slice
        self.b = bits_per_slice
        # One integer per slot is enough; the largest slot index used is
        # (size - 1) // bits_per_slice, which equals byte_size - 1.
        self.array = OneDimensionalArray(int, self.byte_size, init=0)

    def _locate(self, i):
        # Maps a bit index to its (slot index, bit offset inside the slot).
        # Negative indices are rejected here rather than being left to
        # whatever the backing array does with a negative slot index.
        if i < 0 or i >= self.size:
            raise IndexError("Index out of range")
        return i // self.b, i % self.b

    def __setitem__(self, i, value):
        """
        Sets bit i when value is truthy, clears it otherwise.

        Raises IndexError when i is outside [0, size).
        """
        byte_index, bit_index = self._locate(i)

        current_value = self.array[byte_index]

        if value:
            current_value |= (1 << bit_index)
        else:
            current_value &= ~(1 << bit_index)

        self.array[byte_index] = current_value

    def __getitem__(self, i):
        """
        Returns bit i as 0 or 1.

        Raises IndexError when i is outside [0, size).
        """
        byte_index, bit_index = self._locate(i)

        return (self.array[byte_index] >> bit_index) & 1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
from pydatastructs.miscellaneous_data_structures import BloomFilter
2+
from pydatastructs.miscellaneous_data_structures.bloom_filter import BitArray
3+
from pydatastructs.utils.raises_util import raises
4+
from pydatastructs.utils.misc_util import _check_type, Backend
5+
6+
def test_BloomFilter():
    # An error rate of 0 must be rejected at construction time.
    assert raises(ValueError, lambda: BloomFilter(capacity=10**5, error_rate=0))

    # Filter sized from capacity and error rate only.
    sized_filter = BloomFilter(capacity=10**5, error_rate=0.005)
    sized_filter.add(1)
    assert 1 in sized_filter
    sized_filter.add("Q")
    assert "Q" in sized_filter
    assert "q" not in sized_filter
    assert len(sized_filter) == 2
    assert 1 in sized_filter

    # Booleans are keys of their own.
    sized_filter.add(True)
    assert True in sized_filter
    assert False not in sized_filter

    # Filter with explicit bit count, hash count and hash algorithm.
    tuned_filter = BloomFilter(capacity=10**2, error_rate=0.002, array_size=10**6, num_hashes=5, hash_name='md5')
    tuned_filter.add(1.0)
    # The float 1.0 and the int 1 are different keys.
    assert 1 not in tuned_filter
    tuned_filter.add("Q")
    assert "p" not in tuned_filter
    assert "Q" in tuned_filter
    tuned_filter.add(False)
    assert len(tuned_filter) == 3
    assert False in tuned_filter
31+
32+
33+
34+
def test_BitArray():
    bits = BitArray(10, bits_per_slice=8)

    def snapshot():
        # Reads all ten bits in index order.
        return [bits[pos] for pos in range(10)]

    # A fresh array has every bit cleared.
    assert snapshot() == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

    # Set bits one at a time; index 9 lives in the second 8-bit slot.
    bits[0] = 1
    assert snapshot() == [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    bits[1] = 1
    assert snapshot() == [1, 1, 0, 0, 0, 0, 0, 0, 0, 0]
    bits[9] = 1
    assert snapshot() == [1, 1, 0, 0, 0, 0, 0, 0, 0, 1]

    # Clear them again in the same order.
    bits[0] = 0
    assert snapshot() == [0, 1, 0, 0, 0, 0, 0, 0, 0, 1]
    bits[1] = 0
    assert snapshot() == [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
    bits[9] = 0
    assert snapshot() == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

    # Reads outside the array must fail on both sides.
    assert raises(IndexError, lambda: bits[10])
    assert raises(IndexError, lambda: bits[-1])

    # A slot width of zero is not a valid configuration.
    assert raises(ValueError, lambda: BitArray(10, bits_per_slice=0))

0 commit comments

Comments
 (0)