-
Notifications
You must be signed in to change notification settings - Fork 3
/
bloom_filter.hpp
363 lines (311 loc) · 9.7 KB
/
bloom_filter.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
#ifndef BLOOM_FILTER_H
#define BLOOM_FILTER_H
#include <stdlib.h>
#include <cmath>
#include <sstream>
using namespace std;
#define WORD 32 // for 32 bits in a uint32_t type
#define PRIME 32494189635056477 // long prime number required for hash functions
typedef unsigned long long int ullong_t; // alias for type
/*
[Class Methods]
Class bit_vector:
default constructor: initialises empty array of 0s of given size;
parameterized constructor: initialises empty array of 0s of size _bit_size;
copy constructor: Makes a deep copy of a previous bit_vector of same size;
set(bit_pos, bit_val): sets bit to bit_val at position bit_pos;
reset(): resets all bits to 0;
flip(): flips all bits;
test(i): returns value at index i;
operator[i]: returns value at index i (works only for rvalue);
size(): returns length of bit vector;
count(): returns number of 1s in the vector;
any(): checks if atleast 1 bit is set to 1;
none(): checks if none of the bits are set to 1;
all(): checks if all of the bits are set to 1;
friend ostream operator<<: displays bit vector;
*/
template<ullong_t bit_size>
class bit_vector;
template<ullong_t bit_size>
ostream& operator<<(ostream &output, bit_vector<bit_size>& b);
template<typename T>
class bloom_filter;
template<typename T>
ostream& operator<<(ostream& output, bloom_filter<T>& bf);
template<ullong_t bit_size = 0>
class bit_vector{
private:
ullong_t _bit_size; // Total number of bits in the array
ullong_t _array_size; // Number of 32 bit WORDS in the array [ONLY FOR INTERNAL USE]
uint32_t* _bit_array; // The actual array of 32 bit words
// Internal helper functions
ullong_t array_index(ullong_t bit_pos){
return bit_pos / WORD;
}
ullong_t array_offset(ullong_t bit_pos){
return bit_pos % WORD;
}
public:
// Constructors and destructors
bit_vector() : _bit_size(bit_size) {
_array_size = _bit_size / WORD + 1;
_bit_array = new uint32_t[_array_size];
for(int i = 0; i < _array_size; ++i){
_bit_array[i] = 0;
}
}
bit_vector(ullong_t _bit_size) : _bit_size(_bit_size){
_array_size = _bit_size / WORD + 1;
_bit_array = new uint32_t[_array_size];
for(int i = 0; i < _array_size; ++i){
_bit_array[i] = 0;
}
}
bit_vector(bit_vector<bit_size>& b){
this -> _bit_size = b._bit_size;
this -> _array_size = b._array_size;
this -> _bit_array = new uint32_t[this -> _array_size];
for(int i = 0; i < this -> _bit_size; ++i){
this -> set(i, b.test(i));
}
}
~bit_vector(){
delete[] _bit_array;
}
// Bit operation functions
bool set(ullong_t bit_pos, bool bit_val){
if(bit_pos >= _bit_size){
return false;
}
ullong_t bit_index = array_index(bit_pos);
ullong_t bit_offset = array_offset(bit_pos);
if(bit_val){
_bit_array[bit_index] |= (1 << (WORD - bit_offset - 1));
}
else{
_bit_array[bit_index] &= ~(1 << (WORD - bit_offset - 1));
}
return true;
}
void reset(){
for(ullong_t i = 0; i < this -> _bit_size; ++i){
this -> set(i, 0);
}
}
void flip(){
for(ullong_t i = 0; i < this -> _bit_size; ++i){
if(this -> test(i) == true){
this -> set(i, false);
}
else{
this -> set(i, true);
}
}
}
// Bit access functions
bool test(ullong_t bit_pos){
if(bit_pos >= _bit_size){
return false;
}
ullong_t bit_index = array_index(bit_pos);
ullong_t bit_offset = array_offset(bit_pos);
return _bit_array[bit_index] & (1 << (WORD - bit_offset - 1));
}
bool operator[] (ullong_t bit_pos){
return this -> test(bit_pos);
}
ullong_t size(){
return this -> _bit_size;
}
ullong_t count(){
ullong_t _count = 0;
for(ullong_t i = 0; i < this -> _bit_size; ++i){
if(this -> test(i) == true){
++_count;
}
}
return _count;
}
bool any(){
for(ullong_t i = 0; i < this -> _bit_size; ++i){
if(this -> test(i) == true){
return true;
}
}
return false;
}
bool none(){
if(this -> any() == false){
return true;
}
else{
return false;
}
}
bool all(){
for(ullong_t i = 0; i < this -> _bit_size; ++i){
if(this -> test(i) == false){
return false;
}
}
return true;
}
// friend functions
friend ostream& operator<<<bit_size>(ostream &output, bit_vector<bit_size>& b);
};
// operator overloaded friend function for bit vector
template<ullong_t bit_size>
ostream& operator<<(ostream &output, bit_vector<bit_size>& b){
uint32_t compare = (1 << (WORD - 1));
// printing all but last element of bit array
ullong_t i = 0;
for(i = 0; i < b._array_size - 1; ++i){
uint32_t temp = b._bit_array[i];
for(int j = 0; j < WORD; ++j){
output << ((temp & compare) ? 1 : 0) << " ";
temp = temp << 1;
}
}
// printing last element of the bit array
uint32_t temp = b._bit_array[i];
ullong_t max_offset = b._bit_size % WORD;
for(int j = 0; j < max_offset; ++j){
output << ((temp & compare) ? 1 : 0) << " ";
temp = temp << 1;
}
return output;
}
/*
[Class Methods]
Class Hasher:
static methods:
hash(string value, int seed): hashes a string and returns ullong_t;
hash(int value, int seed): hashes an int and returns ullong_t;
hash(ullong_t value, int seed): hashes a ullong_t and returns ullong_t;
hash(long double value, int seed): hashes a double and returns ullong_t;
*/
class hasher{
public:
static ullong_t hash(string value, int seed){
ullong_t hashed_value = 1;
long long int temp = 1;
for(int i = 0; i < value.size(); ++i){
temp = (temp * 53) % PRIME;
hashed_value = (hashed_value + (temp * (int)value[i] + 47 * seed)) % PRIME;
}
return hashed_value;
}
static ullong_t hash(int value, int seed){
ullong_t hashed_value = 1;
hashed_value = (53 * value + 47 * seed) % PRIME;
return hashed_value;
}
static ullong_t hash(ullong_t value, int seed){
ullong_t hashed_value = 1;
hashed_value = (53 * value + 47 * seed) % PRIME;
return hashed_value;
}
static ullong_t hash(long double value, int seed){
ostringstream strs;
strs << value;
string str = strs.str();
return hash(str, seed);
}
};
/*
[Class Methods]
Class bloom_filter:
parameterized constructor 1: simple init method from parameters _false_positive_rate and _expected_num_elements;
parameterized constructor 2: another init method from _num_hash_fn, _bit_array_size, _expected_num_elements;
copy constructor: Makes a deep copy of a previous bloom_filter of same size;
insert(): insert hashed value k number of times after each hash function with seed;
check(): checks and returns true/false;
get_false_positive_rate(): returns false probability percentage calculated from the # of inserted elements;
get_num_hash_fn(): returns number of hash functions used;
get_bit_array_size(): returns size of bit array;
get_expected_num_elements(): returns the expected number of elements to be inserted;
*/
template<typename T>
class bloom_filter{
private:
long double _false_positive_rate; // false positivity rate
int _num_hash_fn; // number of hash functions (also works as seed)
ullong_t _bit_array_size; // size of bit array required
ullong_t _expected_num_elements; // expected number of elements to insert
bit_vector<>* _bit_vector; // pointer to the bit_vector used internally
// Internal functions
inline void update_fpr(int num_hash_fn, ullong_t bit_array_size, ullong_t expected_num_elements){
_false_positive_rate = pow(( 1.0 - (pow( (1.0-(1.0/bit_array_size)) , (num_hash_fn*expected_num_elements) ) )), (num_hash_fn));
}
public:
// Constructors and destructors
bloom_filter(long double false_positive_rate, ullong_t expected_num_elements)
: _false_positive_rate(false_positive_rate), _expected_num_elements(expected_num_elements){
_bit_array_size = -1 * ((_expected_num_elements * log(_false_positive_rate)) / pow(log(2), 2));
_num_hash_fn = ceil((1.0 * _bit_array_size / _expected_num_elements) * log(2));
_bit_vector = new bit_vector<>(_bit_array_size);
}
bloom_filter(int num_hash_fn, ullong_t bit_array_size, ullong_t expected_num_elements)
: _num_hash_fn(num_hash_fn), _bit_array_size(bit_array_size), _expected_num_elements(expected_num_elements){
update_fpr(_num_hash_fn, _bit_array_size, _expected_num_elements);
_bit_vector = new bit_vector<>(_bit_array_size);
}
bloom_filter(bloom_filter<T> &bf){
this -> _false_positive_rate = bf._false_positive_rate;
this -> _num_hash_fn = bf._num_hash_fn;
this -> _bit_array_size = bf._bit_array_size;
this -> _expected_num_elements = bf._expected_num_elements;
this -> _bit_vector = new bit_vector<>(*(bf._bit_vector));
}
~bloom_filter(){
(*(this->_bit_vector)).~bit_vector();
}
// main functionalities
bool insert(T value){
for(int i = 0; i < _num_hash_fn; ++i){
int seed = i + 1;
ullong_t hashed_value = hasher::hash(value, seed);
ullong_t insert_at_index = hashed_value % _bit_array_size;
bool is_inserted = _bit_vector -> set(insert_at_index, 1);
if(!is_inserted){
return false;
}
}
return true;
}
bool check(T value){
for(int i = 0; i < _num_hash_fn; ++i){
int seed = i + 1;
ullong_t hashed_value = hasher::hash(value, seed);
ullong_t check_at_index = hashed_value % _bit_array_size;
bool check_index = _bit_vector -> test(check_at_index);
if(!check_index){
return false;
}
}
return true;
}
// getters of private attributes
long double get_false_positive_rate(){
return _false_positive_rate;
}
int get_num_hash_fn(){
return _num_hash_fn;
}
ullong_t get_bit_array_size(){
return _bit_array_size;
}
ullong_t get_expected_num_elements(){
return _expected_num_elements;
}
// friend functions
friend ostream& operator<<<T>(ostream& output, bloom_filter<T>& bf);
};
// operator overloaded friend function for bloom filter
template<typename T>
ostream& operator<<(ostream& output, bloom_filter<T>& bf){
output << *(bf._bit_vector);
return output;
}
#endif