-
Notifications
You must be signed in to change notification settings - Fork 0
/
Filter.pm
341 lines (221 loc) · 6.42 KB
/
Filter.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
package Bloom::Filter;
use strict;
use warnings;
use Carp;
use Digest::SHA1 qw/sha1 sha1_base64/;
our $VERSION = '1.0';
=head1 NAME
Bloom::Filter - Sample Perl Bloom filter implementation
=head1 DESCRIPTION
A Bloom filter is a probabilistic algorithm for doing existence tests
in less memory than a full list of keys would require. The tradeoff to
using Bloom filters is a certain configurable risk of false positives.
This module implements a simple Bloom filter with configurable capacity
and false positive rate. Bloom filters were first described in a 1970
paper by Burton Bloom, see L<http://portal.acm.org/citation.cfm?id=362692&dl=ACM&coll=portal>.
=head1 SYNOPSIS
use Bloom::Filter;
my $bf = Bloom::Filter->new( capacity => 10, error_rate => .001 );
$bf->add( @keys );
while ( <> ) {
chomp;
print "Found $_\n" if $bf->check( $_ );
}
=head1 CONSTRUCTORS
=over
=item new %PARAMS
Create a brand new instance. Allowable params are C<error_rate>, C<capacity>.
=cut
# Constructor. Takes the class name followed by key/value parameters
# (error_rate, capacity), layers them over the built-in defaults, forces
# the internal bookkeeping fields back to their initial values, and then
# calls init() on the freshly blessed object before returning it.
sub new {
    my ( $class, %params ) = @_;

    # Caller-tunable settings: anything passed in wins over these defaults.
    my %settings = (
        error_rate => 0.001,
        capacity   => 100,
        %params,
    );

    # Internal state always starts from scratch, even when the caller
    # supplied one of these keys in %params.
    my %internals = (
        key_count      => 0,
        filter_length  => 0,
        num_hash_funcs => 0,
        salts          => [],
    );

    my $self = bless { %settings, %internals }, $class;
    $self->init();

    return $self;
}
=item init
Calculates the best number of hash functions and optimum filter length,
creates some random salts, and generates a blank bit vector. Called
automatically by constructor.
=cut
# Validates the configuration, sizes the filter, draws the salts and
# allocates a zeroed bit vector. Called automatically by new().
#
# Reads:   $self->{capacity}, $self->{error_rate}
# Writes:  $self->{num_hash_funcs}, {filter_length}, {salts}, {filter},
#          {blankvec}
# Returns: 1 on success.
# Croaks:  when capacity is not a number greater than zero, or when
#          error_rate is not strictly between 0 and 1.
sub init {
    my ($self) = @_;

    my $capacity   = $self->{capacity};
    my $error_rate = $self->{error_rate};

    # Compare numerically instead of testing truthiness: a negative (or
    # otherwise non-positive) value is "true" in Perl but would yield a
    # meaningless filter length further down.
    croak "Capacity must be greater than zero"
        unless defined $capacity && $capacity > 0;
    croak "Error rate must be greater than zero"
        unless defined $error_rate && $error_rate > 0;
    croak "Error rate cannot exceed 1"
        unless $error_rate < 1;

    my ( $length, $num_funcs ) =
        $self->_calculate_shortest_filter_length( $capacity, $error_rate );
    $self->{num_hash_funcs} = $num_funcs;
    $self->{filter_length}  = $length;

    # One distinct random salt per hash function. Using the random values
    # as hash keys means a repeated rand() result is simply drawn again.
    my %collisions;
    while ( scalar( keys %collisions ) < $num_funcs ) {
        $collisions{ rand() }++;
    }
    $self->{salts} = [ keys %collisions ];

    # Empty filter: ceil(length / 8) zero bytes. This is the same string
    # pack( "b*", '0' x $length ) produces, without first building a
    # temporary string holding one character per bit.
    $self->{filter} = "\0" x int( ( $length + 7 ) / 8 );

    # All-zero 32-bit vector used as the XOR seed when hashing keys.
    $self->{blankvec} = pack( "N", 0 );

    return 1;
}
=back
=head1 ACCESSORS
=over
=item capacity
Returns the total capacity of the Bloom filter
=cut
# Read-only accessor for the 'capacity' field of the object.
sub capacity {
    my ($self) = @_;
    return $self->{capacity};
}
=item error_rate
Returns the configured maximum error rate
=cut
# Read-only accessor for the 'error_rate' field of the object.
sub error_rate {
    my ($self) = @_;
    return $self->{error_rate};
}
=item length
Returns the length of the Bloom filter in bits
=cut
# Read-only accessor for the 'filter_length' field (filter size in bits).
# Shares its name with the length() builtin, so it is only reachable as a
# method call ($bf->length) or via an explicit &length(...) call.
sub length {
    my ($self) = @_;
    return $self->{filter_length};
}
=item key_count
Returns the number of items currently stored in the filter
=cut
# Read-only accessor for the 'key_count' field (keys added so far).
sub key_count {
    my ($self) = @_;
    return $self->{key_count};
}
=item on_bits
Returns the number of 'on' bits in the filter
=cut
# Counts the bits currently set in the filter.
# Returns nothing (empty list / undef) when the object has no filter yet;
# otherwise returns the 32-bit checksum of the bit string, i.e. the
# number of '1' bits.
sub on_bits {
    my ($self) = @_;

    if ( $self->{filter} ) {
        return unpack( "%32b*", $self->{filter} );
    }

    return;
}
=item salts
Returns the list of salts used to create the hash functions
=cut
# Returns the salts stored on the object as a flat list.
# Returns nothing when the 'salts' field is missing or is anything other
# than a plain (unblessed) array reference.
sub salts {
    my ($self) = @_;

    my $stored = $self->{salts};
    return @{$stored} if ref($stored) eq 'ARRAY';

    return;
}
=back
=head1 PUBLIC METHODS
=over
=item add @KEYS
Adds the list of keys to the filter. Will fail, return C<undef> and complain
if the number of keys in the filter exceeds the configured capacity. Keys
processed before the limit was reached remain in the filter.
=cut
# Adds each key in @keys to the filter, in order.
#
# Returns nothing when called without keys. Croaks when the object has no
# salts. For every key, the capacity is checked first: once key_count has
# reached capacity, a warning is issued via carp and the method returns
# nothing, leaving any keys already processed in the filter. Returns 1
# when every key was added.
sub add {
    my ( $self, @keys ) = @_;

    return if !@keys;

    # Without salts there are no hash functions to apply.
    croak "No salts found, cannot make bitmask"
        unless @{ $self->{salts} };

    for my $key (@keys) {
        if ( $self->{key_count} >= $self->{capacity} ) {
            carp "Exceeded filter capacity";
            return;
        }

        # Switch on every bit position this key hashes to.
        for my $cell ( @{ $self->_get_cells($key) } ) {
            vec( $self->{filter}, $cell, 1 ) = 1;
        }

        $self->{key_count}++;
    }

    return 1;
}
=item check @KEYS
Checks the provided key list against the Bloom filter,
and returns a list of equivalent length, with true or
false values depending on whether there was a match.
In scalar context, only the result for the first key is returned.
=cut
# Tests each key in @keys against the filter.
#
# Returns nothing when called without keys. Otherwise produces one value
# per key: 1 when every bit position for that key is set, 0 as soon as one
# of them is found clear. In list context the whole list is returned; in
# scalar context only the verdict for the first key.
sub check {
    my ( $self, @keys ) = @_;

    return unless @keys;

    my @verdicts = map {
        my $key = $_;
        my $hit = 1;

        # Stop probing at the first clear bit: the key cannot be present.
        for my $cell ( @{ $self->_get_cells($key) } ) {
            $hit = vec( $self->{filter}, $cell, 1 ) or last;
        }

        $hit;
    } @keys;

    return wantarray ? @verdicts : $verdicts[0];
}
=back
=head1 INTERNAL METHODS
=over
=item _calculate_shortest_filter_length CAPACITY ERR_RATE
Given a desired error rate and maximum capacity, returns the optimum
combination of vector length (in bits) and number of hash functions
to use in building the filter, where "optimum" means shortest vector length.
=cut
# Searches hash-function counts k = 1..100 for the one that needs the
# fewest bits to hold $num_keys keys at the requested $error_rate, using
#
#     m = -k * n / ln( 1 - p ** (1/k) )
#
# Returns the two-element list ( bits, k ), where bits is the winning m
# truncated to an integer and incremented by one. Ties keep the smaller k.
sub _calculate_shortest_filter_length {
    my ( $self, $num_keys, $error_rate ) = @_;

    my ( $shortest, $winning_k ) = ( undef, 1 );

    for my $k ( 1 .. 100 ) {
        my $bits = ( -1 * $k * $num_keys ) /
            ( log( 1 - ( $error_rate**( 1 / $k ) ) ) );

        # Only a strictly shorter vector replaces the current best.
        next if defined $shortest && !( $bits < $shortest );

        ( $shortest, $winning_k ) = ( $bits, $k );
    }

    return ( int($shortest) + 1, $winning_k );
}
=item _get_cells KEY
Given a key, hashes it using the list of salts and returns
an array of cell indexes corresponding to the key.
=cut
# Maps a key to its bit positions in the filter, one per salt.
#
# For each salt, the SHA-1 digest of key and salt is cut into 32-bit
# big-endian words, the words are XORed together (seeded with the
# object's blank vector), and the resulting unsigned integer is reduced
# modulo the filter length.
#
# Returns a reference to the array of bit offsets, in salt order.
# Croaks when the filter length is unset/zero or when there are no salts.
sub _get_cells {
    my ( $self, $key ) = @_;

    croak "Filter length is undefined" unless $self->{filter_length};
    croak "No salts found, cannot make bitmask"
        unless @{ $self->{salts} };

    my @offsets = map {
        my $salt   = $_;
        my $digest = sha1( $key, $salt );

        # Fold the 160-bit digest down to 32 bits with a string XOR of
        # its five packed words.
        my $folded = $self->{blankvec};
        for my $word ( unpack( "N*", $digest ) ) {
            $folded ^= pack( "N", $word );
        }

        unpack( "N", $folded ) % $self->{filter_length};
    } @{ $self->{salts} };

    return \@offsets;
}
=back
=head1 AUTHOR
Maciej Ceglowski E<lt>[email protected]E<gt>
=head1 CHANGELOG
Feb 2007 big speedup by Dmitriy Ryaboy E<lt>[email protected]E<gt> (thanks!)
=head1 COPYRIGHT AND LICENSE
(c) 2004 Maciej Ceglowski
This is free software, distributed under version 2
of the GNU Public License (GPL).
=cut
1;