Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make pickleable and allow data_array to be retrieved and set #44

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions README.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ The goal of `pybloomfiltermmap3` is simple: to provide a fast, simple, scalable,

There are a couple reasons to use this module:

* It natively uses [mmaped files](http://en.wikipedia.org/wiki/Mmap).
* It natively uses [mmapped files](http://en.wikipedia.org/wiki/Mmap).
* It is fast (see [benchmarks](http://axiak.github.io/pybloomfiltermmap/#benchmarks)).
* It natively does the set things you want a Bloom filter to do.

Expand All @@ -38,10 +38,21 @@ interface and a set interface. As an example:

To create an in-memory filter, simply omit the file location:
```python
>>> cakes = pybloomfilter.BloomFilter(10000, 0.1)
>>> fruit2 = pybloomfilter.BloomFilter(10000, 0.1)
>>> fruit2.add('apple')
>>> 'apple' in fruit2
True
```
*Caveat*: it is currently not possible to persist this filter later.

These in-memory filters can be pickled and reloaded:
```python
>>> import pickle
>>> data = pickle.dumps(fruit2)
>>> fruit3 = pickle.loads(data)
>>> 'apple' in fruit3
True
```
*Caveat*: it is currently not possible to persist this filter later as an mmap file.

## Docs

Expand Down
5 changes: 4 additions & 1 deletion src/bloomfilter.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#include "bloomfilter.h"

BloomFilter *bloomfilter_Create_Malloc(size_t max_num_elem, double error_rate,
BTYPE num_bits, int *hash_seeds, int num_hashes)
BTYPE num_bits, int *hash_seeds, int num_hashes, const char *data)
{
BloomFilter * bf = (BloomFilter *)malloc(sizeof(BloomFilter));
MBArray * array;
Expand All @@ -31,6 +31,9 @@ BloomFilter *bloomfilter_Create_Malloc(size_t max_num_elem, double error_rate,
memset(bf->hash_seeds, 0, sizeof(uint32_t) * 256);
memcpy(bf->hash_seeds, hash_seeds, sizeof(uint32_t) * num_hashes);
array = mbarray_Create_Malloc(num_bits);
if (data) {
memcpy(array->vector, data, num_bits / 8);
}
Comment on lines +34 to +36
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you move this to below the if (!array) check?

if (!array) {
bloomfilter_Destroy(bf);
return NULL;
Expand Down
2 changes: 1 addition & 1 deletion src/bloomfilter.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ typedef struct _BloomFilter BloomFilter;

/* Create a bloom filter without a memory-mapped file backing it */
BloomFilter *bloomfilter_Create_Malloc(size_t max_num_elem, double error_rate,
BTYPE num_bits, int *hash_seeds, int num_hashes);
BTYPE num_bits, int *hash_seeds, int num_hashes, const char * data);

/* Create a bloom filter with a memory-mapped file backing it */
BloomFilter *bloomfilter_Create_Mmap(size_t max_num_elem, double error_rate,
Expand Down
2 changes: 1 addition & 1 deletion src/cbloomfilter.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ cdef extern from "bloomfilter.h":
BloomFilter * bloomfilter_Create_Malloc(long max_num_elem,
double error_rate,
long num_bits,
int * hash_seeds, int num_hashes)
int * hash_seeds, int num_hashes, char * data)
void bloomfilter_Destroy(BloomFilter * bf)
int bloomfilter_Add(BloomFilter * bf, Key * key)
int bloomfilter_Test(BloomFilter * bf, Key * key)
Expand Down
58 changes: 52 additions & 6 deletions src/pybloomfilter.pyx
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
# cython: language_level=3

VERSION = (0, 5, 3)
VERSION = (0, 5, 4)
AUTHOR = "Michael Axiak"

__VERSION__ = VERSION


cimport cbloomfilter
cimport cpython

Expand Down Expand Up @@ -62,6 +61,9 @@ cdef class BloomFilter:
:param list hash_seeds: optionally specify hash seeds to use for the
hashing algorithm. Each hash seed must not exceed 32 bits. The number
of hash seeds will determine the number of hashes performed.
:param bytes data_array: optionally specify the filter data array, same as
given by BloomFilter.data_array. Only valid for in-memory bloomfilters.
If provided, hash_seeds must be given too.

**Note that we do not check capacity.** This is important, because
we want to be able to support logical OR and AND (see :meth:`BloomFilter.union`
Expand All @@ -80,20 +82,35 @@ cdef class BloomFilter:
cdef int _in_memory
cdef int _oflags

def __cinit__(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None):
def __reduce__(self):
    """Describe how pickle should rebuild this filter.

    Returns a ``(constructor, arguments)`` pair: the ``BloomFilter`` class
    and the positional arguments ``(capacity, error_rate, filename, perm,
    hash_seeds, data_array)`` taken from this instance.
    """
    # filename and perm are passed as None, so the rebuilt filter is
    # created from data_array rather than from a backing file.
    ctor_args = (
        self.capacity,
        self.error_rate,
        None,
        None,
        self.hash_seeds,
        self.data_array,
    )
    return (BloomFilter, ctor_args)


def __cinit__(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None, data_array=None):
self._closed = 0
self._in_memory = 0
self._oflags = os.O_RDWR

if capacity is NoConstruct:
return

self._create(capacity, error_rate, filename, perm, hash_seeds)
self._create(capacity, error_rate, filename, perm, hash_seeds, data_array)

def _create(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None):

def _create(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None, data_array=None):
cdef char * seeds
cdef char * data = NULL
cdef long long num_bits

if data_array is not None:
if filename:
raise ValueError("data_array cannot be used for an mmapped filter.")
if hash_seeds is None:
raise ValueError("hash_seeds must be specified if a data_array is provided.")

# Make sure that if the filename is defined, that the
# file exists
if filename and os.path.exists(filename):
Expand Down Expand Up @@ -135,6 +152,10 @@ cdef class BloomFilter:
# Minimum bit vector of 128 bits
num_bits = max(num_hashes * bits_per_hash,128)

# Override the calculated number of bits if we are provided a data array
if data_array is not None:
num_bits = 8 * len(data_array)

# print("k = %d m = %d n = %d p ~= %.8f" % (
# num_hashes, num_bits, capacity,
# (1.0 - math.exp(- float(num_hashes) * float(capacity) / num_bits))
Expand All @@ -153,18 +174,21 @@ cdef class BloomFilter:
num_hashes)
else:
self._in_memory = 1
if data_array is not None:
data = data_array
self._bf = cbloomfilter.bloomfilter_Create_Malloc(capacity,
error_rate,
num_bits,
<int *>seeds,
num_hashes)
num_hashes, <const char *>data)
if self._bf is NULL:
if filename:
raise OSError(errno, '%s: %s' % (os.strerror(errno),
filename))
else:
cpython.PyErr_NoMemory()


def _open(self, filename, mode="rw"):
# Should not overwrite
mode = mode.replace("+", "")
Expand Down Expand Up @@ -202,6 +226,19 @@ cdef class BloomFilter:
arr = (<char *>cbloomfilter.mbarray_CharData(self._bf.array))[start_pos:end_pos]
return int.from_bytes(arr, byteorder="big", signed=False)

@property
def data_array(self):
"""Return the Bloom filter contents as a ``bytes`` object.

The result is the slice of the underlying character data that starts
after the ``preamblebytes``-byte preamble and is ``array.bytes`` long.
The constructor's ``data_array`` parameter is documented as accepting
the value given by this property.
"""
# NOTE(review): presumably raises if the filter has been closed;
# _assert_open is not visible here -- confirm.
self._assert_open()
# Skip the preamble that precedes the filter data in the char buffer.
start_pos = self._bf.array.preamblebytes
end_pos = start_pos + self._bf.array.bytes
# Copy the slice through an unsigned-byte array before converting it
# to an immutable bytes object.
arr = array.array('B')
arr.frombytes(
(<char *>cbloomfilter.mbarray_CharData(self._bf.array))[start_pos:end_pos]
)
return bytes(arr)

@property
def hash_seeds(self):
"""Integer seeds used for the random hashing. Returns a list of integers."""
Expand Down Expand Up @@ -340,7 +377,12 @@ cdef class BloomFilter:
item = item_.encode()
key.shash = item
key.nhash = len(item)
elif isinstance(item_, bytes):
item = item_.encode()
key.shash = item
key.nhash = len(item)
else:
# Warning! Only works reliably for objects whose hash is based on value not memory address.
Copy link
Owner

@prashnts prashnts Oct 30, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I'm not mistaken, this basically means:

class KlassX:
    pass

class KlassHashable:
    def __hash__(self):
        return 123

bf.add(KlassX()) # this won't work.
bf.add(KlassHashable()) # this will "work" but not in a way we want?

right?

item = item_
key.shash = NULL
key.nhash = hash(item)
Expand Down Expand Up @@ -391,6 +433,10 @@ cdef class BloomFilter:
item = item_.encode()
key.shash = item
key.nhash = len(item)
elif isinstance(item_, bytes):
item = item_.encode()
key.shash = item
key.nhash = len(item)
else:
item = item_
key.shash = NULL
Expand Down