diff --git a/README.markdown b/README.markdown
index 20af811..0838f44 100644
--- a/README.markdown
+++ b/README.markdown
@@ -15,7 +15,7 @@ The goal of `pybloomfiltermmap3` is simple: to provide a fast, simple, scalable,
 
 There are a couple reasons to use this module:
 
-* It natively uses [mmaped files](http://en.wikipedia.org/wiki/Mmap).
+* It natively uses [mmapped files](http://en.wikipedia.org/wiki/Mmap).
 * It is fast (see [benchmarks](http://axiak.github.io/pybloomfiltermmap/#benchmarks)).
 * It natively does the set things you want a Bloom filter to do.
 
@@ -38,10 +38,21 @@ interface and an ste interface. As an example:
 To create an in-memory filter, simply omit the file location:
 
 ```python
-    >>> cakes = pybloomfilter.BloomFilter(10000, 0.1)
+    >>> fruit2 = pybloomfilter.BloomFilter(10000, 0.1)
+    >>> fruit2.add('apple')
+    >>> 'apple' in fruit2
+    True
 ```
 
-*Caveat*: it is currently not possible to persist this filter later.
+These in-memory filters can be pickled and reloaded:
+```python
+    >>> import pickle
+    >>> data = pickle.dumps(fruit2)
+    >>> fruit3 = pickle.loads(data)
+    >>> 'apple' in fruit3
+    True
+```
+*Caveat*: it is currently not possible to persist this filter later as an mmap file.
 
 ## Docs
 
diff --git a/src/bloomfilter.c b/src/bloomfilter.c
index cd89d18..baeefa7 100644
--- a/src/bloomfilter.c
+++ b/src/bloomfilter.c
@@ -9,7 +9,7 @@
 #include "bloomfilter.h"
 
 BloomFilter *bloomfilter_Create_Malloc(size_t max_num_elem, double error_rate,
-                                BTYPE num_bits, int *hash_seeds, int num_hashes)
+                                BTYPE num_bits, int *hash_seeds, int num_hashes, const char *data)
 {
     BloomFilter * bf = (BloomFilter *)malloc(sizeof(BloomFilter));
     MBArray * array;
@@ -31,6 +31,9 @@ BloomFilter *bloomfilter_Create_Malloc(size_t max_num_elem, double error_rate,
     memset(bf->hash_seeds, 0, sizeof(uint32_t) * 256);
     memcpy(bf->hash_seeds, hash_seeds, sizeof(uint32_t) * num_hashes);
     array = mbarray_Create_Malloc(num_bits);
+    if (array && data) { /* array is NULL on allocation failure; that case is handled just below */
+        memcpy(array->vector, data, num_bits / 8);
+    }
     if (!array) {
         bloomfilter_Destroy(bf);
         return NULL;
diff --git a/src/bloomfilter.h b/src/bloomfilter.h
index e974c58..fde99ff 100644
--- a/src/bloomfilter.h
+++ b/src/bloomfilter.h
@@ -28,7 +28,7 @@ typedef struct _BloomFilter BloomFilter;
 
 /* Create a bloom filter without a memory-mapped file backing it */
 BloomFilter *bloomfilter_Create_Malloc(size_t max_num_elem, double error_rate,
-                                       BTYPE num_bits, int *hash_seeds, int num_hashes);
+                                       BTYPE num_bits, int *hash_seeds, int num_hashes, const char * data);
 
 /* Create a bloom filter with a memory-mapped file backing it */
 BloomFilter *bloomfilter_Create_Mmap(size_t max_num_elem, double error_rate,
diff --git a/src/cbloomfilter.pxd b/src/cbloomfilter.pxd
index e99f346..6a4b6fd 100644
--- a/src/cbloomfilter.pxd
+++ b/src/cbloomfilter.pxd
@@ -48,7 +48,7 @@ cdef extern from "bloomfilter.h":
     BloomFilter * bloomfilter_Create_Malloc(long max_num_elem,
                                            double error_rate,
                                            long num_bits,
-                                           int * hash_seeds, int num_hashes)
+                                           int * hash_seeds, int num_hashes, char * data)
     void bloomfilter_Destroy(BloomFilter * bf)
     int bloomfilter_Add(BloomFilter * bf, Key * key)
     int bloomfilter_Test(BloomFilter * bf, Key * key)
diff --git a/src/pybloomfilter.pyx b/src/pybloomfilter.pyx
index 1b5b31b..a810b86 100644
--- a/src/pybloomfilter.pyx
+++ b/src/pybloomfilter.pyx
@@ -1,11 +1,10 @@
 # cython: language_level=3
 
-VERSION = (0, 5, 3)
+VERSION = (0, 5, 4)
 AUTHOR = "Michael Axiak"
 
 __VERSION__ = VERSION
 
-
 cimport cbloomfilter
 cimport cpython
 
@@ -62,6 +61,9 @@ cdef class BloomFilter:
     :param list hash_seeds: optionally specify hash seeds to use for the
         hashing algorithm. Each hash seed must not exceed 32 bits. The number
         of hash seeds will determine the number of hashes performed.
+    :param bytes data_array: optionally specify the filter data array, same as
+        given by BloomFilter.data_array. Only valid for in-memory bloomfilters.
+        If provided, hash_seeds must be given too.
 
     **Note that we do not check capacity.** This is important, because
     we want to be able to support logical OR and AND (see :meth:`BloomFilter.union`
@@ -80,7 +82,14 @@ cdef class BloomFilter:
     cdef int _in_memory
     cdef int _oflags
 
-    def __cinit__(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None):
+    def __reduce__(self):
+        """Makes an in-memory BloomFilter pickleable."""
+        constructor = BloomFilter
+        args = (self.capacity, self.error_rate, None, None, self.hash_seeds, self.data_array)
+        return (constructor, args)
+
+
+    def __cinit__(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None, data_array=None):
         self._closed = 0
         self._in_memory = 0
         self._oflags = os.O_RDWR
@@ -88,12 +97,20 @@ cdef class BloomFilter:
         if capacity is NoConstruct:
             return
 
-        self._create(capacity, error_rate, filename, perm, hash_seeds)
+        self._create(capacity, error_rate, filename, perm, hash_seeds, data_array)
 
-    def _create(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None):
+
+    def _create(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None, data_array=None):
         cdef char * seeds
+        cdef char * data = NULL
         cdef long long num_bits
 
+        if data_array is not None:
+            if filename:
+                raise ValueError("data_array cannot be used for an mmapped filter.")
+            if hash_seeds is None:
+                raise ValueError("hash_seeds must be specified if a data_array is provided.")
+
         # Make sure that if the filename is defined, that the
         # file exists
         if filename and os.path.exists(filename):
@@ -135,6 +152,10 @@ cdef class BloomFilter:
         # Minimum bit vector of 128 bits
         num_bits = max(num_hashes * bits_per_hash,128)
 
+        # Override the calculated bit count if we are provided a data array
+        if data_array is not None:
+            num_bits = 8 * len(data_array)
+
         # print("k = %d m = %d n = %d p ~= %.8f" % (
         #     num_hashes, num_bits, capacity,
         #     (1.0 - math.exp(- float(num_hashes) * float(capacity) / num_bits))
@@ -153,11 +174,13 @@ cdef class BloomFilter:
                                                        num_hashes)
         else:
             self._in_memory = 1
+            if data_array is not None:
+                data = data_array
             self._bf = cbloomfilter.bloomfilter_Create_Malloc(capacity,
                                                        error_rate,
                                                        num_bits,
                                                        seeds,
-                                                       num_hashes)
+                                                       num_hashes, data)
         if self._bf is NULL:
             if filename:
                 raise OSError(errno, '%s: %s' % (os.strerror(errno),
@@ -165,6 +188,7 @@ cdef class BloomFilter:
             else:
                 cpython.PyErr_NoMemory()
 
+
     def _open(self, filename, mode="rw"):
         # Should not overwrite
         mode = mode.replace("+", "")
@@ -202,6 +226,19 @@ cdef class BloomFilter:
         arr = (cbloomfilter.mbarray_CharData(self._bf.array))[start_pos:end_pos]
         return int.from_bytes(arr, byteorder="big", signed=False)
 
+    @property
+    def data_array(self):
+        """Bytes array of the Bloom filter contents.
+        """
+        self._assert_open()
+        start_pos = self._bf.array.preamblebytes
+        end_pos = start_pos + self._bf.array.bytes
+        arr = array.array('B')
+        arr.frombytes(
+            (cbloomfilter.mbarray_CharData(self._bf.array))[start_pos:end_pos]
+        )
+        return bytes(arr)
+
     @property
     def hash_seeds(self):
         """Integer seeds used for the random hashing. Returns a list of integers."""
@@ -340,7 +377,12 @@ cdef class BloomFilter:
             item = item_.encode()
             key.shash = item
             key.nhash = len(item)
+        elif isinstance(item_, bytes):
+            item = item_  # bytes are already encoded; bytes has no .encode()
+            key.shash = item
+            key.nhash = len(item)
         else:
+            # Warning! Only works reliably for objects whose hash is based on value not memory address.
             item = item_
             key.shash = NULL
             key.nhash = hash(item)
@@ -391,6 +433,10 @@ cdef class BloomFilter:
             item = item_.encode()
             key.shash = item
             key.nhash = len(item)
+        elif isinstance(item_, bytes):
+            item = item_  # bytes are already encoded; bytes has no .encode()
+            key.shash = item
+            key.nhash = len(item)
         else:
             item = item_
             key.shash = NULL