From a5130346927cd6da79089deccbd4f1fedfe3a01b Mon Sep 17 00:00:00 2001 From: Stephen Simmons Date: Mon, 19 Oct 2020 22:50:22 +0100 Subject: [PATCH 1/4] Make pickleable and allow data_array to be retrieved and set --- src/bloomfilter.c | 5 ++++- src/bloomfilter.h | 2 +- src/cbloomfilter.pxd | 2 +- src/pybloomfilter.pyx | 50 +++++++++++++++++++++++++++++++++++++++---- 4 files changed, 52 insertions(+), 7 deletions(-) diff --git a/src/bloomfilter.c b/src/bloomfilter.c index cd89d18..baeefa7 100644 --- a/src/bloomfilter.c +++ b/src/bloomfilter.c @@ -9,7 +9,7 @@ #include "bloomfilter.h" BloomFilter *bloomfilter_Create_Malloc(size_t max_num_elem, double error_rate, - BTYPE num_bits, int *hash_seeds, int num_hashes) + BTYPE num_bits, int *hash_seeds, int num_hashes, const char *data) { BloomFilter * bf = (BloomFilter *)malloc(sizeof(BloomFilter)); MBArray * array; @@ -31,6 +31,9 @@ BloomFilter *bloomfilter_Create_Malloc(size_t max_num_elem, double error_rate, memset(bf->hash_seeds, 0, sizeof(uint32_t) * 256); memcpy(bf->hash_seeds, hash_seeds, sizeof(uint32_t) * num_hashes); array = mbarray_Create_Malloc(num_bits); + if (array && data) { /* array may be NULL on allocation failure; that case is handled just below */ + memcpy(array->vector, data, num_bits / 8); + } if (!array) { bloomfilter_Destroy(bf); return NULL; diff --git a/src/bloomfilter.h b/src/bloomfilter.h index e974c58..fde99ff 100644 --- a/src/bloomfilter.h +++ b/src/bloomfilter.h @@ -28,7 +28,7 @@ typedef struct _BloomFilter BloomFilter; /* Create a bloom filter without a memory-mapped file backing it */ BloomFilter *bloomfilter_Create_Malloc(size_t max_num_elem, double error_rate, - BTYPE num_bits, int *hash_seeds, int num_hashes); + BTYPE num_bits, int *hash_seeds, int num_hashes, const char * data); /* Create a bloom filter with a memory-mapped file backing it */ BloomFilter *bloomfilter_Create_Mmap(size_t max_num_elem, double error_rate, diff --git a/src/cbloomfilter.pxd b/src/cbloomfilter.pxd index e99f346..6a4b6fd 100644 --- a/src/cbloomfilter.pxd +++ b/src/cbloomfilter.pxd @@ -48,7 +48,7 @@ 
cdef extern from "bloomfilter.h": BloomFilter * bloomfilter_Create_Malloc(long max_num_elem, double error_rate, long num_bits, - int * hash_seeds, int num_hashes) + int * hash_seeds, int num_hashes, char * data) void bloomfilter_Destroy(BloomFilter * bf) int bloomfilter_Add(BloomFilter * bf, Key * key) int bloomfilter_Test(BloomFilter * bf, Key * key) diff --git a/src/pybloomfilter.pyx b/src/pybloomfilter.pyx index 1b5b31b..91966c9 100644 --- a/src/pybloomfilter.pyx +++ b/src/pybloomfilter.pyx @@ -5,6 +5,8 @@ AUTHOR = "Michael Axiak" __VERSION__ = VERSION +from libc.stdlib cimport malloc +from libc.string cimport strncpy cimport cbloomfilter cimport cpython @@ -62,6 +64,7 @@ cdef class BloomFilter: :param list hash_seeds: optionally specify hash seeds to use for the hashing algorithm. Each hash seed must not exceed 32 bits. The number of hash seeds will determine the number of hashes performed. + :param bytes data_array: optionally specify data array, same as . **Note that we do not check capacity.** This is important, because we want to be able to support logical OR and AND (see :meth:`BloomFilter.union` @@ -80,7 +83,14 @@ cdef class BloomFilter: cdef int _in_memory cdef int _oflags - def __cinit__(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None): + def __reduce__(self): + """Makes an in-memory BloomFilter pickleable.""" + callable = BloomFilter + args = (self.capacity, self.error_rate, None, None, self.hash_seeds, self.data_array) + return (callable, args) + + + def __cinit__(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None, data_array=None): self._closed = 0 self._in_memory = 0 self._oflags = os.O_RDWR @@ -88,10 +98,12 @@ cdef class BloomFilter: if capacity is NoConstruct: return - self._create(capacity, error_rate, filename, perm, hash_seeds) + self._create(capacity, error_rate, filename, perm, hash_seeds, data_array) + - def _create(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None): + def 
_create(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None, data_array=None): cdef char * seeds + cdef char * data = NULL cdef long long num_bits # Make sure that if the filename is defined, that the @@ -135,6 +147,10 @@ cdef class BloomFilter: # Minimum bit vector of 128 bits num_bits = max(num_hashes * bits_per_hash,128) + # Override calculated capacity if we are provided a data array + if data_array is not None: + num_bits = 8 * len(data_array) + # print("k = %d m = %d n = %d p ~= %.8f" % ( # num_hashes, num_bits, capacity, # (1.0 - math.exp(- float(num_hashes) * float(capacity) / num_bits)) @@ -153,11 +169,14 @@ cdef class BloomFilter: num_hashes) else: self._in_memory = 1 + if data_array is not None: + print(f"Inserting data array of length {len(data_array)} with num_bits {num_bits}") + data = data_array self._bf = cbloomfilter.bloomfilter_Create_Malloc(capacity, error_rate, num_bits, seeds, - num_hashes) + num_hashes, data) if self._bf is NULL: if filename: raise OSError(errno, '%s: %s' % (os.strerror(errno), @@ -165,6 +184,7 @@ cdef class BloomFilter: else: cpython.PyErr_NoMemory() + def _open(self, filename, mode="rw"): # Should not overwrite mode = mode.replace("+", "") @@ -202,6 +222,19 @@ cdef class BloomFilter: arr = (cbloomfilter.mbarray_CharData(self._bf.array))[start_pos:end_pos] return int.from_bytes(arr, byteorder="big", signed=False) + @property + def data_array(self): + """Bytes array of the Bloom filter contents. + """ + self._assert_open() + start_pos = 0 + end_pos = start_pos + self._bf.array.bytes + self._bf.array.preamblebytes + arr = array.array('I') + arr.frombytes( + (cbloomfilter.mbarray_CharData(self._bf.array))[start_pos:end_pos] + ) + return bytes(arr) + @property def hash_seeds(self): """Integer seeds used for the random hashing. 
Returns a list of integers.""" @@ -340,7 +373,12 @@ cdef class BloomFilter: item = item_.encode() key.shash = item key.nhash = len(item) + elif isinstance(item_, bytes): + item = item_  # bytes is already encoded; Python 3 bytes has no .encode() + key.shash = item + key.nhash = len(item) else: + # Warning! Only works reliably for objects whose hash is based on value not memory address. item = item_ key.shash = NULL key.nhash = hash(item) @@ -391,6 +429,10 @@ cdef class BloomFilter: item = item_.encode() key.shash = item key.nhash = len(item) + elif isinstance(item_, bytes): + item = item_  # bytes is already encoded; Python 3 bytes has no .encode() + key.shash = item + key.nhash = len(item) else: item = item_ key.shash = NULL From cb8c21bc1aa4e4dd62a9086bd22dac28d1547751 Mon Sep 17 00:00:00 2001 From: Stephen Simmons Date: Mon, 19 Oct 2020 23:01:08 +0100 Subject: [PATCH 2/4] Fix comment and data_array if preamblebytes is non-zero --- src/pybloomfilter.pyx | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/pybloomfilter.pyx b/src/pybloomfilter.pyx index 91966c9..bac51ef 100644 --- a/src/pybloomfilter.pyx +++ b/src/pybloomfilter.pyx @@ -1,13 +1,10 @@ # cython: language_level=3 -VERSION = (0, 5, 3) +VERSION = (0, 5, 4) AUTHOR = "Michael Axiak" __VERSION__ = VERSION -from libc.stdlib cimport malloc -from libc.string cimport strncpy - cimport cbloomfilter cimport cpython @@ -64,7 +61,9 @@ cdef class BloomFilter: :param list hash_seeds: optionally specify hash seeds to use for the hashing algorithm. Each hash seed must not exceed 32 bits. The number of hash seeds will determine the number of hashes performed. - :param bytes data_array: optionally specify data array, same as . + :param bytes data_array: optionally specify the filter data array, same as + given by BloomFilter.data_array. Only valid for in-memory bloomfilters. + If provided, hash_seeds must be given too. 
**Note that we do not check capacity.** This is important, because we want to be able to support logical OR and AND (see :meth:`BloomFilter.union` @@ -106,6 +105,12 @@ cdef class BloomFilter: cdef char * data = NULL cdef long long num_bits + if data_array is not None: + if filename: + raise ValueError("data_array cannot be used for an mmapped filter.") + if hash_seeds is None: + raise ValueError("hash_seeds must be specified if a data_array is provided.") + # Make sure that if the filename is defined, that the # file exists if filename and os.path.exists(filename): @@ -227,9 +232,9 @@ cdef class BloomFilter: """Bytes array of the Bloom filter contents. """ self._assert_open() - start_pos = 0 - end_pos = start_pos + self._bf.array.bytes + self._bf.array.preamblebytes - arr = array.array('I') + start_pos = self._bf.array.preamblebytes + end_pos = start_pos + self._bf.array.bytes + arr = array.array('B') arr.frombytes( (cbloomfilter.mbarray_CharData(self._bf.array))[start_pos:end_pos] ) From c07465a0430b99523cfde188c02870a16e72a666 Mon Sep 17 00:00:00 2001 From: Stephen Simmons Date: Mon, 19 Oct 2020 23:21:31 +0100 Subject: [PATCH 3/4] Update README --- README.markdown | 15 +++++++++++++-- src/pybloomfilter.pyx | 1 - 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/README.markdown b/README.markdown index 20af811..b432741 100644 --- a/README.markdown +++ b/README.markdown @@ -38,10 +38,21 @@ interface and an ste interface. As an example: To create an in-memory filter, simply omit the file location: ```python - >>> cakes = pybloomfilter.BloomFilter(10000, 0.1) + >>> fruit = pybloomfilter.BloomFilter(10000, 0.1) + >>> fruit.add('apple') + >>> 'apple' in fruit + True ``` -*Caveat*: it is currently not possible to persist this filter later. 
+These in-memory filters can be pickled and reloaded: +```python + >>> import pickle + >>> data = pickle.dumps(fruit) + >>> trees = pickle.loads(data) + >>> 'apple' in trees + True +``` +*Caveat*: it is currently not possible to persist this filter later as an mmap file. ## Docs diff --git a/src/pybloomfilter.pyx b/src/pybloomfilter.pyx index bac51ef..a810b86 100644 --- a/src/pybloomfilter.pyx +++ b/src/pybloomfilter.pyx @@ -175,7 +175,6 @@ cdef class BloomFilter: else: self._in_memory = 1 if data_array is not None: - print(f"Inserting data array of length {len(data_array)} with num_bits {num_bits}") data = data_array self._bf = cbloomfilter.bloomfilter_Create_Malloc(capacity, error_rate, From ceb42b21ee6d5ef519b5450c436a7ef9288a0fe8 Mon Sep 17 00:00:00 2001 From: Stephen Simmons Date: Mon, 19 Oct 2020 23:27:24 +0100 Subject: [PATCH 4/4] Fix typo and improve example --- README.markdown | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.markdown b/README.markdown index b432741..0838f44 100644 --- a/README.markdown +++ b/README.markdown @@ -15,7 +15,7 @@ The goal of `pybloomfiltermmap3` is simple: to provide a fast, simple, scalable, There are a couple reasons to use this module: -* It natively uses [mmaped files](http://en.wikipedia.org/wiki/Mmap). +* It natively uses [mmapped files](http://en.wikipedia.org/wiki/Mmap). * It is fast (see [benchmarks](http://axiak.github.io/pybloomfiltermmap/#benchmarks)). * It natively does the set things you want a Bloom filter to do. @@ -38,18 +38,18 @@ interface and an ste interface. 
As an example: To create an in-memory filter, simply omit the file location: ```python - >>> fruit = pybloomfilter.BloomFilter(10000, 0.1) - >>> fruit.add('apple') - >>> 'apple' in fruit + >>> fruit2 = pybloomfilter.BloomFilter(10000, 0.1) + >>> fruit2.add('apple') + >>> 'apple' in fruit2 True ``` These in-memory filters can be pickled and reloaded: ```python >>> import pickle - >>> data = pickle.dumps(fruit) - >>> trees = pickle.loads(data) - >>> 'apple' in trees + >>> data = pickle.dumps(fruit2) + >>> fruit3 = pickle.loads(data) + >>> 'apple' in fruit3 True ``` *Caveat*: it is currently not possible to persist this filter later as an mmap file.