prashnts · stevesimmons · Oct 19, 2020 · Oct 19, 2020 · Oct 19, 2020 · Oct 19, 2020
diff --git a/README.markdown b/README.markdown
@@ -15,7 +15,7 @@ The goal of `pybloomfiltermmap3` is simple: to provide a fast, simple, scalable,
 
 There are a couple reasons to use this module:
 
-* It natively uses [mmaped files](http://en.wikipedia.org/wiki/Mmap).
+* It natively uses [mmapped files](http://en.wikipedia.org/wiki/Mmap).
 * It is fast (see [benchmarks](http://axiak.github.io/pybloomfiltermmap/#benchmarks)).
 * It natively does the set things you want a Bloom filter to do.
 
@@ -38,10 +38,21 @@ interface and an ste interface. As an example:
 
 To create an in-memory filter, simply omit the file location:
 ```python
-    >>> cakes = pybloomfilter.BloomFilter(10000, 0.1)
+    >>> fruit2 = pybloomfilter.BloomFilter(10000, 0.1)
+    >>> fruit2.add('apple')
+    >>> 'apple' in fruit2
+    True
 ```
-*Caveat*: it is currently not possible to persist this filter later.
 
+These in-memory filters can be pickled and reloaded:
+```python
+    >>> import pickle
+    >>> data = pickle.dumps(fruit2)
+    >>> fruit3 = pickle.loads(data)
+    >>> 'apple' in fruit3
+    True
+```
+*Caveat*: it is currently not possible to later persist an in-memory filter as an mmap-backed file.
 
 ## Docs
 

diff --git a/src/bloomfilter.c b/src/bloomfilter.c
@@ -9,7 +9,7 @@
 #include "bloomfilter.h"
 
 BloomFilter *bloomfilter_Create_Malloc(size_t max_num_elem, double error_rate,
-                                BTYPE num_bits, int *hash_seeds, int num_hashes)
+                                BTYPE num_bits, int *hash_seeds, int num_hashes, const char *data)
 {
     BloomFilter * bf = (BloomFilter *)malloc(sizeof(BloomFilter));
     MBArray * array;
@@ -31,6 +31,9 @@ BloomFilter *bloomfilter_Create_Malloc(size_t max_num_elem, double error_rate,
     memset(bf->hash_seeds, 0, sizeof(uint32_t) * 256);
     memcpy(bf->hash_seeds, hash_seeds, sizeof(uint32_t) * num_hashes);
     array = mbarray_Create_Malloc(num_bits);
+    if (array && data) { /* array is NULL if allocation failed; that case is handled just below */
+        memcpy(array->vector, data, num_bits / 8); /* reads num_bits / 8 bytes from data */
+    }
     if (!array) {
         bloomfilter_Destroy(bf);
         return NULL;

diff --git a/src/bloomfilter.h b/src/bloomfilter.h
@@ -28,7 +28,7 @@ typedef struct _BloomFilter BloomFilter;
 
 /* Create a bloom filter without a memory-mapped file backing it */
 BloomFilter *bloomfilter_Create_Malloc(size_t max_num_elem, double error_rate,
-                                BTYPE num_bits, int *hash_seeds, int num_hashes);
+                                BTYPE num_bits, int *hash_seeds, int num_hashes, const char * data);
 
 /* Create a bloom filter with a memory-mapped file backing it */
 BloomFilter *bloomfilter_Create_Mmap(size_t max_num_elem, double error_rate,

diff --git a/src/cbloomfilter.pxd b/src/cbloomfilter.pxd
@@ -48,7 +48,7 @@ cdef extern from "bloomfilter.h":
      BloomFilter * bloomfilter_Create_Malloc(long max_num_elem,
                                       double error_rate,
                                       long num_bits,
-                                      int * hash_seeds, int num_hashes)
+                                      int * hash_seeds, int num_hashes, const char * data)  # const matches bloomfilter.h
      void bloomfilter_Destroy(BloomFilter * bf)
      int bloomfilter_Add(BloomFilter * bf, Key * key)
      int bloomfilter_Test(BloomFilter * bf, Key * key)

diff --git a/src/pybloomfilter.pyx b/src/pybloomfilter.pyx
@@ -1,11 +1,10 @@
 # cython: language_level=3
 
-VERSION = (0, 5, 3)
+VERSION = (0, 5, 4)
 AUTHOR = "Michael Axiak"
 
 __VERSION__ = VERSION
 
-
 cimport cbloomfilter
 cimport cpython
 
@@ -62,6 +61,9 @@ cdef class BloomFilter:
     :param list hash_seeds: optionally specify hash seeds to use for the
         hashing algorithm. Each hash seed must not exceed 32 bits. The number
         of hash seeds will determine the number of hashes performed.
+    :param bytes data_array: optionally specify the filter data array, same as
+        given by BloomFilter.data_array. Only valid for in-memory bloomfilters.
+        If provided, hash_seeds must be given too.
 
     **Note that we do not check capacity.** This is important, because
     we want to be able to support logical OR and AND (see :meth:`BloomFilter.union`
@@ -80,20 +82,35 @@ cdef class BloomFilter:
     cdef int _in_memory
     cdef int _oflags
 
-    def __cinit__(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None):
+    def __reduce__(self):
+        """Pickle hook: return (BloomFilter, args), with None passed for filename and perm."""
+        ctor_args = (self.capacity, self.error_rate, None, None, self.hash_seeds, self.data_array)
+        rebuild = BloomFilter
+        return (rebuild, ctor_args)
+
+
+    def __cinit__(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None, data_array=None):
         self._closed = 0
         self._in_memory = 0
         self._oflags = os.O_RDWR
 
         if capacity is NoConstruct:
             return
 
-        self._create(capacity, error_rate, filename, perm, hash_seeds)
+        self._create(capacity, error_rate, filename, perm, hash_seeds, data_array)
 
-    def _create(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None):
+
+    def _create(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None, data_array=None):
         cdef char * seeds
+        cdef char * data = NULL
         cdef long long num_bits
 
+        if data_array is not None:
+            if filename:
+                raise ValueError("data_array cannot be used for an mmapped filter.")
+            if hash_seeds is None:
+                raise ValueError("hash_seeds must be specified if a data_array is provided.")
+
         # Make sure that if the filename is defined, that the
         # file exists
         if filename and os.path.exists(filename):
@@ -135,6 +152,10 @@ cdef class BloomFilter:
         # Minimum bit vector of 128 bits
         num_bits = max(num_hashes * bits_per_hash,128)
 
+        # Override calculated capacity if we are provided a data array
+        if data_array is not None:
+            num_bits = 8 * len(data_array)
+
         # print("k = %d  m = %d  n = %d   p ~= %.8f" % (
         #     num_hashes, num_bits, capacity,
         #     (1.0 - math.exp(- float(num_hashes) * float(capacity) / num_bits))
@@ -153,18 +174,21 @@ cdef class BloomFilter:
                                                     num_hashes)
         else:
             self._in_memory = 1
+            if data_array is not None:
+                data = data_array
             self._bf = cbloomfilter.bloomfilter_Create_Malloc(capacity,
                                                     error_rate,
                                                     num_bits,
                                                     <int *>seeds,
-                                                    num_hashes)
+                                                    num_hashes, <const char *>data)
         if self._bf is NULL:
             if filename:
                 raise OSError(errno, '%s: %s' % (os.strerror(errno),
                                                     filename))
             else:
                 cpython.PyErr_NoMemory()
 
+
     def _open(self, filename, mode="rw"):
         # Should not overwrite
         mode = mode.replace("+", "")
@@ -202,6 +226,19 @@ cdef class BloomFilter:
         arr = (<char *>cbloomfilter.mbarray_CharData(self._bf.array))[start_pos:end_pos]
         return int.from_bytes(arr, byteorder="big", signed=False)
 
+    @property
+    def data_array(self):
+        """Raw contents of the Bloom filter's bit array, returned as ``bytes``.
+
+        Spans ``array.bytes`` bytes starting at offset ``array.preamblebytes`` of the
+        buffer returned by ``mbarray_CharData``. ``_assert_open()`` is called first.
+        """
+        self._assert_open()
+        start_pos = self._bf.array.preamblebytes
+        end_pos = start_pos + self._bf.array.bytes
+        # Slicing a C char pointer already yields a new bytes object; no array.array round trip needed.
+        return (<char *>cbloomfilter.mbarray_CharData(self._bf.array))[start_pos:end_pos]
+
     @property
     def hash_seeds(self):
         """Integer seeds used for the random hashing. Returns a list of integers."""
@@ -340,7 +377,12 @@ cdef class BloomFilter:
             item = item_.encode()
             key.shash = item
             key.nhash = len(item)
+        elif isinstance(item_, bytes):
+            item = item_  # already raw bytes; bytes objects have no .encode() (only str does)
+            key.shash = item
+            key.nhash = len(item)
         else:
+            # Warning! Only works reliably for objects whose hash is based on value not memory address.
             item = item_
             key.shash = NULL
             key.nhash = hash(item)
@@ -391,6 +433,10 @@ cdef class BloomFilter:
             item = item_.encode()
             key.shash = item
             key.nhash = len(item)
+        elif isinstance(item_, bytes):
+            item = item_  # already raw bytes; bytes objects have no .encode() (only str does)
+            key.shash = item
+            key.nhash = len(item)
         else:
             item = item_
             key.shash = NULL