From a5130346927cd6da79089deccbd4f1fedfe3a01b Mon Sep 17 00:00:00 2001
From: Stephen Simmons <mail@stevesimmons.com>
Date: Mon, 19 Oct 2020 22:50:22 +0100
Subject: [PATCH 1/4] Make pickleable and allow data_array to be retrieved and
 set

---
 src/bloomfilter.c     |  5 ++++-
 src/bloomfilter.h     |  2 +-
 src/cbloomfilter.pxd  |  2 +-
 src/pybloomfilter.pyx | 50 +++++++++++++++++++++++++++++++++++++++----
 4 files changed, 52 insertions(+), 7 deletions(-)

diff --git a/src/bloomfilter.c b/src/bloomfilter.c
index cd89d18..baeefa7 100644
--- a/src/bloomfilter.c
+++ b/src/bloomfilter.c
@@ -9,7 +9,7 @@
 #include "bloomfilter.h"
 
 BloomFilter *bloomfilter_Create_Malloc(size_t max_num_elem, double error_rate,
-                                BTYPE num_bits, int *hash_seeds, int num_hashes)
+                                BTYPE num_bits, int *hash_seeds, int num_hashes, const char *data)
 {
     BloomFilter * bf = (BloomFilter *)malloc(sizeof(BloomFilter));
     MBArray * array;
@@ -31,6 +31,9 @@ BloomFilter *bloomfilter_Create_Malloc(size_t max_num_elem, double error_rate,
     memset(bf->hash_seeds, 0, sizeof(uint32_t) * 256);
     memcpy(bf->hash_seeds, hash_seeds, sizeof(uint32_t) * num_hashes);
     array = mbarray_Create_Malloc(num_bits);
+    if (array && data) { /* array is NULL when allocation failed; handled just below */
+        memcpy(array->vector, data, num_bits / 8);
+    }
     if (!array) {
         bloomfilter_Destroy(bf);
         return NULL;
diff --git a/src/bloomfilter.h b/src/bloomfilter.h
index e974c58..fde99ff 100644
--- a/src/bloomfilter.h
+++ b/src/bloomfilter.h
@@ -28,7 +28,7 @@ typedef struct _BloomFilter BloomFilter;
 
 /* Create a bloom filter without a memory-mapped file backing it */
 BloomFilter *bloomfilter_Create_Malloc(size_t max_num_elem, double error_rate,
-                                BTYPE num_bits, int *hash_seeds, int num_hashes);
+                                BTYPE num_bits, int *hash_seeds, int num_hashes, const char * data);
 
 /* Create a bloom filter with a memory-mapped file backing it */
 BloomFilter *bloomfilter_Create_Mmap(size_t max_num_elem, double error_rate,
diff --git a/src/cbloomfilter.pxd b/src/cbloomfilter.pxd
index e99f346..6a4b6fd 100644
--- a/src/cbloomfilter.pxd
+++ b/src/cbloomfilter.pxd
@@ -48,7 +48,7 @@ cdef extern from "bloomfilter.h":
      BloomFilter * bloomfilter_Create_Malloc(long max_num_elem,
                                       double error_rate,
                                       long num_bits,
-                                      int * hash_seeds, int num_hashes)
+                                      int * hash_seeds, int num_hashes, const char * data)  # const, as in bloomfilter.h
      void bloomfilter_Destroy(BloomFilter * bf)
      int bloomfilter_Add(BloomFilter * bf, Key * key)
      int bloomfilter_Test(BloomFilter * bf, Key * key)
diff --git a/src/pybloomfilter.pyx b/src/pybloomfilter.pyx
index 1b5b31b..91966c9 100644
--- a/src/pybloomfilter.pyx
+++ b/src/pybloomfilter.pyx
@@ -5,6 +5,8 @@ AUTHOR = "Michael Axiak"
 
 __VERSION__ = VERSION
 
+from libc.stdlib cimport malloc
+from libc.string cimport strncpy
 
 cimport cbloomfilter
 cimport cpython
@@ -62,6 +64,7 @@ cdef class BloomFilter:
     :param list hash_seeds: optionally specify hash seeds to use for the
         hashing algorithm. Each hash seed must not exceed 32 bits. The number
         of hash seeds will determine the number of hashes performed.
+    :param bytes data_array: optionally specify data array, same as .
 
     **Note that we do not check capacity.** This is important, because
     we want to be able to support logical OR and AND (see :meth:`BloomFilter.union`
@@ -80,7 +83,14 @@ cdef class BloomFilter:
     cdef int _in_memory
     cdef int _oflags
 
-    def __cinit__(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None):
+    def __reduce__(self):
+        """Pickle support: return (BloomFilter, constructor args) with filename and perm set to None."""
+        ctor_args = (self.capacity, self.error_rate, None, None,
+                     self.hash_seeds, self.data_array)
+        return (BloomFilter, ctor_args)
+
+
+    def __cinit__(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None, data_array=None):
         self._closed = 0
         self._in_memory = 0
         self._oflags = os.O_RDWR
@@ -88,10 +98,12 @@ cdef class BloomFilter:
         if capacity is NoConstruct:
             return
 
-        self._create(capacity, error_rate, filename, perm, hash_seeds)
+        self._create(capacity, error_rate, filename, perm, hash_seeds, data_array)
+
 
-    def _create(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None):
+    def _create(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None, data_array=None):
         cdef char * seeds
+        cdef char * data = NULL
         cdef long long num_bits
 
         # Make sure that if the filename is defined, that the
@@ -135,6 +147,10 @@ cdef class BloomFilter:
         # Minimum bit vector of 128 bits
         num_bits = max(num_hashes * bits_per_hash,128)
 
+        # Override the calculated number of bits if we are provided a data array
+        if data_array is not None:
+            num_bits = 8 * len(data_array)
+
         # print("k = %d  m = %d  n = %d   p ~= %.8f" % (
         #     num_hashes, num_bits, capacity,
         #     (1.0 - math.exp(- float(num_hashes) * float(capacity) / num_bits))
@@ -153,11 +169,14 @@ cdef class BloomFilter:
                                                     num_hashes)
         else:
             self._in_memory = 1
+            if data_array is not None:
+                print(f"Inserting data array of length {len(data_array)} with num_bits {num_bits}")
+                data = data_array
             self._bf = cbloomfilter.bloomfilter_Create_Malloc(capacity,
                                                     error_rate,
                                                     num_bits,
                                                     <int *>seeds,
-                                                    num_hashes)
+                                                    num_hashes, <const char *>data)
         if self._bf is NULL:
             if filename:
                 raise OSError(errno, '%s: %s' % (os.strerror(errno),
@@ -165,6 +184,7 @@ cdef class BloomFilter:
             else:
                 cpython.PyErr_NoMemory()
 
+
     def _open(self, filename, mode="rw"):
         # Should not overwrite
         mode = mode.replace("+", "")
@@ -202,6 +222,19 @@ cdef class BloomFilter:
         arr = (<char *>cbloomfilter.mbarray_CharData(self._bf.array))[start_pos:end_pos]
         return int.from_bytes(arr, byteorder="big", signed=False)
 
+    @property
+    def data_array(self):
+        """Bytes array of the Bloom filter contents.
+        """
+        self._assert_open()
+        start_pos = 0
+        end_pos = start_pos + self._bf.array.bytes + self._bf.array.preamblebytes
+        arr = array.array('I')
+        arr.frombytes(
+            (<char *>cbloomfilter.mbarray_CharData(self._bf.array))[start_pos:end_pos]
+        )
+        return bytes(arr)
+
     @property
     def hash_seeds(self):
         """Integer seeds used for the random hashing. Returns a list of integers."""
@@ -340,7 +373,12 @@ cdef class BloomFilter:
             item = item_.encode()
             key.shash = item
             key.nhash = len(item)
+        elif isinstance(item_, bytes):
+            item = item_  # already bytes: use as-is (bytes has no .encode() in Python 3)
+            key.shash = item
+            key.nhash = len(item)
         else:
+            # Warning! Only works reliably for objects whose hash is based on value not memory address.
             item = item_
             key.shash = NULL
             key.nhash = hash(item)
@@ -391,6 +429,10 @@ cdef class BloomFilter:
             item = item_.encode()
             key.shash = item
             key.nhash = len(item)
+        elif isinstance(item_, bytes):
+            item = item_  # already bytes: use as-is (bytes has no .encode() in Python 3)
+            key.shash = item
+            key.nhash = len(item)
         else:
             item = item_
             key.shash = NULL

From cb8c21bc1aa4e4dd62a9086bd22dac28d1547751 Mon Sep 17 00:00:00 2001
From: Stephen Simmons <mail@stevesimmons.com>
Date: Mon, 19 Oct 2020 23:01:08 +0100
Subject: [PATCH 2/4] Fix comment and data_array if preamblebytes is non-zero

---
 src/pybloomfilter.pyx | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/src/pybloomfilter.pyx b/src/pybloomfilter.pyx
index 91966c9..bac51ef 100644
--- a/src/pybloomfilter.pyx
+++ b/src/pybloomfilter.pyx
@@ -1,13 +1,10 @@
 # cython: language_level=3
 
-VERSION = (0, 5, 3)
+VERSION = (0, 5, 4)
 AUTHOR = "Michael Axiak"
 
 __VERSION__ = VERSION
 
-from libc.stdlib cimport malloc
-from libc.string cimport strncpy
-
 cimport cbloomfilter
 cimport cpython
 
@@ -64,7 +61,9 @@ cdef class BloomFilter:
     :param list hash_seeds: optionally specify hash seeds to use for the
         hashing algorithm. Each hash seed must not exceed 32 bits. The number
         of hash seeds will determine the number of hashes performed.
-    :param bytes data_array: optionally specify data array, same as .
+    :param bytes data_array: optionally specify the filter data array, same as
+        given by BloomFilter.data_array. Only valid for in-memory bloomfilters.
+        If provided, hash_seeds must be given too.
 
     **Note that we do not check capacity.** This is important, because
     we want to be able to support logical OR and AND (see :meth:`BloomFilter.union`
@@ -106,6 +105,12 @@ cdef class BloomFilter:
         cdef char * data = NULL
         cdef long long num_bits
 
+        if data_array is not None:
+            if filename:
+                raise ValueError("data_array cannot be used for an mmapped filter.")
+            if hash_seeds is None:
+                raise ValueError("hash_seeds must be specified if a data_array is provided.")
+             
         # Make sure that if the filename is defined, that the
         # file exists
         if filename and os.path.exists(filename):
@@ -227,9 +232,9 @@ cdef class BloomFilter:
         """Bytes array of the Bloom filter contents.
         """
         self._assert_open()
-        start_pos = 0
-        end_pos = start_pos + self._bf.array.bytes + self._bf.array.preamblebytes
-        arr = array.array('I')
+        start_pos = self._bf.array.preamblebytes
+        end_pos = start_pos + self._bf.array.bytes 
+        arr = array.array('B')
         arr.frombytes(
             (<char *>cbloomfilter.mbarray_CharData(self._bf.array))[start_pos:end_pos]
         )

From c07465a0430b99523cfde188c02870a16e72a666 Mon Sep 17 00:00:00 2001
From: Stephen Simmons <mail@stevesimmons.com>
Date: Mon, 19 Oct 2020 23:21:31 +0100
Subject: [PATCH 3/4] Update README

---
 README.markdown       | 15 +++++++++++++--
 src/pybloomfilter.pyx |  1 -
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/README.markdown b/README.markdown
index 20af811..b432741 100644
--- a/README.markdown
+++ b/README.markdown
@@ -38,10 +38,21 @@ interface and an ste interface. As an example:
 
 To create an in-memory filter, simply omit the file location:
 ```python
-    >>> cakes = pybloomfilter.BloomFilter(10000, 0.1)
+    >>> fruit = pybloomfilter.BloomFilter(10000, 0.1)
+    >>> fruit.add('apple')
+    >>> 'apple' in fruit
+    True
 ```
-*Caveat*: it is currently not possible to persist this filter later.
 
+These in-memory filters can be pickled and reloaded:
+```python
+    >>> import pickle
+    >>> data = pickle.dumps(fruit)
+    >>> trees = pickle.loads(data)
+    >>> 'apple' in trees
+    True
+```
+*Caveat*: it is currently not possible to persist this filter later as an mmap file.
 
 ## Docs
 
diff --git a/src/pybloomfilter.pyx b/src/pybloomfilter.pyx
index bac51ef..a810b86 100644
--- a/src/pybloomfilter.pyx
+++ b/src/pybloomfilter.pyx
@@ -175,7 +175,6 @@ cdef class BloomFilter:
         else:
             self._in_memory = 1
             if data_array is not None:
-                print(f"Inserting data array of length {len(data_array)} with num_bits {num_bits}")
                 data = data_array
             self._bf = cbloomfilter.bloomfilter_Create_Malloc(capacity,
                                                     error_rate,

From ceb42b21ee6d5ef519b5450c436a7ef9288a0fe8 Mon Sep 17 00:00:00 2001
From: Stephen Simmons <mail@stevesimmons.com>
Date: Mon, 19 Oct 2020 23:27:24 +0100
Subject: [PATCH 4/4] Fix typo and improve example

---
 README.markdown | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/README.markdown b/README.markdown
index b432741..0838f44 100644
--- a/README.markdown
+++ b/README.markdown
@@ -15,7 +15,7 @@ The goal of `pybloomfiltermmap3` is simple: to provide a fast, simple, scalable,
 
 There are a couple reasons to use this module:
 
-* It natively uses [mmaped files](http://en.wikipedia.org/wiki/Mmap).
+* It natively uses [mmapped files](http://en.wikipedia.org/wiki/Mmap).
 * It is fast (see [benchmarks](http://axiak.github.io/pybloomfiltermmap/#benchmarks)).
 * It natively does the set things you want a Bloom filter to do.
 
@@ -38,18 +38,18 @@ interface and an ste interface. As an example:
 
 To create an in-memory filter, simply omit the file location:
 ```python
-    >>> fruit = pybloomfilter.BloomFilter(10000, 0.1)
-    >>> fruit.add('apple')
-    >>> 'apple' in fruit
+    >>> fruit2 = pybloomfilter.BloomFilter(10000, 0.1)
+    >>> fruit2.add('apple')
+    >>> 'apple' in fruit2
     True
 ```
 
 These in-memory filters can be pickled and reloaded:
 ```python
     >>> import pickle
-    >>> data = pickle.dumps(fruit)
-    >>> trees = pickle.loads(data)
-    >>> 'apple' in trees
+    >>> data = pickle.dumps(fruit2)
+    >>> fruit3 = pickle.loads(data)
+    >>> 'apple' in fruit3
     True
 ```
 *Caveat*: it is currently not possible to persist this filter later as an mmap file.