diff --git a/MANIFEST.in b/MANIFEST.in index 88f0eb3..de1bf55 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,6 @@ include CHANGELOG include LICENSE include AUTHORS -include README.markdown +include README.md recursive-include src * recursive-include tests * diff --git a/README.markdown b/README.md similarity index 75% rename from README.markdown rename to README.md index f34d721..ea04ba9 100644 --- a/README.markdown +++ b/README.md @@ -10,20 +10,19 @@ The goal of `pybloomfiltermmap3` is simple: to provide a fast, simple, scalable, [![PyPI](https://img.shields.io/pypi/dw/pybloomfiltermmap3.svg)](https://pypi.python.org/pypi/pybloomfiltermmap3) [![PyPI](https://img.shields.io/pypi/pyversions/pybloomfiltermmap3.svg)](https://pypi.python.org/pypi/pybloomfiltermmap3) - ## Why pybloomfiltermmap3? There are a couple reasons to use this module: -* It natively uses [mmaped files](http://en.wikipedia.org/wiki/Mmap). -* It is fast (see [benchmarks](http://axiak.github.io/pybloomfiltermmap/#benchmarks)). -* It natively does the set things you want a Bloom filter to do. - +- It natively uses [mmapped files](http://en.wikipedia.org/wiki/Mmap). +- It is fast (see [benchmarks](http://axiak.github.io/pybloomfiltermmap/#benchmarks)). +- It natively does the set things you want a Bloom filter to do. ## Quickstart After you install, the interface to use is a cross between a file interface and an ste interface. As an example: + ```python >>> import pybloomfilter >>> fruit = pybloomfilter.BloomFilter(100000, 0.1, '/tmp/words.bloom') @@ -37,23 +36,36 @@ interface and an ste interface. As an example: ``` To create an in-memory filter, simply omit the file location: + +```python + >>> fruit = pybloomfilter.BloomFilter(10000, 0.1) + >>> fruit.add('apple') + >>> 'apple' in fruit + True +``` + +These in-memory filters can be pickled and reloaded: + ```python - >>> cakes = pybloomfilter.BloomFilter(10000, 0.1) + >>> import pickle + >>> pickled_fruit = pickle.dumps(fruit) + >>> unpickled_fruit = pickle.loads(pickled_fruit) + >>> 'apple' in unpickled_fruit + True ``` -*Caveat*: it is currently not possible to persist this filter later. +_Caveat_: it is currently not possible to persist this filter later as an mmap file. ## Docs Current docs are available at [pybloomfiltermmap3.rtfd.io](https://pybloomfiltermmap3.readthedocs.io/en/latest). - ## Install To install: -```shell - $ pip install pybloomfiltermmap3 +```bash +pip install pybloomfiltermmap3 ``` and you should be set. @@ -62,7 +74,6 @@ and you should be set. This library is specifically meant for Python 3.5 and above. [As of 2020](https://www.python.org/doc/sunset-python-2/), we strongly advise you to switch to an actively maintained distribution of Python 3. If for any reason your current environment is restricted to Python 2, please see [pybloomfiltermmap](https://github.com/axiak/pybloomfiltermmap). Please note that the latter is not actively maintained and will lack bug fixes and new features. - ## History and Future [pybloomfiltermmap](https://github.com/axiak/pybloomfiltermmap) is an excellent Bloom filter implementation for Python 2 by [@axiak](https://github.com/axiak) and contributors. I, [@prashnts](https://github.com/prashnts), made initial changes to add support for Python 3 sometime in 2016 as the current [pybloomfiltermmap3](https://pypi.org/project/pybloomfiltermmap3/) on `PyPI`. Since then, with the help of contributors, there have been incremental improvements and bug fixes while maintaining the API from versions `0.4.x` and below. @@ -71,18 +82,47 @@ Some new features and changes were first introduced in version `0.5.0`. From thi Suggestions, bug reports, and / or patches are welcome! - ## Contributions and development When contributing, you should set up an appropriate Python 3 environment and install the dependencies listed in `requirements-dev.txt`. Package installation depends on a generated `pybloomfilter.c` file, which requires Cython module to be in your current environment. +### Environment setup -## Maintainers +```bash +# Installs the venv and python3-dev packages +sudo apt install python3.10-venv python3-dev + +# Creates a virtual env called "env" +python -m venv env + +# Activates the created virtual env +source ./env/bin/activate +``` + +### Dependencies + +```bash +python -m pip install --upgrade pip +pip install cython +``` -* [Prashant Sinha](https://github.com/prashnts) -* [Vytautas Mizgiris](https://github.com/vmizg) +### Build + +```bash +python setup.py develop +``` + +### Test + +```bash +python setup.py test +``` + +## Maintainers +- [Prashant Sinha](https://github.com/prashnts) +- [Vytautas Mizgiris](https://github.com/vmizg) ## License diff --git a/setup.py b/setup.py index e25cb4d..63c9a0e 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ here = os.path.dirname(__file__) # Get the long description from the README file -with open(os.path.join(here, "README.markdown"), encoding="utf-8") as fp: +with open(os.path.join(here, "README.md"), encoding="utf-8") as fp: long_description = fp.read() setup_kwargs = {} diff --git a/src/bloomfilter.c b/src/bloomfilter.c index cd89d18..e1a39c8 100644 --- a/src/bloomfilter.c +++ b/src/bloomfilter.c @@ -9,7 +9,7 @@ #include "bloomfilter.h" BloomFilter *bloomfilter_Create_Malloc(size_t max_num_elem, double error_rate, - BTYPE num_bits, int *hash_seeds, int num_hashes) + BTYPE num_bits, int *hash_seeds, int num_hashes, const char *data) { BloomFilter * bf = (BloomFilter *)malloc(sizeof(BloomFilter)); MBArray * array; @@ -35,6 +35,9 @@ BloomFilter *bloomfilter_Create_Malloc(size_t max_num_elem, double error_rate, bloomfilter_Destroy(bf); return NULL; } + if (data) { + memcpy(array->vector, data, num_bits / 8); + } bf->array = array; diff --git a/src/bloomfilter.h b/src/bloomfilter.h index e974c58..fde99ff 100644 --- a/src/bloomfilter.h +++ b/src/bloomfilter.h @@ -28,7 +28,7 @@ typedef struct _BloomFilter BloomFilter; /* Create a bloom filter without a memory-mapped file backing it */ BloomFilter *bloomfilter_Create_Malloc(size_t max_num_elem, double error_rate, - BTYPE num_bits, int *hash_seeds, int num_hashes); + BTYPE num_bits, int *hash_seeds, int num_hashes, const char * data); /* Create a bloom filter with a memory-mapped file backing it */ BloomFilter *bloomfilter_Create_Mmap(size_t max_num_elem, double error_rate, diff --git a/src/cbloomfilter.pxd b/src/cbloomfilter.pxd index e99f346..6a4b6fd 100644 --- a/src/cbloomfilter.pxd +++ b/src/cbloomfilter.pxd @@ -48,7 +48,7 @@ cdef extern from "bloomfilter.h": BloomFilter * bloomfilter_Create_Malloc(long max_num_elem, double error_rate, long num_bits, - int * hash_seeds, int num_hashes) + int * hash_seeds, int num_hashes, char * data) void bloomfilter_Destroy(BloomFilter * bf) int bloomfilter_Add(BloomFilter * bf, Key * key) int bloomfilter_Test(BloomFilter * bf, Key * key) diff --git a/src/pybloomfilter.pyx b/src/pybloomfilter.pyx index f7f9447..4d58457 100644 --- a/src/pybloomfilter.pyx +++ b/src/pybloomfilter.pyx @@ -5,7 +5,6 @@ AUTHOR = "Michael Axiak" __VERSION__ = VERSION - cimport cbloomfilter cimport cpython @@ -62,6 +61,9 @@ cdef class BloomFilter: :param list hash_seeds: optionally specify hash seeds to use for the hashing algorithm. Each hash seed must not exceed 32 bits. The number of hash seeds will determine the number of hashes performed. + :param bytes data_array: optionally specify the filter data array, same as + given by BloomFilter.data_array. Only valid for in-memory bloomfilters. + If provided, hash_seeds must be given too. **Note that we do not check capacity.** This is important, because we want to be able to support logical OR and AND (see :meth:`BloomFilter.union` @@ -80,7 +82,14 @@ cdef class BloomFilter: cdef int _in_memory cdef int _oflags - def __cinit__(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None): + def __reduce__(self): + """Makes an in-memory BloomFilter pickleable.""" + callable = BloomFilter + args = (self.capacity, self.error_rate, None, None, self.hash_seeds, self.data_array) + return (callable, args) + + + def __cinit__(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None, data_array=None): self._closed = 0 self._in_memory = 0 self._oflags = os.O_RDWR @@ -88,12 +97,20 @@ cdef class BloomFilter: if capacity is NoConstruct: return - self._create(capacity, error_rate, filename, perm, hash_seeds) + self._create(capacity, error_rate, filename, perm, hash_seeds, data_array) + - def _create(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None): + def _create(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None, data_array=None): cdef char * seeds + cdef char * data = NULL cdef long long num_bits + if data_array is not None: + if filename: + raise ValueError("data_array cannot be used for an mmapped filter.") + if hash_seeds is None: + raise ValueError("hash_seeds must be specified if a data_array is provided.") + # Make sure that if the filename is defined, that the # file exists if filename and os.path.exists(filename): @@ -135,6 +152,10 @@ cdef class BloomFilter: # Minimum bit vector of 128 bits num_bits = max(num_hashes * bits_per_hash,128) + # Override calculated capacity if we are provided a data array + if data_array is not None: + num_bits = 8 * len(data_array) + # print("k = %d m = %d n = %d p ~= %.8f" % ( # num_hashes, num_bits, capacity, # (1.0 - math.exp(- float(num_hashes) * float(capacity) / num_bits)) @@ -153,11 +174,13 @@ cdef class BloomFilter: num_hashes) else: self._in_memory = 1 + if data_array is not None: + data = data_array self._bf = cbloomfilter.bloomfilter_Create_Malloc(capacity, error_rate, num_bits, seeds, - num_hashes) + num_hashes, data) if self._bf is NULL: if filename: raise OSError(errno, '%s: %s' % (os.strerror(errno), @@ -165,6 +188,7 @@ cdef class BloomFilter: else: cpython.PyErr_NoMemory() + def _open(self, filename, mode="rw"): # Should not overwrite mode = mode.replace("+", "") @@ -202,6 +226,19 @@ cdef class BloomFilter: arr = (cbloomfilter.mbarray_CharData(self._bf.array))[start_pos:end_pos] return int.from_bytes(arr, byteorder="big", signed=False) + @property + def data_array(self): + """Bytes array of the Bloom filter contents. + """ + self._assert_open() + start_pos = self._bf.array.preamblebytes + end_pos = start_pos + self._bf.array.bytes + arr = array.array('B') + arr.frombytes( + (cbloomfilter.mbarray_CharData(self._bf.array))[start_pos:end_pos] + ) + return bytes(arr) + @property def hash_seeds(self): """Integer seeds used for the random hashing. Returns a list of integers.""" @@ -345,6 +382,7 @@ cdef class BloomFilter: key.shash = item key.nhash = len(item) else: + # Warning! Only works reliably for objects whose hash is based on value not memory address. item = item_ key.shash = NULL key.nhash = hash(item) diff --git a/tests/simpletest.py b/tests/simpletest.py index aa4b4ec..83f908d 100755 --- a/tests/simpletest.py +++ b/tests/simpletest.py @@ -2,6 +2,7 @@ import string import unittest import tempfile +import pickle from random import randint, choice, getrandbits import pybloomfilter @@ -400,6 +401,19 @@ def test_approximate_size_after_union_called(self): intersection = len(bf1) + len(bf2) - len(union_bf) assert intersection == 11 # approximate size + def test_pickle(self): + bf = pybloomfilter.BloomFilter(100, 0.1) + bf.add('apple') + assert 'apple' in bf + assert 'hello' not in bf + + pickled = pickle.dumps(bf) + unpickled = pickle.loads(pickled) + assert 'apple' in unpickled + assert 'hello' not in unpickled + + # Expecting same hashing sequence + self.assertEqual(bf.bit_array, unpickled.bit_array) def suite(): suite = unittest.TestSuite()