-
Notifications
You must be signed in to change notification settings - Fork 24
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Make pickleable and allow data_array to be retrieved and set #44
Closed
Closed
Changes from all commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,10 @@ | ||
# cython: language_level=3 | ||
|
||
VERSION = (0, 5, 3) | ||
VERSION = (0, 5, 4) | ||
AUTHOR = "Michael Axiak" | ||
|
||
__VERSION__ = VERSION | ||
|
||
|
||
cimport cbloomfilter | ||
cimport cpython | ||
|
||
|
@@ -62,6 +61,9 @@ cdef class BloomFilter: | |
:param list hash_seeds: optionally specify hash seeds to use for the | ||
hashing algorithm. Each hash seed must not exceed 32 bits. The number | ||
of hash seeds will determine the number of hashes performed. | ||
:param bytes data_array: optionally specify the filter data array, same as | ||
given by BloomFilter.data_array. Only valid for in-memory bloomfilters. | ||
If provided, hash_seeds must be given too. | ||
|
||
**Note that we do not check capacity.** This is important, because | ||
we want to be able to support logical OR and AND (see :meth:`BloomFilter.union` | ||
|
@@ -80,20 +82,35 @@ cdef class BloomFilter: | |
cdef int _in_memory | ||
cdef int _oflags | ||
|
||
def __cinit__(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None): | ||
def __reduce__(self): | ||
"""Makes an in-memory BloomFilter pickleable.""" | ||
callable = BloomFilter | ||
args = (self.capacity, self.error_rate, None, None, self.hash_seeds, self.data_array) | ||
return (callable, args) | ||
|
||
|
||
def __cinit__(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None, data_array=None): | ||
self._closed = 0 | ||
self._in_memory = 0 | ||
self._oflags = os.O_RDWR | ||
|
||
if capacity is NoConstruct: | ||
return | ||
|
||
self._create(capacity, error_rate, filename, perm, hash_seeds) | ||
self._create(capacity, error_rate, filename, perm, hash_seeds, data_array) | ||
|
||
def _create(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None): | ||
|
||
def _create(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None, data_array=None): | ||
cdef char * seeds | ||
cdef char * data = NULL | ||
cdef long long num_bits | ||
|
||
if data_array is not None: | ||
if filename: | ||
raise ValueError("data_array cannot be used for an mmapped filter.") | ||
if hash_seeds is None: | ||
raise ValueError("hash_seeds must be specified if a data_array is provided.") | ||
|
||
# Make sure that if the filename is defined, that the | ||
# file exists | ||
if filename and os.path.exists(filename): | ||
|
@@ -135,6 +152,10 @@ cdef class BloomFilter: | |
# Minimum bit vector of 128 bits | ||
num_bits = max(num_hashes * bits_per_hash,128) | ||
|
||
# Override calculated capacity if we are provided a data array | ||
if data_array is not None: | ||
num_bits = 8 * len(data_array) | ||
|
||
# print("k = %d m = %d n = %d p ~= %.8f" % ( | ||
# num_hashes, num_bits, capacity, | ||
# (1.0 - math.exp(- float(num_hashes) * float(capacity) / num_bits)) | ||
|
@@ -153,18 +174,21 @@ cdef class BloomFilter: | |
num_hashes) | ||
else: | ||
self._in_memory = 1 | ||
if data_array is not None: | ||
data = data_array | ||
self._bf = cbloomfilter.bloomfilter_Create_Malloc(capacity, | ||
error_rate, | ||
num_bits, | ||
<int *>seeds, | ||
num_hashes) | ||
num_hashes, <const char *>data) | ||
if self._bf is NULL: | ||
if filename: | ||
raise OSError(errno, '%s: %s' % (os.strerror(errno), | ||
filename)) | ||
else: | ||
cpython.PyErr_NoMemory() | ||
|
||
|
||
def _open(self, filename, mode="rw"): | ||
# Should not overwrite | ||
mode = mode.replace("+", "") | ||
|
@@ -202,6 +226,19 @@ cdef class BloomFilter: | |
arr = (<char *>cbloomfilter.mbarray_CharData(self._bf.array))[start_pos:end_pos] | ||
return int.from_bytes(arr, byteorder="big", signed=False) | ||
|
||
@property | ||
def data_array(self): | ||
"""Bytes array of the Bloom filter contents. | ||
""" | ||
self._assert_open() | ||
start_pos = self._bf.array.preamblebytes | ||
end_pos = start_pos + self._bf.array.bytes | ||
arr = array.array('B') | ||
arr.frombytes( | ||
(<char *>cbloomfilter.mbarray_CharData(self._bf.array))[start_pos:end_pos] | ||
) | ||
return bytes(arr) | ||
|
||
@property | ||
def hash_seeds(self): | ||
"""Integer seeds used for the random hashing. Returns a list of integers.""" | ||
|
@@ -340,7 +377,12 @@ cdef class BloomFilter: | |
item = item_.encode() | ||
key.shash = item | ||
key.nhash = len(item) | ||
elif isinstance(item_, bytes): | ||
item = item_.encode() | ||
key.shash = item | ||
key.nhash = len(item) | ||
else: | ||
# Warning! Only works reliably for objects whose hash is based on value not memory address. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If I'm not mistaken, this basically means: class KlassX:
pass
class KlassHashable:
def __hash__(self):
return 123
bf.add(KlassX()) # this won't work.
bf.add(KlassHashable()) # this will "work" but not in a way we want? right? |
||
item = item_ | ||
key.shash = NULL | ||
key.nhash = hash(item) | ||
|
@@ -391,6 +433,10 @@ cdef class BloomFilter: | |
item = item_.encode() | ||
key.shash = item | ||
key.nhash = len(item) | ||
elif isinstance(item_, bytes): | ||
item = item_.encode() | ||
key.shash = item | ||
key.nhash = len(item) | ||
else: | ||
item = item_ | ||
key.shash = NULL | ||
|
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you move this to below the
if (!array)
check?