Skip to content

Commit

Permalink
Pass through to check docstrings are up to date. Closes #102
Browse files Browse the repository at this point in the history
  • Loading branch information
hardbyte committed Jun 27, 2018
1 parent a356ffc commit 03dbc2b
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 53 deletions.
68 changes: 32 additions & 36 deletions clkhash/bloomfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,19 +37,18 @@ def double_hash_encode_ngrams(ngrams, # type: Iterable[str]
encoding # type: str
):
# type: (...) -> bitarray
"""
Computes the double hash encoding of the provided ngrams with the given keys.
""" Computes the double hash encoding of the provided ngrams with the given keys.
Using the method from
http://www.record-linkage.de/-download=wp-grlc-2011-02.pdf
Using the method from
http://www.record-linkage.de/-download=wp-grlc-2011-02.pdf
:param ngrams: list of n-grams to be encoded
:param keys: hmac secret keys for md5 and sha1 as bytes
:param k: number of hash functions to use per element of the ngrams
:param l: length of the output bitarray
:param encoding: the encoding to use when turning the ngrams to bytes
:param ngrams: list of n-grams to be encoded
:param keys: hmac secret keys for md5 and sha1 as bytes
:param k: number of hash functions to use per element of the ngrams
:param l: length of the output bitarray
:param encoding: the encoding to use when turning the ngrams to bytes
:return: bitarray of length l with the bits set which correspond to the encoding of the ngrams
:return: bitarray of length l with the bits set which correspond to the encoding of the ngrams
"""
key_sha1, key_md5 = keys
bf = bitarray(l)
Expand Down Expand Up @@ -210,10 +209,9 @@ def blake_encode_ngrams(ngrams, # type: Iterable[str]


class NgramEncodings(Enum):
"""
Lists the available schemes for encoding n-grams.
""" The available schemes for encoding n-grams.
.. note::
..
the slightly awkward looking construction with the calls to partial and the overwrite of __call__ are due to
compatibility issues with Python 2.7.
"""
Expand Down Expand Up @@ -286,23 +284,22 @@ def crypto_bloom_filter(record, # type: Sequence[Text]
hash_properties # type: GlobalHashingProperties
):
# type: (...) -> Tuple[bitarray, Text, int]
"""
Makes a Bloom filter from a record with given tokenizers and lists of keys.
Using the method from
http://www.record-linkage.de/-download=wp-grlc-2011-02.pdf
:param record: plaintext record tuple. E.g. (index, name, dob, gender)
:param tokenizers: A list of tokenizers. A tokenizer is a function that
returns tokens from a string.
:param fields: A list of FieldSpec. One for each field.
:param keys: Keys for the hash functions as a tuple of lists of bytes.
:param hash_properties: Global hashing properties.
:return: 3-tuple:
- bloom filter for record as a bitarray
- first element of record (usually an index)
- number of bits set in the bloomfilter
""" Computes the composite Bloom filter encoding of a record.
Using the method from
http://www.record-linkage.de/-download=wp-grlc-2011-02.pdf
:param record: plaintext record tuple. E.g. (index, name, dob, gender)
:param tokenizers: A list of tokenizers. A tokenizer is a function that
returns tokens from a string.
:param fields: A list of FieldSpec. One for each field.
:param keys: Keys for the hash functions as a tuple of lists of bytes.
:param hash_properties: Global hashing properties.
:return: 3-tuple:
- bloom filter for record as a bitarray
- first element of record (usually an index)
- number of bits set in the bloomfilter
"""
xor_folds = hash_properties.xor_folds
hash_l = hash_properties.l * 2 ** xor_folds
Expand Down Expand Up @@ -333,13 +330,12 @@ def stream_bloom_filters(dataset, # type: Iterable[Sequence[Text]]
schema # type: Schema
):
# type: (...) -> Iterable[Tuple[bitarray, Text, int]]
"""
Yield bloom filters
""" Compute composite Bloom filters (CLKs) for every record in an iterable dataset.
:param dataset: An iterable of indexable records.
:param schema: An instantiated Schema instance
:param keys: A tuple of two lists of secret keys used in the HMAC.
:return: Yields bloom filters as 3-tuples
:param dataset: An iterable of indexable records.
:param schema: An instantiated Schema instance
:param keys: A tuple of two lists of secret keys used in the HMAC.
:return: Generator yielding bloom filters as 3-tuples
"""
tokenizers = [tokenizer.get_tokenizer(field.hashing_properties)
for field in schema.fields]
Expand Down
13 changes: 6 additions & 7 deletions clkhash/clk.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ def hash_and_serialize_chunk(chunk_pii_data, # type: Sequence[Sequence[str]]
set to one -- of the generated Bloom filters.
:param chunk_pii_data: An iterable of indexable records.
:param keys: A tuple of two lists of secret keys used in the HMAC.
:param Schema schema: Schema specifying the entry formats and
hashing settings.
:param keys: A tuple of two lists of secret keys used in the HMAC.
:return: A list of serialized Bloom filters and a list of corresponding popcounts
"""
clk_data = []
Expand All @@ -62,16 +62,15 @@ def generate_clk_from_csv(input_f, # type: TextIO
(a.k.a. popcount -- the number of bits set to high) of the
generated Bloom filters.
:param input_f: The CSV file to hash, as stream.
:param keys: A tuple of two lists of secret keys used in the
HMAC.
:param Schema schema: Schema specifying the entry formats and
:param input_f: A file-like object of csv data to hash.
:param keys: A tuple of two lists of secret keys.
:param schema: Schema specifying the record formats and
hashing settings.
:param bool validate: Set to `False` to disable validation of
:param validate: Set to `False` to disable validation of
data against the schema. Note that this will silence
warnings whose aim is to keep the hashes consistent between
data sources; this may affect linkage accuracy.
:param bool header: Set to `False` if the CSV file does not have
:param header: Set to `False` if the CSV file does not have
a header. Set to `'ignore'` if the CSV file does have a
header but it should not be checked against the schema.
:param bool progress_bar: Set to `False` to disable the progress
Expand Down
13 changes: 5 additions & 8 deletions clkhash/randomnames.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

def load_csv_data(resource_name):
# type: (str) -> List[str]
"""Loads a specified CSV data file and returns the first column as a Python list
""" Loads first column of specified CSV file from package data.
"""
data_bytes = pkgutil.get_data('clkhash', 'data/{}'.format(resource_name))
if data_bytes is None:
Expand Down Expand Up @@ -62,8 +62,7 @@ def save_csv(data, # type: Iterable[Tuple[Union[str, int], ...]]

def random_date(start, end):
# type: (datetime, datetime) -> datetime
"""
This function will return a random datetime between two datetime objects.
""" Generate a random datetime between two datetime objects.
:param start: datetime of start
:param end: datetime of end
Expand All @@ -76,7 +75,7 @@ def random_date(start, end):


class NameList:
""" List of randomly generated names.
""" Randomly generated PII records.
"""

with open(os.path.join(os.path.dirname(__file__),
Expand Down Expand Up @@ -130,12 +129,10 @@ def generate_random_person(self, n):

def load_names(self):
# type: () -> None
"""
This function loads a name database into globals firstNames and lastNames
""" Loads a name database from package data
initial version uses data files from
Uses data files sourced from
http://www.quietaffiliate.com/free-first-name-and-last-name-databases-csv-and-sql/
"""

self.all_male_first_names = load_csv_data('male-first-names.csv')
Expand Down
3 changes: 1 addition & 2 deletions clkhash/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,7 @@ def from_json_dict(cls, properties_dict):


class Schema(object):
""" Overall schema which describes how to hash plaintext identifiers
into clks.
"""Linkage Schema which describes how to encode plaintext identifiers.
:ivar version: Version for the schema. Needed to keep behaviour
consistent between clkhash versions for the same schema.
Expand Down

0 comments on commit 03dbc2b

Please sign in to comment.