diff --git a/clkhash/bloomfilter.py b/clkhash/bloomfilter.py index e7499fef..291fca67 100644 --- a/clkhash/bloomfilter.py +++ b/clkhash/bloomfilter.py @@ -37,19 +37,18 @@ def double_hash_encode_ngrams(ngrams, # type: Iterable[str] encoding # type: str ): # type: (...) -> bitarray - """ - Computes the double hash encoding of the provided ngrams with the given keys. + """ Computes the double hash encoding of the provided ngrams with the given keys. - Using the method from - http://www.record-linkage.de/-download=wp-grlc-2011-02.pdf + Using the method from + http://www.record-linkage.de/-download=wp-grlc-2011-02.pdf - :param ngrams: list of n-grams to be encoded - :param keys: hmac secret keys for md5 and sha1 as bytes - :param k: number of hash functions to use per element of the ngrams - :param l: length of the output bitarray - :param encoding: the encoding to use when turning the ngrams to bytes + :param ngrams: list of n-grams to be encoded + :param keys: hmac secret keys for md5 and sha1 as bytes + :param k: number of hash functions to use per element of the ngrams + :param l: length of the output bitarray + :param encoding: the encoding to use when turning the ngrams to bytes - :return: bitarray of length l with the bits set which correspond to the encoding of the ngrams + :return: bitarray of length l with the bits set which correspond to the encoding of the ngrams """ key_sha1, key_md5 = keys bf = bitarray(l) @@ -210,10 +209,9 @@ def blake_encode_ngrams(ngrams, # type: Iterable[str] class NgramEncodings(Enum): - """ - Lists the available schemes for encoding n-grams. + """ The available schemes for encoding n-grams. - .. note:: + .. the slightly awkward looking construction with the calls to partial and the overwrite of __call__ are due to compatibility issues with Python 2.7. """ @@ -286,23 +284,22 @@ def crypto_bloom_filter(record, # type: Sequence[Text] hash_properties # type: GlobalHashingProperties ): # type: (...) 
-> Tuple[bitarray, Text, int] - """ - Makes a Bloom filter from a record with given tokenizers and lists of keys. - - Using the method from - http://www.record-linkage.de/-download=wp-grlc-2011-02.pdf - - :param record: plaintext record tuple. E.g. (index, name, dob, gender) - :param tokenizers: A list of tokenizers. A tokenizer is a function that - returns tokens from a string. - :param fields: A list of FieldSpec. One for each field. - :param keys: Keys for the hash functions as a tuple of lists of bytes. - :param hash_properties: Global hashing properties. - - :return: 3-tuple: - - bloom filter for record as a bitarray - - first element of record (usually an index) - - number of bits set in the bloomfilter + """ Computes the composite Bloom filter encoding of a record. + + Using the method from + http://www.record-linkage.de/-download=wp-grlc-2011-02.pdf + + :param record: plaintext record tuple. E.g. (index, name, dob, gender) + :param tokenizers: A list of tokenizers. A tokenizer is a function that + returns tokens from a string. + :param fields: A list of FieldSpec. One for each field. + :param keys: Keys for the hash functions as a tuple of lists of bytes. + :param hash_properties: Global hashing properties. + + :return: 3-tuple: + - bloom filter for record as a bitarray + - first element of record (usually an index) + - number of bits set in the bloomfilter """ xor_folds = hash_properties.xor_folds hash_l = hash_properties.l * 2 ** xor_folds @@ -333,13 +330,12 @@ def stream_bloom_filters(dataset, # type: Iterable[Sequence[Text]] schema # type: Schema ): # type: (...) -> Iterable[Tuple[bitarray, Text, int]] - """ - Yield bloom filters + """ Compute composite Bloom filters (CLKs) for every record in an iterable dataset. - :param dataset: An iterable of indexable records. - :param schema: An instantiated Schema instance - :param keys: A tuple of two lists of secret keys used in the HMAC. 
- :return: Yields bloom filters as 3-tuples + :param dataset: An iterable of indexable records. + :param schema: An instantiated Schema instance + :param keys: A tuple of two lists of secret keys used in the HMAC. + :return: Generator yielding bloom filters as 3-tuples """ tokenizers = [tokenizer.get_tokenizer(field.hashing_properties) for field in schema.fields] diff --git a/clkhash/clk.py b/clkhash/clk.py index 7e3b20e0..dd059999 100644 --- a/clkhash/clk.py +++ b/clkhash/clk.py @@ -35,9 +35,9 @@ def hash_and_serialize_chunk(chunk_pii_data, # type: Sequence[Sequence[str]] set to one -- of the generated Bloom filters. :param chunk_pii_data: An iterable of indexable records. + :param keys: A tuple of two lists of secret keys used in the HMAC. :param Schema schema: Schema specifying the entry formats and hashing settings. - :param keys: A tuple of two lists of secret keys used in the HMAC. :return: A list of serialized Bloom filters and a list of corresponding popcounts """ clk_data = [] @@ -62,16 +62,15 @@ def generate_clk_from_csv(input_f, # type: TextIO (a.k.a popcount -- the number of bits set to high) of the generated Bloom filters. - :param input_f: The CSV file to hash, as stream. - :param keys: A tuple of two lists of secret keys used in the - HMAC. - :param Schema schema: Schema specifying the entry formats and + :param input_f: A file-like object of CSV data to hash. + :param keys: A tuple of two lists of secret keys. + :param schema: Schema specifying the record formats and hashing settings. - :param bool validate: Set to `False` to disable validation of + :param validate: Set to `False` to disable validation of data against the schema. Note that this will silence warnings whose aim is to keep the hashes consistent between data sources; this may affect linkage accuracy. - :param bool header: Set to `False` if the CSV file does not have + :param header: Set to `False` if the CSV file does not have a header. 
Set to `'ignore'` if the CSV file does have a header but it should not be checked against the schema. :param bool progress_bar: Set to `False` to disable the progress diff --git a/clkhash/randomnames.py b/clkhash/randomnames.py index 06310a03..acc5a816 100644 --- a/clkhash/randomnames.py +++ b/clkhash/randomnames.py @@ -30,7 +30,7 @@ def load_csv_data(resource_name): # type: (str) -> List[str] - """Loads a specified CSV data file and returns the first column as a Python list + """ Loads the first column of the specified CSV file from package data. """ data_bytes = pkgutil.get_data('clkhash', 'data/{}'.format(resource_name)) if data_bytes is None: @@ -62,8 +62,7 @@ def save_csv(data, # type: Iterable[Tuple[Union[str, int], ...]] def random_date(start, end): # type: (datetime, datetime) -> datetime - """ - This function will return a random datetime between two datetime objects. + """ Generate a random datetime between two datetime objects. :param start: datetime of start :param end: datetime of end @@ -76,7 +75,7 @@ def random_date(start, end): class NameList: - """ List of randomly generated names. + """ Randomly generated PII records. """ with open(os.path.join(os.path.dirname(__file__), @@ -130,12 +129,10 @@ def generate_random_person(self, n): def load_names(self): # type: () -> None - """ - This function loads a name database into globals firstNames and lastNames + """ Loads a name database from package data. - initial version uses data files from + Uses data files sourced from http://www.quietaffiliate.com/free-first-name-and-last-name-databases-csv-and-sql/ - """ self.all_male_first_names = load_csv_data('male-first-names.csv') diff --git a/clkhash/schema.py b/clkhash/schema.py index 8fe15e10..fcf799c9 100644 --- a/clkhash/schema.py +++ b/clkhash/schema.py @@ -125,8 +125,7 @@ def from_json_dict(cls, properties_dict): class Schema(object): - """ Overall schema which describes how to hash plaintext identifiers - into clks. 
+ """Linkage Schema which describes how to encode plaintext identifiers. :ivar version: Version for the schema. Needed to keep behaviour consistent between clkhash versions for the same schema.