Skip to content

Commit

Permalink
Automated code format update. (#142)
Browse files Browse the repository at this point in the history
  • Loading branch information
hardbyte authored Jun 26, 2018
1 parent 2f36551 commit afe37eb
Show file tree
Hide file tree
Showing 12 changed files with 139 additions and 142 deletions.
6 changes: 1 addition & 5 deletions clkhash/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
import pkg_resources

from . import bloomfilter
from . import field_formats
from . import key_derivation
from . import schema
from . import randomnames
from . import bloomfilter, field_formats, key_derivation, schema, randomnames

try:
__version__ = pkg_resources.get_distribution('clkhash').version
Expand Down
13 changes: 7 additions & 6 deletions clkhash/backports.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
import csv
from datetime import datetime
import re
import sys
import time
from datetime import datetime
from typing import AnyStr, Callable, cast, Pattern, Sequence, Text

from future.utils import raise_from as _raise_from
from mypy_extensions import Arg, DefaultNamedArg, NoReturn


try:
int_from_bytes = int.from_bytes
except AttributeError:
import codecs


def _int_from_bytes(bytes, byteorder, signed=False):
# type: (Sequence[int], str, bool) -> int
""" Emulate Python 3's `int.from_bytes`.
Expand All @@ -39,6 +39,7 @@ def _int_from_bytes(bytes, byteorder, signed=False):
hex_str = codecs.encode(bytes, 'hex') # type: ignore
return int(hex_str, 16)


# Make this cast since Python 2 doesn't have syntax for default
# named arguments. Hence, must cast so Mypy thinks it matches the
# original function.
Expand Down Expand Up @@ -99,10 +100,9 @@ def _p2_unicode_reader(unicode_csv_data, dialect=csv.excel, **kwargs):


unicode_reader = (_p2_unicode_reader # Python 2 with hacky workarounds.
if sys.version_info < (3,0)
if sys.version_info < (3, 0)
else csv.reader) # Py3 with native Unicode support.


if sys.version_info > (3, 2):
strftime = datetime.strftime

Expand All @@ -113,17 +113,19 @@ def _p2_unicode_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
# even number of '%'s before the 's' because those are all escaped.
_illegal_s = re.compile(r'((^|[^%])(%%)*%s)')


def _findall(text, substr):
# Also finds overlaps
i = 0
while True:
j = text.find(substr, i)
if j == -1:
return

yield j
i = j + 1


def strftime(dt, fmt):
# type: (datetime, Text) -> Text
""" strftime that supports years < 1900 in Python < 3.2.
Expand Down Expand Up @@ -177,7 +179,6 @@ def strftime(dt, fmt):
s = s[:site] + syear + s[site + _YEAR_LEN:]
return s


# Help MyPy understand that this always throws.
# `_raise_from` is `future.utils.raise_from`; the cast re-exports it under the
# public name `raise_from` with a signature whose return type is `NoReturn`,
# so the type checker treats code following a call as unreachable.
# The cast only affects static typing; the object bound here is `_raise_from`.
raise_from = cast(Callable[[BaseException, BaseException], NoReturn],
_raise_from)
56 changes: 28 additions & 28 deletions clkhash/bloomfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,21 @@
"""

import base64
from enum import Enum
from functools import partial
from hashlib import md5, sha1
import hmac
import math
import struct
from enum import Enum
from functools import partial
from hashlib import md5, sha1
from typing import Callable, Iterable, List, Sequence, Text, Tuple

from bitarray import bitarray
from future.builtins import range, zip

from clkhash import tokenizer
from clkhash.backports import int_from_bytes
from clkhash.schema import Schema, GlobalHashingProperties
from clkhash.field_formats import FieldSpec
from clkhash.schema import Schema, GlobalHashingProperties

try:
from hashlib import blake2b
Expand All @@ -30,11 +30,11 @@
# blake2b is already defined.


def double_hash_encode_ngrams(ngrams, # type: Iterable[str]
keys, # type: Sequence[bytes]
k, # type: int
l, # type: int
encoding # type: str
def double_hash_encode_ngrams(ngrams, # type: Iterable[str]
keys, # type: Sequence[bytes]
k, # type: int
l, # type: int
encoding # type: str
):
# type: (...) -> bitarray
"""
Expand Down Expand Up @@ -64,11 +64,11 @@ def double_hash_encode_ngrams(ngrams, # type: Iterable[str]
return bf


def double_hash_encode_ngrams_non_singular(ngrams, # type: Iterable[str]
keys, # type: Sequence[bytes]
k, # type: int
l, # type: int
encoding # type: str
def double_hash_encode_ngrams_non_singular(ngrams, # type: Iterable[str]
keys, # type: Sequence[bytes]
k, # type: int
l, # type: int
encoding # type: str
):
# type: (...) -> bitarray.bitarray
"""
Expand Down Expand Up @@ -131,11 +131,11 @@ def double_hash_encode_ngrams_non_singular(ngrams, # type: Iterable[str
return bf


def blake_encode_ngrams(ngrams, # type: Iterable[str]
keys, # type: Sequence[bytes]
k, # type: int
l, # type: int
encoding # type: str
def blake_encode_ngrams(ngrams, # type: Iterable[str]
keys, # type: Sequence[bytes]
k, # type: int
l, # type: int
encoding # type: str
):
# type: (...) -> bitarray.bitarray
"""
Expand Down Expand Up @@ -190,13 +190,13 @@ def blake_encode_ngrams(ngrams, # type: Iterable[str]
key, = keys # Unpack.

log_l = int(math.log(l, 2))
if not 2**log_l == l:
if not 2 ** log_l == l:
raise ValueError('parameter "l" has to be a power of two for the BLAKE2 encoding, but was: {}'.format(l))
bf = bitarray(l)
bf.setall(False)
if k < 1:
return bf
num_macs = (k+31) // 32
num_macs = (k + 31) // 32

for m in ngrams:
random_shorts = [] # type: List[int]
Expand Down Expand Up @@ -249,7 +249,7 @@ def from_properties(cls,


def fold_xor(bloomfilter, # type: bitarray
folds # type: int
folds # type: int
):
# type: (...) -> bitarray
""" Performs XOR folding on a Bloom filter.
Expand Down Expand Up @@ -279,10 +279,10 @@ def fold_xor(bloomfilter, # type: bitarray
return bloomfilter


def crypto_bloom_filter(record, # type: Sequence[Text]
tokenizers, # type: List[Callable[[Text], Iterable[Text]]]
fields, # type: Sequence[FieldSpec]
keys, # type: Sequence[Sequence[bytes]]
def crypto_bloom_filter(record, # type: Sequence[Text]
tokenizers, # type: List[Callable[[Text], Iterable[Text]]]
fields, # type: Sequence[FieldSpec]
keys, # type: Sequence[Sequence[bytes]]
hash_properties # type: GlobalHashingProperties
):
# type: (...) -> Tuple[bitarray, Text, int]
Expand Down Expand Up @@ -329,8 +329,8 @@ def crypto_bloom_filter(record, # type: Sequence[Text]


def stream_bloom_filters(dataset, # type: Iterable[Sequence[Text]]
keys, # type: Sequence[Sequence[bytes]]
schema # type: Schema
keys, # type: Sequence[Sequence[bytes]]
schema # type: Schema
):
# type: (...) -> Iterable[Tuple[bitarray, Text, int]]
"""
Expand Down
7 changes: 3 additions & 4 deletions clkhash/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
import clkhash
from clkhash import benchmark as bench, clk, randomnames, validate_data


DEFAULT_SERVICE_URL = 'https://es.data61.xyz'


Expand Down Expand Up @@ -41,7 +40,6 @@ def cli():
"""



@cli.command('hash', short_help="generate hashes from local PII data")
@click.argument('input', type=click.File('r'))
@click.argument('keys', nargs=2, type=click.Tuple([str, str]))
Expand Down Expand Up @@ -92,7 +90,7 @@ def hash(input, keys, schema, output, quiet, no_header, check_header, validate):

@cli.command('status', short_help='Get status of entity service')
@click.option('--server', type=str, default=DEFAULT_SERVICE_URL, help="Server address including protocol")
@click.option('-o','--output', type=click.File('w'), default='-')
@click.option('-o', '--output', type=click.File('w'), default='-')
@click.option('-v', '--verbose', default=False, is_flag=True, help="Script is more talkative")
def status(server, output, verbose):
"""Connect to an entity matching server and check the service status.
Expand Down Expand Up @@ -131,6 +129,7 @@ def status(server, output, verbose):
"""



@cli.command('create-project', short_help="create a linkage project on the entity service")
@click.option('--type', default='permutations',
type=click.Choice(['mapping', 'permutations', 'similarity_scores']),
Expand Down Expand Up @@ -226,7 +225,7 @@ def create(server, name, project, apikey, output, threshold, verbose):
@click.option('--project', help='Project identifier')
@click.option('--apikey', help='Authentication API key for the server.')
@click.option('--server', type=str, default=DEFAULT_SERVICE_URL, help="Server address including protocol")
@click.option('-o','--output', type=click.File('w'), default='-')
@click.option('-o', '--output', type=click.File('w'), default='-')
@click.option('-v', '--verbose', default=False, is_flag=True, help="Script is more talkative")
def upload(input, project, apikey, server, output, verbose):
"""Upload CLK data to entity matching server.
Expand Down
29 changes: 14 additions & 15 deletions clkhash/clk.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,14 @@
from clkhash.validate_data import (validate_entries, validate_header,
validate_row_lengths)


log = logging.getLogger('clkhash.clk')

CHUNK_SIZE = 1000


def hash_and_serialize_chunk(chunk_pii_data, # type: Sequence[Sequence[str]]
keys, # type: Sequence[Sequence[bytes]]
schema # type: Schema
keys, # type: Sequence[Sequence[bytes]]
schema # type: Schema
):
# type: (...) -> Tuple[List[str], Sequence[int]]
"""
Expand All @@ -49,12 +48,12 @@ def hash_and_serialize_chunk(chunk_pii_data, # type: Sequence[Sequence[str]]
return clk_data, clk_popcounts


def generate_clk_from_csv(input_f, # type: TextIO
keys, # type: Tuple[AnyStr, AnyStr]
schema, # type: Schema
validate=True, # type: bool
header=True, # type: Union[bool, AnyStr]
progress_bar=True # type: bool
def generate_clk_from_csv(input_f, # type: TextIO
keys, # type: Tuple[AnyStr, AnyStr]
schema, # type: Schema
validate=True, # type: bool
header=True, # type: Union[bool, AnyStr]
progress_bar=True # type: bool
):
# type: (...) -> List[str]
""" Generate Bloom filters from CSV file, then serialise them.
Expand Down Expand Up @@ -127,11 +126,11 @@ def callback(tics, clk_stats):
return results


def generate_clks(pii_data, # type: Sequence[Sequence[str]]
schema, # type: Schema
keys, # type: Tuple[AnyStr, AnyStr]
def generate_clks(pii_data, # type: Sequence[Sequence[str]]
schema, # type: Schema
keys, # type: Tuple[AnyStr, AnyStr]
validate=True, # type: bool
callback=None # type: Optional[Callable[[int, Sequence[int]], None]]
callback=None # type: Optional[Callable[[int, Sequence[int]], None]]
):
# type: (...) -> List[str]

Expand All @@ -158,7 +157,7 @@ def generate_clks(pii_data, # type: Sequence[Sequence[str]]
for chunk in chunks(pii_data, chunk_size):
future = executor.submit(
hash_and_serialize_chunk,
chunk, key_lists, schema,)
chunk, key_lists, schema, )
if callback is not None:
unpacked_callback = cast(Callable[[int, Sequence[int]], None],
callback)
Expand All @@ -175,7 +174,7 @@ def generate_clks(pii_data, # type: Sequence[Sequence[str]]
return results


T = TypeVar('T') # Declare generic type variable
T = TypeVar('T') # Declare generic type variable


def chunks(seq, chunk_size):
Expand Down
Loading

0 comments on commit afe37eb

Please sign in to comment.