From 36a14d13fe443da657880fca91c7c58597553da7 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Fri, 1 Mar 2024 15:42:47 -0500 Subject: [PATCH 1/9] Add initial test of scix_id to pipeline utils. --- SciXPipelineUtils/scix_id.py | 172 +++++++++++++++++++++++++++++++++++ pyproject.toml | 1 + 2 files changed, 173 insertions(+) create mode 100644 SciXPipelineUtils/scix_id.py diff --git a/SciXPipelineUtils/scix_id.py b/SciXPipelineUtils/scix_id.py new file mode 100644 index 0000000..7d08b8a --- /dev/null +++ b/SciXPipelineUtils/scix_id.py @@ -0,0 +1,172 @@ +""" +base32-crockford +================ + +A Python module implementing the alternate base32 encoding as described +by Douglas Crockford at: http://www.crockford.com/wrmg/base32.html. + +He designed the encoding to: + + * Be human and machine readable + * Be compact + * Be error resistant + * Be pronounceable + +It uses a symbol set of 10 digits and 22 letters, excluding I, L O and +U. Decoding is not case sensitive, and 'i' and 'l' are converted to '1' +and 'o' is converted to '0'. Encoding uses only upper-case characters. + +Hyphens may be present in symbol strings to improve readability, and +are removed when decoding. + +A check symbol can be appended to a symbol string to detect errors +within the string. + +""" + +import re +import sys + +PY3 = sys.version_info[0] == 3 + +if not PY3: + import string as str + + +__all__ = ["encode", "decode", "normalize"] + + +if PY3: + string_types = str, +else: + string_types = basestring, + +# The encoded symbol space does not include I, L, O or U +symbols = '0123456789ABCDEFGHJKMNPQRSTVWXYZ' +# These five symbols are exclusively for checksum values +check_symbols = '*~$=U' + +encode_symbols = dict((i, ch) for (i, ch) in enumerate(symbols + check_symbols)) +decode_symbols = dict((ch, i) for (i, ch) in enumerate(symbols + check_symbols)) +normalize_symbols = str.maketrans('IiLlOo', '111100') +valid_symbols = re.compile('^[%s]+[%s]?$' % (symbols, + re.escape(check_symbols))) + +base = len(symbols) +check_base = len(symbols + check_symbols) + + +def encode(number, checksum=False, split=0): + """Encode an integer into a symbol string. + + A ValueError is raised on invalid input. + + If checksum is set to True, a check symbol will be + calculated and appended to the string. + + If split is specified, the string will be divided into + clusters of that size separated by hyphens. + + The encoded string is returned. + """ + number = int(number) + if number < 0: + raise ValueError("number '%d' is not a positive integer" % number) + + split = int(split) + if split < 0: + raise ValueError("split '%d' is not a positive integer" % split) + + check_symbol = '' + if checksum: + check_symbol = encode_symbols[number % check_base] + + if number == 0: + return '0' + check_symbol + + symbol_string = '' + while number > 0: + remainder = number % base + number //= base + symbol_string = encode_symbols[remainder] + symbol_string + symbol_string = symbol_string + check_symbol + + if split: + chunks = [] + for pos in range(0, len(symbol_string), split): + chunks.append(symbol_string[pos:pos + split]) + symbol_string = '-'.join(chunks) + + return symbol_string + + +def decode(symbol_string, checksum=False, strict=False): + """Decode an encoded symbol string. + + If checksum is set to True, the string is assumed to have a + trailing check symbol which will be validated. If the + checksum validation fails, a ValueError is raised. + + If strict is set to True, a ValueError is raised if the + normalization step requires changes to the string. + + The decoded string is returned. + """ + symbol_string = normalize(symbol_string, strict=strict) + if checksum: + symbol_string, check_symbol = symbol_string[:-1], symbol_string[-1] + + number = 0 + for symbol in symbol_string: + number = number * base + decode_symbols[symbol] + + if checksum: + check_value = decode_symbols[check_symbol] + modulo = number % check_base + if check_value != modulo: + raise ValueError("invalid check symbol '%s' for string '%s'" % + (check_symbol, symbol_string)) + + return number + + +def normalize(symbol_string, strict=False): + """Normalize an encoded symbol string. + + Normalization provides error correction and prepares the + string for decoding. These transformations are applied: + + 1. Hyphens are removed + 2. 'I', 'i', 'L' or 'l' are converted to '1' + 3. 'O' or 'o' are converted to '0' + 4. All characters are converted to uppercase + + A TypeError is raised if an invalid string type is provided. + + A ValueError is raised if the normalized string contains + invalid characters. + + If the strict parameter is set to True, a ValueError is raised + if any of the above transformations are applied. + + The normalized string is returned. + """ + if isinstance(symbol_string, string_types): + if not PY3: + try: + symbol_string = symbol_string.encode('ascii') + except UnicodeEncodeError: + raise ValueError("string should only contain ASCII characters") + else: + raise TypeError("string is of invalid type %s" % + symbol_string.__class__.__name__) + + norm_string = symbol_string.replace('-', '').translate(normalize_symbols).upper() + + if not valid_symbols.match(norm_string): + raise ValueError("string '%s' contains invalid characters" % norm_string) + + if strict and norm_string != symbol_string: + raise ValueError("string '%s' requires normalization" % symbol_string) + + return norm_string \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index c9f732c..0c21f74 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ packages = [ dependencies = [ 'boto3==1.26.59', 'avro==1.11.1', + 'base32-crockford==0.3.0', ] [project.urls] From 27d6f6819bfe23b01844435c7b638e6c63f22cbf Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 22 May 2024 10:22:13 -0400 Subject: [PATCH 2/9] Updated scix-id module to pad ids to uniform arbitrary length. --- SciXPipelineUtils/s3_methods.py | 6 +++--- SciXPipelineUtils/scix_id.py | 14 +++++++++++--- pyproject.toml | 3 +-- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/SciXPipelineUtils/s3_methods.py b/SciXPipelineUtils/s3_methods.py index 5ce3a04..27f9f71 100644 --- a/SciXPipelineUtils/s3_methods.py +++ b/SciXPipelineUtils/s3_methods.py @@ -18,9 +18,9 @@ def __init__(self, provider, config): config: The imported Pipeline configuration """ if provider == "AWS": - self.s3 = boto3.resource("s3") - self.bucket = self.s3.Bucket(config.get("AWS_BUCKET_NAME")) - else: + # self.s3 = boto3.resource("s3") + # self.bucket = self.s3.Bucket(config.get("AWS_BUCKET_NAME")) + # else: self.s3 = boto3.resource( "s3", endpoint_url=config.get(str(provider) + "_S3_URL"), diff --git a/SciXPipelineUtils/scix_id.py b/SciXPipelineUtils/scix_id.py index 7d08b8a..6849cb3 100644 --- a/SciXPipelineUtils/scix_id.py +++ b/SciXPipelineUtils/scix_id.py @@ -56,7 +56,7 @@ check_base = len(symbols + check_symbols) -def encode(number, checksum=False, split=0): +def encode(number, checksum=False, split=0, string_length=8): """Encode an integer into a symbol string. A ValueError is raised on invalid input. @@ -67,6 +67,11 @@ def encode(number, checksum=False, split=0): If split is specified, the string will be divided into clusters of that size separated by hyphens. + The param string_length causes the returned value to be padded + with 0s if the returned string is shorter than the requested + length (ie. 01 becomes 00000001 for the default string length). + This includes the checksum if specified. + The encoded string is returned. """ number = int(number) @@ -82,20 +87,23 @@ def encode(number, checksum=False, split=0): check_symbol = encode_symbols[number % check_base] if number == 0: - return '0' + check_symbol + symbol_string = '0' symbol_string = '' while number > 0: remainder = number % base number //= base symbol_string = encode_symbols[remainder] + symbol_string - symbol_string = symbol_string + check_symbol + + symbol_string = str(symbol_string).zfill(string_length-int(checksum)) if split: chunks = [] for pos in range(0, len(symbol_string), split): chunks.append(symbol_string[pos:pos + split]) symbol_string = '-'.join(chunks) + symbol_string = symbol_string + check_symbol + return symbol_string diff --git a/pyproject.toml b/pyproject.toml index 0c21f74..55db90b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "SciXPipelineUtils" -version = "0.0.2" +version = "0.0.3" description = "A collection of utilities for the new NASA Science Explorer (NASA SciX) backoffice architecture" authors = [{ name = "Taylor Jacovich", email = "tjacovich@cfa.harvard.edu"}] license = { text = "GPL-3.0" } @@ -21,7 +21,6 @@ packages = [ dependencies = [ 'boto3==1.26.59', 'avro==1.11.1', - 'base32-crockford==0.3.0', ] [project.urls] From 1c1e6959732dccf450c9f0860cb8fd239e53b0d0 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 22 May 2024 10:26:19 -0400 Subject: [PATCH 3/9] Remove comment for AWS S3 provider. --- SciXPipelineUtils/s3_methods.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/SciXPipelineUtils/s3_methods.py b/SciXPipelineUtils/s3_methods.py index 27f9f71..5ce3a04 100644 --- a/SciXPipelineUtils/s3_methods.py +++ b/SciXPipelineUtils/s3_methods.py @@ -18,9 +18,9 @@ def __init__(self, provider, config): config: The imported Pipeline configuration """ if provider == "AWS": - # self.s3 = boto3.resource("s3") - # self.bucket = self.s3.Bucket(config.get("AWS_BUCKET_NAME")) - # else: + self.s3 = boto3.resource("s3") + self.bucket = self.s3.Bucket(config.get("AWS_BUCKET_NAME")) + else: self.s3 = boto3.resource( "s3", endpoint_url=config.get(str(provider) + "_S3_URL"), From 83090280e21ea0937a3f54bf46473d5163cf2103 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 22 May 2024 11:25:02 -0400 Subject: [PATCH 4/9] Added linting fixes. --- .flake8 | 10 ++++++++ SciXPipelineUtils/scix_id.py | 47 ++++++++++++++++-------------------- 2 files changed, 31 insertions(+), 26 deletions(-) create mode 100644 .flake8 diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..e151dac --- /dev/null +++ b/.flake8 @@ -0,0 +1,10 @@ +[flake8] +ignore = E203 +exclude = + .git, + __pycache__, + docs/source/conf.py, + build, + dist +max-complexity = 10 +max-line-length = 79 diff --git a/SciXPipelineUtils/scix_id.py b/SciXPipelineUtils/scix_id.py index 6849cb3..cdf1b1e 100644 --- a/SciXPipelineUtils/scix_id.py +++ b/SciXPipelineUtils/scix_id.py @@ -36,21 +36,17 @@ __all__ = ["encode", "decode", "normalize"] -if PY3: - string_types = str, -else: - string_types = basestring, +string_types = (str,) # The encoded symbol space does not include I, L, O or U -symbols = '0123456789ABCDEFGHJKMNPQRSTVWXYZ' +symbols = "0123456789ABCDEFGHJKMNPQRSTVWXYZ" # These five symbols are exclusively for checksum values -check_symbols = '*~$=U' +check_symbols = "*~$=U" encode_symbols = dict((i, ch) for (i, ch) in enumerate(symbols + check_symbols)) decode_symbols = dict((ch, i) for (i, ch) in enumerate(symbols + check_symbols)) -normalize_symbols = str.maketrans('IiLlOo', '111100') -valid_symbols = re.compile('^[%s]+[%s]?$' % (symbols, - re.escape(check_symbols))) +normalize_symbols = str.maketrans("IiLlOo", "111100") +valid_symbols = re.compile("^[%s]+[%s]?$" % (symbols, re.escape(check_symbols))) base = len(symbols) check_base = len(symbols + check_symbols) @@ -67,9 +63,9 @@ def encode(number, checksum=False, split=0, string_length=8): If split is specified, the string will be divided into clusters of that size separated by hyphens. - The param string_length causes the returned value to be padded - with 0s if the returned string is shorter than the requested - length (ie. 01 becomes 00000001 for the default string length). + The param string_length causes the returned value to be padded + with 0s if the returned string is shorter than the requested + length (ie. 01 becomes 00000001 for the default string length). This includes the checksum if specified. The encoded string is returned. @@ -82,29 +78,28 @@ def encode(number, checksum=False, split=0, string_length=8): if split < 0: raise ValueError("split '%d' is not a positive integer" % split) - check_symbol = '' + check_symbol = "" if checksum: check_symbol = encode_symbols[number % check_base] if number == 0: - symbol_string = '0' + symbol_string = "0" - symbol_string = '' + symbol_string = "" while number > 0: remainder = number % base number //= base symbol_string = encode_symbols[remainder] + symbol_string - symbol_string = str(symbol_string).zfill(string_length-int(checksum)) + symbol_string = str(symbol_string).zfill(string_length - int(checksum)) if split: chunks = [] for pos in range(0, len(symbol_string), split): - chunks.append(symbol_string[pos:pos + split]) - symbol_string = '-'.join(chunks) + chunks.append(symbol_string[pos : pos + split]) + symbol_string = "-".join(chunks) symbol_string = symbol_string + check_symbol - return symbol_string @@ -132,8 +127,9 @@ def decode(symbol_string, checksum=False, strict=False): check_value = decode_symbols[check_symbol] modulo = number % check_base if check_value != modulo: - raise ValueError("invalid check symbol '%s' for string '%s'" % - (check_symbol, symbol_string)) + raise ValueError( + "invalid check symbol '%s' for string '%s'" % (check_symbol, symbol_string) + ) return number @@ -162,14 +158,13 @@ def normalize(symbol_string, strict=False): if isinstance(symbol_string, string_types): if not PY3: try: - symbol_string = symbol_string.encode('ascii') + symbol_string = symbol_string.encode("ascii") except UnicodeEncodeError: raise ValueError("string should only contain ASCII characters") else: - raise TypeError("string is of invalid type %s" % - symbol_string.__class__.__name__) + raise TypeError("string is of invalid type %s" % symbol_string.__class__.__name__) - norm_string = symbol_string.replace('-', '').translate(normalize_symbols).upper() + norm_string = symbol_string.replace("-", "").translate(normalize_symbols).upper() if not valid_symbols.match(norm_string): raise ValueError("string '%s' contains invalid characters" % norm_string) @@ -177,4 +172,4 @@ def normalize(symbol_string, strict=False): if strict and norm_string != symbol_string: raise ValueError("string '%s' requires normalization" % symbol_string) - return norm_string \ No newline at end of file + return norm_string From fb8cb06a8746478fb09c8ec76778fe76213268e7 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 22 May 2024 13:40:18 -0400 Subject: [PATCH 5/9] Remove Python2.7 compatibility code. --- SciXPipelineUtils/scix_id.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/SciXPipelineUtils/scix_id.py b/SciXPipelineUtils/scix_id.py index cdf1b1e..4b27a74 100644 --- a/SciXPipelineUtils/scix_id.py +++ b/SciXPipelineUtils/scix_id.py @@ -25,19 +25,9 @@ """ import re -import sys - -PY3 = sys.version_info[0] == 3 - -if not PY3: - import string as str - __all__ = ["encode", "decode", "normalize"] - -string_types = (str,) - # The encoded symbol space does not include I, L, O or U symbols = "0123456789ABCDEFGHJKMNPQRSTVWXYZ" # These five symbols are exclusively for checksum values @@ -155,14 +145,6 @@ def normalize(symbol_string, strict=False): The normalized string is returned. """ - if isinstance(symbol_string, string_types): - if not PY3: - try: - symbol_string = symbol_string.encode("ascii") - except UnicodeEncodeError: - raise ValueError("string should only contain ASCII characters") - else: - raise TypeError("string is of invalid type %s" % symbol_string.__class__.__name__) norm_string = symbol_string.replace("-", "").translate(normalize_symbols).upper() From 90e07531baac7e244dbaa53a439c38122bc70cd6 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Thu, 10 Oct 2024 13:52:00 -0400 Subject: [PATCH 6/9] Updated encode --- SciXPipelineUtils/scix_id.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SciXPipelineUtils/scix_id.py b/SciXPipelineUtils/scix_id.py index 4b27a74..7b5da44 100644 --- a/SciXPipelineUtils/scix_id.py +++ b/SciXPipelineUtils/scix_id.py @@ -42,7 +42,7 @@ check_base = len(symbols + check_symbols) -def encode(number, checksum=False, split=0, string_length=8): +def encode(number, checksum=True, split=4, string_length=12): """Encode an integer into a symbol string. A ValueError is raised on invalid input. From 8c10c4e422fa0aca704d362f64487dacd02e6e67 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Thu, 10 Oct 2024 13:57:38 -0400 Subject: [PATCH 7/9] Add scix-id tests --- tests/test_scix_id.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 tests/test_scix_id.py diff --git a/tests/test_scix_id.py b/tests/test_scix_id.py new file mode 100644 index 0000000..22a23d8 --- /dev/null +++ b/tests/test_scix_id.py @@ -0,0 +1,11 @@ +from unittest import TestCase + +import SciXPipelineUtils.scix_id as scixid + + +class TestSciXUUIDImplementation(TestCase): + def test_generate_uuid7(self): + test_id = scixid.encode(1000) + self.assertEqual(test_id, "0000-0000-0Z81") + test_int = scixid.decode(test_id) + self.assertEqual(test_int, 1000) From 2adb9fead0106fac79542706da0a0f57db17008c Mon Sep 17 00:00:00 2001 From: tjacovich Date: Thu, 10 Oct 2024 14:12:25 -0400 Subject: [PATCH 8/9] Added tests. Updated default checksum status for decode. --- SciXPipelineUtils/scix_id.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/SciXPipelineUtils/scix_id.py b/SciXPipelineUtils/scix_id.py index 7b5da44..57084b6 100644 --- a/SciXPipelineUtils/scix_id.py +++ b/SciXPipelineUtils/scix_id.py @@ -93,7 +93,7 @@ def encode(number, checksum=True, split=4, string_length=12): return symbol_string -def decode(symbol_string, checksum=False, strict=False): +def decode(symbol_string, checksum=True, strict=False): """Decode an encoded symbol string. If checksum is set to True, the string is assumed to have a diff --git a/pyproject.toml b/pyproject.toml index 55db90b..afa4159 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,7 @@ dev = [ 'pytest-cov==4.0.0', 'moto==4.1.3', 'confluent-kafka==1.9.2', - 'fastavro==1.7.2', + 'fastavro==1.9.7', ] [tool.pytest.ini_options] From dc70e899e302678a53777358177aab6d041f330a Mon Sep 17 00:00:00 2001 From: tjacovich Date: Thu, 10 Oct 2024 14:18:26 -0400 Subject: [PATCH 9/9] Fixed test names. Added No checksum test. --- tests/test_scix_id.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/test_scix_id.py b/tests/test_scix_id.py index 22a23d8..80e140d 100644 --- a/tests/test_scix_id.py +++ b/tests/test_scix_id.py @@ -3,9 +3,15 @@ import SciXPipelineUtils.scix_id as scixid -class TestSciXUUIDImplementation(TestCase): - def test_generate_uuid7(self): +class TestSciXIDImplementation(TestCase): + def test_generate_scixid(self): test_id = scixid.encode(1000) self.assertEqual(test_id, "0000-0000-0Z81") test_int = scixid.decode(test_id) self.assertEqual(test_int, 1000) + + def test_generate_scixid_no_checksum(self): + test_id = scixid.encode(1000, checksum=False) + self.assertEqual(test_id, "0000-0000-00Z8") + test_int = scixid.decode(test_id, checksum=False) + self.assertEqual(test_int, 1000)