From 36a14d13fe443da657880fca91c7c58597553da7 Mon Sep 17 00:00:00 2001
From: tjacovich <tjacovich@cfa.harvard.edu>
Date: Fri, 1 Mar 2024 15:42:47 -0500
Subject: [PATCH 1/9] Add initial test of scix_id to pipeline utils.

---
 SciXPipelineUtils/scix_id.py | 172 +++++++++++++++++++++++++++++++++++
 pyproject.toml               |   1 +
 2 files changed, 173 insertions(+)
 create mode 100644 SciXPipelineUtils/scix_id.py

diff --git a/SciXPipelineUtils/scix_id.py b/SciXPipelineUtils/scix_id.py
new file mode 100644
index 0000000..7d08b8a
--- /dev/null
+++ b/SciXPipelineUtils/scix_id.py
@@ -0,0 +1,172 @@
+"""
+base32-crockford
+================
+
+A Python module implementing the alternate base32 encoding as described
+by Douglas Crockford at: http://www.crockford.com/wrmg/base32.html.
+
+He designed the encoding to:
+
+   * Be human and machine readable
+   * Be compact
+   * Be error resistant
+   * Be pronounceable
+
+It uses a symbol set of 10 digits and 22 letters, excluding I, L, O and
+U. Decoding is not case sensitive, and 'i' and 'l' are converted to '1'
+and 'o' is converted to '0'. Encoding uses only upper-case characters.
+
+Hyphens may be present in symbol strings to improve readability, and
+are removed when decoding.
+
+A check symbol can be appended to a symbol string to detect errors
+within the string.
+
+"""
+
+import re
+import sys
+
+PY3 = sys.version_info[0] == 3
+
+if not PY3:
+    import string as str
+
+
+__all__ = ["encode", "decode", "normalize"]
+
+
+if PY3:
+    string_types = str,
+else:
+    string_types = basestring,
+
+# The encoded symbol space does not include I, L, O or U
+symbols = '0123456789ABCDEFGHJKMNPQRSTVWXYZ'
+# These five symbols are exclusively for checksum values
+check_symbols = '*~$=U'
+
+encode_symbols = dict((i, ch) for (i, ch) in enumerate(symbols + check_symbols))
+decode_symbols = dict((ch, i) for (i, ch) in enumerate(symbols + check_symbols))
+normalize_symbols = str.maketrans('IiLlOo', '111100')
+valid_symbols = re.compile('^[%s]+[%s]?$' % (symbols,
+                                             re.escape(check_symbols)))
+
+base = len(symbols)
+check_base = len(symbols + check_symbols)
+
+
+def encode(number, checksum=False, split=0):
+    """Encode an integer into a symbol string.
+
+    A ValueError is raised on invalid input.
+
+    If checksum is set to True, a check symbol will be
+    calculated and appended to the string.
+
+    If split is specified, the string will be divided into
+    clusters of that size separated by hyphens.
+
+    The encoded string is returned.
+    """
+    number = int(number)
+    if number < 0:
+        raise ValueError("number '%d' is not a positive integer" % number)
+
+    split = int(split)
+    if split < 0:
+        raise ValueError("split '%d' is not a positive integer" % split)
+
+    check_symbol = ''
+    if checksum:
+        check_symbol = encode_symbols[number % check_base]
+
+    if number == 0:
+        return '0' + check_symbol
+
+    symbol_string = ''
+    while number > 0:
+        remainder = number % base
+        number //= base
+        symbol_string = encode_symbols[remainder] + symbol_string
+    symbol_string = symbol_string + check_symbol
+
+    if split:
+        chunks = []
+        for pos in range(0, len(symbol_string), split):
+            chunks.append(symbol_string[pos:pos + split])
+        symbol_string = '-'.join(chunks)
+
+    return symbol_string
+
+
+def decode(symbol_string, checksum=False, strict=False):
+    """Decode an encoded symbol string.
+
+    If checksum is set to True, the string is assumed to have a
+    trailing check symbol which will be validated. If the
+    checksum validation fails, a ValueError is raised.
+
+    If strict is set to True, a ValueError is raised if the
+    normalization step requires changes to the string.
+
+    The decoded string is returned.
+    """
+    symbol_string = normalize(symbol_string, strict=strict)
+    if checksum:
+        symbol_string, check_symbol = symbol_string[:-1], symbol_string[-1]
+
+    number = 0
+    for symbol in symbol_string:
+        number = number * base + decode_symbols[symbol]
+
+    if checksum:
+        check_value = decode_symbols[check_symbol]
+        modulo = number % check_base
+        if check_value != modulo:
+            raise ValueError("invalid check symbol '%s' for string '%s'" %
+                             (check_symbol, symbol_string))
+
+    return number
+
+
+def normalize(symbol_string, strict=False):
+    """Normalize an encoded symbol string.
+
+    Normalization provides error correction and prepares the
+    string for decoding. These transformations are applied:
+
+       1. Hyphens are removed
+       2. 'I', 'i', 'L' or 'l' are converted to '1'
+       3. 'O' or 'o' are converted to '0'
+       4. All characters are converted to uppercase
+
+    A TypeError is raised if an invalid string type is provided.
+
+    A ValueError is raised if the normalized string contains
+    invalid characters.
+
+    If the strict parameter is set to True, a ValueError is raised
+    if any of the above transformations are applied.
+
+    The normalized string is returned.
+    """
+    if isinstance(symbol_string, string_types):
+        if not PY3:
+            try:
+                symbol_string = symbol_string.encode('ascii')
+            except UnicodeEncodeError:
+                raise ValueError("string should only contain ASCII characters")
+    else:
+        raise TypeError("string is of invalid type %s" %
+                        symbol_string.__class__.__name__)
+
+    norm_string = symbol_string.replace('-', '').translate(normalize_symbols).upper()
+
+    if not valid_symbols.match(norm_string):
+        raise ValueError("string '%s' contains invalid characters" % norm_string)
+
+    if strict and norm_string != symbol_string:
+        raise ValueError("string '%s' requires normalization" % symbol_string)
+
+    return norm_string
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index c9f732c..0c21f74 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ packages = [
 dependencies = [
     'boto3==1.26.59',
     'avro==1.11.1',
+    'base32-crockford==0.3.0',
 ]
 
 [project.urls]

From 27d6f6819bfe23b01844435c7b638e6c63f22cbf Mon Sep 17 00:00:00 2001
From: tjacovich <tjacovich@cfa.harvard.edu>
Date: Wed, 22 May 2024 10:22:13 -0400
Subject: [PATCH 2/9] Updated scix-id module to pad ids to uniform arbitrary
 length.

---
 SciXPipelineUtils/s3_methods.py |  6 +++---
 SciXPipelineUtils/scix_id.py    | 14 +++++++++++---
 pyproject.toml                  |  3 +--
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/SciXPipelineUtils/s3_methods.py b/SciXPipelineUtils/s3_methods.py
index 5ce3a04..27f9f71 100644
--- a/SciXPipelineUtils/s3_methods.py
+++ b/SciXPipelineUtils/s3_methods.py
@@ -18,9 +18,9 @@ def __init__(self, provider, config):
         config: The imported Pipeline configuration
         """
         if provider == "AWS":
-            self.s3 = boto3.resource("s3")
-            self.bucket = self.s3.Bucket(config.get("AWS_BUCKET_NAME"))
-        else:
+        #     self.s3 = boto3.resource("s3")
+        #     self.bucket = self.s3.Bucket(config.get("AWS_BUCKET_NAME"))
+        # else:
             self.s3 = boto3.resource(
                 "s3",
                 endpoint_url=config.get(str(provider) + "_S3_URL"),
diff --git a/SciXPipelineUtils/scix_id.py b/SciXPipelineUtils/scix_id.py
index 7d08b8a..6849cb3 100644
--- a/SciXPipelineUtils/scix_id.py
+++ b/SciXPipelineUtils/scix_id.py
@@ -56,7 +56,7 @@
 check_base = len(symbols + check_symbols)
 
 
-def encode(number, checksum=False, split=0):
+def encode(number, checksum=False, split=0, string_length=8):
     """Encode an integer into a symbol string.
 
     A ValueError is raised on invalid input.
@@ -67,6 +67,11 @@ def encode(number, checksum=False, split=0):
     If split is specified, the string will be divided into
     clusters of that size separated by hyphens.
 
+    The param string_length causes the returned value to be padded 
+    with 0s if the returned string is shorter than the requested 
+    length (ie. 01 becomes 00000001 for the default string length). 
+    This includes the checksum if specified.
+
     The encoded string is returned.
     """
     number = int(number)
@@ -82,20 +87,23 @@ def encode(number, checksum=False, split=0):
         check_symbol = encode_symbols[number % check_base]
 
     if number == 0:
-        return '0' + check_symbol
+        symbol_string = '0'
 
     symbol_string = ''
     while number > 0:
         remainder = number % base
         number //= base
         symbol_string = encode_symbols[remainder] + symbol_string
-    symbol_string = symbol_string + check_symbol
+
+    symbol_string = str(symbol_string).zfill(string_length-int(checksum))
 
     if split:
         chunks = []
         for pos in range(0, len(symbol_string), split):
             chunks.append(symbol_string[pos:pos + split])
         symbol_string = '-'.join(chunks)
+        symbol_string = symbol_string + check_symbol
+
 
     return symbol_string
 
diff --git a/pyproject.toml b/pyproject.toml
index 0c21f74..55db90b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "SciXPipelineUtils"
-version = "0.0.2"
+version = "0.0.3"
 description = "A collection of utilities for the new NASA Science Explorer (NASA SciX) backoffice architecture"
 authors = [{ name = "Taylor Jacovich", email = "tjacovich@cfa.harvard.edu"}]
 license = { text = "GPL-3.0" }
@@ -21,7 +21,6 @@ packages = [
 dependencies = [
     'boto3==1.26.59',
     'avro==1.11.1',
-    'base32-crockford==0.3.0',
 ]
 
 [project.urls]

From 1c1e6959732dccf450c9f0860cb8fd239e53b0d0 Mon Sep 17 00:00:00 2001
From: tjacovich <tjacovich@cfa.harvard.edu>
Date: Wed, 22 May 2024 10:26:19 -0400
Subject: [PATCH 3/9] Remove comment for AWS S3 provider.

---
 SciXPipelineUtils/s3_methods.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/SciXPipelineUtils/s3_methods.py b/SciXPipelineUtils/s3_methods.py
index 27f9f71..5ce3a04 100644
--- a/SciXPipelineUtils/s3_methods.py
+++ b/SciXPipelineUtils/s3_methods.py
@@ -18,9 +18,9 @@ def __init__(self, provider, config):
         config: The imported Pipeline configuration
         """
         if provider == "AWS":
-        #     self.s3 = boto3.resource("s3")
-        #     self.bucket = self.s3.Bucket(config.get("AWS_BUCKET_NAME"))
-        # else:
+            self.s3 = boto3.resource("s3")
+            self.bucket = self.s3.Bucket(config.get("AWS_BUCKET_NAME"))
+        else:
             self.s3 = boto3.resource(
                 "s3",
                 endpoint_url=config.get(str(provider) + "_S3_URL"),

From 83090280e21ea0937a3f54bf46473d5163cf2103 Mon Sep 17 00:00:00 2001
From: tjacovich <tjacovich@cfa.harvard.edu>
Date: Wed, 22 May 2024 11:25:02 -0400
Subject: [PATCH 4/9] Added linting fixes.

---
 .flake8                      | 10 ++++++++
 SciXPipelineUtils/scix_id.py | 47 ++++++++++++++++--------------------
 2 files changed, 31 insertions(+), 26 deletions(-)
 create mode 100644 .flake8

diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..e151dac
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,10 @@
+[flake8]
+ignore = E203
+exclude =
+    .git,
+    __pycache__,
+    docs/source/conf.py,
+    build,
+    dist
+max-complexity = 10
+max-line-length = 79
diff --git a/SciXPipelineUtils/scix_id.py b/SciXPipelineUtils/scix_id.py
index 6849cb3..cdf1b1e 100644
--- a/SciXPipelineUtils/scix_id.py
+++ b/SciXPipelineUtils/scix_id.py
@@ -36,21 +36,17 @@
 __all__ = ["encode", "decode", "normalize"]
 
 
-if PY3:
-    string_types = str,
-else:
-    string_types = basestring,
+string_types = (str,)
 
 # The encoded symbol space does not include I, L, O or U
-symbols = '0123456789ABCDEFGHJKMNPQRSTVWXYZ'
+symbols = "0123456789ABCDEFGHJKMNPQRSTVWXYZ"
 # These five symbols are exclusively for checksum values
-check_symbols = '*~$=U'
+check_symbols = "*~$=U"
 
 encode_symbols = dict((i, ch) for (i, ch) in enumerate(symbols + check_symbols))
 decode_symbols = dict((ch, i) for (i, ch) in enumerate(symbols + check_symbols))
-normalize_symbols = str.maketrans('IiLlOo', '111100')
-valid_symbols = re.compile('^[%s]+[%s]?$' % (symbols,
-                                             re.escape(check_symbols)))
+normalize_symbols = str.maketrans("IiLlOo", "111100")
+valid_symbols = re.compile("^[%s]+[%s]?$" % (symbols, re.escape(check_symbols)))
 
 base = len(symbols)
 check_base = len(symbols + check_symbols)
@@ -67,9 +63,9 @@ def encode(number, checksum=False, split=0, string_length=8):
     If split is specified, the string will be divided into
     clusters of that size separated by hyphens.
 
-    The param string_length causes the returned value to be padded 
-    with 0s if the returned string is shorter than the requested 
-    length (ie. 01 becomes 00000001 for the default string length). 
+    The param string_length causes the returned value to be padded
+    with 0s if the returned string is shorter than the requested
+    length (e.g. 01 becomes 00000001 when string_length is 8).
     This includes the checksum if specified.
 
     The encoded string is returned.
@@ -82,29 +78,28 @@ def encode(number, checksum=False, split=0, string_length=8):
     if split < 0:
         raise ValueError("split '%d' is not a positive integer" % split)
 
-    check_symbol = ''
+    check_symbol = ""
     if checksum:
         check_symbol = encode_symbols[number % check_base]
 
     if number == 0:
-        symbol_string = '0'
+        symbol_string = "0"
 
-    symbol_string = ''
+    symbol_string = ""
     while number > 0:
         remainder = number % base
         number //= base
         symbol_string = encode_symbols[remainder] + symbol_string
 
-    symbol_string = str(symbol_string).zfill(string_length-int(checksum))
+    symbol_string = str(symbol_string).zfill(string_length - int(checksum))
 
     if split:
         chunks = []
         for pos in range(0, len(symbol_string), split):
-            chunks.append(symbol_string[pos:pos + split])
-        symbol_string = '-'.join(chunks)
+            chunks.append(symbol_string[pos : pos + split])
+        symbol_string = "-".join(chunks)
         symbol_string = symbol_string + check_symbol
 
-
     return symbol_string
 
 
@@ -132,8 +127,9 @@ def decode(symbol_string, checksum=False, strict=False):
         check_value = decode_symbols[check_symbol]
         modulo = number % check_base
         if check_value != modulo:
-            raise ValueError("invalid check symbol '%s' for string '%s'" %
-                             (check_symbol, symbol_string))
+            raise ValueError(
+                "invalid check symbol '%s' for string '%s'" % (check_symbol, symbol_string)
+            )
 
     return number
 
@@ -162,14 +158,13 @@ def normalize(symbol_string, strict=False):
     if isinstance(symbol_string, string_types):
         if not PY3:
             try:
-                symbol_string = symbol_string.encode('ascii')
+                symbol_string = symbol_string.encode("ascii")
             except UnicodeEncodeError:
                 raise ValueError("string should only contain ASCII characters")
     else:
-        raise TypeError("string is of invalid type %s" %
-                        symbol_string.__class__.__name__)
+        raise TypeError("string is of invalid type %s" % symbol_string.__class__.__name__)
 
-    norm_string = symbol_string.replace('-', '').translate(normalize_symbols).upper()
+    norm_string = symbol_string.replace("-", "").translate(normalize_symbols).upper()
 
     if not valid_symbols.match(norm_string):
         raise ValueError("string '%s' contains invalid characters" % norm_string)
@@ -177,4 +172,4 @@ def normalize(symbol_string, strict=False):
     if strict and norm_string != symbol_string:
         raise ValueError("string '%s' requires normalization" % symbol_string)
 
-    return norm_string
\ No newline at end of file
+    return norm_string

From fb8cb06a8746478fb09c8ec76778fe76213268e7 Mon Sep 17 00:00:00 2001
From: tjacovich <tjacovich@cfa.harvard.edu>
Date: Wed, 22 May 2024 13:40:18 -0400
Subject: [PATCH 5/9] Remove Python2.7 compatibility code.

---
 SciXPipelineUtils/scix_id.py | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/SciXPipelineUtils/scix_id.py b/SciXPipelineUtils/scix_id.py
index cdf1b1e..4b27a74 100644
--- a/SciXPipelineUtils/scix_id.py
+++ b/SciXPipelineUtils/scix_id.py
@@ -25,19 +25,9 @@
 """
 
 import re
-import sys
-
-PY3 = sys.version_info[0] == 3
-
-if not PY3:
-    import string as str
-
 
 __all__ = ["encode", "decode", "normalize"]
 
-
-string_types = (str,)
-
 # The encoded symbol space does not include I, L, O or U
 symbols = "0123456789ABCDEFGHJKMNPQRSTVWXYZ"
 # These five symbols are exclusively for checksum values
@@ -155,14 +145,6 @@ def normalize(symbol_string, strict=False):
 
     The normalized string is returned.
     """
-    if isinstance(symbol_string, string_types):
-        if not PY3:
-            try:
-                symbol_string = symbol_string.encode("ascii")
-            except UnicodeEncodeError:
-                raise ValueError("string should only contain ASCII characters")
-    else:
-        raise TypeError("string is of invalid type %s" % symbol_string.__class__.__name__)
 
     norm_string = symbol_string.replace("-", "").translate(normalize_symbols).upper()
 

From 90e07531baac7e244dbaa53a439c38122bc70cd6 Mon Sep 17 00:00:00 2001
From: tjacovich <tjacovich@cfa.harvard.edu>
Date: Thu, 10 Oct 2024 13:52:00 -0400
Subject: [PATCH 6/9] Updated encode defaults: checksum=True, split=4, string_length=12

---
 SciXPipelineUtils/scix_id.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SciXPipelineUtils/scix_id.py b/SciXPipelineUtils/scix_id.py
index 4b27a74..7b5da44 100644
--- a/SciXPipelineUtils/scix_id.py
+++ b/SciXPipelineUtils/scix_id.py
@@ -42,7 +42,7 @@
 check_base = len(symbols + check_symbols)
 
 
-def encode(number, checksum=False, split=0, string_length=8):
+def encode(number, checksum=True, split=4, string_length=12):
     """Encode an integer into a symbol string.
 
     A ValueError is raised on invalid input.

From 8c10c4e422fa0aca704d362f64487dacd02e6e67 Mon Sep 17 00:00:00 2001
From: tjacovich <tjacovich@cfa.harvard.edu>
Date: Thu, 10 Oct 2024 13:57:38 -0400
Subject: [PATCH 7/9] Add scix-id tests

---
 tests/test_scix_id.py | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 tests/test_scix_id.py

diff --git a/tests/test_scix_id.py b/tests/test_scix_id.py
new file mode 100644
index 0000000..22a23d8
--- /dev/null
+++ b/tests/test_scix_id.py
@@ -0,0 +1,11 @@
+from unittest import TestCase
+
+import SciXPipelineUtils.scix_id as scixid
+
+
+class TestSciXUUIDImplementation(TestCase):
+    def test_generate_uuid7(self):
+        test_id = scixid.encode(1000)
+        self.assertEqual(test_id, "0000-0000-0Z81")
+        test_int = scixid.decode(test_id)
+        self.assertEqual(test_int, 1000)

From 2adb9fead0106fac79542706da0a0f57db17008c Mon Sep 17 00:00:00 2001
From: tjacovich <tjacovich@cfa.harvard.edu>
Date: Thu, 10 Oct 2024 14:12:25 -0400
Subject: [PATCH 8/9] Added tests. Updated default checksum status for decode.

---
 SciXPipelineUtils/scix_id.py | 2 +-
 pyproject.toml               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/SciXPipelineUtils/scix_id.py b/SciXPipelineUtils/scix_id.py
index 7b5da44..57084b6 100644
--- a/SciXPipelineUtils/scix_id.py
+++ b/SciXPipelineUtils/scix_id.py
@@ -93,7 +93,7 @@ def encode(number, checksum=True, split=4, string_length=12):
     return symbol_string
 
 
-def decode(symbol_string, checksum=False, strict=False):
+def decode(symbol_string, checksum=True, strict=False):
     """Decode an encoded symbol string.
 
     If checksum is set to True, the string is assumed to have a
diff --git a/pyproject.toml b/pyproject.toml
index 55db90b..afa4159 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,7 +41,7 @@ dev = [
     'pytest-cov==4.0.0',
     'moto==4.1.3',
     'confluent-kafka==1.9.2',
-    'fastavro==1.7.2',
+    'fastavro==1.9.7',
 ]
 
 [tool.pytest.ini_options]

From dc70e899e302678a53777358177aab6d041f330a Mon Sep 17 00:00:00 2001
From: tjacovich <tjacovich@cfa.harvard.edu>
Date: Thu, 10 Oct 2024 14:18:26 -0400
Subject: [PATCH 9/9] Fixed test names. Added No checksum test.

---
 tests/test_scix_id.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tests/test_scix_id.py b/tests/test_scix_id.py
index 22a23d8..80e140d 100644
--- a/tests/test_scix_id.py
+++ b/tests/test_scix_id.py
@@ -3,9 +3,15 @@
 import SciXPipelineUtils.scix_id as scixid
 
 
-class TestSciXUUIDImplementation(TestCase):
-    def test_generate_uuid7(self):
+class TestSciXIDImplementation(TestCase):
+    def test_generate_scixid(self):
         test_id = scixid.encode(1000)
         self.assertEqual(test_id, "0000-0000-0Z81")
         test_int = scixid.decode(test_id)
         self.assertEqual(test_int, 1000)
+
+    def test_generate_scixid_no_checksum(self):
+        test_id = scixid.encode(1000, checksum=False)
+        self.assertEqual(test_id, "0000-0000-00Z8")
+        test_int = scixid.decode(test_id, checksum=False)
+        self.assertEqual(test_int, 1000)