From 4c4f922ecd0c12b55300b1f07e03dad83674b12d Mon Sep 17 00:00:00 2001 From: Ben Schulz <110361904+ben-schulz-mh@users.noreply.github.com> Date: Fri, 27 Dec 2024 13:23:48 +1000 Subject: [PATCH 1/7] feat: add SHA256 hashing method to inline stream maps --- singer_sdk/mapper.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/singer_sdk/mapper.py b/singer_sdk/mapper.py index 004d0f60a..f0d9265d4 100644 --- a/singer_sdk/mapper.py +++ b/singer_sdk/mapper.py @@ -63,6 +63,12 @@ def md5(string: str) -> str: return hashlib.md5(string.encode("utf-8")).hexdigest() # noqa: S324 +def sha256(value: str) -> str: + """Digest a string using SHA256. This is a function for inline calculations.""" + + return hashlib.sha256(value.encode("utf-8")).hexdigest() + + StreamMapsDict: TypeAlias = dict[str, t.Union[str, dict, None]] @@ -307,6 +313,7 @@ def functions(self) -> dict[str, t.Callable]: """ funcs: dict[str, t.Any] = simpleeval.DEFAULT_FUNCTIONS.copy() funcs["md5"] = md5 + funcs["sha256"] = sha256 funcs["datetime"] = datetime funcs["bool"] = bool funcs["json"] = json From 83245230bdeab2e1de6755ca59290379f2671569 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 27 Dec 2024 03:25:31 +0000 Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- singer_sdk/mapper.py | 1 - 1 file changed, 1 deletion(-) diff --git a/singer_sdk/mapper.py b/singer_sdk/mapper.py index f0d9265d4..3e6895eb4 100644 --- a/singer_sdk/mapper.py +++ b/singer_sdk/mapper.py @@ -65,7 +65,6 @@ def md5(string: str) -> str: def sha256(value: str) -> str: """Digest a string using SHA256. 
This is a function for inline calculations.""" - return hashlib.sha256(value.encode("utf-8")).hexdigest() From fea67b47180923cc4d53d1b899c8bbe38fd93084 Mon Sep 17 00:00:00 2001 From: Ben Schulz <110361904+ben-schulz-mh@users.noreply.github.com> Date: Fri, 27 Dec 2024 13:29:36 +1000 Subject: [PATCH 3/7] fix up docstring --- singer_sdk/mapper.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/singer_sdk/mapper.py b/singer_sdk/mapper.py index 3e6895eb4..994ce0cb5 100644 --- a/singer_sdk/mapper.py +++ b/singer_sdk/mapper.py @@ -64,7 +64,13 @@ def md5(string: str) -> str: def sha256(value: str) -> str: - """Digest a string using SHA256. This is a function for inline calculations.""" + """Digest a string using SHA256. This is a function for inline calculations. + + Args: + string: String to digest. + Returns: + A string digested into SHA256. + """ return hashlib.sha256(value.encode("utf-8")).hexdigest() From ae8e12c5de22ed1cf246cf55bf2e0534fc643300 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 27 Dec 2024 03:29:44 +0000 Subject: [PATCH 4/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- singer_sdk/mapper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/singer_sdk/mapper.py b/singer_sdk/mapper.py index 994ce0cb5..c2ace170d 100644 --- a/singer_sdk/mapper.py +++ b/singer_sdk/mapper.py @@ -68,6 +68,7 @@ def sha256(value: str) -> str: Args: string: String to digest. + Returns: A string digested into SHA256. 
""" From 9ca6c7cba99a26feb6ed32f09ec22b4bf87a960a Mon Sep 17 00:00:00 2001 From: Ben Schulz <110361904+ben-schulz-mh@users.noreply.github.com> Date: Fri, 27 Dec 2024 13:31:15 +1000 Subject: [PATCH 5/7] rename variable to match docstring --- singer_sdk/mapper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/singer_sdk/mapper.py b/singer_sdk/mapper.py index c2ace170d..d528831a1 100644 --- a/singer_sdk/mapper.py +++ b/singer_sdk/mapper.py @@ -63,7 +63,7 @@ def md5(string: str) -> str: return hashlib.md5(string.encode("utf-8")).hexdigest() # noqa: S324 -def sha256(value: str) -> str: +def sha256(string: str) -> str: """Digest a string using SHA256. This is a function for inline calculations. Args: @@ -72,7 +72,7 @@ def sha256(value: str) -> str: Returns: A string digested into SHA256. """ - return hashlib.sha256(value.encode("utf-8")).hexdigest() + return hashlib.sha256(string.encode("utf-8")).hexdigest() StreamMapsDict: TypeAlias = dict[str, t.Union[str, dict, None]] From cf708ccbf2f717a2190c36f17133e0fe8113198e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edgar=20Ram=C3=ADrez-Mondrag=C3=B3n?= Date: Fri, 27 Dec 2024 11:35:21 -0600 Subject: [PATCH 6/7] chore: Add docs and test for `sha256` stream maps function --- docs/stream_maps.md | 11 ++++++----- tests/core/test_mapper.py | 1 + .../snapshots/mapped_stream/only_mapped_fields.jsonl | 8 ++++---- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/docs/stream_maps.md b/docs/stream_maps.md index 7c65b88c8..a6f60b259 100644 --- a/docs/stream_maps.md +++ b/docs/stream_maps.md @@ -230,11 +230,12 @@ can be referenced directly by mapping expressions. 
The following functions and namespaces are available for use in mapping expressions: -| Function | Description | -| :------------------------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [`md5()`](inv:python:py:module:#hashlib) | Returns an inline MD5 hash of any string, outputting the string representation of the hash's hex digest. This is defined by the SDK internally with native python: [`hashlib.md5(.encode("utf-8")).hexdigest()`](inv:python:py:method:#hashlib.hash.hexdigest). | -| [`datetime`](inv:python:py:module:#datetime) | This is the datetime module object from the Python standard library. You can access [`datetime.datetime`](inv:python:py:class:#datetime.datetime), [`datetime.timedelta`](inv:python:py:class:#datetime.timedelta), etc. | -| [`json`](inv:python:py:module:#json) | This is the json module object from the Python standard library. Primarily used for calling [`json.dumps()`](inv:python:py:function:#json.dumps) and [`json.loads()`](inv:python:py:function:#json.loads). | +| Function | Description | +| :------------------------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [`md5()`](inv:python:py:module:#hashlib) | Returns an inline MD5 hash of any string, outputting the string representation of the hash's hex digest. This is defined by the SDK internally with native python: [`hashlib.md5(.encode("utf-8")).hexdigest()`](inv:python:py:method:#hashlib.hash.hexdigest). 
| +| [`sha256()`](inv:python:py:module:#hashlib) | Returns an inline SHA256 hash of any string, outputting the string representation of the hash's hex digest. This is defined by the SDK internally with native python: [`hashlib.sha256(.encode("utf-8")).hexdigest()`](inv:python:py:method:#hashlib.hash.hexdigest). | +| [`datetime`](inv:python:py:module:#datetime) | This is the datetime module object from the Python standard library. You can access [`datetime.datetime`](inv:python:py:class:#datetime.datetime), [`datetime.timedelta`](inv:python:py:class:#datetime.timedelta), etc. | +| [`json`](inv:python:py:module:#json) | This is the json module object from the Python standard library. Primarily used for calling [`json.dumps()`](inv:python:py:function:#json.dumps) and [`json.loads()`](inv:python:py:function:#json.loads). | #### Built-in Variable Names diff --git a/tests/core/test_mapper.py b/tests/core/test_mapper.py index 0ade78823..192fc2718 100644 --- a/tests/core/test_mapper.py +++ b/tests/core/test_mapper.py @@ -704,6 +704,7 @@ def discover_streams(self): { "mystream": { "email_hash": "md5(email)", + "email_hash_sha256": "sha256(email)", "fixed_count": "int(count-1)", "__else__": None, }, diff --git a/tests/snapshots/mapped_stream/only_mapped_fields.jsonl b/tests/snapshots/mapped_stream/only_mapped_fields.jsonl index ef70c9aac..c02e689c7 100644 --- a/tests/snapshots/mapped_stream/only_mapped_fields.jsonl +++ b/tests/snapshots/mapped_stream/only_mapped_fields.jsonl @@ -1,6 +1,6 @@ {"type":"STATE","value":{}} -{"type":"SCHEMA","stream":"mystream","schema":{"type":"object","properties":{"email_hash":{"type":["string","null"]},"fixed_count":{"type":["integer","null"]}},"$schema":"https://json-schema.org/draft/2020-12/schema"},"key_properties":[]} -{"type":"RECORD","stream":"mystream","record":{"email_hash":"c160f8cc69a4f0bf2b0362752353d060","fixed_count":20},"time_extracted":"2022-01-01T00:00:00+00:00"} 
-{"type":"RECORD","stream":"mystream","record":{"email_hash":"4b9bb80620f03eb3719e0a061c14283d","fixed_count":12},"time_extracted":"2022-01-01T00:00:00+00:00"} -{"type":"RECORD","stream":"mystream","record":{"email_hash":"426b189df1e2f359efe6ee90f2d2030f","fixed_count":18},"time_extracted":"2022-01-01T00:00:00+00:00"} +{"type":"SCHEMA","stream":"mystream","schema":{"type":"object","properties":{"email_hash":{"type":["string","null"]},"email_hash_sha256":{"type":["string","null"]},"fixed_count":{"type":["integer","null"]}},"$schema":"https://json-schema.org/draft/2020-12/schema"},"key_properties":[]} +{"type":"RECORD","stream":"mystream","record":{"email_hash":"c160f8cc69a4f0bf2b0362752353d060","email_hash_sha256":"ff8d9819fc0e12bf0d24892e45987e249a28dce836a85cad60e28eaaa8c6d976","fixed_count":20},"time_extracted":"2022-01-01T00:00:00+00:00"} +{"type":"RECORD","stream":"mystream","record":{"email_hash":"4b9bb80620f03eb3719e0a061c14283d","email_hash_sha256":"5ff860bf1190596c7188ab851db691f0f3169c453936e9e1eba2f9a47f7a0018","fixed_count":12},"time_extracted":"2022-01-01T00:00:00+00:00"} +{"type":"RECORD","stream":"mystream","record":{"email_hash":"426b189df1e2f359efe6ee90f2d2030f","email_hash_sha256":"add7232b65bb559f896cbcfa9a600170a7ca381a0366789dcf59ad986bdf4a98","fixed_count":18},"time_extracted":"2022-01-01T00:00:00+00:00"} {"type":"STATE","value":{"bookmarks":{"mystream":{}}}} From a6708181467531f239bef079efa47ba29cd46085 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edgar=20Ram=C3=ADrez-Mondrag=C3=B3n?= Date: Fri, 27 Dec 2024 11:42:20 -0600 Subject: [PATCH 7/7] docs: Link to `hashlib.md5` and `hashlib.sha256` respectively --- docs/stream_maps.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/stream_maps.md b/docs/stream_maps.md index a6f60b259..9df083e9b 100644 --- a/docs/stream_maps.md +++ b/docs/stream_maps.md @@ -230,12 +230,12 @@ can be referenced directly by mapping expressions. 
The following functions and namespaces are available for use in mapping expressions: -| Function | Description | -| :------------------------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [`md5()`](inv:python:py:module:#hashlib) | Returns an inline MD5 hash of any string, outputting the string representation of the hash's hex digest. This is defined by the SDK internally with native python: [`hashlib.md5(.encode("utf-8")).hexdigest()`](inv:python:py:method:#hashlib.hash.hexdigest). | -| [`sha256()`](inv:python:py:module:#hashlib) | Returns an inline SHA256 hash of any string, outputting the string representation of the hash's hex digest. This is defined by the SDK internally with native python: [`hashlib.sha256(.encode("utf-8")).hexdigest()`](inv:python:py:method:#hashlib.hash.hexdigest). | -| [`datetime`](inv:python:py:module:#datetime) | This is the datetime module object from the Python standard library. You can access [`datetime.datetime`](inv:python:py:class:#datetime.datetime), [`datetime.timedelta`](inv:python:py:class:#datetime.timedelta), etc. | -| [`json`](inv:python:py:module:#json) | This is the json module object from the Python standard library. Primarily used for calling [`json.dumps()`](inv:python:py:function:#json.dumps) and [`json.loads()`](inv:python:py:function:#json.loads). 
| +| Function | Description | +| :--------------------------------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [`md5()`](inv:python:py:function:#hashlib.md5) | Returns an inline MD5 hash of any string, outputting the string representation of the hash's hex digest. This is defined by the SDK internally with native python: [`hashlib.md5(.encode("utf-8")).hexdigest()`](inv:python:py:method:#hashlib.hash.hexdigest). | +| [`sha256()`](inv:python:py:function:#hashlib.sha256) | Returns an inline SHA256 hash of any string, outputting the string representation of the hash's hex digest. This is defined by the SDK internally with native python: [`hashlib.sha256(.encode("utf-8")).hexdigest()`](inv:python:py:method:#hashlib.hash.hexdigest). | +| [`datetime`](inv:python:py:module:#datetime) | This is the datetime module object from the Python standard library. You can access [`datetime.datetime`](inv:python:py:class:#datetime.datetime), [`datetime.timedelta`](inv:python:py:class:#datetime.timedelta), etc. | +| [`json`](inv:python:py:module:#json) | This is the json module object from the Python standard library. Primarily used for calling [`json.dumps()`](inv:python:py:function:#json.dumps) and [`json.loads()`](inv:python:py:function:#json.loads). | #### Built-in Variable Names