From 073562149dae66964a2dbe1d8c39f4ffc65946a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1s=20Far=C3=ADas=20Santana?= Date: Thu, 16 Nov 2023 18:51:01 +0100 Subject: [PATCH] fix: More Redshift whitespace handling (#18688) --- .../test_redshift_batch_export_workflow.py | 24 +++++++-------- .../workflows/redshift_batch_export.py | 29 +++++-------------- 2 files changed, 20 insertions(+), 33 deletions(-) diff --git a/posthog/temporal/tests/batch_exports/test_redshift_batch_export_workflow.py b/posthog/temporal/tests/batch_exports/test_redshift_batch_export_workflow.py index 03b19700b9910..6a0448b24cfd5 100644 --- a/posthog/temporal/tests/batch_exports/test_redshift_batch_export_workflow.py +++ b/posthog/temporal/tests/batch_exports/test_redshift_batch_export_workflow.py @@ -26,7 +26,7 @@ RedshiftBatchExportWorkflow, RedshiftInsertInputs, insert_into_redshift_activity, - translate_whitespace_recursive, + remove_escaped_whitespace_recursive, ) REQUIRED_ENV_VARS = ( @@ -65,7 +65,7 @@ async def assert_events_in_redshift(connection, schema, table_name, events, excl continue raw_properties = event.get("properties", None) - properties = translate_whitespace_recursive(raw_properties) if raw_properties else None + properties = remove_escaped_whitespace_recursive(raw_properties) if raw_properties else None elements_chain = event.get("elements_chain", None) expected_event = { "distinct_id": event.get("distinct_id"), @@ -116,7 +116,7 @@ def redshift_config(): return { "user": user, "password": password, - "database": "posthog_batch_exports_test", + "database": "posthog_batch_exports_test_2", "schema": "exports_test_schema", "host": host, "port": int(port), @@ -362,14 +362,14 @@ async def test_redshift_export_workflow( "value,expected", [ ([1, 2, 3], [1, 2, 3]), - ("\n\n", ""), - ([["\t\n"]], [[""]]), - (("\t\n",), ("",)), - ({"\t\n"}, {""}), - ({"key": "\t\n"}, {"key": ""}), - ({"key": ["\t\n"]}, {"key": [""]}), + ("hi\t\n\r\f\bhi", "hi hi"), + ([["\t\n\r\f\b"]], [[""]]), + (("\t\n\r\f\b",), ("",)), + ({"\t\n\r\f\b"}, {""}), + ({"key": "\t\n\r\f\b"}, {"key": ""}), + ({"key": ["\t\n\r\f\b"]}, {"key": [""]}), ], ) -def test_translate_whitespace_recursive(value, expected): - """Test we translate some whitespace values.""" - assert translate_whitespace_recursive(value) == expected +def test_remove_escaped_whitespace_recursive(value, expected): + """Test we remove some whitespace values.""" + assert remove_escaped_whitespace_recursive(value) == expected diff --git a/posthog/temporal/workflows/redshift_batch_export.py b/posthog/temporal/workflows/redshift_batch_export.py index 74ad617e8b9da..4f107b571a5d0 100644 --- a/posthog/temporal/workflows/redshift_batch_export.py +++ b/posthog/temporal/workflows/redshift_batch_export.py @@ -31,22 +31,9 @@ postgres_connection, ) -WHITESPACE_TRANSLATE = str.maketrans( - { - whitespace_char: None - for whitespace_char in ( - "\n", - "\t", - "\f", - "\r", - "\b", - ) - } -) - -def translate_whitespace_recursive(value): - """Translate all whitespace from given value using WHITESPACE_TRANSLATE. +def remove_escaped_whitespace_recursive(value): + """Remove all escaped whitespace characters from given value. PostgreSQL supports constant escaped strings by appending an E' to each string that contains whitespace in them (amongst other characters). See: @@ -61,22 +48,22 @@ def translate_whitespace_recursive(value): """ match value: case str(s): - return s.translate(WHITESPACE_TRANSLATE) + return " ".join(s.replace("\b", " ").split()) case bytes(b): - return b.decode("utf-8").translate(WHITESPACE_TRANSLATE).encode("utf-8") + return remove_escaped_whitespace_recursive(b.decode("utf-8")) case [*sequence]: # mypy could be bugged as it's raising a Statement unreachable error. # But we are definitely reaching this statement in tests; hence the ignore comment. # Maybe: https://github.com/python/mypy/issues/16272. - return type(value)(translate_whitespace_recursive(sequence_value) for sequence_value in sequence) # type: ignore + return type(value)(remove_escaped_whitespace_recursive(sequence_value) for sequence_value in sequence) # type: ignore case set(elements): - return set(translate_whitespace_recursive(element) for element in elements) + return set(remove_escaped_whitespace_recursive(element) for element in elements) case {**mapping}: - return {k: translate_whitespace_recursive(v) for k, v in mapping.items()} + return {k: remove_escaped_whitespace_recursive(v) for k, v in mapping.items()} case value: return value @@ -292,7 +279,7 @@ async def insert_into_redshift_activity(inputs: RedshiftInsertInputs): def map_to_record(row: dict) -> dict: """Map row to a record to insert to Redshift.""" return { - key: json.dumps(translate_whitespace_recursive(row[key])) + key: json.dumps(remove_escaped_whitespace_recursive(row[key])) if key in json_columns and row[key] is not None else row[key] for key in schema_columns