Skip to content

Commit

Permalink
fix(batch-exports): Remove zero unicode in PostgreSQL (#27179)
Browse files (browse the repository at this point in the history)
  • Loading branch information
tomasfarias authored Dec 30, 2024
1 parent 75dea84 commit baf54a0
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 3 deletions.
6 changes: 5 additions & 1 deletion posthog/temporal/batch_exports/postgres_batch_export.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import asyncio
import collections.abc
import contextlib
import csv
Expand Down Expand Up @@ -387,7 +388,10 @@ async def copy_tsv_to_postgres(
fields=sql.SQL(",").join(sql.Identifier(column) for column in schema_columns),
)
) as copy:
while data := tsv_file.read():
while data := await asyncio.to_thread(tsv_file.read):
# \u0000 cannot be present in PostgreSQL's jsonb type, and will cause an error.
# See: https://www.postgresql.org/docs/17/datatype-json.html
data = data.replace(b"\\u0000", b"")
await copy.write(data)


Expand Down
14 changes: 12 additions & 2 deletions posthog/temporal/tests/batch_exports/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,9 +220,19 @@ def data_interval_end(request, interval):
return dt.datetime(2023, 4, 25, 15, 0, 0, tzinfo=dt.UTC)


@pytest.fixture
def test_properties(request):
    """Provide the event properties used for test data.

    Returns ``request.param`` when the fixture request exposes one;
    otherwise falls back to a default set of browser/OS properties.
    """
    default_properties = {"$browser": "Chrome", "$os": "Mac OS X"}
    # getattr with a default covers the AttributeError raised when the
    # request carries no ``param`` attribute.
    return getattr(request, "param", default_properties)


@pytest_asyncio.fixture
async def generate_test_data(
ateam, clickhouse_client, exclude_events, data_interval_start, data_interval_end, interval
ateam, clickhouse_client, exclude_events, data_interval_start, data_interval_end, interval, test_properties
):
"""Generate test data in ClickHouse."""
if interval != "every 5 minutes":
Expand All @@ -239,7 +249,7 @@ async def generate_test_data(
count_outside_range=10,
count_other_team=10,
duplicate=True,
properties={"$browser": "Chrome", "$os": "Mac OS X"},
properties=test_properties,
person_properties={"utm_medium": "referral", "$initial_os": "Linux"},
table=table,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,11 @@ async def assert_clickhouse_records_in_postgres(
# bq_ingested_timestamp cannot be compared as it comes from an unstable function.
continue

if isinstance(v, str):
v = v.replace("\\u0000", "")
elif isinstance(v, bytes):
v = v.replace(b"\\u0000", b"")

if k in {"properties", "set", "set_once", "person_properties", "elements"} and v is not None:
expected_record[k] = json.loads(v)
elif isinstance(v, dt.datetime):
Expand All @@ -159,6 +164,12 @@ async def assert_clickhouse_records_in_postgres(
assert inserted_records == expected_records


@pytest.fixture
def test_properties(request):
    r"""Provide event properties that include a NUL (U+0000) character.

    The "unicode" value is a single U+0000 character, which is written as the
    \u0000 escape sequence when the properties are serialized to JSON.
    """
    # The value below intentionally stays a non-raw string: it must be the
    # actual NUL character, not the six-character text of the escape.
    return {"$browser": "Chrome", "$os": "Mac OS X", "unicode": "\u0000"}


@pytest.fixture
def postgres_config():
return {
Expand Down

0 comments on commit baf54a0

Please sign in to comment.