Skip to content

Commit

Permalink
fix(batch-exports): replace invalid unicode with ?
Browse files Browse the repository at this point in the history
  • Loading branch information
bretthoerner committed Mar 26, 2024
1 parent a54ae88 commit 19a9068
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 2 deletions.
27 changes: 25 additions & 2 deletions posthog/temporal/batch_exports/temporary_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,25 @@
import pyarrow.parquet as pq


def replace_broken_unicode(obj):
    """Return a copy of ``obj`` in which un-encodable characters in strings become ``?``.

    Strings are round-tripped through UTF-8 using the ``"replace"`` error handler, so
    any code point that cannot be encoded (for example a lone surrogate) is replaced
    with a question mark. Lists, plain tuples and dicts (both keys and values) are
    rebuilt recursively; every other object is returned unchanged.

    Args:
        obj: Any value; typically a record made of nested dicts, lists and strings.

    Returns:
        The cleaned value. Containers are new objects of the same built-in type;
        the input is never mutated.
    """
    if isinstance(obj, str):
        return obj.encode("utf-8", "replace").decode("utf-8")
    if isinstance(obj, list):
        return [replace_broken_unicode(item) for item in obj]
    if type(obj) is tuple:
        # Only plain tuples are rebuilt; subclasses (e.g. namedtuples) are returned
        # unchanged below so that their type and constructor contract are preserved.
        return tuple(replace_broken_unicode(item) for item in obj)
    if isinstance(obj, dict):
        return {replace_broken_unicode(key): replace_broken_unicode(value) for key, value in obj.items()}
    return obj


def json_dumps_bytes(d) -> bytes:
    """Serialize ``d`` to JSON bytes with orjson, tolerating invalid unicode.

    The fast path is a plain ``orjson.dumps`` call with ``default=str``. If that
    raises ``UnicodeEncodeError`` or ``TypeError``, the value is passed through
    ``replace_broken_unicode`` and serialization is attempted once more.

    Args:
        d: The value to serialize.

    Returns:
        The JSON encoding of ``d`` as bytes.

    Raises:
        Whatever the second ``orjson.dumps`` call raises if the cleaned value still
        cannot be serialized.
    """
    try:
        return orjson.dumps(d, default=str)
    except (UnicodeEncodeError, TypeError):
        # orjson is very strict about invalid unicode. This slow path protects us against
        # things we've observed in practice, like single surrogate codes, e.g. "\ud83d"
        cleaned_d = replace_broken_unicode(d)
        return orjson.dumps(cleaned_d, default=str)


class BatchExportTemporaryFile:
Expand Down Expand Up @@ -131,7 +148,13 @@ def write_record_as_bytes(self, record: bytes):
def write_records_to_jsonl(self, records):
"""Write records to a temporary file as JSONL."""
if len(records) == 1:
jsonl_dump = orjson.dumps(records[0], option=orjson.OPT_APPEND_NEWLINE, default=str)
try:
jsonl_dump = orjson.dumps(records[0], option=orjson.OPT_APPEND_NEWLINE, default=str)
except (UnicodeEncodeError, TypeError):
# orjson is very strict about invalid unicode. This slow path protects us against
# things we've observed in practice, like single surrogate codes, e.g. "\ud83d"
cleaned_record = replace_broken_unicode(records[0])
jsonl_dump = orjson.dumps(cleaned_record, option=orjson.OPT_APPEND_NEWLINE, default=str)
else:
jsonl_dump = b"\n".join(map(json_dumps_bytes, records))

Expand Down
9 changes: 9 additions & 0 deletions posthog/temporal/tests/batch_exports/test_temporary_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,15 @@ def test_batch_export_temporary_file_write_records_to_jsonl(records):
assert be_file.records_since_last_reset == 0


def test_batch_export_temporary_file_write_records_to_jsonl_invalid_unicode():
    """A record containing a lone surrogate is written with ``?`` in its place."""
    broken_record = "hello\ud83dworld"

    with BatchExportTemporaryFile() as be_file:
        be_file.write_records_to_jsonl([broken_record])
        be_file.seek(0)

        first_line = be_file.readlines()[0]
        # Invalid single surrogate is replaced with a question mark.
        assert json.loads(first_line) == "hello?world"


@pytest.mark.parametrize(
"records",
TEST_RECORDS,
Expand Down

0 comments on commit 19a9068

Please sign in to comment.