Skip to content

Commit

Permalink
chore(data-import): handle large integers in json (#27141)
Browse files Browse the repository at this point in the history
  • Loading branch information
EDsCODE authored Dec 24, 2024
1 parent b9099ec commit 9089380
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 2 deletions.
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import json
from typing import Any, Optional
from collections.abc import Sequence

from dlt.common.schema.typing import TTableSchemaColumns
from dlt.common import logger, json
from dlt.common import logger, json as orjson
from dlt.common.configuration import with_config
from dlt.common.destination import DestinationCapabilitiesContext
from dlt.common.json import custom_encode, map_nested_in_place
Expand Down Expand Up @@ -91,7 +92,7 @@ def row_tuples_to_arrow(rows: Sequence[RowAny], columns: TTableSchemaColumns, tz
logger.warning(
f"Field {field.name} was reflected as JSON type and needs to be serialized back to string to be placed in arrow table. This will slow data extraction down. You should cast JSON field to STRING in your database system ie. by creating and extracting an SQL VIEW that selects with cast."
)
json_str_array = pa.array([None if s is None else json.dumps(s) for s in columnar_known_types[field.name]])
json_str_array = pa.array([None if s is None else json_dumps(s) for s in columnar_known_types[field.name]])
columnar_known_types[field.name] = json_str_array

# If there are unknown type columns, first create a table to infer their types
Expand Down Expand Up @@ -139,3 +140,12 @@ def row_tuples_to_arrow(rows: Sequence[RowAny], columns: TTableSchemaColumns, tz
)

return pa.Table.from_pydict(columnar_known_types, schema=arrow_schema)


def json_dumps(obj: Any) -> str:
    """Serialize *obj* to a JSON string, tolerating integers beyond 64 bits.

    dlt's default JSON implementation (orjson) raises ``TypeError`` when an
    integer does not fit in a 64-bit range. The standard-library ``json``
    module supports arbitrary-precision integers, so we retry with it for
    that specific failure only.

    Args:
        obj: Any JSON-serializable value.

    Returns:
        The JSON string representation of ``obj``.

    Raises:
        TypeError: re-raised unchanged for any serialization failure other
            than the 64-bit integer overflow.
    """
    try:
        return orjson.dumps(obj)
    except TypeError as e:
        # orjson's message for out-of-range integers is stable; fall back to
        # stdlib json, which handles arbitrary-precision ints.
        if str(e) == "Integer exceeds 64-bit range":
            return json.dumps(obj)
        # Bare ``raise`` preserves the original exception and its traceback.
        # The previous ``raise TypeError(e)`` wrapped the exception (yielding
        # TypeError(TypeError(...))) and discarded the original traceback.
        raise
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import pytest
import pyarrow as pa
from posthog.temporal.data_imports.pipelines.sql_database_v2.arrow_helpers import json_dumps
from dlt.common.json import json


def test_handle_large_integers():
    """orjson rejects >64-bit ints; json_dumps must fall back to stdlib json."""
    # dlt's json (orjson) raises TypeError for integers outside the 64-bit range.
    for big in (2**64, -(2**64)):
        with pytest.raises(TypeError, match="Integer exceeds 64-bit range"):
            json.dumps({"a": big})

    # json_dumps falls back to stdlib json, so the value survives the arrow
    # round-trip and parses back to the expected magnitude.
    for big in (2**64, -(2**64)):
        arr = pa.array([None if v is None else json_dumps(v) for v in [{"a": big}]])
        assert json.loads(arr[0].as_py())["a"] == float(big)

0 comments on commit 9089380

Please sign in to comment.