Skip to content

Commit

Permalink
chore(data-import): handle large integers in json (#27141)
Browse files Browse the repository at this point in the history
  • Loading branch information
EDsCODE authored Dec 24, 2024
1 parent b9099ec commit 9089380
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 2 deletions.
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import json
from typing import Any, Optional
from collections.abc import Sequence

from dlt.common.schema.typing import TTableSchemaColumns
from dlt.common import logger, json
from dlt.common import logger, json as orjson
from dlt.common.configuration import with_config
from dlt.common.destination import DestinationCapabilitiesContext
from dlt.common.json import custom_encode, map_nested_in_place
Expand Down Expand Up @@ -91,7 +92,7 @@ def row_tuples_to_arrow(rows: Sequence[RowAny], columns: TTableSchemaColumns, tz
logger.warning(
f"Field {field.name} was reflected as JSON type and needs to be serialized back to string to be placed in arrow table. This will slow data extraction down. You should cast JSON field to STRING in your database system ie. by creating and extracting an SQL VIEW that selects with cast."
)
json_str_array = pa.array([None if s is None else json.dumps(s) for s in columnar_known_types[field.name]])
json_str_array = pa.array([None if s is None else json_dumps(s) for s in columnar_known_types[field.name]])
columnar_known_types[field.name] = json_str_array

# If there are unknown type columns, first create a table to infer their types
Expand Down Expand Up @@ -139,3 +140,12 @@ def row_tuples_to_arrow(rows: Sequence[RowAny], columns: TTableSchemaColumns, tz
)

return pa.Table.from_pydict(columnar_known_types, schema=arrow_schema)


def json_dumps(obj: Any) -> str:
    """Serialize *obj* to a JSON string, tolerating integers beyond 64 bits.

    dlt's default JSON implementation (orjson) raises ``TypeError`` when an
    integer does not fit in a 64-bit range. The standard-library ``json``
    module supports arbitrary-precision integers, so we retry with it for
    that specific failure only.

    Args:
        obj: Any JSON-serializable value.

    Returns:
        The JSON string representation of ``obj``.

    Raises:
        TypeError: re-raised unchanged for any serialization failure other
            than the 64-bit integer overflow.
    """
    try:
        return orjson.dumps(obj)
    except TypeError as e:
        # orjson's message for out-of-range integers is stable; fall back to
        # stdlib json, which handles arbitrary-precision ints.
        if str(e) == "Integer exceeds 64-bit range":
            return json.dumps(obj)
        # Bare ``raise`` preserves the original exception and its traceback.
        # The previous ``raise TypeError(e)`` wrapped the exception (yielding
        # TypeError(TypeError(...))) and discarded the original traceback.
        raise
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import pytest
import pyarrow as pa
from posthog.temporal.data_imports.pipelines.sql_database_v2.arrow_helpers import json_dumps
from dlt.common.json import json


def test_handle_large_integers():
    """orjson rejects >64-bit ints; json_dumps must fall back to stdlib json."""
    # dlt's json (orjson) raises TypeError for integers outside the 64-bit range.
    for big in (2**64, -(2**64)):
        with pytest.raises(TypeError, match="Integer exceeds 64-bit range"):
            json.dumps({"a": big})

    # json_dumps falls back to stdlib json, so the value survives the arrow
    # round-trip and parses back to the expected magnitude.
    for big in (2**64, -(2**64)):
        arr = pa.array([None if v is None else json_dumps(v) for v in [{"a": big}]])
        assert json.loads(arr[0].as_py())["a"] == float(big)

0 comments on commit 9089380

Please sign in to comment.