Skip to content

Commit

Permalink
fix: Support using pandas nullable types
Browse files Browse the repository at this point in the history
Allow nullable types that aren't floats. This means we can upload
data frames whose int/bool columns contain empty cells while still
maintaining the correct type.
  • Loading branch information
AndyBryson committed Apr 2, 2024
1 parent f502eda commit 14b6dc0
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 14 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1 +1,5 @@
## 0.4.0 [unreleased]

### Bugfix

1. [#77](https://github.com/InfluxCommunity/influxdb3-python/pull/77): Support using pandas nullable types
32 changes: 18 additions & 14 deletions influxdb_client_3/write_client/client/write/dataframe_serializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,16 @@
logger = logging.getLogger('influxdb_client.client.write.dataframe_serializer')


def _not_nan(x):
    """Tell whether ``x`` holds an actual value rather than a missing one.

    The decision is delegated to ``pd.isna`` and its result is negated.
    pandas is taken from the package's ``extras`` module, imported when the
    function is called.
    """
    from ...extras import pd

    missing = pd.isna(x)
    return not missing


def _itertuples(data_frame):
cols = [data_frame.iloc[:, k] for k in range(len(data_frame.columns))]
return zip(data_frame.index, *cols)


def _not_nan(x):
return x == x


def _any_not_nan(p, indexes):
    """Report whether at least one position from ``indexes`` in ``p`` passes ``_not_nan``."""
    return any(_not_nan(p[position]) for position in indexes)

Expand Down Expand Up @@ -175,7 +176,7 @@ def __init__(self, data_frame, point_settings, precision=DEFAULT_WRITE_PRECISION
# This column is a tag column.
if null_columns.iloc[index]:
key_value = f"""{{
'' if {val_format} == '' or type({val_format}) == float and math.isnan({val_format}) else
'' if {val_format} == '' or pd.isna({val_format}) else
f',{key_format}={{str({val_format}).translate(_ESCAPE_STRING)}}'
}}"""
else:
Expand All @@ -191,20 +192,23 @@ def __init__(self, data_frame, point_settings, precision=DEFAULT_WRITE_PRECISION
# It's important to omit it because when the first
# field column has no nulls, we don't run the comma-removal
# regexp substitution step.

sep = '' if len(field_indexes) == 0 else ','
if issubclass(value.type, np.integer):
field_value = f"{sep}{key_format}={{{val_format}}}i"
elif issubclass(value.type, np.bool_):
field_value = f'{sep}{key_format}={{{val_format}}}'
elif issubclass(value.type, np.floating):

if (issubclass(value.type, np.integer)
or issubclass(value.type, np.floating)
or issubclass(value.type, np.bool_)):
suffix = 'i' if issubclass(value.type, np.integer) else ''
if null_columns.iloc[index]:
field_value = f"""{{"" if math.isnan({val_format}) else f"{sep}{key_format}={{{val_format}}}"}}"""
field_value = (
f"""{{"" if pd.isna({val_format}) else f"{sep}{key_format}={{{val_format}}}{suffix}"}}"""
)
else:
field_value = f'{sep}{key_format}={{{val_format}}}'
field_value = f'{sep}{key_format}={{{val_format}}}{suffix}'
else:
if null_columns.iloc[index]:
field_value = f"""{{
'' if type({val_format}) == float and math.isnan({val_format}) else
'' if pd.isna({val_format}) else
f'{sep}{key_format}="{{str({val_format}).translate(_ESCAPE_STRING)}}"'
}}"""
else:
Expand All @@ -229,7 +233,7 @@ def __init__(self, data_frame, point_settings, precision=DEFAULT_WRITE_PRECISION
'_ESCAPE_KEY': _ESCAPE_KEY,
'_ESCAPE_STRING': _ESCAPE_STRING,
'keys': keys,
'math': math,
'pd': pd,
})

for k, v in dict(data_frame.dtypes).items():
Expand Down
45 changes: 45 additions & 0 deletions tests/test_dataframe_serializer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import unittest

from influxdb_client_3 import PointSettings
from influxdb_client_3.write_client.client.write.dataframe_serializer import DataframeSerializer
import pandas as pd


class TestDataFrameSerializer(unittest.TestCase):
    """Serialization checks for data frames that contain missing values."""

    def test_nullable_types(self):
        """Missing cells must not show up in the serialized lines.

        Each base column is duplicated as a pandas nullable-dtype column named
        ``<column>_pd``. The name checks below are substring checks, so a base
        column name also covers its ``_pd`` counterpart.
        """
        frame = pd.DataFrame({
            "bool_nulls": [True, None, False],
            "int_nulls": [None, 1, 2],
            "float_nulls": [1.0, 2.0, None],
            "str_nulls": ["a", "b", None],
        })

        # Nullable dtype to use for the `_pd` copy of each base column.
        nullable_dtypes = {
            'bool_nulls': pd.BooleanDtype(),
            'int_nulls': pd.Int64Dtype(),
            'float_nulls': pd.Float64Dtype(),
            'str_nulls': pd.StringDtype(),
        }
        for source_column, dtype in nullable_dtypes.items():
            frame[f'{source_column}_pd'] = frame[source_column].astype(dtype)

        frame.index = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"])

        settings = PointSettings()
        serializer = DataframeSerializer(frame, settings, data_frame_measurement_name="test")
        lines = serializer.serialize()

        # Per row, the column-name fragments whose cells are empty in that row:
        # row 0 has no "int", row 1 has no "bool", row 2 has no "str" or "float".
        absent_by_row = (
            ('int_nulls',),
            ('bool_nulls',),
            ('str_nulls', 'float_nulls'),
        )
        for row_number, absent_fragments in enumerate(absent_by_row):
            line = lines[row_number]
            # make sure there are no `<NA>` values in the serialized line
            self.assertNotIn('<NA>', line)
            for fragment in absent_fragments:
                self.assertNotIn(fragment, line)

0 comments on commit 14b6dc0

Please sign in to comment.