fix: Support using pandas nullable types

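Support pandas nullable extension dtypes (for example `Int64`, `boolean`, `Float64` and `string`) in addition to plain float columns, where a missing value is NaN. Null detection now uses `pd.isna` instead of `math.isnan`, so data frames whose int/bool columns contain empty cells can be uploaded while keeping the correct field type; the empty cells are omitted from the serialized lines rather than written as `<NA>`.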
InfluxCommunity · Apr 2, 2024 · 14b6dc0 · 14b6dc0
1 parent f502eda
commit 14b6dc0
Show file tree

Hide file tree

Showing 3 changed files with 67 additions and 14 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1 +1,5 @@
 ## 0.4.0 [unreleased]
+
+### Bugfix
+
+1. [#77](https://github.com/InfluxCommunity/influxdb3-python/pull/77): Support using pandas nullable types
diff --git a/influxdb_client_3/write_client/client/write/dataframe_serializer.py b/influxdb_client_3/write_client/client/write/dataframe_serializer.py
@@ -14,15 +14,16 @@
 logger = logging.getLogger('influxdb_client.client.write.dataframe_serializer')
 
 
+def _not_nan(x):
+    from ...extras import pd
+    return not pd.isna(x)
+
+
 def _itertuples(data_frame):
     cols = [data_frame.iloc[:, k] for k in range(len(data_frame.columns))]
     return zip(data_frame.index, *cols)
 
 
-def _not_nan(x):
-    return x == x
-
-
 def _any_not_nan(p, indexes):
     return any(map(lambda x: _not_nan(p[x]), indexes))
 
@@ -175,7 +176,7 @@ def __init__(self, data_frame, point_settings, precision=DEFAULT_WRITE_PRECISION
                 # This column is a tag column.
                 if null_columns.iloc[index]:
                     key_value = f"""{{
-                            '' if {val_format} == '' or type({val_format}) == float and math.isnan({val_format}) else
+                            '' if {val_format} == '' or pd.isna({val_format}) else
                             f',{key_format}={{str({val_format}).translate(_ESCAPE_STRING)}}'
                         }}"""
                 else:
@@ -191,20 +192,23 @@ def __init__(self, data_frame, point_settings, precision=DEFAULT_WRITE_PRECISION
             # It's important to omit it because when the first
             # field column has no nulls, we don't run the comma-removal
             # regexp substitution step.
+
             sep = '' if len(field_indexes) == 0 else ','
-            if issubclass(value.type, np.integer):
-                field_value = f"{sep}{key_format}={{{val_format}}}i"
-            elif issubclass(value.type, np.bool_):
-                field_value = f'{sep}{key_format}={{{val_format}}}'
-            elif issubclass(value.type, np.floating):
+
+            if (issubclass(value.type, np.integer)
+                    or issubclass(value.type, np.floating)
+                    or issubclass(value.type, np.bool_)):
+                suffix = 'i' if issubclass(value.type, np.integer) else ''
                 if null_columns.iloc[index]:
-                    field_value = f"""{{"" if math.isnan({val_format}) else f"{sep}{key_format}={{{val_format}}}"}}"""
+                    field_value = (
+                        f"""{{"" if pd.isna({val_format}) else f"{sep}{key_format}={{{val_format}}}{suffix}"}}"""
+                    )
                 else:
-                    field_value = f'{sep}{key_format}={{{val_format}}}'
+                    field_value = f'{sep}{key_format}={{{val_format}}}{suffix}'
             else:
                 if null_columns.iloc[index]:
                     field_value = f"""{{
-                            '' if type({val_format}) == float and math.isnan({val_format}) else
+                            '' if pd.isna({val_format}) else
                             f'{sep}{key_format}="{{str({val_format}).translate(_ESCAPE_STRING)}}"'
                         }}"""
                 else:
@@ -229,7 +233,7 @@ def __init__(self, data_frame, point_settings, precision=DEFAULT_WRITE_PRECISION
             '_ESCAPE_KEY': _ESCAPE_KEY,
             '_ESCAPE_STRING': _ESCAPE_STRING,
             'keys': keys,
-            'math': math,
+            'pd': pd,
         })
 
         for k, v in dict(data_frame.dtypes).items():

diff --git a/tests/test_dataframe_serializer.py b/tests/test_dataframe_serializer.py
@@ -0,0 +1,45 @@
+import unittest
+
+from influxdb_client_3 import PointSettings
+from influxdb_client_3.write_client.client.write.dataframe_serializer import DataframeSerializer
+import pandas as pd
+
+
+class TestDataFrameSerializer(unittest.TestCase):
+
+    def test_nullable_types(self):
+        df = pd.DataFrame({
+            "bool_nulls": [True, None, False],
+            "int_nulls": [None, 1, 2],
+            "float_nulls": [1.0, 2.0, None],
+            "str_nulls": ["a", "b", None],
+        })
+        df['bool_nulls_pd'] = df['bool_nulls'].astype(pd.BooleanDtype())
+        df['int_nulls_pd'] = df['int_nulls'].astype(pd.Int64Dtype())
+        df['float_nulls_pd'] = df['float_nulls'].astype(pd.Float64Dtype())
+        df['str_nulls_pd'] = df['str_nulls'].astype(pd.StringDtype())
+
+        df.index = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"])
+
+        ps = PointSettings()
+
+        serializer = DataframeSerializer(df, ps, data_frame_measurement_name="test")
+
+        lines = serializer.serialize()
+
+        # make sure there are no `<NA>` values in the serialized lines
+        # first line should not have "int"
+        first_line = lines[0]
+        self.assertNotIn('<NA>', first_line)
+        self.assertNotIn('int_nulls', first_line)
+
+        # the second line should not have "bool"
+        second_line = lines[1]
+        self.assertNotIn('<NA>', second_line)
+        self.assertNotIn('bool_nulls', second_line)
+
+        # the third line should not have "str" or "float"
+        third_line = lines[2]
+        self.assertNotIn('<NA>', third_line)
+        self.assertNotIn('str_nulls', third_line)
+        self.assertNotIn('float_nulls', third_line)