From 2e2e4b67d67bf42f97ad7d22262ac5d154b178a2 Mon Sep 17 00:00:00 2001
From: Nitin Bansal
Date: Thu, 29 Aug 2024 10:53:43 +0530
Subject: [PATCH] Add support for timestamp dtype parsing in case of non timestamp field.

---
 fennel/CHANGELOG.md                      |  3 +
 fennel/internal_lib/utils/utils.py       |  3 +-
 fennel/testing/test_cast_df_to_schema.py | 94 +++++++++++++++++++++++-
 fennel/testing/test_utils.py             | 61 +++++++++++++-
 pyproject.toml                           |  2 +-
 5 files changed, 157 insertions(+), 6 deletions(-)

diff --git a/fennel/CHANGELOG.md b/fennel/CHANGELOG.md
index fb9d98524..21a48c7aa 100644
--- a/fennel/CHANGELOG.md
+++ b/fennel/CHANGELOG.md
@@ -1,5 +1,8 @@
 # Changelog
 
+## [1.5.12] - 2024-08-29
+- Add support for timestamp dtype parsing in case of non timestamp field.
+
 ## [1.5.8] - 2024-08-23
 - Fix selection of indexes from dataset decorator
 
diff --git a/fennel/internal_lib/utils/utils.py b/fennel/internal_lib/utils/utils.py
index 3d79d3ec8..200bcbdc4 100644
--- a/fennel/internal_lib/utils/utils.py
+++ b/fennel/internal_lib/utils/utils.py
@@ -69,7 +69,8 @@ def to_dict(value):
 
 
 def parse_datetime(value: Union[int, str, datetime]) -> datetime:
-    if isinstance(value, int):
+    if isinstance(value, (int, float)):
+        value = int(value)
         try:
             value = pd.to_datetime(value, unit="s", utc=True)
         except ValueError:
diff --git a/fennel/testing/test_cast_df_to_schema.py b/fennel/testing/test_cast_df_to_schema.py
index 24086f4c3..f2633d058 100644
--- a/fennel/testing/test_cast_df_to_schema.py
+++ b/fennel/testing/test_cast_df_to_schema.py
@@ -1,20 +1,22 @@
 from datetime import datetime, timezone
-from typing import Optional
+from typing import Optional, List
 
 import pandas as pd
 import pyarrow as pa
 import pytest
 
 from fennel.datasets import dataset, field
-from fennel.dtypes import between, oneof, regex
+from fennel.dtypes import between, oneof, regex, struct
 from fennel.featuresets import featureset, feature as F
 from fennel.gen import schema_pb2 as schema_proto
+from fennel.internal_lib.schema import get_datatype
 from fennel.internal_lib.to_proto.to_proto import fields_to_dsschema
 from fennel.testing import mock
 from fennel.testing.test_utils import (
     cast_df_to_schema,
     cast_col_to_arrow_dtype,
     cast_col_to_pandas_dtype,
+    parse_datetime_in_value,
 )
 
 __owner__ = "nitin@fennel.ai"
@@ -446,7 +448,7 @@ def test_invalid_cast_col_to_arrow_dtype():
         cast_col_to_arrow_dtype(data, data_type)
     assert (
         str(e.value)
-        == "object of type cannot be converted to int"
+        == "Unknown datetime string format, unable to parse: fdfd, at position 0"
     )
 
     # 2. Casting of a map
@@ -566,6 +568,34 @@ def test_cast_col_to_pandas_dtype():
     assert pandas_dtype_data.tolist()[0] == value
 
 
+def test_optional_timestamp_cast_col_to_pandas_dtype():
+    """
+    Testing casting pd.Series of arrow dtype to pandas dtype.
+    """
+    value = [1, 2, 3, None, None]
+    parsed_value = [
+        datetime.fromtimestamp(1, tz=timezone.utc),
+        datetime.fromtimestamp(2, tz=timezone.utc),
+        datetime.fromtimestamp(3, tz=timezone.utc),
+        pd.NaT,
+        pd.NaT,
+    ]
+    data = pd.Series(value, name="testing")
+    data_type = schema_proto.DataType(
+        optional_type=schema_proto.OptionalType(
+            of=schema_proto.DataType(
+                timestamp_type=schema_proto.TimestampType()
+            )
+        )
+    )
+
+    arrow_dtype_data = cast_col_to_arrow_dtype(data, data_type)
+    pandas_dtype_data = cast_col_to_pandas_dtype(arrow_dtype_data, data_type)
+
+    assert pandas_dtype_data.dtype == "datetime64[ns, UTC]"
+    assert pandas_dtype_data.tolist() == parsed_value
+
+
 @mock
 def test_casting_empty_dataframe(client):
     @dataset(index=True)
@@ -593,3 +623,61 @@ class UserFeatures:
         pd.NaT,
         pd.NaT,
     ]
+
+
+def test_parse_datetime_in_value():
+    dtype = get_datatype(List[datetime])
+    value = [0, 1, 3, 4, 5]
+    parse_value = parse_datetime_in_value(value, dtype)
+    assert parse_value == [
+        datetime.fromtimestamp(0, tz=timezone.utc),
+        datetime.fromtimestamp(1, tz=timezone.utc),
+        datetime.fromtimestamp(3, tz=timezone.utc),
+        datetime.fromtimestamp(4, tz=timezone.utc),
+        datetime.fromtimestamp(5, tz=timezone.utc),
+    ]
+
+    dtype = get_datatype(List[Optional[datetime]])
+    value = [0, 1, None, 4, 5]
+    parse_value = parse_datetime_in_value(value, dtype)
+    assert parse_value == [
+        datetime.fromtimestamp(0, tz=timezone.utc),
+        datetime.fromtimestamp(1, tz=timezone.utc),
+        pd.NA,
+        datetime.fromtimestamp(4, tz=timezone.utc),
+        datetime.fromtimestamp(5, tz=timezone.utc),
+    ]
+
+    dtype = get_datatype(Optional[datetime])
+    value = None
+    parse_value = parse_datetime_in_value(value, dtype)
+    assert parse_value is pd.NA
+
+    dtype = get_datatype(datetime)
+    value = 1
+    parse_value = parse_datetime_in_value(value, dtype)
+    assert parse_value == datetime.fromtimestamp(1, tz=timezone.utc)
+
+    @struct
+    class A:
+        name: str
+        birthdate: Optional[datetime]
+
+    dtype = get_datatype(List[A])
+    value = [
+        A(name=1, birthdate=1),
+        A(name=1, birthdate=None),
+        A(name=1, birthdate=datetime.fromtimestamp(0, tz=timezone.utc)),
+        {"name": 1, "birthdate": 1},
+        {"name": 1, "birthdate": None},
+        {"name": 1, "birthdate": datetime.fromtimestamp(0, tz=timezone.utc)},
+    ]
+    parse_value = parse_datetime_in_value(value, dtype)
+    assert parse_value == [
+        {"name": 1, "birthdate": datetime.fromtimestamp(1, tz=timezone.utc)},
+        {"name": 1, "birthdate": pd.NA},
+        {"name": 1, "birthdate": datetime.fromtimestamp(0, tz=timezone.utc)},
+        {"name": 1, "birthdate": datetime.fromtimestamp(1, tz=timezone.utc)},
+        {"name": 1, "birthdate": pd.NA},
+        {"name": 1, "birthdate": datetime.fromtimestamp(0, tz=timezone.utc)},
+    ]
diff --git a/fennel/testing/test_utils.py b/fennel/testing/test_utils.py
index 730c5c6e1..6e1bd2892 100644
--- a/fennel/testing/test_utils.py
+++ b/fennel/testing/test_utils.py
@@ -1,6 +1,6 @@
 import json
 from math import isnan
-from typing import Any, Union, List
+from typing import Any, Union, List, Dict
 
 import numpy as np
 import pandas as pd
@@ -141,6 +141,63 @@ def parse_struct_into_dict(value: Any) -> Union[dict, list]:
         return value
 
 
+def parse_datetime_in_value(
+    value: Any, dtype: DataType, nullable: bool = False
+) -> Any:
+    """
+    Recursively walk `value` following `dtype` and run `parse_datetime` on
+    every timestamp-typed leaf; arrays, maps and structs are rebuilt.
+
+    `nullable` is set when recursing through an optional type: scalar nulls
+    are then returned as `pd.NA` instead of being parsed.
+    """
+    if nullable:
+        try:
+            if not isinstance(
+                value, (list, tuple, dict, set, np.ndarray, frozendict)
+            ) and pd.isna(value):
+                return pd.NA
+        # ValueError error occurs when you do something like pd.isnull([1, 2, None])
+        except ValueError:
+            pass
+    if dtype.HasField("optional_type"):
+        return parse_datetime_in_value(value, dtype.optional_type.of, True)
+    elif dtype.HasField("timestamp_type"):
+        return parse_datetime(value)
+    elif dtype.HasField("array_type"):
+        return [parse_datetime_in_value(x, dtype.array_type.of) for x in value]
+    elif dtype.HasField("map_type"):
+        # Map values are typed by map_type.value; array_type is unset here.
+        if isinstance(value, (dict, frozendict)):
+            return {
+                k: parse_datetime_in_value(v, dtype.map_type.value)
+                for (k, v) in value.items()
+            }
+        elif isinstance(value, (list, np.ndarray)):
+            return [
+                (k, parse_datetime_in_value(v, dtype.map_type.value))
+                for (k, v) in value
+            ]
+        else:
+            return value
+    elif dtype.HasField("struct_type"):
+        if hasattr(value, FENNEL_STRUCT):
+            try:
+                value = value.as_json()
+            except Exception as e:
+                raise TypeError(
+                    f"Not able parse value: {value} into json, error: {e}"
+                ) from e
+        output: Dict[Any, Any] = {}
+        for field in dtype.struct_type.fields:
+            output[field.name] = parse_datetime_in_value(
+                value[field.name], field.dtype
+            )
+        return output
+    else:
+        return value
+
+
 def cast_col_to_arrow_dtype(series: pd.Series, dtype: DataType) -> pd.Series:
     """
     This function casts dtype of pd.Series object into pd.ArrowDtype depending on the DataType proto.
@@ -153,6 +210,8 @@ def cast_col_to_arrow_dtype(series: pd.Series, dtype: DataType) -> pd.Series:
     # dtype conversion fails with fennel struct
     if check_dtype_has_struct_type(dtype):
         series = series.apply(lambda x: parse_struct_into_dict(x))
+    # Parse datetime values
+    series = series.apply(lambda x: parse_datetime_in_value(x, dtype))
     arrow_type = convert_dtype_to_arrow_type(dtype)
     return series.astype(pd.ArrowDtype(arrow_type))
 
diff --git a/pyproject.toml b/pyproject.toml
index c806a1578..8d948eea9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "fennel-ai"
-version = "1.5.11"
+version = "1.5.12"
 description = "The modern realtime feature engineering platform"
 authors = ["Fennel AI "]
 packages = [{ include = "fennel" }]