diff --git a/fennel/CHANGELOG.md b/fennel/CHANGELOG.md index 2eb9d4033..462489e1d 100644 --- a/fennel/CHANGELOG.md +++ b/fennel/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +## [1.5.18] - 2024-09-05 +- Struct initializer + arrow fixes + type promotion in assign + ## [1.5.17] - 2024-09-04 - Add support for several more expressions diff --git a/fennel/client_tests/test_complex_struct.py b/fennel/client_tests/test_complex_struct.py index 285007654..72b92a154 100644 --- a/fennel/client_tests/test_complex_struct.py +++ b/fennel/client_tests/test_complex_struct.py @@ -8,6 +8,7 @@ from fennel.connectors import Webhook, source from fennel.datasets import dataset, Dataset, field, pipeline, LastK from fennel.dtypes import struct, Continuous +from fennel.expr.expr import col, make_struct from fennel.featuresets import featureset, feature as F, extractor from fennel.lib import inputs, outputs from fennel.testing import mock @@ -87,6 +88,118 @@ def movie_info(cls, movie: Dataset): ) +@dataset(index=True) +class MovieInfoExpr: + director_id: int = field(key=True) + movie_id: int = field(key=True) + role_list: List[Role] + timestamp: datetime = field(timestamp=True) + + @pipeline + @inputs(MovieDS) + def movie_info(cls, movie: Dataset): + return ( + movie.assign( + role=make_struct( + { + "role_id": col("role_id"), + "name": col("name"), + "cost": col("cost"), + }, + Role, + ).astype(Role) + ) + .drop(columns=["role_id", "name", "cost"]) + .groupby("director_id", "movie_id") + .aggregate( + LastK( + into_field="role_list", + of="role", + window=Continuous("forever"), + limit=3, + dedup=False, + ), + ) + ) + + +@dataset(index=True) +class MovieInfoExpr2: + director_id: int = field(key=True) + movie_id: int = field(key=True) + role_list: List[Role] + timestamp: datetime = field(timestamp=True) + + @pipeline + @inputs(MovieDS) + def movie_info(cls, movie: Dataset): + return ( + movie.assign( + role=Role.expr( # type: ignore + role_id=col("role_id"), name=col("name"), cost=col("cost") + ).astype(Role) + ) + .drop(columns=["role_id", "name", "cost"]) + .groupby("director_id", "movie_id") + .aggregate( + LastK( + into_field="role_list", + of="role", + window=Continuous("forever"), + limit=3, + dedup=False, + ), + ) + ) + + +@struct +class FullName: + first_name: str + last_name: str + + +@struct +class RoleExtended: + role_id: int + name: FullName + cost: int + + +@dataset(index=True) +class MovieInfoExprNested: + director_id: int = field(key=True) + movie_id: int = field(key=True) + role_list: List[RoleExtended] + timestamp: datetime = field(timestamp=True) + + @pipeline + @inputs(MovieDS) + def movie_info(cls, movie: Dataset): + return ( + movie.assign( + role=RoleExtended.expr( # type: ignore + role_id=col("role_id"), + name=FullName.expr( # type: ignore + first_name=col("name"), last_name="rando" + ), + cost=col("cost"), + ).astype(RoleExtended) + ) + .drop(columns=["role_id", "name", "cost"]) + .groupby("director_id", "movie_id") + .aggregate( + LastK( + into_field="role_list", + of="role", + window=Continuous("forever"), + limit=3, + dedup=False, + ), + ) + ) + + @featureset class Request: director_id: int @@ -218,7 +331,13 @@ def test_complex_struct(client): client.commit( message="msg", - datasets=[MovieDS, MovieInfo], + datasets=[ + MovieDS, + MovieInfo, + MovieInfoExpr, + MovieInfoExpr2, + MovieInfoExprNested, + ], featuresets=[Request, MovieFeatures], ) @@ -255,6 +374,36 @@ def test_complex_struct(client): input_dataframe=input_df, ) + res1, found1 = client.lookup( + "MovieInfo", + keys=pd.DataFrame({"director_id": [1, 2], "movie_id": [1, 3]}), + ) + res2, found2 = client.lookup( + "MovieInfoExpr", + keys=pd.DataFrame({"director_id": [1, 2], "movie_id": [1, 3]}), + ) + res3, found3 = client.lookup( + "MovieInfoExpr2", + keys=pd.DataFrame({"director_id": [1, 2], "movie_id": [1, 3]}), + ) + assert res1.shape == res2.shape + assert res1.shape == res3.shape + for c in res1.columns: + assert res1[c].equals(res2[c]) + assert res1[c].equals(res3[c]) + assert list(found1) == list(found2) + assert list(found1) == list(found3) + + res4, found4 = client.lookup( + "MovieInfoExprNested", + keys=pd.DataFrame({"director_id": [1, 2], "movie_id": [1, 3]}), + ) + assert res1.shape == res4.shape + assert list(found1) == list(found4) + for r in res4["role_list"]: + for role in r: + assert role.name.last_name == "rando" + assert df.shape[0] == 4 assert len(df["MovieFeatures.role_list_py"].tolist()[0]) == 3 assert df["MovieFeatures.role_list_py"].tolist()[0][0].as_json() == { diff --git a/fennel/client_tests/test_dataset.py b/fennel/client_tests/test_dataset.py index 6001c994d..6d3e1619b 100644 --- a/fennel/client_tests/test_dataset.py +++ b/fennel/client_tests/test_dataset.py @@ -606,81 +606,6 @@ class UserInfoDataset: ) -# On demand datasets are not supported for now. - -# class TestDocumentDataset(unittest.TestCase): -# @mock_client -# def test_log_to_document_dataset(self, client): -# """Log some data to the dataset and check if it is logged correctly.""" -# -# @meta(owner="aditya@fennel.ai") -# @dataset -# class DocumentContentDataset: -# doc_id: int = field(key=True) -# bert_embedding: Embedding[4] -# fast_text_embedding: Embedding[3] -# num_words: int -# timestamp: datetime = field(timestamp=True) -# -# @on_demand(expires_after="3d") -# @inputs(datetime, int) -# def get_embedding(cls, ts: pd.Series, doc_ids: pd.Series): -# data = [] -# doc_ids = doc_ids.tolist() -# for i in range(len(ts)): -# data.append( -# [ -# doc_ids[i], -# [0.1, 0.2, 0.3, 0.4], -# [1.1, 1.2, 1.3], -# 10 * i, -# ts[i], -# ] -# ) -# columns = [ -# str(cls.doc_id), -# str(cls.bert_embedding), -# str(cls.fast_text_embedding), -# str(cls.num_words), -# str(cls.timestamp), -# ] -# return pd.DataFrame(data, columns=columns), pd.Series( -# [True] * len(ts) -# ) -# -# # Sync the dataset -# client.commit(datasets=[DocumentContentDataset]) -# now = datetime.now(timezone.utc) -# data = [ -# [18232, np.array([1, 2, 3, 4]), np.array([1, 2, 3]), 10, now], -# [ -# 18234, -# np.array([1, 2.2, 0.213, 0.343]), -# np.array([0.87, 2, 3]), -# 9, -# now, -# ], -# [18934, [1, 2.2, 0.213, 0.343], [0.87, 2, 3], 12, now], -# ] -# columns = [ -# "doc_id", -# "bert_embedding", -# "fast_text_embedding", -# "num_words", -# "timestamp", -# ] -# df = pd.DataFrame(data, columns=columns) -# response = client.log("fennel_webhook","DocumentContentDataset", df) -# assert response.status_code == requests.codes.OK, response.json() -# -# # Do some lookups -# doc_ids = pd.Series([18232, 1728, 18234, 18934, 19200, 91012]) -# ts = pd.Series([now, now, now, now, now, now]) -# df, _ = DocumentContentDataset.lookup(ts, doc_id=doc_ids) -# assert df.shape == (6, 5) -# assert df["num_words"].tolist() == [10.0, 9.0, 12, 0, 10.0, 20.0] - - ################################################################################ # Dataset & Pipelines Unit Tests ################################################################################ @@ -1012,6 +937,25 @@ class Orders: timestamp: datetime +@dataset +class OrdersOptional: + uid: Optional[int] + uid_float: float + uid_twice: float + skus: List[int] + prices: List[float] + timestamp: datetime + + @pipeline + @inputs(Orders) + def cast(cls, ds: Dataset): + return ds.assign( + uid=col("uid").astype(Optional[int]), # type: ignore + uid_float=col("uid").astype(float), # type: ignore + uid_twice=(col("uid") * 2.0).astype(float), # type: ignore + ) + + @dataset(index=True) class Derived: uid: int = field(key=True) @@ -1066,6 +1010,39 @@ def test_basic_explode(self, client): assert df["price"].tolist()[0] == 10.1 assert pd.isna(df["price"].tolist()[1]) + @pytest.mark.integration + @mock + def test_basic_cast(self, client): + # # Sync the dataset + client.commit(message="msg", datasets=[Orders, OrdersOptional]) + # log some rows to the transaction dataset + df = pd.DataFrame( + [ + { + "uid": 1, + "skus": [1, 2], + "prices": [10.1, 20.0], + "timestamp": "2021-01-01T00:00:00", + }, + { + "uid": 2, + "skus": [], + "prices": [], + "timestamp": "2021-01-01T00:00:00", + }, + ] + ) + client.log("webhook", "Orders", df) + client.sleep() + + # do lookup on the WithSquare dataset + df = client.inspect("OrdersOptional") + assert df.shape == (2, 6) + assert df["uid"].tolist() == [1, 2] + assert df["uid_float"].tolist() == [1.0, 2.0] + assert df["uid_twice"].tolist() == [2.0, 4.0] + assert df["skus"].tolist() == [[1, 2], []] + class TestBasicAssign(unittest.TestCase): @pytest.mark.integration diff --git a/fennel/datasets/datasets.py b/fennel/datasets/datasets.py index e6598bc8e..f4daa5340 100644 --- a/fennel/datasets/datasets.py +++ b/fennel/datasets/datasets.py @@ -2935,7 +2935,9 @@ def visitAssign(self, obj) -> DSSchema: raise ValueError( f"invalid assign - {output_schema_name} error in expression for column `{col}`: {str(e)}" ) - if typed_expr.dtype != expr_type: + if not typed_expr.expr.matches_type( + typed_expr.dtype, input_schema.schema() + ): printer = ExprPrinter() type_errors.append( f"'{col}' is expected to be of type `{dtype_to_string(typed_expr.dtype)}`, but evaluates to `{dtype_to_string(expr_type)}`. Full expression: `{printer.print(typed_expr.expr.root)}`" @@ -2943,7 +2945,6 @@ def visitAssign(self, obj) -> DSSchema: if len(type_errors) > 0: joined_errors = "\n\t".join(type_errors) - print(joined_errors) raise TypeError( f"found type errors in assign node of `{self.dsname}.{self.pipeline_name}`:\n\t{joined_errors}" ) diff --git a/fennel/dtypes/dtypes.py b/fennel/dtypes/dtypes.py index 253d9dbc0..e01f4f879 100644 --- a/fennel/dtypes/dtypes.py +++ b/fennel/dtypes/dtypes.py @@ -1,4 +1,5 @@ import dataclasses +from functools import partial import inspect import sys from dataclasses import dataclass @@ -75,6 +76,16 @@ def get_fennel_struct(annotation) -> Any: return None +def make_struct_expr(cls, **kwargs): + from fennel.expr.expr import Expr, make_expr, make_struct + + fields = {} + for name, value in kwargs.items(): + fields[name] = make_expr(value) + + return make_struct(fields, cls) + + def struct(cls): for name, member in inspect.getmembers(cls): if inspect.isfunction(member) and name in cls.__dict__: @@ -131,6 +142,7 @@ def struct(cls): setattr(cls, FENNEL_STRUCT_SRC_CODE, "") setattr(cls, FENNEL_STRUCT_DEPENDENCIES_SRC_CODE, dependency_code) cls.as_json = as_json + cls.expr = partial(make_struct_expr, cls) return dataclasses.dataclass(cls) diff --git a/fennel/expr/expr.py b/fennel/expr/expr.py index 6bcee1530..5b748540b 100644 --- a/fennel/expr/expr.py +++ b/fennel/expr/expr.py @@ -13,7 +13,7 @@ parse_json, ) import pyarrow as pa -from fennel_data_lib import eval, type_of +from fennel_data_lib import assign, type_of, matches import fennel.gen.schema_pb2 as schema_proto from fennel.internal_lib.schema import ( @@ -340,7 +340,26 @@ def typeof(self, schema: Dict = None) -> Type: datatype.ParseFromString(type_bytes) return from_proto(datatype) - def eval(self, input_df: pd.DataFrame, schema: Dict) -> pd.Series: + def matches_type(self, dtype: Type, schema: Dict) -> bool: + schema = schema or {} + from fennel.expr.serializer import ExprSerializer + + serializer = ExprSerializer() + proto_expr = serializer.serialize(self.root) + proto_bytes = proto_expr.SerializeToString() + proto_schema = {} + for key, value in schema.items(): + proto_schema[key] = get_datatype(value).SerializeToString() + proto_type_bytes = get_datatype(dtype).SerializeToString() + return matches(proto_bytes, proto_schema, proto_type_bytes) + + def eval( + self, + input_df: pd.DataFrame, + schema: Dict, + output_dtype: Optional[Type] = None, + parse=True, + ) -> pd.Series: from fennel.expr.serializer import ExprSerializer def pd_to_pa(pd_data: pd.DataFrame, schema: Dict[str, Type]): @@ -352,37 +371,29 @@ def pd_to_pa(pd_data: pd.DataFrame, schema: Dict[str, Type]): f"column : {column} not found in input dataframe but defined in schema." ) new_df[column] = cast_col_to_arrow_dtype(new_df[column], dtype) - print("NEw DF: ", column, new_df[column]) new_df = new_df.loc[:, list(schema.keys())] fields = [] for column, dtype in schema.items(): proto_dtype = get_datatype(dtype) - pa_type = convert_dtype_to_arrow_type_with_nullable( - proto_dtype - ) - print(f"column: {column}, dtype: {dtype}, pa_type: {pa_type}") - print("proto_dtype: ", proto_dtype) + pa_type = convert_dtype_to_arrow_type_with_nullable(proto_dtype) if proto_dtype.HasField("optional_type"): nullable = True else: nullable = False field = pa.field(column, type=pa_type, nullable=nullable) - print(f"field: {field.name}, type: {field.type}", "nullable: ", field.nullable) fields.append(field) pa_schema = pa.schema(fields) # Replace pd.NA with None new_df = new_df.where(pd.notna(new_df), None) - print("New DF: ", new_df) - x = pa.RecordBatch.from_pandas( + return pa.RecordBatch.from_pandas( new_df, preserve_index=False, schema=pa_schema ) - print("RecordBatch: ", x) - return x - def pa_to_pd(pa_data, ret_type): + def pa_to_pd(pa_data, ret_type, parse=True): ret = pa_data.to_pandas(types_mapper=pd.ArrowDtype) ret = cast_col_to_pandas(ret, get_datatype(ret_type)) - ret = ret.apply(lambda x: parse_json(ret_type, x)) + if parse: + ret = ret.apply(lambda x: parse_json(ret_type, x)) return ret serializer = ExprSerializer() @@ -392,9 +403,16 @@ def pa_to_pd(pa_data, ret_type): proto_schema = {} for key, value in schema.items(): proto_schema[key] = get_datatype(value).SerializeToString() - ret_type = self.typeof(schema) - arrow_col = eval(proto_bytes, df_pa, proto_schema) - return pa_to_pd(arrow_col, ret_type) + if output_dtype is None: + ret_type = self.typeof(schema) + else: + ret_type = output_dtype + + serialized_ret_type = get_datatype(ret_type).SerializeToString() + arrow_col = assign( + proto_bytes, df_pa, proto_schema, serialized_ret_type + ) + return pa_to_pd(arrow_col, ret_type, parse) def __str__(self) -> str: # type: ignore from fennel.expr.visitor import ExprPrinter diff --git a/fennel/expr/test_expr.py b/fennel/expr/test_expr.py index 59472b42a..478bd0ae3 100644 --- a/fennel/expr/test_expr.py +++ b/fennel/expr/test_expr.py @@ -408,14 +408,19 @@ def compare_values(received, expected, dtype): r = getattr(act, field.name) e = getattr(exp, field.name) if not is_user_defined_class(field.type): + if ( + not isinstance(e, list) + and not isinstance(r, list) + and pd.isna(e) + and pd.isna(r) + ): + continue assert ( r == e ), f"Expected {e}, got {r} for field {field.name} in struct {act}" else: compare_values([r], [e], field.type) else: - print("Received", received) - print("Expected", expected) assert ( list(received) == expected ), f"Expected {expected}, got {received} for dtype {dtype}" @@ -423,8 +428,9 @@ def compare_values(received, expected, dtype): def check_test_case(test_case: ExprTestCase): # Test print test case - printer = ExprPrinter() - assert printer.print(test_case.expr) == test_case.display + if test_case.display: + printer = ExprPrinter() + assert printer.print(test_case.expr) == test_case.display # Test FetchReferences test case ref_extractor = FetchReferences() @@ -441,6 +447,10 @@ def check_test_case(test_case: ExprTestCase): # Test type inference # If it is a dataclass, we check if all fields are present if is_user_defined_class(test_case.expected_dtype): + if not hasattr( + test_case.expr.typeof(test_case.schema), "__annotations__" + ): + assert False, "Expected a dataclass" for field in fields(test_case.expected_dtype): assert ( field.name @@ -518,204 +528,294 @@ class Nested: c: List[int] -def test_parse(): - test_case = ExprTestCase( - expr=(col("a").str.parse(int)), - df=pd.DataFrame({"a": ["1", "2", "3", "4"]}), - schema={"a": str}, - display="PARSE(col('a'), )", - refs={"a"}, - eval_result=[1, 2, 3, 4], - expected_dtype=int, - proto_json=None, - ) - check_test_case(test_case) - - # Parse a struct - test_case = ExprTestCase( - expr=(col("a").str.parse(A)), - df=pd.DataFrame( - {"a": ['{"x": 1, "y": 2, "z": "a"}', '{"x": 2, "y": 3, "z": "b"}']} - ), - schema={"a": str}, - display="PARSE(col('a'), )", - refs={"a"}, - eval_result=[A(1, 2, "a"), A(2, 3, "b")], - expected_dtype=A, - proto_json=None, - ) - check_test_case(test_case) +@struct +class OptionalA: + w: int + x: Optional[int] + y: Optional[int] + z: Optional[str] - # Parse a list of integers - test_case = ExprTestCase( - expr=(col("a").str.parse(List[int])), - df=pd.DataFrame({"a": ["[1, 2, 3]", "[4, 5, 6]"]}), - schema={"a": str}, - display="PARSE(col('a'), typing.List[int])", - refs={"a"}, - eval_result=[[1, 2, 3], [4, 5, 6]], - expected_dtype=List[int], - proto_json=None, - ) - check_test_case(test_case) - # Parse a nested struct - test_case = ExprTestCase( - expr=(col("a").str.parse(Nested)), - df=pd.DataFrame( - { - "a": [ - '{"a": {"x": 1, "y": 2, "z": "a"}, "b": {"p": 1, "q": "b"}, "c": [1, 2, 3]}' - ] - } +def test_parse(): + cases = [ + ExprTestCase( + expr=(col("a").str.parse(int)), + df=pd.DataFrame({"a": ["1", "2", "3", "4"]}), + schema={"a": str}, + display="PARSE(col('a'), )", + refs={"a"}, + eval_result=[1, 2, 3, 4], + expected_dtype=int, + proto_json=None, ), - schema={"a": str}, - display="PARSE(col('a'), )", - refs={"a"}, - eval_result=[Nested(A(1, 2, "a"), B(1, "b"), [1, 2, 3])], - expected_dtype=Nested, - proto_json=None, - ) - check_test_case(test_case) - - # Parse floats - test_case = ExprTestCase( - expr=(col("a").str.parse(float)), - df=pd.DataFrame({"a": ["1.1", "2.2", "3.3", "4.4"]}), - schema={"a": str}, - display="PARSE(col('a'), )", - refs={"a"}, - eval_result=[1.1, 2.2, 3.3, 4.4], - expected_dtype=float, - proto_json=None, - ) - check_test_case(test_case) - - # Parse bool - test_case = ExprTestCase( - expr=(col("a").str.parse(bool)), - df=pd.DataFrame({"a": ["true", "false", "true", "false"]}), - schema={"a": str}, - display="PARSE(col('a'), )", - refs={"a"}, - eval_result=[True, False, True, False], - expected_dtype=bool, - proto_json=None, - ) - check_test_case(test_case) + # Parse a struct + ExprTestCase( + expr=(col("a").str.parse(A)), + df=pd.DataFrame( + { + "a": [ + '{"x": 1, "y": 2, "z": "a"}', + '{"x": 2, "y": 3, "z": "b"}', + ] + } + ), + schema={"a": str}, + display="PARSE(col('a'), )", + refs={"a"}, + eval_result=[A(1, 2, "a"), A(2, 3, "b")], + expected_dtype=A, + proto_json=None, + ), + # Parse a list of integers + ExprTestCase( + expr=(col("a").str.parse(List[int])), + df=pd.DataFrame({"a": ["[1, 2, 3]", "[4, 5, 6]"]}), + schema={"a": str}, + display="PARSE(col('a'), typing.List[int])", + refs={"a"}, + eval_result=[[1, 2, 3], [4, 5, 6]], + expected_dtype=List[int], + proto_json=None, + ), + # Parse a nested struct + ExprTestCase( + expr=(col("a").str.parse(Nested)), + df=pd.DataFrame( + { + "a": [ + '{"a": {"x": 1, "y": 2, "z": "a"}, "b": {"p": 1, "q": "b"}, "c": [1, 2, 3]}' + ] + } + ), + schema={"a": str}, + display="PARSE(col('a'), )", + refs={"a"}, + eval_result=[Nested(A(1, 2, "a"), B(1, "b"), [1, 2, 3])], + expected_dtype=Nested, + proto_json=None, + ), + # Parse floats + ExprTestCase( + expr=(col("a").str.parse(float)), + df=pd.DataFrame({"a": ["1.1", "2.2", "3.3", "4.4"]}), + schema={"a": str}, + display="PARSE(col('a'), )", + refs={"a"}, + eval_result=[1.1, 2.2, 3.3, 4.4], + expected_dtype=float, + proto_json=None, + ), + # Parse bool + ExprTestCase( + expr=(col("a").str.parse(bool)), + df=pd.DataFrame({"a": ["true", "false", "true", "false"]}), + schema={"a": str}, + display="PARSE(col('a'), )", + refs={"a"}, + eval_result=[True, False, True, False], + expected_dtype=bool, + proto_json=None, + ), + # Parse strings + ExprTestCase( + expr=(col("a").str.parse(str)), + df=pd.DataFrame({"a": ['"a1"', '"b"', '"c"', '"d"']}), + schema={"a": str}, + display="PARSE(col('a'), )", + refs={"a"}, + eval_result=["a1", "b", "c", "d"], + expected_dtype=str, + proto_json=None, + ), + # Parse optional strings to structs + ExprTestCase( + expr=( + ( + col("a") + .fillnull('{"x": 12, "y": 21, "z": "rando"}') + .str.parse(A) + ) + ), + df=pd.DataFrame( + { + "a": [ + '{"x": 1, "y": 2, "z": "a"}', + '{"x": 2, "y": 3, "z": "b"}', + None, + None, + ] + } + ), + schema={"a": Optional[str]}, + display=None, + refs={"a"}, + eval_result=[ + A(1, 2, "a"), + A(2, 3, "b"), + A(12, 21, "rando"), + A(12, 21, "rando"), + ], + expected_dtype=A, + proto_json={}, + ), + ExprTestCase( + expr=( + ( + col("a") + .fillnull("""{"x": 12, "y": 21, "z": "rando"}""") + .str.parse(A) + ) + ), + df=pd.DataFrame( + { + "a": [ + '{"x": 1, "y": 2, "z": "a"}', + '{"x": 2, "y": 3, "z": "b"}', + None, + None, + ] + } + ), + schema={"a": Optional[str]}, + display=None, + refs={"a"}, + eval_result=[ + A(1, 2, "a"), + A(2, 3, "b"), + A(12, 21, "rando"), + A(12, 21, "rando"), + ], + expected_dtype=A, + proto_json={}, + ), + ExprTestCase( + expr=((col("a").fillnull("""{"w": 12}""").str.parse(OptionalA))), + df=pd.DataFrame( + { + "a": [ + '{"w": 1, "x": 2, "y": 3, "z": "a"}', + '{"w": 2, "x": 3, "y": 4, "z": "b"}', + None, + None, + ] + } + ), + schema={"a": Optional[str]}, + display=None, + refs={"a"}, + eval_result=[ + OptionalA(1, 2, 3, "a"), + OptionalA(2, 3, 4, "b"), + OptionalA(12, pd.NA, pd.NA, pd.NA), + OptionalA(12, pd.NA, pd.NA, pd.NA), + ], + expected_dtype=OptionalA, + proto_json={}, + ), + ] - # Parse strings - test_case = ExprTestCase( - expr=(col("a").str.parse(str)), - df=pd.DataFrame({"a": ['"a1"', '"b"', '"c"', '"d"']}), - schema={"a": str}, - display="PARSE(col('a'), )", - refs={"a"}, - eval_result=["a1", "b", "c", "d"], - expected_dtype=str, - proto_json=None, - ) - check_test_case(test_case) + for case in cases: + check_test_case(case) def test_list(): test_cases = [ - # ExprTestCase( - # expr=(col("a").list.get(0)), - # df=pd.DataFrame({"a": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]}), - # schema={"a": List[int]}, - # display="col('a')[0]", - # refs={"a"}, - # eval_result=[1, 4, 7], - # expected_dtype=Optional[int], - # proto_json=None, - # ), - # # Get index where index is an expression - # ExprTestCase( - # expr=(col("a").list.get(col("b") + col("c"))), - # df=pd.DataFrame( - # { - # "a": [[1, 2, 3, 4], [4, 5, 6, 12], [7, 8, 9, 19]], - # "b": [ - # 0, - # 1, - # 2, - # ], - # "c": [1, 2, 0], - # } - # ), - # schema={"a": List[int], "b": int, "c": int}, - # display="col('a')[(col('b') + col('c'))]", - # refs={"a", "b", "c"}, - # eval_result=[2, 12, 9], - # expected_dtype=Optional[int], - # proto_json=None, - # ), - # # Out of bounds index - # ExprTestCase( - # expr=(col("a").list.get(col("b"))), - # df=pd.DataFrame( - # { - # "a": [[1, 2, 3, 4], [4, 5, 6, 12], [7, 8, 9, 19]], - # "b": [0, 21, 5], - # } - # ), - # schema={"a": List[int], "b": int}, - # display="col('a')[col('b')]", - # refs={"a", "b"}, - # eval_result=[1, pd.NA, pd.NA], - # expected_dtype=Optional[int], - # proto_json=None, - # ), - # # List contains - # ExprTestCase( - # expr=(col("a").list.contains(3)), - # df=pd.DataFrame({"a": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]}), - # schema={"a": List[int]}, - # display="CONTAINS(col('a'), 3)", - # refs={"a"}, - # eval_result=[True, False, False], - # expected_dtype=bool, - # proto_json=None, - # ), - # # List contains with expression - # ExprTestCase( - # expr=(col("a").list.contains(col("b") * col("c"))), - # df=pd.DataFrame( - # { - # "a": [[1, 2, 3], [4, 15, 6], [7, 8, 9]], - # "b": [1, 5, 10], - # "c": [2, 3, 4], - # } - # ), - # schema={"a": List[int], "b": int, "c": int}, - # display="CONTAINS(col('a'), (col('b') * col('c')))", - # refs={"a", "b", "c"}, - # eval_result=[True, True, False], - # expected_dtype=bool, - # proto_json=None, - # ), - # # List contains for list of strings - # ExprTestCase( - # expr=(col("a2").list.contains(col("b2"))), - # df=pd.DataFrame( - # { - # "a2": [ - # ["a", "b", "c"], - # ["d", "e", "f"], - # ["g", "h", "i"], - # ["a", "b", "c"], - # ], - # "b2": ["a", "e", "c", "d"], - # } - # ), - # schema={"a2": List[str], "b2": str}, - # display="""CONTAINS(col('a2'), col('b2'))""", - # refs={"a2", "b2"}, - # eval_result=[True, True, False, False], - # expected_dtype=bool, - # proto_json=None, - # ), + ExprTestCase( + expr=(col("a").list.get(0)), + df=pd.DataFrame({"a": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]}), + schema={"a": List[int]}, + display="col('a')[0]", + refs={"a"}, + eval_result=[1, 4, 7], + expected_dtype=Optional[int], + proto_json=None, + ), + # Get index where index is an expression + ExprTestCase( + expr=(col("a").list.get(col("b") + col("c"))), + df=pd.DataFrame( + { + "a": [[1, 2, 3, 4], [4, 5, 6, 12], [7, 8, 9, 19]], + "b": [ + 0, + 1, + 2, + ], + "c": [1, 2, 0], + } + ), + schema={"a": List[int], "b": int, "c": int}, + display="col('a')[(col('b') + col('c'))]", + refs={"a", "b", "c"}, + eval_result=[2, 12, 9], + expected_dtype=Optional[int], + proto_json=None, + ), + # Out of bounds index + ExprTestCase( + expr=(col("a").list.get(col("b"))), + df=pd.DataFrame( + { + "a": [[1, 2, 3, 4], [4, 5, 6, 12], [7, 8, 9, 19]], + "b": [0, 21, 5], + } + ), + schema={"a": List[int], "b": int}, + display="col('a')[col('b')]", + refs={"a", "b"}, + eval_result=[1, pd.NA, pd.NA], + expected_dtype=Optional[int], + proto_json=None, + ), + # List contains + ExprTestCase( + expr=(col("a").list.contains(3)), + df=pd.DataFrame({"a": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]}), + schema={"a": List[int]}, + display="CONTAINS(col('a'), 3)", + refs={"a"}, + eval_result=[True, False, False], + expected_dtype=bool, + proto_json=None, + ), + # List contains with expression + ExprTestCase( + expr=(col("a").list.contains(col("b") * col("c"))), + df=pd.DataFrame( + { + "a": [[1, 2, 3], [4, 15, 6], [7, 8, 9]], + "b": [1, 5, 10], + "c": [2, 3, 4], + } + ), + schema={"a": List[int], "b": int, "c": int}, + display="CONTAINS(col('a'), (col('b') * col('c')))", + refs={"a", "b", "c"}, + eval_result=[True, True, False], + expected_dtype=bool, + proto_json=None, + ), + # List contains for list of strings + ExprTestCase( + expr=(col("a2").list.contains(col("b2"))), + df=pd.DataFrame( + { + "a2": [ + ["a", "b", "c"], + ["d", "e", "f"], + ["g", "h", "i"], + ["a", "b", "c"], + ], + "b2": ["a", "e", "c", "d"], + } + ), + schema={"a2": List[str], "b2": str}, + display="""CONTAINS(col('a2'), col('b2'))""", + refs={"a2", "b2"}, + eval_result=[True, True, False, False], + expected_dtype=bool, + proto_json=None, + ), # Support struct inside a list ExprTestCase( expr=( @@ -733,40 +833,40 @@ def test_list(): expected_dtype=bool, proto_json=None, ), - # ExprTestCase( - # expr=(col("a").list.len()), - # df=pd.DataFrame( - # {"a": [[A(1, 2, "a"), A(2, 3, "b"), A(4, 5, "c")]]} - # ), - # schema={"a": List[A]}, - # display="LEN(col('a'))", - # refs={"a"}, - # eval_result=[3], - # expected_dtype=int, - # proto_json=None, - # ), - # # List length - # ExprTestCase( - # expr=(col("a").list.len()), - # df=pd.DataFrame({"a": [[1, 2, 3], [4, 5, 6, 12], [7, 8, 9, 19]]}), - # schema={"a": List[int]}, - # display="LEN(col('a'))", - # refs={"a"}, - # eval_result=[3, 4, 4], - # expected_dtype=int, - # proto_json=None, - # ), - # # Empty list length - # ExprTestCase( - # expr=(col("a").list.len()), - # df=pd.DataFrame({"a": [[], [4, 5, 6, 12], [7, 8, 9, 19]]}), - # schema={"a": List[int]}, - # display="LEN(col('a'))", - # refs={"a"}, - # eval_result=[0, 4, 4], - # expected_dtype=int, - # proto_json=None, - # ), + ExprTestCase( + expr=(col("a").list.len()), + df=pd.DataFrame( + {"a": [[A(1, 2, "a"), A(2, 3, "b"), A(4, 5, "c")]]} + ), + schema={"a": List[A]}, + display="LEN(col('a'))", + refs={"a"}, + eval_result=[3], + expected_dtype=int, + proto_json=None, + ), + # List length + ExprTestCase( + expr=(col("a").list.len()), + df=pd.DataFrame({"a": [[1, 2, 3], [4, 5, 6, 12], [7, 8, 9, 19]]}), + schema={"a": List[int]}, + display="LEN(col('a'))", + refs={"a"}, + eval_result=[3, 4, 4], + expected_dtype=int, + proto_json=None, + ), + # Empty list length + ExprTestCase( + expr=(col("a").list.len()), + df=pd.DataFrame({"a": [[], [4, 5, 6, 12], [7, 8, 9, 19]]}), + schema={"a": List[int]}, + display="LEN(col('a'))", + refs={"a"}, + eval_result=[0, 4, 4], + expected_dtype=int, + proto_json=None, + ), ] for test_case in test_cases: @@ -1170,35 +1270,57 @@ def test_fillnull(): expected_dtype=datetime, proto_json=None, ), - ] - for case in cases: - check_test_case(case) - - -def test_isnull(): - cases = [ + # TODO(Nikhil): Add support for filling nulls for complex types + # Fill null for struct # ExprTestCase( - # expr=(col("a").isnull()), - # df=pd.DataFrame({"a": [1, 2, None, 4]}), - # schema={"a": Optional[int]}, - # display="IS_NULL(col('a'))", + # expr=(col("a").fillnull(lit(A(1, 2, "a"), A))), + # df=pd.DataFrame({"a": [A(1, 2, "a"), None, A(3, 4, "b")]}), + # schema={"a": Optional[A]}, + # display="""FILL_NULL(col('a'), {"x": 1, "y": 2, "z": "a"})""", # refs={"a"}, - # eval_result=[False, False, True, False], - # expected_dtype=bool, + # eval_result=[A(1, 2, "a"), A(1, 2, "a"), A(3, 4, "b")], + # expected_dtype=A, # proto_json=None, # ), + # Fill null for Optional[List[int]] # ExprTestCase( - # expr=(col("a").isnull()), - # df=pd.DataFrame({"a": ["a", "b", None, "d"]}), - # schema={"a": Optional[str]}, - # display="IS_NULL(col('a'))", + # expr=(col("a").fillnull(lit([1, 2, 3], List[int]))), + # df=pd.DataFrame({"a": [[1, 2, 3], None, [4, 5, 6]]}), + # schema={"a": Optional[List[int]]}, + # display="FILL_NULL(col('a'), [1, 2, 3])", # refs={"a"}, - # eval_result=[False, False, True, False], - # expected_dtype=bool, + # eval_result=[[1, 2, 3], [1, 2, 3], [4, 5, 6]], + # expected_dtype=List[int], # proto_json=None, # ), + ] + for case in cases: + check_test_case(case) + + +def test_isnull(): + cases = [ + ExprTestCase( + expr=(col("a").isnull()), + df=pd.DataFrame({"a": [1, 2, None, 4]}), + schema={"a": Optional[int]}, + display="IS_NULL(col('a'))", + refs={"a"}, + eval_result=[False, False, True, False], + expected_dtype=bool, + proto_json=None, + ), + ExprTestCase( + expr=(col("a").isnull()), + df=pd.DataFrame({"a": ["a", "b", None, "d"]}), + schema={"a": Optional[str]}, + display="IS_NULL(col('a'))", + refs={"a"}, + eval_result=[False, False, True, False], + expected_dtype=bool, + proto_json=None, + ), # Each type is a struct - # TODO(Aditya): Fix this test case ExprTestCase( expr=(col("a").isnull()), df=pd.DataFrame({"a": [A(1, 2, "a"), A(2, 3, "b"), None]}), diff --git a/fennel/internal_lib/schema/schema.py b/fennel/internal_lib/schema/schema.py index 5dd98180f..643cd51ea 100644 --- a/fennel/internal_lib/schema/schema.py +++ b/fennel/internal_lib/schema/schema.py @@ -292,17 +292,25 @@ def convert_dtype_to_arrow_type_with_nullable( return pa.decimal128(28, dtype.decimal_type.scale) elif dtype.HasField("array_type"): return pa.list_( - value_type=convert_dtype_to_arrow_type_with_nullable(dtype.array_type.of, nullable) + value_type=convert_dtype_to_arrow_type_with_nullable( + dtype.array_type.of, nullable + ) ) elif dtype.HasField("map_type"): - key_pa_type = convert_dtype_to_arrow_type_with_nullable(dtype.map_type.key, nullable=False) - value_pa_type = convert_dtype_to_arrow_type_with_nullable(dtype.map_type.value, nullable) + key_pa_type = convert_dtype_to_arrow_type_with_nullable( + dtype.map_type.key, nullable=False + ) + value_pa_type = convert_dtype_to_arrow_type_with_nullable( + dtype.map_type.value, nullable + ) return pa.map_(key_pa_type, value_pa_type, False) elif dtype.HasField("embedding_type"): embedding_size = dtype.embedding_type.embedding_size return pa.list_(pa.float64(), embedding_size) elif dtype.HasField("one_of_type"): - return convert_dtype_to_arrow_type_with_nullable(dtype.one_of_type.of, nullable) + return convert_dtype_to_arrow_type_with_nullable( + dtype.one_of_type.of, nullable + ) elif dtype.HasField("between_type"): return convert_dtype_to_arrow_type_with_nullable( dtype.between_type.dtype, nullable @@ -517,9 +525,6 @@ def validate_field_in_df( name = field.name dtype = field.dtype arrow_type = convert_dtype_to_arrow_type(dtype) - for col in df.columns: - print("Type of column: ", col, df[col].dtype) - print("Dtype: ", dtype) if df.shape[0] == 0: return if name not in df.columns: @@ -856,7 +861,6 @@ def is_hashable(dtype: Any) -> bool: def data_schema_check( schema: schema_proto.DSSchema, df: pd.DataFrame, dataset_name="" ) -> List[ValueError]: - print("Checkung schema", df.dtypes) exceptions = [] fields = [] for key in schema.keys.fields: @@ -960,11 +964,10 @@ def cast_col_to_arrow_dtype( # Let's convert structs into json, this is done because arrow # dtype conversion fails with fennel struct # Parse datetime values - series = series.apply(lambda x: parse_datetime_in_value(x, dtype)) if check_dtype_has_struct_type(dtype): series = series.apply(lambda x: parse_struct_into_dict(x, dtype)) + series = series.apply(lambda x: parse_datetime_in_value(x, dtype)) arrow_type = convert_dtype_to_arrow_type_with_nullable(dtype) - print("cast_col_to_arrow_dtype Arrow type: ", arrow_type, "dtype: ", dtype) return series.astype(pd.ArrowDtype(arrow_type)) diff --git a/fennel/internal_lib/utils/utils.py b/fennel/internal_lib/utils/utils.py index 306b26fad..2c8688d3b 100644 --- a/fennel/internal_lib/utils/utils.py +++ b/fennel/internal_lib/utils/utils.py @@ -129,7 +129,9 @@ def cast_col_to_pandas( return series.fillna(pd.NA) -def parse_struct_into_dict(value: Any, dtype: schema_proto.DataType) -> Optional[Union[dict, list]]: +def parse_struct_into_dict( + value: Any, dtype: schema_proto.DataType +) -> Optional[Union[dict, list]]: """ This function assumes that there's a struct somewhere in the value that needs to be converted into json. """ @@ -143,11 +145,19 @@ def parse_struct_into_dict(value: Any, dtype: schema_proto.DataType) -> Optional elif isinstance(value, list) or isinstance(value, np.ndarray): return [parse_struct_into_dict(x, dtype.array_type.of) for x in value] elif isinstance(value, dict) or isinstance(value, frozendict): - return {key: parse_struct_into_dict(val, dtype.map_type.value) for key, val in value.items()} + return { + key: parse_struct_into_dict(val, dtype.map_type.value) + for key, val in value.items() + } elif value is None or pd.isna(value): # If dtype is an optional struct type return, a dict with all fields as None - if dtype.HasField("optional_type") and dtype.optional_type.of.HasField("struct_type"): - return {field.name: None for field in dtype.optional_type.of.struct_type.fields} + if dtype.HasField("optional_type") and dtype.optional_type.of.HasField( + "struct_type" + ): + return { + field.name: None + for field in dtype.optional_type.of.struct_type.fields + } else: return None else: diff --git a/fennel/testing/executor.py b/fennel/testing/executor.py index de1d8b7d2..54ada5b29 100644 --- a/fennel/testing/executor.py +++ b/fennel/testing/executor.py @@ -814,13 +814,11 @@ def visitAssign(self, obj): df = copy.deepcopy(input_df) df.reset_index(drop=True, inplace=True) for col, typed_expr in obj.output_expressions.items(): - if col in input_ret.df.columns: - raise Exception( - f"Column `{col}` already present in dataframe" - ) input_dsschema = obj.node.dsschema().schema() try: - df[col] = typed_expr.expr.eval(input_df, input_dsschema) + df[col] = typed_expr.expr.eval( + input_df, input_dsschema, typed_expr.dtype, parse=False + ) except Exception as e: raise Exception( f"Error in assign node for column `{col}` for pipeline " diff --git a/poetry.lock b/poetry.lock index 0cd14396e..872848ae4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -766,44 +766,44 @@ devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benc [[package]] name = "fennel-data-lib" -version = "0.1.15" +version = "0.1.16" description = "" optional = false python-versions = ">=3.8" files = [ - {file = "fennel_data_lib-0.1.15-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:42ad4c903b09bc72486a81a5e4250e648d6d766241509facf0ebe72572e11585"}, - {file = "fennel_data_lib-0.1.15-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a48792d49000de3f1d2b8d7f1fc24a9b0a342f410f01ebb092141526d691842e"}, - {file = "fennel_data_lib-0.1.15-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3da42b3a7230ebfa0ee087c39c4b04b617bf5f13808b3cc593f3b50220921c59"}, - {file = "fennel_data_lib-0.1.15-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4ec33813adb02a37d6943ed0014927739f1b098c6d575f59ebf85399c364d2f1"}, - {file = "fennel_data_lib-0.1.15-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fc35dcee6c0afef19558698bc425ff73954d99ccae0f6d24a7dd0835c21439ef"}, - {file = "fennel_data_lib-0.1.15-cp310-cp310-manylinux_2_34_x86_64.whl", hash = "sha256:fae1e1a8a9cc925e2ba42a72ba27a070e9a63fd3dc68116d2894477d01b53394"}, - {file = "fennel_data_lib-0.1.15-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:21f469a4e6197ac6f64910a98fb6cdea88144dac2d582cc428e60be7f9c6fca8"}, - {file = "fennel_data_lib-0.1.15-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:93e90b0d02e37ff879879cb6a76f11870c42267c65a73e21b96e8999e3eba81e"}, - {file = "fennel_data_lib-0.1.15-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:f8215b50977267cfd4af6c0ad1ae1a7a5e4b73ac73ae089f196e3bfc573288db"}, - {file = "fennel_data_lib-0.1.15-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f94fd8c649210c4968aabf9a573f6616bd7cdc0a825672d077ca1e684eee32cc"}, - {file = "fennel_data_lib-0.1.15-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:05e8deff1ac986f9c9c3da2ed626402b3bcc01f28b1ac25f004c53f00567878d"}, - {file = "fennel_data_lib-0.1.15-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:eea0f44dfb2f401648a21fe94d8eb446f4e2cd0a1eb89384046906831b429f09"}, - {file = "fennel_data_lib-0.1.15-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:74c79b4b67e8af8e9d39a7316bf95c228d82c9ac99eabcad2889482c8a6f889a"}, - {file = "fennel_data_lib-0.1.15-cp311-cp311-manylinux_2_34_x86_64.whl", hash = "sha256:e50ba983c0fa58e606593a961dca099ff5bab92865c3f164b3296caa061c34e4"}, - {file = "fennel_data_lib-0.1.15-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:444b275022b2b7f19126787d86ca6265c654199e22a230aaf0b8dd0870822262"}, - {file = "fennel_data_lib-0.1.15-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:8f74d429b3e6d2c838e2e871809241a160adf39e923628e6a3b548a077a41872"}, - {file = "fennel_data_lib-0.1.15-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:0a46a9a094255b3c4984c963dc0c4aaeb63983b0a57966b5c6e85085de6a65dc"}, - {file = "fennel_data_lib-0.1.15-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e7720e9880248d819145f38aefc8b72d691960b461e29de5e49cf9894ff4ba7a"}, - {file = "fennel_data_lib-0.1.15-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9d0e6f3164c21819d3002c10ba3b70a3aed1dfa70369fba872559fa77d69badf"}, - {file = "fennel_data_lib-0.1.15-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ff7c034981c23904e4f13d9657b3667ce1df386133603cd3ed105719779759a4"}, - {file = "fennel_data_lib-0.1.15-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2a37af5a8c627d7d003fadfd9cf0430d45aeddaa9d4e788d91e224f9f1e28f94"}, - {file = "fennel_data_lib-0.1.15-cp312-cp312-manylinux_2_34_x86_64.whl", hash = "sha256:11d4859099357021ee2741b9cee84d481e69d3cb2e37ad5f7bb08e5517b805ea"}, - {file = "fennel_data_lib-0.1.15-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c2b129aebdb74b74019ee294cfcaba915f76344310430788792ee46c3d7cd3a1"}, - {file = "fennel_data_lib-0.1.15-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e277852036f79d65734d065bcd9ca0abcdefd6d773f285c528dfa851888451f4"}, - {file = "fennel_data_lib-0.1.15-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:631ce0e801a631ec4f15abc6a2b886e47b25076b7a444ae5d0838aee89090fae"}, - {file = "fennel_data_lib-0.1.15-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d3abba61270217bece19bc331ad16f55fb058f1aabef4ab3726cd18dc6fce89"}, - {file = "fennel_data_lib-0.1.15-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f094498f46803298c932ed521537d02bbdcc464a014b9e83d01ee00e053b193"}, - {file = "fennel_data_lib-0.1.15-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aa53fd1863c7f277c718bbd24891722223bee90d6b249216523cf7d1c9eefe42"}, - {file = "fennel_data_lib-0.1.15-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c058c90921fa7d1f4d178592ba6e054695ab9c619a33e07c935f2415faeb1104"}, - {file = "fennel_data_lib-0.1.15-cp39-cp39-manylinux_2_34_x86_64.whl", hash = "sha256:e1aae677fa041a1a4906ff846ed6585a6a9f9dc4c4909683354ad930c1e5d3a1"}, - {file = "fennel_data_lib-0.1.15-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:d360649a168a9452f5915c307d9dcfd8d2a910c5ae769a5eb7f7be80caddac68"}, - {file = "fennel_data_lib-0.1.15-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:5048ae590669033cbe6fcd6a88fb905a9c53f4d46611f9aed1510bdfbbde4347"}, - {file = "fennel_data_lib-0.1.15.tar.gz", hash = "sha256:158ff36a5e732ad48125881d03314dd25eece6ab3515ea672bf4fa49b2dea522"}, + {file = "fennel_data_lib-0.1.16-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:11b06ff8bf7787ccec56563da9450fcc412d90f22363581e99fe7e6f35e205eb"}, + {file = "fennel_data_lib-0.1.16-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3b6c3ef5d05c6e5511ad0d2918c24d090ccd1ce3c4479a8b9bd1bde6873d22c3"}, + {file = "fennel_data_lib-0.1.16-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a8523aa90d231ea87badf2dd9167d5c932eb38ab5ed9ec793547aef320f12881"}, + {file = "fennel_data_lib-0.1.16-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bd568b6231fbfbf363cc919dcf06c5baad57f039d4d9d765532106b1c04d4e63"}, + {file = "fennel_data_lib-0.1.16-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:15b3bd238169b4806d852fcbe700aa53886e3f5e49af7b9eb424c04e4e325528"}, + {file = "fennel_data_lib-0.1.16-cp310-cp310-manylinux_2_34_x86_64.whl", hash = "sha256:1d3a0616387486406ec7642ec5b1799e04757e7c33b0c274f93a8e7cde7f609e"}, + {file = "fennel_data_lib-0.1.16-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:0fbeb721d03587acfaf1036553cc3081ec53b32516da1525f964ab19f19a24ee"}, + {file = "fennel_data_lib-0.1.16-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:518b80c4091491561d8c2c808197694d8d68e453183eba14a9637104dc00700b"}, + {file = "fennel_data_lib-0.1.16-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:94c803f77dcdcad9a510614271425710a9c2bdfb402fcb4669002143eab62910"}, + {file = "fennel_data_lib-0.1.16-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e57db109a623a2858a6caf17ceb23dc8309a961d4e7d61c451f97615ffd22cd4"}, + {file = "fennel_data_lib-0.1.16-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a42224d01267792daf2c1c920396ca9224361e548ae56c2d7ed117194c3e6adf"}, + {file = "fennel_data_lib-0.1.16-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d943c614452b7ba3bc8d7deff8a0b3f45b8c43e1a287fe193a67d66969f3c1a6"}, + {file = "fennel_data_lib-0.1.16-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a079adfe00a907d5b44d33af62bd556bc5aa45d98bb35ed03be7f25b0627b5b5"}, + {file = "fennel_data_lib-0.1.16-cp311-cp311-manylinux_2_34_x86_64.whl", hash = "sha256:d3b6fce44931b0bd5ad1ccb012058de956f637035be2ccf83e2f6169c012ee7c"}, + {file = "fennel_data_lib-0.1.16-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:f64edaefcbe28528815571e1af629456a6a106fd198c71f01bc56e3538278117"}, + {file = "fennel_data_lib-0.1.16-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6460055fad1e954b2573122a34cae9218328738c5b233b551fe12074f24b958e"}, + {file = "fennel_data_lib-0.1.16-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:b16bbd97f0ae6aeb78d2a5aaf2071b3fe67d134d945264e8b19afc0fb93ff3fd"}, + {file = "fennel_data_lib-0.1.16-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc7adb37bde0671140288e2c25da95488bf1c2d33c66abaa59378b456cd0af63"}, + {file = "fennel_data_lib-0.1.16-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1f6ad71152ed0e1b97e37ba3cf83bb657f562625bf21ef3e79a72e2391604a0b"}, + {file = "fennel_data_lib-0.1.16-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de4e0034989a3387cd2595725d2c6b0528665c0503294e2d3c18e418d029c2ea"}, + {file = "fennel_data_lib-0.1.16-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b58d2e4193afeb83281216be32e77db472273160eeadd9e2f5ae1eb98fa8bc50"}, + {file = "fennel_data_lib-0.1.16-cp312-cp312-manylinux_2_34_x86_64.whl", hash = "sha256:9530eb18989cc111c5d490de47c309cb86a8a0fc66073b14f15ad9e5f5094d27"}, + {file = "fennel_data_lib-0.1.16-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:eea2890d58695f67f3e13aab938d5a59eb0ae635b19a1e686b05e96c895b464f"}, + {file = "fennel_data_lib-0.1.16-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:8a6dcbf7226565e328d6ddb5a91f71e30713087f4a0577ca6dc65b9ca11f2ba8"}, + {file = "fennel_data_lib-0.1.16-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:3b47d6c216de3714507b014ddaaed65832f5279ef9d07d33bf24c36d23b661e1"}, + {file = "fennel_data_lib-0.1.16-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94537e21649552f1664c922b8646c227e1e7ca4b22f8d55f728e70dda1fead43"}, + {file = "fennel_data_lib-0.1.16-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3f36e19036b5541213cfd2957f916a1613c640f0147a5203c3f841d8462d93f3"}, + {file = "fennel_data_lib-0.1.16-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6f8c705e9aa825326d3f46164659141db4980e19f6bbb09e4c1b57ea46e51c6b"}, + {file = "fennel_data_lib-0.1.16-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f7718f5f3362c2c40411d0cc96841e28e4902570b23db3fcd4c5803720939332"}, + {file = "fennel_data_lib-0.1.16-cp39-cp39-manylinux_2_34_x86_64.whl", hash = "sha256:1f6c0de8cf4fdda0a8353ede190b25aac067316ff027c3ce8b75f0326898007e"}, + {file = "fennel_data_lib-0.1.16-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:460fbe7b6e394968f214a933c0b3aff8a5754508cd607753500b53222d5d932f"}, + {file = "fennel_data_lib-0.1.16-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:22a94a261ccfe24678ee98786ef2b35be15041ccb31667df1697fa70d6132e50"}, + {file = "fennel_data_lib-0.1.16.tar.gz", hash = "sha256:fb0001dfcb995356190d19cf4b33855a133ea072fe5452dd72728ad223320e3c"}, ] [[package]] @@ -1861,13 +1861,13 @@ files = [ [[package]] name = "more-itertools" -version = "10.4.0" +version = "10.5.0" description = "More routines for operating on iterables, beyond itertools" optional = false python-versions = ">=3.8" files = [ - {file = "more-itertools-10.4.0.tar.gz", hash = "sha256:fe0e63c4ab068eac62410ab05cccca2dc71ec44ba8ef29916a0090df061cf923"}, - {file = "more_itertools-10.4.0-py3-none-any.whl", hash = "sha256:0f7d9f83a0a8dcfa8a2694a770590d98a67ea943e3d9f5298309a484758c4e27"}, + {file = "more-itertools-10.5.0.tar.gz", hash = "sha256:5482bfef7849c25dc3c6dd53a6173ae4795da2a41a80faea6700d9f5846c5da6"}, + {file = "more_itertools-10.5.0-py3-none-any.whl", hash = "sha256:037b0d3203ce90cca8ab1defbbdac29d5f993fc20131f3664dc8d6acfa872aef"}, ] [[package]] @@ -3328,13 +3328,13 @@ win32 = ["pywin32"] [[package]] name = "setuptools" -version = "74.1.1" +version = "74.1.2" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "setuptools-74.1.1-py3-none-any.whl", hash = "sha256:fc91b5f89e392ef5b77fe143b17e32f65d3024744fba66dc3afe07201684d766"}, - {file = "setuptools-74.1.1.tar.gz", hash = "sha256:2353af060c06388be1cecbf5953dcdb1f38362f87a2356c480b6b4d5fcfc8847"}, + {file = "setuptools-74.1.2-py3-none-any.whl", hash = "sha256:5f4c08aa4d3ebcb57a50c33b1b07e94315d7fc7230f7115e47fc99776c8ce308"}, + {file = "setuptools-74.1.2.tar.gz", hash = "sha256:95b40ed940a1c67eb70fc099094bd6e99c6ee7c23aa2306f4d2697ba7916f9c6"}, ] [package.extras] @@ -3540,13 +3540,13 @@ files = [ [[package]] name = "types-python-dateutil" -version = "2.9.0.20240821" +version = "2.9.0.20240906" description = "Typing stubs for python-dateutil" optional = false python-versions = ">=3.8" files = [ - {file = "types-python-dateutil-2.9.0.20240821.tar.gz", hash = "sha256:9649d1dcb6fef1046fb18bebe9ea2aa0028b160918518c34589a46045f6ebd98"}, - {file = "types_python_dateutil-2.9.0.20240821-py3-none-any.whl", hash = "sha256:f5889fcb4e63ed4aaa379b44f93c32593d50b9a94c9a60a0c854d8cc3511cd57"}, + {file = "types-python-dateutil-2.9.0.20240906.tar.gz", hash = "sha256:9706c3b68284c25adffc47319ecc7947e5bb86b3773f843c73906fd598bc176e"}, + {file = "types_python_dateutil-2.9.0.20240906-py3-none-any.whl", hash = "sha256:27c8cc2d058ccb14946eebcaaa503088f4f6dbc4fb6093d3d456a49aef2753f6"}, ] [[package]] @@ -3714,4 +3714,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "38238318d918ada7382d0d335dc4ad3e8c16c55d7ce0650c685acfafe5c72489" +content-hash = "d3fbd04d20d5dfb6f1bd354348f847db347d3b9273dfae5e5b4364c5e83424b0" diff --git a/pyproject.toml b/pyproject.toml index 7cf8bced6..edd57735a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "fennel-ai" -version = "1.5.17" +version = "1.5.18" description = "The modern realtime feature engineering platform" authors = ["Fennel AI "] packages = [{ include = "fennel" }] @@ -20,7 +20,7 @@ pytest = "7.1.3" pytest-rerunfailures = "^13.0" sortedcontainers = "^2.4.0" typing-extensions = "^4.12.0" -fennel-data-lib = "0.1.15" +fennel-data-lib = "0.1.16" pyarrow = "^14.0.2" [tool.poetry.dev-dependencies] @@ -43,20 +43,20 @@ pyyaml = "^6.0.1" # For production, we use poetry to build the python package -# [build-system] -# requires = ["poetry-core>=1.0.0"] -# build-backend = "poetry.core.masonry.api" +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" # For local development, we use maturin to build the rust library -[build-system] -requires = ["maturin", "setuptools>=42", "wheel"] -build-backend = "maturin" +# [build-system] +# requires = ["maturin", "setuptools>=42", "wheel"] +# build-backend = "maturin" -[tool.maturin] -name = "fennel_data_lib" -sdist-directory = "python_package" -manifest-path = "../server/fennel_data_lib/Cargo.toml" +# [tool.maturin] +# name = "fennel_data_lib" +# sdist-directory = "python_package" +# manifest-path = "../server/fennel_data_lib/Cargo.toml" # inspired from - https://github.com/pypa/pip/blob/main/pyproject.toml