diff --git a/fennel/client_tests/test_complex_autogen_extractor.py b/fennel/client_tests/test_complex_autogen_extractor.py index 9170372c3..87790ffe2 100644 --- a/fennel/client_tests/test_complex_autogen_extractor.py +++ b/fennel/client_tests/test_complex_autogen_extractor.py @@ -282,8 +282,8 @@ def test_complex_auto_gen_extractors(client): rider_df = pd.DataFrame( { "rider_id": [1], - "created": [datetime.now()], - "birthdate": [datetime.now() - timedelta(days=365 * 30)], + "created": [datetime.utcnow()], + "birthdate": [datetime.utcnow() - timedelta(days=365 * 30)], "country_code": ["US"], } ) @@ -298,7 +298,7 @@ def test_complex_auto_gen_extractors(client): "rider_id": [1], "vehicle_id": [1], "is_completed_trip": [1], - "created": [datetime.now()], + "created": [datetime.utcnow()], } ) log_response = client.log( @@ -311,7 +311,7 @@ def test_complex_auto_gen_extractors(client): country_license_df = pd.DataFrame( { "rider_id": [1], - "created": [datetime.now()], + "created": [datetime.utcnow()], "country_code": ["US"], } ) diff --git a/fennel/featuresets/featureset.py b/fennel/featuresets/featureset.py index 925cbbb81..0b43f0f12 100644 --- a/fennel/featuresets/featureset.py +++ b/fennel/featuresets/featureset.py @@ -32,7 +32,14 @@ set_meta_attr, ) from fennel.lib.includes import TierSelector -from fennel.lib.schema import FENNEL_INPUTS, FENNEL_OUTPUTS +from fennel.lib.schema import ( + FENNEL_INPUTS, + FENNEL_OUTPUTS, + validate_value_matches_type, + fennel_get_optional_inner, +) + +from fennel.lib import FENNEL_GEN_CODE_MARKER from fennel.utils import ( parse_annotation_comments, propogate_fennel_attributes, @@ -372,7 +379,6 @@ def extract( tier=tier, ) return self - provider_features = [] # If provider is none, then the provider is this featureset. The input features # are captured once this featureset is initialized @@ -480,8 +486,8 @@ def __init__( self._features = features self._feature_map = {feature.name: feature for feature in features} self._id_to_feature = {feature.id: feature for feature in features} - self._extractors = self._get_extractors() self._validate() + self._extractors = self._get_extractors() self._add_feature_names_as_attributes() self._set_extractors_as_attributes() self._expectation = self._get_expectations() @@ -576,6 +582,60 @@ def _get_extractors(self) -> List[Extractor]: return extractors def _validate(self): + cls_module = inspect.getmodule(self.__fennel_original_cls__) + if cls_module is not None and hasattr( + cls_module, FENNEL_GEN_CODE_MARKER + ): + if getattr(cls_module, FENNEL_GEN_CODE_MARKER): + return + + # Validate all auto generated extractors. + for feature in self._features: + if feature.extractor: + extractor = feature.extractor + if extractor.extractor_type == ExtractorType.ALIAS: + # Check that the types match + if feature.dtype != extractor.inputs[0].dtype: + raise TypeError( + f"Feature `{feature.fqn()}` has type `{feature.dtype}` " + f"but the extractor aliasing `{extractor.inputs[0].fqn()}` has input type " + f"`{extractor.inputs[0].dtype}`." + ) + if extractor.extractor_type == ExtractorType.LOOKUP: + # Check that the types match + field_dtype = extractor.derived_extractor_info.field.dtype + if extractor.derived_extractor_info.default is not None: + if extractor.derived_extractor_info.field.is_optional(): + field_dtype = fennel_get_optional_inner(field_dtype) + + if feature.dtype != field_dtype: + raise TypeError( + f"Feature `{feature.fqn()}` has type `{feature.dtype}` " + f"but expected type `{extractor.derived_extractor_info.field.dtype}`." + ) + else: + if extractor.derived_extractor_info.field.is_optional(): + expected_field_type = field_dtype + else: + expected_field_type = Optional[field_dtype] + + if feature.dtype != expected_field_type: + raise TypeError( + f"Feature `{feature.fqn()}` has type `{feature.dtype}` " + f"but expectected type `{expected_field_type}`" + ) + # Check that the default value has the right type + if extractor.derived_extractor_info.default is not None: + try: + validate_value_matches_type( + extractor.derived_extractor_info.default, + extractor.derived_extractor_info.field.dtype, + ) + except ValueError as e: + raise ValueError( + f"Default value `{extractor.derived_extractor_info.default}` for feature `{feature.fqn()}` has incorrect default value: {e}" + ) + # Check that all features have unique ids. feature_id_set = set() for feature in self._features: diff --git a/fennel/featuresets/test_derived_extractors.py b/fennel/featuresets/test_derived_extractors.py index 9bd729a02..79a2e5ccc 100644 --- a/fennel/featuresets/test_derived_extractors.py +++ b/fennel/featuresets/test_derived_extractors.py @@ -57,7 +57,7 @@ class UserInfo: default="unspecified", ) # lookup with meta - age_years: float = ( + age_years: int = ( feature(id=3) .extract( field=UserInfoDataset.age, @@ -101,7 +101,7 @@ class AgeInfo: # alias a feature that has an explicit extractor age_group: AgeGroup = feature(id=1).extract(feature=UserInfo.age_group) # alias a feature that has a derived extractor - age: float = feature(id=2).extract(feature=UserInfo.age_years) + age: int = feature(id=2).extract(feature=UserInfo.age_years) view = InternalTestClient() view.add(UserInfoDataset) @@ -168,7 +168,7 @@ def test_feature(actual_feature, expected_dict): { "id": 3, "name": "age_years", - "dtype": {"double_type": {}}, + "dtype": {"int_type": {}}, "metadata": { "owner": "zaki@fennel.ai", "description": "lookup with meta", @@ -211,7 +211,7 @@ def test_feature(actual_feature, expected_dict): { "id": 2, "name": "age", - "dtype": {"double_type": {}}, + "dtype": {"int_type": {}}, "metadata": { "description": "alias a feature that has a derived extractor" }, diff --git a/fennel/featuresets/test_featureset.py b/fennel/featuresets/test_featureset.py index 3a31f3d3c..21f7b3428 100644 --- a/fennel/featuresets/test_featureset.py +++ b/fennel/featuresets/test_featureset.py @@ -367,7 +367,7 @@ class UserInfo: income: int = feature(id=5).extract( # type: ignore field=UserInfoDataset.avg_income, provider=Request, - default="pluto", + default=1, tier=["~prod"], ) diff --git a/fennel/featuresets/test_invalid_featureset.py b/fennel/featuresets/test_invalid_featureset.py index 8c76b339e..a82ad53ff 100644 --- a/fennel/featuresets/test_invalid_featureset.py +++ b/fennel/featuresets/test_invalid_featureset.py @@ -1,5 +1,6 @@ from datetime import datetime from typing import Optional, List +import sys import pandas as pd import pytest @@ -8,11 +9,16 @@ from fennel.datasets import dataset, field from fennel.featuresets import featureset, extractor, feature from fennel.lib.schema import inputs, outputs +from fennel.sources import source, Webhook # noinspection PyUnresolvedReferences from fennel.test_lib import * +__owner__ = "data@fennel.ai" +webhook = Webhook(name="fennel_webhook") + +@source(webhook.endpoint("UserInfoDataset")) @dataset class UserInfoDataset: user_id: int = field(key=True) @@ -21,6 +27,7 @@ class UserInfoDataset: # Users date of birth dob: str age: int + ids: List[int] account_creation_date: datetime country: Optional[str] timestamp: datetime = field(timestamp=True) @@ -238,3 +245,114 @@ class UserInfo: str(e.value) == "Feature `extractors` in `UserInfo` has a reserved name `extractors`." ) + + +@featureset +class UserInfo2: + user_id: int = feature(id=1) + home_geoid: int = feature(id=2) + age: int = feature(id=3).extract(field=UserInfoDataset.age, default=0) # type: ignore + credit_score: int = feature(id=4) + + +@mock +def test_tp(client): + client.sync(datasets=[UserInfoDataset], featuresets=[UserInfo2]) + + +@mock +def test_invalid_autogenerated_extractors(client): + with pytest.raises(TypeError) as e: + + @featureset + class UserInfo: + user_id: int = feature(id=1) + home_geoid: int = feature(id=2) + age: int = feature(id=3).extract(field=UserInfoDataset.age) + credit_score: int = feature(id=4) + + # age should be optional[int] + if sys.version_info < (3, 9): + assert ( + str(e.value) + == "Feature `UserInfo.age` has type `` but expectected type `typing.Union[int, NoneType]`" + ) + else: + assert ( + str(e.value) + == "Feature `UserInfo.age` has type `` but expectected type `typing.Optional[int]`" + ) + + with pytest.raises(TypeError) as e: + + @featureset + class UserInfo1: + user_id: int = feature(id=1) + home_geoid: int = feature(id=2) + age: float = feature(id=3).extract( + field=UserInfoDataset.age, default=0 + ) + credit_score: int = feature(id=4) + + # age should be int + assert ( + str(e.value) + == "Feature `UserInfo1.age` has type `` but expected type ``." + ) + + with pytest.raises(TypeError) as e: + + @featureset + class UserInfo2: + home_geoid: int = feature(id=2) + age: int = feature(id=3) + credit_score: int = feature(id=4) + + @featureset + class UserInfo2: + user_id: int = feature(id=1) + home_geoid: float = feature(id=2).extract( + feature=UserInfo2.home_geoid + ) + age: int = feature(id=3) + credit_score: int = feature(id=4) + + # home_geoid should be int + assert ( + str(e.value) + == "Feature `UserInfo2.home_geoid` has type `` but the extractor aliasing `UserInfo2.home_geoid` has input type ``." + ) + + with pytest.raises(ValueError) as e: + + @featureset + class UserInfo3: + user_id: int = feature(id=1) + home_geoid: int = feature(id=2) + age: int = feature(id=4).extract( + field=UserInfoDataset.age, default=0.0 + ) + credit_score: int = feature(id=5) + + # default value for age should be 0 + assert ( + str(e.value) + == "Default value `0.0` for feature `UserInfo3.age` has incorrect default value: Value `0.0` does not match type `int`" + ) + + with pytest.raises(ValueError) as e: + + @featureset + class UserInfo4: + user_id: int = feature(id=1) + home_geoid: int = feature(id=2) + country: str = feature(id=4).extract( + field=UserInfoDataset.country, default=0.0 + ) + credit_score: int = feature(id=5) + + # default value for age should be 0 + assert ( + str(e.value) + == "Default value `0.0` for feature `UserInfo4.country` has incorrect default value: Value `0.0` does not match type `Optional[str]`" + ) diff --git a/fennel/lib/__init__.py b/fennel/lib/__init__.py index e69de29bb..d03c3b43d 100644 --- a/fennel/lib/__init__.py +++ b/fennel/lib/__init__.py @@ -0,0 +1 @@ +FENNEL_GEN_CODE_MARKER = "__fennel_gen_code__" diff --git a/fennel/lib/schema/__init__.py b/fennel/lib/schema/__init__.py index 112d4bd62..ca31a2636 100644 --- a/fennel/lib/schema/__init__.py +++ b/fennel/lib/schema/__init__.py @@ -22,4 +22,5 @@ FENNEL_STRUCT, FENNEL_STRUCT_DEPENDENCIES_SRC_CODE, FENNEL_STRUCT_SRC_CODE, + validate_value_matches_type, ) diff --git a/fennel/lib/schema/schema.py b/fennel/lib/schema/schema.py index 730f0cb61..9b58aaefd 100644 --- a/fennel/lib/schema/schema.py +++ b/fennel/lib/schema/schema.py @@ -800,6 +800,19 @@ def is_hashable(dtype: Any) -> bool: return False +def validate_value_matches_type(value: Any, dtype: Any) -> None: + # Create a dataframe with a single row and check the schema + # If the schema check fails, then the value does not match the type + df = pd.DataFrame({"0": [value]}) + field = schema_proto.Field(name="0", dtype=get_datatype(dtype)) + try: + _validate_field_in_df(field, df, "value") + except ValueError: + raise ValueError( + f"Value `{value}` does not match type `{dtype_to_string(dtype)}`" + ) + + def data_schema_check( schema: schema_proto.DSSchema, df: pd.DataFrame, dataset_name="" ) -> List[ValueError]: diff --git a/fennel/lib/to_proto/__init__.py b/fennel/lib/to_proto/__init__.py index 2004b074c..7e9e01764 100644 --- a/fennel/lib/to_proto/__init__.py +++ b/fennel/lib/to_proto/__init__.py @@ -10,3 +10,4 @@ to_duration_proto, to_sync_request_proto, ) +from fennel.lib.to_proto.source_code import FENNEL_GEN_CODE_MARKER diff --git a/fennel/lib/to_proto/source_code.py b/fennel/lib/to_proto/source_code.py index c73af1d29..5dbef76fe 100644 --- a/fennel/lib/to_proto/source_code.py +++ b/fennel/lib/to_proto/source_code.py @@ -15,6 +15,7 @@ FENNEL_STRUCT_SRC_CODE, ) from fennel.utils import fennel_get_source +from fennel.lib import FENNEL_GEN_CODE_MARKER def _remove_empty_lines(source_code: str) -> str: @@ -214,7 +215,8 @@ def get_all_imports() -> str: "from fennel.datasets.datasets import dataset_lookup", ] - return "\n".join(imports + fennel_imports) + "\n" + gen_code_marker = f"{FENNEL_GEN_CODE_MARKER}=True\n" + return gen_code_marker + "\n".join(imports + fennel_imports) + "\n" def to_includes_proto(func: Callable) -> pycode_proto.PyCode: diff --git a/fennel/test_lib/executor.py b/fennel/test_lib/executor.py index ef42081f9..191280b2b 100644 --- a/fennel/test_lib/executor.py +++ b/fennel/test_lib/executor.py @@ -421,9 +421,15 @@ def visitAssign(self, obj): return None df = input_ret.df df[obj.column] = obj.func(df) - df[obj.column] = cast_col_to_dtype( - df[obj.column], get_datatype(obj.output_type) - ) + try: + df[obj.column] = cast_col_to_dtype( + df[obj.column], get_datatype(obj.output_type) + ) + except Exception as e: + raise Exception( + f"Error in assign node for column `{obj.column}` for pipeline " + f"`{self.cur_pipeline_name}`, {e}" + ) return NodeRet(df, input_ret.timestamp_field, input_ret.key_fields) def visitDedup(self, obj): diff --git a/fennel/test_lib/mock_client.py b/fennel/test_lib/mock_client.py index cec119292..60af3a580 100644 --- a/fennel/test_lib/mock_client.py +++ b/fennel/test_lib/mock_client.py @@ -228,7 +228,9 @@ def get_extractor_func(extractor_proto: ProtoExtractor) -> Callable: sys.modules[fqn] = mod exec(code, mod.__dict__) except Exception as e: - raise Exception(f"Error while executing code: {code} : {str(e)}") + raise Exception( + f"Error while executing code for {fqn}:\n {code} \n: {str(e)}" + ) return mod.__dict__[extractor_proto.pycode.entry_point] diff --git a/pyproject.toml b/pyproject.toml index 741c91fc2..891f9b8bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "fennel-ai" -version = "0.18.23" +version = "0.18.24" description = "The modern realtime feature engineering platform" authors = ["Fennel AI "] packages = [{ include = "fennel" }]