From 30b7a569109996e377b67461d002f9d7bf6542b9 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Sat, 7 Sep 2024 20:37:37 +0200 Subject: [PATCH] Add Zyp Treatments, a more tailored transformation subsystem --- CHANGES.md | 3 +- src/zyp/model/collection.py | 3 + src/zyp/model/treatment.py | 69 ++++++++++++ tests/zyp/test_collection.py | 21 +++- tests/zyp/test_treatment.py | 103 ++++++++++++++++++ .../transformation-collection-treatment.yaml | 5 + 6 files changed, 199 insertions(+), 5 deletions(-) create mode 100644 src/zyp/model/treatment.py create mode 100644 tests/zyp/test_treatment.py create mode 100644 tests/zyp/transformation-collection-treatment.yaml diff --git a/CHANGES.md b/CHANGES.md index dd1443f..0d0c9bc 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,7 @@ # Changelog ## Unreleased +- Added Zyp Treatments, a more tailored transformation subsystem ## 2024/09/02 v0.0.14 - Replace poor man's relation name quoting with implementation @@ -41,7 +42,7 @@ column. This allows defining primary keys on the sink table. ## 2024/08/14 v0.0.4 -- Added `BucketTransformation`, a minimal transformation engine +- Added Zyp Transformations, a minimal transformation engine based on JSON Pointer (RFC 6901). 
import typing as t

from attr import Factory
from attrs import define

from zyp.model.base import Dumpable
from zyp.model.bucket import DictOrList, Record


@define
class Treatment(Dumpable):
    """
    Declarative clean-up rules, applied to every dictionary of a nested structure.

    The rules are evaluated per record, in this order: ignore, convert to list,
    convert to string, convert to dict, prune invalid dates.
    """

    # Drop items whose value is a non-empty list with a dictionary as first
    # element, unless that dictionary's first key starts with `$`.
    ignore_complex_lists: bool = False
    # Names of items to drop unconditionally.
    ignore_columns: t.List[str] = Factory(list)
    # Names of items to wrap into a single-element list, unless already a list.
    convert_list: t.List[str] = Factory(list)
    # Names of items to convert using `str()`, unless already a string.
    convert_string: t.List[str] = Factory(list)
    # Rules of shape `{"name": ..., "wrapper_name": ...}`: The item `name` is
    # wrapped into `{wrapper_name: value}`, unless already a dictionary.
    convert_dict: t.List[t.Dict[str, str]] = Factory(list)
    # Names of items to drop when they are not a dictionary, or when they are
    # a dictionary whose `date` member is a string.
    prune_invalid_date: t.List[str] = Factory(list)

    def apply(self, data: DictOrList) -> DictOrList:
        """
        Recursively treat `data` and return the treated structure.

        Dictionaries are treated by `apply_record` and then descended into,
        lists are descended into element by element, any other value is
        returned as-is.

        The input is not modified: each dictionary is shallow-copied before
        the in-place record treatment runs on it.
        """
        if isinstance(data, dict):
            # Treat a copy, so the caller's record keeps its keys and values.
            record = self.apply_record(dict(data))
            return {key: self.apply(value) for key, value in record.items()}
        if isinstance(data, list):
            return t.cast(list, [self.apply(item) for item in data])
        return data

    def apply_record(self, data: Record) -> Record:
        """
        Treat a single record **in place**, and return it.

        Only the top level of `data` is inspected, nested containers are the
        business of `apply`.

        Raises `KeyError` when a `convert_dict` rule lacks the `name` or
        `wrapper_name` key.
        """
        # Apply global and computed ignores.
        for name in self.ignore_columns + self._complex_list_names(data):
            data.pop(name, None)

        # Converge certain items to `list` even when defined differently.
        for name in self.convert_list:
            if name in data and not isinstance(data[name], list):
                data[name] = [data[name]]

        # Converge certain items to `str` even when defined differently.
        for name in self.convert_string:
            if name in data and not isinstance(data[name], str):
                data[name] = str(data[name])

        # Converge certain items to `dict` even when defined differently.
        for rule in self.convert_dict:
            name = rule["name"]
            wrapper_name = rule["wrapper_name"]
            if name in data and not isinstance(data[name], dict):
                data[name] = {wrapper_name: data[name]}

        # Prune invalid date representations.
        for name in self.prune_invalid_date:
            if name not in data:
                continue
            value = data[name]
            if not isinstance(value, dict) or isinstance(value.get("date"), str):
                del data[name]

        return data

    def _complex_list_names(self, data: Record) -> t.List[str]:
        """Return the names of items holding lists of complex objects, if enabled."""
        if not self.ignore_complex_lists:
            return []
        names = []
        for name, value in data.items():
            # Only the first element decides whether the list counts as complex.
            if not (isinstance(value, list) and value and isinstance(value[0], dict)):
                continue
            # Skip ignoring special-encoded items, recognized by a `$` prefix
            # on the first key. An empty dictionary has no first key, and a
            # non-string key cannot carry the prefix.
            first_key = next(iter(value[0]), None)
            if isinstance(first_key, str) and first_key.startswith("$"):
                continue
            names.append(name)
        return names
from copy import deepcopy

from zyp.model.treatment import Treatment

RECORD_IN = {
    "data": {
        "ignore_complex_list": [{}],
        "ignore_column": 123,
        "invalid_date_scalar": 123,
        "invalid_date_nested": {"date": "123"},
        "to_string": 123,
        "to_list": 123,
        "to_dict": 123,
    },
}

RECORD_OUT = {
    "data": {
        "to_string": "123",
        "to_list": [123],
        "to_dict": {"id": 123},
    },
}


def test_treatment_all():
    """
    Verify all treatment rules work together on a nested record.
    """
    transformation = Treatment(
        ignore_complex_lists=True,
        ignore_columns=["ignore_column"],
        prune_invalid_date=["invalid_date_scalar", "invalid_date_nested"],
        convert_dict=[{"name": "to_dict", "wrapper_name": "id"}],
        convert_list=["to_list"],
        convert_string=["to_string"],
    )
    # Hand over a copy, so the shared module-level fixture stays pristine
    # regardless of whether the treatment mutates its input.
    assert transformation.apply(deepcopy(RECORD_IN)) == RECORD_OUT


def test_treatment_noop():
    """
    Verify a treatment without any rules returns the data unchanged.
    """
    transformation = Treatment()
    assert transformation.apply([{"data": {"abc": 123}}]) == [{"data": {"abc": 123}}]


def test_treatment_ignore_complex_lists_basic():
    """
    Verify `ignore_complex_lists` drops items holding lists of dictionaries.
    """
    transformation = Treatment(ignore_complex_lists=True)
    assert transformation.apply([{"data": [{"abc": 123}]}]) == [{}]


def test_treatment_ignore_complex_lists_with_specials():
    """
    Verify `ignore_complex_lists` keeps lists whose first item uses a `$`-prefixed key.
    """
    transformation = Treatment(ignore_complex_lists=True)
    assert transformation.apply([{"data": [{"abc": 123}], "stamps": [{"$date": 123}]}]) == [
        {"stamps": [{"$date": 123}]}
    ]


def test_treatment_ignore_columns():
    """
    Verify `ignore_columns` removes the named item, also within nested records.
    """
    transformation = Treatment(ignore_columns=["abc"])
    assert transformation.apply([{"data": [{"abc": 123}]}]) == [{"data": [{}]}]


def test_treatment_convert_string():
    """
    Verify `convert_string` converts the named item to a string.
    """
    transformation = Treatment(convert_string=["abc"])
    assert transformation.apply([{"data": [{"abc": 123}]}]) == [{"data": [{"abc": "123"}]}]


def test_treatment_convert_list():
    """
    Verify `convert_list` wraps the named item into a list.
    """
    transformation = Treatment(convert_list=["abc"])
    assert transformation.apply([{"data": [{"abc": 123}]}]) == [{"data": [{"abc": [123]}]}]


def test_treatment_convert_dict():
    """
    Verify `convert_dict` wraps the named item into a dictionary using `wrapper_name`.
    """
    transformation = Treatment(convert_dict=[{"name": "abc", "wrapper_name": "id"}])
    assert transformation.apply([{"data": [{"abc": 123}]}]) == [{"data": [{"abc": {"id": 123}}]}]


def test_treatment_prune_invalid_date():
    """
    Verify `prune_invalid_date` removes the named item when it is not a dictionary.
    """
    transformation = Treatment(prune_invalid_date=["date"])
    assert transformation.apply([{"data": [{"date": 123}]}]) == [{"data": [{}]}]
    # The outer `date` is a dictionary without a string `date` member, so it
    # is kept. The rule is applied again within it, where the inner `date`
    # is a scalar, and gets pruned.
    assert transformation.apply([{"data": [{"date": {"date": 123}}]}]) == [{"data": [{"date": {}}]}]