From f9d592681f9f8ebd64a08aecf838e04d9713c3d7 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Sat, 7 Sep 2024 20:37:37 +0200 Subject: [PATCH] Zyp Treatments: A slightly tailored transformation subsystem --- CHANGES.md | 3 +- src/zyp/model/base.py | 4 + src/zyp/model/bucket.py | 4 - src/zyp/model/collection.py | 7 +- src/zyp/model/moksha.py | 3 +- src/zyp/model/treatment.py | 68 ++++++++++++ tests/zyp/test_collection.py | 21 +++- tests/zyp/test_treatment.py | 103 ++++++++++++++++++ .../transformation-collection-treatment.yaml | 5 + 9 files changed, 206 insertions(+), 12 deletions(-) create mode 100644 src/zyp/model/treatment.py create mode 100644 tests/zyp/test_treatment.py create mode 100644 tests/zyp/transformation-collection-treatment.yaml diff --git a/CHANGES.md b/CHANGES.md index dd1443f..9fc73ea 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,7 @@ # Changelog ## Unreleased +- Added Zyp Treatments, a slightly tailored transformation subsystem ## 2024/09/02 v0.0.14 - Replace poor man's relation name quoting with implementation @@ -41,7 +42,7 @@ column. This allows defining primary keys on the sink table. ## 2024/08/14 v0.0.4 -- Added `BucketTransformation`, a minimal transformation engine +- Added Zyp Transformations, a minimal transformation engine based on JSON Pointer (RFC 6901). - Added documentation using Sphinx and Read the Docs diff --git a/src/zyp/model/base.py b/src/zyp/model/base.py index f833ca4..91a766d 100644 --- a/src/zyp/model/base.py +++ b/src/zyp/model/base.py @@ -9,6 +9,10 @@ from zyp.util.data import no_privates_no_nulls_no_empties +Record = t.Dict[str, t.Any] +Collection = t.List[Record] +DictOrList = t.Union[Record, Collection] + @define class Metadata: diff --git a/src/zyp/model/bucket.py b/src/zyp/model/bucket.py index 6e03d51..e9e6412 100644 --- a/src/zyp/model/bucket.py +++ b/src/zyp/model/bucket.py @@ -17,11 +17,7 @@ logger = logging.getLogger(__name__) -Record = t.Dict[str, t.Any] -Collection = t.List[Record] -DictOrList = t.Union[Record, Collection] TransonTemplate = t.Dict[str, t.Any] - MokshaTransformer = t.Union[jmespath.parser.ParsedResult, jq._Program, transon.Transformer] diff --git a/src/zyp/model/collection.py b/src/zyp/model/collection.py index f618f2f..f22ea1a 100644 --- a/src/zyp/model/collection.py +++ b/src/zyp/model/collection.py @@ -2,9 +2,10 @@ from attrs import define -from zyp.model.base import Dumpable, Metadata, SchemaDefinition -from zyp.model.bucket import BucketTransformation, Collection, DictOrList +from zyp.model.base import Collection, DictOrList, Dumpable, Metadata, SchemaDefinition +from zyp.model.bucket import BucketTransformation from zyp.model.moksha import MokshaTransformation +from zyp.model.treatment import Treatment @define(frozen=True) @@ -35,4 +36,6 @@ def apply(self, data: DictOrList) -> Collection: collection_out.append(item) if self.post: collection_out = t.cast(Collection, self.post.apply(collection_out)) + if self.treatment: + collection_out = t.cast(Collection, self.treatment.apply(collection_out)) return collection_out diff --git a/src/zyp/model/moksha.py b/src/zyp/model/moksha.py index 968e26b..97c7637 100644 --- a/src/zyp/model/moksha.py +++ b/src/zyp/model/moksha.py @@ -7,7 +7,8 @@ from attr import Factory from attrs import define -from zyp.model.bucket import ConverterBase, DictOrList, MokshaTransformer, TransonTemplate +from zyp.model.base import DictOrList +from zyp.model.bucket import ConverterBase, MokshaTransformer, TransonTemplate from zyp.util.expression import compile_expression diff --git a/src/zyp/model/treatment.py b/src/zyp/model/treatment.py new file mode 100644 index 0000000..f7392c0 --- /dev/null +++ b/src/zyp/model/treatment.py @@ -0,0 +1,68 @@ +import typing as t + +from attr import Factory +from attrs import define + +from zyp.model.base import Collection, DictOrList, Dumpable, Record + + +@define +class Treatment(Dumpable): + ignore_complex_lists: bool = False + ignore_field: t.List[str] = Factory(list) + convert_list: t.List[str] = Factory(list) + convert_string: t.List[str] = Factory(list) + convert_dict: t.List[t.Dict[str, str]] = Factory(list) + prune_invalid_date: t.List[str] = Factory(list) + + def apply(self, data: DictOrList) -> DictOrList: + if isinstance(data, dict): + self.apply_record(data) + return {k: self.apply(v) for (k, v) in data.items()} + elif isinstance(data, list): + return t.cast(list, [self.apply(v) for v in data]) + return data + + def apply_record(self, data: Record) -> Record: + # Optionally ignore lists of complex objects. + local_ignores = [] + if self.ignore_complex_lists: + for k, v in data.items(): + if isinstance(v, list) and v and isinstance(v[0], dict): + # Skip ignoring special-encoded items. + if v[0] and list(v[0].keys())[0].startswith("$"): + continue + local_ignores.append(k) + + # Apply global and computed ignores. + for ignore_name in self.ignore_field + local_ignores: + if ignore_name in data: + del data[ignore_name] + + # Converge certain items to `list` even when defined differently. + for to_list_name in self.convert_list: + if to_list_name in data and not isinstance(data[to_list_name], list): + data[to_list_name] = [data[to_list_name]] + + # Converge certain items to `str` even when defined differently. + for name in self.convert_string: + if name in data and not isinstance(data[name], str): + data[name] = str(data[name]) + + # Converge certain items to `dict` even when defined differently. + for rule in self.convert_dict: + name = rule["name"] + wrapper_name = rule["wrapper_name"] + if name in data and not isinstance(data[name], dict): + data[name] = {wrapper_name: data[name]} + + # Prune invalid date representations. + for key in self.prune_invalid_date: + if key in data: + if not isinstance(data[key], dict): + del data[key] + elif "date" in data[key]: + if isinstance(data[key]["date"], str): + del data[key] + + return data diff --git a/tests/zyp/test_collection.py b/tests/zyp/test_collection.py index d690c45..b30015d 100644 --- a/tests/zyp/test_collection.py +++ b/tests/zyp/test_collection.py @@ -86,18 +86,31 @@ def test_collection_transformation_serialize(): } dict_result = transformation.to_dict() assert dict_result == transformation_dict - return yaml_result = transformation.to_yaml() assert yaml.full_load(yaml_result) == transformation_dict - CollectionTransformation.from_yaml(yaml_result) + transformation_second = CollectionTransformation.from_yaml(yaml_result) + assert isinstance(transformation_second, CollectionTransformation) -def test_collection_transformation_load_and_apply(): +def test_collection_transformation_regular_load_and_apply(): """ - Verify transformation can be loaded from JSON and applied again. + Verify rule-based transformations can be loaded and applied. """ payload = Path("tests/zyp/transformation-collection.yaml").read_text() transformation = CollectionTransformation.from_yaml(payload) result = transformation.apply(deepcopy(ComplexRecipe.data_in)) assert result == ComplexRecipe.data_out + + +def test_collection_transformation_treatment_load_and_apply(): + """ + Verify collection transformation with treatment can be loaded and applied. + """ + payload = Path("tests/zyp/transformation-collection-treatment.yaml").read_text() + transformation = CollectionTransformation.from_yaml(payload) + result = transformation.apply(deepcopy(ComplexRecipe.data_in)) + assert result == { + "message-source": "system-3000", + "message-type": "eai-warehouse", + } diff --git a/tests/zyp/test_treatment.py b/tests/zyp/test_treatment.py new file mode 100644 index 0000000..fad5ad7 --- /dev/null +++ b/tests/zyp/test_treatment.py @@ -0,0 +1,103 @@ +from zyp.model.treatment import Treatment + +RECORD_IN = { + "data": { + "ignore_complex_list": [{}], + "ignore_field": 123, + "invalid_date_scalar": 123, + "invalid_date_nested": {"date": "123"}, + "to_string": 123, + "to_list": 123, + "to_dict": 123, + }, +} + +RECORD_OUT = { + "data": { + "to_string": "123", + "to_list": [123], + "to_dict": {"id": 123}, + }, +} + + +def test_treatment_all(): + """ + Verify treating nested data. + """ + transformation = Treatment( + ignore_complex_lists=True, + ignore_field=["ignore_field"], + prune_invalid_date=["invalid_date_scalar", "invalid_date_nested"], + convert_dict=[{"name": "to_dict", "wrapper_name": "id"}], + convert_list=["to_list"], + convert_string=["to_string"], + ) + assert transformation.apply(RECORD_IN) == RECORD_OUT + + +def test_treatment_noop(): + """ + Treating nested data without rules will yield the same result. + """ + transformation = Treatment() + assert transformation.apply([{"data": {"abc": 123}}]) == [{"data": {"abc": 123}}] + + +def test_treatment_ignore_complex_lists_basic(): + """ + Verify the "ignore_complex_lists" directive works. + """ + transformation = Treatment(ignore_complex_lists=True) + assert transformation.apply([{"data": [{"abc": 123}]}]) == [{}] + + +def test_treatment_ignore_complex_lists_with_specials(): + """ + Verify the "ignore_complex_lists" directive does not remove special encoded fields. + """ + transformation = Treatment(ignore_complex_lists=True) + assert transformation.apply([{"data": [{"abc": 123}], "stamps": [{"$date": 123}]}]) == [ + {"stamps": [{"$date": 123}]} + ] + + +def test_treatment_ignore_fields(): + """ + Verify ignoring fields works. + """ + transformation = Treatment(ignore_field=["abc"]) + assert transformation.apply([{"data": [{"abc": 123}]}]) == [{"data": [{}]}] + + +def test_treatment_convert_string(): + """ + Verify treating nested data to convert values into strings works. + """ + transformation = Treatment(convert_string=["abc"]) + assert transformation.apply([{"data": [{"abc": 123}]}]) == [{"data": [{"abc": "123"}]}] + + +def test_treatment_convert_list(): + """ + Verify treating nested data to convert values into lists works. + """ + transformation = Treatment(convert_list=["abc"]) + assert transformation.apply([{"data": [{"abc": 123}]}]) == [{"data": [{"abc": [123]}]}] + + +def test_treatment_convert_dict(): + """ + Verify treating nested data to convert values into dicts works. + """ + transformation = Treatment(convert_dict=[{"name": "abc", "wrapper_name": "id"}]) + assert transformation.apply([{"data": [{"abc": 123}]}]) == [{"data": [{"abc": {"id": 123}}]}] + + +def test_treatment_prune_invalid_date(): + """ + Verify pruning invalid dates works. + """ + transformation = Treatment(prune_invalid_date=["date"]) + assert transformation.apply([{"data": [{"date": 123}]}]) == [{"data": [{}]}] + assert transformation.apply([{"data": [{"date": {"date": 123}}]}]) == [{"data": [{"date": {}}]}] diff --git a/tests/zyp/transformation-collection-treatment.yaml b/tests/zyp/transformation-collection-treatment.yaml new file mode 100644 index 0000000..ff3f082 --- /dev/null +++ b/tests/zyp/transformation-collection-treatment.yaml @@ -0,0 +1,5 @@ +meta: + version: 1 + type: zyp-collection +treatment: + ignore_complex_lists: true