-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Zyp Treatments, a more tailored transformation subsystem
- Loading branch information
Showing
6 changed files
with
199 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
import typing as t | ||
|
||
from attr import Factory | ||
from attrs import define | ||
|
||
from zyp.model.base import Dumpable | ||
from zyp.model.bucket import DictOrList, Record | ||
|
||
|
||
@define
class Treatment(Dumpable):
    """
    A set of clean-up rules to apply to records.

    Each attribute configures one step of `apply_record`. `apply` runs those
    steps on every dictionary it finds within a nested structure.
    """

    # When enabled, remove items whose value is a non-empty list starting
    # with a dictionary, unless that dictionary's first key starts with `$`.
    ignore_complex_lists: bool = False
    # Names of items to remove.
    ignore_columns: t.List[str] = Factory(list)
    # Names of items to wrap into a one-element list, unless already a list.
    convert_list: t.List[str] = Factory(list)
    # Names of items to convert using `str()`, unless already a string.
    convert_string: t.List[str] = Factory(list)
    # Rules with the keys `name` and `wrapper_name`: the item called `name`
    # becomes `{wrapper_name: value}`, unless it is already a dictionary.
    convert_dict: t.List[t.Dict[str, str]] = Factory(list)
    # Names of items to remove when their value is not a dictionary, or is a
    # dictionary whose `date` entry is a string.
    prune_invalid_date: t.List[str] = Factory(list)

    def apply(self, data: DictOrList) -> DictOrList:
        """
        Apply the treatment recursively to nested data.

        A dictionary is first treated by `apply_record`, then rebuilt with
        every value treated in turn. A list is rebuilt with every element
        treated. Any other value is returned unchanged.

        NOTE: Dictionaries inside `data` are modified in place by
        `apply_record`, although the returned containers are new objects.
        """
        if isinstance(data, dict):
            self.apply_record(data)
            return {k: self.apply(v) for (k, v) in data.items()}
        if isinstance(data, list):
            return t.cast(list, [self.apply(v) for v in data])
        return data

    def apply_record(self, data: Record) -> Record:
        """
        Apply all configured rules to a single record, in place.

        Only the items of `data` itself are considered; nested containers
        are not descended into. Returns the same, modified, `data` object.

        Raises `KeyError` when a `convert_dict` rule lacks the `name` or
        `wrapper_name` key.
        """
        # Optionally ignore lists of complex objects.
        local_ignores = []
        if self.ignore_complex_lists:
            for k, v in data.items():
                if isinstance(v, list) and v and isinstance(v[0], dict):
                    # Skip ignoring special-encoded items, like `{"$date": ...}`.
                    # Only string keys can carry the `$` marker; checking the
                    # type avoids an `AttributeError` on other key types.
                    first_key = next(iter(v[0]), None)
                    if isinstance(first_key, str) and first_key.startswith("$"):
                        continue
                    local_ignores.append(k)

        # Apply global and computed ignores.
        for ignore_name in self.ignore_columns + local_ignores:
            data.pop(ignore_name, None)

        # Converge certain items to `list` even when defined differently.
        for to_list_name in self.convert_list:
            if to_list_name in data and not isinstance(data[to_list_name], list):
                data[to_list_name] = [data[to_list_name]]

        # Converge certain items to `str` even when defined differently.
        for name in self.convert_string:
            if name in data and not isinstance(data[name], str):
                data[name] = str(data[name])

        # Converge certain items to `dict` even when defined differently.
        for rule in self.convert_dict:
            name = rule["name"]
            wrapper_name = rule["wrapper_name"]
            if name in data and not isinstance(data[name], dict):
                data[name] = {wrapper_name: data[name]}

        # Prune invalid date representations: anything which is not a
        # dictionary, and dictionaries carrying a string at `date`.
        for key in self.prune_invalid_date:
            if key not in data:
                continue
            value = data[key]
            if not isinstance(value, dict) or isinstance(value.get("date"), str):
                del data[key]

        return data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
from zyp.model.treatment import Treatment | ||
|
||
# Input fixture for `test_treatment_all`: a nested record carrying one item
# per rule configured in that test.
RECORD_IN = {
    "data": {
        # A list of dictionaries, targeted by `ignore_complex_lists=True`.
        "ignore_complex_list": [{}],
        # Listed in `ignore_columns`.
        "ignore_column": 123,
        # Both listed in `prune_invalid_date`.
        "invalid_date_scalar": 123,
        "invalid_date_nested": {"date": "123"},
        # Listed in `convert_string`, `convert_list`, and `convert_dict`.
        "to_string": 123,
        "to_list": 123,
        "to_dict": 123,
    },
}
|
||
# Expected outcome of `test_treatment_all`: only the converted items remain,
# all others have been removed.
RECORD_OUT = {
    "data": {
        "to_string": "123",
        "to_list": [123],
        "to_dict": {"id": 123},
    },
}
|
||
|
||
def test_treatment_all():
    """
    Exercise all treatment rules at once on a nested record.
    """
    treatment = Treatment(
        convert_string=["to_string"],
        convert_list=["to_list"],
        convert_dict=[{"name": "to_dict", "wrapper_name": "id"}],
        prune_invalid_date=["invalid_date_scalar", "invalid_date_nested"],
        ignore_columns=["ignore_column"],
        ignore_complex_lists=True,
    )
    outcome = treatment.apply(RECORD_IN)
    assert outcome == RECORD_OUT
|
||
|
||
def test_treatment_noop():
    """
    A treatment without any rules returns data equal to its input.
    """
    treatment = Treatment()
    outcome = treatment.apply([{"data": {"abc": 123}}])
    assert outcome == [{"data": {"abc": 123}}]
|
||
|
||
def test_treatment_ignore_complex_lists_basic():
    """
    With `ignore_complex_lists`, an item holding a list of dictionaries is removed.
    """
    treatment = Treatment(ignore_complex_lists=True)
    outcome = treatment.apply([{"data": [{"abc": 123}]}])
    assert outcome == [{}]
|
||
|
||
def test_treatment_ignore_complex_lists_with_specials():
    """
    With `ignore_complex_lists`, lists of `$`-keyed dictionaries are retained.
    """
    treatment = Treatment(ignore_complex_lists=True)
    record = {"data": [{"abc": 123}], "stamps": [{"$date": 123}]}
    outcome = treatment.apply([record])
    assert outcome == [
        {"stamps": [{"$date": 123}]}
    ]
|
||
|
||
def test_treatment_ignore_columns():
    """
    Items listed in `ignore_columns` are removed, also from nested records.
    """
    treatment = Treatment(ignore_columns=["abc"])
    outcome = treatment.apply([{"data": [{"abc": 123}]}])
    assert outcome == [{"data": [{}]}]
|
||
|
||
def test_treatment_convert_string():
    """
    Items listed in `convert_string` are converted to strings.
    """
    treatment = Treatment(convert_string=["abc"])
    outcome = treatment.apply([{"data": [{"abc": 123}]}])
    assert outcome == [{"data": [{"abc": "123"}]}]
|
||
|
||
def test_treatment_convert_list():
    """
    Items listed in `convert_list` are wrapped into a one-element list.
    """
    treatment = Treatment(convert_list=["abc"])
    outcome = treatment.apply([{"data": [{"abc": 123}]}])
    assert outcome == [{"data": [{"abc": [123]}]}]
|
||
|
||
def test_treatment_convert_dict():
    """
    Items named by a `convert_dict` rule are wrapped into a dictionary.
    """
    wrap_rule = {"name": "abc", "wrapper_name": "id"}
    treatment = Treatment(convert_dict=[wrap_rule])
    outcome = treatment.apply([{"data": [{"abc": 123}]}])
    assert outcome == [{"data": [{"abc": {"id": 123}}]}]
|
||
|
||
def test_treatment_prune_invalid_date():
    """
    Items listed in `prune_invalid_date` are removed when not shaped like a date.
    """
    treatment = Treatment(prune_invalid_date=["date"])

    # A scalar value is removed right away.
    scalar_outcome = treatment.apply([{"data": [{"date": 123}]}])
    assert scalar_outcome == [{"data": [{}]}]

    # A dictionary is retained, but its inner scalar `date` item is removed.
    nested_outcome = treatment.apply([{"data": [{"date": {"date": 123}}]}])
    assert nested_outcome == [{"data": [{"date": {}}]}]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# Metadata about this document: format version and type.
# NOTE(review): nesting was reconstructed from a flattened rendering -- confirm.
meta:
  version: 1
  type: zyp-collection
# Treatment settings; `ignore_complex_lists` matches the `Treatment` attribute
# of the same name. Presumably loaded into a `Treatment` instance -- verify.
treatment:
  ignore_complex_lists: true