From d309a488022652f7bae4e600ee26ef42c4107cb0 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Thu, 19 Sep 2024 17:30:33 +0200 Subject: [PATCH] CTK: Invoke MongoDB Table Loader with Zyp Transformation On a specific collection loaded from a MongoDB Extended JSON file, mask (exclude/ignore/omit) certain elements, in order to import all records without further errors. Both elements will be dropped: - .image.available_sizes - .screenshots[].available_sizes The procedure can be improved on a later iteration. --- application/cratedb-toolkit/requirements.txt | 2 +- application/cratedb-toolkit/test_io.py | 5 +- .../zyp-mongodb-json-files.yaml | 48 +++++++++++++++++++ 3 files changed, 52 insertions(+), 3 deletions(-) create mode 100644 application/cratedb-toolkit/zyp-mongodb-json-files.yaml diff --git a/application/cratedb-toolkit/requirements.txt b/application/cratedb-toolkit/requirements.txt index 7d1a9c9a..48e247e8 100644 --- a/application/cratedb-toolkit/requirements.txt +++ b/application/cratedb-toolkit/requirements.txt @@ -1 +1 @@ -cratedb-toolkit[influxdb,mongodb]==0.0.23 +cratedb-toolkit[influxdb,mongodb]==0.0.24 diff --git a/application/cratedb-toolkit/test_io.py b/application/cratedb-toolkit/test_io.py index f43e0444..959e96c2 100644 --- a/application/cratedb-toolkit/test_io.py +++ b/application/cratedb-toolkit/test_io.py @@ -109,7 +109,7 @@ def test_ctk_load_table_mongodb_json(drop_testing_tables): table_cardinalities = { "books": 431, "city_inspections": 81047, - "companies": 2537, + "companies": 18801, "countries-big": 21640, "countries-small": 248, "covers": 5071, @@ -138,7 +138,8 @@ def test_ctk_load_table_mongodb_json(drop_testing_tables): command = f""" ctk load table \ "file+bson://{datasets_path}/*.json?batch-size=2500" \ - --cratedb-sqlalchemy-url="crate://localhost:4200/from-mongodb" + --cratedb-sqlalchemy-url="crate://localhost:4200/from-mongodb" \ + --transformation=application/cratedb-toolkit/zyp-mongodb-json-files.yaml """ print(f"Invoking CTK: 
{command}", file=sys.stderr) subprocess.check_call(shlex.split(command)) diff --git a/application/cratedb-toolkit/zyp-mongodb-json-files.yaml b/application/cratedb-toolkit/zyp-mongodb-json-files.yaml new file mode 100644 index 00000000..31605164 --- /dev/null +++ b/application/cratedb-toolkit/zyp-mongodb-json-files.yaml @@ -0,0 +1,48 @@ +# A Zyp Transformation [1] file to support importing datasets +# from mongodb-json-files [2] into CrateDB [3]. +# +# [1] https://commons-codec.readthedocs.io/zyp/ +# [2] https://github.com/ozlerhakan/mongodb-json-files +# [3] https://cratedb.com/docs/guide/feature/ + +# Because CrateDB cannot store nested arrays in OBJECT(DYNAMIC) columns, +# this file defines a corresponding transformation to work around the problem. +# +# The workaround applied here is to just exclude/omit relevant `available_sizes` +# elements completely. Converting them properly can be implemented in a +# later iteration. +# +# "image": { +# "available_sizes": [ +# [ +# [ +# 150, +# 99 +# ], +# "assets/images/resized/0001/3896/13896v3-max-150x150.jpg" +# ], +# ] +# +# A possible representation could be: +# +# "image": { +# "available_sizes": [ +# { +# "path": "assets/images/resized/0001/3896/13896v3-max-150x150.jpg", +# "size": {"width": 150, "height": 99}, +# } +# ] +# } +--- + +meta: + type: zyp-project + version: 1 +collections: +- address: + container: datasets + name: companies + pre: + rules: + - expression: .[] |= del(.image.available_sizes, .screenshots[].available_sizes) + type: jq