From dd678afe16703ee44060ec5dd119188ebd70e6fa Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Sat, 9 Dec 2023 00:02:25 +0100 Subject: [PATCH] Singer/Meltano: Add example `singerfile-to-cratedb` --- .../singerfile-to-cratedb/.gitignore | 2 + .../singerfile-to-cratedb/README.md | 63 ++++++++++++++ .../singerfile-to-cratedb/meltano.yml | 52 ++++++++++++ .../extractors/tap-singer-jsonl--kgpayne.lock | 85 +++++++++++++++++++ .../loaders/target-jsonl--andyh1203.lock | 34 ++++++++ .../singerfile-to-cratedb/pyproject.toml | 20 +++++ 6 files changed, 256 insertions(+) create mode 100644 framework/singer-meltano/singerfile-to-cratedb/.gitignore create mode 100644 framework/singer-meltano/singerfile-to-cratedb/README.md create mode 100644 framework/singer-meltano/singerfile-to-cratedb/meltano.yml create mode 100644 framework/singer-meltano/singerfile-to-cratedb/plugins/extractors/tap-singer-jsonl--kgpayne.lock create mode 100644 framework/singer-meltano/singerfile-to-cratedb/plugins/loaders/target-jsonl--andyh1203.lock create mode 100644 framework/singer-meltano/singerfile-to-cratedb/pyproject.toml diff --git a/framework/singer-meltano/singerfile-to-cratedb/.gitignore b/framework/singer-meltano/singerfile-to-cratedb/.gitignore new file mode 100644 index 00000000..e6f0c986 --- /dev/null +++ b/framework/singer-meltano/singerfile-to-cratedb/.gitignore @@ -0,0 +1,2 @@ +catalog.json +tap_countries.singer diff --git a/framework/singer-meltano/singerfile-to-cratedb/README.md b/framework/singer-meltano/singerfile-to-cratedb/README.md new file mode 100644 index 00000000..67d5b9cc --- /dev/null +++ b/framework/singer-meltano/singerfile-to-cratedb/README.md @@ -0,0 +1,63 @@ +# Meltano Singer File -> CrateDB example + +## About + +Import data from a file in Singer format (JSONL) into CrateDB, using +[tap-singer-jsonl] and [meltano-target-cratedb]. 
+ +## Configuration + +### tap-singer-jsonl + +Within the `extractors` section, have a look at `tap-singer-jsonl`'s +`config.local.paths` section to see how to configure JSONL files in Singer +format as pipeline source(s). + +### target-cratedb + +Within the `loaders` section, at `target-cratedb`, adjust +`config.sqlalchemy_url` to match your database connectivity settings +as pipeline target. + +## Usage + +Install dependencies. +```shell +meltano install +``` + +Discover data schema. +```shell +meltano invoke tap-singer-jsonl --discover > catalog.json +``` + +Run plugin standalone, testdrive. +```shell +meltano invoke tap-singer-jsonl --catalog catalog.json +``` + +Invoke data transfer to CrateDB database. +```shell +meltano run tap-singer-jsonl target-cratedb +``` + +## Screenshot + +Enjoy the list of countries. +```shell +crash --command 'SELECT "code", "name", "capital", "emoji", "languages[1]" FROM "melty"."countries" ORDER BY "name" LIMIT 42;' +``` + +![image](https://github.com/crate-workbench/meltano-target-cratedb/assets/453543/fa7076cc-267e-446c-a4f3-aa1283778ace) + + +## Development +In order to link the sandbox to a development installation of [meltano-target-cratedb], +configure the `pip_url` of the component like this: +```yaml +pip_url: --editable=/path/to/sources/meltano-target-cratedb +``` + + +[meltano-target-cratedb]: https://github.com/crate-workbench/meltano-target-cratedb +[tap-singer-jsonl]: https://github.com/kgpayne/tap-singer-jsonl diff --git a/framework/singer-meltano/singerfile-to-cratedb/meltano.yml b/framework/singer-meltano/singerfile-to-cratedb/meltano.yml new file mode 100644 index 00000000..7189655e --- /dev/null +++ b/framework/singer-meltano/singerfile-to-cratedb/meltano.yml @@ -0,0 +1,52 @@ +# A Meltano project is just a directory on your filesystem containing text-based files. 
+# At a minimum, a Meltano project must contain a project file named `meltano.yml`, +# which contains your project configuration, and tells Meltano that a particular +# directory is a Meltano project. +--- +version: 1 +default_environment: dev +send_anonymous_usage_stats: false +project_id: f14797b9-9d1c-414c-851c-c91e08ddbc2e + +environments: +- name: dev +- name: staging +- name: prod + +plugins: + + # Configure data source. + # In Meltano jargon, it is an "extractor", wrapping a Singer "tap". + extractors: + + - name: tap-singer-jsonl + variant: kgpayne + pip_url: git+https://github.com/crate-workbench/tap-singer-jsonl@fix-paths + config: + source: local + add_record_metadata: false + local: + # Note: Configure Singer file(s) here. + paths: + - "tap_countries.singer" + + # Configure data sinks. + # In Meltano jargon, it is a "loader", wrapping a Singer "target". + loaders: + + - name: target-jsonl + variant: andyh1203 + pip_url: target-jsonl + + - name: target-cratedb + namespace: cratedb + variant: cratedb + # Acquire from PyPI. + pip_url: meltano-target-cratedb + # Acquire from GitHub. + # pip_url: git+https://github.com/crate-workbench/meltano-target-cratedb.git + + # Note: Configure your database server and credentials here. 
+ config: + sqlalchemy_url: crate://crate@localhost/ + add_record_metadata: true diff --git a/framework/singer-meltano/singerfile-to-cratedb/plugins/extractors/tap-singer-jsonl--kgpayne.lock b/framework/singer-meltano/singerfile-to-cratedb/plugins/extractors/tap-singer-jsonl--kgpayne.lock new file mode 100644 index 00000000..4a4fda6d --- /dev/null +++ b/framework/singer-meltano/singerfile-to-cratedb/plugins/extractors/tap-singer-jsonl--kgpayne.lock @@ -0,0 +1,85 @@ +{ + "plugin_type": "extractors", + "name": "tap-singer-jsonl", + "namespace": "tap_singer_jsonl", + "variant": "kgpayne", + "label": "Singer JSONL", + "docs": "https://hub.meltano.com/extractors/tap-singer-jsonl--kgpayne", + "repo": "https://github.com/kgpayne/tap-singer-jsonl", + "pip_url": "tap-singer-jsonl", + "executable": "tap-singer-jsonl", + "description": "Read Singer-formatted JSONL Files", + "logo_url": "https://hub.meltano.com/assets/logos/extractors/singer.png", + "capabilities": [ + "discover" + ], + "settings_group_validation": [ + [ + "local.folders" + ], + [ + "local.paths" + ], + [ + "source", + "s3.bucket" + ], + [ + "source", + "s3.paths" + ] + ], + "settings": [ + { + "name": "source", + "kind": "string", + "value": "local", + "label": "Source", + "description": "The source configuration to use when reading `.singer.gz` files. Currently `local` and `s3` are supported." + }, + { + "name": "add_record_metadata", + "kind": "boolean", + "value": true, + "label": "Add Record Metadata", + "description": "Whether to inject `_sdc_*` metadata columns." + }, + { + "name": "local.folders", + "kind": "array", + "label": "Folders", + "description": "Array of directory paths to scan for `.singer.gz` files." + }, + { + "name": "local.recursive", + "kind": "boolean", + "value": false, + "label": "Recursive", + "description": "Whether to scan directories recursively when discovering `.singer.gz` files." 
+ }, + { + "name": "local.paths", + "kind": "array", + "label": "Paths", + "description": "Array of file paths to singer-formatted files. **Note:** extension is ignored, and compression is inferred automatically by `smart_open`. Both `local.folders` and `local.paths` can be specified together." + }, + { + "name": "s3.bucket", + "kind": "string", + "label": "Bucket", + "description": "S3 bucket name." + }, + { + "name": "s3.prefix", + "kind": "string", + "label": "Prefix", + "description": "S3 key prefix. **Note:** key prefixes will be scanned recursively." + }, + { + "name": "s3.paths", + "kind": "array", + "label": "Paths", + "description": "S3 file paths to singer-formatted files. **Note:** extension is ignored, and compression is inferred automatically by `smart_open`. Both `s3.prefix` and `s3.paths` can be specified together." + } + ] +} \ No newline at end of file diff --git a/framework/singer-meltano/singerfile-to-cratedb/plugins/loaders/target-jsonl--andyh1203.lock b/framework/singer-meltano/singerfile-to-cratedb/plugins/loaders/target-jsonl--andyh1203.lock new file mode 100644 index 00000000..5825fc4a --- /dev/null +++ b/framework/singer-meltano/singerfile-to-cratedb/plugins/loaders/target-jsonl--andyh1203.lock @@ -0,0 +1,34 @@ +{ + "plugin_type": "loaders", + "name": "target-jsonl", + "namespace": "target_jsonl", + "variant": "andyh1203", + "label": "JSON Lines (JSONL)", + "docs": "https://hub.meltano.com/loaders/target-jsonl--andyh1203", + "repo": "https://github.com/andyh1203/target-jsonl", + "pip_url": "target-jsonl", + "description": "JSONL loader", + "logo_url": "https://hub.meltano.com/assets/logos/loaders/jsonl.png", + "settings": [ + { + "name": "destination_path", + "kind": "string", + "value": "output", + "label": "Destination Path", + "description": "Sets the destination path the JSONL files are written to, relative\nto the project root.\n\nThe directory needs to exist already, it will not be created\nautomatically.\n\nTo write JSONL files to 
the project root, set an empty string (`\"\"`).\n" + }, + { + "name": "do_timestamp_file", + "kind": "boolean", + "value": false, + "label": "Include Timestamp in File Names", + "description": "Specifies if the files should get timestamped.\n\nBy default, the resulting file will not have a timestamp in the file name (i.e. `exchange_rate.jsonl`).\n\nIf this option gets set to `true`, the resulting file will have a timestamp associated with it (i.e. `exchange_rate-{timestamp}.jsonl`).\n" + }, + { + "name": "custom_name", + "kind": "string", + "label": "Custom File Name Override", + "description": "Specifies a custom name for the filename, instead of the stream name.\n\nThe file name will be `{custom_name}-{timestamp}.jsonl`, if `do_timestamp_file` is `true`.\nOtherwise the file name will be `{custom_name}.jsonl`.\n\nIf custom name is not provided, the stream name will be used.\n" + } + ] +} \ No newline at end of file diff --git a/framework/singer-meltano/singerfile-to-cratedb/pyproject.toml b/framework/singer-meltano/singerfile-to-cratedb/pyproject.toml new file mode 100644 index 00000000..13f95661 --- /dev/null +++ b/framework/singer-meltano/singerfile-to-cratedb/pyproject.toml @@ -0,0 +1,20 @@ +[tool.poe.tasks] + +test = [ + + # Acquire Singer file in JSONL format. + { cmd = "wget --no-clobber https://github.com/MeltanoLabs/target-postgres/raw/v0.0.9/target_postgres/tests/data_files/tap_countries.singer" }, + + # Install recipe. + { cmd = "meltano install" }, + + # Discover data schema. + { shell = "meltano invoke tap-singer-jsonl --discover > catalog.json" }, + + # Run plugin standalone, testdrive. + { cmd = "meltano invoke tap-singer-jsonl --catalog catalog.json" }, + + # Invoke pipeline, loading data into database, for real. + { cmd = "meltano run tap-singer-jsonl target-cratedb" }, + +]