From 75a8dd3386717d513dd3d8203080610ca6bfb967 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Fri, 8 Dec 2023 17:13:34 +0100 Subject: [PATCH] Singer/Meltano: Add example `github-to-cratedb` It uses the `meltano-target-cratedb` Singer component. https://github.com/crate-workbench/meltano-target-cratedb --- .github/workflows/test-singer-meltano.yml | 72 +++++++ .gitignore | 3 +- framework/singer-meltano/.gitignore | 2 + framework/singer-meltano/README.md | 45 +++++ .../github-to-cratedb/README.md | 82 ++++++++ .../github-to-cratedb/meltano.yml | 51 +++++ .../extractors/tap-github--meltanolabs.lock | 176 ++++++++++++++++++ .../loaders/target-jsonl--andyh1203.lock | 34 ++++ .../github-to-cratedb/pyproject.toml | 11 ++ framework/singer-meltano/pyproject.toml | 26 +++ framework/singer-meltano/requirements-dev.txt | 1 + framework/singer-meltano/requirements.txt | 1 + 12 files changed, 503 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/test-singer-meltano.yml create mode 100644 framework/singer-meltano/.gitignore create mode 100644 framework/singer-meltano/README.md create mode 100644 framework/singer-meltano/github-to-cratedb/README.md create mode 100644 framework/singer-meltano/github-to-cratedb/meltano.yml create mode 100644 framework/singer-meltano/github-to-cratedb/plugins/extractors/tap-github--meltanolabs.lock create mode 100644 framework/singer-meltano/github-to-cratedb/plugins/loaders/target-jsonl--andyh1203.lock create mode 100644 framework/singer-meltano/github-to-cratedb/pyproject.toml create mode 100644 framework/singer-meltano/pyproject.toml create mode 100644 framework/singer-meltano/requirements-dev.txt create mode 100644 framework/singer-meltano/requirements.txt diff --git a/.github/workflows/test-singer-meltano.yml b/.github/workflows/test-singer-meltano.yml new file mode 100644 index 00000000..cc549bd9 --- /dev/null +++ b/.github/workflows/test-singer-meltano.yml @@ -0,0 +1,72 @@ +name: Singer/Meltano + +on: + pull_request: + 
branches: ~ + paths: + - '.github/workflows/test-singer-meltano.yml' + - 'framework/singer-meltano/**' + - 'requirements.txt' + push: + branches: [ main ] + paths: + - '.github/workflows/test-singer-meltano.yml' + - 'framework/singer-meltano/**' + - 'requirements.txt' + + # Allow job to be triggered manually. + workflow_dispatch: + + # Run job each night after CrateDB nightly has been published. + schedule: + - cron: '0 3 * * *' + +# Cancel in-progress jobs when pushing to the same branch. +concurrency: + cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.ref }} + +jobs: + test: + name: " + Python: ${{ matrix.python-version }} + CrateDB: ${{ matrix.cratedb-version }} + on ${{ matrix.os }}" + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ 'ubuntu-latest' ] + python-version: [ '3.8', '3.11' ] + cratedb-version: [ 'nightly' ] + + services: + cratedb: + image: crate/crate:nightly + ports: + - 4200:4200 + - 5432:5432 + + steps: + + - name: Acquire sources + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + architecture: x64 + cache: 'pip' + cache-dependency-path: | + requirements.txt + framework/singer-meltano/requirements.txt + framework/singer-meltano/requirements-dev.txt + + - name: Install utilities + run: | + pip install -r requirements.txt + + - name: Validate framework/singer-meltano + run: | + ngr test --accept-no-venv framework/singer-meltano diff --git a/.gitignore b/.gitignore index 44639d77..a117be49 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,9 @@ +.DS_Store .idea +.env .venv* __pycache__ .coverage coverage.xml mlruns/ archive/ -logs.log \ No newline at end of file diff --git a/framework/singer-meltano/.gitignore b/framework/singer-meltano/.gitignore new file mode 100644 index 00000000..d097c004 --- /dev/null +++ b/framework/singer-meltano/.gitignore @@ -0,0 +1,2 @@ +.meltano +output diff --git 
a/framework/singer-meltano/README.md b/framework/singer-meltano/README.md new file mode 100644 index 00000000..730697f4 --- /dev/null +++ b/framework/singer-meltano/README.md @@ -0,0 +1,45 @@ +# Meltano Examples + +Concise examples about working with [CrateDB] and [Meltano], for conceiving and +running flexible ELT tasks. All the recipes are using [meltano-target-cratedb] +for reading and writing data from/to CrateDB. + +## What's inside + +- `singerfile-to-cratedb`: Acquire data from Singer File, and load it into a + CrateDB database table. + +- `github-to-cratedb`: Acquire repository metadata from GitHub API, and load + it separated per entity into 32 CrateDB database tables. + +## Prerequisites + +Before running any of the examples within the subdirectories, make sure to install +Meltano and its dependencies. + +```shell +python3 -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt +``` + +## Usage + +Then, explore the individual Meltano projects, either by invoking them from within +their directories, or by using the `--cwd` option from the root folder. + +```shell +meltano --cwd github-to-cratedb install +meltano --cwd github-to-cratedb run tap-github target-cratedb +``` + +## Software Tests +```shell +pip install -r requirements-dev.txt +poe check +``` + + +[CrateDB]: https://cratedb.com/product +[Meltano]: https://meltano.com/ +[meltano-target-cratedb]: https://github.com/crate-workbench/meltano-target-cratedb diff --git a/framework/singer-meltano/github-to-cratedb/README.md b/framework/singer-meltano/github-to-cratedb/README.md new file mode 100644 index 00000000..445adb77 --- /dev/null +++ b/framework/singer-meltano/github-to-cratedb/README.md @@ -0,0 +1,82 @@ +# Meltano GitHub -> CrateDB example + +## About + +Acquire repository metadata from GitHub API, and insert into CrateDB database +tables, using [meltano-target-cratedb]. + +It follows the canonical example demonstrated at the [Meltano Getting Started Tutorial]. 
+ +## Configuration + +### tap-github + +For accessing the GitHub API, you will need an authentication token. It +can be acquired at [GitHub Developer Settings » Tokens]. + +To configure the recipe, please store it into the `TAP_GITHUB_AUTH_TOKEN` +environment variable, either interactively, or by creating a dotenv +configuration file `.env`. + +```shell +TAP_GITHUB_AUTH_TOKEN='ghp_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX' +``` + +Then, in `meltano.yml`, identify the `tap-github` section in `plugins.extractors`, +and adjust the value of `config.repositories` to correspond to the repository +you intend to scrape. + +### target-cratedb + +Within the `target-cratedb` section in `plugins.loaders`, adjust `config.sqlalchemy_url` to +match your database connectivity settings. + + +## Usage + +Install dependencies. +```shell +meltano install +``` + +Invoke data transfer to JSONL files. +```shell +meltano run tap-github target-jsonl +cat github-to-cratedb/output/commits.jsonl +``` + +Invoke data transfer to CrateDB database. +```shell +meltano run tap-github target-cratedb +``` + +## Screenshot + +Enjoy the release notes. +```sql +SELECT repo, tag_name, body FROM melty.releases ORDER BY tag_name DESC; +``` + +![image](https://github.com/crate-workbench/cratedb-toolkit/assets/453543/ac37c9cc-8e42-4c7c-84aa-64498bf48f4d) + +## Troubleshooting + +If you see errors like the following in the log output, please verify your GitHub authentication +token stored within the `TAP_GITHUB_AUTH_TOKEN` environment variable. 
+```python +singer_sdk.exceptions.RetriableAPIError: 401 Client Error: b'{"message":"This endpoint requires you to be authenticated.","documentation_url":"https://docs.github.com/graphql/guides/forming-calls-with-graphql#authenticating-with-graphql"}' (Reason: Unauthorized) for path: /graphql cmd_type=elb consumer=False name=tap-github producer=True stdio=stderr string_id=tap-github +``` + +## Development +In order to link the sandbox to a development installation of [meltano-target-cratedb], +configure the `pip_url` of the component like this: +```yaml +pip_url: --editable=/path/to/sources/meltano-target-cratedb +``` + + +[GitHub Developer Settings » Tokens]: https://github.com/settings/tokens +[Meltano Getting Started Tutorial]: https://docs.meltano.com/getting-started/part1 +[meltano-target-cratedb]: https://github.com/crate-workbench/meltano-target-cratedb +[tap-github]: https://hub.meltano.com/extractors/tap-github/ +[target-jsonl]: https://hub.meltano.com/loaders/target-jsonl/ diff --git a/framework/singer-meltano/github-to-cratedb/meltano.yml b/framework/singer-meltano/github-to-cratedb/meltano.yml new file mode 100644 index 00000000..65d84b68 --- /dev/null +++ b/framework/singer-meltano/github-to-cratedb/meltano.yml @@ -0,0 +1,51 @@ +# A Meltano project is just a directory on your filesystem containing text-based files. +# At a minimum, a Meltano project must contain a project file named `meltano.yml`, +# which contains your project configuration, and tells Meltano that a particular +# directory is a Meltano project. +--- +version: 1 +default_environment: dev +send_anonymous_usage_stats: false +project_id: f14797b9-9d1c-414c-851c-c91e08ddbc2e + +environments: +- name: dev +- name: staging +- name: prod + +plugins: + + # Configure data source. + # In Singer jargon, it is an "extractor", wrapped into a "tap". 
+ extractors: + + - name: tap-github + variant: cratedb + namespace: cratedb + pip_url: git+https://github.com/crate-workbench/tap-github.git@cratedb + # Note: Configure your GitHub repository here. + config: + start_date: '2023-12-01' + repositories: + - crate-workbench/cratedb-toolkit + + # Configure data sinks. + # In Singer jargon, it is a "loader", wrapped into a "target". + loaders: + + - name: target-jsonl + variant: andyh1203 + pip_url: target-jsonl + + - name: target-cratedb + namespace: cratedb + variant: cratedb + # Acquire from PyPI. + pip_url: meltano-target-cratedb + # Acquire from GitHub. + # pip_url: git+https://github.com/crate-workbench/meltano-target-cratedb.git + + # Note: Configure your database server and credentials here. + config: + sqlalchemy_url: crate://crate@localhost/ + add_record_metadata: true diff --git a/framework/singer-meltano/github-to-cratedb/plugins/extractors/tap-github--meltanolabs.lock b/framework/singer-meltano/github-to-cratedb/plugins/extractors/tap-github--meltanolabs.lock new file mode 100644 index 00000000..34ecde46 --- /dev/null +++ b/framework/singer-meltano/github-to-cratedb/plugins/extractors/tap-github--meltanolabs.lock @@ -0,0 +1,176 @@ +{ + "plugin_type": "extractors", + "name": "tap-github", + "namespace": "tap_github", + "variant": "meltanolabs", + "label": "GitHub", + "docs": "https://hub.meltano.com/extractors/tap-github--meltanolabs", + "repo": "https://github.com/MeltanoLabs/tap-github", + "pip_url": "git+https://github.com/MeltanoLabs/tap-github.git", + "description": "Code hosting platform", + "logo_url": "https://hub.meltano.com/assets/logos/extractors/github.png", + "capabilities": [ + "about", + "batch", + "catalog", + "discover", + "schema-flattening", + "state", + "stream-maps" + ], + "settings_group_validation": [ + [ + "repositories" + ], + [ + "organizations" + ], + [ + "searches" + ], + [ + "user_usernames" + ], + [ + "user_ids" + ] + ], + "settings": [ + { + "name": "additional_auth_tokens", + 
"kind": "array", + "label": "Additional Auth Tokens", + "description": "List of GitHub tokens to authenticate with. Streams will loop through them when hitting rate limits." + }, + { + "name": "auth_token", + "kind": "password", + "label": "Auth Token", + "description": "GitHub token to authenticate with." + }, + { + "name": "batch_config.encoding.compression", + "kind": "options", + "label": "Batch Config Encoding Compression", + "description": "Compression format to use for batch files.", + "options": [ + { + "label": "Gzip", + "value": "gzip" + }, + { + "label": "None", + "value": "none" + } + ] + }, + { + "name": "batch_config.encoding.format", + "kind": "options", + "label": "Batch Config Encoding Format", + "description": "Format to use for batch files.", + "options": [ + { + "label": "Jsonl", + "value": "jsonl" + } + ] + }, + { + "name": "batch_config.storage.prefix", + "kind": "string", + "label": "Batch Config Storage Prefix", + "description": "Prefix to use when writing batch files." + }, + { + "name": "batch_config.storage.root", + "kind": "string", + "label": "Batch Config Storage Root", + "description": "Root path to use when writing batch files." + }, + { + "name": "flattening_enabled", + "kind": "boolean", + "label": "Flattening Enabled", + "description": "'True' to enable schema flattening and automatically expand nested properties." + }, + { + "name": "flattening_max_depth", + "kind": "integer", + "label": "Flattening Max Depth", + "description": "The max depth to flatten schemas." + }, + { + "name": "metrics_log_level", + "kind": "string", + "label": "Metrics Log Level", + "description": "The log level of the API response metrics." 
+ }, + { + "name": "organizations", + "kind": "array", + "label": "Organizations", + "description": "An array of strings containing the github organizations to be included" + }, + { + "name": "rate_limit_buffer", + "kind": "integer", + "label": "Rate Limit Buffer", + "description": "Add a buffer to avoid consuming all query points for the token at hand. Defaults to 1000." + }, + { + "name": "repositories", + "kind": "array", + "label": "Repositories", + "description": "An array of strings containing the github repos to be included" + }, + { + "name": "searches", + "kind": "array", + "label": "Searches", + "description": "An array of search descriptor objects with the following properties. \"name\" - a human readable name for the search query. \"query\" - a github search string (generally the same as would come after ?q= in the URL)" + }, + { + "name": "skip_parent_streams", + "kind": "boolean", + "label": "Skip Parent Streams", + "description": "Set to true to skip API calls for the parent streams (such as repositories) if it is not selected but children are" + }, + { + "name": "start_date", + "kind": "date_iso8601", + "label": "Start Date" + }, + { + "name": "stream_map_config", + "kind": "object", + "label": "Stream Map Config" + }, + { + "name": "stream_maps", + "kind": "object", + "label": "Stream Maps" + }, + { + "name": "user_agent", + "kind": "string", + "label": "User Agent" + }, + { + "name": "user_ids", + "kind": "array", + "label": "User IDs", + "description": "A list of GitHub user ids." + }, + { + "name": "user_usernames", + "kind": "array", + "label": "User Usernames", + "description": "A list of GithHub usernames." 
+ } + ], + "select": [ + "*.*", + "!traffic_*.*" + ] +} \ No newline at end of file diff --git a/framework/singer-meltano/github-to-cratedb/plugins/loaders/target-jsonl--andyh1203.lock b/framework/singer-meltano/github-to-cratedb/plugins/loaders/target-jsonl--andyh1203.lock new file mode 100644 index 00000000..5825fc4a --- /dev/null +++ b/framework/singer-meltano/github-to-cratedb/plugins/loaders/target-jsonl--andyh1203.lock @@ -0,0 +1,34 @@ +{ + "plugin_type": "loaders", + "name": "target-jsonl", + "namespace": "target_jsonl", + "variant": "andyh1203", + "label": "JSON Lines (JSONL)", + "docs": "https://hub.meltano.com/loaders/target-jsonl--andyh1203", + "repo": "https://github.com/andyh1203/target-jsonl", + "pip_url": "target-jsonl", + "description": "JSONL loader", + "logo_url": "https://hub.meltano.com/assets/logos/loaders/jsonl.png", + "settings": [ + { + "name": "destination_path", + "kind": "string", + "value": "output", + "label": "Destination Path", + "description": "Sets the destination path the JSONL files are written to, relative\nto the project root.\n\nThe directory needs to exist already, it will not be created\nautomatically.\n\nTo write JSONL files to the project root, set an empty string (`\"\"`).\n" + }, + { + "name": "do_timestamp_file", + "kind": "boolean", + "value": false, + "label": "Include Timestamp in File Names", + "description": "Specifies if the files should get timestamped.\n\nBy default, the resulting file will not have a timestamp in the file name (i.e. `exchange_rate.jsonl`).\n\nIf this option gets set to `true`, the resulting file will have a timestamp associated with it (i.e. 
`exchange_rate-{timestamp}.jsonl`).\n" + }, + { + "name": "custom_name", + "kind": "string", + "label": "Custom File Name Override", + "description": "Specifies a custom name for the filename, instead of the stream name.\n\nThe file name will be `{custom_name}-{timestamp}.jsonl`, if `do_timestamp_file` is `true`.\nOtherwise the file name will be `{custom_name}.jsonl`.\n\nIf custom name is not provided, the stream name will be used.\n" + } + ] +} \ No newline at end of file diff --git a/framework/singer-meltano/github-to-cratedb/pyproject.toml b/framework/singer-meltano/github-to-cratedb/pyproject.toml new file mode 100644 index 00000000..290c9797 --- /dev/null +++ b/framework/singer-meltano/github-to-cratedb/pyproject.toml @@ -0,0 +1,11 @@ +[tool.poe.tasks] + +test = [ + + # Install recipe. + { cmd = "meltano install" }, + + # We can't do anything else here, because the GitHub + # data source needs authentication via access token. + +] diff --git a/framework/singer-meltano/pyproject.toml b/framework/singer-meltano/pyproject.toml new file mode 100644 index 00000000..70dcc2a5 --- /dev/null +++ b/framework/singer-meltano/pyproject.toml @@ -0,0 +1,26 @@ +[tool.poe.tasks] + +check = [ + "lint", + "test", +] + +format = [ + { cmd = "black ." }, + # Configure Ruff not to auto-fix (remove!): + # unused imports (F401), unused variables (F841), `print` statements (T201), and commented-out code (ERA001). + { cmd = "ruff --fix --ignore=ERA --ignore=F401 --ignore=F841 --ignore=T20 --ignore=ERA001 ." }, + { cmd = "pyproject-fmt pyproject.toml" }, +] + +lint = [ + { cmd = "ruff ." }, + { cmd = "black --check ." 
}, + { cmd = "validate-pyproject pyproject.toml" }, + #{ cmd = "mypy" }, +] + +test = [ + { cmd = "poe --root github-to-cratedb test" }, + { cmd = "poe --root singerfile-to-cratedb test" }, +] diff --git a/framework/singer-meltano/requirements-dev.txt b/framework/singer-meltano/requirements-dev.txt new file mode 100644 index 00000000..adbc6a86 --- /dev/null +++ b/framework/singer-meltano/requirements-dev.txt @@ -0,0 +1 @@ +pueblo[develop] diff --git a/framework/singer-meltano/requirements.txt b/framework/singer-meltano/requirements.txt new file mode 100644 index 00000000..a11523ca --- /dev/null +++ b/framework/singer-meltano/requirements.txt @@ -0,0 +1 @@ +meltano==3.2.0