From d00163b4c9509762f4c6d593edc4fbe7cbd2e3fc Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 17 Jul 2024 13:04:40 -0500 Subject: [PATCH 01/97] Deploy sm2a in dev --- .github/actions/terraform-deploy/action.yml | 1 - .../actions/terraform-deploy/sm2a_action.yml | 67 ++++ .github/workflows/cicd.yml | 30 +- sm2a/README.md | 163 ++++++++++ sm2a/airflow_services/Dockerfile | 29 ++ sm2a/airflow_services/requirements.txt | 24 ++ sm2a/airflow_services/webserver_config.py | 209 +++++++++++++ sm2a/airflow_worker/Dockerfile | 51 +++ sm2a/airflow_worker/requirements.txt | 24 ++ sm2a/deploy_requirements.txt | 0 sm2a/docker-compose.yml | 180 +++++++++++ sm2a/docs/howto/add_a_general_dag.md | 39 +++ sm2a/infrastructure/.terraform.lock.hcl | 82 +++++ sm2a/infrastructure/configuration/__init__.py | 0 sm2a/infrastructure/configuration/airflow.cfg | 63 ++++ .../configuration/airflow.cfg.tmpl | 63 ++++ .../configuration/celery_config.py | 33 ++ .../configuration/logging_config.py | 21 ++ sm2a/infrastructure/main.tf | 86 +++++ sm2a/infrastructure/outputs.tf | 6 + sm2a/infrastructure/terraform.tf.tmpl | 8 + sm2a/infrastructure/terraform.tfvars.tmpl | 15 + sm2a/infrastructure/variables.tf | 207 ++++++++++++ sm2a/plugins/.gitkeep | 0 sm2a/scripts/deploy.sh | 109 +++++++ sm2a/scripts/generate_env_file.py | 38 +++ ..._airflow_worker_autoscaling_metric_data.py | 296 ++++++++++++++++++ sm2a/scripts/run_task.py | 184 +++++++++++ sm2a/sm2a-local-config/env_example | 4 + sm2a/sm2a-local-config/local_airflow.cfg | 78 +++++ .../local_webserver_config.py | 132 ++++++++ 31 files changed, 2240 insertions(+), 2 deletions(-) create mode 100644 .github/actions/terraform-deploy/sm2a_action.yml create mode 100644 sm2a/README.md create mode 100644 sm2a/airflow_services/Dockerfile create mode 100644 sm2a/airflow_services/requirements.txt create mode 100644 sm2a/airflow_services/webserver_config.py create mode 100644 sm2a/airflow_worker/Dockerfile create mode 100644 sm2a/airflow_worker/requirements.txt create mode 100644 sm2a/deploy_requirements.txt create mode 100644 sm2a/docker-compose.yml create mode 100644 sm2a/docs/howto/add_a_general_dag.md create mode 100644 sm2a/infrastructure/.terraform.lock.hcl create mode 100644 sm2a/infrastructure/configuration/__init__.py create mode 100755 sm2a/infrastructure/configuration/airflow.cfg create mode 100644 sm2a/infrastructure/configuration/airflow.cfg.tmpl create mode 100644 sm2a/infrastructure/configuration/celery_config.py create mode 100644 sm2a/infrastructure/configuration/logging_config.py create mode 100644 sm2a/infrastructure/main.tf create mode 100644 sm2a/infrastructure/outputs.tf create mode 100644 sm2a/infrastructure/terraform.tf.tmpl create mode 100644 sm2a/infrastructure/terraform.tfvars.tmpl create mode 100644 sm2a/infrastructure/variables.tf create mode 100644 sm2a/plugins/.gitkeep create mode 100755 sm2a/scripts/deploy.sh create mode 100644 sm2a/scripts/generate_env_file.py create mode 100644 sm2a/scripts/put_airflow_worker_autoscaling_metric_data.py create mode 100644 sm2a/scripts/run_task.py create mode 100644 sm2a/sm2a-local-config/env_example create mode 100644 sm2a/sm2a-local-config/local_airflow.cfg create mode 100644 sm2a/sm2a-local-config/local_webserver_config.py diff --git a/.github/actions/terraform-deploy/action.yml b/.github/actions/terraform-deploy/action.yml index 7540124b..ab963a1c 100644 --- a/.github/actions/terraform-deploy/action.yml +++ b/.github/actions/terraform-deploy/action.yml @@ -61,4 +61,3 @@ runs: working-directory: ${{ inputs.dir 
}} run: | ./scripts/deploy.sh ${{ inputs.env-file }} <<< init - ./scripts/deploy.sh ${{ inputs.env-file }} <<< deploy diff --git a/.github/actions/terraform-deploy/sm2a_action.yml b/.github/actions/terraform-deploy/sm2a_action.yml new file mode 100644 index 00000000..d0947ace --- /dev/null +++ b/.github/actions/terraform-deploy/sm2a_action.yml @@ -0,0 +1,67 @@ +name: Deploy SM2A + +inputs: + environment: + type: string + required: true + aws-region: + type: string + required: false + default: us-east-1 + env-file: + type: string + required: true + role-session-name: + required: false + type: string + default: github-actions-deployment + current_dir: + required: false + type: string + default: "." + sm2a_dir: + required: false + type: string + default: "./sm2a" + script_path: + type: string + + +runs: + using: "composite" + + steps: + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: "pip" + + - name: Install python dependencies + shell: bash + working-directory: ${{ inputs.dir }} + run: pip install -r deploy_requirements.txt + + - name: Get relevant environment configuration from aws secrets + shell: bash + working-directory: ${{ inputs.sm2a_dir }} + env: + SECRET_SSM_NAME: ${{ inputs.env_aws_secret_name }} + AWS_DEFAULT_REGION: us-west-2 + run: | + python scripts/generate_env_file.py --secret-id ${{ vars.DEPLOYMENT_ENV_SECRET_NAME }} --env-file ${{ inputs.env-file }} + + + + - name: Setup Terraform + uses: hashicorp/setup-terraform@v3 + with: + terraform_version: 1.6.6 + + - name: Deploy + shell: bash + working-directory: ${{ inputs.sm2a_dir }} + run: | + cp -r ../dags . + ./scripts/deploy.sh ${{ inputs.env-file }} <<< init + ./scripts/deploy.sh ${{ inputs.env-file }} <<< plan diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index 88696254..deb96353 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -38,6 +38,8 @@ jobs: run: | if [ "${{ github.ref }}" = "refs/heads/main" ]; then echo "env_name=staging" >> $GITHUB_OUTPUT + elif [ "${{ github.ref }}" = "refs/heads/deploy-sm2a" ]; then + echo "env_name=development" >> $GITHUB_OUTPUT elif [ "${{ github.ref }}" = "refs/heads/dev" ]; then echo "env_name=development" >> $GITHUB_OUTPUT elif [ "${{ github.ref }}" = "refs/heads/production" ]; then @@ -72,6 +74,32 @@ jobs: aws-region: "us-west-2" - name: Run deployment - uses: "./.github/actions/terraform-deploy" + uses: "./.github/actions/terraform-deploy/action.yml" with: env_aws_secret_name: ${{ secrets.ENV_AWS_SECRET_NAME }} + + + deploy-sm2a: + name: Deploy to ${{ needs.define-environment.outputs.env_name }} 🚀 + runs-on: self-hosted + if: ${{ needs.define-environment.outputs.env_name }} + needs: [gitflow-enforcer, define-environment] + environment: ${{ needs.define-environment.outputs.env_name }} + concurrency: ${{ needs.define-environment.outputs.env_name }} + + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v2 + with: + role-to-assume: ${{ secrets.DEPLOYMENT_ROLE_ARN }} + role-session-name: "veda-airflow-github-sm2a-${{ needs.define-environment.outputs.env_name }}-deployment" + aws-region: "us-west-2" + + - name: Run sm2a deployment + uses: "./.github/actions/terraform-deploy/sm2a-action.yml" + with: + env_aws_secret_name: veda-sm2a-dev-deployment-secrets + env-file: .env diff --git a/sm2a/README.md b/sm2a/README.md new file mode 100644 index 00000000..c867bb53 --- /dev/null +++ b/sm2a/README.md @@ -0,0 +1,163 @@ 
+# CSDA-data-pipelines
+
+This repo houses function code and deployment code for CSDA projects.
+
+## Project layout
+
+- [dags](./dags/) contains the Directed Acyclic Graphs (DAGs) which constitute the Airflow state machines. This includes the Python code for running each task as well as the Python definitions of the structure of these DAGs.
+- [airflow_worker/requirements](./airflow_worker/requirements.txt) contains the requirements.txt file that specifies the dependencies of the workers. These libraries are installed on all SM2A workers and can be used by all tasks.
+- [airflow_services/requirements](./airflow_services/requirements.txt) contains the requirements.txt file that specifies the dependencies of the schedulers and the webserver.
+- [infrastructure](./infrastructure/) contains the Terraform necessary to deploy all resources to AWS.
+- [scripts](./scripts/) contains the bash scripts used for deployment.
+- [sm2a-local-config](./sm2a-local-config) contains the Airflow configuration for running Airflow locally.
+You can also define AWS credentials or other custom environment variables in the [.env](./sm2a-local-config/env_example) file:
+copy ./sm2a-local-config/env_example to ./sm2a-local-config/.env and update the values of the AWS secrets.
+
+## Prerequisites
+
+### Terraform
+
+See [terraform-getting-started](https://developer.hashicorp.com/terraform/tutorials/aws-get-started/install-cli)
+
+### AWS CLI
+
+See [getting-started-install](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)
+
+### Docker and Docker Compose
+See [install-docker-and-docker-compose](https://docs.docker.com/compose/install/)
+
+## Getting started
+
+### Set up a local development environment
+
+1. Initialize the metadata db (only needed once)
+
+```shell
+docker compose run --rm airflow-cli db init
+```
+
+2. Create an admin user (only needed once)
+
+```shell
+docker compose run --rm airflow-cli users create --email airflow@example.com --firstname airflow --lastname airflow --password airflow --username airflow --role Admin
+```
+
+3. Start all services
+
+```shell
+docker compose up -d
+```
+
+If you want to run the services in the foreground, use the following command:
+```shell
+docker compose up
+```
+
+Typically, you only need to initialize the database and create the admin user once; after that, you just start the services (a consolidated example follows below).
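+For reference, a complete first-time bootstrap might look like this. This is only a sketch: it reuses the example admin credentials from step 2 and the local env file described above, so adjust the values for your setup.
+
+```shell
+# create the local env file and fill in the AWS values
+cp ./sm2a-local-config/env_example ./sm2a-local-config/.env
+
+# one-time initialization
+docker compose run --rm airflow-cli db init
+docker compose run --rm airflow-cli users create --email airflow@example.com --firstname airflow --lastname airflow --password airflow --username airflow --role Admin
+
+# start the services in the background
+docker compose up -d
+```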
+After starting the services, wait a minute and then visit [localhost:8080](http://localhost:8080).
+
+
+4. Stop the services
+- If you are running the services in the foreground, stop them by pressing Ctrl+C.
+- If you are running the services in the background, stop them with the following command:
+```bash
+docker compose down
+```
+Note: You need to run this command from the folder containing the `docker-compose.yml` file.
+
+
+## Deployment
+
+### Deployment via GitHub Actions
+
+This project uses Terraform modules to deploy Apache Airflow and related AWS resources. Typically, your code will deploy automatically via GitHub Actions after your Pull Request has been approved and merged. For more information about the Git flow, please refer to this [document](https://github.com/NASA-IMPACT/csda-data-pipelines/blob/dev/GITFLOW.md).
+
+#### GitHub Actions workflows layout
+- [cicd.yml](./.github/workflows/cicd.yml) defines multiple jobs to:
+  * Check the linter
+  * Run unit tests
+  * Define the environment where the deployment will happen
+- [deploy.yml](./.github/workflows/deploy.yml) uses OpenID Connect (OIDC) to obtain AWS credentials and deploys the Terraform modules to AWS. The necessary environment variables are retrieved from AWS Secrets Manager using the following Python [script](./scripts/generate_env_file.py).
+- [gitflow.yml](./.github/workflows/gitflow.yml) provides a structured way to manage the development, testing, and deployment of the Terraform modules. For more info refer to [gitflow](https://github.com/NASA-IMPACT/csda-data-pipelines/blob/dev/GITFLOW.md).
+
+
+
+### Deployment via local machine
+You can deploy SM2A from your local machine by running:
+```bash
+python scripts/generate_env_file.py --secret-id $AWS_SECRET_NAME --env-file .env
+```
+This assumes you have access to the [AWS Secrets Manager](https://aws.amazon.com/secrets-manager/) secret where the deployment variables are stored. Then run:
+
+```bash
+./scripts/deploy.sh .env <<< init
+./scripts/deploy.sh .env <<< deploy
+```
+
+### Login to UI
+To log in to the Airflow UI, you must be added to a specific GitHub team.
+Contact your administrator to be added to the appropriate GitHub team that has access to the Airflow instance.
+Once added, you can log in by visiting the Airflow URL (`https://<your-airflow-domain>`) and using your GitHub credentials.
+
+## Developers Guide
+
+### Working with Airflow Variables
+Airflow variables allow passing secrets, configurations, etc., to tasks without embedding sensitive values in code.
+We use AWS Secrets Manager as the secrets backend. A secret will be created during the deployment with
+the name /airflow/variables/aws_dags_variables. You can add your variables there and read them in a task using
+the following approach:
+```python
+import json
+from airflow.models import Variable
+var = Variable.get("aws_dags_variables")
+var_json = json.loads(var)
+print(var_json['db_secret_name'])
+```
+
+### Adding a DAG
+The DAGs are defined in Python files located in the [dags](./dags/) directory. Each DAG should be defined as a Python module that creates a DAG object. The DAGs are scheduled by the [Airflow Scheduler](https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/scheduler.html#scheduler). Since we aim to keep the scheduler lightweight, every task-dependent library should be imported inside the tasks and not at the DAG level.
+Example: Let's assume we need numpy library in a task, we should not import it like this +```python +from airflow import DAG +import pendulum +from airflow.operators.python import PythonOperator +from airflow.operators.empty import EmptyOperator +import numpy as np + +def foo_task(): + process = ML_processing(np.rand()) + return process +``` + +But rather like this +```python +from airflow import DAG +import pendulum +from airflow.operators.python import PythonOperator +from airflow.operators.empty import EmptyOperator + + +def foo_task(): + import numpy as np + process = ML_processing(np.rand()) + return process +``` +Doing so, the scheduler won't need numpy installed to schedule the task. + +#### Working with DAG variables +If you want to use a variable in your DAG folow these steps + +1- Define Variables in AWS Secrets Manager: + +Define the variables you want to use in your DAG within AWS Secrets Manager. +The Secrets Manager should have a specific naming convention, with the prefix ${stage}-csda-dags-variables. Where ${stage} is a placeholder for a stage or environment variable, indicating different environments (e.g., development, testing, production). + +2- Deployment: + +During the deployment process, these secrets are retrieved from AWS Secrets Manager. +The retrieved secrets are then stored in a .env file. + +3- Usage in Tasks: + +The [python-dotenv](https://pypi.org/project/python-dotenv/) library is used to access the variables stored in the .env file. +These variables can now be used within your DAG tasks. + diff --git a/sm2a/airflow_services/Dockerfile b/sm2a/airflow_services/Dockerfile new file mode 100644 index 00000000..5a411875 --- /dev/null +++ b/sm2a/airflow_services/Dockerfile @@ -0,0 +1,29 @@ +FROM --platform=linux/arm64 apache/airflow:slim-2.8.4-python3.11 +ARG AIRFLOW_VERSION=2.8.4 +USER root +# `apt-get autoremove` is used to remove packages that were automatically installed to satisfy +# dependencies for other packages and are now no longer needed. +# `apt-get clean` clears out the local repository of retrieved package files + +RUN apt-get update \ + && apt-get install -y --no-install-recommends gcc libc6-dev libcurl4-openssl-dev libssl-dev \ + && apt-get autoremove -yqq --purge \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +COPY --chown=airflow:airflow airflow_services/requirements.txt "${AIRFLOW_HOME}/requirements.txt" + +USER airflow + +RUN pip install --upgrade pip \ + && pip install --no-cache-dir -r requirements.txt -c "https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-3.11.txt" + +COPY --chown=airflow:airflow dags "${AIRFLOW_HOME}/dags" +COPY --chown=airflow:airflow plugins "${AIRFLOW_HOME}/plugins" +COPY --chown=airflow:airflow infrastructure/configuration "${AIRFLOW_HOME}/configuration" +COPY --chown=airflow:airflow scripts "${AIRFLOW_HOME}/scripts" +COPY --chown=airflow:airflow airflow_services/webserver_config.py "${AIRFLOW_HOME}/webserver_config.py" + +RUN cp ${AIRFLOW_HOME}/configuration/airflow.cfg* ${AIRFLOW_HOME}/. +#ENV +ENV PYTHONPATH /opt/airflow diff --git a/sm2a/airflow_services/requirements.txt b/sm2a/airflow_services/requirements.txt new file mode 100644 index 00000000..60052c75 --- /dev/null +++ b/sm2a/airflow_services/requirements.txt @@ -0,0 +1,24 @@ +apache-airflow-providers-amazon +botocore +cryptography +# To use SQS as a broker in Celery, you need to install pycurl. 
+# https://github.com/saleor/saleor/issues/8804 +pycurl + +apache-airflow-providers-celery +affine +netCDF4 +requests +rio-cogeo +smart-open +airflow_multi_dagrun +apache-airflow-providers-postgres +apache-airflow-providers-common-sql +typing-extensions +psycopg2-binary +pyOpenSSL +stac-pydantic +fsspec +s3fs +xarray +xstac diff --git a/sm2a/airflow_services/webserver_config.py b/sm2a/airflow_services/webserver_config.py new file mode 100644 index 00000000..83985431 --- /dev/null +++ b/sm2a/airflow_services/webserver_config.py @@ -0,0 +1,209 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Default configuration for the Airflow webserver.""" +from __future__ import annotations + + +from flask_appbuilder.security.manager import AUTH_OAUTH + +from airflow.auth.managers.fab.security_manager.override import ( + FabAirflowSecurityManagerOverride, +) +import logging +from typing import Any, Union +import os + +basedir = os.path.abspath(os.path.dirname(__file__)) + +# Flask-WTF flag for CSRF +WTF_CSRF_ENABLED = True +WTF_CSRF_TIME_LIMIT = None + +# ---------------------------------------------------- +# AUTHENTICATION CONFIG +# ---------------------------------------------------- +# For details on how to set up each of the following authentication, see +# http://flask-appbuilder.readthedocs.io/en/latest/security.html# authentication-methods +# for details. + +# The authentication type +# AUTH_OID : Is for OpenID +# AUTH_DB : Is for database +# AUTH_LDAP : Is for LDAP +# AUTH_REMOTE_USER : Is for using REMOTE_USER from web server +# AUTH_OAUTH : Is for OAuth +AUTH_TYPE = AUTH_OAUTH + +# Uncomment to setup Full admin role name +# AUTH_ROLE_ADMIN = 'Admin' + +# Uncomment and set to desired role to enable access without authentication +# AUTH_ROLE_PUBLIC = 'Viewer' + +# Will allow user self registration +# AUTH_USER_REGISTRATION = True + +# The recaptcha it's automatically enabled for user self registration is active and the keys are necessary +# RECAPTCHA_PRIVATE_KEY = PRIVATE_KEY +# RECAPTCHA_PUBLIC_KEY = PUBLIC_KEY + +# Config for Flask-Mail necessary for user self registration +# MAIL_SERVER = 'smtp.gmail.com' +# MAIL_USE_TLS = True +# MAIL_USERNAME = 'yourappemail@gmail.com' +# MAIL_PASSWORD = 'passwordformail' +# MAIL_DEFAULT_SENDER = 'sender@gmail.com' + +AUTH_ROLES_SYNC_AT_LOGIN = True # Checks roles on every login +AUTH_USER_REGISTRATION = ( + True # allow users who are not already in the FAB DB to register +) +# Make sure to replace this with the path to your security manager class +AUTH_ROLES_MAPPING = { + "Viewer": ["Viewer"], + "Admin": ["Admin"], +} +# If you wish, you can add multiple OAuth providers. 
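+# Note: this deployment registers GitHub as the only OAuth provider. The client
+# id and secret are read from the GH_CLIENT_ID and GH_CLIENT_SECRET environment
+# variables (supplied to the containers by the Terraform configuration), and the
+# "read:user, read:org" scopes allow the security manager below to read team
+# membership for role mapping.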
+OAUTH_PROVIDERS = [ + { + "name": "github", + "icon": "fa-github", + "token_key": "access_token", + "remote_app": { + "client_id": os.getenv("GH_CLIENT_ID"), + "client_secret": os.getenv("GH_CLIENT_SECRET"), + "api_base_url": "https://api.github.com", + "client_kwargs": {"scope": "read:user, read:org"}, + "access_token_url": "https://github.com/login/oauth/access_token", + "authorize_url": "https://github.com/login/oauth/authorize", + "request_token_url": None, + }, + }, +] + + +log = logging.getLogger(__name__) +log.setLevel(os.getenv("AIRFLOW__LOGGING__FAB_LOGGING_LEVEL", "INFO")) + +FAB_ADMIN_ROLE = "Admin" +FAB_VIEWER_ROLE = "Viewer" +FAB_PUBLIC_ROLE = "Public" # The "Public" role is given no permissions +TEAM_ID_A_FROM_GITHUB = os.getenv("GH_ADMIN_TEAM_ID") +TEAM_ID_B_FROM_GITHUB = os.getenv("GH_USER_TEAM_ID") + + +def team_parser(team_payload: dict[str, Any]) -> list[int]: + # Parse the team payload from GitHub however you want here. + return [team["name"] for team in team_payload] + + +def map_roles(team_list: list[int]) -> list[str]: + # Associate the team IDs with Roles here. + # The expected output is a list of roles that FAB will use to Authorize the user. + + team_role_map = { + TEAM_ID_A_FROM_GITHUB: FAB_ADMIN_ROLE, + TEAM_ID_B_FROM_GITHUB: FAB_VIEWER_ROLE, + } + return list(set(team_role_map.get(team, FAB_PUBLIC_ROLE) for team in team_list)) + + +class GithubTeamAuthorizer(FabAirflowSecurityManagerOverride): + # In this example, the oauth provider == 'github'. + # If you ever want to support other providers, see how it is done here: + # https://github.com/dpgaspar/Flask-AppBuilder/blob/master/flask_appbuilder/security/manager.py#L550 + def get_oauth_user_info( + self, provider: str, resp: Any + ) -> dict[str, Union[str, list[str]]]: + # Creates the user info payload from Github. + # The user previously allowed your app to act on their behalf, + # so now we can query the user and teams endpoints for their data. + # Username and team membership are added to the payload and returned to FAB. + + remote_app = self.appbuilder.sm.oauth_remotes[provider] + me = remote_app.get("user") + user_data = me.json() + team_data = remote_app.get("user/teams") + teams = team_parser(team_data.json()) + roles = map_roles(teams) + log.debug(f"User info from Github: {user_data}\nTeam info from Github: {teams}") + print(f"User info from Github: {user_data}\nTeam info from Github: {teams}") + return {"username": "github_" + user_data.get("login"), "role_keys": roles} + + +SECURITY_MANAGER_CLASS = GithubTeamAuthorizer +# The default user self registration role +# AUTH_USER_REGISTRATION_ROLE = "Public" + +# When using OAuth Auth, uncomment to setup provider(s) info +# Google OAuth example: +# OAUTH_PROVIDERS = [{ +# 'name':'google', +# 'token_key':'access_token', +# 'icon':'fa-google', +# 'remote_app': { +# 'api_base_url':'https://www.googleapis.com/oauth2/v2/', +# 'client_kwargs':{ +# 'scope': 'email profile' +# }, +# 'access_token_url':'https://accounts.google.com/o/oauth2/token', +# 'authorize_url':'https://accounts.google.com/o/oauth2/auth', +# 'request_token_url': None, +# 'client_id': GOOGLE_KEY, +# 'client_secret': GOOGLE_SECRET_KEY, +# } +# }] + +# When using LDAP Auth, setup the ldap server +# AUTH_LDAP_SERVER = "ldap://ldapserver.new" + +# When using OpenID Auth, uncomment to setup OpenID providers. 
+# example for OpenID authentication +# OPENID_PROVIDERS = [ +# { 'name': 'Yahoo', 'url': 'https://me.yahoo.com' }, +# { 'name': 'AOL', 'url': 'http://openid.aol.com/' }, +# { 'name': 'Flickr', 'url': 'http://www.flickr.com/' }, +# { 'name': 'MyOpenID', 'url': 'https://www.myopenid.com' }] + +# ---------------------------------------------------- +# Theme CONFIG +# ---------------------------------------------------- +# Flask App Builder comes up with a number of predefined themes +# that you can use for Apache Airflow. +# http://flask-appbuilder.readthedocs.io/en/latest/customizing.html#changing-themes +# Please make sure to remove "navbar_color" configuration from airflow.cfg +# in order to fully utilize the theme. (or use that property in conjunction with theme) +# APP_THEME = "bootstrap-theme.css" # default bootstrap +# APP_THEME = "amelia.css" +# APP_THEME = "cerulean.css" +# APP_THEME = "cosmo.css" +# APP_THEME = "cyborg.css" +# APP_THEME = "darkly.css" +# APP_THEME = "flatly.css" +# APP_THEME = "journal.css" +# APP_THEME = "lumen.css" +# APP_THEME = "paper.css" +# APP_THEME = "readable.css" +# APP_THEME = "sandstone.css" +# APP_THEME = "simplex.css" +# APP_THEME = "slate.css" +# APP_THEME = "solar.css" +# APP_THEME = "spacelab.css" +# APP_THEME = "superhero.css" +# APP_THEME = "united.css" +# APP_THEME = "yeti.css" diff --git a/sm2a/airflow_worker/Dockerfile b/sm2a/airflow_worker/Dockerfile new file mode 100644 index 00000000..f69ea1ec --- /dev/null +++ b/sm2a/airflow_worker/Dockerfile @@ -0,0 +1,51 @@ +FROM --platform=linux/arm64 osgeo/gdal:ubuntu-small-3.6.3 +ARG AIRFLOW_VERSION=2.8.4 + +ARG UNAME=airflow + +ARG UID=50000 + +ARG GID=0 + +ARG AIRFLOW_HOME=/opt/airflow + +RUN groupadd -g $GID -o $UNAME + +RUN useradd -m -u $UID -g $GID -o -s /bin/bash $UNAME + +WORKDIR /opt/airflow + +RUN chown $UNAME:$GID /opt/airflow + +RUN apt-get -y update \ + && apt install -y python3-pip \ + && apt-get install -y --no-install-recommends gcc libc6-dev libcurl4-openssl-dev libssl-dev \ + && apt-get autoremove -yqq --purge \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +USER airflow + +ENV PATH $PATH:/home/airflow/.local/bin + +COPY --chown=airflow:airflow airflow_worker/requirements.txt "${AIRFLOW_HOME}/requirements.txt" + +RUN pip install --upgrade pip \ + && pip install "apache-airflow[celery,amazon]==${AIRFLOW_VERSION}" --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-3.11.txt" \ + && pip install --no-cache-dir -r requirements.txt -c "https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-3.11.txt" + +COPY --chown=airflow:airflow dags "${AIRFLOW_HOME}/dags" +COPY --chown=airflow:airflow plugins "${AIRFLOW_HOME}/plugins" +COPY --chown=airflow:airflow infrastructure/configuration "${AIRFLOW_HOME}/configuration" +COPY --chown=airflow:airflow scripts "${AIRFLOW_HOME}/scripts" + +RUN cp ${AIRFLOW_HOME}/configuration/airflow.cfg* ${AIRFLOW_HOME}/. + +RUN pip install pypgstac==0.7.4 + +# ENV +ENV AIRFLOW_HOME ${AIRFLOW_HOME} +ENV TZ UTC +ENV PYTHONPATH /opt/airflow + +CMD /bin/bash \ No newline at end of file diff --git a/sm2a/airflow_worker/requirements.txt b/sm2a/airflow_worker/requirements.txt new file mode 100644 index 00000000..7c66fbda --- /dev/null +++ b/sm2a/airflow_worker/requirements.txt @@ -0,0 +1,24 @@ +apache-airflow-providers-amazon +botocore +cryptography +# To use SQS as a broker in Celery, you need to install pycurl. 
+# https://github.com/saleor/saleor/issues/8804 +pycurl +psycopg2-binary +apache-airflow-providers-celery +affine +netCDF4 +requests +rio-cogeo +smart-open +airflow_multi_dagrun +apache-airflow-providers-postgres +apache-airflow-providers-common-sql +typing-extensions +pyOpenSSL +stac-pydantic +fsspec +s3fs +xarray +xstac + diff --git a/sm2a/deploy_requirements.txt b/sm2a/deploy_requirements.txt new file mode 100644 index 00000000..e69de29b diff --git a/sm2a/docker-compose.yml b/sm2a/docker-compose.yml new file mode 100644 index 00000000..fbc15257 --- /dev/null +++ b/sm2a/docker-compose.yml @@ -0,0 +1,180 @@ +--- +x-airflow-common: + &airflow-common + platform: linux/arm64 + build: + context: . + dockerfile: ./airflow_services/Dockerfile + env_file: + - ./sm2a-local-config/.env + environment: + &airflow-common-env + AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@airflow-db/airflow + # THIS IS A FAKE CREDENTIAL FOR DEMONSTRATION PURPOSES + # Generate with the following code + # python -c 'from cryptography.fernet import Fernet; print(Fernet.generate_key())' + AIRFLOW__CORE__FERNET_KEY: "Ly8wMU8r5K7jPy58M3GpkZbXDNyJz8HiJll3pu8DbIM=" + AIRFLOW__WEBSERVER__SECRET_KEY: "Ly8wMU8r5K7jPy58M3GpkZbXDNyJz8HiJll3pu8DbIM=" + AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@airflow-db/airflow + AIRFLOW__CELERY__BROKER_URL: sqs://user:pass@celery-broker:9324/ + AIRFLOW__WEBSERVER__INSTANCE_NAME: "${AIRFLOW__WEBSERVER__INSTANCE_NAME:-CSDA-SM2A-Airflow}" + AIRFLOW__LOGGING__LOGGING_LEVEL: DEBUG + # Gotcha: Even though we set this to "True" in airflow.cfg, an environment variable overrides it + AIRFLOW__CORE__LOAD_EXAMPLES: false + volumes: + - ./dags:/opt/airflow/dags + - ./plugins:/opt/airflow/plugins + - ./sm2a-local-config/local_airflow.cfg:/opt/airflow/airflow.cfg + - ./sm2a-local-config/local_webserver_config.py:/opt/airflow/webserver_config.py + - ./infrastructure/configuration:/opt/airflow/configuration + - ./scripts:/opt/airflow/scripts + user: "50000:0" + depends_on: + &airflow-common-depends-on + celery-broker: + condition: service_started + airflow-db: + condition: service_healthy + +x-airflow-worker: + &airflow-worker + platform: linux/arm64 + env_file: + - ./sm2a-local-config/.env + build: + context: . 
+ dockerfile: ./airflow_worker/Dockerfile + environment: + <<: *airflow-common-env + volumes: + - ./dags:/opt/airflow/dags + - ./plugins:/opt/airflow/plugins + - ./sm2a-local-config/local_airflow.cfg:/opt/airflow/airflow.cfg + - ./infrastructure/configuration:/opt/airflow/configuration + - ./scripts:/opt/airflow/scripts + user: "50000:0" + depends_on: + <<: *airflow-common-depends-on + +services: + airflow-db: + image: postgres:13 + platform: linux/arm64 + environment: + POSTGRES_USER: airflow + POSTGRES_PASSWORD: airflow + POSTGRES_DB: airflow + volumes: + - airflow-db-volume:/var/lib/postgresql/data + healthcheck: + test: [ "CMD", "pg_isready", "-U", "airflow" ] + interval: 5s + retries: 5 + restart: always + + # For environment parity, use backend that implements SQS interface + # Doesn't have ARM64 image + # https://github.com/roribio/alpine-sqs + celery-broker: + image: roribio16/alpine-sqs:latest + platform: linux/amd64 + expose: + - 9324 + - 9325 + restart: always + + airflow-webserver: + <<: *airflow-common + command: webserver + ports: + - 8080:8080 + healthcheck: + test: + [ + "CMD", + "curl", + "--fail", + "http://localhost:8080/health" + ] + interval: 35s + timeout: 30s + retries: 5 + restart: always + depends_on: + <<: *airflow-common-depends-on + + airflow-scheduler: + <<: *airflow-common + command: scheduler + healthcheck: + test: + [ + "CMD-SHELL", + 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"' + ] + interval: 35s + timeout: 30s + retries: 5 + restart: always + depends_on: + <<: *airflow-common-depends-on + + + airflow-worker: + <<: *airflow-worker + command: airflow celery worker + # SQS does not support worker remote control commands. + # We will ping celery broker from the worker to test healthcheck. + healthcheck: + test: + [ + "CMD", + "curl", + "http://celery-broker:9324/" + ] + interval: 35s + timeout: 30s + retries: 5 + environment: + <<: *airflow-common-env + # Required to handle warm shutdown of the celery workers properly + # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation + DUMB_INIT_SETSID: "0" + restart: always + depends_on: + <<: *airflow-common-depends-on + + + # AIP-40: Deferrable ("Async") Operators + # https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=177050929 + airflow-triggerer: + <<: *airflow-common + command: triggerer + healthcheck: + test: + [ + "CMD-SHELL", + 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"' + ] + interval: 35s + timeout: 30s + retries: 5 + restart: always + depends_on: + <<: *airflow-common-depends-on + + airflow-cli: + <<: *airflow-common + profiles: + - debug + environment: + <<: *airflow-common-env + CONNECTION_CHECK_MAX_COUNT: "0" + # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252 + command: + - bash + - -c + - /bin/bash + +volumes: + airflow-db-volume: diff --git a/sm2a/docs/howto/add_a_general_dag.md b/sm2a/docs/howto/add_a_general_dag.md new file mode 100644 index 00000000..c939e1d1 --- /dev/null +++ b/sm2a/docs/howto/add_a_general_dag.md @@ -0,0 +1,39 @@ +# How to Add a General DAG +> A general DAG will be standalone or separate from the dynamic Vendor ETL Pipeline DAGs. General purposes may include data quality checks, data profiling, and other utility tasks. In the event that a DAG is vendor-specific but not a fit for the Vendor ETL Pipeline, let's consider it a general DAG. + +## Steps +1. Copy the template DAG file from the `dags` directory +2. 
Rename the file adhering to the following naming conventions +3. Update the DAG file with the necessary configurations, including relevant Tag(s) and Owner Links +4. Configure the DAG with the necessary tasks + + +## Naming Conventions + +### DAG File & Class Name +- `___` - for general DAGs where: + - `` is the subject of the DAG + - `` is the action the DAG performs + - `` is an optional qualifier to differentiate DAGs with the same subject and verb (action) + - Example: `metadata_monitor_` +- `v___` - similar to the general DAG pattern, but for vendor-specific DAGs that don't qualify for the Dynamic Vendor ETL Pipeline (i.e. `v_data_unzip_maxar`) +- `util_` - for utility files that can be shared across multiple DAGs (e.g. `util_s3file_check_`) + +### Tags +- `` - for vendor-specific DAGs (e.g. `Maxar` or `Planet`) +- `AWS` - for interactions with AWS services +- `ETL` - for ETL tasks outside of the Dynamic Vendor ETL Pipeline +- `QAQC` - for data quality checks +- `Template` - for templates + +### General Principles +- **Keep things simple**. If a DAG is too complex, its scheduling performance may be impacted. This includes a DAG's structure: simple linear DAGs (A -> B -> C) are preferred over deeply nested DAGs that may incur delays in scheduling ([reference](https://airflow.apache.org/docs/apache-airflow/stable/best-practices.html#reducing-dag-complexity)). +- **Write efficient Python code**. +- **Avoid Top-Level Code in the DAG file** to avoid scheduling delays, since the scheduler always executes top-level code as it parses a DAG file ([reference](https://airflow.apache.org/docs/apache-airflow/stable/best-practices.html#best-practices-top-level-code)). +- **Use Airflow Variables or AWS Secrets Manager**. Airflow Variables can store configuration settings that may change over time ([reference](https://airflow.apache.org/docs/apache-airflow/stable/concepts/variables.html)); AWS Secrets Manager can also store variables, as well as sensitive information like passwords and API keys ([reference](https://docs.aws.amazon.com/secretsmanager/latest/userguide/intro.html)). +- **Avoid storing files locally**. Instead, use XCom for small messages or S3/another cloud storage service to coordinate large messages or data files that are needed between Tasks ([reference](https://airflow.apache.org/docs/apache-airflow/stable/best-practices.html#communication)). +- **Time and test your DAGs**. Make sure they run as expected and complete within an expected time frame ([reference](https://airflow.apache.org/docs/apache-airflow/stable/best-practices.html#testing-a-dag)). + +## Additional Resources +- [Apache Airflow Best Practices](https://airflow.apache.org/docs/apache-airflow/stable/best-practices.html) +- [Apache Airflow Concepts](https://airflow.apache.org/docs/apache-airflow/stable/concepts.html) \ No newline at end of file diff --git a/sm2a/infrastructure/.terraform.lock.hcl b/sm2a/infrastructure/.terraform.lock.hcl new file mode 100644 index 00000000..b9eab3ab --- /dev/null +++ b/sm2a/infrastructure/.terraform.lock.hcl @@ -0,0 +1,82 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. 
+ +provider "registry.terraform.io/hashicorp/aws" { + version = "4.67.0" + constraints = "~> 4.0" + hashes = [ + "h1:5Zfo3GfRSWBaXs4TGQNOflr1XaYj6pRnVJLX5VAjFX4=", + "zh:0843017ecc24385f2b45f2c5fce79dc25b258e50d516877b3affee3bef34f060", + "zh:19876066cfa60de91834ec569a6448dab8c2518b8a71b5ca870b2444febddac6", + "zh:24995686b2ad88c1ffaa242e36eee791fc6070e6144f418048c4ce24d0ba5183", + "zh:4a002990b9f4d6d225d82cb2fb8805789ffef791999ee5d9cb1fef579aeff8f1", + "zh:559a2b5ace06b878c6de3ecf19b94fbae3512562f7a51e930674b16c2f606e29", + "zh:6a07da13b86b9753b95d4d8218f6dae874cf34699bca1470d6effbb4dee7f4b7", + "zh:768b3bfd126c3b77dc975c7c0e5db3207e4f9997cf41aa3385c63206242ba043", + "zh:7be5177e698d4b547083cc738b977742d70ed68487ce6f49ecd0c94dbf9d1362", + "zh:8b562a818915fb0d85959257095251a05c76f3467caa3ba95c583ba5fe043f9b", + "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", + "zh:9c385d03a958b54e2afd5279cd8c7cbdd2d6ca5c7d6a333e61092331f38af7cf", + "zh:b3ca45f2821a89af417787df8289cb4314b273d29555ad3b2a5ab98bb4816b3b", + "zh:da3c317f1db2469615ab40aa6baba63b5643bae7110ff855277a1fb9d8eb4f2c", + "zh:dc6430622a8dc5cdab359a8704aec81d3825ea1d305bbb3bbd032b1c6adfae0c", + "zh:fac0d2ddeadf9ec53da87922f666e1e73a603a611c57bcbc4b86ac2821619b1d", + ] +} + +provider "registry.terraform.io/hashicorp/local" { + version = "2.4.1" + hashes = [ + "h1:gpp25uNkYJYzJVnkyRr7RIBVfwLs9GSq2HNnFpTRBg0=", + "zh:244b445bf34ddbd167731cc6c6b95bbed231dc4493f8cc34bd6850cfe1f78528", + "zh:3c330bdb626123228a0d1b1daa6c741b4d5d484ab1c7ae5d2f48d4c9885cc5e9", + "zh:5ff5f9b791ddd7557e815449173f2db38d338e674d2d91800ac6e6d808de1d1d", + "zh:70206147104f4bf26ae67d730c995772f85bf23e28c2c2e7612c74f4dae3c46f", + "zh:75029676993accd6bef933c196b2fad51a9ec8a69a847dbbe96ec8ebf7926cdc", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + "zh:7d48d5999fe1fcdae9295a7c3448ac1541f5a24c474bd82df6d4fa3732483f2b", + "zh:b766b38b027f0f84028244d1c2f990431a37d4fc3ac645962924554016507e77", + "zh:bfc7ad301dada204cf51c59d8bd6a9a87de5fddb42190b4d6ba157d6e08a1f10", + "zh:c902b527702a8c5e2c25a6637d07bbb1690cb6c1e63917a5f6dc460efd18d43f", + "zh:d68ae0e1070cf429c46586bc87580c3ed113f76241da2b6e4f1a8348126b3c46", + "zh:f4903fd89f7c92a346ae9e666c2d0b6884c4474ae109e9b4bd15e7efaa4bfc29", + ] +} + +provider "registry.terraform.io/hashicorp/null" { + version = "3.2.2" + hashes = [ + "h1:IMVAUHKoydFrlPrl9OzasDnw/8ntZFerCC9iXw1rXQY=", + "zh:3248aae6a2198f3ec8394218d05bd5e42be59f43a3a7c0b71c66ec0df08b69e7", + "zh:32b1aaa1c3013d33c245493f4a65465eab9436b454d250102729321a44c8ab9a", + "zh:38eff7e470acb48f66380a73a5c7cdd76cc9b9c9ba9a7249c7991488abe22fe3", + "zh:4c2f1faee67af104f5f9e711c4574ff4d298afaa8a420680b0cb55d7bbc65606", + "zh:544b33b757c0b954dbb87db83a5ad921edd61f02f1dc86c6186a5ea86465b546", + "zh:696cf785090e1e8cf1587499516b0494f47413b43cb99877ad97f5d0de3dc539", + "zh:6e301f34757b5d265ae44467d95306d61bef5e41930be1365f5a8dcf80f59452", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + "zh:913a929070c819e59e94bb37a2a253c228f83921136ff4a7aa1a178c7cce5422", + "zh:aa9015926cd152425dbf86d1abdbc74bfe0e1ba3d26b3db35051d7b9ca9f72ae", + "zh:bb04798b016e1e1d49bcc76d62c53b56c88c63d6f2dfe38821afef17c416a0e1", + "zh:c23084e1b23577de22603cff752e59128d83cfecc2e6819edadd8cf7a10af11e", + ] +} + +provider "registry.terraform.io/hashicorp/random" { + version = "3.6.0" + hashes = [ + "h1:I8MBeauYA8J8yheLJ8oSMWqB0kovn16dF/wKZ1QTdkk=", + "zh:03360ed3ecd31e8c5dac9c95fe0858be50f3e9a0d0c654b5e504109c2159287d", + 
"zh:1c67ac51254ba2a2bb53a25e8ae7e4d076103483f55f39b426ec55e47d1fe211", + "zh:24a17bba7f6d679538ff51b3a2f378cedadede97af8a1db7dad4fd8d6d50f829", + "zh:30ffb297ffd1633175d6545d37c2217e2cef9545a6e03946e514c59c0859b77d", + "zh:454ce4b3dbc73e6775f2f6605d45cee6e16c3872a2e66a2c97993d6e5cbd7055", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + "zh:91df0a9fab329aff2ff4cf26797592eb7a3a90b4a0c04d64ce186654e0cc6e17", + "zh:aa57384b85622a9f7bfb5d4512ca88e61f22a9cea9f30febaa4c98c68ff0dc21", + "zh:c4a3e329ba786ffb6f2b694e1fd41d413a7010f3a53c20b432325a94fa71e839", + "zh:e2699bc9116447f96c53d55f2a00570f982e6f9935038c3810603572693712d0", + "zh:e747c0fd5d7684e5bfad8aa0ca441903f15ae7a98a737ff6aca24ba223207e2c", + "zh:f1ca75f417ce490368f047b63ec09fd003711ae48487fba90b4aba2ccf71920e", + ] +} diff --git a/sm2a/infrastructure/configuration/__init__.py b/sm2a/infrastructure/configuration/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/sm2a/infrastructure/configuration/airflow.cfg b/sm2a/infrastructure/configuration/airflow.cfg new file mode 100755 index 00000000..5b78f1df --- /dev/null +++ b/sm2a/infrastructure/configuration/airflow.cfg @@ -0,0 +1,63 @@ +[api] +auth_backends = airflow.api.auth.backend.basic_auth + +[core] +executor = CeleryExecutor +dags_are_paused_at_creation = true +load_examples = false +load_default_connections = false +# Allow airflow to run hundreds of tasks in parallel, because we will scale workers +# automatically. +# https://programmaticponderings.com/2020/12/29/amazon-managed-workflows-for-apache-airflow-configuration-understanding-amazon-mwaas-configuration-options/ +max_active_tasks_per_dag = 10000 +parallelism = 10000 + +[celery] +broker_url = sqs:// +celery_config_options = configuration.celery_config.CELERY_CONFIG + + +[github_enterprise] +api_rev = v3 +host = github.com +client_id = Iv23liBjz91G9wLwnaK6 +client_secret = 88b73528341a884bb418852d225f4913e69df478 +oauth_callback_route = /home +allowed_teams = csda-msfc + +[webserver] +authenticate = True +auth_backends = airflow.contrib.auth.backends.github_enterprise_auth +dag_default_view = grid +expose_config = true +dag_orientation = TB +warn_deployment_exposure = false + +# On ECS, you can deploy the CloudWatch agent as a sidecar to your application container to collect metrics. 
+# https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/deploy_servicelens_CloudWatch_agent_deploy_ECS.html +# https://airflow.apache.org/docs/apache-airflow/stable/logging-monitoring/metrics.html +# https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch-Agent-custom-metrics-statsd.html +# https://docs.aws.amazon.com/mwaa/latest/userguide/mwaa-autoscaling.html +# https://docs.aws.amazon.com/mwaa/latest/userguide/access-metrics-cw-202.html#available-metrics-cw-v202 +# [metrics] +# statsd_on = true +# statsd_host = localhost +# statsd_port = 8125 +# statsd_prefix = airflow + +[scheduler] +catchup_by_default = false + +[logging] +# logging_config_class = configuration.logging_config.STDOUT_LOGGING_CONFIG +remote_logging = true +# We set this value as an environment variable +# remote_base_log_folder = + +[secrets] +# AWS Secrets Manager Backend +# https://airflow.apache.org/docs/apache-airflow-providers-amazon/stable/secrets-backends/aws-secrets-manager.html +# Setting full_url_mode to false allows us to use multiple fields when storing connections +# Source code: https://github.com/apache/airflow/blob/main/airflow/providers/amazon/aws/secrets/secrets_manager.py +backend = airflow.providers.amazon.aws.secrets.secrets_manager.SecretsManagerBackend +backend_kwargs = {"connections_prefix": "sm2a-staging/airflow/connections", "variables_prefix": "sm2a-staging/airflow/variables","connections_lookup_pattern": "_default$", "variables_lookup_pattern": "^aws_", "config_prefix": "sm2a-staging/airflow/config"} diff --git a/sm2a/infrastructure/configuration/airflow.cfg.tmpl b/sm2a/infrastructure/configuration/airflow.cfg.tmpl new file mode 100644 index 00000000..460f5772 --- /dev/null +++ b/sm2a/infrastructure/configuration/airflow.cfg.tmpl @@ -0,0 +1,63 @@ +[api] +auth_backends = airflow.api.auth.backend.basic_auth + +[core] +executor = CeleryExecutor +dags_are_paused_at_creation = true +load_examples = false +load_default_connections = false +# Allow airflow to run hundreds of tasks in parallel, because we will scale workers +# automatically. +# https://programmaticponderings.com/2020/12/29/amazon-managed-workflows-for-apache-airflow-configuration-understanding-amazon-mwaas-configuration-options/ +max_active_tasks_per_dag = 10000 +parallelism = 10000 + +[celery] +broker_url = sqs:// +celery_config_options = configuration.celery_config.CELERY_CONFIG + + +[github_enterprise] +api_rev = v3 +host = github.com +client_id = ${gh_app_client_id} +client_secret = ${gh_app_client_secret} +oauth_callback_route = /home +allowed_teams = ${gh_team_id} + +[webserver] +authenticate = True +auth_backends = airflow.contrib.auth.backends.github_enterprise_auth +dag_default_view = grid +expose_config = true +dag_orientation = TB +warn_deployment_exposure = false + +# On ECS, you can deploy the CloudWatch agent as a sidecar to your application container to collect metrics. 
+# https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/deploy_servicelens_CloudWatch_agent_deploy_ECS.html +# https://airflow.apache.org/docs/apache-airflow/stable/logging-monitoring/metrics.html +# https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch-Agent-custom-metrics-statsd.html +# https://docs.aws.amazon.com/mwaa/latest/userguide/mwaa-autoscaling.html +# https://docs.aws.amazon.com/mwaa/latest/userguide/access-metrics-cw-202.html#available-metrics-cw-v202 +# [metrics] +# statsd_on = true +# statsd_host = localhost +# statsd_port = 8125 +# statsd_prefix = airflow + +[scheduler] +catchup_by_default = false + +[logging] +# logging_config_class = configuration.logging_config.STDOUT_LOGGING_CONFIG +remote_logging = true +# We set this value as an environment variable +# remote_base_log_folder = + +[secrets] +# AWS Secrets Manager Backend +# https://airflow.apache.org/docs/apache-airflow-providers-amazon/stable/secrets-backends/aws-secrets-manager.html +# Setting full_url_mode to false allows us to use multiple fields when storing connections +# Source code: https://github.com/apache/airflow/blob/main/airflow/providers/amazon/aws/secrets/secrets_manager.py +backend = airflow.providers.amazon.aws.secrets.secrets_manager.SecretsManagerBackend +backend_kwargs = {"connections_prefix": "${prefix}/airflow/connections", "variables_prefix": "${prefix}/airflow/variables","connections_lookup_pattern": "_default$", "variables_lookup_pattern": "^aws_", "config_prefix": "${prefix}/airflow/config"} diff --git a/sm2a/infrastructure/configuration/celery_config.py b/sm2a/infrastructure/configuration/celery_config.py new file mode 100644 index 00000000..b3026f9d --- /dev/null +++ b/sm2a/infrastructure/configuration/celery_config.py @@ -0,0 +1,33 @@ +import os + +from airflow.providers.celery.executors.default_celery import DEFAULT_CELERY_CONFIG + +# From here https://github.com/apache/airflow/issues/16163 +# DEFAULT_CELERY_CONFIG['task_acks_late'] = False +# DEFAULT_CELERY_CONFIG['broker_transport_options']['visibility_timeout'] = 300 + +CELERY_CONFIG = { + **DEFAULT_CELERY_CONFIG, + "broker_transport_options": { + **DEFAULT_CELERY_CONFIG["broker_transport_options"], + "predefined_queues": { + # Gotcha: kombu.transport.SQS.UndefinedQueueException + # Queue with name 'default' must be defined in 'predefined_queues' + "default": { + "url": os.getenv( + "X_AIRFLOW_SQS_CELERY_BROKER_PREDEFINED_QUEUE_URL", + "sqs://user:pass@celery-broker:9324/", + ) + }, + "gpu_queue": { + "url": os.getenv( + "X_AIRFLOW_SQS_CELERY_BROKER_GPU_QUEUE_URL", + "sqs://user:pass@celery-broker:9324/", + ) + }, + }, + }, + "polling_interval": 1.0, + # SQS broker is incompatible with remote control commands + "worker_enable_remote_control": False, +} diff --git a/sm2a/infrastructure/configuration/logging_config.py b/sm2a/infrastructure/configuration/logging_config.py new file mode 100644 index 00000000..970385d1 --- /dev/null +++ b/sm2a/infrastructure/configuration/logging_config.py @@ -0,0 +1,21 @@ +import sys +from copy import deepcopy + +from airflow.config_templates.airflow_local_settings import DEFAULT_LOGGING_CONFIG + +STDOUT_LOGGING_CONFIG = deepcopy(DEFAULT_LOGGING_CONFIG) + +# Create a new handler that streams to stdout +STDOUT_LOGGING_CONFIG["handlers"]["stdout"] = { + "class": "logging.StreamHandler", + "formatter": "airflow", + "stream": sys.stdout, + "filters": ["mask_secrets"], +} + +# Set each logger handler to stdout. For the task logger, keep the original "task" +# handler in place. 
This will allow us to reuse existing airflow functionality to +# view logs from the UI. +STDOUT_LOGGING_CONFIG["loggers"]["airflow.processor"]["handlers"] = ["stdout"] +STDOUT_LOGGING_CONFIG["loggers"]["airflow.task"]["handlers"] = ["stdout", "task"] +STDOUT_LOGGING_CONFIG["loggers"]["flask_appbuilder"]["handlers"] = ["stdout"] diff --git a/sm2a/infrastructure/main.tf b/sm2a/infrastructure/main.tf new file mode 100644 index 00000000..9e01bee3 --- /dev/null +++ b/sm2a/infrastructure/main.tf @@ -0,0 +1,86 @@ +terraform { + required_providers { + aws = { + version = "~> 4.0" + } + } + required_version = ">= 1.3" +} + +provider "aws" { + region = var.aws_region +} +resource "random_password" "password" { + length = 8 + special = true + override_special = "_%@" +} + + + +module "sma-base" { + source = "https://github.com/NASA-IMPACT/self-managed-apache-airflow/releases/download/v1.0.20/self-managed-apache-airflow.zip" + airflow_db = var.airflow_db + fernet_key = var.fernet_key + prefix = var.prefix + private_subnets_tagname = var.private_subnets_tagname + public_subnets_tagname = var.public_subnets_tagname + vpc_id = var.vpc_id + state_bucketname = var.state_bucketname + desired_max_workers_count = var.workers_configuration[var.stage].max_desired_workers + airflow_admin_password = random_password.password.result + airflow_admin_username = "admin" + rds_publicly_accessible = var.rds_publicly_accessible + permission_boundaries_arn = var.permission_boundaries_arn + custom_worker_policy_statement = var.custom_worker_policy_statement + worker_cpu = var.workers_configuration[var.stage].cpu + worker_memory = var.workers_configuration[var.stage].memory + number_of_schedulers = var.number_of_schedulers + scheduler_cpu = var.scheduler_cpu + scheduler_memory = var.scheduler_memory + rds_engine_version = var.rds_configuration[var.stage].rds_engine_version + rds_instance_class = var.rds_configuration[var.stage].rds_instance_class + rds_allocated_storage = var.rds_configuration[var.stage].rds_allocated_storage + rds_max_allocated_storage = var.rds_configuration[var.stage].rds_max_allocated_storage + workers_logs_retention_days = var.workers_configuration[var.stage].workers_logs_retention_days + airflow_custom_variables = var.airflow_custom_variables + + extra_airflow_task_common_environment = [ + { + name = "AIRFLOW__CORE__DAGBAG_IMPORT_TIMEOUT" + value = "100" + }, + { + name = "AIRFLOW__CORE__DEFAULT_TASK_RETRIES" + value = var.workers_configuration[var.stage].task_retries + }, + { + name = "GH_CLIENT_ID" + value = var.gh_app_client_id + }, + { + name = "GH_CLIENT_SECRET" + value = var.gh_app_client_secret + }, + { + name = "GH_ADMIN_TEAM_ID" + value = var.gh_team_name + }, + { + name = "GH_USER_TEAM_ID" + value = "csda-airflow-data-pipeline-users" + } + + + ] + extra_airflow_configuration = { + gh_app_client_id = var.gh_app_client_id + gh_app_client_secret = var.gh_app_client_secret + gh_team_id = var.gh_team_name + } + domain_name = var.domain_name + stage = var.stage + subdomain = var.subdomain + worker_cmd = ["/home/airflow/.local/bin/airflow", "celery", "worker"] +} + diff --git a/sm2a/infrastructure/outputs.tf b/sm2a/infrastructure/outputs.tf new file mode 100644 index 00000000..13873858 --- /dev/null +++ b/sm2a/infrastructure/outputs.tf @@ -0,0 +1,6 @@ +output "Airflow_url" { + value = module.sma-base.airflow_url +} +output "Airflow_master_secret_manager" { + value = "Visit ${module.sma-base.airflow_secret_name} For SM2A credentials" +} diff --git a/sm2a/infrastructure/terraform.tf.tmpl 
b/sm2a/infrastructure/terraform.tf.tmpl new file mode 100644 index 00000000..57aac9e2 --- /dev/null +++ b/sm2a/infrastructure/terraform.tf.tmpl @@ -0,0 +1,8 @@ +terraform { + backend "s3" { + region = "${AWS_REGION:-us-west-2}" + bucket = "${STATE_BUCKET_NAME}" + key = "${STATE_BUCKET_KEY}" + dynamodb_table = "${STATE_DYNAMO_TABLE}" + } +} diff --git a/sm2a/infrastructure/terraform.tfvars.tmpl b/sm2a/infrastructure/terraform.tfvars.tmpl new file mode 100644 index 00000000..2ac5f162 --- /dev/null +++ b/sm2a/infrastructure/terraform.tfvars.tmpl @@ -0,0 +1,15 @@ +prefix="${PREFIX}" +vpc_id="${VPC_ID}" +permission_boundaries_arn="${PERMISSION_BOUNDARIES_ARN}" +airflow_db={ + db_name = "${AIRFLOW_DB_NAME}" + username = "${AIRFLOW_DB_USERNAME}" + password = "${AIRFLOW_DB_PASSWORD}" + port = "5432" + } +fernet_key="${AIRFLOW_FERNET_KEY}" +private_subnets_tagname="${PRIVATE_SUBNETS_TAGNAME}" +public_subnets_tagname="${PUBLIC_SUBNETS_TAGNAME}" +state_bucketname="${STATE_BUCKET_NAME}" +domain_name="${DOMAIN_NAME}" +stage="${STAGE}" diff --git a/sm2a/infrastructure/variables.tf b/sm2a/infrastructure/variables.tf new file mode 100644 index 00000000..dbd4cea7 --- /dev/null +++ b/sm2a/infrastructure/variables.tf @@ -0,0 +1,207 @@ +variable "airflow_db" { + type = object({ + db_name = string + username = string + password = string + port = number + }) + sensitive = true +} + +variable "aws_region" { + default = "us-west-2" +} + + +variable "prefix" { +} + +variable "fernet_key" { +} + + +variable "vpc_id" { +} +variable "private_subnets_tagname" { + +} +variable "public_subnets_tagname" { + +} +variable "state_bucketname" { + +} + +variable "permission_boundaries_arn" { + default = "null" +} + +variable "rds_publicly_accessible" { + default = false +} + +variable "custom_worker_policy_statement" { + type = list(object({ + Effect = string + Action = list(string) + Resource = list(string) + })) + default = [ + { + Effect = "Allow" + Action = [ + "dynamodb:UpdateItem", + "dynamodb:PutItem", + "dynamodb:GetItem", + "dynamodb:BatchWriteItem", + "dynamodb:BatchGetItem" + ] + "Resource" : [ + "arn:aws:dynamodb:us-west-2:*:table/*_sha256_store/*", + "arn:aws:dynamodb:us-west-2:*:table/*_sha256_store" + ] + + } + + + ] + +} + +variable "scheduler_cpu" { + type = number + default = 1024 * 2 +} +variable "scheduler_memory" { + type = number + default = 2048 * 2 +} + +variable "number_of_schedulers" { + default = 1 +} + +variable "domain_name" { + +} +variable "stage" { + default = "dev" +} + +variable "subdomain" { + default = "null" +} + + +variable "rds_configuration" { + type = object({ + dev = object({ + rds_instance_class = string, + rds_allocated_storage = number, + rds_max_allocated_storage = number, + rds_engine_version = string + }) + staging = object({ + rds_instance_class = string, + rds_allocated_storage = number, + rds_max_allocated_storage = number, + rds_engine_version = string + }) + prod = object({ + rds_instance_class = string, + rds_allocated_storage = number, + rds_max_allocated_storage = number, + rds_engine_version = string + }) + + }) + default = { + dev = { + rds_instance_class = "db.t4g.medium", + rds_allocated_storage = 20, + rds_max_allocated_storage = 100, + rds_engine_version = "13.13" + }, + staging = { + rds_instance_class = "db.t4g.large", + rds_allocated_storage = 40, + rds_max_allocated_storage = 100, + rds_engine_version = "13.13" + }, + prod = { + rds_instance_class = "db.r5.xlarge", + rds_allocated_storage = 100, + rds_max_allocated_storage = 200, + rds_engine_version = "13.13" 
+ } + } +} + +variable "workers_configuration" { + type = object({ + dev = object({ + cpu = number, + memory = number, + max_desired_workers = string, + task_retries = string, + workers_logs_retention_days = number + + }) + staging = object({ + cpu = number, + memory = number, + max_desired_workers = string, + task_retries = string, + workers_logs_retention_days = number + }) + prod = object({ + cpu = number, + memory = number, + max_desired_workers = string, + task_retries = string, + workers_logs_retention_days = number + }) + }) + default = { + dev = { + cpu = 2048, + memory = 4096, + max_desired_workers = "5" + task_retries = "0" + workers_logs_retention_days = 1 + }, + staging = { + cpu = 4096, + memory = 8192, + max_desired_workers = "10", + task_retries = "1", + workers_logs_retention_days = 1 + }, + prod = { + cpu = 8192, + memory = 16384, + max_desired_workers = "30", + task_retries = "1", + workers_logs_retention_days = 14 + } + } +} + + +variable "gh_app_client_id" { + +} +variable "gh_app_client_secret" { + +} +variable "gh_team_name" { + +} + + +variable "airflow_custom_variables" { + description = "Airflow custom variables" + type = map(string) + default = {} +} + diff --git a/sm2a/plugins/.gitkeep b/sm2a/plugins/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/sm2a/scripts/deploy.sh b/sm2a/scripts/deploy.sh new file mode 100755 index 00000000..cd1a469d --- /dev/null +++ b/sm2a/scripts/deploy.sh @@ -0,0 +1,109 @@ +#! /bin/bash +# Check .env file + + +DOT_ENV=$1 + +if [ -f $DOT_ENV ] +then + set -a; source $DOT_ENV; set +a +else + echo "Run: ./scripts/deploy.sh <.env_file>" + echo "Please create $DOT_ENV file first and try again" + exit 1 +fi + +function create_state_bucket { + # $1 region + # $2 bucket_name + + aws s3 mb s3://$2 --region $1 + aws s3api put-bucket-versioning \ + --bucket $2 \ + --versioning-configuration Status=Enabled +} + +function create_dynamo_db { + # $1 region + # $2 table_name + aws dynamodb create-table \ + --table-name $2 \ + --attribute-definitions AttributeName=LockID,AttributeType=S \ + --key-schema AttributeName=LockID,KeyType=HASH \ + --billing-mode PAY_PER_REQUEST \ + --region $1 + +} +function generate_terraform_variables { + tf_vars=(tf tfvars) + for tf_var in "${tf_vars[@]}"; do + ( + echo "cat < terraform.${tf_var} + done + +} + +function check_create_remote_state { + # $1 aws_region + # $2 bucket name + # $3 dynamotable_name + AWS_REGION=$1 + STATE_BUCKET_NAME=$2 + STATE_DYNAMO_TABLE=$3 + + bucketstatus=$(aws s3api head-bucket --bucket $STATE_BUCKET_NAME 2>&1) + table_exists=$(aws dynamodb describe-table --table-name $STATE_DYNAMO_TABLE --region $AWS_REGION 2>&1) + + if echo "${table_exists}" | grep "An error"; + then + echo "Creating dynamodb table for TF state" + create_dynamo_db $AWS_REGION $STATE_DYNAMO_TABLE + else + echo "DynamoDB $STATE_DYNAMO_TABLE exists. Continue..." + fi + + if echo "${bucketstatus}" | grep 'Not Found'; + then + echo "Creating TF remote state" + create_state_bucket $AWS_REGION $STATE_BUCKET_NAME + create_dynamo_db $AWS_REGION $STATE_DYNAMO_TABLE + elif echo "${bucketstatus}" | grep 'Forbidden'; + then + echo "Bucket $STATE_BUCKET_NAME exists but not owned" + exit 1 + elif echo "${bucketstatus}" | grep 'Bad Request'; + then + echo "Bucket $STATE_BUCKET_NAME specified is less than 3 or greater than 63 characters" + exit 1 + else + echo "State Bucket $STATE_BUCKET_NAME owned and exists. Continue..."; + echo "State Dynamo table $STATE_DYNAMO_TABLE owned and exists. 
Continue..."; + fi +} + + +cd ./infrastructure +generate_terraform_variables +check_create_remote_state $AWS_REGION $STATE_BUCKET_NAME $STATE_DYNAMO_TABLE + +read -rp 'action [init|plan|deploy]: ' ACTION +case $ACTION in + init) + terraform init + ;; + plan) + terraform plan + ;; + + deploy) + terraform apply --auto-approve + ;; + *) + echo "Chose from 'init', 'plan' or 'deploy'" + exit 1 + ;; +esac + diff --git a/sm2a/scripts/generate_env_file.py b/sm2a/scripts/generate_env_file.py new file mode 100644 index 00000000..da60b376 --- /dev/null +++ b/sm2a/scripts/generate_env_file.py @@ -0,0 +1,38 @@ +import boto3 +import json +from argparse import ArgumentParser + + +def get_secrets_as_env(secret_id, out_file): + sm_client = boto3.client("secretsmanager") + response = sm_client.get_secret_value(SecretId=secret_id) + secrets = json.loads(response["SecretString"]) + with open(out_file, "w") as _env: + for out_key in secrets: + out_value = secrets[out_key] + _env.write(f"{out_key}={out_value}\n") + + +if __name__ == "__main__": + parser = ArgumentParser( + prog="Generate_env_file", + description="Generate dot env file for deployment", + epilog="Contact Marouane for extra help", + ) + parser.add_argument( + "--secret-id", + dest="secret_id", + help="AWS secret id", + required=True, + ) + parser.add_argument( + "--env-file", + dest="env_file", + help=".env file to write to", + required=False, + default=".env", + ) + + args = parser.parse_args() + + get_secrets_as_env(secret_id=args.secret_id, out_file=args.env_file) diff --git a/sm2a/scripts/put_airflow_worker_autoscaling_metric_data.py b/sm2a/scripts/put_airflow_worker_autoscaling_metric_data.py new file mode 100644 index 00000000..dd9722b4 --- /dev/null +++ b/sm2a/scripts/put_airflow_worker_autoscaling_metric_data.py @@ -0,0 +1,296 @@ +import argparse +import logging +import sys +import time +from contextlib import contextmanager +from typing import List + +import botocore.session +from airflow.models import DagModel, TaskInstance +from airflow.settings import Session +from airflow.utils.state import State +from sqlalchemy import func + +logging.basicConfig(level=logging.INFO, stream=sys.stdout) + + +@contextmanager +def session_scope(_session): + """Provide a transactional scope around a series of operations.""" + try: + yield _session + finally: + _session.close() + + +def get_unique_hostnames_for_states(states: List[str]) -> List[str]: + """ + Returns the list of unique hostnames where tasks are in one of {states} + + See below for a list of possible states for a Task Instance + https://airflow.apache.org/docs/apache-airflow/stable/concepts/tasks.html#task-instances + """ + with session_scope(Session) as _session: + unique_hostnames_query = _session.query( + TaskInstance.hostname.distinct() + ).filter(TaskInstance.state.in_(states)) + hostnames = [result[0] for result in unique_hostnames_query] + return hostnames + + +def get_pending_tasks_count(states: List[str]): + """ + Returns the number of tasks in a 'queued' state. 
+ """ + with session_scope(Session) as _session: + pending_tasks_count = ( + _session.query(func.count(TaskInstance.task_id)) + .filter(TaskInstance.state.in_(states)) + .scalar() + ) + return pending_tasks_count + + +def get_tasks_info(ecs_client, cluster_name, service_name): + # List running tasks for the specified ECS service + response = ecs_client.list_tasks( + cluster=cluster_name, serviceName=service_name, desiredStatus="RUNNING" + ) + + # Extract and return the running task ARNs + running_task_arns = response.get("taskArns", []) + + tasks_info = ( + ecs_client.describe_tasks(cluster=cluster_name, tasks=running_task_arns) + if running_task_arns + else {"tasks": []} + ) + # Extract and return task information, including hostnames + task_info_list = [] + hostname = None + for task_info in tasks_info["tasks"]: + task_arn = task_info["taskArn"] + for detail in task_info["attachments"][0]["details"]: + if detail["name"] == "privateDnsName": + hostname = detail["value"] + continue + task_info_list.append({"task_arn": task_arn, "task_hostname": hostname}) + + return task_info_list + + +def scale_up_ecs_service(ecs_client, cluster_name, service_name, max_desired_count): + # Scale up the ECS service by updating the desired count + response = ecs_client.describe_services( + cluster=cluster_name, services=[service_name] + ) + current_desired_count = response["services"][0]["desiredCount"] + + # Calculate the new desired count after scale-up + new_desired_count = min(current_desired_count + 1, max_desired_count) + if new_desired_count == max_desired_count: + logging.info("We reached the max needed tasks") + return + + # Scale Up the ECS service + ecs_client.update_service( + cluster=cluster_name, service=service_name, desiredCount=new_desired_count + ) + + print(f"ECS service {service_name} scaled up to {new_desired_count} tasks.") + + +def scale_down_ecs_service( + ecs_client, cluster_name, service_name, task_state, min_desired_count +): + tasks_to_kill = set() + tasks_to_not_kill = set() + # Get tasks info + tasks_info = get_tasks_info( + ecs_client=ecs_client, cluster_name=cluster_name, service_name=service_name + ) + hosts = get_unique_hostnames_for_states(states=task_state) + pending_tasks = get_pending_tasks_count(states=[State.QUEUED, State.SCHEDULED]) + if pending_tasks: + print(f"Nothing to scale down since we have {pending_tasks} tasks in the queue") + return + for task_info in tasks_info: + task_arn, task_hostname = task_info["task_arn"], task_info["task_hostname"] + tasks_to_kill.add(task_arn) + for host in hosts: + if host == task_hostname: + tasks_to_not_kill.add(task_arn) + tasks_to_kill = tasks_to_kill.difference(tasks_to_not_kill) + if len(tasks_to_kill) == 0: + print("No tasks to kill") + return + # Scale down the ECS service by updating the desired count + response = ecs_client.describe_services( + cluster=cluster_name, services=[service_name] + ) + current_desired_count = response["services"][0]["desiredCount"] + + # Calculate the new desired count after scale-down + new_desired_count = max( + min_desired_count, current_desired_count - len(tasks_to_kill) + ) + + if new_desired_count == current_desired_count: + logging.info("Nothing to do here") + return + + # Terminate specified tasks + # And keep desired count + tasks_to_kill = list(tasks_to_kill)[: len(tasks_to_kill) - new_desired_count] + + logging.info( + f"ECS service {service_name} scaled down to {current_desired_count - len(tasks_to_kill)} tasks." 
+ ) + for task_to_kill in tasks_to_kill: + print(f"Terminating task: {task_to_kill}") + ecs_client.stop_task(cluster=cluster_name, task=task_to_kill) + # Scale down the ECS service + ecs_client.update_service( + cluster=cluster_name, + service=service_name, + desiredCount=current_desired_count - len(tasks_to_kill), + ) + + +def get_task_count_where_state(states: List[str]) -> int: + """ + Returns the number of tasks in one of {states} + + See below for a list of possible states for a Task Instance + https://airflow.apache.org/docs/apache-airflow/stable/concepts/tasks.html#task-instances + """ + with session_scope(Session) as session: + tasks_query = ( + session.query( + TaskInstance.dag_id, + func.count("*").label("count"), + ) + .filter(TaskInstance.state.in_(states)) + .group_by(TaskInstance.dag_id) + .subquery() + ) + count = ( + session.query(func.sum(tasks_query.c.count)) + .join(DagModel, DagModel.dag_id == tasks_query.c.dag_id) + .filter( + DagModel.is_active.is_(True), + DagModel.is_paused.is_(False), + ) + .scalar() + ) + if count is None: + return 0 + return int(count) + + +def get_capacity_provider_reservation( + current_task_count: int, + current_worker_count: int, + desired_tasks_per_instance: int = 5, +) -> int: + """ + CapacityProviderReservation = M / N * 100 + + M is the number of instances you need. + N is the number of instances already up and running. + + If M and N are both zero, meaning no instances and no running tasks, then + CapacityProviderReservation = 100. If M > 0 and N = 0, meaning no instances and no + running tasks, but at least one required task, then CapacityProviderReservation = 200. + + The return value unit is a percentage. Scale airflow workers by applying this metric + in a target tracking scaling policy with a target value of 100. 
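    A quick worked example with illustrative numbers (not from a real deployment):
    12 active tasks with desired_tasks_per_instance=5 give M = 12 / 5 = 2.4, and
    with N = 2 running workers the metric is 2.4 / 2 * 100 = 120. That is above
    the target of 100, so the policy (and the main loop below) scales up.

    >>> get_capacity_provider_reservation(0, 0)
    100
    >>> get_capacity_provider_reservation(5, 0)
    200
    >>> get_capacity_provider_reservation(12, 2, 5)
    120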
+ + Source: + https://aws.amazon.com/blogs/containers/deep-dive-on-amazon-ecs-cluster-auto-scaling/ + """ + m = current_task_count / desired_tasks_per_instance + n = current_worker_count + if m == 0 and n == 0: + return 100 + elif m > 0 and n == 0: + return 200 + return int(m / n * 100) + + +# Publish a custom metric for worker scaling +# https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/publishingMetrics.html +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--cluster-name", + type=str, + required=True, + help="Cluster name used as metric dimension", + ) + parser.add_argument( + "--period", + type=int, + default=60, + help="The interval (in seconds) to call the put_metric_data API", + ) + parser.add_argument( + "--desired-count", + type=int, + default=0, + help="The desired workers", + ) + parser.add_argument( + "--region-name", type=str, required=True, help="AWS region name" + ) + parser.add_argument( + "--worker-service-name", + type=str, + required=True, + help="The name of the airflow worker ECS service.", + ) + args = parser.parse_args() + logging.info("Arguments parsed successfully") + + session = botocore.session.get_session() + cloudwatch = session.create_client("cloudwatch", region_name=args.region_name) + ecs = session.create_client("ecs", region_name=args.region_name) + task_count_pointer = 0 + while True: + + task_count = get_task_count_where_state( + states=[State.QUEUED, State.RUNNING, State.UP_FOR_RETRY] + ) + logging.info(f"NumberOfActiveRunningTasks: {task_count}") + + worker_service = ecs.describe_services( + cluster=args.cluster_name, services=[args.worker_service_name] + )["services"][0] + worker_count = worker_service["pendingCount"] + worker_service["runningCount"] + logging.info(f"NumberOfWorkers: {worker_count}") + + metric_value = get_capacity_provider_reservation( + current_task_count=task_count, + current_worker_count=worker_count, + desired_tasks_per_instance=10, + ) + if metric_value > 100: + logging.info(f"We are scaling up {metric_value}") + scale_up_ecs_service( + ecs_client=ecs, + cluster_name=args.cluster_name, + service_name=args.worker_service_name, + max_desired_count=args.desired_count, + ) + + elif metric_value < 100: + scale_down_ecs_service( + ecs_client=ecs, + cluster_name=args.cluster_name, + service_name=args.worker_service_name, + task_state=[State.RUNNING], + min_desired_count=1, + ) + + logging.info(f"Sleeping for {args.period} seconds") + time.sleep(args.period) diff --git a/sm2a/scripts/run_task.py b/sm2a/scripts/run_task.py new file mode 100644 index 00000000..a988e451 --- /dev/null +++ b/sm2a/scripts/run_task.py @@ -0,0 +1,184 @@ +import argparse +import os +import sys +from textwrap import dedent +from typing import List + +if sys.version_info.major < 3: + print("Please try again with python version 3+") + sys.exit(1) + +try: + import botocore.session +except ImportError: + print("Please install botocore and try again") + print("python -m pip install botocore") + sys.exit(1) + + +def list_public_subnet_ids(botocore_ec2_client, vpc_id: str) -> List[str]: + """ + Use botocore_ec2_client to obtain a list of public subnet ids for vpc named {vpc_name} + """ + + subnets = botocore_ec2_client.describe_subnets( + Filters=[{"Name": "vpc-id", "Values": [vpc_id]}] + ) + public_subnet_ids = [ + subnet["SubnetId"] + for subnet in subnets["Subnets"] + if subnet["MapPublicIpOnLaunch"] + ] + return public_subnet_ids + + +def get_security_group_id(botocore_ec2_client, security_group_name: str) -> str: 
+ """ + Use botocore_ec2_client to obtain the id of the security group named {security_group_name} + """ + res = botocore_ec2_client.describe_security_groups( + Filters=[{"Name": "group-name", "Values": [security_group_name]}] + ) + if not res["SecurityGroups"]: + raise Exception( + f"Security group where tag:Name='{security_group_name}' does not exist" + ) + return res["SecurityGroups"][0]["GroupId"] + + +if __name__ == "__main__": + prefix = os.getenv("PREFIX") + parser = argparse.ArgumentParser( + formatter_class=argparse.RawTextHelpFormatter, + description=dedent( + """ + Examples + -------- + Initialize the db + $ python3 scripts/run_task.py --public-subnet-ids subnet-xxx --security-group sg-xxx --command 'db init' + + Create an admin user + $ python3 scripts/run_task.py --public-subnet-ids subnet-xxx --security-group sg-xxx --command \\ + 'users create --username airflow --firstname airflow --lastname airflow --password airflow --email airflow@example.com --role Admin' + """ + ), + ) + parser.add_argument( + "--cluster", + type=str, + default=f"{prefix}-airflow", + help="The name of the target cluster. Defaults to 'airflow'.", + ) + parser.add_argument( + "--task-definition", + type=str, + default=f"{prefix}-standalone-task", + help="The name of the standalone task definition. Defaults to 'airflow-standalone-task'.", + ) + parser.add_argument( + "--container-name", + type=str, + default="airflow", + help="The name of the container in the standalone task definition. Defaults to 'airflow'.", + ) + parser.add_argument( + "--profile", + type=str, + default=os.getenv("AWS_PROFILE"), + help="The name of the awscli profile to use. Defaults to 'default'.", + ) + parser.add_argument( + "--vpc-id", + type=str, + default=os.getenv("VPC_ID"), + help="The shared VPC_ID", + ) + parser.add_argument( + "--security-group-name", + type=str, + default=f"{prefix}-standalone-task", + help="The name of the standalone task security group. Defaults to 'airflow-standalone-task'.", + ) + parser.add_argument( + "--command", + type=str, + required=True, + help=( + "Specify the command string *as a single string* to prevent parsing errors " + "(eg. 'users create --role Admin')" + ), + ) + parser.add_argument( + "--wait-tasks-stopped", + action="store_true", + default=False, + help="After calling run-task, wait until the task status returns STOPPED", + ) + parser.add_argument( + "--cpu", + type=int, + default=1024, + help="Specify cpu as an integer. Defaults to 1024.", + ) + parser.add_argument( + "--memory", + type=int, + default=2048, + help="Specify memory as an integer. 
Defaults to 2048.", + ) + parser.add_argument( + "--capacity-provider", + type=str, + default="FARGATE", + choices=["FARGATE", "FARGATE_SPOT"], + ) + args = parser.parse_args() + print("Arguments valid") + + print("Finding public subnet ids") + os.environ["AWS_DEFAULT_REGION"] = os.getenv("AWS_REGION") + session = botocore.session.Session(profile=args.profile) + ec2_client = session.create_client("ec2") + public_subnet_ids = list_public_subnet_ids(ec2_client, args.vpc_id) + + if not public_subnet_ids: + raise Exception(f"No public subnets available on VPC '{args.vpc_id}'") + + print("Finding security group id") + security_group_id = get_security_group_id(ec2_client, args.security_group_name) + + print("Submitting task to cluster") + ecs_client = session.create_client("ecs") + response = ecs_client.run_task( + capacityProviderStrategy=[{"capacityProvider": args.capacity_provider}], + cluster=args.cluster, + count=1, + networkConfiguration={ + "awsvpcConfiguration": { + "subnets": public_subnet_ids, + "securityGroups": [security_group_id], + "assignPublicIp": "ENABLED", + } + }, + overrides={ + "containerOverrides": [ + { + "name": args.container_name, + "command": args.command.split(" "), + "cpu": args.cpu, + "memory": args.memory, + }, + ], + "cpu": str(args.cpu), + "memory": str(args.memory), + }, + platformVersion="1.4.0", + taskDefinition=args.task_definition, + ) + task_arn = response["tasks"][0]["taskArn"] + print(f"Task arn: {task_arn}") + if args.wait_tasks_stopped: + print("Waiting until task stops") + waiter = ecs_client.get_waiter("tasks_stopped") + waiter.wait(cluster=args.cluster, tasks=[task_arn]) + print("Done") diff --git a/sm2a/sm2a-local-config/env_example b/sm2a/sm2a-local-config/env_example new file mode 100644 index 00000000..7843e2d4 --- /dev/null +++ b/sm2a/sm2a-local-config/env_example @@ -0,0 +1,4 @@ +AWS_ACCESS_KEY_ID=XXXXXXXXXX +AWS_SECRET_ACCESS_KEY=YYYYYYYYYYY +AWS_DEFAULT_REGION=us-west-2 +AWS_REGION=us-west-2 diff --git a/sm2a/sm2a-local-config/local_airflow.cfg b/sm2a/sm2a-local-config/local_airflow.cfg new file mode 100644 index 00000000..6507e4ad --- /dev/null +++ b/sm2a/sm2a-local-config/local_airflow.cfg @@ -0,0 +1,78 @@ +[api] +auth_backends = airflow.api.auth.backend.basic_auth + +[core] +executor = CeleryExecutor +dags_are_paused_at_creation = true +dags_folder = /opt/airflow/dags +load_examples = false +load_default_connections = false + + +[webserver] +dag_default_view = grid +expose_config = true +dag_orientation = TB +warn_deployment_exposure = false + +[metrics] +# The airflow scheduler sends statsd metrics over UDP to port 8125. +# https://airflow.apache.org/docs/apache-airflow/stable/logging-monitoring/logging-architecture.html +# You can verify this by setting the statsd_host to localhost or 0.0.0.0 and listening via netcat. +# Eg. docker compose exec airflow-scheduler nc -l -u -p 8125 127.0.0.1 +statsd_on = true +statsd_host = statsd-exporter +statsd_port = 8125 +statsd_prefix = airflow + +[scheduler] +catchup_by_default = false + +[celery] + +# This section only applies if you are using the CeleryExecutor in +# ``[core]`` section above +# The app name that will be used by celery +celery_app_name = airflow.executors.celery_executor + +# The concurrency that will be used when starting workers with the +# ``airflow celery worker`` command. 
This defines the number of task instances that +# a worker will take, so size up your workers based on the resources on +# your worker box and the nature of your tasks +worker_concurrency = 16 + +# The maximum and minimum concurrency that will be used when starting workers with the +# ``airflow celery worker`` command (always keep minimum processes, but grow +# to maximum if necessary). Note the value should be max_concurrency,min_concurrency +# Pick these numbers based on resources on worker box and the nature of the task. +# If autoscale option is available, worker_concurrency will be ignored. +# http://docs.celeryproject.org/en/latest/reference/celery.bin.worker.html#cmdoption-celery-worker-autoscale +# Example: worker_autoscale = 16,12 + +# Used to increase the number of tasks that a worker prefetches which can improve performance. +# The number of processes multiplied by worker_prefetch_multiplier is the number of tasks +# that are prefetched by a worker. A value greater than 1 can result in tasks being unnecessarily +# blocked if there are multiple workers and one worker prefetches tasks that sit behind long +# running tasks while another worker has unutilized processes that are unable to process the already +# claimed blocked tasks. +# https://docs.celeryproject.org/en/stable/userguide/optimizing.html#prefetch-limits +# Example: worker_prefetch_multiplier = 1 +# worker_prefetch_multiplier = + +# Umask that will be used when starting workers with the ``airflow celery worker`` +# in daemon mode. This control the file-creation mode mask which determines the initial +# value of file permission bits for newly created files. +worker_umask = 0o077 +# celery_config_options = deploy_airflow_on_ecs_fargate.celery_config.CELERY_CONFIG + +# The Celery broker URL. Celery supports RabbitMQ, Redis and experimentally +# a sqlalchemy database. Refer to the Celery documentation for more information. +# broker_url = redis://redis:6379/0 +broker_url = sqs://user:pass@celery-broker:9324/ +# +# [logging] +# logging_config_class = deploy_airflow_on_ecs_fargate.logging_config.STDOUT_LOGGING_CONFIG + +[secrets] +backend = airflow.providers.amazon.aws.secrets.secrets_manager.SecretsManagerBackend +backend_kwargs = {"connections_prefix": "sm2a-dev/airflow/connections", "variables_prefix": "sm2a-dev/airflow/variables","connections_lookup_pattern": "_default$", "variables_lookup_pattern": "^aws_", "config_prefix": "sm2a-dev/airflow/config"} diff --git a/sm2a/sm2a-local-config/local_webserver_config.py b/sm2a/sm2a-local-config/local_webserver_config.py new file mode 100644 index 00000000..3048bb21 --- /dev/null +++ b/sm2a/sm2a-local-config/local_webserver_config.py @@ -0,0 +1,132 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
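# Sketch, based on assumptions rather than on this repository: the local config
# below sticks with simple database-backed login (AUTH_DB). The Terraform
# variables gh_app_client_id / gh_app_client_secret / gh_team_name suggest that
# the deployed SM2A webserver authenticates against GitHub instead, which
# Flask-AppBuilder is typically wired for roughly as follows (environment
# variable names here are hypothetical):
#
#   from flask_appbuilder.const import AUTH_OAUTH
#   AUTH_TYPE = AUTH_OAUTH
#   OAUTH_PROVIDERS = [{
#       "name": "github",
#       "icon": "fa-github",
#       "token_key": "access_token",
#       "remote_app": {
#           "client_id": os.getenv("GH_APP_CLIENT_ID"),
#           "client_secret": os.getenv("GH_APP_CLIENT_SECRET"),
#           "api_base_url": "https://api.github.com",
#           "client_kwargs": {"scope": "read:user, read:org"},
#           "access_token_url": "https://github.com/login/oauth/access_token",
#           "authorize_url": "https://github.com/login/oauth/authorize",
#       },
#   }]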
+"""Default configuration for the Airflow webserver.""" + +from __future__ import annotations + +import os + +from flask_appbuilder.const import AUTH_DB + +# from airflow.www.fab_security.manager import AUTH_LDAP +# from airflow.www.fab_security.manager import AUTH_OAUTH +# from airflow.www.fab_security.manager import AUTH_OID +# from airflow.www.fab_security.manager import AUTH_REMOTE_USER + + +basedir = os.path.abspath(os.path.dirname(__file__)) + +# Flask-WTF flag for CSRF +WTF_CSRF_ENABLED = True +WTF_CSRF_TIME_LIMIT = None + +# ---------------------------------------------------- +# AUTHENTICATION CONFIG +# ---------------------------------------------------- +# For details on how to set up each of the following authentication, see +# http://flask-appbuilder.readthedocs.io/en/latest/security.html# authentication-methods +# for details. + +# The authentication type +# AUTH_OID : Is for OpenID +# AUTH_DB : Is for database +# AUTH_LDAP : Is for LDAP +# AUTH_REMOTE_USER : Is for using REMOTE_USER from web server +# AUTH_OAUTH : Is for OAuth +AUTH_TYPE = AUTH_DB + +# Uncomment to setup Full admin role name +# AUTH_ROLE_ADMIN = 'Admin' + +# Uncomment and set to desired role to enable access without authentication +# AUTH_ROLE_PUBLIC = 'Viewer' + +# Will allow user self registration +# AUTH_USER_REGISTRATION = True + +# The recaptcha it's automatically enabled for user self registration is active and the keys are necessary +# RECAPTCHA_PRIVATE_KEY = PRIVATE_KEY +# RECAPTCHA_PUBLIC_KEY = PUBLIC_KEY + +# Config for Flask-Mail necessary for user self registration +# MAIL_SERVER = 'smtp.gmail.com' +# MAIL_USE_TLS = True +# MAIL_USERNAME = 'yourappemail@gmail.com' +# MAIL_PASSWORD = 'passwordformail' +# MAIL_DEFAULT_SENDER = 'sender@gmail.com' + +# The default user self registration role +# AUTH_USER_REGISTRATION_ROLE = "Public" + +# When using OAuth Auth, uncomment to setup provider(s) info +# Google OAuth example: +# OAUTH_PROVIDERS = [{ +# 'name':'google', +# 'token_key':'access_token', +# 'icon':'fa-google', +# 'remote_app': { +# 'api_base_url':'https://www.googleapis.com/oauth2/v2/', +# 'client_kwargs':{ +# 'scope': 'email profile' +# }, +# 'access_token_url':'https://accounts.google.com/o/oauth2/token', +# 'authorize_url':'https://accounts.google.com/o/oauth2/auth', +# 'request_token_url': None, +# 'client_id': GOOGLE_KEY, +# 'client_secret': GOOGLE_SECRET_KEY, +# } +# }] + +# When using LDAP Auth, setup the ldap server +# AUTH_LDAP_SERVER = "ldap://ldapserver.new" + +# When using OpenID Auth, uncomment to setup OpenID providers. +# example for OpenID authentication +# OPENID_PROVIDERS = [ +# { 'name': 'Yahoo', 'url': 'https://me.yahoo.com' }, +# { 'name': 'AOL', 'url': 'http://openid.aol.com/' }, +# { 'name': 'Flickr', 'url': 'http://www.flickr.com/' }, +# { 'name': 'MyOpenID', 'url': 'https://www.myopenid.com' }] + +# ---------------------------------------------------- +# Theme CONFIG +# ---------------------------------------------------- +# Flask App Builder comes up with a number of predefined themes +# that you can use for Apache Airflow. +# http://flask-appbuilder.readthedocs.io/en/latest/customizing.html#changing-themes +# Please make sure to remove "navbar_color" configuration from airflow.cfg +# in order to fully utilize the theme. 
(or use that property in conjunction with theme) +# APP_THEME = "bootstrap-theme.css" # default bootstrap +# APP_THEME = "amelia.css" +# APP_THEME = "cerulean.css" +# APP_THEME = "cosmo.css" +# APP_THEME = "cyborg.css" +# APP_THEME = "darkly.css" +# APP_THEME = "flatly.css" +# APP_THEME = "journal.css" +# APP_THEME = "lumen.css" +# APP_THEME = "paper.css" +# APP_THEME = "readable.css" +# APP_THEME = "sandstone.css" +# APP_THEME = "simplex.css" +# APP_THEME = "slate.css" +# APP_THEME = "solar.css" +# APP_THEME = "spacelab.css" +# APP_THEME = "superhero.css" +# APP_THEME = "united.css" +# APP_THEME = "yeti.css" From c82e3df2fb9a03fbc2743d1af3fe3c6ac5bf03bd Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 17 Jul 2024 13:07:12 -0500 Subject: [PATCH 02/97] Deploy sm2a in dev --- .github/workflows/cicd.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index deb96353..60942ee0 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -10,6 +10,7 @@ on: - main - dev - production + - deploy-sm2a pull_request: branches: - main From f282c01bf0253af50375467f175d5581cd4cbb01 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 17 Jul 2024 13:15:19 -0500 Subject: [PATCH 03/97] Deploy sm2a in dev env --- .github/workflows/cicd.yml | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index 60942ee0..0b4ed753 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -78,27 +78,6 @@ jobs: uses: "./.github/actions/terraform-deploy/action.yml" with: env_aws_secret_name: ${{ secrets.ENV_AWS_SECRET_NAME }} - - - deploy-sm2a: - name: Deploy to ${{ needs.define-environment.outputs.env_name }} 🚀 - runs-on: self-hosted - if: ${{ needs.define-environment.outputs.env_name }} - needs: [gitflow-enforcer, define-environment] - environment: ${{ needs.define-environment.outputs.env_name }} - concurrency: ${{ needs.define-environment.outputs.env_name }} - - steps: - - name: Checkout - uses: actions/checkout@v3 - - - name: Configure AWS Credentials - uses: aws-actions/configure-aws-credentials@v2 - with: - role-to-assume: ${{ secrets.DEPLOYMENT_ROLE_ARN }} - role-session-name: "veda-airflow-github-sm2a-${{ needs.define-environment.outputs.env_name }}-deployment" - aws-region: "us-west-2" - - name: Run sm2a deployment uses: "./.github/actions/terraform-deploy/sm2a-action.yml" with: From 38450c6bd41e9a991785546e3530c2a2c61e478e Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 17 Jul 2024 13:17:54 -0500 Subject: [PATCH 04/97] Deploy sm2a in dev env --- .../sm2a_action.yml | 0 .github/workflows/cicd.yml | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename .github/actions/{terraform-deploy => terraform-deploy-sm2a}/sm2a_action.yml (100%) diff --git a/.github/actions/terraform-deploy/sm2a_action.yml b/.github/actions/terraform-deploy-sm2a/sm2a_action.yml similarity index 100% rename from .github/actions/terraform-deploy/sm2a_action.yml rename to .github/actions/terraform-deploy-sm2a/sm2a_action.yml diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index 0b4ed753..d7653a10 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -75,11 +75,11 @@ jobs: aws-region: "us-west-2" - name: Run deployment - uses: "./.github/actions/terraform-deploy/action.yml" + uses: "./.github/actions/terraform-deployl" with: env_aws_secret_name: ${{ secrets.ENV_AWS_SECRET_NAME }} - name: Run sm2a deployment 
- uses: "./.github/actions/terraform-deploy/sm2a-action.yml" + uses: "./.github/actions/terraform-deploy-sm2a" with: env_aws_secret_name: veda-sm2a-dev-deployment-secrets env-file: .env From 127275d51b8044deba4b73b398354ac35fc05987 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 17 Jul 2024 13:22:37 -0500 Subject: [PATCH 05/97] Deploy sm2a in dev env --- .github/workflows/cicd.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index d7653a10..1dcc3b19 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -74,10 +74,6 @@ jobs: role-session-name: "veda-airflow-github-${{ needs.define-environment.outputs.env_name }}-deployment" aws-region: "us-west-2" - - name: Run deployment - uses: "./.github/actions/terraform-deployl" - with: - env_aws_secret_name: ${{ secrets.ENV_AWS_SECRET_NAME }} - name: Run sm2a deployment uses: "./.github/actions/terraform-deploy-sm2a" with: From 0d4e1add472d5ad9d3f8738bc52d4ab4a487de96 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 17 Jul 2024 13:24:02 -0500 Subject: [PATCH 06/97] Deploy sm2a in dev env --- .../actions/terraform-deploy-sm2a/{sm2a_action.yml => action.yml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/actions/terraform-deploy-sm2a/{sm2a_action.yml => action.yml} (100%) diff --git a/.github/actions/terraform-deploy-sm2a/sm2a_action.yml b/.github/actions/terraform-deploy-sm2a/action.yml similarity index 100% rename from .github/actions/terraform-deploy-sm2a/sm2a_action.yml rename to .github/actions/terraform-deploy-sm2a/action.yml From ac0eb55e926e29c1b74545efae2beeba37fe2260 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 17 Jul 2024 13:27:23 -0500 Subject: [PATCH 07/97] Deploy sm2a with AWS SSM envs --- .github/actions/terraform-deploy-sm2a/action.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/actions/terraform-deploy-sm2a/action.yml b/.github/actions/terraform-deploy-sm2a/action.yml index d0947ace..d269c1f4 100644 --- a/.github/actions/terraform-deploy-sm2a/action.yml +++ b/.github/actions/terraform-deploy-sm2a/action.yml @@ -46,10 +46,9 @@ runs: shell: bash working-directory: ${{ inputs.sm2a_dir }} env: - SECRET_SSM_NAME: ${{ inputs.env_aws_secret_name }} AWS_DEFAULT_REGION: us-west-2 run: | - python scripts/generate_env_file.py --secret-id ${{ vars.DEPLOYMENT_ENV_SECRET_NAME }} --env-file ${{ inputs.env-file }} + python scripts/generate_env_file.py --secret-id ${{ inputs.env_aws_secret_name }} --env-file ${{ inputs.env-file }} From ac9935001c51f8fc092b45b2857b8d4de618b987 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 17 Jul 2024 13:36:13 -0500 Subject: [PATCH 08/97] Update terraform action --- .github/actions/terraform-deploy-sm2a/action.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/actions/terraform-deploy-sm2a/action.yml b/.github/actions/terraform-deploy-sm2a/action.yml index d269c1f4..c5115e7f 100644 --- a/.github/actions/terraform-deploy-sm2a/action.yml +++ b/.github/actions/terraform-deploy-sm2a/action.yml @@ -53,9 +53,9 @@ runs: - name: Setup Terraform - uses: hashicorp/setup-terraform@v3 + uses: hashicorp/setup-terraform@v1 with: - terraform_version: 1.6.6 + terraform_version: 1.3.3 - name: Deploy shell: bash From 6ec1ce29a1936242bff6d40aadeab69ed1ffa22a Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 17 Jul 2024 14:10:06 -0500 Subject: [PATCH 09/97] Update terraform action --- 
.github/actions/terraform-deploy/action.yml | 1 + .github/workflows/cicd.yml | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/.github/actions/terraform-deploy/action.yml b/.github/actions/terraform-deploy/action.yml index ab963a1c..e2ed62ef 100644 --- a/.github/actions/terraform-deploy/action.yml +++ b/.github/actions/terraform-deploy/action.yml @@ -61,3 +61,4 @@ runs: working-directory: ${{ inputs.dir }} run: | ./scripts/deploy.sh ${{ inputs.env-file }} <<< init + ./scripts/deploy.sh ${{ inputs.env-file }} <<< plan diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index 1dcc3b19..586b70a2 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -74,6 +74,11 @@ jobs: role-session-name: "veda-airflow-github-${{ needs.define-environment.outputs.env_name }}-deployment" aws-region: "us-west-2" + - name: Run deployment + uses: "./.github/actions/terraform-deploy" + with: + env_aws_secret_name: ${{ secrets.ENV_AWS_SECRET_NAME }} + - name: Run sm2a deployment uses: "./.github/actions/terraform-deploy-sm2a" with: From aa8cf07438288787a96268ff230702a961fd4ab1 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 17 Jul 2024 14:31:07 -0500 Subject: [PATCH 10/97] Update terraform action from env --- .github/actions/terraform-deploy-sm2a/action.yml | 1 + sm2a/airflow_worker/Dockerfile | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/actions/terraform-deploy-sm2a/action.yml b/.github/actions/terraform-deploy-sm2a/action.yml index c5115e7f..a5101978 100644 --- a/.github/actions/terraform-deploy-sm2a/action.yml +++ b/.github/actions/terraform-deploy-sm2a/action.yml @@ -62,5 +62,6 @@ runs: working-directory: ${{ inputs.sm2a_dir }} run: | cp -r ../dags . + ls -al ./scripts/deploy.sh ${{ inputs.env-file }} <<< init ./scripts/deploy.sh ${{ inputs.env-file }} <<< plan diff --git a/sm2a/airflow_worker/Dockerfile b/sm2a/airflow_worker/Dockerfile index f69ea1ec..6d4642be 100644 --- a/sm2a/airflow_worker/Dockerfile +++ b/sm2a/airflow_worker/Dockerfile @@ -41,6 +41,7 @@ COPY --chown=airflow:airflow scripts "${AIRFLOW_HOME}/scripts" RUN cp ${AIRFLOW_HOME}/configuration/airflow.cfg* ${AIRFLOW_HOME}/. + RUN pip install pypgstac==0.7.4 # ENV From 960882822bf95d95dffbbcb8024f18f84b04a86d Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 17 Jul 2024 14:36:57 -0500 Subject: [PATCH 11/97] debug tf actions --- .github/actions/terraform-deploy-sm2a/action.yml | 2 ++ sm2a/scripts/deploy.sh | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/actions/terraform-deploy-sm2a/action.yml b/.github/actions/terraform-deploy-sm2a/action.yml index a5101978..c44d8ef1 100644 --- a/.github/actions/terraform-deploy-sm2a/action.yml +++ b/.github/actions/terraform-deploy-sm2a/action.yml @@ -61,7 +61,9 @@ runs: shell: bash working-directory: ${{ inputs.sm2a_dir }} run: | + ls -al cp -r ../dags . 
ls -al + cat ./scripts/deploy.sh ${{ inputs.env-file }} <<< init ./scripts/deploy.sh ${{ inputs.env-file }} <<< plan diff --git a/sm2a/scripts/deploy.sh b/sm2a/scripts/deploy.sh index cd1a469d..1b068c2e 100755 --- a/sm2a/scripts/deploy.sh +++ b/sm2a/scripts/deploy.sh @@ -88,7 +88,7 @@ function check_create_remote_state { cd ./infrastructure generate_terraform_variables check_create_remote_state $AWS_REGION $STATE_BUCKET_NAME $STATE_DYNAMO_TABLE - +cat terraform.tf read -rp 'action [init|plan|deploy]: ' ACTION case $ACTION in init) From dd2c08603753c1657b9aa4b79f4a9786a5883cf6 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 17 Jul 2024 14:44:48 -0500 Subject: [PATCH 12/97] debug tf actions with .env --- .github/actions/terraform-deploy-sm2a/action.yml | 4 +--- .github/workflows/cicd.yml | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/actions/terraform-deploy-sm2a/action.yml b/.github/actions/terraform-deploy-sm2a/action.yml index c44d8ef1..0e97950a 100644 --- a/.github/actions/terraform-deploy-sm2a/action.yml +++ b/.github/actions/terraform-deploy-sm2a/action.yml @@ -61,9 +61,7 @@ runs: shell: bash working-directory: ${{ inputs.sm2a_dir }} run: | - ls -al cp -r ../dags . - ls -al - cat + cat ${{ inputs.env-file }} ./scripts/deploy.sh ${{ inputs.env-file }} <<< init ./scripts/deploy.sh ${{ inputs.env-file }} <<< plan diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index 586b70a2..20b4a33e 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -72,7 +72,7 @@ jobs: with: role-to-assume: ${{ secrets.DEPLOYMENT_ROLE_ARN }} role-session-name: "veda-airflow-github-${{ needs.define-environment.outputs.env_name }}-deployment" - aws-region: "us-west-2" + aws-region: us-west-2 - name: Run deployment uses: "./.github/actions/terraform-deploy" From 32b6949fc211560931e00ad20db271743fe5a3a5 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 17 Jul 2024 14:51:59 -0500 Subject: [PATCH 13/97] debug tf actions with .env --- .github/actions/terraform-deploy-sm2a/action.yml | 1 + sm2a/infrastructure/terraform.tf.tmpl | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/actions/terraform-deploy-sm2a/action.yml b/.github/actions/terraform-deploy-sm2a/action.yml index 0e97950a..7506542e 100644 --- a/.github/actions/terraform-deploy-sm2a/action.yml +++ b/.github/actions/terraform-deploy-sm2a/action.yml @@ -63,5 +63,6 @@ runs: run: | cp -r ../dags . 
cat ${{ inputs.env-file }} + echo $AWS_REGION ./scripts/deploy.sh ${{ inputs.env-file }} <<< init ./scripts/deploy.sh ${{ inputs.env-file }} <<< plan diff --git a/sm2a/infrastructure/terraform.tf.tmpl b/sm2a/infrastructure/terraform.tf.tmpl index 57aac9e2..c567acb8 100644 --- a/sm2a/infrastructure/terraform.tf.tmpl +++ b/sm2a/infrastructure/terraform.tf.tmpl @@ -1,6 +1,6 @@ terraform { backend "s3" { - region = "${AWS_REGION:-us-west-2}" + region = "us-west-2" bucket = "${STATE_BUCKET_NAME}" key = "${STATE_BUCKET_KEY}" dynamodb_table = "${STATE_DYNAMO_TABLE}" From c294d179f8ed0449cdf3676a7297912ba859aa45 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 17 Jul 2024 14:57:02 -0500 Subject: [PATCH 14/97] debug tf actions with .env --- .github/actions/terraform-deploy-sm2a/action.yml | 9 ++++++--- .github/workflows/cicd.yml | 4 ++-- sm2a/infrastructure/terraform.tf.tmpl | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.github/actions/terraform-deploy-sm2a/action.yml b/.github/actions/terraform-deploy-sm2a/action.yml index 7506542e..ca4f63f0 100644 --- a/.github/actions/terraform-deploy-sm2a/action.yml +++ b/.github/actions/terraform-deploy-sm2a/action.yml @@ -7,7 +7,7 @@ inputs: aws-region: type: string required: false - default: us-east-1 + default: us-west-2 env-file: type: string required: true @@ -46,7 +46,8 @@ runs: shell: bash working-directory: ${{ inputs.sm2a_dir }} env: - AWS_DEFAULT_REGION: us-west-2 + AWS_DEFAULT_REGION: ${{ inputs.aws_region }} + AWS_REGION: ${{ inputs.aws_region }} run: | python scripts/generate_env_file.py --secret-id ${{ inputs.env_aws_secret_name }} --env-file ${{ inputs.env-file }} @@ -60,9 +61,11 @@ runs: - name: Deploy shell: bash working-directory: ${{ inputs.sm2a_dir }} + env: + AWS_DEFAULT_REGION: ${{ inputs.aws_region }} + AWS_REGION: ${{ inputs.aws_region }} run: | cp -r ../dags . 
- cat ${{ inputs.env-file }} echo $AWS_REGION ./scripts/deploy.sh ${{ inputs.env-file }} <<< init ./scripts/deploy.sh ${{ inputs.env-file }} <<< plan diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index 20b4a33e..98572722 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -74,12 +74,12 @@ jobs: role-session-name: "veda-airflow-github-${{ needs.define-environment.outputs.env_name }}-deployment" aws-region: us-west-2 - - name: Run deployment + - name: Run MWAA deployment uses: "./.github/actions/terraform-deploy" with: env_aws_secret_name: ${{ secrets.ENV_AWS_SECRET_NAME }} - - name: Run sm2a deployment + - name: Run SM2A deployment uses: "./.github/actions/terraform-deploy-sm2a" with: env_aws_secret_name: veda-sm2a-dev-deployment-secrets diff --git a/sm2a/infrastructure/terraform.tf.tmpl b/sm2a/infrastructure/terraform.tf.tmpl index c567acb8..57aac9e2 100644 --- a/sm2a/infrastructure/terraform.tf.tmpl +++ b/sm2a/infrastructure/terraform.tf.tmpl @@ -1,6 +1,6 @@ terraform { backend "s3" { - region = "us-west-2" + region = "${AWS_REGION:-us-west-2}" bucket = "${STATE_BUCKET_NAME}" key = "${STATE_BUCKET_KEY}" dynamodb_table = "${STATE_DYNAMO_TABLE}" From 7b8db3fe8194ab8f7b52709eec4bed2b3d8ad561 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 17 Jul 2024 15:04:08 -0500 Subject: [PATCH 15/97] debug tf actions with .env --- .github/actions/terraform-deploy-sm2a/action.yml | 8 ++++---- .github/workflows/cicd.yml | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/actions/terraform-deploy-sm2a/action.yml b/.github/actions/terraform-deploy-sm2a/action.yml index ca4f63f0..ce8ee030 100644 --- a/.github/actions/terraform-deploy-sm2a/action.yml +++ b/.github/actions/terraform-deploy-sm2a/action.yml @@ -46,8 +46,8 @@ runs: shell: bash working-directory: ${{ inputs.sm2a_dir }} env: - AWS_DEFAULT_REGION: ${{ inputs.aws_region }} - AWS_REGION: ${{ inputs.aws_region }} + AWS_DEFAULT_REGION: ${{ inputs.aws-region }} + AWS_REGION: ${{ inputs.aws-region }} run: | python scripts/generate_env_file.py --secret-id ${{ inputs.env_aws_secret_name }} --env-file ${{ inputs.env-file }} @@ -62,8 +62,8 @@ runs: shell: bash working-directory: ${{ inputs.sm2a_dir }} env: - AWS_DEFAULT_REGION: ${{ inputs.aws_region }} - AWS_REGION: ${{ inputs.aws_region }} + AWS_DEFAULT_REGION: ${{ inputs.aws-region }} + AWS_REGION: ${{ inputs.aws-region }} run: | cp -r ../dags . echo $AWS_REGION diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index 98572722..f72dd6df 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -84,3 +84,4 @@ jobs: with: env_aws_secret_name: veda-sm2a-dev-deployment-secrets env-file: .env + aws-region: us-west-2 From 8061ca6e0a48f4e033b0263371735379195bea5c Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 17 Jul 2024 15:30:09 -0500 Subject: [PATCH 16/97] Deploy SM2A from github actions --- .github/actions/terraform-deploy-sm2a/action.yml | 3 +-- dags/example_dag.py | 2 ++ sm2a/airflow_services/Dockerfile | 2 +- sm2a/airflow_worker/Dockerfile | 2 +- sm2a/infrastructure/main.tf | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/actions/terraform-deploy-sm2a/action.yml b/.github/actions/terraform-deploy-sm2a/action.yml index ce8ee030..2519c3ea 100644 --- a/.github/actions/terraform-deploy-sm2a/action.yml +++ b/.github/actions/terraform-deploy-sm2a/action.yml @@ -66,6 +66,5 @@ runs: AWS_REGION: ${{ inputs.aws-region }} run: | cp -r ../dags . 
- echo $AWS_REGION ./scripts/deploy.sh ${{ inputs.env-file }} <<< init - ./scripts/deploy.sh ${{ inputs.env-file }} <<< plan + ./scripts/deploy.sh ${{ inputs.env-file }} <<< deploy diff --git a/dags/example_dag.py b/dags/example_dag.py index e8491f09..73067a80 100644 --- a/dags/example_dag.py +++ b/dags/example_dag.py @@ -79,7 +79,9 @@ def push_to_cmr_task(text): ) end = EmptyOperator(task_id="end", dag=dag) + start >> discover_from_cmr + start >> discover_from_s3 >> move_files_to_maap_store ( [discover_from_cmr, move_files_to_maap_store] diff --git a/sm2a/airflow_services/Dockerfile b/sm2a/airflow_services/Dockerfile index 5a411875..1792a0d5 100644 --- a/sm2a/airflow_services/Dockerfile +++ b/sm2a/airflow_services/Dockerfile @@ -1,4 +1,4 @@ -FROM --platform=linux/arm64 apache/airflow:slim-2.8.4-python3.11 +FROM --platform=linux/amd64 apache/airflow:slim-2.8.4-python3.11 ARG AIRFLOW_VERSION=2.8.4 USER root # `apt-get autoremove` is used to remove packages that were automatically installed to satisfy diff --git a/sm2a/airflow_worker/Dockerfile b/sm2a/airflow_worker/Dockerfile index 6d4642be..58f9dead 100644 --- a/sm2a/airflow_worker/Dockerfile +++ b/sm2a/airflow_worker/Dockerfile @@ -1,4 +1,4 @@ -FROM --platform=linux/arm64 osgeo/gdal:ubuntu-small-3.6.3 +FROM --platform=linux/amd64 osgeo/gdal:ubuntu-small-3.6.3 ARG AIRFLOW_VERSION=2.8.4 ARG UNAME=airflow diff --git a/sm2a/infrastructure/main.tf b/sm2a/infrastructure/main.tf index 9e01bee3..5b9ba82d 100644 --- a/sm2a/infrastructure/main.tf +++ b/sm2a/infrastructure/main.tf @@ -19,7 +19,7 @@ resource "random_password" "password" { module "sma-base" { - source = "https://github.com/NASA-IMPACT/self-managed-apache-airflow/releases/download/v1.0.20/self-managed-apache-airflow.zip" + source = "https://github.com/NASA-IMPACT/self-managed-apache-airflow/releases/download/v1.0.22.dev/self-managed-apache-airflow.zip" airflow_db = var.airflow_db fernet_key = var.fernet_key prefix = var.prefix From c190c97303673295c8bd9a8b2e3ff16dde123aff Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 17 Jul 2024 15:35:15 -0500 Subject: [PATCH 17/97] Deploy SM2A from github actions --- .github/actions/terraform-deploy-sm2a/action.yml | 2 -- dags/example_dag.py | 5 ++++- sm2a/infrastructure/main.tf | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/actions/terraform-deploy-sm2a/action.yml b/.github/actions/terraform-deploy-sm2a/action.yml index 2519c3ea..de44a3b1 100644 --- a/.github/actions/terraform-deploy-sm2a/action.yml +++ b/.github/actions/terraform-deploy-sm2a/action.yml @@ -51,8 +51,6 @@ runs: run: | python scripts/generate_env_file.py --secret-id ${{ inputs.env_aws_secret_name }} --env-file ${{ inputs.env-file }} - - - name: Setup Terraform uses: hashicorp/setup-terraform@v1 with: diff --git a/dags/example_dag.py b/dags/example_dag.py index 73067a80..5133041c 100644 --- a/dags/example_dag.py +++ b/dags/example_dag.py @@ -43,13 +43,16 @@ def push_to_cmr_task(text): schedule_interval=None, tags=["example"], ) as dag: + start = EmptyOperator(task_id="start", dag=dag) + discover_from_cmr = PythonOperator( task_id="discover_from_cmr", python_callable=discover_from_cmr_task, op_kwargs={"text": "Discover from CMR"}, dag=dag, ) + discover_from_s3 = PythonOperator( task_id="discover_from_s3", python_callable=discover_from_s3_task, @@ -81,7 +84,7 @@ def push_to_cmr_task(text): end = EmptyOperator(task_id="end", dag=dag) start >> discover_from_cmr - + start >> discover_from_s3 >> move_files_to_maap_store ( [discover_from_cmr, 
move_files_to_maap_store] diff --git a/sm2a/infrastructure/main.tf b/sm2a/infrastructure/main.tf index 5b9ba82d..11428ac4 100644 --- a/sm2a/infrastructure/main.tf +++ b/sm2a/infrastructure/main.tf @@ -19,7 +19,7 @@ resource "random_password" "password" { module "sma-base" { - source = "https://github.com/NASA-IMPACT/self-managed-apache-airflow/releases/download/v1.0.22.dev/self-managed-apache-airflow.zip" + source = "https://github.com/NASA-IMPACT/self-managed-apache-airflow/releases/download/v1.0.22.dev.1/self-managed-apache-airflow.zip" airflow_db = var.airflow_db fernet_key = var.fernet_key prefix = var.prefix From cca41084466a4dc42c0023bca06695c32844413a Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 17 Jul 2024 15:57:29 -0500 Subject: [PATCH 18/97] Remove architecture dependency --- dags/example_dag.py | 2 +- sm2a/infrastructure/main.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dags/example_dag.py b/dags/example_dag.py index 5133041c..7bd11599 100644 --- a/dags/example_dag.py +++ b/dags/example_dag.py @@ -45,7 +45,7 @@ def push_to_cmr_task(text): ) as dag: start = EmptyOperator(task_id="start", dag=dag) - + discover_from_cmr = PythonOperator( task_id="discover_from_cmr", python_callable=discover_from_cmr_task, diff --git a/sm2a/infrastructure/main.tf b/sm2a/infrastructure/main.tf index 11428ac4..5daf2d37 100644 --- a/sm2a/infrastructure/main.tf +++ b/sm2a/infrastructure/main.tf @@ -19,7 +19,7 @@ resource "random_password" "password" { module "sma-base" { - source = "https://github.com/NASA-IMPACT/self-managed-apache-airflow/releases/download/v1.0.22.dev.1/self-managed-apache-airflow.zip" + source = "https://github.com/NASA-IMPACT/self-managed-apache-airflow/releases/download/v1.0.23/self-managed-apache-airflow.zip" airflow_db = var.airflow_db fernet_key = var.fernet_key prefix = var.prefix From 4c8e8050548ea75a6dc22939ba381a809939c530 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 17 Jul 2024 16:03:41 -0500 Subject: [PATCH 19/97] Run VEDA workers om X86 --- sm2a/airflow_services/Dockerfile | 1 + sm2a/airflow_worker/Dockerfile | 1 - sm2a/infrastructure/main.tf | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sm2a/airflow_services/Dockerfile b/sm2a/airflow_services/Dockerfile index 1792a0d5..b8d5ceeb 100644 --- a/sm2a/airflow_services/Dockerfile +++ b/sm2a/airflow_services/Dockerfile @@ -25,5 +25,6 @@ COPY --chown=airflow:airflow scripts "${AIRFLOW_HOME}/scripts" COPY --chown=airflow:airflow airflow_services/webserver_config.py "${AIRFLOW_HOME}/webserver_config.py" RUN cp ${AIRFLOW_HOME}/configuration/airflow.cfg* ${AIRFLOW_HOME}/. + #ENV ENV PYTHONPATH /opt/airflow diff --git a/sm2a/airflow_worker/Dockerfile b/sm2a/airflow_worker/Dockerfile index 58f9dead..0b13f92e 100644 --- a/sm2a/airflow_worker/Dockerfile +++ b/sm2a/airflow_worker/Dockerfile @@ -41,7 +41,6 @@ COPY --chown=airflow:airflow scripts "${AIRFLOW_HOME}/scripts" RUN cp ${AIRFLOW_HOME}/configuration/airflow.cfg* ${AIRFLOW_HOME}/. 
- RUN pip install pypgstac==0.7.4 # ENV diff --git a/sm2a/infrastructure/main.tf b/sm2a/infrastructure/main.tf index 5daf2d37..728ee780 100644 --- a/sm2a/infrastructure/main.tf +++ b/sm2a/infrastructure/main.tf @@ -19,7 +19,7 @@ resource "random_password" "password" { module "sma-base" { - source = "https://github.com/NASA-IMPACT/self-managed-apache-airflow/releases/download/v1.0.23/self-managed-apache-airflow.zip" + source = "https://github.com/NASA-IMPACT/self-managed-apache-airflow/releases/download/v1.0.23.beta/self-managed-apache-airflow.zip" airflow_db = var.airflow_db fernet_key = var.fernet_key prefix = var.prefix From 23565f8720eeb4a381a8c0b8b4d6eff6f86d2c63 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 17 Jul 2024 16:36:33 -0500 Subject: [PATCH 20/97] Make the task architecture configurable --- .github/actions/terraform-deploy/action.yml | 3 +- sm2a/.env_dev | 9 + sm2a/dags/__init__.py | 0 sm2a/dags/example_dag.py | 94 +++ sm2a/dags/generate_dags.py | 50 ++ sm2a/dags/rds_example_dag.py | 89 +++ sm2a/dags/requirements-constraints.txt | 663 ++++++++++++++++++ sm2a/dags/requirements.txt | 20 + sm2a/dags/veda_data_pipeline/__init__.py | 0 .../veda_data_pipeline/groups/__init__.py | 0 .../groups/collection_group.py | 79 +++ .../groups/discover_group.py | 110 +++ .../veda_data_pipeline/groups/ecs_tasks.py | 117 ++++ .../groups/processing_group.py | 95 +++ .../groups/transfer_group.py | 90 +++ .../veda_data_pipeline/requirements_dev.txt | 1 + sm2a/dags/veda_data_pipeline/utils/README.md | 26 + .../dags/veda_data_pipeline/utils/__init__.py | 0 .../utils/collection_generation.py | 138 ++++ .../veda_data_pipeline/utils/s3_discovery.py | 292 ++++++++ sm2a/dags/veda_data_pipeline/utils/schemas.py | 15 + .../veda_data_pipeline/utils/submit_stac.py | 136 ++++ .../dags/veda_data_pipeline/utils/transfer.py | 110 +++ .../veda_collection_pipeline.py | 49 ++ .../veda_dataset_pipeline.py | 80 +++ .../veda_discover_pipeline.py | 92 +++ .../veda_process_raster_pipeline.py | 52 ++ .../veda_process_vector_pipeline.py | 110 +++ .../veda_transfer_pipeline.py | 50 ++ sm2a/infrastructure/main.tf | 2 +- 30 files changed, 2569 insertions(+), 3 deletions(-) create mode 100644 sm2a/.env_dev create mode 100644 sm2a/dags/__init__.py create mode 100644 sm2a/dags/example_dag.py create mode 100644 sm2a/dags/generate_dags.py create mode 100644 sm2a/dags/rds_example_dag.py create mode 100644 sm2a/dags/requirements-constraints.txt create mode 100644 sm2a/dags/requirements.txt create mode 100644 sm2a/dags/veda_data_pipeline/__init__.py create mode 100644 sm2a/dags/veda_data_pipeline/groups/__init__.py create mode 100644 sm2a/dags/veda_data_pipeline/groups/collection_group.py create mode 100644 sm2a/dags/veda_data_pipeline/groups/discover_group.py create mode 100644 sm2a/dags/veda_data_pipeline/groups/ecs_tasks.py create mode 100644 sm2a/dags/veda_data_pipeline/groups/processing_group.py create mode 100644 sm2a/dags/veda_data_pipeline/groups/transfer_group.py create mode 100644 sm2a/dags/veda_data_pipeline/requirements_dev.txt create mode 100644 sm2a/dags/veda_data_pipeline/utils/README.md create mode 100644 sm2a/dags/veda_data_pipeline/utils/__init__.py create mode 100644 sm2a/dags/veda_data_pipeline/utils/collection_generation.py create mode 100644 sm2a/dags/veda_data_pipeline/utils/s3_discovery.py create mode 100644 sm2a/dags/veda_data_pipeline/utils/schemas.py create mode 100644 sm2a/dags/veda_data_pipeline/utils/submit_stac.py create mode 100644 sm2a/dags/veda_data_pipeline/utils/transfer.py create 
mode 100644 sm2a/dags/veda_data_pipeline/veda_collection_pipeline.py create mode 100644 sm2a/dags/veda_data_pipeline/veda_dataset_pipeline.py create mode 100644 sm2a/dags/veda_data_pipeline/veda_discover_pipeline.py create mode 100644 sm2a/dags/veda_data_pipeline/veda_process_raster_pipeline.py create mode 100644 sm2a/dags/veda_data_pipeline/veda_process_vector_pipeline.py create mode 100644 sm2a/dags/veda_data_pipeline/veda_transfer_pipeline.py diff --git a/.github/actions/terraform-deploy/action.yml b/.github/actions/terraform-deploy/action.yml index e2ed62ef..e4497394 100644 --- a/.github/actions/terraform-deploy/action.yml +++ b/.github/actions/terraform-deploy/action.yml @@ -60,5 +60,4 @@ runs: shell: bash working-directory: ${{ inputs.dir }} run: | - ./scripts/deploy.sh ${{ inputs.env-file }} <<< init - ./scripts/deploy.sh ${{ inputs.env-file }} <<< plan + echo "skip" diff --git a/sm2a/.env_dev b/sm2a/.env_dev new file mode 100644 index 00000000..7da48002 --- /dev/null +++ b/sm2a/.env_dev @@ -0,0 +1,9 @@ +STAGE=dev +APP_NAME=veda-pipeline +PREFIX=${APP_NAME}-${STAGE} +AWS_REGION=us-west-2 +SUBNET_TAGNAME="MWAAEnvironment Private*" +STATE_BUCKET_NAME=veda-tf-state-shared +STATE_BUCKET_KEY=veda-mwaa/${PREFIX}-mwaa/terraform.tfstate +STATE_DYNAMO_TABLE=${PREFIX}-shared-state-mwaa-lock-state +IAM_ROLE_PERMISSIONS_BOUNDARY=arn:aws:iam::${AWS_ACCOUNT_ID}:policy/mcp-tenantOperator diff --git a/sm2a/dags/__init__.py b/sm2a/dags/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/sm2a/dags/example_dag.py b/sm2a/dags/example_dag.py new file mode 100644 index 00000000..7bd11599 --- /dev/null +++ b/sm2a/dags/example_dag.py @@ -0,0 +1,94 @@ +import logging +import time + +import pendulum +from airflow import DAG +from airflow.operators.dummy_operator import DummyOperator as EmptyOperator +from airflow.operators.python import PythonOperator + + +def log_task(text: str): + logging.info(text) + + +def discover_from_cmr_task(text): + log_task(text) + + +def discover_from_s3_task(text): + log_task("I am discovering") + time.sleep(1) + log_task("Done discovering") + log_task(text) + + +def move_files_to_maap_store_task(text): + log_task("I am moving files") + time.sleep(3) + log_task("Done moving files") + log_task(text) + + +def generate_cmr_metadata_task(text): + log_task(text) + + +def push_to_cmr_task(text): + log_task(text) + + +with DAG( + dag_id="example_etl_flow_test", + start_date=pendulum.today("UTC").add(days=-1), + schedule_interval=None, + tags=["example"], +) as dag: + + start = EmptyOperator(task_id="start", dag=dag) + + discover_from_cmr = PythonOperator( + task_id="discover_from_cmr", + python_callable=discover_from_cmr_task, + op_kwargs={"text": "Discover from CMR"}, + dag=dag, + ) + + discover_from_s3 = PythonOperator( + task_id="discover_from_s3", + python_callable=discover_from_s3_task, + op_kwargs={"text": "Discover from S3"}, + dag=dag, + ) + + move_files_to_maap_store = PythonOperator( + task_id="move_files_to_maap_store", + python_callable=move_files_to_maap_store_task, + op_kwargs={"text": "Moving Files to MAAP store"}, + dag=dag, + ) + + generate_cmr_metadata = PythonOperator( + task_id="generate_cmr_metadata", + python_callable=generate_cmr_metadata_task, + op_kwargs={"text": "Generate CMR metadata"}, + dag=dag, + ) + + push_to_cmr = PythonOperator( + task_id="push_to_cmr", + python_callable=push_to_cmr_task, + op_kwargs={"text": "Push to CMR"}, + dag=dag, + ) + + end = EmptyOperator(task_id="end", dag=dag) + + start >> discover_from_cmr + + start >> 
discover_from_s3 >> move_files_to_maap_store + ( + [discover_from_cmr, move_files_to_maap_store] + >> generate_cmr_metadata + >> push_to_cmr + >> end + ) diff --git a/sm2a/dags/generate_dags.py b/sm2a/dags/generate_dags.py new file mode 100644 index 00000000..f20db9f3 --- /dev/null +++ b/sm2a/dags/generate_dags.py @@ -0,0 +1,50 @@ +""" +Builds a DAG for each collection (indicated by a .json file) in the /collections/ folder. +These DAGs are used to discover and ingest items for each collection. +""" + +from airflow.models.variable import Variable + +from veda_data_pipeline.veda_discover_pipeline import get_discover_dag + + +def generate_dags(): + import boto3 + import json + + from pathlib import Path + + + mwaa_stac_conf = Variable.get("MWAA_STACK_CONF", deserialize_json=True) + bucket = mwaa_stac_conf["EVENT_BUCKET"] + + client = boto3.client("s3") + response = client.list_objects_v2(Bucket=bucket, Prefix="collections/") + + for file_ in response.get("Contents", []): + key = file_["Key"] + if key.endswith("/"): + continue + file_name = Path(key).stem + result = client.get_object(Bucket=bucket, Key=key) + discovery_configs = result["Body"].read().decode() + discovery_configs = json.loads(discovery_configs) + + # Allow the file content to be either one config or a list of configs + if type(discovery_configs) is dict: + discovery_configs = [discovery_configs] + scheduled_discovery_configs = [ + discovery_config + for discovery_config in discovery_configs + if discovery_config.get("schedule") + ] + for idx, discovery_config in enumerate(scheduled_discovery_configs): + id = f"discover-{file_name}" + if idx > 0: + id = f"{id}-{idx}" + get_discover_dag( + id=id, event=discovery_config + ) + + +generate_dags() diff --git a/sm2a/dags/rds_example_dag.py b/sm2a/dags/rds_example_dag.py new file mode 100644 index 00000000..66420b79 --- /dev/null +++ b/sm2a/dags/rds_example_dag.py @@ -0,0 +1,89 @@ +from __future__ import annotations +from airflow import DAG +from airflow.providers.postgres.operators.postgres import PostgresOperator +from airflow.hooks.postgres_hook import PostgresHook +from datetime import datetime, date +import json +from airflow.decorators import task + + +def json_serial(obj): + """JSON serializer for objects not serializable by default json code""" + if isinstance(obj, (datetime, date)): + return obj.isoformat() + raise TypeError("Type %s not serializable" % type(obj)) + + +# [START postgres_operator_howto_guide] + + +# create_pet_table, populate_pet_table, get_all_pets, and get_birth_date are examples of tasks created by +# instantiating the Postgres Operator + +DAG_ID = "postgres_operator_dag" + + +with DAG( + dag_id=DAG_ID, + start_date=datetime(2020, 2, 2), + schedule="@once", + catchup=False, + tags=["example"], +) as dag: + # [START postgres_operator_howto_guide_create_pet_table] + create_pet_table = PostgresOperator( + postgres_conn_id="cluster_rds_connection", + task_id="create_pet_table", + sql=""" + CREATE TABLE IF NOT EXISTS pet ( + pet_id SERIAL PRIMARY KEY, + name VARCHAR NOT NULL, + pet_type VARCHAR NOT NULL, + birth_date DATE NOT NULL, + OWNER VARCHAR NOT NULL); + """, + ) + # [END postgres_operator_howto_guide_create_pet_table] + # [START postgres_operator_howto_guide_populate_pet_table] + populate_pet_table = PostgresOperator( + postgres_conn_id="cluster_rds_connection", + task_id="populate_pet_table", + sql=""" + INSERT INTO pet (name, pet_type, birth_date, OWNER) + VALUES ( 'Max', 'Dog', '2018-07-05', 'Jane'); + INSERT INTO pet (name, pet_type, birth_date, 
OWNER) + VALUES ( 'Susie', 'Cat', '2019-05-01', 'Phil'); + INSERT INTO pet (name, pet_type, birth_date, OWNER) + VALUES ( 'Lester', 'Hamster', '2020-06-23', 'Lily'); + INSERT INTO pet (name, pet_type, birth_date, OWNER) + VALUES ( 'Quincy', 'Parrot', '2013-08-11', 'Anne'); + """, + ) + # [END postgres_operator_howto_guide_populate_pet_table] + # [START postgres_operator_howto_guide_get_all_pets] + + @task + def get_all_pets(): + sql = "SELECT * FROM pet" + pg_hook = PostgresHook(postgres_conn_id="cluster_rds_connection") + connection = pg_hook.get_conn() + cursor = connection.cursor() + cursor.execute(sql) + results = cursor.fetchall() + for result in results: + print(result) + return {"results": json.dumps(results, default=json_serial)} + + # [END postgres_operator_howto_guide_get_all_pets] + # [START postgres_operator_howto_guide_get_birth_date] + get_birth_date = PostgresOperator( + postgres_conn_id="cluster_rds_connection", + task_id="get_birth_date", + sql="SELECT * FROM pet WHERE birth_date BETWEEN SYMMETRIC %(begin_date)s AND %(end_date)s", + parameters={"begin_date": "2020-01-01", "end_date": "2020-12-31"}, + runtime_parameters={"statement_timeout": "3000ms"}, + ) + # [END postgres_operator_howto_guide_get_birth_date] + + create_pet_table >> populate_pet_table >> get_all_pets() >> get_birth_date + # [END postgres_operator_howto_guide] diff --git a/sm2a/dags/requirements-constraints.txt b/sm2a/dags/requirements-constraints.txt new file mode 100644 index 00000000..a3bfd18b --- /dev/null +++ b/sm2a/dags/requirements-constraints.txt @@ -0,0 +1,663 @@ +# +# This constraints file was automatically generated on 2023-01-18T18:46:04Z +# via "eager-upgrade" mechanism of PIP. For the "v2-5-test" branch of Airflow. +# This variant of constraints install uses the HEAD of the branch version for 'apache-airflow' but installs +# the providers from PIP-released packages at the moment of the constraint generation. +# +# Those constraints are actually those that regular users use to install released version of Airflow. +# We also use those constraints after "apache-airflow" is released and the constraints are tagged with +# "constraints-X.Y.Z" tag to build the production image for that version. +# +# +# This constraints file is meant to be used only in the "apache-airflow" installation command and not +# in all subsequent pip commands. By using a constraints.txt file, we ensure that solely the Airflow +# installation step is reproducible. Subsequent pip commands may install packages that would have +# been incompatible with the constraints used in Airflow reproducible installation step. Finally, pip +# commands that might change the installed version of apache-airflow should include "apache-airflow==X.Y.Z" +# in the list of install targets to prevent Airflow accidental upgrade or downgrade. +# +# Typical installation process of airflow for Python 3.8 is (with random selection of extras and custom +# dependencies added), usually consists of two steps: +# +# 1. Reproducible installation of airflow with selected providers (note constraints are used): +# +# pip install "apache-airflow[celery,cncf.kubernetes,google,amazon,snowflake]==X.Y.Z" \ +# --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-X.Y.Z/constraints-3.8.txt" +# +# 2. Installing own dependencies that are potentially not matching the constraints (note constraints are not +# used, and apache-airflow==X.Y.Z is used to make sure there is no accidental airflow upgrade/downgrade. 
+# +# pip install "apache-airflow==X.Y.Z" "snowflake-connector-python[pandas]==2.9.0" +# +APScheduler==3.6.3 +Authlib==1.2.0 +Babel==2.11.0 +ConfigUpdater==3.1.1 +Deprecated==1.2.13 +Flask-AppBuilder==4.1.4 +Flask-Babel==2.0.0 +Flask-Bcrypt==1.0.1 +Flask-Caching==2.0.2 +Flask-JWT-Extended==4.4.4 +Flask-Login==0.6.2 +Flask-SQLAlchemy==2.5.1 +Flask-Session==0.4.0 +Flask-WTF==1.1.1 +Flask==2.2.2 +GitPython==3.1.30 +HeapDict==1.0.1 +JPype1==1.4.1 +JayDeBeApi==1.2.3 +Jinja2==3.1.2 +Mako==1.2.4 +Markdown==3.4.1 +MarkupSafe==2.1.2 +PyGithub==1.57 +PyHive==0.6.5 +PyJWT==2.6.0 +PyNaCl==1.5.0 +PyYAML==6.0 +Pygments==2.14.0 +SQLAlchemy-JSONField==1.0.1.post0 +SQLAlchemy-Utils==0.39.0 +SQLAlchemy==1.4.46 +SecretStorage==3.3.3 +Sphinx==5.3.0 +Unidecode==1.3.6 +WTForms==3.0.1 +Werkzeug==2.2.2 +adal==1.2.7 +aiofiles==22.1.0 +aiohttp==3.8.3 +aiosignal==1.3.1 +alabaster==0.7.13 +alembic==1.9.2 +aliyun-python-sdk-core==2.13.36 +aliyun-python-sdk-kms==2.16.0 +amqp==5.1.1 +analytics-python==1.4.0 +ansiwrap==0.8.4 +anyio==3.6.2 +apache-airflow-providers-airbyte==3.2.0 +apache-airflow-providers-alibaba==2.2.0 +apache-airflow-providers-amazon==7.1.0 +apache-airflow-providers-apache-beam==4.1.1 +apache-airflow-providers-apache-cassandra==3.1.0 +apache-airflow-providers-apache-drill==2.3.1 +apache-airflow-providers-apache-druid==3.3.1 +apache-airflow-providers-apache-hdfs==3.2.0 +apache-airflow-providers-apache-hive==5.1.1 +apache-airflow-providers-apache-kylin==3.1.0 +apache-airflow-providers-apache-livy==3.2.0 +apache-airflow-providers-apache-pig==4.0.0 +apache-airflow-providers-apache-pinot==4.0.1 +apache-airflow-providers-apache-spark==4.0.0 +apache-airflow-providers-apache-sqoop==3.1.0 +apache-airflow-providers-arangodb==2.1.0 +apache-airflow-providers-asana==2.1.0 +apache-airflow-providers-atlassian-jira==2.0.0 +apache-airflow-providers-celery==3.1.0 +apache-airflow-providers-cloudant==3.1.0 +apache-airflow-providers-cncf-kubernetes==5.1.1 +apache-airflow-providers-common-sql==1.3.3 +apache-airflow-providers-databricks==4.0.0 +apache-airflow-providers-datadog==3.1.0 +apache-airflow-providers-dbt-cloud==2.3.1 +apache-airflow-providers-dingding==3.1.0 +apache-airflow-providers-discord==3.1.0 +apache-airflow-providers-docker==3.4.0 +apache-airflow-providers-elasticsearch==4.3.3 +apache-airflow-providers-exasol==4.1.3 +apache-airflow-providers-facebook==3.1.0 +apache-airflow-providers-ftp==3.3.0 +apache-airflow-providers-github==2.2.0 +apache-airflow-providers-google==8.8.0 +apache-airflow-providers-grpc==3.1.0 +apache-airflow-providers-hashicorp==3.2.0 +apache-airflow-providers-http==4.1.1 +apache-airflow-providers-imap==3.1.1 +apache-airflow-providers-influxdb==2.1.0 +apache-airflow-providers-jdbc==3.3.0 +apache-airflow-providers-jenkins==3.2.0 +apache-airflow-providers-microsoft-azure==5.1.0 +apache-airflow-providers-microsoft-mssql==3.3.2 +apache-airflow-providers-microsoft-psrp==2.2.0 +apache-airflow-providers-microsoft-winrm==3.1.1 +apache-airflow-providers-mongo==3.1.1 +apache-airflow-providers-mysql==4.0.0 +apache-airflow-providers-neo4j==3.2.1 +apache-airflow-providers-odbc==3.2.1 +apache-airflow-providers-openfaas==3.1.0 +apache-airflow-providers-opsgenie==5.0.0 +apache-airflow-providers-oracle==3.6.0 +apache-airflow-providers-pagerduty==3.1.0 +apache-airflow-providers-papermill==3.1.0 +apache-airflow-providers-plexus==3.1.0 +apache-airflow-providers-postgres==5.4.0 +apache-airflow-providers-presto==4.2.1 +apache-airflow-providers-qubole==3.3.1 +apache-airflow-providers-redis==3.1.0 
+apache-airflow-providers-salesforce==5.3.0 +apache-airflow-providers-samba==4.1.0 +apache-airflow-providers-segment==3.1.0 +apache-airflow-providers-sendgrid==3.1.0 +apache-airflow-providers-sftp==4.2.1 +apache-airflow-providers-singularity==3.1.0 +apache-airflow-providers-slack==7.2.0 +apache-airflow-providers-snowflake==4.0.2 +apache-airflow-providers-sqlite==3.3.1 +apache-airflow-providers-ssh==3.4.0 +apache-airflow-providers-tableau==4.0.0 +apache-airflow-providers-tabular==1.1.0 +apache-airflow-providers-telegram==3.1.1 +apache-airflow-providers-trino==4.3.1 +apache-airflow-providers-vertica==3.3.1 +apache-airflow-providers-yandex==3.2.0 +apache-airflow-providers-zendesk==4.2.0 +apache-beam==2.44.0 +apispec==3.3.2 +appdirs==1.4.4 +argcomplete==2.0.0 +arrow==1.2.3 +asana==3.0.0 +asgiref==3.6.0 +asn1crypto==1.5.1 +astroid==2.11.7 +asttokens==2.2.1 +async-timeout==4.0.2 +asynctest==0.13.0 +atlasclient==1.0.0 +atlassian-python-api==3.32.2 +attrs==22.2.0 +aws-sam-translator==1.57.0 +aws-xray-sdk==2.11.0 +azure-batch==13.0.0 +azure-common==1.1.28 +azure-core==1.26.2 +azure-cosmos==4.3.0 +azure-datalake-store==0.0.52 +azure-identity==1.12.0 +azure-keyvault-secrets==4.6.0 +azure-kusto-data==0.0.45 +azure-mgmt-containerinstance==1.5.0 +azure-mgmt-core==1.3.2 +azure-mgmt-datafactory==1.1.0 +azure-mgmt-datalake-nspkg==3.0.1 +azure-mgmt-datalake-store==0.5.0 +azure-mgmt-nspkg==3.0.2 +azure-mgmt-resource==22.0.0 +azure-nspkg==3.0.2 +azure-servicebus==7.8.2 +azure-storage-blob==12.14.1 +azure-storage-common==2.1.0 +azure-storage-file-datalake==12.9.1 +azure-storage-file==2.1.0 +azure-synapse-spark==0.7.0 +backcall==0.2.0 +backoff==1.10.0 +bcrypt==4.0.1 +beautifulsoup4==4.11.1 +billiard==3.6.4.0 +black==23.1a1 +bleach==5.0.1 +blinker==1.5 +boto3==1.26.51 +boto==2.49.0 +botocore==1.29.51 +bowler==0.9.0 +cachelib==0.9.0 +cachetools==4.2.2 +cassandra-driver==3.25.0 +cattrs==22.2.0 +celery==5.2.7 +certifi==2022.12.7 +cffi==1.15.1 +cfgv==3.3.1 +cfn-lint==0.72.9 +cgroupspy==0.2.2 +chardet==4.0.0 +charset-normalizer==2.1.1 +checksumdir==1.2.0 +ciso8601==2.3.0 +click-default-group==1.2.2 +click-didyoumean==0.3.0 +click-plugins==1.1.1 +click-repl==0.2.0 +click==8.1.3 +clickclick==20.10.2 +cloudant==2.15.0 +cloudpickle==2.2.0 +colorama==0.4.6 +colorlog==4.8.0 +commonmark==0.9.1 +connexion==2.14.1 +coverage==7.0.5 +crcmod==1.7 +cron-descriptor==1.2.32 +croniter==1.3.8 +cryptography==38.0.4 +curlify==2.2.1 +dask==2023.1.0 +databricks-sql-connector==2.2.0 +datadog==0.44.0 +db-dtypes==1.0.5 +decorator==5.1.1 +defusedxml==0.7.1 +dill==0.3.1.1 +distlib==0.3.6 +distributed==2023.1.0 +dnspython==2.3.0 +docker==6.0.1 +docopt==0.6.2 +docutils==0.19 +ecdsa==0.18.0 +elasticsearch-dbapi==0.2.9 +elasticsearch-dsl==7.4.0 +elasticsearch==7.13.4 +email-validator==1.3.0 +entrypoints==0.4 +eralchemy2==1.3.6 +eventlet==0.33.3 +exceptiongroup==1.1.0 +execnet==1.9.0 +executing==1.2.0 +facebook-business==15.0.2 +fastavro==1.7.0 +fasteners==0.18 +fastjsonschema==2.16.2 +filelock==3.9.0 +fissix==21.11.13 +flake8-colors==0.1.9 +flake8==6.0.0 +flake8_implicit_str_concat==0.3.0 +flaky==3.7.0 +flower==1.2.0 +freezegun==1.2.2 +frozenlist==1.3.3 +fsspec==2022.11.0 +future==0.18.3 +gcloud-aio-auth==4.1.5 +gcloud-aio-bigquery==6.2.0 +gcloud-aio-storage==8.0.0 +gcsfs==2022.11.0 +geomet==0.2.1.post1 +gevent==22.10.2 +gitdb==4.0.10 +google-ads==18.0.0 +google-api-core==2.8.2 +google-api-python-client==1.12.11 +google-auth-httplib2==0.1.0 +google-auth-oauthlib==0.8.0 +google-auth==2.16.0 +google-cloud-aiplatform==1.16.1 
+google-cloud-appengine-logging==1.1.3 +google-cloud-audit-log==0.2.4 +google-cloud-automl==2.8.0 +google-cloud-bigquery-datatransfer==3.7.0 +google-cloud-bigquery-storage==2.14.1 +google-cloud-bigquery==2.34.4 +google-cloud-bigtable==1.7.3 +google-cloud-build==3.9.0 +google-cloud-compute==0.7.0 +google-cloud-container==2.11.1 +google-cloud-core==2.3.2 +google-cloud-datacatalog==3.9.0 +google-cloud-dataform==0.2.0 +google-cloud-dataplex==1.1.0 +google-cloud-dataproc-metastore==1.6.0 +google-cloud-dataproc==5.0.0 +google-cloud-dlp==1.0.2 +google-cloud-kms==2.12.0 +google-cloud-language==1.3.2 +google-cloud-logging==3.2.1 +google-cloud-memcache==1.4.1 +google-cloud-monitoring==2.11.0 +google-cloud-orchestration-airflow==1.4.1 +google-cloud-os-login==2.7.1 +google-cloud-pubsub==2.13.5 +google-cloud-redis==2.9.0 +google-cloud-resource-manager==1.6.0 +google-cloud-secret-manager==1.0.2 +google-cloud-spanner==1.19.3 +google-cloud-speech==1.3.4 +google-cloud-storage==2.7.0 +google-cloud-tasks==2.10.1 +google-cloud-texttospeech==1.0.3 +google-cloud-translate==1.7.2 +google-cloud-videointelligence==1.16.3 +google-cloud-vision==1.0.2 +google-cloud-workflows==1.7.1 +google-crc32c==1.5.0 +google-resumable-media==2.4.0 +googleapis-common-protos==1.56.4 +graphql-core==3.2.3 +graphviz==0.20.1 +greenlet==2.0.1 +grpc-google-iam-v1==0.12.4 +grpcio-gcp==0.2.2 +grpcio-status==1.48.2 +grpcio==1.51.1 +gssapi==1.8.2 +gunicorn==20.1.0 +h11==0.14.0 +hdfs==2.7.0 +hmsclient==0.1.1 +httpcore==0.16.3 +httplib2==0.20.4 +httpx==0.23.3 +humanize==4.4.0 +hvac==1.0.2 +identify==2.5.13 +idna==3.4 +ijson==3.2.0.post0 +imagesize==1.4.1 +importlib-metadata==6.0.0 +incremental==22.10.0 +inflection==0.5.1 +influxdb-client==1.35.0 +iniconfig==2.0.0 +ipdb==0.13.11 +ipython==8.8.0 +isodate==0.6.1 +isort==5.11.2 +itsdangerous==2.1.2 +jaraco.classes==3.2.3 +jedi==0.18.2 +jeepney==0.8.0 +jira==3.4.1 +jmespath==0.10.0 +jschema-to-python==1.2.3 +json-merge-patch==0.2 +jsondiff==2.0.0 +jsonpatch==1.32 +jsonpath-ng==1.5.3 +jsonpickle==3.0.1 +jsonpointer==2.3 +jsonschema-spec==0.1.2 +jsonschema==4.17.3 +junit-xml==1.9 +jupyter-client==7.3.4 +jupyter_core==5.1.3 +keyring==23.13.1 +kombu==5.2.4 +krb5==0.4.1 +kubernetes==23.6.0 +kylinpy==2.8.4 +lazy-object-proxy==1.9.0 +ldap3==2.9.1 +linkify-it-py==2.0.0 +locket==1.0.0 +lockfile==0.12.2 +looker-sdk==22.20.0 +lxml==4.9.2 +lz4==4.3.2 +markdown-it-py==2.1.0 +marshmallow-enum==1.5.1 +marshmallow-oneofschema==3.0.1 +marshmallow-sqlalchemy==0.26.1 +marshmallow==3.19.0 +matplotlib-inline==0.1.6 +mccabe==0.7.0 +mdit-py-plugins==0.3.3 +mdurl==0.1.2 +mongomock==4.1.2 +monotonic==1.6 +more-itertools==8.14.0 +moreorless==0.4.0 +moto==4.1.0 +msal-extensions==1.0.0 +msal==1.20.0 +msgpack==1.0.4 +msrest==0.7.1 +msrestazure==0.6.4 +multi-key-dict==2.0.3 +multidict==6.0.4 +mypy-boto3-appflow==1.26.32 +mypy-boto3-rds==1.26.47 +mypy-boto3-redshift-data==1.26.30 +mypy-extensions==0.4.3 +mypy==0.971 +mysql-connector-python==8.0.32 +mysqlclient==2.1.1 +nbclient==0.7.2 +nbformat==5.7.3 +neo4j==5.4.0 +nest-asyncio==1.5.6 +networkx==2.8.8 +nodeenv==1.7.0 +ntlm-auth==1.5.0 +numpy==1.22.4 +oauthlib==3.2.2 +objsize==0.6.1 +openapi-schema-validator==0.4.0 +openapi-spec-validator==0.5.2 +opsgenie-sdk==2.1.5 +oracledb==1.2.1 +orjson==3.8.5 +oscrypto==1.3.0 +oss2==2.16.0 +packaging==21.3 +pandas-gbq==0.17.9 +pandas==1.5.2 +papermill==2.4.0 +parameterized==0.8.1 +paramiko==2.12.0 +parso==0.8.3 +partd==1.3.0 +pathable==0.4.3 +pathspec==0.9.0 +pbr==5.11.1 +pdpyras==4.5.2 +pendulum==2.1.2 +pexpect==4.8.0 +pickleshare==0.7.5 
+pinotdb==0.4.12 +pipdeptree==2.3.3 +pipx==1.1.0 +pkginfo==1.9.6 +platformdirs==2.6.2 +pluggy==1.0.0 +ply==3.11 +plyvel==1.5.0 +portalocker==2.6.0 +pre-commit==2.21.0 +presto-python-client==0.8.3 +prison==0.2.1 +prometheus-client==0.15.0 +prompt-toolkit==3.0.36 +proto-plus==1.19.6 +protobuf==3.20.0 +psutil==5.9.4 +psycopg2-binary==2.9.5 +psycopg2==2.9.5 +ptyprocess==0.7.0 +pure-eval==0.2.2 +pure-sasl==0.6.2 +py4j==0.10.9.5 +py==1.11.0 +pyOpenSSL==22.1.0 +pyarrow==9.0.0 +pyasn1-modules==0.2.8 +pyasn1==0.4.8 +pycodestyle==2.10.0 +pycountry==22.3.5 +pycparser==2.21 +pycryptodome==3.16.0 +pycryptodomex==3.16.0 +pydantic==1.10.4 +pydata-google-auth==1.5.0 +pydot==1.4.2 +pydruid==0.6.5 +pyenchant==3.2.2 +pyexasol==0.25.1 +pyflakes==3.0.1 +pygraphviz==1.10 +pyhcl==0.4.4 +pykerberos==1.2.4 +pymongo==3.13.0 +pymssql==2.2.8 +pyodbc==4.0.35 +pyparsing==3.0.9 +pypsrp==0.8.1 +pyrsistent==0.19.3 +pyspark==3.3.1 +pyspnego==0.7.0 +pytest-asyncio==0.20.3 +pytest-capture-warnings==0.0.4 +pytest-cov==4.0.0 +pytest-httpx==0.21.2 +pytest-instafail==0.4.2 +pytest-rerunfailures==9.1.1 +pytest-timeouts==1.2.1 +pytest-xdist==3.1.0 +pytest==6.2.5 +python-arango==7.5.5 +python-daemon==2.3.2 +python-dateutil==2.8.2 +python-dotenv==0.21.0 +python-http-client==3.3.7 +python-jenkins==1.7.0 +python-jose==3.3.0 +python-ldap==3.4.3 +python-nvd3==0.15.0 +python-slugify==7.0.0 +python-telegram-bot==13.15 +pytz-deprecation-shim==0.1.0.post0 +pytz==2022.7.1 +pytzdata==2020.1 +pywinrm==0.4.3 +pyzmq==25.0.0 +qds-sdk==1.16.1 +reactivex==4.0.4 +readme-renderer==37.3 +redis==3.5.3 +redshift-connector==2.0.909 +regex==2022.10.31 +requests-file==1.5.1 +requests-kerberos==0.14.0 +requests-mock==1.10.0 +requests-ntlm==1.1.0 +requests-oauthlib==1.3.1 +requests-toolbelt==0.10.1 +requests==2.28.2 +responses==0.22.0 +rfc3986==1.5.0 +rich-click==1.6.0 +rich==13.1.0 +rsa==4.9 +s3transfer==0.6.0 +sarif-om==1.0.4 +sasl==0.3.1 +scramp==1.4.4 +scrapbook==0.5.0 +semver==2.13.0 +sendgrid==6.9.7 +sentinels==1.0.0 +sentry-sdk==1.13.0 +setproctitle==1.3.2 +simple-salesforce==1.12.3 +six==1.16.0 +slack-sdk==3.19.5 +smbprotocol==1.10.1 +smmap==5.0.0 +snakebite-py3==3.0.5 +sniffio==1.3.0 +snowballstemmer==2.2.0 +snowflake-connector-python==2.9.0 +snowflake-sqlalchemy==1.4.4 +sortedcontainers==2.4.0 +soupsieve==2.3.2.post1 +sphinx-airflow-theme==0.0.11 +sphinx-argparse==0.4.0 +sphinx-autoapi==2.0.1 +sphinx-copybutton==0.5.1 +sphinx-jinja==2.0.2 +sphinx-rtd-theme==1.1.1 +sphinxcontrib-devhelp==1.0.2 +sphinxcontrib-htmlhelp==2.0.0 +sphinxcontrib-httpdomain==1.8.1 +sphinxcontrib-jsmath==1.0.1 +sphinxcontrib-qthelp==1.0.3 +sphinxcontrib-redoc==1.6.0 +sphinxcontrib-serializinghtml==1.1.5 +sphinxcontrib-spelling==7.7.0 +sphinxcontrib.applehelp==1.0.3 +spython==0.3.0 +sqlalchemy-bigquery==1.5.0 +sqlalchemy-drill==1.1.2 +sqlalchemy-redshift==0.8.12 +sqlparse==0.4.3 +sshpubkeys==3.3.1 +sshtunnel==0.4.0 +stack-data==0.6.2 +starkbank-ecdsa==2.2.0 +statsd==4.0.1 +tableauserverclient==0.23.4 +tabulate==0.9.0 +tblib==1.7.0 +tenacity==8.1.0 +termcolor==2.2.0 +text-unidecode==1.3 +textwrap3==0.9.2 +thrift-sasl==0.4.3 +thrift==0.16.0 +toml==0.10.2 +tomli==2.0.1 +toolz==0.12.0 +tornado==6.1 +towncrier==22.12.0 +tqdm==4.64.1 +traitlets==5.8.1 +trino==0.321.0 +twine==4.0.2 +types-Deprecated==1.2.9 +types-Markdown==3.4.2.2 +types-PyMySQL==1.0.19.2 +types-PyYAML==6.0.12.3 +types-boto==2.49.18.5 +types-certifi==2021.10.8.3 +types-croniter==1.3.2.2 +types-docutils==0.19.1.2 +types-freezegun==1.1.10 +types-paramiko==2.12.0.3 +types-protobuf==4.21.0.3 +types-pyOpenSSL==23.0.0.1 
+types-python-dateutil==2.8.19.6 +types-python-slugify==7.0.0.1 +types-pytz==2022.7.1.0 +types-redis==4.4.0.2 +types-requests==2.28.11.8 +types-setuptools==65.7.0.2 +types-tabulate==0.9.0.0 +types-termcolor==1.1.6 +types-toml==0.10.8.1 +types-urllib3==1.26.25.4 +typing_extensions==4.4.0 +tzdata==2022.7 +tzlocal==4.2 +uamqp==1.6.3 +uc-micro-py==1.0.1 +unicodecsv==0.14.1 +uritemplate==3.0.1 +urllib3==1.26.14 +userpath==1.8.0 +vertica-python==1.2.0 +vine==5.0.0 +virtualenv==20.17.1 +volatile==2.1.0 +watchtower==2.0.1 +wcwidth==0.2.6 +webencodings==0.5.1 +websocket-client==1.4.2 +wrapt==1.14.1 +xmltodict==0.13.0 +yamllint==1.29.0 +yandexcloud==0.194.0 +yarl==1.8.2 +zeep==4.2.1 +zenpy==2.0.25 +zict==2.2.0 +zipp==3.11.0 +zope.event==4.6 +zope.interface==5.5.2 +zstandard==0.19.0 diff --git a/sm2a/dags/requirements.txt b/sm2a/dags/requirements.txt new file mode 100644 index 00000000..8c9ec097 --- /dev/null +++ b/sm2a/dags/requirements.txt @@ -0,0 +1,20 @@ +#--constraint /usr/local/airflow/dags/requirements-constraints.txt +affine==2.4.0 +netCDF4==1.6.2 +pydantic==1.10.4 +requests==2.28.1 +rio-cogeo==3.5.0 +smart-open==6.3.0 +airflow_multi_dagrun==2.3.1 +apache-airflow-providers-docker==3.2.0 +apache-airflow-providers-postgres==5.2.2 +apache-airflow-providers-common-sql==1.2.0 +typing-extensions==4.4.0 +psycopg2-binary==2.9.5 +pypgstac==0.7.4 +pyOpenSSL==22.0.0 +stac-pydantic +fsspec +s3fs +xarray +xstac diff --git a/sm2a/dags/veda_data_pipeline/__init__.py b/sm2a/dags/veda_data_pipeline/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/sm2a/dags/veda_data_pipeline/groups/__init__.py b/sm2a/dags/veda_data_pipeline/groups/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/sm2a/dags/veda_data_pipeline/groups/collection_group.py b/sm2a/dags/veda_data_pipeline/groups/collection_group.py new file mode 100644 index 00000000..de4f2dd1 --- /dev/null +++ b/sm2a/dags/veda_data_pipeline/groups/collection_group.py @@ -0,0 +1,79 @@ +import requests +from airflow.models.variable import Variable +from airflow.operators.python import PythonOperator +from airflow.utils.task_group import TaskGroup +from veda_data_pipeline.utils.collection_generation import GenerateCollection +from veda_data_pipeline.utils.submit_stac import submission_handler + +generator = GenerateCollection() + + +def check_collection_exists(endpoint: str, collection_id: str): + """ + Check if a collection exists in the STAC catalog + + Args: + endpoint (str): STAC catalog endpoint + collection_id (str): collection id + """ + response = requests.get(f"{endpoint}/collections/{collection_id}") + return ( + "Collection.existing_collection" + if (response.status_code == 200) + else "Collection.generate_collection" + ) + + +def ingest_collection_task(ti): + """ + Ingest a collection into the STAC catalog + + Args: + dataset (Dict[str, Any]): dataset dictionary (JSON) + role_arn (str): role arn for Zarr collection generation + """ + collection = ti.xcom_pull(task_ids='Collection.generate_collection') + + return submission_handler( + event=collection, + endpoint="/collections", + cognito_app_secret=Variable.get("COGNITO_APP_SECRET"), + stac_ingestor_api_url=Variable.get("STAC_INGESTOR_API_URL"), + ) + + +# NOTE unused, but useful for item ingests, since collections are a dependency for items +def check_collection_exists_task(ti): + config = ti.dag_run.conf + return check_collection_exists( + endpoint=Variable.get("STAC_URL", default_var=None), + collection_id=config.get("collection"), + ) + + +def 
generate_collection_task(ti): + config = ti.dag_run.conf + role_arn = Variable.get("ASSUME_ROLE_READ_ARN", default_var=None) + + # TODO it would be ideal if this also works with complete collections where provided - this would make the collection ingest more re-usable + collection = generator.generate_stac( + dataset_config=config, role_arn=role_arn + ) + return collection + + + +group_kwgs = {"group_id": "Collection", "tooltip": "Collection"} + + +def collection_task_group(): + with TaskGroup(**group_kwgs) as collection_task_grp: + generate_collection = PythonOperator( + task_id="generate_collection", python_callable=generate_collection_task + ) + ingest_collection = PythonOperator( + task_id="ingest_collection", python_callable=ingest_collection_task + ) + generate_collection >> ingest_collection + + return collection_task_grp diff --git a/sm2a/dags/veda_data_pipeline/groups/discover_group.py b/sm2a/dags/veda_data_pipeline/groups/discover_group.py new file mode 100644 index 00000000..38b754fb --- /dev/null +++ b/sm2a/dags/veda_data_pipeline/groups/discover_group.py @@ -0,0 +1,110 @@ +import time +import uuid + +from airflow.models.variable import Variable +from airflow.models.xcom import LazyXComAccess +from airflow.operators.dummy_operator import DummyOperator as EmptyOperator +from airflow.decorators import task_group +from airflow.operators.python import BranchPythonOperator, PythonOperator, ShortCircuitOperator +from airflow.utils.trigger_rule import TriggerRule +from airflow_multi_dagrun.operators import TriggerMultiDagRunOperator +from veda_data_pipeline.utils.s3_discovery import ( + s3_discovery_handler, EmptyFileListError +) + +group_kwgs = {"group_id": "Discover", "tooltip": "Discover"} + + +def discover_from_s3_task(ti, event={}, **kwargs): + """Discover grouped assets/files from S3 in batches of 2800. Produce a list of such files stored on S3 to process. + This task is used as part of the discover_group subdag and outputs data to EVENT_BUCKET. + """ + config = { + **event, + **ti.dag_run.conf, + } + last_successful_execution = kwargs.get("prev_start_date_success") + if event.get("schedule") and last_successful_execution: + config["last_successful_execution"] = last_successful_execution.isoformat() + # (event, chunk_size=2800, role_arn=None, bucket_output=None): + MWAA_STAC_CONF = Variable.get("MWAA_STACK_CONF", deserialize_json=True) + read_assume_arn = Variable.get("ASSUME_ROLE_READ_ARN", default_var=None) + # Making the chunk size small, this helped us process large data faster than + # passing a large chunk of 500 + chunk_size = config.get("chunk_size", 500) + try: + return s3_discovery_handler( + event=config, + role_arn=read_assume_arn, + bucket_output=MWAA_STAC_CONF["EVENT_BUCKET"], + chunk_size=chunk_size + ) + except EmptyFileListError as ex: + print(f"Received an exception {ex}") + return [] + + +def get_files_to_process(ti): + """Get files from S3 produced by the discovery task. + Used as part of both the parallel_run_process_rasters and parallel_run_process_vectors tasks. 
+ """ + dynamic_group_id = ti.task_id.split(".")[0] + payload = ti.xcom_pull(task_ids=f"{dynamic_group_id}.discover_from_s3") + if isinstance(payload, LazyXComAccess): + payloads_xcom = payload[0].pop("payload", []) + payload = payload[0] + else: + payloads_xcom = payload.pop("payload", []) + dag_run_id = ti.dag_run.run_id + for indx, payload_xcom in enumerate(payloads_xcom): + time.sleep(2) + yield { + "run_id": f"{dag_run_id}_{uuid.uuid4()}_{indx}", + **payload, + "payload": payload_xcom, + } + + +def vector_raster_choice(ti): + """Choose whether to process rasters or vectors based on the payload.""" + payload = ti.dag_run.conf + dynamic_group_id = ti.task_id.split(".")[0] + + if payload.get("vector"): + return f"{dynamic_group_id}.parallel_run_process_vectors" + return f"{dynamic_group_id}.parallel_run_process_rasters" + +@task_group +def subdag_discover(event={}): + discover_from_s3 = ShortCircuitOperator( + task_id="discover_from_s3", + python_callable=discover_from_s3_task, + op_kwargs={"text": "Discover from S3", "event": event}, + trigger_rule=TriggerRule.NONE_FAILED, + provide_context=True, + ) + + raster_vector_branching = BranchPythonOperator( + task_id="raster_vector_branching", + python_callable=vector_raster_choice, + ) + + run_process_raster = TriggerMultiDagRunOperator( + task_id="parallel_run_process_rasters", + trigger_dag_id="veda_ingest_raster", + python_callable=get_files_to_process, + ) + + run_process_vector = TriggerMultiDagRunOperator( + task_id="parallel_run_process_vectors", + trigger_dag_id="veda_ingest_vector", + python_callable=get_files_to_process, + ) + + # extra no-op, needed to run in dynamic mapping context + end_discover = EmptyOperator(task_id="end_discover", trigger_rule=TriggerRule.NONE_FAILED_MIN_ONE_SUCCESS,) + + discover_from_s3 >> raster_vector_branching >> [run_process_raster, run_process_vector] + run_process_raster >> end_discover + run_process_vector >> end_discover + diff --git a/sm2a/dags/veda_data_pipeline/groups/ecs_tasks.py b/sm2a/dags/veda_data_pipeline/groups/ecs_tasks.py new file mode 100644 index 00000000..2c8852e2 --- /dev/null +++ b/sm2a/dags/veda_data_pipeline/groups/ecs_tasks.py @@ -0,0 +1,117 @@ +import json + +from airflow.hooks.base import BaseHook +from airflow.providers.amazon.aws.operators.ecs import ( + EcsDeregisterTaskDefinitionOperator, + EcsRegisterTaskDefinitionOperator, + EcsRunTaskOperator, +) +from airflow.utils.task_group import TaskGroup +from airflow.utils.trigger_rule import TriggerRule + + +def get_aws_keys_from_connection(connection_id="aws_default"): + conn = BaseHook.get_connection(connection_id) + return { + "AWS_ACCESS_KEY_ID": conn.login, + "AWS_SECRET_ACCESS_KEY": conn.password, + "AWS_DEFAULT_REGION": json.loads(conn.extra).get("region_name", "us-west-2"), + } + + +group_kwgs = {"group_id": "ECSTasks", "tooltip": "ECSTasks"} + + +def subdag_ecs_task( + task_id, + task_definition_family, + container_name, + docker_image, + cmd: str, + mwaa_stack_conf, + aws_region="us-west-2", + cpu="256", + memory="512", + stage="dev", + environment_vars=None, +): + if environment_vars is None: + environment_vars = list() + with TaskGroup(**group_kwgs) as ecs_task_grp: + if stage == "local": + from airflow.providers.docker.operators.docker import DockerOperator + + return DockerOperator( + task_id=task_id, + container_name=container_name, + image=docker_image, + api_version="auto", + auto_remove=True, + command=cmd, + environment=get_aws_keys_from_connection(), + docker_url="tcp://docker-in-docker:2375", + 
mount_tmp_dir=False, + network_mode="bridge", + ) + + register_task = EcsRegisterTaskDefinitionOperator( + task_id=f"{task_id}_task_register", + family=task_definition_family, + trigger_rule=TriggerRule.ONE_SUCCESS, + container_definitions=[ + { + "name": container_name, + "image": docker_image, + "entryPoint": ["sh", "-c"], + "command": ["ls"], + "logConfiguration": { + "logDriver": "awslogs", + "options": { + "awslogs-group": mwaa_stack_conf.get("LOG_GROUP_NAME"), + "awslogs-region": aws_region, + "awslogs-stream-prefix": "ecs", + }, + }, + } + ], + register_task_kwargs={ + "cpu": cpu, + "memory": memory, + "networkMode": "awsvpc", + "taskRoleArn": mwaa_stack_conf.get("MWAA_EXECUTION_ROLE_ARN"), + "executionRoleArn": mwaa_stack_conf.get("MWAA_EXECUTION_ROLE_ARN"), + "requiresCompatibilities": ["FARGATE"], + }, + ) + ecs_task_run = EcsRunTaskOperator( + task_id=task_id, + cluster=mwaa_stack_conf.get("ECS_CLUSTER_NAME"), + task_definition=register_task.output, + launch_type="FARGATE", + do_xcom_push=True, + overrides={ + "containerOverrides": [ + { + "name": container_name, + "command": [cmd], + "environment": environment_vars, + }, + ], + }, + network_configuration={ + "awsvpcConfiguration": { + "securityGroups": mwaa_stack_conf.get("SECURITYGROUPS"), + "subnets": mwaa_stack_conf.get("SUBNETS"), + }, + }, + awslogs_region="us-west-2", + awslogs_group=mwaa_stack_conf.get("LOG_GROUP_NAME"), + awslogs_stream_prefix=f"ecs/{container_name}", + ) + deregister_task = EcsDeregisterTaskDefinitionOperator( + task_id=f"{task_id}_deregister_task", + task_definition=register_task.output, + ) + + register_task >> ecs_task_run >> deregister_task + return ecs_task_grp diff --git a/sm2a/dags/veda_data_pipeline/groups/processing_group.py b/sm2a/dags/veda_data_pipeline/groups/processing_group.py new file mode 100644 index 00000000..9a8382f9 --- /dev/null +++ b/sm2a/dags/veda_data_pipeline/groups/processing_group.py @@ -0,0 +1,95 @@ +import json +import logging +from datetime import timedelta + +import smart_open +from airflow.models.variable import Variable +from airflow.operators.python import PythonOperator +from airflow.providers.amazon.aws.operators.ecs import EcsRunTaskOperator +from airflow.utils.task_group import TaskGroup +from veda_data_pipeline.utils.submit_stac import ( + submission_handler, +) + +group_kwgs = {"group_id": "Process", "tooltip": "Process"} + + +def log_task(text: str): + logging.info(text) + + +def submit_to_stac_ingestor_task(ti): + """Submit STAC items to the STAC ingestor API.""" + print("Submit STAC ingestor") + event = json.loads(ti.xcom_pull(task_ids=f"{group_kwgs['group_id']}.build_stac")) + success_file = event["payload"]["success_event_key"] + with smart_open.open(success_file, "r") as _file: + stac_items = json.loads(_file.read()) + + for item in stac_items: + submission_handler( + event=item, + endpoint="/ingestions", + cognito_app_secret=Variable.get("COGNITO_APP_SECRET"), + stac_ingestor_api_url=Variable.get("STAC_INGESTOR_API_URL"), + ) + return event + + +def subdag_process(): + with TaskGroup(**group_kwgs) as process_grp: + mwaa_stack_conf = Variable.get("MWAA_STACK_CONF", deserialize_json=True) + build_stac = EcsRunTaskOperator( + task_id="build_stac", + trigger_rule="none_failed", + cluster=f"{mwaa_stack_conf.get('PREFIX')}-cluster", + task_definition=f"{mwaa_stack_conf.get('PREFIX')}-tasks", + launch_type="FARGATE", + do_xcom_push=True, + execution_timeout=timedelta(minutes=60), + overrides={ + "containerOverrides": [ + { + "name": 
f"{mwaa_stack_conf.get('PREFIX')}-veda-stac-build", + "command": [ + "/usr/local/bin/python", + "handler.py", + "--payload", + "{}".format("{{ task_instance.dag_run.conf }}"), + ], + "environment": [ + { + "name": "EXTERNAL_ROLE_ARN", + "value": Variable.get( + "ASSUME_ROLE_READ_ARN", default_var="" + ), + }, + { + "name": "BUCKET", + "value": "veda-data-pipelines-staging-lambda-ndjson-bucket", + }, + { + "name": "EVENT_BUCKET", + "value": mwaa_stack_conf.get("EVENT_BUCKET"), + }, + ], + "memory": 2048, + "cpu": 1024, + }, + ], + }, + network_configuration={ + "awsvpcConfiguration": { + "securityGroups": mwaa_stack_conf.get("SECURITYGROUPS"), + "subnets": mwaa_stack_conf.get("SUBNETS"), + }, + }, + awslogs_group=mwaa_stack_conf.get("LOG_GROUP_NAME"), + awslogs_stream_prefix=f"ecs/{mwaa_stack_conf.get('PREFIX')}-veda-stac-build", # prefix with container name + ) + submit_to_stac_ingestor = PythonOperator( + task_id="submit_to_stac_ingestor", + python_callable=submit_to_stac_ingestor_task, + ) + build_stac >> submit_to_stac_ingestor + return process_grp diff --git a/sm2a/dags/veda_data_pipeline/groups/transfer_group.py b/sm2a/dags/veda_data_pipeline/groups/transfer_group.py new file mode 100644 index 00000000..a4235496 --- /dev/null +++ b/sm2a/dags/veda_data_pipeline/groups/transfer_group.py @@ -0,0 +1,90 @@ +from datetime import timedelta + +from airflow.models.variable import Variable +from airflow.operators.python import BranchPythonOperator, PythonOperator +from airflow.providers.amazon.aws.operators.ecs import EcsRunTaskOperator +from airflow.utils.task_group import TaskGroup +from airflow.utils.trigger_rule import TriggerRule +from veda_data_pipeline.utils.transfer import ( + data_transfer_handler, +) + +group_kwgs = {"group_id": "Transfer", "tooltip": "Transfer"} + + +def cogify_choice(ti): + """Choos whether to cogify or not; if yes, use a docker container""" + payload = ti.dag_run.conf + + if payload.get("cogify"): + return f"{group_kwgs['group_id']}.cogify_and_copy_data" + else: + return f"{group_kwgs['group_id']}.copy_data" + + +def transfer_data(ti): + """Transfer data from one S3 bucket to another; s3 copy, no need for docker""" + config = ti.dag_run.conf + role_arn = Variable.get("ASSUME_ROLE_READ_ARN", default_var="") + # (event, chunk_size=2800, role_arn=None, bucket_output=None): + return data_transfer_handler(event=config, role_arn=role_arn) + +# TODO: cogify_transfer handler is missing arg parser so this subdag will not work +def subdag_transfer(): + with TaskGroup(**group_kwgs) as discover_grp: + cogify_branching = BranchPythonOperator( + task_id="cogify_branching", + trigger_rule=TriggerRule.ONE_SUCCESS, + python_callable=cogify_choice, + ) + + run_copy = PythonOperator( + task_id="copy_data", + python_callable=transfer_data, + op_kwargs={"text": "Copy files on S3"}, + ) + + mwaa_stack_conf = Variable.get("MWAA_STACK_CONF", deserialize_json=True) + run_cogify_copy = EcsRunTaskOperator( + task_id="cogify_and_copy_data", + trigger_rule="none_failed", + cluster=f"{mwaa_stack_conf.get('PREFIX')}-cluster", + task_definition=f"{mwaa_stack_conf.get('PREFIX')}-transfer-tasks", + launch_type="FARGATE", + do_xcom_push=True, + execution_timeout=timedelta(minutes=120), + overrides={ + "containerOverrides": [ + { + "name": f"{mwaa_stack_conf.get('PREFIX')}-veda-cogify-transfer", + "command": [ + "/usr/local/bin/python", + "handler.py", + "--payload", + "{}".format("{{ task_instance.dag_run.conf }}"), + ], + "environment": [ + { + "name": "EXTERNAL_ROLE_ARN", + "value": Variable.get( 
+ "ASSUME_ROLE_READ_ARN", default_var="" + ), + }, + ], + "memory": 2048, + "cpu": 1024, + }, + ], + }, + network_configuration={ + "awsvpcConfiguration": { + "securityGroups": mwaa_stack_conf.get("SECURITYGROUPS"), + "subnets": mwaa_stack_conf.get("SUBNETS"), + }, + }, + awslogs_group=mwaa_stack_conf.get("LOG_GROUP_NAME"), + awslogs_stream_prefix=f"ecs/{mwaa_stack_conf.get('PREFIX')}-veda-cogify-transfer", # prefix with container name + ) + + (cogify_branching >> [run_copy, run_cogify_copy]) + return discover_grp diff --git a/sm2a/dags/veda_data_pipeline/requirements_dev.txt b/sm2a/dags/veda_data_pipeline/requirements_dev.txt new file mode 100644 index 00000000..e21ff359 --- /dev/null +++ b/sm2a/dags/veda_data_pipeline/requirements_dev.txt @@ -0,0 +1 @@ +requests_mock==1.12.1 \ No newline at end of file diff --git a/sm2a/dags/veda_data_pipeline/utils/README.md b/sm2a/dags/veda_data_pipeline/utils/README.md new file mode 100644 index 00000000..42c1d982 --- /dev/null +++ b/sm2a/dags/veda_data_pipeline/utils/README.md @@ -0,0 +1,26 @@ +# Data Pipeline Utils + +## submit_stac + +Test with python locally (uses example data in [hlss30_stac_example.ndjson](./hlss30_stac_example.ndjson)) + +```bash +python -m submit_stac +``` + +---------------- + +## s3_discovery + +Module to query an `s3` bucket to discover COGs +```bash +docker build -t s3-discovery. +# Currently runs an example for OMI Ozone +docker run s3-discovery python -m s3_discovery_handler +``` + +To run this locally, you may need to pass your AWS credentials to the module: `docker run -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY s3-discovery python -m s3_discovery_handler` + +AWS Provisioning +This Lambda needs to list the contents of a S3 Bucket in order to discover files. 
+- Add `s3:ListBucket` to the Lambda's execution role diff --git a/sm2a/dags/veda_data_pipeline/utils/__init__.py b/sm2a/dags/veda_data_pipeline/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/sm2a/dags/veda_data_pipeline/utils/collection_generation.py b/sm2a/dags/veda_data_pipeline/utils/collection_generation.py new file mode 100644 index 00000000..abba2de5 --- /dev/null +++ b/sm2a/dags/veda_data_pipeline/utils/collection_generation.py @@ -0,0 +1,138 @@ +from typing import Any, Dict + +import fsspec +import xarray as xr +import xstac +from veda_data_pipeline.utils.schemas import SpatioTemporalExtent +from datetime import datetime, timezone + + +class GenerateCollection: + common = { + "links": [], + "extent": { + "spatial": {"bbox": [[-180, -90, 180, 90]]}, + "temporal": {"interval": [[None, None]]}, + }, + "type": "Collection", + "stac_version": "1.0.0", + } + keys_to_ignore = [ + "collection", + "data_type", + "sample_files", + "discovery_items", + "spatial_extent", + "temporal_extent", + "is_periodic", + "time_density", + "type", + ] + + def get_template(self, dataset: Dict[str, Any]) -> dict: + extra_fields = { + key: dataset[key] + for key in dataset.keys() + if key not in GenerateCollection.keys_to_ignore + } + + collection_dict = { + "id": dataset["collection"], + **GenerateCollection.common, + **extra_fields, + } + + # Default REQUIRED fields + if not collection_dict.get("description"): + collection_dict["description"] = dataset["collection"] + if not collection_dict.get("license"): + collection_dict["license"] = "proprietary" + + return collection_dict + + def _create_zarr_template(self, dataset: Dict[str, Any], store_path: str) -> dict: + template = self.get_template(dataset) + template["assets"] = { + "zarr": { + "href": store_path, + "title": "Zarr Array Store", + "description": "Zarr array store with one or several arrays (variables)", + "roles": ["data", "zarr"], + "type": "application/vnd+zarr", + "xarray:open_kwargs": { + "engine": "zarr", + "chunks": {}, + **dataset.xarray_kwargs, + }, + } + } + return template + + def create_zarr_collection(self, dataset: Dict[str, Any], role_arn: str) -> dict: + """ + Creates a zarr stac collection based off of the user input + """ + discovery = dataset.discovery_items[0] + store_path = f"s3://{discovery.bucket}/{discovery.prefix}{discovery.zarr_store}" + template = self._create_zarr_template(dataset, store_path) + + fs = fsspec.filesystem("s3", anon=False, role_arn=role_arn) + store = fs.get_mapper(store_path) + ds = xr.open_zarr( + store, consolidated=bool(dataset.xarray_kwargs.get("consolidated")) + ) + + collection = xstac.xarray_to_stac( + ds, + template, + temporal_dimension=dataset.temporal_dimension or "time", + x_dimension=dataset.x_dimension or "lon", + y_dimension=dataset.y_dimension or "lat", + reference_system=dataset.reference_system or 4326, + ) + return collection.to_dict() + + def create_cog_collection(self, dataset: Dict[str, Any]) -> dict: + collection_stac = self.get_template(dataset) + + # Override the extents if they exists + if spatial_extent := dataset.get("spatial_extent"): + collection_stac["extent"]["spatial"] = {"bbox": [list(spatial_extent.values())]}, + + if temporal_extent := dataset.get("temporal_extent"): + collection_stac["extent"]["temporal"] = { + "interval": [ + # most of our data uses the Z suffix for UTC - isoformat() doesn't + [ + datetime.fromisoformat(x).astimezone(timezone.utc).isoformat().replace("+00:00", "Z") + if x else None + for x in 
list(temporal_extent.values()) + ] + ] + } + + collection_stac["item_assets"] = { + "cog_default": { + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "roles": ["data", "layer"], + "title": "Default COG Layer", + "description": "Cloud optimized default layer to display on map", + } + } + return collection_stac + + def generate_stac( + self, dataset_config: Dict[str, Any], role_arn: str = None + ) -> dict: + """ + Generates a STAC collection based on the dataset and data type + + Args: + dataset_config (Dict[str, Any]): dataset configuration + role_arn (str): role arn for Zarr collection generation + """ + data_type = dataset_config.get("data_type", "cog") + if data_type == "zarr": + return self.create_zarr_collection(dataset_config, role_arn) + else: + return self.create_cog_collection(dataset_config) diff --git a/sm2a/dags/veda_data_pipeline/utils/s3_discovery.py b/sm2a/dags/veda_data_pipeline/utils/s3_discovery.py new file mode 100644 index 00000000..5a275701 --- /dev/null +++ b/sm2a/dags/veda_data_pipeline/utils/s3_discovery.py @@ -0,0 +1,292 @@ +import itertools +import json +import os +import re +from typing import List +from uuid import uuid4 +from pathlib import Path + +from datetime import datetime +from dateutil.tz import tzlocal +import boto3 +from smart_open import open as smrt_open + + +# Adding a custom exception for empty list +class EmptyFileListError(Exception): + def __init__(self, error_message): + self.error_message = error_message + super().__init__(self.error_message) + + +def assume_role(role_arn, session_name="veda-data-pipelines_s3-discovery"): + sts = boto3.client("sts") + credentials = sts.assume_role( + RoleArn=role_arn, + RoleSessionName=session_name, + ) + creds = credentials["Credentials"] + return { + "aws_access_key_id": creds["AccessKeyId"], + "aws_secret_access_key": creds.get("SecretAccessKey"), + "aws_session_token": creds.get("SessionToken"), + } + + +def get_s3_resp_iterator(bucket_name, prefix, s3_client, page_size=1000): + """ + Returns an s3 paginator. + :param bucket_name: The bucket. + :param prefix: The path for the s3 granules. + :param s3_client: Initialized boto3 S3 client + :param page_size: Number of records returned + """ + s3_paginator = s3_client.get_paginator("list_objects") + print(f"Getting S3 response iterator for bucket: {bucket_name}, prefix: {prefix}") + return s3_paginator.paginate( + Bucket=bucket_name, Prefix=prefix, PaginationConfig={"page_size": page_size} + ) + + +def discover_from_s3( + response_iterator, filename_regex: str, last_execution: datetime +) -> dict: + """Iterate through pages of S3 objects returned by a ListObjectsV2 operation. + The discover_from_s3 function takes in an iterator over the pages of S3 objects returned + by a ListObjectsV2 operation. It iterates through the pages and yields each S3 object in the page as a dictionary. + This function can be used to iterate through a large number of S3 objects returned by a ListObjectsV2 operation + without having to load all the objects into memory at once. + + Parameters: + response_iterator (iter): An iterator over the pages of S3 objects returned by a ListObjectsV2 operation. + filename_regex (str): A regular expression used to filter the S3 objects returned by the ListObjectsV2 operation. + + Yields: + dict: A dictionary representing an S3 object. 
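As a side note, a minimal sketch of combining the paginator helper with this generator outside Airflow (illustrative only; the bucket, prefix, and filename pattern are placeholders, and default AWS credentials with read access to the bucket are assumed):

```python
# Illustrative pairing of get_s3_resp_iterator with discover_from_s3.
# Placeholder bucket/prefix/regex values.
import boto3

from veda_data_pipeline.utils.s3_discovery import discover_from_s3, get_s3_resp_iterator

s3_client = boto3.client("s3")
pages = get_s3_resp_iterator("my-source-bucket", "my-collection/", s3_client)

# Yields each matching S3 object dict; passing last_execution=None disables the date filter.
for s3_object in discover_from_s3(pages, filename_regex=r".*\.tif$", last_execution=None):
    print(s3_object["Key"])
```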
+ """ + for page in response_iterator: + for s3_object in page.get("Contents", {}): + key = s3_object["Key"] + conditionals = [re.match(filename_regex, key)] + if last_execution: + last_modified = s3_object["LastModified"] + conditionals.append(last_modified > last_execution) + if all(conditionals): + yield s3_object + + +def group_by_item(discovered_files: List[str], id_regex: str, assets: dict) -> dict: + """Group assets by matching regex patterns against discovered files.""" + grouped_files = [] + for uri in discovered_files: + # Each file gets its matched asset type and id + filename = uri.split("/")[-1] + prefix = "/".join(uri.split("/")[:-1]) + asset_type = None + if match := re.match(id_regex, filename): + # At least one match; can use the match here to construct an ID (match groups separated by '-') + item_id = "-".join(match.groups()) + for asset_name, asset_definition in assets.items(): + regex = asset_definition["regex"] + if re.match(regex, filename): + asset_type = asset_name + break + if asset_type: + grouped_files.append( + { + "prefix": prefix, + "filename": filename, + "asset_type": asset_type, + "item_id": item_id, + } + ) + else: + print(f"Warning: skipping file. No id match found: {filename}") + # At this point, files are labeled with type and id. Now, group them by id + sorted_list = sorted(grouped_files, key=lambda x: x["item_id"]) + grouped_data = [ + {"item_id": key, "data": list(group)} + for key, group in itertools.groupby(sorted_list, key=lambda x: x["item_id"]) + ] + items_with_assets = [] + # Produce a dictionary in which each record is keyed by an item ID and contains a list of associated asset hrefs + for group in grouped_data: + item = {"item_id": group["item_id"], "assets": {}} + for file in group["data"]: + asset_type = file["asset_type"] + filename = file["filename"] + # Copy the asset definition and update the href + updated_asset = assets[file["asset_type"]].copy() + updated_asset["href"] = f"{file['prefix']}/{file['filename']}" + item["assets"][asset_type] = updated_asset + items_with_assets.append(item) + return items_with_assets + + +def construct_single_asset_items(discovered_files: List[str], assets: dict|None) -> dict: + items_with_assets = [] + asset_key = "default" + asset_value = {} + if assets: + asset_key = list(assets.keys())[0] + asset_value = assets[asset_key] + for uri in discovered_files: + # Each file gets its matched asset type and id + filename = uri.split("/")[-1] + filename_without_extension = Path(filename).stem + prefix = "/".join(uri.split("/")[:-1]) + item = { + "item_id": filename_without_extension, + "assets": { + asset_key: { + "title": "Default COG Layer", + "description": "Cloud optimized default layer to display on map", + "href": f"{prefix}/{filename}", + **asset_value + } + }, + } + items_with_assets.append(item) + return items_with_assets + + +def generate_payload(s3_prefix_key: str, payload: dict): + """Generate a payload and write it to an S3 file. + This function takes in a prefix for an S3 key and a dictionary containing a payload. + The function then writes the payload to an S3 file using the provided prefix and a randomly + generated UUID as the key. The key of the output file is then returned. + Parameters: + s3_prefix_key (str): The prefix for the S3 key where the output file will be written. + payload (dict): A dictionary containing the payload to be written to the output file. + + Returns: + str: The S3 key of the output file. 
+ """ + output_key = f"{s3_prefix_key}/s3_discover_output_{uuid4()}.json" + with smrt_open(output_key, "w") as file: + file.write(json.dumps(payload)) + return output_key + + +def propagate_forward_datetime_args(event): + """ + This function extracts datetime-related arguments from the input event dictionary. + The purpose is to forward these datetime arguments to other functions that may require them. + + The function looks for the keys "single_datetime", "start_datetime", "end_datetime", + and "datetime_range" in the event dictionary. If any of these keys are present, + it includes them in the output dictionary. + + Parameters: + event (dict): Input dictionary potentially containing datetime arguments. + + Returns: + dict: A new dictionary containing only the datetime-related keys from the input + that were present. If none of the specified keys are in the event, + the function returns an empty dictionary. + """ + keys = ["single_datetime", "start_datetime", "end_datetime", "datetime_range"] + return {key: event[key] for key in keys if key in event} + + +def s3_discovery_handler(event, chunk_size=2800, role_arn=None, bucket_output=None): + bucket = event.get("bucket") + prefix = event.get("prefix", "") + filename_regex = event.get("filename_regex", None) + collection = event.get("collection", prefix.rstrip("/")) + properties = event.get("properties", {}) + assets = event.get("assets") + id_regex = event.get("id_regex") + id_template = event.get("id_template", "{}") + date_fields = propagate_forward_datetime_args(event) + dry_run = event.get("dry_run", False) + if process_from := event.get("process_from_yyyy_mm_dd"): + process_from = datetime.strptime(process_from, "%Y-%m-%d").replace( + tzinfo=tzlocal() + ) + if last_execution := event.get("last_successful_execution"): + last_execution = datetime.fromisoformat(last_execution) + if dry_run: + print("Running discovery in dry run mode") + + payload = {**event, "objects": []} + slice = event.get("slice") + + bucket_output = os.environ.get("EVENT_BUCKET", bucket_output) + key = f"s3://{bucket_output}/events/{collection}" + records = 0 + out_keys = [] + discovered = 0 + + kwargs = assume_role(role_arn=role_arn) if role_arn else {} + s3client = boto3.client("s3", **kwargs) + s3_iterator = get_s3_resp_iterator( + bucket_name=bucket, prefix=prefix, s3_client=s3client + ) + file_uris = [ + f"s3://{bucket}/{obj['Key']}" + for obj in discover_from_s3( + s3_iterator, filename_regex, last_execution=process_from or last_execution + ) + ] + + if len(file_uris) == 0: + raise EmptyFileListError(f"No files discovered at bucket: {bucket}, prefix: {prefix}") + + # group only if more than 1 assets + if assets and len(assets.keys()) > 1: + items_with_assets = group_by_item(file_uris, id_regex, assets) + else: + # out of convenience, we might not always want to explicitly define assets + # or if only a single asset is defined, follow default flow + items_with_assets = construct_single_asset_items(file_uris, assets) + + if len(items_with_assets) == 0: + raise EmptyFileListError( + f"No items could be constructed for files at bucket: {bucket}, prefix: {prefix}" + ) + + # Update IDs using id_template + for item in items_with_assets: + item["item_id"] = id_template.format(item["item_id"]) + + item_count = 0 + for item in items_with_assets: + item_count += 1 + # Logic to ingest a 'slice' of data + if slice: + if item_count < slice[0]: # Skip until we reach the start of the slice + continue + if ( + item_count >= slice[1] + ): # Stop once we reach the end of the 
slice, while saving progress + break + file_obj = { + "collection": collection, + "item_id": item["item_id"], + "assets": item["assets"], + "properties": properties, + **date_fields, + } + + if dry_run and item_count < 10: + print("-DRYRUN- Example item") + print(json.dumps(file_obj)) + + payload["objects"].append(file_obj) + if records == chunk_size: + out_keys.append(generate_payload(s3_prefix_key=key, payload=payload)) + records = 0 + discovered += len(payload["objects"]) + payload["objects"] = [] + records += 1 + + if payload["objects"]: + out_keys.append(generate_payload(s3_prefix_key=key, payload=payload)) + discovered += len(payload["objects"]) + # We need to make sure the payload isn't too large for ECS overrides + try: + del event["assets"] + except KeyError: + pass + return {**event, "payload": out_keys, "discovered": discovered} diff --git a/sm2a/dags/veda_data_pipeline/utils/schemas.py b/sm2a/dags/veda_data_pipeline/utils/schemas.py new file mode 100644 index 00000000..c5f33b9e --- /dev/null +++ b/sm2a/dags/veda_data_pipeline/utils/schemas.py @@ -0,0 +1,15 @@ +# Description: Lightweight schema definitions + +from datetime import datetime +from typing import List, Union +from stac_pydantic.collection import Extent, TimeInterval + + +class DatetimeInterval(TimeInterval): + # reimplement stac_pydantic's TimeInterval to leverage datetime types + interval: List[List[Union[datetime, None]]] + + +class SpatioTemporalExtent(Extent): + # reimplement stac_pydantic's Extent to leverage datetime types + temporal: DatetimeInterval diff --git a/sm2a/dags/veda_data_pipeline/utils/submit_stac.py b/sm2a/dags/veda_data_pipeline/utils/submit_stac.py new file mode 100644 index 00000000..1d4edfca --- /dev/null +++ b/sm2a/dags/veda_data_pipeline/utils/submit_stac.py @@ -0,0 +1,136 @@ +import json +import os +import sys +from dataclasses import dataclass + +if sys.version_info >= (3, 8): + from typing import TypedDict +else: + from typing_extensions import TypedDict + +from typing import Any, Dict, Optional, Union + +import boto3 +import requests + + +class InputBase(TypedDict): + dry_run: Optional[Any] + + +class S3LinkInput(InputBase): + stac_file_url: str + + +class StacItemInput(InputBase): + stac_item: Dict[str, Any] + + +class AppConfig(TypedDict): + cognito_domain: str + client_id: str + client_secret: str + scope: str + + +class Creds(TypedDict): + access_token: str + expires_in: int + token_type: str + + +@dataclass +class IngestionApi: + base_url: str + token: str + + @classmethod + def from_veda_auth_secret(cls, *, secret_id: str, base_url: str) -> "IngestionApi": + cognito_details = cls._get_cognito_service_details(secret_id) + credentials = cls._get_app_credentials(**cognito_details) + return cls(token=credentials["access_token"], base_url=base_url) + + @staticmethod + def _get_cognito_service_details(secret_id: str) -> AppConfig: + client = boto3.client("secretsmanager") + response = client.get_secret_value(SecretId=secret_id) + return json.loads(response["SecretString"]) + + @staticmethod + def _get_app_credentials( + cognito_domain: str, client_id: str, client_secret: str, scope: str, **kwargs + ) -> Creds: + response = requests.post( + f"{cognito_domain}/oauth2/token", + headers={ + "Content-Type": "application/x-www-form-urlencoded", + }, + auth=(client_id, client_secret), + data={ + "grant_type": "client_credentials", + # A space-separated list of scopes to request for the generated access token. 
+ "scope": scope, + }, + ) + try: + response.raise_for_status() + except Exception as ex: + print(response.text) + raise f"Error, {ex}" + return response.json() + + def submit(self, event: Dict[str, Any], endpoint: str) -> Dict[str, Any]: + headers = { + "Authorization": f"Bearer {self.token}", + "Content-Type": "application/json", + } + response = requests.post( + f"{self.base_url.rstrip('/')}{endpoint}", + json=event, + headers=headers, + ) + try: + response.raise_for_status() + except Exception as e: + print(response.text) + raise e + return response.json() + + +def submission_handler( + event: Union[S3LinkInput, StacItemInput, Dict[str, Any]], + endpoint: str = "/ingestions", + cognito_app_secret=None, + stac_ingestor_api_url=None, + context=None, +) -> None: + if context is None: + context = {} + + stac_item = event + + if stac_item.get("dry_run"): + print("Dry run, not inserting, would have inserted:") + print(json.dumps(stac_item, indent=2)) + return + + cognito_app_secret = cognito_app_secret or os.getenv("COGNITO_APP_SECRET") + stac_ingestor_api_url = stac_ingestor_api_url or os.getenv("STAC_INGESTOR_API_URL") + + ingestor = IngestionApi.from_veda_auth_secret( + secret_id=cognito_app_secret, + base_url=stac_ingestor_api_url, + ) + ingestor.submit(event=stac_item, endpoint=endpoint) + # print("Successfully submitted STAC item") + + +if __name__ == "__main__": + filename = "example.ndjson" + sample_event = { + "stac_file_url": "example.ndjson", + # or + "stac_item": {}, + "type": "collections", + } + submission_handler(sample_event) diff --git a/sm2a/dags/veda_data_pipeline/utils/transfer.py b/sm2a/dags/veda_data_pipeline/utils/transfer.py new file mode 100644 index 00000000..20823f37 --- /dev/null +++ b/sm2a/dags/veda_data_pipeline/utils/transfer.py @@ -0,0 +1,110 @@ +import re + +import boto3 +from airflow.exceptions import AirflowException + + +def assume_role(role_arn, session_name="veda-data-airflow_s3-discovery"): + sts = boto3.client("sts") + print(f"Assuming role: {role_arn}") + credentials = sts.assume_role( + RoleArn=role_arn, + RoleSessionName=session_name, + ) + creds = credentials["Credentials"] + return { + "aws_access_key_id": creds["AccessKeyId"], + "aws_secret_access_key": creds.get("SecretAccessKey"), + "aws_session_token": creds.get("SessionToken"), + } + + +def get_matching_files(s3_client, bucket, prefix, regex_pattern): + matching_files = [] + + response = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix) + while True: + for obj in response["Contents"]: + file_key = obj["Key"] + if re.match(regex_pattern, file_key): + matching_files.append(file_key) + + if "NextContinuationToken" in response: + response = s3_client.list_objects_v2( + Bucket=bucket, + Prefix=prefix, + ContinuationToken=response["NextContinuationToken"], + ) + else: + break + + return matching_files + + +def transfer_files_within_s3( + s3_client, origin_bucket, matching_files, destination_bucket, collection +): + for file_key in matching_files: + filename = file_key.split("/")[-1] + # print(f"Transferring file: {filename}") + target_key = f"{collection}/{filename}" + copy_source = {"Bucket": origin_bucket, "Key": file_key} + + # We can use the etag to check if the file has already been copied and avoid duplication of effort + # by using the CopySourceIfNoneMatch parameter below. 
+ try: + target_metadata = s3_client.head_object( + Bucket=destination_bucket, Key=target_key + ) + target_etag = target_metadata["ETag"] + # print(f"File already exists, checking Etag: {filename}") + s3_client.copy_object( + CopySource=copy_source, + Bucket=destination_bucket, + Key=target_key, + CopySourceIfNoneMatch=target_etag, + ) + except s3_client.exceptions.ClientError as err: + if err.response["Error"]["Code"] == "404": + # print(f"Copying file: {filename}") + s3_client.copy_object( + CopySource=copy_source, + Bucket=destination_bucket, + Key=target_key + ) + + +def data_transfer_handler(event, role_arn=None): + origin_bucket = event.get("origin_bucket") + origin_prefix = event.get("origin_prefix") + filename_regex = event.get("filename_regex") + target_bucket = event.get("target_bucket") + collection = event.get("collection") + + kwargs = assume_role(role_arn=role_arn) if role_arn else {} + s3client = boto3.client("s3", **kwargs) + matching_files = get_matching_files( + s3_client=s3client, + bucket=origin_bucket, + prefix=origin_prefix, + regex_pattern=filename_regex, + ) + + if len(matching_files) == 0: + raise AirflowException("No matching files found") + + if not event.get("dry_run"): + transfer_files_within_s3( + s3_client=s3client, + origin_bucket=origin_bucket, + matching_files=matching_files, + destination_bucket=target_bucket, + collection=collection, + ) + else: + print( + f"Would have copied {len(matching_files)} files from {origin_bucket} to {target_bucket}" + ) + print(f"Files matched: {matching_files}") + + return {**event} diff --git a/sm2a/dags/veda_data_pipeline/veda_collection_pipeline.py b/sm2a/dags/veda_data_pipeline/veda_collection_pipeline.py new file mode 100644 index 00000000..8e67584a --- /dev/null +++ b/sm2a/dags/veda_data_pipeline/veda_collection_pipeline.py @@ -0,0 +1,49 @@ +import pendulum +from airflow import DAG +from airflow.operators.dummy_operator import DummyOperator as EmptyOperator +from airflow.utils.trigger_rule import TriggerRule +from veda_data_pipeline.groups.collection_group import collection_task_group + +dag_doc_md = """ +### Collection Creation and Ingestion +Generates a collection based on the Dataset model and ingests into the catalog +#### Notes +- This DAG can run with the following configuration
+```json +{ + "collection": "collection-id", + "data_type": "cog", + "description": "collection description", + "is_periodic": true, + "license": "collection-LICENSE", + "time_density": "year", + "title": "collection-title" +} +``` +""" + +dag_args = { + "start_date": pendulum.today("UTC").add(days=-1), + "schedule_interval": None, + "catchup": False, + "doc_md": dag_doc_md, + "tags": ["collection"], +} + +template_dag_run_conf = { + "collection": "", + "data_type": "cog", + "description": "", + "is_periodic": "", + "license": "", + "time_density": "", + "title": "" +} + +with DAG("veda_collection_pipeline", params=template_dag_run_conf, **dag_args) as dag: + start = EmptyOperator(task_id="start", dag=dag) + end = EmptyOperator(task_id="end", trigger_rule=TriggerRule.NONE_FAILED_MIN_ONE_SUCCESS, dag=dag) + + collection_grp = collection_task_group() + + start >> collection_grp >> end diff --git a/sm2a/dags/veda_data_pipeline/veda_dataset_pipeline.py b/sm2a/dags/veda_data_pipeline/veda_dataset_pipeline.py new file mode 100644 index 00000000..d456a80a --- /dev/null +++ b/sm2a/dags/veda_data_pipeline/veda_dataset_pipeline.py @@ -0,0 +1,80 @@ +import pendulum +from airflow import DAG +from airflow.decorators import task +from airflow.operators.dummy_operator import DummyOperator as EmptyOperator +from airflow.utils.trigger_rule import TriggerRule +from veda_data_pipeline.groups.collection_group import collection_task_group +from veda_data_pipeline.groups.discover_group import subdag_discover + +dag_doc_md = """ +### Dataset Pipeline +Generates a collection and triggers the file discovery process +#### Notes +- This DAG can run with the following configuration
+```json +{ + "collection": "collection-id", + "data_type": "cog", + "description": "collection description", + "discovery_items": + [ + { + "bucket": "veda-data-store-staging", + "datetime_range": "year", + "discovery": "s3", + "filename_regex": "^(.*).tif$", + "prefix": "example-prefix/" + } + ], + "is_periodic": true, + "license": "collection-LICENSE", + "time_density": "year", + "title": "collection-title" +} +``` +""" + +dag_args = { + "start_date": pendulum.today("UTC").add(days=-1), + "schedule_interval": None, + "catchup": False, + "doc_md": dag_doc_md, + "tags": ["collection", "discovery"], +} + +@task +def extract_discovery_items(**kwargs): + ti = kwargs.get("ti") + discovery_items = ti.dag_run.conf.get("discovery_items") + print(discovery_items) + return discovery_items + +template_dag_run_conf = { + "collection": "", + "data_type": "cog", + "description": "", + "discovery_items": + [ + { + "bucket": "", + "datetime_range": "", + "discovery": "s3", + "filename_regex": "", + "prefix": "" + } + ], + "is_periodic": "", + "license": "", + "time_density": "", + "title": "" +} + +with DAG("veda_dataset_pipeline", params=template_dag_run_conf, **dag_args) as dag: + start = EmptyOperator(task_id="start", dag=dag) + end = EmptyOperator(task_id="end", dag=dag) + + collection_grp = collection_task_group() + discover_grp = subdag_discover.expand(event=extract_discovery_items()) + + start >> collection_grp >> discover_grp >> end + diff --git a/sm2a/dags/veda_data_pipeline/veda_discover_pipeline.py b/sm2a/dags/veda_data_pipeline/veda_discover_pipeline.py new file mode 100644 index 00000000..37a5d520 --- /dev/null +++ b/sm2a/dags/veda_data_pipeline/veda_discover_pipeline.py @@ -0,0 +1,92 @@ +import pendulum +from airflow import DAG +from airflow.operators.dummy_operator import DummyOperator +from airflow.utils.trigger_rule import TriggerRule +from veda_data_pipeline.groups.discover_group import subdag_discover + +dag_doc_md = """ +### Discover files from S3 +#### Purpose +This DAG discovers files from either S3 and/or CMR then runs a DAG id `veda_ingest`. +The DAG `veda_ingest` will run in parallel processing (2800 files per each DAG) +#### Notes +- This DAG can run with the following configuration
+```json +{ + "collection": "collection-id", + "bucket": "veda-data-store-staging", + "prefix": "s3-prefix/", + "filename_regex": "^(.*).tif$", + "id_regex": ".*_(.*).tif$", + "process_from_yyyy_mm_dd": "YYYY-MM-DD", + "id_template": "example-id-prefix-{}", + "datetime_range": "month", + "last_successful_execution": datetime(2015,01,01), + "assets": { + "asset1": { + "title": "Asset type 1", + "description": "First of a multi-asset item.", + "regex": ".*asset1.*", + }, + "asset2": { + "title": "Asset type 2", + "description": "Second of a multi-asset item.", + "regex": ".*asset2.*", + }, + } +} +``` +- [Supports linking to external content](https://github.com/NASA-IMPACT/veda-data-pipelines) +""" + +dag_args = { + "start_date": pendulum.today("UTC").add(days=-1), + "catchup": False, + "doc_md": dag_doc_md, + "is_paused_upon_creation": False, +} + +templat_dag_run_conf = { + "collection": "", + "bucket": "", + "prefix": "/", + "filename_regex": "", + "id_regex": "", + "id_template": "", + "datetime_range": "||", + "assets": { + "": { + "title": "", + "description": "", + "regex": "", + }, + "": { + "title": "", + "description": "", + "regex": "", + }, + }, +} + + +def get_discover_dag(id, event={}): + params_dag_run_conf = event or templat_dag_run_conf + with DAG( + id, + schedule_interval=event.get("schedule"), + params=params_dag_run_conf, + **dag_args + ) as dag: + start = DummyOperator(task_id="Start", dag=dag) + end = DummyOperator( + task_id="End", trigger_rule=TriggerRule.ONE_SUCCESS, dag=dag + ) + + discover_grp = subdag_discover(event) + + start >> discover_grp >> end + + return dag + + +get_discover_dag("veda_discover") diff --git a/sm2a/dags/veda_data_pipeline/veda_process_raster_pipeline.py b/sm2a/dags/veda_data_pipeline/veda_process_raster_pipeline.py new file mode 100644 index 00000000..2555c6a9 --- /dev/null +++ b/sm2a/dags/veda_data_pipeline/veda_process_raster_pipeline.py @@ -0,0 +1,52 @@ +import pendulum +from airflow import DAG +from airflow.operators.dummy_operator import DummyOperator +from airflow.utils.trigger_rule import TriggerRule +from veda_data_pipeline.groups.processing_group import subdag_process + +dag_doc_md = """ +### Build and submit stac +#### Purpose +This DAG is supposed to be triggered by `veda_discover`. But you still can trigger this DAG manually or through an API + +#### Notes +- This DAG can run with the following configuration
+```json +{ + "collection": "geoglam", + "prefix": "geoglam/", + "bucket": "veda-data-store-staging", + "filename_regex": "^(.*).tif$", + "discovery": "s3", + "datetime_range": "month", + "discovered": 33, + "payload": "s3://veda-uah-sit-mwaa-853558080719/events/geoglam/s3_discover_output_6c46b57a-7474-41fe-977a-.json" +} +``` +- [Supports linking to external content](https://github.com/NASA-IMPACT/veda-data-pipelines) +""" + +template_dag_run_conf = { + "collection": "", + "prefix": "/", + "bucket": "", + "filename_regex": "", + "discovery": "", + "datetime_range": "|", + "payload": "> process_grp >> end diff --git a/sm2a/dags/veda_data_pipeline/veda_process_vector_pipeline.py b/sm2a/dags/veda_data_pipeline/veda_process_vector_pipeline.py new file mode 100644 index 00000000..89c75848 --- /dev/null +++ b/sm2a/dags/veda_data_pipeline/veda_process_vector_pipeline.py @@ -0,0 +1,110 @@ +import pendulum +from airflow import DAG +from airflow.models.variable import Variable +from airflow.providers.amazon.aws.operators.ecs import EcsRunTaskOperator +from airflow.operators.dummy_operator import DummyOperator +from airflow.utils.trigger_rule import TriggerRule + +from datetime import timedelta + +dag_doc_md = """ +### Build and submit stac +#### Purpose +This DAG is supposed to be triggered by `veda_discover`. But you still can trigger this DAG manually or through an API + +#### Notes +- This DAG can run with the following configuration
+```json +{ + "collection": "geoglam", + "prefix": "geoglam/", + "bucket": "veda-data-store-staging", + "filename_regex": "^(.*).tif$", + "discovery": "s3", + "datetime_range": "month", + "upload": false, + "cogify": false, + "discovered": 33, + "payload": "s3://veda-uah-sit-mwaa-853558080719/events/geoglam/s3_discover_output_6c46b57a-7474-41fe-977a-19d164531cdc.json" +} +``` +- [Supports linking to external content](https://github.com/NASA-IMPACT/veda-data-pipelines) +""" + +templat_dag_run_conf = { + "collection": "", + "prefix": "/", + "bucket": "", + "filename_regex": "", + "discovery": "|cmr", + "datetime_range": "|", + "upload": " | true", + "cogify": "false | true", + "payload": "> ingest_vector >> end diff --git a/sm2a/dags/veda_data_pipeline/veda_transfer_pipeline.py b/sm2a/dags/veda_data_pipeline/veda_transfer_pipeline.py new file mode 100644 index 00000000..6c1b4f3a --- /dev/null +++ b/sm2a/dags/veda_data_pipeline/veda_transfer_pipeline.py @@ -0,0 +1,50 @@ +import pendulum +from airflow import DAG +from airflow.operators.dummy_operator import DummyOperator +from airflow.utils.trigger_rule import TriggerRule +from veda_data_pipeline.groups.transfer_group import subdag_transfer + +dag_doc_md = """ +### Discover files from S3 +#### Purpose +This DAG is used to transfer files that are to permanent locations for indexing with STAC. +#### Notes +- This DAG can run with a configuration similar to this
+```json +{ + "origin_bucket": "covid-eo-dashboard", + "origin_prefix": "s3-prefix/", + "filename_regex": "^(.*).tif$", + "target_bucket": "target_s3_bucket", + "collection": "collection-id", + "cogify": false, + "dry_run": true +} +``` +- [Supports linking to external content](https://github.com/NASA-IMPACT/veda-data-pipelines) +""" + +dag_args = { + "start_date": pendulum.today("UTC").add(days=-1), + "schedule_interval": None, + "catchup": False, + "doc_md": dag_doc_md, +} + +templat_dag_run_conf = { + "origin_bucket": "", + "origin_prefix": "/", + "filename_regex": "", + "target_bucket": "", + "collection": "", + "cogify": "true|false", + "dry_run": "true|false", +} + +with DAG("veda_transfer", params=templat_dag_run_conf, **dag_args) as dag: + start = DummyOperator(task_id="Start", dag=dag) + end = DummyOperator(task_id="End", trigger_rule=TriggerRule.ONE_SUCCESS, dag=dag) + + transfer_grp = subdag_transfer() + + start >> transfer_grp >> end diff --git a/sm2a/infrastructure/main.tf b/sm2a/infrastructure/main.tf index 728ee780..a23af0a3 100644 --- a/sm2a/infrastructure/main.tf +++ b/sm2a/infrastructure/main.tf @@ -19,7 +19,7 @@ resource "random_password" "password" { module "sma-base" { - source = "https://github.com/NASA-IMPACT/self-managed-apache-airflow/releases/download/v1.0.23.beta/self-managed-apache-airflow.zip" + source = "https://github.com/NASA-IMPACT/self-managed-apache-airflow/releases/download/v1.0.24.beta/self-managed-apache-airflow.zip" airflow_db = var.airflow_db fernet_key = var.fernet_key prefix = var.prefix From bd65501abce2d960ad89c0e08174dd063766e73f Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 17 Jul 2024 16:41:34 -0500 Subject: [PATCH 21/97] Make the task architecture configurable --- .github/actions/terraform-deploy-sm2a/action.yml | 5 ++++- sm2a/infrastructure/main.tf | 2 +- sm2a/scripts/deploy.sh | 1 - 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/actions/terraform-deploy-sm2a/action.yml b/.github/actions/terraform-deploy-sm2a/action.yml index de44a3b1..a4f0edc7 100644 --- a/.github/actions/terraform-deploy-sm2a/action.yml +++ b/.github/actions/terraform-deploy-sm2a/action.yml @@ -19,6 +19,9 @@ inputs: required: false type: string default: "." 
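+      # Name of the AWS Secrets Manager secret that scripts/generate_env_file.py
+      # turns into the .env file used for the SM2A deployment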
+ env_aws_secret_name: + type: string + required: true sm2a_dir: required: false type: string @@ -49,7 +52,7 @@ runs: AWS_DEFAULT_REGION: ${{ inputs.aws-region }} AWS_REGION: ${{ inputs.aws-region }} run: | - python scripts/generate_env_file.py --secret-id ${{ inputs.env_aws_secret_name }} --env-file ${{ inputs.env-file }} + python scripts/generate_env_file.py --secret-id ${{ inputs.env-aws_secret_name }} --env-file ${{ inputs.env-file }} - name: Setup Terraform uses: hashicorp/setup-terraform@v1 diff --git a/sm2a/infrastructure/main.tf b/sm2a/infrastructure/main.tf index a23af0a3..d5a95657 100644 --- a/sm2a/infrastructure/main.tf +++ b/sm2a/infrastructure/main.tf @@ -19,7 +19,7 @@ resource "random_password" "password" { module "sma-base" { - source = "https://github.com/NASA-IMPACT/self-managed-apache-airflow/releases/download/v1.0.24.beta/self-managed-apache-airflow.zip" + source = "https://github.com/NASA-IMPACT/self-managed-apache-airflow/releases/download/v1.0.24.beta.1/self-managed-apache-airflow.zip" airflow_db = var.airflow_db fernet_key = var.fernet_key prefix = var.prefix diff --git a/sm2a/scripts/deploy.sh b/sm2a/scripts/deploy.sh index 1b068c2e..0d9aafca 100755 --- a/sm2a/scripts/deploy.sh +++ b/sm2a/scripts/deploy.sh @@ -88,7 +88,6 @@ function check_create_remote_state { cd ./infrastructure generate_terraform_variables check_create_remote_state $AWS_REGION $STATE_BUCKET_NAME $STATE_DYNAMO_TABLE -cat terraform.tf read -rp 'action [init|plan|deploy]: ' ACTION case $ACTION in init) From 6ef903bc029b4fa5bf53111081fecc6c460ec384 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 17 Jul 2024 16:43:11 -0500 Subject: [PATCH 22/97] Make the task architecture configurable --- .github/actions/terraform-deploy-sm2a/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/terraform-deploy-sm2a/action.yml b/.github/actions/terraform-deploy-sm2a/action.yml index a4f0edc7..678edc3e 100644 --- a/.github/actions/terraform-deploy-sm2a/action.yml +++ b/.github/actions/terraform-deploy-sm2a/action.yml @@ -52,7 +52,7 @@ runs: AWS_DEFAULT_REGION: ${{ inputs.aws-region }} AWS_REGION: ${{ inputs.aws-region }} run: | - python scripts/generate_env_file.py --secret-id ${{ inputs.env-aws_secret_name }} --env-file ${{ inputs.env-file }} + python scripts/generate_env_file.py --secret-id ${{ inputs.env_aws_secret_name }} --env-file ${{ inputs.env-file }} - name: Setup Terraform uses: hashicorp/setup-terraform@v1 From 0be6c9757957c03e6c19c23946d9e1fba8c778f0 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 17 Jul 2024 16:57:23 -0500 Subject: [PATCH 23/97] Make the task architecture configurable --- sm2a/airflow_services/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sm2a/airflow_services/Dockerfile b/sm2a/airflow_services/Dockerfile index b8d5ceeb..c2e63b72 100644 --- a/sm2a/airflow_services/Dockerfile +++ b/sm2a/airflow_services/Dockerfile @@ -1,5 +1,7 @@ FROM --platform=linux/amd64 apache/airflow:slim-2.8.4-python3.11 + ARG AIRFLOW_VERSION=2.8.4 + USER root # `apt-get autoremove` is used to remove packages that were automatically installed to satisfy # dependencies for other packages and are now no longer needed. 
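Note: the commits above wire the SM2A deployment to a Secrets-Manager-backed `.env` file. A rough local equivalent of what the GitHub action drives is sketched below; the secret name is the dev example used elsewhere in this patch, and passing the env file as the first argument of `sm2a/scripts/deploy.sh` is an assumption based on the non-SM2A deploy action.

```bash
# Run from the sm2a/ directory with AWS credentials for the target account exported.
# 1. Render the .env file from the deployment secret (same script the CI action calls).
python scripts/generate_env_file.py \
  --secret-id veda-sm2a-dev-deployment-secrets \
  --env-file .env

# 2. Answer the interactive "action [init|plan|deploy]" prompt non-interactively,
#    mirroring the two-step init/deploy pattern used by the deploy action.
./scripts/deploy.sh .env <<< init
./scripts/deploy.sh .env <<< deploy
```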
From ff6b10782efefef2a4e8f22154df570abf3a3893 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Thu, 18 Jul 2024 09:49:33 -0500 Subject: [PATCH 24/97] Deploy SM2A --- sm2a/dags/generate_dags.py | 9 +++------ sm2a/infrastructure/configuration/airflow.cfg | 8 ++++---- sm2a/infrastructure/main.tf | 2 +- sm2a/infrastructure/variables.tf | 20 +------------------ 4 files changed, 9 insertions(+), 30 deletions(-) diff --git a/sm2a/dags/generate_dags.py b/sm2a/dags/generate_dags.py index f20db9f3..7ab43d62 100644 --- a/sm2a/dags/generate_dags.py +++ b/sm2a/dags/generate_dags.py @@ -13,11 +13,8 @@ def generate_dags(): import json from pathlib import Path - - mwaa_stac_conf = Variable.get("MWAA_STACK_CONF", deserialize_json=True) bucket = mwaa_stac_conf["EVENT_BUCKET"] - client = boto3.client("s3") response = client.list_objects_v2(Bucket=bucket, Prefix="collections/") @@ -35,9 +32,9 @@ def generate_dags(): discovery_configs = [discovery_configs] scheduled_discovery_configs = [ discovery_config - for discovery_config in discovery_configs - if discovery_config.get("schedule") - ] + for discovery_config in discovery_configs + if discovery_config.get("schedule") + ] for idx, discovery_config in enumerate(scheduled_discovery_configs): id = f"discover-{file_name}" if idx > 0: diff --git a/sm2a/infrastructure/configuration/airflow.cfg b/sm2a/infrastructure/configuration/airflow.cfg index 5b78f1df..49591f28 100755 --- a/sm2a/infrastructure/configuration/airflow.cfg +++ b/sm2a/infrastructure/configuration/airflow.cfg @@ -20,10 +20,10 @@ celery_config_options = configuration.celery_config.CELERY_CONFIG [github_enterprise] api_rev = v3 host = github.com -client_id = Iv23liBjz91G9wLwnaK6 -client_secret = 88b73528341a884bb418852d225f4913e69df478 +client_id = Iv23lil9JEmXAM6QJlFe +client_secret = 8cbd483d2cb4e73599dffba93dbd0295ef0830c5 oauth_callback_route = /home -allowed_teams = csda-msfc +allowed_teams = VEDA [webserver] authenticate = True @@ -60,4 +60,4 @@ remote_logging = true # Setting full_url_mode to false allows us to use multiple fields when storing connections # Source code: https://github.com/apache/airflow/blob/main/airflow/providers/amazon/aws/secrets/secrets_manager.py backend = airflow.providers.amazon.aws.secrets.secrets_manager.SecretsManagerBackend -backend_kwargs = {"connections_prefix": "sm2a-staging/airflow/connections", "variables_prefix": "sm2a-staging/airflow/variables","connections_lookup_pattern": "_default$", "variables_lookup_pattern": "^aws_", "config_prefix": "sm2a-staging/airflow/config"} +backend_kwargs = {"connections_prefix": "sm2a-dev/airflow/connections", "variables_prefix": "sm2a-dev/airflow/variables","connections_lookup_pattern": "_default$", "variables_lookup_pattern": "^aws_", "config_prefix": "sm2a-dev/airflow/config"} diff --git a/sm2a/infrastructure/main.tf b/sm2a/infrastructure/main.tf index d5a95657..c58be709 100644 --- a/sm2a/infrastructure/main.tf +++ b/sm2a/infrastructure/main.tf @@ -19,7 +19,7 @@ resource "random_password" "password" { module "sma-base" { - source = "https://github.com/NASA-IMPACT/self-managed-apache-airflow/releases/download/v1.0.24.beta.1/self-managed-apache-airflow.zip" + source = "https://github.com/NASA-IMPACT/self-managed-apache-airflow/releases/download/v1.0.25/self-managed-apache-airflow.zip" airflow_db = var.airflow_db fernet_key = var.fernet_key prefix = var.prefix diff --git a/sm2a/infrastructure/variables.tf b/sm2a/infrastructure/variables.tf index dbd4cea7..a89f1dd4 100644 --- a/sm2a/infrastructure/variables.tf +++ 
b/sm2a/infrastructure/variables.tf @@ -46,25 +46,7 @@ variable "custom_worker_policy_statement" { Action = list(string) Resource = list(string) })) - default = [ - { - Effect = "Allow" - Action = [ - "dynamodb:UpdateItem", - "dynamodb:PutItem", - "dynamodb:GetItem", - "dynamodb:BatchWriteItem", - "dynamodb:BatchGetItem" - ] - "Resource" : [ - "arn:aws:dynamodb:us-west-2:*:table/*_sha256_store/*", - "arn:aws:dynamodb:us-west-2:*:table/*_sha256_store" - ] - - } - - - ] + default = [] } From b4d281d03975f0247b02aa2744bcbeaf890abc1e Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Thu, 18 Jul 2024 09:54:14 -0500 Subject: [PATCH 25/97] Deploy SM2A to dev --- .github/actions/terraform-deploy-sm2a/action.yml | 4 ---- .github/actions/terraform-deploy/action.yml | 4 +++- .github/workflows/cicd.yml | 3 ++- sm2a/infrastructure/main.tf | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/actions/terraform-deploy-sm2a/action.yml b/.github/actions/terraform-deploy-sm2a/action.yml index 678edc3e..d3ad6426 100644 --- a/.github/actions/terraform-deploy-sm2a/action.yml +++ b/.github/actions/terraform-deploy-sm2a/action.yml @@ -15,10 +15,6 @@ inputs: required: false type: string default: github-actions-deployment - current_dir: - required: false - type: string - default: "." env_aws_secret_name: type: string required: true diff --git a/.github/actions/terraform-deploy/action.yml b/.github/actions/terraform-deploy/action.yml index e4497394..3e71c394 100644 --- a/.github/actions/terraform-deploy/action.yml +++ b/.github/actions/terraform-deploy/action.yml @@ -60,4 +60,6 @@ runs: shell: bash working-directory: ${{ inputs.dir }} run: | - echo "skip" + ./scripts/deploy.sh ${{ inputs.env-file }} <<< init + ./scripts/deploy.sh ${{ inputs.env-file }} <<< deploy + diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index f72dd6df..8c801582 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -80,8 +80,9 @@ jobs: env_aws_secret_name: ${{ secrets.ENV_AWS_SECRET_NAME }} - name: Run SM2A deployment + if: ${{ needs.define-environment.outputs.env_name }} = "development" uses: "./.github/actions/terraform-deploy-sm2a" with: - env_aws_secret_name: veda-sm2a-dev-deployment-secrets + env_aws_secret_name: ${{ vars.SM2A_ENVS_DEPLOYMENT_SECRET_NAME }} env-file: .env aws-region: us-west-2 diff --git a/sm2a/infrastructure/main.tf b/sm2a/infrastructure/main.tf index c58be709..77181a9b 100644 --- a/sm2a/infrastructure/main.tf +++ b/sm2a/infrastructure/main.tf @@ -19,7 +19,7 @@ resource "random_password" "password" { module "sma-base" { - source = "https://github.com/NASA-IMPACT/self-managed-apache-airflow/releases/download/v1.0.25/self-managed-apache-airflow.zip" + source = "https://github.com/NASA-IMPACT/self-managed-apache-airflow/releases/download/v1.1.0/self-managed-apache-airflow.zip" airflow_db = var.airflow_db fernet_key = var.fernet_key prefix = var.prefix From 5785223dc84217421c29eea3fcff0ab1e3182b0c Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Thu, 18 Jul 2024 10:13:54 -0500 Subject: [PATCH 26/97] Fix dependencies with github --- sm2a/airflow_services/requirements.txt | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sm2a/airflow_services/requirements.txt b/sm2a/airflow_services/requirements.txt index 60052c75..b1e99bb7 100644 --- a/sm2a/airflow_services/requirements.txt +++ b/sm2a/airflow_services/requirements.txt @@ -4,18 +4,20 @@ cryptography # To use SQS as a broker in Celery, you need to install pycurl. 
# https://github.com/saleor/saleor/issues/8804 pycurl - +psycopg2-binary apache-airflow-providers-celery +airflow_multi_dagrun +apache-airflow-providers-postgres +apache-airflow-providers-slack +apache-airflow-providers-slack[http] +apache-airflow[github_enterprise] affine netCDF4 requests rio-cogeo smart-open -airflow_multi_dagrun -apache-airflow-providers-postgres apache-airflow-providers-common-sql typing-extensions -psycopg2-binary pyOpenSSL stac-pydantic fsspec From de540e186cb1737fe9c54360aab28b5fab2626b5 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Thu, 18 Jul 2024 10:33:48 -0500 Subject: [PATCH 27/97] Add SM2A to VEDA data pipeline --- .github/workflows/cicd.yml | 3 --- sm2a/README.md | 4 ++-- sm2a/docker-compose.yml | 2 +- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index 8c801582..99abf822 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -10,7 +10,6 @@ on: - main - dev - production - - deploy-sm2a pull_request: branches: - main @@ -39,8 +38,6 @@ jobs: run: | if [ "${{ github.ref }}" = "refs/heads/main" ]; then echo "env_name=staging" >> $GITHUB_OUTPUT - elif [ "${{ github.ref }}" = "refs/heads/deploy-sm2a" ]; then - echo "env_name=development" >> $GITHUB_OUTPUT elif [ "${{ github.ref }}" = "refs/heads/dev" ]; then echo "env_name=development" >> $GITHUB_OUTPUT elif [ "${{ github.ref }}" = "refs/heads/production" ]; then diff --git a/sm2a/README.md b/sm2a/README.md index c867bb53..b64bf5ae 100644 --- a/sm2a/README.md +++ b/sm2a/README.md @@ -1,6 +1,6 @@ -# CSDA-data-pipelines +# VEDA-data-pipelines -This repo houses function code and deployment code for CSDA projects. +This repo houses function code and deployment code for VEDA projects. 
## Project layout diff --git a/sm2a/docker-compose.yml b/sm2a/docker-compose.yml index fbc15257..c441b6c7 100644 --- a/sm2a/docker-compose.yml +++ b/sm2a/docker-compose.yml @@ -17,7 +17,7 @@ x-airflow-common: AIRFLOW__WEBSERVER__SECRET_KEY: "Ly8wMU8r5K7jPy58M3GpkZbXDNyJz8HiJll3pu8DbIM=" AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@airflow-db/airflow AIRFLOW__CELERY__BROKER_URL: sqs://user:pass@celery-broker:9324/ - AIRFLOW__WEBSERVER__INSTANCE_NAME: "${AIRFLOW__WEBSERVER__INSTANCE_NAME:-CSDA-SM2A-Airflow}" + AIRFLOW__WEBSERVER__INSTANCE_NAME: "${AIRFLOW__WEBSERVER__INSTANCE_NAME:-VEDA-SM2A-Airflow}" AIRFLOW__LOGGING__LOGGING_LEVEL: DEBUG # Gotcha: Even though we set this to "True" in airflow.cfg, an environment variable overrides it AIRFLOW__CORE__LOAD_EXAMPLES: false From 5b3f88355e59a24671efa1e7fb2c427fecaa398c Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Mon, 29 Jul 2024 09:32:10 -0500 Subject: [PATCH 28/97] Upgrade sm2a base --- sm2a/infrastructure/main.tf | 2 +- sm2a/infrastructure/variables.tf | 32 +++++++++++++++++++++++++------- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/sm2a/infrastructure/main.tf b/sm2a/infrastructure/main.tf index 77181a9b..5c8f3940 100644 --- a/sm2a/infrastructure/main.tf +++ b/sm2a/infrastructure/main.tf @@ -19,7 +19,7 @@ resource "random_password" "password" { module "sma-base" { - source = "https://github.com/NASA-IMPACT/self-managed-apache-airflow/releases/download/v1.1.0/self-managed-apache-airflow.zip" + source = "https://github.com/NASA-IMPACT/self-managed-apache-airflow/releases/download/v1.1.3/self-managed-apache-airflow.zip" airflow_db = var.airflow_db fernet_key = var.fernet_key prefix = var.prefix diff --git a/sm2a/infrastructure/variables.tf b/sm2a/infrastructure/variables.tf index a89f1dd4..d216de13 100644 --- a/sm2a/infrastructure/variables.tf +++ b/sm2a/infrastructure/variables.tf @@ -102,19 +102,19 @@ variable "rds_configuration" { rds_instance_class = "db.t4g.medium", rds_allocated_storage = 20, rds_max_allocated_storage = 100, - rds_engine_version = "13.13" + rds_engine_version = "13" }, staging = { rds_instance_class = "db.t4g.large", rds_allocated_storage = 40, rds_max_allocated_storage = 100, - rds_engine_version = "13.13" + rds_engine_version = "13" }, prod = { rds_instance_class = "db.r5.xlarge", rds_allocated_storage = 100, rds_max_allocated_storage = 200, - rds_engine_version = "13.13" + rds_engine_version = "13" } } } @@ -180,10 +180,28 @@ variable "gh_team_name" { } +variable "custom_worker_policy_statement" { + type = list(object({ + Effect = string + Action = list(string) + Resource = list(string) + })) + default = [ + { + Effect = "Allow" + Action = [ + "sts:AssumeRole", + "iam:PassRole", + "logs:GetLogEvents" + ] + "Resource" : [ + "*" + ] + + } + + ] -variable "airflow_custom_variables" { - description = "Airflow custom variables" - type = map(string) - default = {} } + From 9a26baf9312ec742a0bacc28e38f43dee36aae5e Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Mon, 29 Jul 2024 09:36:38 -0500 Subject: [PATCH 29/97] Add deployment env example --- sm2a/.deploy_env_example | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 sm2a/.deploy_env_example diff --git a/sm2a/.deploy_env_example b/sm2a/.deploy_env_example new file mode 100644 index 00000000..53247eed --- /dev/null +++ b/sm2a/.deploy_env_example @@ -0,0 +1,19 @@ +AIRFLOW_UID=501 +PREFIX=**** +VPC_ID=**** +STATE_BUCKET_NAME=**** +STATE_BUCKET_KEY=**** +STATE_DYNAMO_TABLE=**** 
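+# The **** values are placeholders; fill them in manually or generate a complete
+# .env from the deployment secret with scripts/generate_env_file.py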
+PRIVATE_SUBNETS_TAGNAME=**** +PUBLIC_SUBNETS_TAGNAME=**** +AIRFLOW_FERNET_KEY=**** +AIRFLOW_DB_NAME=**** +AIRFLOW_DB_USERNAME=**** +AIRFLOW_DB_PASSWORD=**** +PERMISSION_BOUNDARIES_ARN=**** +DOMAIN_NAME=openveda.cloud +STAGE=**** +TF_VAR_gh_app_client_id=**** +TF_VAR_gh_app_client_secret=**** +TF_VAR_gh_team_name=**** +TF_VAR_subdomain=**** From 040a3c4453bdb6a7f39c03b568979dc21ab95df7 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Mon, 29 Jul 2024 11:58:42 -0500 Subject: [PATCH 30/97] Remove sm2a DAGs to use the repo DAGs --- sm2a/airflow_worker/Dockerfile | 2 +- sm2a/dags/__init__.py | 0 sm2a/dags/example_dag.py | 94 --- sm2a/dags/generate_dags.py | 47 -- sm2a/dags/rds_example_dag.py | 89 --- sm2a/dags/requirements-constraints.txt | 663 ------------------ sm2a/dags/requirements.txt | 20 - sm2a/dags/veda_data_pipeline/__init__.py | 0 .../veda_data_pipeline/groups/__init__.py | 0 .../groups/collection_group.py | 79 --- .../groups/discover_group.py | 110 --- .../veda_data_pipeline/groups/ecs_tasks.py | 117 ---- .../groups/processing_group.py | 95 --- .../groups/transfer_group.py | 90 --- .../veda_data_pipeline/requirements_dev.txt | 1 - sm2a/dags/veda_data_pipeline/utils/README.md | 26 - .../dags/veda_data_pipeline/utils/__init__.py | 0 .../utils/collection_generation.py | 138 ---- .../veda_data_pipeline/utils/s3_discovery.py | 292 -------- sm2a/dags/veda_data_pipeline/utils/schemas.py | 15 - .../veda_data_pipeline/utils/submit_stac.py | 136 ---- .../dags/veda_data_pipeline/utils/transfer.py | 110 --- .../veda_collection_pipeline.py | 49 -- .../veda_dataset_pipeline.py | 80 --- .../veda_discover_pipeline.py | 92 --- .../veda_process_raster_pipeline.py | 52 -- .../veda_process_vector_pipeline.py | 110 --- .../veda_transfer_pipeline.py | 50 -- 28 files changed, 1 insertion(+), 2556 deletions(-) delete mode 100644 sm2a/dags/__init__.py delete mode 100644 sm2a/dags/example_dag.py delete mode 100644 sm2a/dags/generate_dags.py delete mode 100644 sm2a/dags/rds_example_dag.py delete mode 100644 sm2a/dags/requirements-constraints.txt delete mode 100644 sm2a/dags/requirements.txt delete mode 100644 sm2a/dags/veda_data_pipeline/__init__.py delete mode 100644 sm2a/dags/veda_data_pipeline/groups/__init__.py delete mode 100644 sm2a/dags/veda_data_pipeline/groups/collection_group.py delete mode 100644 sm2a/dags/veda_data_pipeline/groups/discover_group.py delete mode 100644 sm2a/dags/veda_data_pipeline/groups/ecs_tasks.py delete mode 100644 sm2a/dags/veda_data_pipeline/groups/processing_group.py delete mode 100644 sm2a/dags/veda_data_pipeline/groups/transfer_group.py delete mode 100644 sm2a/dags/veda_data_pipeline/requirements_dev.txt delete mode 100644 sm2a/dags/veda_data_pipeline/utils/README.md delete mode 100644 sm2a/dags/veda_data_pipeline/utils/__init__.py delete mode 100644 sm2a/dags/veda_data_pipeline/utils/collection_generation.py delete mode 100644 sm2a/dags/veda_data_pipeline/utils/s3_discovery.py delete mode 100644 sm2a/dags/veda_data_pipeline/utils/schemas.py delete mode 100644 sm2a/dags/veda_data_pipeline/utils/submit_stac.py delete mode 100644 sm2a/dags/veda_data_pipeline/utils/transfer.py delete mode 100644 sm2a/dags/veda_data_pipeline/veda_collection_pipeline.py delete mode 100644 sm2a/dags/veda_data_pipeline/veda_dataset_pipeline.py delete mode 100644 sm2a/dags/veda_data_pipeline/veda_discover_pipeline.py delete mode 100644 sm2a/dags/veda_data_pipeline/veda_process_raster_pipeline.py delete mode 100644 sm2a/dags/veda_data_pipeline/veda_process_vector_pipeline.py delete mode 100644 
sm2a/dags/veda_data_pipeline/veda_transfer_pipeline.py diff --git a/sm2a/airflow_worker/Dockerfile b/sm2a/airflow_worker/Dockerfile index 0b13f92e..154cd1ae 100644 --- a/sm2a/airflow_worker/Dockerfile +++ b/sm2a/airflow_worker/Dockerfile @@ -48,4 +48,4 @@ ENV AIRFLOW_HOME ${AIRFLOW_HOME} ENV TZ UTC ENV PYTHONPATH /opt/airflow -CMD /bin/bash \ No newline at end of file +CMD /bin/bash diff --git a/sm2a/dags/__init__.py b/sm2a/dags/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sm2a/dags/example_dag.py b/sm2a/dags/example_dag.py deleted file mode 100644 index 7bd11599..00000000 --- a/sm2a/dags/example_dag.py +++ /dev/null @@ -1,94 +0,0 @@ -import logging -import time - -import pendulum -from airflow import DAG -from airflow.operators.dummy_operator import DummyOperator as EmptyOperator -from airflow.operators.python import PythonOperator - - -def log_task(text: str): - logging.info(text) - - -def discover_from_cmr_task(text): - log_task(text) - - -def discover_from_s3_task(text): - log_task("I am discovering") - time.sleep(1) - log_task("Done discovering") - log_task(text) - - -def move_files_to_maap_store_task(text): - log_task("I am moving files") - time.sleep(3) - log_task("Done moving files") - log_task(text) - - -def generate_cmr_metadata_task(text): - log_task(text) - - -def push_to_cmr_task(text): - log_task(text) - - -with DAG( - dag_id="example_etl_flow_test", - start_date=pendulum.today("UTC").add(days=-1), - schedule_interval=None, - tags=["example"], -) as dag: - - start = EmptyOperator(task_id="start", dag=dag) - - discover_from_cmr = PythonOperator( - task_id="discover_from_cmr", - python_callable=discover_from_cmr_task, - op_kwargs={"text": "Discover from CMR"}, - dag=dag, - ) - - discover_from_s3 = PythonOperator( - task_id="discover_from_s3", - python_callable=discover_from_s3_task, - op_kwargs={"text": "Discover from S3"}, - dag=dag, - ) - - move_files_to_maap_store = PythonOperator( - task_id="move_files_to_maap_store", - python_callable=move_files_to_maap_store_task, - op_kwargs={"text": "Moving Files to MAAP store"}, - dag=dag, - ) - - generate_cmr_metadata = PythonOperator( - task_id="generate_cmr_metadata", - python_callable=generate_cmr_metadata_task, - op_kwargs={"text": "Generate CMR metadata"}, - dag=dag, - ) - - push_to_cmr = PythonOperator( - task_id="push_to_cmr", - python_callable=push_to_cmr_task, - op_kwargs={"text": "Push to CMR"}, - dag=dag, - ) - - end = EmptyOperator(task_id="end", dag=dag) - - start >> discover_from_cmr - - start >> discover_from_s3 >> move_files_to_maap_store - ( - [discover_from_cmr, move_files_to_maap_store] - >> generate_cmr_metadata - >> push_to_cmr - >> end - ) diff --git a/sm2a/dags/generate_dags.py b/sm2a/dags/generate_dags.py deleted file mode 100644 index 7ab43d62..00000000 --- a/sm2a/dags/generate_dags.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Builds a DAG for each collection (indicated by a .json file) in the /collections/ folder. -These DAGs are used to discover and ingest items for each collection. 
-""" - -from airflow.models.variable import Variable - -from veda_data_pipeline.veda_discover_pipeline import get_discover_dag - - -def generate_dags(): - import boto3 - import json - - from pathlib import Path - mwaa_stac_conf = Variable.get("MWAA_STACK_CONF", deserialize_json=True) - bucket = mwaa_stac_conf["EVENT_BUCKET"] - client = boto3.client("s3") - response = client.list_objects_v2(Bucket=bucket, Prefix="collections/") - - for file_ in response.get("Contents", []): - key = file_["Key"] - if key.endswith("/"): - continue - file_name = Path(key).stem - result = client.get_object(Bucket=bucket, Key=key) - discovery_configs = result["Body"].read().decode() - discovery_configs = json.loads(discovery_configs) - - # Allow the file content to be either one config or a list of configs - if type(discovery_configs) is dict: - discovery_configs = [discovery_configs] - scheduled_discovery_configs = [ - discovery_config - for discovery_config in discovery_configs - if discovery_config.get("schedule") - ] - for idx, discovery_config in enumerate(scheduled_discovery_configs): - id = f"discover-{file_name}" - if idx > 0: - id = f"{id}-{idx}" - get_discover_dag( - id=id, event=discovery_config - ) - - -generate_dags() diff --git a/sm2a/dags/rds_example_dag.py b/sm2a/dags/rds_example_dag.py deleted file mode 100644 index 66420b79..00000000 --- a/sm2a/dags/rds_example_dag.py +++ /dev/null @@ -1,89 +0,0 @@ -from __future__ import annotations -from airflow import DAG -from airflow.providers.postgres.operators.postgres import PostgresOperator -from airflow.hooks.postgres_hook import PostgresHook -from datetime import datetime, date -import json -from airflow.decorators import task - - -def json_serial(obj): - """JSON serializer for objects not serializable by default json code""" - if isinstance(obj, (datetime, date)): - return obj.isoformat() - raise TypeError("Type %s not serializable" % type(obj)) - - -# [START postgres_operator_howto_guide] - - -# create_pet_table, populate_pet_table, get_all_pets, and get_birth_date are examples of tasks created by -# instantiating the Postgres Operator - -DAG_ID = "postgres_operator_dag" - - -with DAG( - dag_id=DAG_ID, - start_date=datetime(2020, 2, 2), - schedule="@once", - catchup=False, - tags=["example"], -) as dag: - # [START postgres_operator_howto_guide_create_pet_table] - create_pet_table = PostgresOperator( - postgres_conn_id="cluster_rds_connection", - task_id="create_pet_table", - sql=""" - CREATE TABLE IF NOT EXISTS pet ( - pet_id SERIAL PRIMARY KEY, - name VARCHAR NOT NULL, - pet_type VARCHAR NOT NULL, - birth_date DATE NOT NULL, - OWNER VARCHAR NOT NULL); - """, - ) - # [END postgres_operator_howto_guide_create_pet_table] - # [START postgres_operator_howto_guide_populate_pet_table] - populate_pet_table = PostgresOperator( - postgres_conn_id="cluster_rds_connection", - task_id="populate_pet_table", - sql=""" - INSERT INTO pet (name, pet_type, birth_date, OWNER) - VALUES ( 'Max', 'Dog', '2018-07-05', 'Jane'); - INSERT INTO pet (name, pet_type, birth_date, OWNER) - VALUES ( 'Susie', 'Cat', '2019-05-01', 'Phil'); - INSERT INTO pet (name, pet_type, birth_date, OWNER) - VALUES ( 'Lester', 'Hamster', '2020-06-23', 'Lily'); - INSERT INTO pet (name, pet_type, birth_date, OWNER) - VALUES ( 'Quincy', 'Parrot', '2013-08-11', 'Anne'); - """, - ) - # [END postgres_operator_howto_guide_populate_pet_table] - # [START postgres_operator_howto_guide_get_all_pets] - - @task - def get_all_pets(): - sql = "SELECT * FROM pet" - pg_hook = 
PostgresHook(postgres_conn_id="cluster_rds_connection") - connection = pg_hook.get_conn() - cursor = connection.cursor() - cursor.execute(sql) - results = cursor.fetchall() - for result in results: - print(result) - return {"results": json.dumps(results, default=json_serial)} - - # [END postgres_operator_howto_guide_get_all_pets] - # [START postgres_operator_howto_guide_get_birth_date] - get_birth_date = PostgresOperator( - postgres_conn_id="cluster_rds_connection", - task_id="get_birth_date", - sql="SELECT * FROM pet WHERE birth_date BETWEEN SYMMETRIC %(begin_date)s AND %(end_date)s", - parameters={"begin_date": "2020-01-01", "end_date": "2020-12-31"}, - runtime_parameters={"statement_timeout": "3000ms"}, - ) - # [END postgres_operator_howto_guide_get_birth_date] - - create_pet_table >> populate_pet_table >> get_all_pets() >> get_birth_date - # [END postgres_operator_howto_guide] diff --git a/sm2a/dags/requirements-constraints.txt b/sm2a/dags/requirements-constraints.txt deleted file mode 100644 index a3bfd18b..00000000 --- a/sm2a/dags/requirements-constraints.txt +++ /dev/null @@ -1,663 +0,0 @@ -# -# This constraints file was automatically generated on 2023-01-18T18:46:04Z -# via "eager-upgrade" mechanism of PIP. For the "v2-5-test" branch of Airflow. -# This variant of constraints install uses the HEAD of the branch version for 'apache-airflow' but installs -# the providers from PIP-released packages at the moment of the constraint generation. -# -# Those constraints are actually those that regular users use to install released version of Airflow. -# We also use those constraints after "apache-airflow" is released and the constraints are tagged with -# "constraints-X.Y.Z" tag to build the production image for that version. -# -# -# This constraints file is meant to be used only in the "apache-airflow" installation command and not -# in all subsequent pip commands. By using a constraints.txt file, we ensure that solely the Airflow -# installation step is reproducible. Subsequent pip commands may install packages that would have -# been incompatible with the constraints used in Airflow reproducible installation step. Finally, pip -# commands that might change the installed version of apache-airflow should include "apache-airflow==X.Y.Z" -# in the list of install targets to prevent Airflow accidental upgrade or downgrade. -# -# Typical installation process of airflow for Python 3.8 is (with random selection of extras and custom -# dependencies added), usually consists of two steps: -# -# 1. Reproducible installation of airflow with selected providers (note constraints are used): -# -# pip install "apache-airflow[celery,cncf.kubernetes,google,amazon,snowflake]==X.Y.Z" \ -# --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-X.Y.Z/constraints-3.8.txt" -# -# 2. Installing own dependencies that are potentially not matching the constraints (note constraints are not -# used, and apache-airflow==X.Y.Z is used to make sure there is no accidental airflow upgrade/downgrade. 
-# -# pip install "apache-airflow==X.Y.Z" "snowflake-connector-python[pandas]==2.9.0" -# -APScheduler==3.6.3 -Authlib==1.2.0 -Babel==2.11.0 -ConfigUpdater==3.1.1 -Deprecated==1.2.13 -Flask-AppBuilder==4.1.4 -Flask-Babel==2.0.0 -Flask-Bcrypt==1.0.1 -Flask-Caching==2.0.2 -Flask-JWT-Extended==4.4.4 -Flask-Login==0.6.2 -Flask-SQLAlchemy==2.5.1 -Flask-Session==0.4.0 -Flask-WTF==1.1.1 -Flask==2.2.2 -GitPython==3.1.30 -HeapDict==1.0.1 -JPype1==1.4.1 -JayDeBeApi==1.2.3 -Jinja2==3.1.2 -Mako==1.2.4 -Markdown==3.4.1 -MarkupSafe==2.1.2 -PyGithub==1.57 -PyHive==0.6.5 -PyJWT==2.6.0 -PyNaCl==1.5.0 -PyYAML==6.0 -Pygments==2.14.0 -SQLAlchemy-JSONField==1.0.1.post0 -SQLAlchemy-Utils==0.39.0 -SQLAlchemy==1.4.46 -SecretStorage==3.3.3 -Sphinx==5.3.0 -Unidecode==1.3.6 -WTForms==3.0.1 -Werkzeug==2.2.2 -adal==1.2.7 -aiofiles==22.1.0 -aiohttp==3.8.3 -aiosignal==1.3.1 -alabaster==0.7.13 -alembic==1.9.2 -aliyun-python-sdk-core==2.13.36 -aliyun-python-sdk-kms==2.16.0 -amqp==5.1.1 -analytics-python==1.4.0 -ansiwrap==0.8.4 -anyio==3.6.2 -apache-airflow-providers-airbyte==3.2.0 -apache-airflow-providers-alibaba==2.2.0 -apache-airflow-providers-amazon==7.1.0 -apache-airflow-providers-apache-beam==4.1.1 -apache-airflow-providers-apache-cassandra==3.1.0 -apache-airflow-providers-apache-drill==2.3.1 -apache-airflow-providers-apache-druid==3.3.1 -apache-airflow-providers-apache-hdfs==3.2.0 -apache-airflow-providers-apache-hive==5.1.1 -apache-airflow-providers-apache-kylin==3.1.0 -apache-airflow-providers-apache-livy==3.2.0 -apache-airflow-providers-apache-pig==4.0.0 -apache-airflow-providers-apache-pinot==4.0.1 -apache-airflow-providers-apache-spark==4.0.0 -apache-airflow-providers-apache-sqoop==3.1.0 -apache-airflow-providers-arangodb==2.1.0 -apache-airflow-providers-asana==2.1.0 -apache-airflow-providers-atlassian-jira==2.0.0 -apache-airflow-providers-celery==3.1.0 -apache-airflow-providers-cloudant==3.1.0 -apache-airflow-providers-cncf-kubernetes==5.1.1 -apache-airflow-providers-common-sql==1.3.3 -apache-airflow-providers-databricks==4.0.0 -apache-airflow-providers-datadog==3.1.0 -apache-airflow-providers-dbt-cloud==2.3.1 -apache-airflow-providers-dingding==3.1.0 -apache-airflow-providers-discord==3.1.0 -apache-airflow-providers-docker==3.4.0 -apache-airflow-providers-elasticsearch==4.3.3 -apache-airflow-providers-exasol==4.1.3 -apache-airflow-providers-facebook==3.1.0 -apache-airflow-providers-ftp==3.3.0 -apache-airflow-providers-github==2.2.0 -apache-airflow-providers-google==8.8.0 -apache-airflow-providers-grpc==3.1.0 -apache-airflow-providers-hashicorp==3.2.0 -apache-airflow-providers-http==4.1.1 -apache-airflow-providers-imap==3.1.1 -apache-airflow-providers-influxdb==2.1.0 -apache-airflow-providers-jdbc==3.3.0 -apache-airflow-providers-jenkins==3.2.0 -apache-airflow-providers-microsoft-azure==5.1.0 -apache-airflow-providers-microsoft-mssql==3.3.2 -apache-airflow-providers-microsoft-psrp==2.2.0 -apache-airflow-providers-microsoft-winrm==3.1.1 -apache-airflow-providers-mongo==3.1.1 -apache-airflow-providers-mysql==4.0.0 -apache-airflow-providers-neo4j==3.2.1 -apache-airflow-providers-odbc==3.2.1 -apache-airflow-providers-openfaas==3.1.0 -apache-airflow-providers-opsgenie==5.0.0 -apache-airflow-providers-oracle==3.6.0 -apache-airflow-providers-pagerduty==3.1.0 -apache-airflow-providers-papermill==3.1.0 -apache-airflow-providers-plexus==3.1.0 -apache-airflow-providers-postgres==5.4.0 -apache-airflow-providers-presto==4.2.1 -apache-airflow-providers-qubole==3.3.1 -apache-airflow-providers-redis==3.1.0 
-apache-airflow-providers-salesforce==5.3.0 -apache-airflow-providers-samba==4.1.0 -apache-airflow-providers-segment==3.1.0 -apache-airflow-providers-sendgrid==3.1.0 -apache-airflow-providers-sftp==4.2.1 -apache-airflow-providers-singularity==3.1.0 -apache-airflow-providers-slack==7.2.0 -apache-airflow-providers-snowflake==4.0.2 -apache-airflow-providers-sqlite==3.3.1 -apache-airflow-providers-ssh==3.4.0 -apache-airflow-providers-tableau==4.0.0 -apache-airflow-providers-tabular==1.1.0 -apache-airflow-providers-telegram==3.1.1 -apache-airflow-providers-trino==4.3.1 -apache-airflow-providers-vertica==3.3.1 -apache-airflow-providers-yandex==3.2.0 -apache-airflow-providers-zendesk==4.2.0 -apache-beam==2.44.0 -apispec==3.3.2 -appdirs==1.4.4 -argcomplete==2.0.0 -arrow==1.2.3 -asana==3.0.0 -asgiref==3.6.0 -asn1crypto==1.5.1 -astroid==2.11.7 -asttokens==2.2.1 -async-timeout==4.0.2 -asynctest==0.13.0 -atlasclient==1.0.0 -atlassian-python-api==3.32.2 -attrs==22.2.0 -aws-sam-translator==1.57.0 -aws-xray-sdk==2.11.0 -azure-batch==13.0.0 -azure-common==1.1.28 -azure-core==1.26.2 -azure-cosmos==4.3.0 -azure-datalake-store==0.0.52 -azure-identity==1.12.0 -azure-keyvault-secrets==4.6.0 -azure-kusto-data==0.0.45 -azure-mgmt-containerinstance==1.5.0 -azure-mgmt-core==1.3.2 -azure-mgmt-datafactory==1.1.0 -azure-mgmt-datalake-nspkg==3.0.1 -azure-mgmt-datalake-store==0.5.0 -azure-mgmt-nspkg==3.0.2 -azure-mgmt-resource==22.0.0 -azure-nspkg==3.0.2 -azure-servicebus==7.8.2 -azure-storage-blob==12.14.1 -azure-storage-common==2.1.0 -azure-storage-file-datalake==12.9.1 -azure-storage-file==2.1.0 -azure-synapse-spark==0.7.0 -backcall==0.2.0 -backoff==1.10.0 -bcrypt==4.0.1 -beautifulsoup4==4.11.1 -billiard==3.6.4.0 -black==23.1a1 -bleach==5.0.1 -blinker==1.5 -boto3==1.26.51 -boto==2.49.0 -botocore==1.29.51 -bowler==0.9.0 -cachelib==0.9.0 -cachetools==4.2.2 -cassandra-driver==3.25.0 -cattrs==22.2.0 -celery==5.2.7 -certifi==2022.12.7 -cffi==1.15.1 -cfgv==3.3.1 -cfn-lint==0.72.9 -cgroupspy==0.2.2 -chardet==4.0.0 -charset-normalizer==2.1.1 -checksumdir==1.2.0 -ciso8601==2.3.0 -click-default-group==1.2.2 -click-didyoumean==0.3.0 -click-plugins==1.1.1 -click-repl==0.2.0 -click==8.1.3 -clickclick==20.10.2 -cloudant==2.15.0 -cloudpickle==2.2.0 -colorama==0.4.6 -colorlog==4.8.0 -commonmark==0.9.1 -connexion==2.14.1 -coverage==7.0.5 -crcmod==1.7 -cron-descriptor==1.2.32 -croniter==1.3.8 -cryptography==38.0.4 -curlify==2.2.1 -dask==2023.1.0 -databricks-sql-connector==2.2.0 -datadog==0.44.0 -db-dtypes==1.0.5 -decorator==5.1.1 -defusedxml==0.7.1 -dill==0.3.1.1 -distlib==0.3.6 -distributed==2023.1.0 -dnspython==2.3.0 -docker==6.0.1 -docopt==0.6.2 -docutils==0.19 -ecdsa==0.18.0 -elasticsearch-dbapi==0.2.9 -elasticsearch-dsl==7.4.0 -elasticsearch==7.13.4 -email-validator==1.3.0 -entrypoints==0.4 -eralchemy2==1.3.6 -eventlet==0.33.3 -exceptiongroup==1.1.0 -execnet==1.9.0 -executing==1.2.0 -facebook-business==15.0.2 -fastavro==1.7.0 -fasteners==0.18 -fastjsonschema==2.16.2 -filelock==3.9.0 -fissix==21.11.13 -flake8-colors==0.1.9 -flake8==6.0.0 -flake8_implicit_str_concat==0.3.0 -flaky==3.7.0 -flower==1.2.0 -freezegun==1.2.2 -frozenlist==1.3.3 -fsspec==2022.11.0 -future==0.18.3 -gcloud-aio-auth==4.1.5 -gcloud-aio-bigquery==6.2.0 -gcloud-aio-storage==8.0.0 -gcsfs==2022.11.0 -geomet==0.2.1.post1 -gevent==22.10.2 -gitdb==4.0.10 -google-ads==18.0.0 -google-api-core==2.8.2 -google-api-python-client==1.12.11 -google-auth-httplib2==0.1.0 -google-auth-oauthlib==0.8.0 -google-auth==2.16.0 -google-cloud-aiplatform==1.16.1 
-google-cloud-appengine-logging==1.1.3 -google-cloud-audit-log==0.2.4 -google-cloud-automl==2.8.0 -google-cloud-bigquery-datatransfer==3.7.0 -google-cloud-bigquery-storage==2.14.1 -google-cloud-bigquery==2.34.4 -google-cloud-bigtable==1.7.3 -google-cloud-build==3.9.0 -google-cloud-compute==0.7.0 -google-cloud-container==2.11.1 -google-cloud-core==2.3.2 -google-cloud-datacatalog==3.9.0 -google-cloud-dataform==0.2.0 -google-cloud-dataplex==1.1.0 -google-cloud-dataproc-metastore==1.6.0 -google-cloud-dataproc==5.0.0 -google-cloud-dlp==1.0.2 -google-cloud-kms==2.12.0 -google-cloud-language==1.3.2 -google-cloud-logging==3.2.1 -google-cloud-memcache==1.4.1 -google-cloud-monitoring==2.11.0 -google-cloud-orchestration-airflow==1.4.1 -google-cloud-os-login==2.7.1 -google-cloud-pubsub==2.13.5 -google-cloud-redis==2.9.0 -google-cloud-resource-manager==1.6.0 -google-cloud-secret-manager==1.0.2 -google-cloud-spanner==1.19.3 -google-cloud-speech==1.3.4 -google-cloud-storage==2.7.0 -google-cloud-tasks==2.10.1 -google-cloud-texttospeech==1.0.3 -google-cloud-translate==1.7.2 -google-cloud-videointelligence==1.16.3 -google-cloud-vision==1.0.2 -google-cloud-workflows==1.7.1 -google-crc32c==1.5.0 -google-resumable-media==2.4.0 -googleapis-common-protos==1.56.4 -graphql-core==3.2.3 -graphviz==0.20.1 -greenlet==2.0.1 -grpc-google-iam-v1==0.12.4 -grpcio-gcp==0.2.2 -grpcio-status==1.48.2 -grpcio==1.51.1 -gssapi==1.8.2 -gunicorn==20.1.0 -h11==0.14.0 -hdfs==2.7.0 -hmsclient==0.1.1 -httpcore==0.16.3 -httplib2==0.20.4 -httpx==0.23.3 -humanize==4.4.0 -hvac==1.0.2 -identify==2.5.13 -idna==3.4 -ijson==3.2.0.post0 -imagesize==1.4.1 -importlib-metadata==6.0.0 -incremental==22.10.0 -inflection==0.5.1 -influxdb-client==1.35.0 -iniconfig==2.0.0 -ipdb==0.13.11 -ipython==8.8.0 -isodate==0.6.1 -isort==5.11.2 -itsdangerous==2.1.2 -jaraco.classes==3.2.3 -jedi==0.18.2 -jeepney==0.8.0 -jira==3.4.1 -jmespath==0.10.0 -jschema-to-python==1.2.3 -json-merge-patch==0.2 -jsondiff==2.0.0 -jsonpatch==1.32 -jsonpath-ng==1.5.3 -jsonpickle==3.0.1 -jsonpointer==2.3 -jsonschema-spec==0.1.2 -jsonschema==4.17.3 -junit-xml==1.9 -jupyter-client==7.3.4 -jupyter_core==5.1.3 -keyring==23.13.1 -kombu==5.2.4 -krb5==0.4.1 -kubernetes==23.6.0 -kylinpy==2.8.4 -lazy-object-proxy==1.9.0 -ldap3==2.9.1 -linkify-it-py==2.0.0 -locket==1.0.0 -lockfile==0.12.2 -looker-sdk==22.20.0 -lxml==4.9.2 -lz4==4.3.2 -markdown-it-py==2.1.0 -marshmallow-enum==1.5.1 -marshmallow-oneofschema==3.0.1 -marshmallow-sqlalchemy==0.26.1 -marshmallow==3.19.0 -matplotlib-inline==0.1.6 -mccabe==0.7.0 -mdit-py-plugins==0.3.3 -mdurl==0.1.2 -mongomock==4.1.2 -monotonic==1.6 -more-itertools==8.14.0 -moreorless==0.4.0 -moto==4.1.0 -msal-extensions==1.0.0 -msal==1.20.0 -msgpack==1.0.4 -msrest==0.7.1 -msrestazure==0.6.4 -multi-key-dict==2.0.3 -multidict==6.0.4 -mypy-boto3-appflow==1.26.32 -mypy-boto3-rds==1.26.47 -mypy-boto3-redshift-data==1.26.30 -mypy-extensions==0.4.3 -mypy==0.971 -mysql-connector-python==8.0.32 -mysqlclient==2.1.1 -nbclient==0.7.2 -nbformat==5.7.3 -neo4j==5.4.0 -nest-asyncio==1.5.6 -networkx==2.8.8 -nodeenv==1.7.0 -ntlm-auth==1.5.0 -numpy==1.22.4 -oauthlib==3.2.2 -objsize==0.6.1 -openapi-schema-validator==0.4.0 -openapi-spec-validator==0.5.2 -opsgenie-sdk==2.1.5 -oracledb==1.2.1 -orjson==3.8.5 -oscrypto==1.3.0 -oss2==2.16.0 -packaging==21.3 -pandas-gbq==0.17.9 -pandas==1.5.2 -papermill==2.4.0 -parameterized==0.8.1 -paramiko==2.12.0 -parso==0.8.3 -partd==1.3.0 -pathable==0.4.3 -pathspec==0.9.0 -pbr==5.11.1 -pdpyras==4.5.2 -pendulum==2.1.2 -pexpect==4.8.0 -pickleshare==0.7.5 
-pinotdb==0.4.12 -pipdeptree==2.3.3 -pipx==1.1.0 -pkginfo==1.9.6 -platformdirs==2.6.2 -pluggy==1.0.0 -ply==3.11 -plyvel==1.5.0 -portalocker==2.6.0 -pre-commit==2.21.0 -presto-python-client==0.8.3 -prison==0.2.1 -prometheus-client==0.15.0 -prompt-toolkit==3.0.36 -proto-plus==1.19.6 -protobuf==3.20.0 -psutil==5.9.4 -psycopg2-binary==2.9.5 -psycopg2==2.9.5 -ptyprocess==0.7.0 -pure-eval==0.2.2 -pure-sasl==0.6.2 -py4j==0.10.9.5 -py==1.11.0 -pyOpenSSL==22.1.0 -pyarrow==9.0.0 -pyasn1-modules==0.2.8 -pyasn1==0.4.8 -pycodestyle==2.10.0 -pycountry==22.3.5 -pycparser==2.21 -pycryptodome==3.16.0 -pycryptodomex==3.16.0 -pydantic==1.10.4 -pydata-google-auth==1.5.0 -pydot==1.4.2 -pydruid==0.6.5 -pyenchant==3.2.2 -pyexasol==0.25.1 -pyflakes==3.0.1 -pygraphviz==1.10 -pyhcl==0.4.4 -pykerberos==1.2.4 -pymongo==3.13.0 -pymssql==2.2.8 -pyodbc==4.0.35 -pyparsing==3.0.9 -pypsrp==0.8.1 -pyrsistent==0.19.3 -pyspark==3.3.1 -pyspnego==0.7.0 -pytest-asyncio==0.20.3 -pytest-capture-warnings==0.0.4 -pytest-cov==4.0.0 -pytest-httpx==0.21.2 -pytest-instafail==0.4.2 -pytest-rerunfailures==9.1.1 -pytest-timeouts==1.2.1 -pytest-xdist==3.1.0 -pytest==6.2.5 -python-arango==7.5.5 -python-daemon==2.3.2 -python-dateutil==2.8.2 -python-dotenv==0.21.0 -python-http-client==3.3.7 -python-jenkins==1.7.0 -python-jose==3.3.0 -python-ldap==3.4.3 -python-nvd3==0.15.0 -python-slugify==7.0.0 -python-telegram-bot==13.15 -pytz-deprecation-shim==0.1.0.post0 -pytz==2022.7.1 -pytzdata==2020.1 -pywinrm==0.4.3 -pyzmq==25.0.0 -qds-sdk==1.16.1 -reactivex==4.0.4 -readme-renderer==37.3 -redis==3.5.3 -redshift-connector==2.0.909 -regex==2022.10.31 -requests-file==1.5.1 -requests-kerberos==0.14.0 -requests-mock==1.10.0 -requests-ntlm==1.1.0 -requests-oauthlib==1.3.1 -requests-toolbelt==0.10.1 -requests==2.28.2 -responses==0.22.0 -rfc3986==1.5.0 -rich-click==1.6.0 -rich==13.1.0 -rsa==4.9 -s3transfer==0.6.0 -sarif-om==1.0.4 -sasl==0.3.1 -scramp==1.4.4 -scrapbook==0.5.0 -semver==2.13.0 -sendgrid==6.9.7 -sentinels==1.0.0 -sentry-sdk==1.13.0 -setproctitle==1.3.2 -simple-salesforce==1.12.3 -six==1.16.0 -slack-sdk==3.19.5 -smbprotocol==1.10.1 -smmap==5.0.0 -snakebite-py3==3.0.5 -sniffio==1.3.0 -snowballstemmer==2.2.0 -snowflake-connector-python==2.9.0 -snowflake-sqlalchemy==1.4.4 -sortedcontainers==2.4.0 -soupsieve==2.3.2.post1 -sphinx-airflow-theme==0.0.11 -sphinx-argparse==0.4.0 -sphinx-autoapi==2.0.1 -sphinx-copybutton==0.5.1 -sphinx-jinja==2.0.2 -sphinx-rtd-theme==1.1.1 -sphinxcontrib-devhelp==1.0.2 -sphinxcontrib-htmlhelp==2.0.0 -sphinxcontrib-httpdomain==1.8.1 -sphinxcontrib-jsmath==1.0.1 -sphinxcontrib-qthelp==1.0.3 -sphinxcontrib-redoc==1.6.0 -sphinxcontrib-serializinghtml==1.1.5 -sphinxcontrib-spelling==7.7.0 -sphinxcontrib.applehelp==1.0.3 -spython==0.3.0 -sqlalchemy-bigquery==1.5.0 -sqlalchemy-drill==1.1.2 -sqlalchemy-redshift==0.8.12 -sqlparse==0.4.3 -sshpubkeys==3.3.1 -sshtunnel==0.4.0 -stack-data==0.6.2 -starkbank-ecdsa==2.2.0 -statsd==4.0.1 -tableauserverclient==0.23.4 -tabulate==0.9.0 -tblib==1.7.0 -tenacity==8.1.0 -termcolor==2.2.0 -text-unidecode==1.3 -textwrap3==0.9.2 -thrift-sasl==0.4.3 -thrift==0.16.0 -toml==0.10.2 -tomli==2.0.1 -toolz==0.12.0 -tornado==6.1 -towncrier==22.12.0 -tqdm==4.64.1 -traitlets==5.8.1 -trino==0.321.0 -twine==4.0.2 -types-Deprecated==1.2.9 -types-Markdown==3.4.2.2 -types-PyMySQL==1.0.19.2 -types-PyYAML==6.0.12.3 -types-boto==2.49.18.5 -types-certifi==2021.10.8.3 -types-croniter==1.3.2.2 -types-docutils==0.19.1.2 -types-freezegun==1.1.10 -types-paramiko==2.12.0.3 -types-protobuf==4.21.0.3 -types-pyOpenSSL==23.0.0.1 
-types-python-dateutil==2.8.19.6 -types-python-slugify==7.0.0.1 -types-pytz==2022.7.1.0 -types-redis==4.4.0.2 -types-requests==2.28.11.8 -types-setuptools==65.7.0.2 -types-tabulate==0.9.0.0 -types-termcolor==1.1.6 -types-toml==0.10.8.1 -types-urllib3==1.26.25.4 -typing_extensions==4.4.0 -tzdata==2022.7 -tzlocal==4.2 -uamqp==1.6.3 -uc-micro-py==1.0.1 -unicodecsv==0.14.1 -uritemplate==3.0.1 -urllib3==1.26.14 -userpath==1.8.0 -vertica-python==1.2.0 -vine==5.0.0 -virtualenv==20.17.1 -volatile==2.1.0 -watchtower==2.0.1 -wcwidth==0.2.6 -webencodings==0.5.1 -websocket-client==1.4.2 -wrapt==1.14.1 -xmltodict==0.13.0 -yamllint==1.29.0 -yandexcloud==0.194.0 -yarl==1.8.2 -zeep==4.2.1 -zenpy==2.0.25 -zict==2.2.0 -zipp==3.11.0 -zope.event==4.6 -zope.interface==5.5.2 -zstandard==0.19.0 diff --git a/sm2a/dags/requirements.txt b/sm2a/dags/requirements.txt deleted file mode 100644 index 8c9ec097..00000000 --- a/sm2a/dags/requirements.txt +++ /dev/null @@ -1,20 +0,0 @@ -#--constraint /usr/local/airflow/dags/requirements-constraints.txt -affine==2.4.0 -netCDF4==1.6.2 -pydantic==1.10.4 -requests==2.28.1 -rio-cogeo==3.5.0 -smart-open==6.3.0 -airflow_multi_dagrun==2.3.1 -apache-airflow-providers-docker==3.2.0 -apache-airflow-providers-postgres==5.2.2 -apache-airflow-providers-common-sql==1.2.0 -typing-extensions==4.4.0 -psycopg2-binary==2.9.5 -pypgstac==0.7.4 -pyOpenSSL==22.0.0 -stac-pydantic -fsspec -s3fs -xarray -xstac diff --git a/sm2a/dags/veda_data_pipeline/__init__.py b/sm2a/dags/veda_data_pipeline/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sm2a/dags/veda_data_pipeline/groups/__init__.py b/sm2a/dags/veda_data_pipeline/groups/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sm2a/dags/veda_data_pipeline/groups/collection_group.py b/sm2a/dags/veda_data_pipeline/groups/collection_group.py deleted file mode 100644 index de4f2dd1..00000000 --- a/sm2a/dags/veda_data_pipeline/groups/collection_group.py +++ /dev/null @@ -1,79 +0,0 @@ -import requests -from airflow.models.variable import Variable -from airflow.operators.python import PythonOperator -from airflow.utils.task_group import TaskGroup -from veda_data_pipeline.utils.collection_generation import GenerateCollection -from veda_data_pipeline.utils.submit_stac import submission_handler - -generator = GenerateCollection() - - -def check_collection_exists(endpoint: str, collection_id: str): - """ - Check if a collection exists in the STAC catalog - - Args: - endpoint (str): STAC catalog endpoint - collection_id (str): collection id - """ - response = requests.get(f"{endpoint}/collections/{collection_id}") - return ( - "Collection.existing_collection" - if (response.status_code == 200) - else "Collection.generate_collection" - ) - - -def ingest_collection_task(ti): - """ - Ingest a collection into the STAC catalog - - Args: - dataset (Dict[str, Any]): dataset dictionary (JSON) - role_arn (str): role arn for Zarr collection generation - """ - collection = ti.xcom_pull(task_ids='Collection.generate_collection') - - return submission_handler( - event=collection, - endpoint="/collections", - cognito_app_secret=Variable.get("COGNITO_APP_SECRET"), - stac_ingestor_api_url=Variable.get("STAC_INGESTOR_API_URL"), - ) - - -# NOTE unused, but useful for item ingests, since collections are a dependency for items -def check_collection_exists_task(ti): - config = ti.dag_run.conf - return check_collection_exists( - endpoint=Variable.get("STAC_URL", default_var=None), - collection_id=config.get("collection"), - ) - - -def 
generate_collection_task(ti): - config = ti.dag_run.conf - role_arn = Variable.get("ASSUME_ROLE_READ_ARN", default_var=None) - - # TODO it would be ideal if this also works with complete collections where provided - this would make the collection ingest more re-usable - collection = generator.generate_stac( - dataset_config=config, role_arn=role_arn - ) - return collection - - - -group_kwgs = {"group_id": "Collection", "tooltip": "Collection"} - - -def collection_task_group(): - with TaskGroup(**group_kwgs) as collection_task_grp: - generate_collection = PythonOperator( - task_id="generate_collection", python_callable=generate_collection_task - ) - ingest_collection = PythonOperator( - task_id="ingest_collection", python_callable=ingest_collection_task - ) - generate_collection >> ingest_collection - - return collection_task_grp diff --git a/sm2a/dags/veda_data_pipeline/groups/discover_group.py b/sm2a/dags/veda_data_pipeline/groups/discover_group.py deleted file mode 100644 index 38b754fb..00000000 --- a/sm2a/dags/veda_data_pipeline/groups/discover_group.py +++ /dev/null @@ -1,110 +0,0 @@ -import time -import uuid - -from airflow.models.variable import Variable -from airflow.models.xcom import LazyXComAccess -from airflow.operators.dummy_operator import DummyOperator as EmptyOperator -from airflow.decorators import task_group -from airflow.operators.python import BranchPythonOperator, PythonOperator, ShortCircuitOperator -from airflow.utils.trigger_rule import TriggerRule -from airflow_multi_dagrun.operators import TriggerMultiDagRunOperator -from veda_data_pipeline.utils.s3_discovery import ( - s3_discovery_handler, EmptyFileListError -) - -group_kwgs = {"group_id": "Discover", "tooltip": "Discover"} - - -def discover_from_s3_task(ti, event={}, **kwargs): - """Discover grouped assets/files from S3 in batches of 2800. Produce a list of such files stored on S3 to process. - This task is used as part of the discover_group subdag and outputs data to EVENT_BUCKET. - """ - config = { - **event, - **ti.dag_run.conf, - } - last_successful_execution = kwargs.get("prev_start_date_success") - if event.get("schedule") and last_successful_execution: - config["last_successful_execution"] = last_successful_execution.isoformat() - # (event, chunk_size=2800, role_arn=None, bucket_output=None): - MWAA_STAC_CONF = Variable.get("MWAA_STACK_CONF", deserialize_json=True) - read_assume_arn = Variable.get("ASSUME_ROLE_READ_ARN", default_var=None) - # Making the chunk size small, this helped us process large data faster than - # passing a large chunk of 500 - chunk_size = config.get("chunk_size", 500) - try: - return s3_discovery_handler( - event=config, - role_arn=read_assume_arn, - bucket_output=MWAA_STAC_CONF["EVENT_BUCKET"], - chunk_size=chunk_size - ) - except EmptyFileListError as ex: - print(f"Received an exception {ex}") - return [] - - -def get_files_to_process(ti): - """Get files from S3 produced by the discovery task. - Used as part of both the parallel_run_process_rasters and parallel_run_process_vectors tasks. 
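A simplified, standalone sketch of the fan-out this helper performs (the `make_run_confs` name and the `discovery_result` input are illustrative only, not part of the DAG code):

```python
import uuid

def make_run_confs(discovery_result: dict, dag_run_id: str):
    """Yield one trigger conf per discovered chunk, mirroring the fan-out above."""
    base = dict(discovery_result)          # copy so the original payload is untouched
    chunk_keys = base.pop("payload", [])   # S3 keys written by the discovery step
    for indx, chunk_key in enumerate(chunk_keys):
        yield {
            "run_id": f"{dag_run_id}_{uuid.uuid4()}_{indx}",
            **base,
            "payload": chunk_key,
        }

# Example: three chunk files fan out into three downstream DAG run configurations
confs = list(make_run_confs({"collection": "demo", "payload": ["k1", "k2", "k3"]}, "manual__2024"))
assert len(confs) == 3
```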
- """ - dynamic_group_id = ti.task_id.split(".")[0] - payload = ti.xcom_pull(task_ids=f"{dynamic_group_id}.discover_from_s3") - if isinstance(payload, LazyXComAccess): - payloads_xcom = payload[0].pop("payload", []) - payload = payload[0] - else: - payloads_xcom = payload.pop("payload", []) - dag_run_id = ti.dag_run.run_id - for indx, payload_xcom in enumerate(payloads_xcom): - time.sleep(2) - yield { - "run_id": f"{dag_run_id}_{uuid.uuid4()}_{indx}", - **payload, - "payload": payload_xcom, - } - - -def vector_raster_choice(ti): - """Choose whether to process rasters or vectors based on the payload.""" - payload = ti.dag_run.conf - dynamic_group_id = ti.task_id.split(".")[0] - - if payload.get("vector"): - return f"{dynamic_group_id}.parallel_run_process_vectors" - return f"{dynamic_group_id}.parallel_run_process_rasters" - -@task_group -def subdag_discover(event={}): - discover_from_s3 = ShortCircuitOperator( - task_id="discover_from_s3", - python_callable=discover_from_s3_task, - op_kwargs={"text": "Discover from S3", "event": event}, - trigger_rule=TriggerRule.NONE_FAILED, - provide_context=True, - ) - - raster_vector_branching = BranchPythonOperator( - task_id="raster_vector_branching", - python_callable=vector_raster_choice, - ) - - run_process_raster = TriggerMultiDagRunOperator( - task_id="parallel_run_process_rasters", - trigger_dag_id="veda_ingest_raster", - python_callable=get_files_to_process, - ) - - run_process_vector = TriggerMultiDagRunOperator( - task_id="parallel_run_process_vectors", - trigger_dag_id="veda_ingest_vector", - python_callable=get_files_to_process, - ) - - # extra no-op, needed to run in dynamic mapping context - end_discover = EmptyOperator(task_id="end_discover", trigger_rule=TriggerRule.NONE_FAILED_MIN_ONE_SUCCESS,) - - discover_from_s3 >> raster_vector_branching >> [run_process_raster, run_process_vector] - run_process_raster >> end_discover - run_process_vector >> end_discover - diff --git a/sm2a/dags/veda_data_pipeline/groups/ecs_tasks.py b/sm2a/dags/veda_data_pipeline/groups/ecs_tasks.py deleted file mode 100644 index 2c8852e2..00000000 --- a/sm2a/dags/veda_data_pipeline/groups/ecs_tasks.py +++ /dev/null @@ -1,117 +0,0 @@ -import json - -from airflow.hooks.base import BaseHook -from airflow.providers.amazon.aws.operators.ecs import ( - EcsDeregisterTaskDefinitionOperator, - EcsRegisterTaskDefinitionOperator, - EcsRunTaskOperator, -) -from airflow.utils.task_group import TaskGroup -from airflow.utils.trigger_rule import TriggerRule - - -def get_aws_keys_from_connection(connection_id="aws_default"): - conn = BaseHook.get_connection(connection_id) - return { - "AWS_ACCESS_KEY_ID": conn.login, - "AWS_SECRET_ACCESS_KEY": conn.password, - "AWS_DEFAULT_REGION": json.loads(conn.extra).get("region_name", "us-west-2"), - } - - -group_kwgs = {"group_id": "ECSTasks", "tooltip": "ECSTasks"} - - -def subdag_ecs_task( - task_id, - task_definition_family, - container_name, - docker_image, - cmd: str, - mwaa_stack_conf, - aws_region="us-west-2", - cpu="256", - memory="512", - stage="dev", - environment_vars=None, -): - if environment_vars is None: - environment_vars = list() - with TaskGroup(**group_kwgs) as ecs_task_grp: - if stage == "local": - from airflow.providers.docker.operators.docker import DockerOperator - - return DockerOperator( - task_id=task_id, - container_name=container_name, - image=docker_image, - api_version="auto", - auto_remove=True, - command=cmd, - environment=get_aws_keys_from_connection(), - docker_url="tcp://docker-in-docker:2375", - 
mount_tmp_dir=False, - network_mode="bridge", - ) - - register_task = EcsRegisterTaskDefinitionOperator( - task_id=f"{task_id}_task_register", - family=task_definition_family, - trigger_rule=TriggerRule.ONE_SUCCESS, - container_definitions=[ - { - "name": container_name, - "image": docker_image, - "entryPoint": ["sh", "-c"], - "command": ["ls"], - "logConfiguration": { - "logDriver": "awslogs", - "options": { - "awslogs-group": mwaa_stack_conf.get("LOG_GROUP_NAME"), - "awslogs-region": aws_region, - "awslogs-stream-prefix": "ecs", - }, - }, - } - ], - register_task_kwargs={ - "cpu": cpu, - "memory": memory, - "networkMode": "awsvpc", - "taskRoleArn": mwaa_stack_conf.get("MWAA_EXECUTION_ROLE_ARN"), - "executionRoleArn": mwaa_stack_conf.get("MWAA_EXECUTION_ROLE_ARN"), - "requiresCompatibilities": ["FARGATE"], - }, - ) - ecs_task_run = EcsRunTaskOperator( - task_id=task_id, - cluster=mwaa_stack_conf.get("ECS_CLUSTER_NAME"), - task_definition=register_task.output, - launch_type="FARGATE", - do_xcom_push=True, - overrides={ - "containerOverrides": [ - { - "name": container_name, - "command": [cmd], - "environment": environment_vars, - }, - ], - }, - network_configuration={ - "awsvpcConfiguration": { - "securityGroups": mwaa_stack_conf.get("SECURITYGROUPS"), - "subnets": mwaa_stack_conf.get("SUBNETS"), - }, - }, - awslogs_region="us-west-2", - awslogs_group=mwaa_stack_conf.get("LOG_GROUP_NAME"), - awslogs_stream_prefix=f"ecs/{container_name}", - ) - deregister_task = EcsDeregisterTaskDefinitionOperator( - task_id=f"{task_id}_deregister_task", - task_definition=register_task.output, - ) - - register_task >> ecs_task_run >> deregister_task - return ecs_task_grp diff --git a/sm2a/dags/veda_data_pipeline/groups/processing_group.py b/sm2a/dags/veda_data_pipeline/groups/processing_group.py deleted file mode 100644 index 9a8382f9..00000000 --- a/sm2a/dags/veda_data_pipeline/groups/processing_group.py +++ /dev/null @@ -1,95 +0,0 @@ -import json -import logging -from datetime import timedelta - -import smart_open -from airflow.models.variable import Variable -from airflow.operators.python import PythonOperator -from airflow.providers.amazon.aws.operators.ecs import EcsRunTaskOperator -from airflow.utils.task_group import TaskGroup -from veda_data_pipeline.utils.submit_stac import ( - submission_handler, -) - -group_kwgs = {"group_id": "Process", "tooltip": "Process"} - - -def log_task(text: str): - logging.info(text) - - -def submit_to_stac_ingestor_task(ti): - """Submit STAC items to the STAC ingestor API.""" - print("Submit STAC ingestor") - event = json.loads(ti.xcom_pull(task_ids=f"{group_kwgs['group_id']}.build_stac")) - success_file = event["payload"]["success_event_key"] - with smart_open.open(success_file, "r") as _file: - stac_items = json.loads(_file.read()) - - for item in stac_items: - submission_handler( - event=item, - endpoint="/ingestions", - cognito_app_secret=Variable.get("COGNITO_APP_SECRET"), - stac_ingestor_api_url=Variable.get("STAC_INGESTOR_API_URL"), - ) - return event - - -def subdag_process(): - with TaskGroup(**group_kwgs) as process_grp: - mwaa_stack_conf = Variable.get("MWAA_STACK_CONF", deserialize_json=True) - build_stac = EcsRunTaskOperator( - task_id="build_stac", - trigger_rule="none_failed", - cluster=f"{mwaa_stack_conf.get('PREFIX')}-cluster", - task_definition=f"{mwaa_stack_conf.get('PREFIX')}-tasks", - launch_type="FARGATE", - do_xcom_push=True, - execution_timeout=timedelta(minutes=60), - overrides={ - "containerOverrides": [ - { - "name": 
f"{mwaa_stack_conf.get('PREFIX')}-veda-stac-build", - "command": [ - "/usr/local/bin/python", - "handler.py", - "--payload", - "{}".format("{{ task_instance.dag_run.conf }}"), - ], - "environment": [ - { - "name": "EXTERNAL_ROLE_ARN", - "value": Variable.get( - "ASSUME_ROLE_READ_ARN", default_var="" - ), - }, - { - "name": "BUCKET", - "value": "veda-data-pipelines-staging-lambda-ndjson-bucket", - }, - { - "name": "EVENT_BUCKET", - "value": mwaa_stack_conf.get("EVENT_BUCKET"), - }, - ], - "memory": 2048, - "cpu": 1024, - }, - ], - }, - network_configuration={ - "awsvpcConfiguration": { - "securityGroups": mwaa_stack_conf.get("SECURITYGROUPS"), - "subnets": mwaa_stack_conf.get("SUBNETS"), - }, - }, - awslogs_group=mwaa_stack_conf.get("LOG_GROUP_NAME"), - awslogs_stream_prefix=f"ecs/{mwaa_stack_conf.get('PREFIX')}-veda-stac-build", # prefix with container name - ) - submit_to_stac_ingestor = PythonOperator( - task_id="submit_to_stac_ingestor", - python_callable=submit_to_stac_ingestor_task, - ) - build_stac >> submit_to_stac_ingestor - return process_grp diff --git a/sm2a/dags/veda_data_pipeline/groups/transfer_group.py b/sm2a/dags/veda_data_pipeline/groups/transfer_group.py deleted file mode 100644 index a4235496..00000000 --- a/sm2a/dags/veda_data_pipeline/groups/transfer_group.py +++ /dev/null @@ -1,90 +0,0 @@ -from datetime import timedelta - -from airflow.models.variable import Variable -from airflow.operators.python import BranchPythonOperator, PythonOperator -from airflow.providers.amazon.aws.operators.ecs import EcsRunTaskOperator -from airflow.utils.task_group import TaskGroup -from airflow.utils.trigger_rule import TriggerRule -from veda_data_pipeline.utils.transfer import ( - data_transfer_handler, -) - -group_kwgs = {"group_id": "Transfer", "tooltip": "Transfer"} - - -def cogify_choice(ti): - """Choos whether to cogify or not; if yes, use a docker container""" - payload = ti.dag_run.conf - - if payload.get("cogify"): - return f"{group_kwgs['group_id']}.cogify_and_copy_data" - else: - return f"{group_kwgs['group_id']}.copy_data" - - -def transfer_data(ti): - """Transfer data from one S3 bucket to another; s3 copy, no need for docker""" - config = ti.dag_run.conf - role_arn = Variable.get("ASSUME_ROLE_READ_ARN", default_var="") - # (event, chunk_size=2800, role_arn=None, bucket_output=None): - return data_transfer_handler(event=config, role_arn=role_arn) - -# TODO: cogify_transfer handler is missing arg parser so this subdag will not work -def subdag_transfer(): - with TaskGroup(**group_kwgs) as discover_grp: - cogify_branching = BranchPythonOperator( - task_id="cogify_branching", - trigger_rule=TriggerRule.ONE_SUCCESS, - python_callable=cogify_choice, - ) - - run_copy = PythonOperator( - task_id="copy_data", - python_callable=transfer_data, - op_kwargs={"text": "Copy files on S3"}, - ) - - mwaa_stack_conf = Variable.get("MWAA_STACK_CONF", deserialize_json=True) - run_cogify_copy = EcsRunTaskOperator( - task_id="cogify_and_copy_data", - trigger_rule="none_failed", - cluster=f"{mwaa_stack_conf.get('PREFIX')}-cluster", - task_definition=f"{mwaa_stack_conf.get('PREFIX')}-transfer-tasks", - launch_type="FARGATE", - do_xcom_push=True, - execution_timeout=timedelta(minutes=120), - overrides={ - "containerOverrides": [ - { - "name": f"{mwaa_stack_conf.get('PREFIX')}-veda-cogify-transfer", - "command": [ - "/usr/local/bin/python", - "handler.py", - "--payload", - "{}".format("{{ task_instance.dag_run.conf }}"), - ], - "environment": [ - { - "name": "EXTERNAL_ROLE_ARN", - "value": 
Variable.get( - "ASSUME_ROLE_READ_ARN", default_var="" - ), - }, - ], - "memory": 2048, - "cpu": 1024, - }, - ], - }, - network_configuration={ - "awsvpcConfiguration": { - "securityGroups": mwaa_stack_conf.get("SECURITYGROUPS"), - "subnets": mwaa_stack_conf.get("SUBNETS"), - }, - }, - awslogs_group=mwaa_stack_conf.get("LOG_GROUP_NAME"), - awslogs_stream_prefix=f"ecs/{mwaa_stack_conf.get('PREFIX')}-veda-cogify-transfer", # prefix with container name - ) - - (cogify_branching >> [run_copy, run_cogify_copy]) - return discover_grp diff --git a/sm2a/dags/veda_data_pipeline/requirements_dev.txt b/sm2a/dags/veda_data_pipeline/requirements_dev.txt deleted file mode 100644 index e21ff359..00000000 --- a/sm2a/dags/veda_data_pipeline/requirements_dev.txt +++ /dev/null @@ -1 +0,0 @@ -requests_mock==1.12.1 \ No newline at end of file diff --git a/sm2a/dags/veda_data_pipeline/utils/README.md b/sm2a/dags/veda_data_pipeline/utils/README.md deleted file mode 100644 index 42c1d982..00000000 --- a/sm2a/dags/veda_data_pipeline/utils/README.md +++ /dev/null @@ -1,26 +0,0 @@ -# Data Pipeline Utils - -## submit_stac - -Test with python locally (uses example data in [hlss30_stac_example.ndjson](./hlss30_stac_example.ndjson)) - -```bash -python -m submit_stac -``` - ----------------- - -## s3_discovery - -Module to query an `s3` bucket to discover COGs -```bash -docker build -t s3-discovery. -# Currently runs an example for OMI Ozone -docker run s3-discovery python -m s3_discovery_handler -``` - -To run this locally, you may need to pass your AWS credentials to the module: `docker run -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY s3-discovery python -m s3_discovery_handler` - -AWS Provisioning -This Lambda needs to list the contents of a S3 Bucket in order to discover files. 
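For illustration, a minimal policy of the kind implied here might look as follows (the bucket name is a placeholder, and `s3:GetObject` is included as an assumption for reading the matched files):

```python
# Illustrative only: minimal S3 permissions for a discovery role.
DISCOVERY_BUCKET = "example-discovery-bucket"  # placeholder

s3_discovery_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": ["s3:ListBucket"],
            "Resource": f"arn:aws:s3:::{DISCOVERY_BUCKET}",
        },
        {
            "Effect": "Allow",
            "Action": ["s3:GetObject"],
            "Resource": f"arn:aws:s3:::{DISCOVERY_BUCKET}/*",
        },
    ],
}
```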
-- Add `s3:ListBucket` to the Lambda's execution role diff --git a/sm2a/dags/veda_data_pipeline/utils/__init__.py b/sm2a/dags/veda_data_pipeline/utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/sm2a/dags/veda_data_pipeline/utils/collection_generation.py b/sm2a/dags/veda_data_pipeline/utils/collection_generation.py deleted file mode 100644 index abba2de5..00000000 --- a/sm2a/dags/veda_data_pipeline/utils/collection_generation.py +++ /dev/null @@ -1,138 +0,0 @@ -from typing import Any, Dict - -import fsspec -import xarray as xr -import xstac -from veda_data_pipeline.utils.schemas import SpatioTemporalExtent -from datetime import datetime, timezone - - -class GenerateCollection: - common = { - "links": [], - "extent": { - "spatial": {"bbox": [[-180, -90, 180, 90]]}, - "temporal": {"interval": [[None, None]]}, - }, - "type": "Collection", - "stac_version": "1.0.0", - } - keys_to_ignore = [ - "collection", - "data_type", - "sample_files", - "discovery_items", - "spatial_extent", - "temporal_extent", - "is_periodic", - "time_density", - "type", - ] - - def get_template(self, dataset: Dict[str, Any]) -> dict: - extra_fields = { - key: dataset[key] - for key in dataset.keys() - if key not in GenerateCollection.keys_to_ignore - } - - collection_dict = { - "id": dataset["collection"], - **GenerateCollection.common, - **extra_fields, - } - - # Default REQUIRED fields - if not collection_dict.get("description"): - collection_dict["description"] = dataset["collection"] - if not collection_dict.get("license"): - collection_dict["license"] = "proprietary" - - return collection_dict - - def _create_zarr_template(self, dataset: Dict[str, Any], store_path: str) -> dict: - template = self.get_template(dataset) - template["assets"] = { - "zarr": { - "href": store_path, - "title": "Zarr Array Store", - "description": "Zarr array store with one or several arrays (variables)", - "roles": ["data", "zarr"], - "type": "application/vnd+zarr", - "xarray:open_kwargs": { - "engine": "zarr", - "chunks": {}, - **dataset.xarray_kwargs, - }, - } - } - return template - - def create_zarr_collection(self, dataset: Dict[str, Any], role_arn: str) -> dict: - """ - Creates a zarr stac collection based off of the user input - """ - discovery = dataset.discovery_items[0] - store_path = f"s3://{discovery.bucket}/{discovery.prefix}{discovery.zarr_store}" - template = self._create_zarr_template(dataset, store_path) - - fs = fsspec.filesystem("s3", anon=False, role_arn=role_arn) - store = fs.get_mapper(store_path) - ds = xr.open_zarr( - store, consolidated=bool(dataset.xarray_kwargs.get("consolidated")) - ) - - collection = xstac.xarray_to_stac( - ds, - template, - temporal_dimension=dataset.temporal_dimension or "time", - x_dimension=dataset.x_dimension or "lon", - y_dimension=dataset.y_dimension or "lat", - reference_system=dataset.reference_system or 4326, - ) - return collection.to_dict() - - def create_cog_collection(self, dataset: Dict[str, Any]) -> dict: - collection_stac = self.get_template(dataset) - - # Override the extents if they exists - if spatial_extent := dataset.get("spatial_extent"): - collection_stac["extent"]["spatial"] = {"bbox": [list(spatial_extent.values())]}, - - if temporal_extent := dataset.get("temporal_extent"): - collection_stac["extent"]["temporal"] = { - "interval": [ - # most of our data uses the Z suffix for UTC - isoformat() doesn't - [ - datetime.fromisoformat(x).astimezone(timezone.utc).isoformat().replace("+00:00", "Z") - if x else None - for x in 
list(temporal_extent.values()) - ] - ] - } - - collection_stac["item_assets"] = { - "cog_default": { - "type": "image/tiff; application=geotiff; profile=cloud-optimized", - "roles": ["data", "layer"], - "title": "Default COG Layer", - "description": "Cloud optimized default layer to display on map", - } - } - return collection_stac - - def generate_stac( - self, dataset_config: Dict[str, Any], role_arn: str = None - ) -> dict: - """ - Generates a STAC collection based on the dataset and data type - - Args: - dataset_config (Dict[str, Any]): dataset configuration - role_arn (str): role arn for Zarr collection generation - """ - data_type = dataset_config.get("data_type", "cog") - if data_type == "zarr": - return self.create_zarr_collection(dataset_config, role_arn) - else: - return self.create_cog_collection(dataset_config) diff --git a/sm2a/dags/veda_data_pipeline/utils/s3_discovery.py b/sm2a/dags/veda_data_pipeline/utils/s3_discovery.py deleted file mode 100644 index 5a275701..00000000 --- a/sm2a/dags/veda_data_pipeline/utils/s3_discovery.py +++ /dev/null @@ -1,292 +0,0 @@ -import itertools -import json -import os -import re -from typing import List -from uuid import uuid4 -from pathlib import Path - -from datetime import datetime -from dateutil.tz import tzlocal -import boto3 -from smart_open import open as smrt_open - - -# Adding a custom exception for empty list -class EmptyFileListError(Exception): - def __init__(self, error_message): - self.error_message = error_message - super().__init__(self.error_message) - - -def assume_role(role_arn, session_name="veda-data-pipelines_s3-discovery"): - sts = boto3.client("sts") - credentials = sts.assume_role( - RoleArn=role_arn, - RoleSessionName=session_name, - ) - creds = credentials["Credentials"] - return { - "aws_access_key_id": creds["AccessKeyId"], - "aws_secret_access_key": creds.get("SecretAccessKey"), - "aws_session_token": creds.get("SessionToken"), - } - - -def get_s3_resp_iterator(bucket_name, prefix, s3_client, page_size=1000): - """ - Returns an s3 paginator. - :param bucket_name: The bucket. - :param prefix: The path for the s3 granules. - :param s3_client: Initialized boto3 S3 client - :param page_size: Number of records returned - """ - s3_paginator = s3_client.get_paginator("list_objects") - print(f"Getting S3 response iterator for bucket: {bucket_name}, prefix: {prefix}") - return s3_paginator.paginate( - Bucket=bucket_name, Prefix=prefix, PaginationConfig={"page_size": page_size} - ) - - -def discover_from_s3( - response_iterator, filename_regex: str, last_execution: datetime -) -> dict: - """Iterate through pages of S3 objects returned by a ListObjectsV2 operation. - The discover_from_s3 function takes in an iterator over the pages of S3 objects returned - by a ListObjectsV2 operation. It iterates through the pages and yields each S3 object in the page as a dictionary. - This function can be used to iterate through a large number of S3 objects returned by a ListObjectsV2 operation - without having to load all the objects into memory at once. - - Parameters: - response_iterator (iter): An iterator over the pages of S3 objects returned by a ListObjectsV2 operation. - filename_regex (str): A regular expression used to filter the S3 objects returned by the ListObjectsV2 operation. - - Yields: - dict: A dictionary representing an S3 object. 
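A hedged usage sketch of the two helpers above, assuming `get_s3_resp_iterator` and `discover_from_s3` from this module are in scope (bucket, prefix and regex values are placeholders):

```python
import boto3

# Illustrative wiring of the paginator and the generator; values are placeholders.
s3_client = boto3.client("s3")
pages = get_s3_resp_iterator(
    bucket_name="example-bucket",
    prefix="example-prefix/",
    s3_client=s3_client,
)
for obj in discover_from_s3(pages, filename_regex=r"^(.*)\.tif$", last_execution=None):
    print(obj["Key"], obj["LastModified"])
```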
- """ - for page in response_iterator: - for s3_object in page.get("Contents", {}): - key = s3_object["Key"] - conditionals = [re.match(filename_regex, key)] - if last_execution: - last_modified = s3_object["LastModified"] - conditionals.append(last_modified > last_execution) - if all(conditionals): - yield s3_object - - -def group_by_item(discovered_files: List[str], id_regex: str, assets: dict) -> dict: - """Group assets by matching regex patterns against discovered files.""" - grouped_files = [] - for uri in discovered_files: - # Each file gets its matched asset type and id - filename = uri.split("/")[-1] - prefix = "/".join(uri.split("/")[:-1]) - asset_type = None - if match := re.match(id_regex, filename): - # At least one match; can use the match here to construct an ID (match groups separated by '-') - item_id = "-".join(match.groups()) - for asset_name, asset_definition in assets.items(): - regex = asset_definition["regex"] - if re.match(regex, filename): - asset_type = asset_name - break - if asset_type: - grouped_files.append( - { - "prefix": prefix, - "filename": filename, - "asset_type": asset_type, - "item_id": item_id, - } - ) - else: - print(f"Warning: skipping file. No id match found: {filename}") - # At this point, files are labeled with type and id. Now, group them by id - sorted_list = sorted(grouped_files, key=lambda x: x["item_id"]) - grouped_data = [ - {"item_id": key, "data": list(group)} - for key, group in itertools.groupby(sorted_list, key=lambda x: x["item_id"]) - ] - items_with_assets = [] - # Produce a dictionary in which each record is keyed by an item ID and contains a list of associated asset hrefs - for group in grouped_data: - item = {"item_id": group["item_id"], "assets": {}} - for file in group["data"]: - asset_type = file["asset_type"] - filename = file["filename"] - # Copy the asset definition and update the href - updated_asset = assets[file["asset_type"]].copy() - updated_asset["href"] = f"{file['prefix']}/{file['filename']}" - item["assets"][asset_type] = updated_asset - items_with_assets.append(item) - return items_with_assets - - -def construct_single_asset_items(discovered_files: List[str], assets: dict|None) -> dict: - items_with_assets = [] - asset_key = "default" - asset_value = {} - if assets: - asset_key = list(assets.keys())[0] - asset_value = assets[asset_key] - for uri in discovered_files: - # Each file gets its matched asset type and id - filename = uri.split("/")[-1] - filename_without_extension = Path(filename).stem - prefix = "/".join(uri.split("/")[:-1]) - item = { - "item_id": filename_without_extension, - "assets": { - asset_key: { - "title": "Default COG Layer", - "description": "Cloud optimized default layer to display on map", - "href": f"{prefix}/{filename}", - **asset_value - } - }, - } - items_with_assets.append(item) - return items_with_assets - - -def generate_payload(s3_prefix_key: str, payload: dict): - """Generate a payload and write it to an S3 file. - This function takes in a prefix for an S3 key and a dictionary containing a payload. - The function then writes the payload to an S3 file using the provided prefix and a randomly - generated UUID as the key. The key of the output file is then returned. - Parameters: - s3_prefix_key (str): The prefix for the S3 key where the output file will be written. - payload (dict): A dictionary containing the payload to be written to the output file. - - Returns: - str: The S3 key of the output file. 
- """ - output_key = f"{s3_prefix_key}/s3_discover_output_{uuid4()}.json" - with smrt_open(output_key, "w") as file: - file.write(json.dumps(payload)) - return output_key - - -def propagate_forward_datetime_args(event): - """ - This function extracts datetime-related arguments from the input event dictionary. - The purpose is to forward these datetime arguments to other functions that may require them. - - The function looks for the keys "single_datetime", "start_datetime", "end_datetime", - and "datetime_range" in the event dictionary. If any of these keys are present, - it includes them in the output dictionary. - - Parameters: - event (dict): Input dictionary potentially containing datetime arguments. - - Returns: - dict: A new dictionary containing only the datetime-related keys from the input - that were present. If none of the specified keys are in the event, - the function returns an empty dictionary. - """ - keys = ["single_datetime", "start_datetime", "end_datetime", "datetime_range"] - return {key: event[key] for key in keys if key in event} - - -def s3_discovery_handler(event, chunk_size=2800, role_arn=None, bucket_output=None): - bucket = event.get("bucket") - prefix = event.get("prefix", "") - filename_regex = event.get("filename_regex", None) - collection = event.get("collection", prefix.rstrip("/")) - properties = event.get("properties", {}) - assets = event.get("assets") - id_regex = event.get("id_regex") - id_template = event.get("id_template", "{}") - date_fields = propagate_forward_datetime_args(event) - dry_run = event.get("dry_run", False) - if process_from := event.get("process_from_yyyy_mm_dd"): - process_from = datetime.strptime(process_from, "%Y-%m-%d").replace( - tzinfo=tzlocal() - ) - if last_execution := event.get("last_successful_execution"): - last_execution = datetime.fromisoformat(last_execution) - if dry_run: - print("Running discovery in dry run mode") - - payload = {**event, "objects": []} - slice = event.get("slice") - - bucket_output = os.environ.get("EVENT_BUCKET", bucket_output) - key = f"s3://{bucket_output}/events/{collection}" - records = 0 - out_keys = [] - discovered = 0 - - kwargs = assume_role(role_arn=role_arn) if role_arn else {} - s3client = boto3.client("s3", **kwargs) - s3_iterator = get_s3_resp_iterator( - bucket_name=bucket, prefix=prefix, s3_client=s3client - ) - file_uris = [ - f"s3://{bucket}/{obj['Key']}" - for obj in discover_from_s3( - s3_iterator, filename_regex, last_execution=process_from or last_execution - ) - ] - - if len(file_uris) == 0: - raise EmptyFileListError(f"No files discovered at bucket: {bucket}, prefix: {prefix}") - - # group only if more than 1 assets - if assets and len(assets.keys()) > 1: - items_with_assets = group_by_item(file_uris, id_regex, assets) - else: - # out of convenience, we might not always want to explicitly define assets - # or if only a single asset is defined, follow default flow - items_with_assets = construct_single_asset_items(file_uris, assets) - - if len(items_with_assets) == 0: - raise EmptyFileListError( - f"No items could be constructed for files at bucket: {bucket}, prefix: {prefix}" - ) - - # Update IDs using id_template - for item in items_with_assets: - item["item_id"] = id_template.format(item["item_id"]) - - item_count = 0 - for item in items_with_assets: - item_count += 1 - # Logic to ingest a 'slice' of data - if slice: - if item_count < slice[0]: # Skip until we reach the start of the slice - continue - if ( - item_count >= slice[1] - ): # Stop once we reach the end of the 
slice, while saving progress - break - file_obj = { - "collection": collection, - "item_id": item["item_id"], - "assets": item["assets"], - "properties": properties, - **date_fields, - } - - if dry_run and item_count < 10: - print("-DRYRUN- Example item") - print(json.dumps(file_obj)) - - payload["objects"].append(file_obj) - if records == chunk_size: - out_keys.append(generate_payload(s3_prefix_key=key, payload=payload)) - records = 0 - discovered += len(payload["objects"]) - payload["objects"] = [] - records += 1 - - if payload["objects"]: - out_keys.append(generate_payload(s3_prefix_key=key, payload=payload)) - discovered += len(payload["objects"]) - # We need to make sure the payload isn't too large for ECS overrides - try: - del event["assets"] - except KeyError: - pass - return {**event, "payload": out_keys, "discovered": discovered} diff --git a/sm2a/dags/veda_data_pipeline/utils/schemas.py b/sm2a/dags/veda_data_pipeline/utils/schemas.py deleted file mode 100644 index c5f33b9e..00000000 --- a/sm2a/dags/veda_data_pipeline/utils/schemas.py +++ /dev/null @@ -1,15 +0,0 @@ -# Description: Lightweight schema definitions - -from datetime import datetime -from typing import List, Union -from stac_pydantic.collection import Extent, TimeInterval - - -class DatetimeInterval(TimeInterval): - # reimplement stac_pydantic's TimeInterval to leverage datetime types - interval: List[List[Union[datetime, None]]] - - -class SpatioTemporalExtent(Extent): - # reimplement stac_pydantic's Extent to leverage datetime types - temporal: DatetimeInterval diff --git a/sm2a/dags/veda_data_pipeline/utils/submit_stac.py b/sm2a/dags/veda_data_pipeline/utils/submit_stac.py deleted file mode 100644 index 1d4edfca..00000000 --- a/sm2a/dags/veda_data_pipeline/utils/submit_stac.py +++ /dev/null @@ -1,136 +0,0 @@ -import json -import os -import sys -from dataclasses import dataclass - -if sys.version_info >= (3, 8): - from typing import TypedDict -else: - from typing_extensions import TypedDict - -from typing import Any, Dict, Optional, Union - -import boto3 -import requests - - -class InputBase(TypedDict): - dry_run: Optional[Any] - - -class S3LinkInput(InputBase): - stac_file_url: str - - -class StacItemInput(InputBase): - stac_item: Dict[str, Any] - - -class AppConfig(TypedDict): - cognito_domain: str - client_id: str - client_secret: str - scope: str - - -class Creds(TypedDict): - access_token: str - expires_in: int - token_type: str - - -@dataclass -class IngestionApi: - base_url: str - token: str - - @classmethod - def from_veda_auth_secret(cls, *, secret_id: str, base_url: str) -> "IngestionApi": - cognito_details = cls._get_cognito_service_details(secret_id) - credentials = cls._get_app_credentials(**cognito_details) - return cls(token=credentials["access_token"], base_url=base_url) - - @staticmethod - def _get_cognito_service_details(secret_id: str) -> AppConfig: - client = boto3.client("secretsmanager") - response = client.get_secret_value(SecretId=secret_id) - return json.loads(response["SecretString"]) - - @staticmethod - def _get_app_credentials( - cognito_domain: str, client_id: str, client_secret: str, scope: str, **kwargs - ) -> Creds: - response = requests.post( - f"{cognito_domain}/oauth2/token", - headers={ - "Content-Type": "application/x-www-form-urlencoded", - }, - auth=(client_id, client_secret), - data={ - "grant_type": "client_credentials", - # A space-separated list of scopes to request for the generated access token. 
- "scope": scope, - }, - ) - try: - response.raise_for_status() - except Exception as ex: - print(response.text) - raise f"Error, {ex}" - return response.json() - - def submit(self, event: Dict[str, Any], endpoint: str) -> Dict[str, Any]: - headers = { - "Authorization": f"Bearer {self.token}", - "Content-Type": "application/json", - } - response = requests.post( - f"{self.base_url.rstrip('/')}{endpoint}", - json=event, - headers=headers, - ) - try: - response.raise_for_status() - except Exception as e: - print(response.text) - raise e - return response.json() - - -def submission_handler( - event: Union[S3LinkInput, StacItemInput, Dict[str, Any]], - endpoint: str = "/ingestions", - cognito_app_secret=None, - stac_ingestor_api_url=None, - context=None, -) -> None: - if context is None: - context = {} - - stac_item = event - - if stac_item.get("dry_run"): - print("Dry run, not inserting, would have inserted:") - print(json.dumps(stac_item, indent=2)) - return - - cognito_app_secret = cognito_app_secret or os.getenv("COGNITO_APP_SECRET") - stac_ingestor_api_url = stac_ingestor_api_url or os.getenv("STAC_INGESTOR_API_URL") - - ingestor = IngestionApi.from_veda_auth_secret( - secret_id=cognito_app_secret, - base_url=stac_ingestor_api_url, - ) - ingestor.submit(event=stac_item, endpoint=endpoint) - # print("Successfully submitted STAC item") - - -if __name__ == "__main__": - filename = "example.ndjson" - sample_event = { - "stac_file_url": "example.ndjson", - # or - "stac_item": {}, - "type": "collections", - } - submission_handler(sample_event) diff --git a/sm2a/dags/veda_data_pipeline/utils/transfer.py b/sm2a/dags/veda_data_pipeline/utils/transfer.py deleted file mode 100644 index 20823f37..00000000 --- a/sm2a/dags/veda_data_pipeline/utils/transfer.py +++ /dev/null @@ -1,110 +0,0 @@ -import re - -import boto3 -from airflow.exceptions import AirflowException - - -def assume_role(role_arn, session_name="veda-data-airflow_s3-discovery"): - sts = boto3.client("sts") - print(f"Assuming role: {role_arn}") - credentials = sts.assume_role( - RoleArn=role_arn, - RoleSessionName=session_name, - ) - creds = credentials["Credentials"] - return { - "aws_access_key_id": creds["AccessKeyId"], - "aws_secret_access_key": creds.get("SecretAccessKey"), - "aws_session_token": creds.get("SessionToken"), - } - - -def get_matching_files(s3_client, bucket, prefix, regex_pattern): - matching_files = [] - - response = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix) - while True: - for obj in response["Contents"]: - file_key = obj["Key"] - if re.match(regex_pattern, file_key): - matching_files.append(file_key) - - if "NextContinuationToken" in response: - response = s3_client.list_objects_v2( - Bucket=bucket, - Prefix=prefix, - ContinuationToken=response["NextContinuationToken"], - ) - else: - break - - return matching_files - - -def transfer_files_within_s3( - s3_client, origin_bucket, matching_files, destination_bucket, collection -): - for file_key in matching_files: - filename = file_key.split("/")[-1] - # print(f"Transferring file: {filename}") - target_key = f"{collection}/{filename}" - copy_source = {"Bucket": origin_bucket, "Key": file_key} - - # We can use the etag to check if the file has already been copied and avoid duplication of effort - # by using the CopySourceIfNoneMatch parameter below. 
- try: - target_metadata = s3_client.head_object( - Bucket=destination_bucket, Key=target_key - ) - target_etag = target_metadata["ETag"] - # print(f"File already exists, checking Etag: {filename}") - s3_client.copy_object( - CopySource=copy_source, - Bucket=destination_bucket, - Key=target_key, - CopySourceIfNoneMatch=target_etag, - ) - except s3_client.exceptions.ClientError as err: - if err.response["Error"]["Code"] == "404": - # print(f"Copying file: {filename}") - s3_client.copy_object( - CopySource=copy_source, - Bucket=destination_bucket, - Key=target_key - ) - - -def data_transfer_handler(event, role_arn=None): - origin_bucket = event.get("origin_bucket") - origin_prefix = event.get("origin_prefix") - filename_regex = event.get("filename_regex") - target_bucket = event.get("target_bucket") - collection = event.get("collection") - - kwargs = assume_role(role_arn=role_arn) if role_arn else {} - s3client = boto3.client("s3", **kwargs) - matching_files = get_matching_files( - s3_client=s3client, - bucket=origin_bucket, - prefix=origin_prefix, - regex_pattern=filename_regex, - ) - - if len(matching_files) == 0: - raise AirflowException("No matching files found") - - if not event.get("dry_run"): - transfer_files_within_s3( - s3_client=s3client, - origin_bucket=origin_bucket, - matching_files=matching_files, - destination_bucket=target_bucket, - collection=collection, - ) - else: - print( - f"Would have copied {len(matching_files)} files from {origin_bucket} to {target_bucket}" - ) - print(f"Files matched: {matching_files}") - - return {**event} diff --git a/sm2a/dags/veda_data_pipeline/veda_collection_pipeline.py b/sm2a/dags/veda_data_pipeline/veda_collection_pipeline.py deleted file mode 100644 index 8e67584a..00000000 --- a/sm2a/dags/veda_data_pipeline/veda_collection_pipeline.py +++ /dev/null @@ -1,49 +0,0 @@ -import pendulum -from airflow import DAG -from airflow.operators.dummy_operator import DummyOperator as EmptyOperator -from airflow.utils.trigger_rule import TriggerRule -from veda_data_pipeline.groups.collection_group import collection_task_group - -dag_doc_md = """ -### Collection Creation and Ingestion -Generates a collection based on the Dataset model and ingests into the catalog -#### Notes -- This DAG can run with the following configuration
-```json -{ - "collection": "collection-id", - "data_type": "cog", - "description": "collection description", - "is_periodic": true, - "license": "collection-LICENSE", - "time_density": "year", - "title": "collection-title" -} -``` -""" - -dag_args = { - "start_date": pendulum.today("UTC").add(days=-1), - "schedule_interval": None, - "catchup": False, - "doc_md": dag_doc_md, - "tags": ["collection"], -} - -template_dag_run_conf = { - "collection": "", - "data_type": "cog", - "description": "", - "is_periodic": "", - "license": "", - "time_density": "", - "title": "" -} - -with DAG("veda_collection_pipeline", params=template_dag_run_conf, **dag_args) as dag: - start = EmptyOperator(task_id="start", dag=dag) - end = EmptyOperator(task_id="end", trigger_rule=TriggerRule.NONE_FAILED_MIN_ONE_SUCCESS, dag=dag) - - collection_grp = collection_task_group() - - start >> collection_grp >> end diff --git a/sm2a/dags/veda_data_pipeline/veda_dataset_pipeline.py b/sm2a/dags/veda_data_pipeline/veda_dataset_pipeline.py deleted file mode 100644 index d456a80a..00000000 --- a/sm2a/dags/veda_data_pipeline/veda_dataset_pipeline.py +++ /dev/null @@ -1,80 +0,0 @@ -import pendulum -from airflow import DAG -from airflow.decorators import task -from airflow.operators.dummy_operator import DummyOperator as EmptyOperator -from airflow.utils.trigger_rule import TriggerRule -from veda_data_pipeline.groups.collection_group import collection_task_group -from veda_data_pipeline.groups.discover_group import subdag_discover - -dag_doc_md = """ -### Dataset Pipeline -Generates a collection and triggers the file discovery process -#### Notes -- This DAG can run with the following configuration
-```json -{ - "collection": "collection-id", - "data_type": "cog", - "description": "collection description", - "discovery_items": - [ - { - "bucket": "veda-data-store-staging", - "datetime_range": "year", - "discovery": "s3", - "filename_regex": "^(.*).tif$", - "prefix": "example-prefix/" - } - ], - "is_periodic": true, - "license": "collection-LICENSE", - "time_density": "year", - "title": "collection-title" -} -``` -""" - -dag_args = { - "start_date": pendulum.today("UTC").add(days=-1), - "schedule_interval": None, - "catchup": False, - "doc_md": dag_doc_md, - "tags": ["collection", "discovery"], -} - -@task -def extract_discovery_items(**kwargs): - ti = kwargs.get("ti") - discovery_items = ti.dag_run.conf.get("discovery_items") - print(discovery_items) - return discovery_items - -template_dag_run_conf = { - "collection": "", - "data_type": "cog", - "description": "", - "discovery_items": - [ - { - "bucket": "", - "datetime_range": "", - "discovery": "s3", - "filename_regex": "", - "prefix": "" - } - ], - "is_periodic": "", - "license": "", - "time_density": "", - "title": "" -} - -with DAG("veda_dataset_pipeline", params=template_dag_run_conf, **dag_args) as dag: - start = EmptyOperator(task_id="start", dag=dag) - end = EmptyOperator(task_id="end", dag=dag) - - collection_grp = collection_task_group() - discover_grp = subdag_discover.expand(event=extract_discovery_items()) - - start >> collection_grp >> discover_grp >> end - diff --git a/sm2a/dags/veda_data_pipeline/veda_discover_pipeline.py b/sm2a/dags/veda_data_pipeline/veda_discover_pipeline.py deleted file mode 100644 index 37a5d520..00000000 --- a/sm2a/dags/veda_data_pipeline/veda_discover_pipeline.py +++ /dev/null @@ -1,92 +0,0 @@ -import pendulum -from airflow import DAG -from airflow.operators.dummy_operator import DummyOperator -from airflow.utils.trigger_rule import TriggerRule -from veda_data_pipeline.groups.discover_group import subdag_discover - -dag_doc_md = """ -### Discover files from S3 -#### Purpose -This DAG discovers files from either S3 and/or CMR then runs a DAG id `veda_ingest`. -The DAG `veda_ingest` will run in parallel processing (2800 files per each DAG) -#### Notes -- This DAG can run with the following configuration
-```json -{ - "collection": "collection-id", - "bucket": "veda-data-store-staging", - "prefix": "s3-prefix/", - "filename_regex": "^(.*).tif$", - "id_regex": ".*_(.*).tif$", - "process_from_yyyy_mm_dd": "YYYY-MM-DD", - "id_template": "example-id-prefix-{}", - "datetime_range": "month", - "last_successful_execution": datetime(2015,01,01), - "assets": { - "asset1": { - "title": "Asset type 1", - "description": "First of a multi-asset item.", - "regex": ".*asset1.*", - }, - "asset2": { - "title": "Asset type 2", - "description": "Second of a multi-asset item.", - "regex": ".*asset2.*", - }, - } -} -``` -- [Supports linking to external content](https://github.com/NASA-IMPACT/veda-data-pipelines) -""" - -dag_args = { - "start_date": pendulum.today("UTC").add(days=-1), - "catchup": False, - "doc_md": dag_doc_md, - "is_paused_upon_creation": False, -} - -templat_dag_run_conf = { - "collection": "", - "bucket": "", - "prefix": "/", - "filename_regex": "", - "id_regex": "", - "id_template": "", - "datetime_range": "||", - "assets": { - "": { - "title": "", - "description": "", - "regex": "", - }, - "": { - "title": "", - "description": "", - "regex": "", - }, - }, -} - - -def get_discover_dag(id, event={}): - params_dag_run_conf = event or templat_dag_run_conf - with DAG( - id, - schedule_interval=event.get("schedule"), - params=params_dag_run_conf, - **dag_args - ) as dag: - start = DummyOperator(task_id="Start", dag=dag) - end = DummyOperator( - task_id="End", trigger_rule=TriggerRule.ONE_SUCCESS, dag=dag - ) - - discover_grp = subdag_discover(event) - - start >> discover_grp >> end - - return dag - - -get_discover_dag("veda_discover") diff --git a/sm2a/dags/veda_data_pipeline/veda_process_raster_pipeline.py b/sm2a/dags/veda_data_pipeline/veda_process_raster_pipeline.py deleted file mode 100644 index 2555c6a9..00000000 --- a/sm2a/dags/veda_data_pipeline/veda_process_raster_pipeline.py +++ /dev/null @@ -1,52 +0,0 @@ -import pendulum -from airflow import DAG -from airflow.operators.dummy_operator import DummyOperator -from airflow.utils.trigger_rule import TriggerRule -from veda_data_pipeline.groups.processing_group import subdag_process - -dag_doc_md = """ -### Build and submit stac -#### Purpose -This DAG is supposed to be triggered by `veda_discover`. But you still can trigger this DAG manually or through an API - -#### Notes -- This DAG can run with the following configuration
-```json -{ - "collection": "geoglam", - "prefix": "geoglam/", - "bucket": "veda-data-store-staging", - "filename_regex": "^(.*).tif$", - "discovery": "s3", - "datetime_range": "month", - "discovered": 33, - "payload": "s3://veda-uah-sit-mwaa-853558080719/events/geoglam/s3_discover_output_6c46b57a-7474-41fe-977a-.json" -} -``` -- [Supports linking to external content](https://github.com/NASA-IMPACT/veda-data-pipelines) -""" - -template_dag_run_conf = { - "collection": "", - "prefix": "/", - "bucket": "", - "filename_regex": "", - "discovery": "", - "datetime_range": "|", - "payload": "> process_grp >> end diff --git a/sm2a/dags/veda_data_pipeline/veda_process_vector_pipeline.py b/sm2a/dags/veda_data_pipeline/veda_process_vector_pipeline.py deleted file mode 100644 index 89c75848..00000000 --- a/sm2a/dags/veda_data_pipeline/veda_process_vector_pipeline.py +++ /dev/null @@ -1,110 +0,0 @@ -import pendulum -from airflow import DAG -from airflow.models.variable import Variable -from airflow.providers.amazon.aws.operators.ecs import EcsRunTaskOperator -from airflow.operators.dummy_operator import DummyOperator -from airflow.utils.trigger_rule import TriggerRule - -from datetime import timedelta - -dag_doc_md = """ -### Build and submit stac -#### Purpose -This DAG is supposed to be triggered by `veda_discover`. But you still can trigger this DAG manually or through an API - -#### Notes -- This DAG can run with the following configuration
-```json -{ - "collection": "geoglam", - "prefix": "geoglam/", - "bucket": "veda-data-store-staging", - "filename_regex": "^(.*).tif$", - "discovery": "s3", - "datetime_range": "month", - "upload": false, - "cogify": false, - "discovered": 33, - "payload": "s3://veda-uah-sit-mwaa-853558080719/events/geoglam/s3_discover_output_6c46b57a-7474-41fe-977a-19d164531cdc.json" -} -``` -- [Supports linking to external content](https://github.com/NASA-IMPACT/veda-data-pipelines) -""" - -templat_dag_run_conf = { - "collection": "", - "prefix": "/", - "bucket": "", - "filename_regex": "", - "discovery": "|cmr", - "datetime_range": "|", - "upload": " | true", - "cogify": "false | true", - "payload": "> ingest_vector >> end diff --git a/sm2a/dags/veda_data_pipeline/veda_transfer_pipeline.py b/sm2a/dags/veda_data_pipeline/veda_transfer_pipeline.py deleted file mode 100644 index 6c1b4f3a..00000000 --- a/sm2a/dags/veda_data_pipeline/veda_transfer_pipeline.py +++ /dev/null @@ -1,50 +0,0 @@ -import pendulum -from airflow import DAG -from airflow.operators.dummy_operator import DummyOperator -from airflow.utils.trigger_rule import TriggerRule -from veda_data_pipeline.groups.transfer_group import subdag_transfer - -dag_doc_md = """ -### Discover files from S3 -#### Purpose -This DAG is used to transfer files that are to permanent locations for indexing with STAC. -#### Notes -- This DAG can run with a configuration similar to this
-```json -{ - "origin_bucket": "covid-eo-dashboard", - "origin_prefix": "s3-prefix/", - "filename_regex": "^(.*).tif$", - "target_bucket": "target_s3_bucket", - "collection": "collection-id", - "cogify": false, - "dry_run": true -} -``` -- [Supports linking to external content](https://github.com/NASA-IMPACT/veda-data-pipelines) -""" - -dag_args = { - "start_date": pendulum.today("UTC").add(days=-1), - "schedule_interval": None, - "catchup": False, - "doc_md": dag_doc_md, -} - -templat_dag_run_conf = { - "origin_bucket": "", - "origin_prefix": "/", - "filename_regex": "", - "target_bucket": "", - "collection": "", - "cogify": "true|false", - "dry_run": "true|false", -} - -with DAG("veda_transfer", params=templat_dag_run_conf, **dag_args) as dag: - start = DummyOperator(task_id="Start", dag=dag) - end = DummyOperator(task_id="End", trigger_rule=TriggerRule.ONE_SUCCESS, dag=dag) - - transfer_grp = subdag_transfer() - - start >> transfer_grp >> end From b93d79f54ba0a89abde263155c1910c622a873d8 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Tue, 6 Aug 2024 13:59:28 -0500 Subject: [PATCH 31/97] Add generic vector ingest --- docker_tasks/vector_ingest/handler.py | 323 ++++++-------------------- 1 file changed, 71 insertions(+), 252 deletions(-) diff --git a/docker_tasks/vector_ingest/handler.py b/docker_tasks/vector_ingest/handler.py index 633e32b2..5dd85e4c 100644 --- a/docker_tasks/vector_ingest/handler.py +++ b/docker_tasks/vector_ingest/handler.py @@ -8,13 +8,6 @@ import smart_open from urllib.parse import urlparse import psycopg2 -import geopandas as gpd -from shapely import wkb -from geoalchemy2 import Geometry -import sqlalchemy -from sqlalchemy import create_engine, MetaData, Table, Column, inspect -import concurrent.futures -from sqlalchemy.dialects.postgresql import DOUBLE_PRECISION, INTEGER, VARCHAR, TIMESTAMP def download_file(file_uri: str): @@ -44,159 +37,36 @@ def download_file(file_uri: str): sts.close() return target_filepath +# Just to test locally +def download_file2(file_uri: str): + s3 = boto3.client("s3") + + + url_parse = urlparse(file_uri) + print("url_parsed: ", url_parse) + + bucket = url_parse.netloc + path = url_parse.path[1:] + filename = url_parse.path.split("/")[-1] + print(bucket, path, filename) + target_filepath = os.path.join("/tmp", filename) + + s3.download_file(bucket, path, target_filepath) + + print(f"downloaded {target_filepath}") + + s3.close() + return target_filepath + def get_connection_string(secret: dict, as_uri: bool = False) -> str: if as_uri: return f"postgresql://{secret['username']}:{secret['password']}@{secret['host']}:5432/{secret['dbname']}" else: + #return f"PG:host=localhost port=5432 dbname=postgis user=username password=password" return f"PG:host={secret['host']} dbname={secret['dbname']} user={secret['username']} password={secret['password']}" -def get_gdf_schema(gdf, target_projection): - """map GeoDataFrame columns into a table schema - - :param gdf: GeoDataFrame from geopandas - :param target_projection: srid for the target table geometry column - :return: - """ - # map geodatafrome dtypes to sqlalchemy types - dtype_map = { - "int64": INTEGER, - "float64": DOUBLE_PRECISION, - "object": VARCHAR, - "datetime64": TIMESTAMP, - } - schema = [] - for column, dtype in zip(gdf.columns, gdf.dtypes): - if str(dtype) == "geometry": - # do not inpsect to retrieve geom type, just use generic GEOMETRY - # geom_type = str(gdf[column].geom_type.unique()[0]).upper() - geom_type = str(dtype).upper() - # do not taKe SRID from existing file for 
target table - # we always want to transform from file EPSG to Table EPSG() - column_type = Geometry(geometry_type=geom_type, srid=target_projection) - else: - dtype_str = str(dtype) - column_type = dtype_map.get(dtype_str.split("[")[0], VARCHAR) - - if column == "primarykey": - schema.append(Column(column.lower(), column_type, unique=True)) - else: - schema.append(Column(column.lower(), column_type)) - return schema - - -def ensure_table_exists( - db_metadata: MetaData, gpkg_file: str, target_projection: int, table_name: str -): - """create a table if it doesn't exist or just - validate GeoDataFrame columns against existing table - - :param db_metadata: instance of sqlalchemy.MetaData - :param gpkg_file: file path to GPKG - :param target_projection: srid for target DB table geometry column - :param table_name: name of table to create - :return: None - """ - gdf = gpd.read_file(gpkg_file) - gdf_schema = get_gdf_schema(gdf, target_projection) - engine = db_metadata.bind - try: - Table(table_name, db_metadata, autoload_with=engine) - except sqlalchemy.exc.NoSuchTableError: - Table(table_name, db_metadata, *gdf_schema) - db_metadata.create_all(engine) - - # validate gdf schema against existing table schema - insp = inspect(engine) - existing_columns = insp.get_columns(table_name) - existing_column_names = [col["name"] for col in existing_columns] - for column in gdf_schema: - if column.name not in existing_column_names: - raise ValueError( - f"your .gpkg seems to have a column={column.name} that does not exist in the existing table columns={existing_column_names}" - ) - - -def delete_region( - engine, - gpkg_path: str, - table_name: str, -): - """delete all existing records by region name""" - gdf = gpd.read_file(gpkg_path) - region_name = gdf["region"].iloc[0] - with engine.connect() as conn: - with conn.begin(): - delete_sql = sqlalchemy.text( - f""" - DELETE FROM {table_name} WHERE region='{region_name}' - """ - ) - conn.execute(delete_sql) - - -def upsert_to_postgis( - engine, - gpkg_path: str, - target_projection: int, - table_name: str, - batch_size: int = 10000, -): - """batch the GPKG file and upsert via threads - - :param engine: instance of sqlalchemy.Engine - :param gpkg_path: file path to GPKG - :param table_name: name of the target table - :param batch_size: upper limit of batch size - :return: - """ - gdf = gpd.read_file(gpkg_path) - source_epsg_code = gdf.crs.to_epsg() - if not source_epsg_code: - # assume NAD27 Equal Area for now :shrug: - # since that's what the default is for Fire Atlas team exports - # that's what PROJ4 does under the hood for 9311 :wethinksmirk: - source_epsg_code = 2163 - - # convert the `t` column to something suitable for sql insertion otherwise we get 'Timestamp()' - gdf["t"] = gdf["t"].dt.strftime("%Y-%m-%d %H:%M:%S") - # convert to WKB - gdf["geometry"] = gdf["geometry"].apply(lambda geom: wkb.dumps(geom, hex=True)) - - def upsert_batch(batch): - with engine.connect() as conn: - with conn.begin(): - for row in batch.to_dict(orient="records"): - # make sure all column names are lower case for keys and values - row = {k.lower(): v for k, v in row.items()} - columns = [col.lower() for col in batch.columns] - - non_geom_placeholders = ", ".join( - [f":{col}" for col in columns[:-1]] - ) - # NOTE: we need to escape `::geometry` so parameterized statements don't try to replace it - # because parametrized statements in sqlalchemy are `:` - geom_placeholder = f"ST_Transform(ST_SetSRID(ST_GeomFromWKB(:geometry\:\:geometry), {source_epsg_code}), 
{target_projection})" # noqa: W605 - upsert_sql = sqlalchemy.text( - f""" - INSERT INTO {table_name} ({', '.join([col for col in columns])}) - VALUES ({non_geom_placeholders},{geom_placeholder}) - ON CONFLICT (primarykey) - DO UPDATE SET {', '.join(f"{col}=EXCLUDED.{col}" for col in columns if col != 'primarykey')} - """ - ) - - # logging.debug(f"[ UPSERT SQL ]:\n{str(upsert_sql)}") - conn.execute(upsert_sql, row) - - batches = [gdf.iloc[i : i + batch_size] for i in range(0, len(gdf), batch_size)] - # set `max_workers` to something below max concurrent connections for postgresql - # https://www.postgresql.org/docs/14/runtime-config-connection.html - with concurrent.futures.ThreadPoolExecutor(max_workers=75) as executor: - executor.map(upsert_batch, batches) - - def get_secret(secret_name: str) -> None: """Retrieve secrets from AWS Secrets Manager @@ -225,17 +95,14 @@ def get_secret(secret_name: str) -> None: return json.loads(base64.b64decode(get_secret_value_response["SecretBinary"])) + def load_to_featuresdb( filename: str, collection: str, - extra_flags: list = None, - target_projection: str = "EPSG:4326", + x_possible: str = "longitude", + y_possible: str = "latitude", ): - if extra_flags is None: - extra_flags = ["-overwrite", "-progress"] - secret_name = os.environ.get("VECTOR_SECRET_NAME") - con_secrets = get_secret(secret_name) connection = get_connection_string(con_secrets) @@ -245,18 +112,25 @@ def load_to_featuresdb( "-f", "PostgreSQL", connection, - "-t_srs", - target_projection, filename, + "-oo", + f"X_POSSIBLE_NAMES={x_possible}", + "-oo", + f"Y_POSSIBLE_NAMES={y_possible}", "-nln", - collection, - *extra_flags, + collection, # Or could be the actual filename + "-s_srs", + "EPSG:4326", + "-t_srs", + "EPSG:4326", + "-overwrite" ] out = subprocess.run( options, check=False, capture_output=True, ) + #print("db connection ", options) if out.stderr: error_description = f"Error: {out.stderr}" @@ -266,81 +140,12 @@ def load_to_featuresdb( return {"status": "success"} -def load_to_featuresdb_eis( - filename: str, - collection: str, - target_projection: int = 4326, -): - """create table if not exists and upload GPKG - - :param filename: the file path to the downloaded GPKG - :param collection: the name of the collection - :param target_projection: srid for the target table - :return: None - """ - secret_name = os.environ.get("VECTOR_SECRET_NAME") - conn_secrets = get_secret(secret_name) - connection_string = get_connection_string(conn_secrets, as_uri=True) - - # NOTE: about `collection.rsplit` below: - # - # EIS Fire team naming convention for outputs - # Snapshots: "snapshot_{layer_name}_nrt_{region_name}.gpkg" - # Lf_archive: "lf_{layer_name}_archive_{region_name}.gpkg" - # Lf_nrt: "lf_{layer_name}_nrt_{region_name}.gpkg" - # - # Insert/Alter on table call everything except the region name: - # e.g. 
`snapshot_perimeter_nrt_conus` this gets inserted into the table `eis_fire_snapshot_perimeter_nrt` - collection = collection.rsplit("_", 1)[0] - target_table_name = f"eis_fire_{collection}" - - engine = create_engine(connection_string) - metadata = MetaData() - metadata.bind = engine - - ensure_table_exists(metadata, filename, target_projection, target_table_name) - delete_region(engine, filename, target_table_name) - upsert_to_postgis(engine, filename, target_projection, target_table_name) - return {"status": "success"} - - -def alter_datetime_add_indexes_eis(collection: str): - # NOTE: about `collection.rsplit` below: - # - # EIS Fire team naming convention for outputs - # Snapshots: "snapshot_{layer_name}_nrt_{region_name}.gpkg" - # Lf_archive: "lf_{layer_name}_archive_{region_name}.gpkg" - # Lf_nrt: "lf_{layer_name}_nrt_{region_name}.gpkg" - # - # Insert/Alter on table call everything except the region name: - # e.g. `snapshot_perimeter_nrt_conus` this gets inserted into the table `eis_fire_snapshot_perimeter_nrt` - collection = collection.rsplit("_", 1)[0] - secret_name = os.environ.get("VECTOR_SECRET_NAME") - conn_secrets = get_secret(secret_name) - conn = psycopg2.connect( - host=conn_secrets["host"], - dbname=conn_secrets["dbname"], - user=conn_secrets["username"], - password=conn_secrets["password"], - ) - - cur = conn.cursor() - cur.execute( - f"ALTER table eis_fire_{collection} " - f"ALTER COLUMN t TYPE TIMESTAMP USING t::timestamp without time zone; " - f"CREATE INDEX IF NOT EXISTS idx_eis_fire_{collection}_datetime ON eis_fire_{collection}(t);" - f"CREATE INDEX IF NOT EXISTS idx_eis_fire_{collection}_primarykey ON eis_fire_{collection}(primarykey);" - f"CREATE INDEX IF NOT EXISTS idx_eis_fire_{collection}_region ON eis_fire_{collection}(region);" - ) - conn.commit() - - -def handler(): - print("Vector ingest started") +def handler(event, context): + print("Generic Vector ingest started") parser = ArgumentParser( - prog="vector_ingest", - description="Ingest Vector", + prog="generic_vector_ingest", + description="Ingest Vector- Generic", epilog="Running the code as ECS task", ) parser.add_argument( @@ -350,35 +155,49 @@ def handler(): payload_event = ast.literal_eval(args.payload) s3_event = payload_event.pop("payload") - with smart_open.open(s3_event, "r") as _file: + + # extract the actual link of the json file and read + with smart_open.open(s3_event[0], "r") as _file: s3_event_read = _file.read() + print("file read done") event_received = json.loads(s3_event_read) s3_objects = event_received["objects"] status = list() for s3_object in s3_objects: - href = s3_object["s3_filename"] - collection = s3_object["collection"] + href = s3_object["assets"]["default"]["href"] #s3://ghgc-data-store-develop/transformed_csv/NIST_Urban_Testbed/NEB-ch4.csv + + # These will be later extracted from the json file. 
Need to see if the json file put x_possible in the json file after the dag is triggered with x_possibel in it + x_possible = s3_object["x_possible"] + y_possible = s3_object["y_possible"] + + #collection = s3_object["collection"] + #collection = href.split("/")[-1].split(".")[0] + # or it could be + collection = href.split("/")[-2] + '_' + href.split("/")[-1].split(".")[0] + downloaded_filepath = download_file(href) + print("-----------------------------------------------------\n") print(f"[ DOWNLOAD FILEPATH ]: {downloaded_filepath}") print(f"[ COLLECTION ]: {collection}") - - s3_object_prefix = event_received["prefix"] - if s3_object_prefix.startswith("EIS/"): - coll_status = load_to_featuresdb_eis(downloaded_filepath, collection) - else: - coll_status = load_to_featuresdb(downloaded_filepath, collection) - + coll_status = load_to_featuresdb(downloaded_filepath, collection, x_possible, y_possible) status.append(coll_status) + # delete file after ingest os.remove(downloaded_filepath) - if coll_status["status"] == "success" and s3_object_prefix.startswith("EIS/"): - alter_datetime_add_indexes_eis(collection) - elif coll_status["status"] != "success": - # bubble exception so Airflow shows it as a failure - raise Exception(coll_status["reason"]) - print(status) + # Not sure if we need it + # if coll_status["status"] == "success": + # alter_datetime_add_indexes(collection) + # else: + # # bubble exception so Airflow shows it as a failure + # raise Exception(coll_status["reason"]) + print("\n **********Overall Status*********\n", f"Done for {len(status)} csv files",status) if __name__ == "__main__": - handler() + # It has nothing to do + sample_event = { + "collection": "eis_fire_newfirepix_2", + "href": "s3://covid-eo-data/fireline/newfirepix.fgb", + } + handler(sample_event, {}) From 474f42b0f0a4139ce5181bcd1ce498db0b0078ff Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Tue, 6 Aug 2024 16:50:03 -0500 Subject: [PATCH 32/97] Resolve arm role bug --- dags/veda_data_pipeline/veda_process_vector_pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dags/veda_data_pipeline/veda_process_vector_pipeline.py b/dags/veda_data_pipeline/veda_process_vector_pipeline.py index 89c75848..c47732cd 100644 --- a/dags/veda_data_pipeline/veda_process_vector_pipeline.py +++ b/dags/veda_data_pipeline/veda_process_vector_pipeline.py @@ -82,7 +82,7 @@ { "name": "EXTERNAL_ROLE_ARN", "value": Variable.get( - "ASSUME_ROLE_READ_ARN", default_var=None + "ASSUME_ROLE_READ_ARN", default_var="" ), }, { @@ -99,8 +99,8 @@ }, network_configuration={ "awsvpcConfiguration": { - "securityGroups": vector_ecs_conf.get("VECTOR_SECURITY_GROUP"), - "subnets": vector_ecs_conf.get("VECTOR_SUBNETS"), + "securityGroups": mwaa_stack_conf.get("SECURITYGROUPS"), + "subnets": mwaa_stack_conf.get("SUBNETS"), }, }, awslogs_group=mwaa_stack_conf.get("LOG_GROUP_NAME"), From bdceef1c2c788f7f2ec81be21efb03408143a34a Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Tue, 6 Aug 2024 16:50:53 -0500 Subject: [PATCH 33/97] fix smart open bug --- docker_tasks/vector_ingest/handler.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/docker_tasks/vector_ingest/handler.py b/docker_tasks/vector_ingest/handler.py index 5dd85e4c..338fd61e 100644 --- a/docker_tasks/vector_ingest/handler.py +++ b/docker_tasks/vector_ingest/handler.py @@ -101,6 +101,8 @@ def load_to_featuresdb( collection: str, x_possible: str = "longitude", y_possible: str = "latitude", + source_projection : str ="EPSG:4326", 
+ target_projection : str ="EPSG:4326" ): secret_name = os.environ.get("VECTOR_SECRET_NAME") con_secrets = get_secret(secret_name) @@ -120,9 +122,9 @@ def load_to_featuresdb( "-nln", collection, # Or could be the actual filename "-s_srs", - "EPSG:4326", + source_projection, "-t_srs", - "EPSG:4326", + target_projection, "-overwrite" ] out = subprocess.run( @@ -157,7 +159,7 @@ def handler(event, context): s3_event = payload_event.pop("payload") # extract the actual link of the json file and read - with smart_open.open(s3_event[0], "r") as _file: + with smart_open.open(s3_event, "r") as _file: s3_event_read = _file.read() print("file read done") event_received = json.loads(s3_event_read) @@ -169,6 +171,8 @@ def handler(event, context): # These will be later extracted from the json file. Need to see if the json file put x_possible in the json file after the dag is triggered with x_possibel in it x_possible = s3_object["x_possible"] y_possible = s3_object["y_possible"] + source_projection = s3_object["source_projection"] + target_projection = s3_object["target_projection"] #collection = s3_object["collection"] #collection = href.split("/")[-1].split(".")[0] @@ -179,7 +183,7 @@ def handler(event, context): print("-----------------------------------------------------\n") print(f"[ DOWNLOAD FILEPATH ]: {downloaded_filepath}") print(f"[ COLLECTION ]: {collection}") - coll_status = load_to_featuresdb(downloaded_filepath, collection, x_possible, y_possible) + coll_status = load_to_featuresdb(downloaded_filepath, collection, x_possible, y_possible, source_projection, target_projection) status.append(coll_status) # delete file after ingest From 991e76435d73a5ac071b95afe271c36dcd1ae9c1 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Tue, 6 Aug 2024 18:34:34 -0500 Subject: [PATCH 34/97] logging few things --- docker_tasks/vector_ingest/handler.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/docker_tasks/vector_ingest/handler.py b/docker_tasks/vector_ingest/handler.py index 338fd61e..5119df9d 100644 --- a/docker_tasks/vector_ingest/handler.py +++ b/docker_tasks/vector_ingest/handler.py @@ -156,7 +156,13 @@ def handler(event, context): args = parser.parse_args() payload_event = ast.literal_eval(args.payload) + print("*********** payload", payload_event) s3_event = payload_event.pop("payload") + # These will be later extracted from the json file. Need to see if the json file put x_possible in the json file after the dag is triggered with x_possibel in it + x_possible = s3_object["x_possible"] + y_possible = s3_object["y_possible"] + source_projection = s3_object["source_projection"] + target_projection = s3_object["target_projection"] # extract the actual link of the json file and read with smart_open.open(s3_event, "r") as _file: @@ -167,13 +173,6 @@ def handler(event, context): status = list() for s3_object in s3_objects: href = s3_object["assets"]["default"]["href"] #s3://ghgc-data-store-develop/transformed_csv/NIST_Urban_Testbed/NEB-ch4.csv - - # These will be later extracted from the json file. 
Need to see if the json file put x_possible in the json file after the dag is triggered with x_possibel in it - x_possible = s3_object["x_possible"] - y_possible = s3_object["y_possible"] - source_projection = s3_object["source_projection"] - target_projection = s3_object["target_projection"] - #collection = s3_object["collection"] #collection = href.split("/")[-1].split(".")[0] # or it could be From 02faaac72fc7cda3ba6b969f15b613b35a80628a Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Tue, 6 Aug 2024 19:43:37 -0500 Subject: [PATCH 35/97] passing params --- docker_tasks/vector_ingest/handler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docker_tasks/vector_ingest/handler.py b/docker_tasks/vector_ingest/handler.py index 5119df9d..9b552587 100644 --- a/docker_tasks/vector_ingest/handler.py +++ b/docker_tasks/vector_ingest/handler.py @@ -159,10 +159,10 @@ def handler(event, context): print("*********** payload", payload_event) s3_event = payload_event.pop("payload") # These will be later extracted from the json file. Need to see if the json file put x_possible in the json file after the dag is triggered with x_possibel in it - x_possible = s3_object["x_possible"] - y_possible = s3_object["y_possible"] - source_projection = s3_object["source_projection"] - target_projection = s3_object["target_projection"] + x_possible = payload_event["x_possible"] + y_possible = payload_event["y_possible"] + source_projection = payload_event["source_projection"] + target_projection = payload_event["target_projection"] # extract the actual link of the json file and read with smart_open.open(s3_event, "r") as _file: From 3ac1f40ed1f23a3663f6824a49fbc23e9af9cc0c Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Wed, 7 Aug 2024 09:33:09 -0500 Subject: [PATCH 36/97] Try alternate download --- docker_tasks/vector_ingest/handler.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docker_tasks/vector_ingest/handler.py b/docker_tasks/vector_ingest/handler.py index 9b552587..6687a06a 100644 --- a/docker_tasks/vector_ingest/handler.py +++ b/docker_tasks/vector_ingest/handler.py @@ -40,8 +40,6 @@ def download_file(file_uri: str): # Just to test locally def download_file2(file_uri: str): s3 = boto3.client("s3") - - url_parse = urlparse(file_uri) print("url_parsed: ", url_parse) @@ -178,7 +176,7 @@ def handler(event, context): # or it could be collection = href.split("/")[-2] + '_' + href.split("/")[-1].split(".")[0] - downloaded_filepath = download_file(href) + downloaded_filepath = download_file2(href) print("-----------------------------------------------------\n") print(f"[ DOWNLOAD FILEPATH ]: {downloaded_filepath}") print(f"[ COLLECTION ]: {collection}") From 3c65a4945bb523b8a6284a3167318aaf5aefe77c Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 7 Aug 2024 15:33:50 -0500 Subject: [PATCH 37/97] Add a flag to deploy SM2A --- .github/workflows/cicd.yml | 3 ++- .gitignore | 1 + sm2a/infrastructure/variables.tf | 15 +++++---------- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index 99abf822..2e47f7ae 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -77,7 +77,8 @@ jobs: env_aws_secret_name: ${{ secrets.ENV_AWS_SECRET_NAME }} - name: Run SM2A deployment - if: ${{ needs.define-environment.outputs.env_name }} = "development" + # Flag to deploy SM2A + if: ${{ vars.DEPLOY_SM2A }} = "true" uses: "./.github/actions/terraform-deploy-sm2a" with: 
env_aws_secret_name: ${{ vars.SM2A_ENVS_DEPLOYMENT_SECRET_NAME }} diff --git a/.gitignore b/.gitignore index 5f60bd93..547284f8 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ __pycache__ # Ignore Terraform .terraform .env +.env_ghg_dev terraform.tf terraform.tfvars # Ignore data files which are downloaded for local testing diff --git a/sm2a/infrastructure/variables.tf b/sm2a/infrastructure/variables.tf index d216de13..378958e8 100644 --- a/sm2a/infrastructure/variables.tf +++ b/sm2a/infrastructure/variables.tf @@ -40,15 +40,6 @@ variable "rds_publicly_accessible" { default = false } -variable "custom_worker_policy_statement" { - type = list(object({ - Effect = string - Action = list(string) - Resource = list(string) - })) - default = [] - -} variable "scheduler_cpu" { type = number @@ -203,5 +194,9 @@ variable "custom_worker_policy_statement" { ] } - +variable "airflow_custom_variables" { + description = "Airflow custom variables" + type = map(string) + default = {} +} From ff307d6ec4349794e51a9b27e81d23cf0eb50926 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Wed, 7 Aug 2024 16:07:58 -0500 Subject: [PATCH 38/97] Add test tables --- docker_tasks/vector_ingest/handler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker_tasks/vector_ingest/handler.py b/docker_tasks/vector_ingest/handler.py index 6687a06a..7547591e 100644 --- a/docker_tasks/vector_ingest/handler.py +++ b/docker_tasks/vector_ingest/handler.py @@ -130,7 +130,7 @@ def load_to_featuresdb( check=False, capture_output=True, ) - #print("db connection ", options) + print("db connection ", options) if out.stderr: error_description = f"Error: {out.stderr}" @@ -174,7 +174,7 @@ def handler(event, context): #collection = s3_object["collection"] #collection = href.split("/")[-1].split(".")[0] # or it could be - collection = href.split("/")[-2] + '_' + href.split("/")[-1].split(".")[0] + collection = href.split("/")[-2] + '_test_' + href.split("/")[-1].split(".")[0] downloaded_filepath = download_file2(href) print("-----------------------------------------------------\n") From 84067518fa6c22a62ca6708f9e6946e5751210d5 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Thu, 8 Aug 2024 15:02:39 -0500 Subject: [PATCH 39/97] add extra flags --- docker_tasks/vector_ingest/handler.py | 55 +++++++-------------------- 1 file changed, 14 insertions(+), 41 deletions(-) diff --git a/docker_tasks/vector_ingest/handler.py b/docker_tasks/vector_ingest/handler.py index 7547591e..f30db2ba 100644 --- a/docker_tasks/vector_ingest/handler.py +++ b/docker_tasks/vector_ingest/handler.py @@ -7,38 +7,8 @@ import json import smart_open from urllib.parse import urlparse -import psycopg2 - def download_file(file_uri: str): - sts = boto3.client("sts") - response = sts.assume_role( - RoleArn=os.environ.get("EXTERNAL_ROLE_ARN"), - RoleSessionName="sts-assume-114506680961", - ) - new_session = boto3.Session( - aws_access_key_id=response["Credentials"]["AccessKeyId"], - aws_secret_access_key=response["Credentials"]["SecretAccessKey"], - aws_session_token=response["Credentials"]["SessionToken"], - ) - s3 = new_session.client("s3") - - url_parse = urlparse(file_uri) - - bucket = url_parse.netloc - path = url_parse.path[1:] - filename = url_parse.path.split("/")[-1] - target_filepath = os.path.join("/tmp", filename) - - s3.download_file(bucket, path, target_filepath) - - print(f"downloaded {target_filepath}") - - sts.close() - return target_filepath - -# Just to test locally -def download_file2(file_uri: str): s3 = 
boto3.client("s3") url_parse = urlparse(file_uri) print("url_parsed: ", url_parse) @@ -100,7 +70,8 @@ def load_to_featuresdb( x_possible: str = "longitude", y_possible: str = "latitude", source_projection : str ="EPSG:4326", - target_projection : str ="EPSG:4326" + target_projection : str ="EPSG:4326", + extra_flags: list = ["-overwrite", "-progress"] ): secret_name = os.environ.get("VECTOR_SECRET_NAME") con_secrets = get_secret(secret_name) @@ -123,7 +94,7 @@ def load_to_featuresdb( source_projection, "-t_srs", target_projection, - "-overwrite" + *extra_flags ] out = subprocess.run( options, @@ -156,11 +127,12 @@ def handler(event, context): payload_event = ast.literal_eval(args.payload) print("*********** payload", payload_event) s3_event = payload_event.pop("payload") - # These will be later extracted from the json file. Need to see if the json file put x_possible in the json file after the dag is triggered with x_possibel in it + x_possible = payload_event["x_possible"] y_possible = payload_event["y_possible"] source_projection = payload_event["source_projection"] target_projection = payload_event["target_projection"] + extra_flags = payload_event["extra_flags"] # extract the actual link of the json file and read with smart_open.open(s3_event, "r") as _file: @@ -176,22 +148,23 @@ def handler(event, context): # or it could be collection = href.split("/")[-2] + '_test_' + href.split("/")[-1].split(".")[0] - downloaded_filepath = download_file2(href) + downloaded_filepath = download_file(href) print("-----------------------------------------------------\n") print(f"[ DOWNLOAD FILEPATH ]: {downloaded_filepath}") print(f"[ COLLECTION ]: {collection}") - coll_status = load_to_featuresdb(downloaded_filepath, collection, x_possible, y_possible, source_projection, target_projection) + coll_status = load_to_featuresdb(downloaded_filepath, collection, + x_possible, y_possible, + source_projection, target_projection, + extra_flags) status.append(coll_status) # delete file after ingest os.remove(downloaded_filepath) - # Not sure if we need it - # if coll_status["status"] == "success": - # alter_datetime_add_indexes(collection) - # else: - # # bubble exception so Airflow shows it as a failure - # raise Exception(coll_status["reason"]) + if coll_status["status"] != "success": + # bubble exception so Airflow shows it as a failure + raise Exception(coll_status["reason"]) + print("\n **********Overall Status*********\n", f"Done for {len(status)} csv files",status) From 3e062a3dca537cdd3767a128f2738f867d0fd114 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Thu, 8 Aug 2024 15:41:21 -0500 Subject: [PATCH 40/97] Add comments --- docker_tasks/vector_ingest/handler.py | 50 ++++++++++++++------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/docker_tasks/vector_ingest/handler.py b/docker_tasks/vector_ingest/handler.py index f30db2ba..fcc91fc2 100644 --- a/docker_tasks/vector_ingest/handler.py +++ b/docker_tasks/vector_ingest/handler.py @@ -9,20 +9,24 @@ from urllib.parse import urlparse def download_file(file_uri: str): + """Downloads file from s3 + + Args: + file_uri (str): s3 URL of the file to be downloaded + + Returns: + target_filepath (str): filepath of the downloaded file + """ s3 = boto3.client("s3") url_parse = urlparse(file_uri) - print("url_parsed: ", url_parse) bucket = url_parse.netloc path = url_parse.path[1:] filename = url_parse.path.split("/")[-1] - print(bucket, path, filename) target_filepath = os.path.join("/tmp", filename) s3.download_file(bucket, path, 
target_filepath) - print(f"downloaded {target_filepath}") - s3.close() return target_filepath @@ -77,7 +81,7 @@ def load_to_featuresdb( con_secrets = get_secret(secret_name) connection = get_connection_string(con_secrets) - print(f"running ogr2ogr import for collection: {collection}") + print(f"running ogr2ogr import for collection/file: {collection}") options = [ "ogr2ogr", "-f", @@ -101,7 +105,6 @@ def load_to_featuresdb( check=False, capture_output=True, ) - print("db connection ", options) if out.stderr: error_description = f"Error: {out.stderr}" @@ -110,13 +113,11 @@ def load_to_featuresdb( return {"status": "success"} - - def handler(event, context): - print("Generic Vector ingest started") + print("------Vector ingestion for Features API started------") parser = ArgumentParser( - prog="generic_vector_ingest", - description="Ingest Vector- Generic", + prog="vector_ingest", + description="Ingest Vector", epilog="Running the code as ECS task", ) parser.add_argument( @@ -124,48 +125,49 @@ def handler(event, context): ) args = parser.parse_args() + # Extracting the payload passed from upstream task/dag or conf payload_event = ast.literal_eval(args.payload) - print("*********** payload", payload_event) s3_event = payload_event.pop("payload") + # Extracting configs for ingestion x_possible = payload_event["x_possible"] y_possible = payload_event["y_possible"] source_projection = payload_event["source_projection"] target_projection = payload_event["target_projection"] extra_flags = payload_event["extra_flags"] - # extract the actual link of the json file and read + # Read the json to extract the discovered file paths with smart_open.open(s3_event, "r") as _file: s3_event_read = _file.read() - print("file read done") + event_received = json.loads(s3_event_read) s3_objects = event_received["objects"] status = list() + + for s3_object in s3_objects: - href = s3_object["assets"]["default"]["href"] #s3://ghgc-data-store-develop/transformed_csv/NIST_Urban_Testbed/NEB-ch4.csv + href = s3_object["assets"]["default"]["href"] + #collection = s3_object["collection"] - #collection = href.split("/")[-1].split(".")[0] - # or it could be - collection = href.split("/")[-2] + '_test_' + href.split("/")[-1].split(".")[0] + collection = href.split("/")[-2] + href.split("/")[-1].split(".")[0] downloaded_filepath = download_file(href) - print("-----------------------------------------------------\n") - print(f"[ DOWNLOAD FILEPATH ]: {downloaded_filepath}") - print(f"[ COLLECTION ]: {collection}") + print(f"[ COLLECTION ]: {collection}, [ DOWNLOAD FILEPATH ]: {downloaded_filepath}") + coll_status = load_to_featuresdb(downloaded_filepath, collection, x_possible, y_possible, source_projection, target_projection, extra_flags) status.append(coll_status) - # delete file after ingest + # Delete file after ingest os.remove(downloaded_filepath) if coll_status["status"] != "success": - # bubble exception so Airflow shows it as a failure + # Bubble exception so Airflow shows it as a failure raise Exception(coll_status["reason"]) - print("\n **********Overall Status*********\n", f"Done for {len(status)} csv files",status) + print("------Overall Status------\n", f"Done for {len(status)} discovered files\n",status) if __name__ == "__main__": From 86884e8456188e72e2d937d535b179363da55e21 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Thu, 8 Aug 2024 21:46:32 -0500 Subject: [PATCH 41/97] Refactor layer naming and add IAM role assumption --- docker_tasks/vector_ingest/handler.py | 53 +++++++++++++++++++-------- 1 file 
changed, 38 insertions(+), 15 deletions(-) diff --git a/docker_tasks/vector_ingest/handler.py b/docker_tasks/vector_ingest/handler.py index fcc91fc2..e0ee3daa 100644 --- a/docker_tasks/vector_ingest/handler.py +++ b/docker_tasks/vector_ingest/handler.py @@ -17,7 +17,10 @@ def download_file(file_uri: str): Returns: target_filepath (str): filepath of the downloaded file """ - s3 = boto3.client("s3") + role_arn = os.environ.get("EXTERNAL_ROLE_ARN") + kwargs = assume_role(role_arn=role_arn) if role_arn else {} + + s3 = boto3.client("s3", **kwargs) url_parse = urlparse(file_uri) bucket = url_parse.netloc @@ -30,6 +33,28 @@ def download_file(file_uri: str): s3.close() return target_filepath +def assume_role(role_arn, session_name="veda-data-pipelines_vector-ingest"): + """Assumes an AWS IAM role and returns temporary credentials. + + Args: + role_arn (str): The ARN of the role to assume. + session_name (str): A name for the assumed session. + + Returns: + dict: Temporary AWS credentials. + """ + sts = boto3.client("sts") + credentials = sts.assume_role( + RoleArn=role_arn, + RoleSessionName=session_name, + ) + creds = credentials["Credentials"] + return { + "aws_access_key_id": creds["AccessKeyId"], + "aws_secret_access_key": creds.get("SecretAccessKey"), + "aws_session_token": creds.get("SessionToken"), + } + def get_connection_string(secret: dict, as_uri: bool = False) -> str: if as_uri: @@ -70,7 +95,7 @@ def get_secret(secret_name: str) -> None: def load_to_featuresdb( filename: str, - collection: str, + layer_name: str, x_possible: str = "longitude", y_possible: str = "latitude", source_projection : str ="EPSG:4326", @@ -81,7 +106,7 @@ def load_to_featuresdb( con_secrets = get_secret(secret_name) connection = get_connection_string(con_secrets) - print(f"running ogr2ogr import for collection/file: {collection}") + print(f"running ogr2ogr import for collection/file: {layer_name}") options = [ "ogr2ogr", "-f", @@ -93,7 +118,7 @@ def load_to_featuresdb( "-oo", f"Y_POSSIBLE_NAMES={y_possible}", "-nln", - collection, # Or could be the actual filename + layer_name, "-s_srs", source_projection, "-t_srs", @@ -136,6 +161,8 @@ def handler(event, context): target_projection = payload_event["target_projection"] extra_flags = payload_event["extra_flags"] + layer_name = payload_event["collection"] + # Read the json to extract the discovered file paths with smart_open.open(s3_event, "r") as _file: s3_event_read = _file.read() @@ -144,17 +171,18 @@ def handler(event, context): s3_objects = event_received["objects"] status = list() - for s3_object in s3_objects: href = s3_object["assets"]["default"]["href"] + filename = href.split("/")[-1].split(".")[0] - #collection = s3_object["collection"] - collection = href.split("/")[-2] + href.split("/")[-1].split(".")[0] + # Use id template when collection is not provided in the conf + if layer_name == "": + layer_name = payload_event["id_template"].format(filename) downloaded_filepath = download_file(href) - print(f"[ COLLECTION ]: {collection}, [ DOWNLOAD FILEPATH ]: {downloaded_filepath}") + print(f"[ COLLECTION ]: {layer_name}, [ DOWNLOAD FILEPATH ]: {downloaded_filepath}") - coll_status = load_to_featuresdb(downloaded_filepath, collection, + coll_status = load_to_featuresdb(downloaded_filepath, layer_name, x_possible, y_possible, source_projection, target_projection, extra_flags) @@ -171,9 +199,4 @@ def handler(event, context): if __name__ == "__main__": - # It has nothing to do - sample_event = { - "collection": "eis_fire_newfirepix_2", - "href": 
"s3://covid-eo-data/fireline/newfirepix.fgb", - } - handler(sample_event, {}) + handler({}, {}) From 5624a5ae2c495bc61f9d7e103c98bd569e850d74 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Mon, 12 Aug 2024 09:49:40 -0500 Subject: [PATCH 42/97] Add generic vector ingest --- docker_tasks/generic_vector_ingest/Dockerfile | 10 + docker_tasks/generic_vector_ingest/handler.py | 202 ++++++++++++++++++ .../generic_vector_ingest/requirements.txt | 7 + 3 files changed, 219 insertions(+) create mode 100644 docker_tasks/generic_vector_ingest/Dockerfile create mode 100644 docker_tasks/generic_vector_ingest/handler.py create mode 100644 docker_tasks/generic_vector_ingest/requirements.txt diff --git a/docker_tasks/generic_vector_ingest/Dockerfile b/docker_tasks/generic_vector_ingest/Dockerfile new file mode 100644 index 00000000..546e6a83 --- /dev/null +++ b/docker_tasks/generic_vector_ingest/Dockerfile @@ -0,0 +1,10 @@ +FROM --platform=linux/amd64 ghcr.io/lambgeo/lambda-gdal:3.6-python3.9 +RUN yum update -y + +WORKDIR /app +ENTRYPOINT [] +RUN pip install --upgrade pip +COPY requirements.txt requirements.txt +RUN pip install -r requirements.txt + +COPY handler.py handler.py diff --git a/docker_tasks/generic_vector_ingest/handler.py b/docker_tasks/generic_vector_ingest/handler.py new file mode 100644 index 00000000..e0ee3daa --- /dev/null +++ b/docker_tasks/generic_vector_ingest/handler.py @@ -0,0 +1,202 @@ +import base64 +from argparse import ArgumentParser +import boto3 +import os +import ast +import subprocess +import json +import smart_open +from urllib.parse import urlparse + +def download_file(file_uri: str): + """Downloads file from s3 + + Args: + file_uri (str): s3 URL of the file to be downloaded + + Returns: + target_filepath (str): filepath of the downloaded file + """ + role_arn = os.environ.get("EXTERNAL_ROLE_ARN") + kwargs = assume_role(role_arn=role_arn) if role_arn else {} + + s3 = boto3.client("s3", **kwargs) + url_parse = urlparse(file_uri) + + bucket = url_parse.netloc + path = url_parse.path[1:] + filename = url_parse.path.split("/")[-1] + target_filepath = os.path.join("/tmp", filename) + + s3.download_file(bucket, path, target_filepath) + + s3.close() + return target_filepath + +def assume_role(role_arn, session_name="veda-data-pipelines_vector-ingest"): + """Assumes an AWS IAM role and returns temporary credentials. + + Args: + role_arn (str): The ARN of the role to assume. + session_name (str): A name for the assumed session. + + Returns: + dict: Temporary AWS credentials. 
+ """ + sts = boto3.client("sts") + credentials = sts.assume_role( + RoleArn=role_arn, + RoleSessionName=session_name, + ) + creds = credentials["Credentials"] + return { + "aws_access_key_id": creds["AccessKeyId"], + "aws_secret_access_key": creds.get("SecretAccessKey"), + "aws_session_token": creds.get("SessionToken"), + } + + +def get_connection_string(secret: dict, as_uri: bool = False) -> str: + if as_uri: + return f"postgresql://{secret['username']}:{secret['password']}@{secret['host']}:5432/{secret['dbname']}" + else: + #return f"PG:host=localhost port=5432 dbname=postgis user=username password=password" + return f"PG:host={secret['host']} dbname={secret['dbname']} user={secret['username']} password={secret['password']}" + + +def get_secret(secret_name: str) -> None: + """Retrieve secrets from AWS Secrets Manager + + Args: + secret_name (str): name of aws secrets manager secret containing database connection secrets + + Returns: + secrets (dict): decrypted secrets in dict + """ + + # Create a Secrets Manager client + session = boto3.session.Session(region_name=os.environ.get("AWS_REGION")) + client = session.client(service_name="secretsmanager") + + # In this sample we only handle the specific exceptions for the 'GetSecretValue' API. + # See https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html + # We rethrow the exception by default. + + get_secret_value_response = client.get_secret_value(SecretId=secret_name) + + # Decrypts secret using the associated KMS key. + # Depending on whether the secret is a string or binary, one of these fields will be populated. + if "SecretString" in get_secret_value_response: + return json.loads(get_secret_value_response["SecretString"]) + else: + return json.loads(base64.b64decode(get_secret_value_response["SecretBinary"])) + + + +def load_to_featuresdb( + filename: str, + layer_name: str, + x_possible: str = "longitude", + y_possible: str = "latitude", + source_projection : str ="EPSG:4326", + target_projection : str ="EPSG:4326", + extra_flags: list = ["-overwrite", "-progress"] +): + secret_name = os.environ.get("VECTOR_SECRET_NAME") + con_secrets = get_secret(secret_name) + connection = get_connection_string(con_secrets) + + print(f"running ogr2ogr import for collection/file: {layer_name}") + options = [ + "ogr2ogr", + "-f", + "PostgreSQL", + connection, + filename, + "-oo", + f"X_POSSIBLE_NAMES={x_possible}", + "-oo", + f"Y_POSSIBLE_NAMES={y_possible}", + "-nln", + layer_name, + "-s_srs", + source_projection, + "-t_srs", + target_projection, + *extra_flags + ] + out = subprocess.run( + options, + check=False, + capture_output=True, + ) + + if out.stderr: + error_description = f"Error: {out.stderr}" + print(error_description) + return {"status": "failure", "reason": error_description} + + return {"status": "success"} + +def handler(event, context): + print("------Vector ingestion for Features API started------") + parser = ArgumentParser( + prog="vector_ingest", + description="Ingest Vector", + epilog="Running the code as ECS task", + ) + parser.add_argument( + "--payload", dest="payload", help="event passed to stac_handler function" + ) + args = parser.parse_args() + + # Extracting the payload passed from upstream task/dag or conf + payload_event = ast.literal_eval(args.payload) + s3_event = payload_event.pop("payload") + + # Extracting configs for ingestion + x_possible = payload_event["x_possible"] + y_possible = payload_event["y_possible"] + source_projection = payload_event["source_projection"] + 
target_projection = payload_event["target_projection"] + extra_flags = payload_event["extra_flags"] + + layer_name = payload_event["collection"] + + # Read the json to extract the discovered file paths + with smart_open.open(s3_event, "r") as _file: + s3_event_read = _file.read() + + event_received = json.loads(s3_event_read) + s3_objects = event_received["objects"] + status = list() + + for s3_object in s3_objects: + href = s3_object["assets"]["default"]["href"] + filename = href.split("/")[-1].split(".")[0] + + # Use id template when collection is not provided in the conf + if layer_name == "": + layer_name = payload_event["id_template"].format(filename) + + downloaded_filepath = download_file(href) + print(f"[ COLLECTION ]: {layer_name}, [ DOWNLOAD FILEPATH ]: {downloaded_filepath}") + + coll_status = load_to_featuresdb(downloaded_filepath, layer_name, + x_possible, y_possible, + source_projection, target_projection, + extra_flags) + status.append(coll_status) + + # Delete file after ingest + os.remove(downloaded_filepath) + + if coll_status["status"] != "success": + # Bubble exception so Airflow shows it as a failure + raise Exception(coll_status["reason"]) + + print("------Overall Status------\n", f"Done for {len(status)} discovered files\n",status) + + +if __name__ == "__main__": + handler({}, {}) diff --git a/docker_tasks/generic_vector_ingest/requirements.txt b/docker_tasks/generic_vector_ingest/requirements.txt new file mode 100644 index 00000000..38263eed --- /dev/null +++ b/docker_tasks/generic_vector_ingest/requirements.txt @@ -0,0 +1,7 @@ +smart-open==6.3.0 +psycopg2-binary==2.9.9 +requests==2.30.0 +boto3==1.26.129 +GeoAlchemy2==0.14.2 +geopandas==0.14.0 +SQLAlchemy==2.0.23 \ No newline at end of file From 66921c530e2152a0346487e51089abfad7b811db Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Mon, 12 Aug 2024 09:50:46 -0500 Subject: [PATCH 43/97] revert back changes --- docker_tasks/vector_ingest/handler.py | 351 ++++++++++++++++++++------ 1 file changed, 269 insertions(+), 82 deletions(-) diff --git a/docker_tasks/vector_ingest/handler.py b/docker_tasks/vector_ingest/handler.py index e98435f5..0938d8ce 100644 --- a/docker_tasks/vector_ingest/handler.py +++ b/docker_tasks/vector_ingest/handler.py @@ -7,20 +7,29 @@ import json import smart_open from urllib.parse import urlparse +import psycopg2 +import geopandas as gpd +from shapely import wkb +from geoalchemy2 import Geometry +import sqlalchemy +from sqlalchemy import create_engine, MetaData, Table, Column, inspect +import concurrent.futures +from sqlalchemy.dialects.postgresql import DOUBLE_PRECISION, INTEGER, VARCHAR, TIMESTAMP -def download_file(file_uri: str): - """Downloads file from s3 - - Args: - file_uri (str): s3 URL of the file to be downloaded - Returns: - target_filepath (str): filepath of the downloaded file - """ - role_arn = os.environ.get("EXTERNAL_ROLE_ARN") - kwargs = assume_role(role_arn=role_arn) if role_arn else {} +def download_file(file_uri: str): + sts = boto3.client("sts") + response = sts.assume_role( + RoleArn=os.environ.get("EXTERNAL_ROLE_ARN"), + RoleSessionName="sts-assume-114506680961", + ) + new_session = boto3.Session( + aws_access_key_id=response["Credentials"]["AccessKeyId"], + aws_secret_access_key=response["Credentials"]["SecretAccessKey"], + aws_session_token=response["Credentials"]["SessionToken"], + ) + s3 = new_session.client("s3") - s3 = boto3.client("s3", **kwargs) url_parse = urlparse(file_uri) bucket = url_parse.netloc @@ -30,40 +39,164 @@ def download_file(file_uri: str): 
s3.download_file(bucket, path, target_filepath) - s3.close() - return target_filepath - -def assume_role(role_arn, session_name="veda-data-pipelines_vector-ingest"): - """Assumes an AWS IAM role and returns temporary credentials. - - Args: - role_arn (str): The ARN of the role to assume. - session_name (str): A name for the assumed session. + print(f"downloaded {target_filepath}") - Returns: - dict: Temporary AWS credentials. - """ - sts = boto3.client("sts") - credentials = sts.assume_role( - RoleArn=role_arn, - RoleSessionName=session_name, - ) - creds = credentials["Credentials"] - return { - "aws_access_key_id": creds["AccessKeyId"], - "aws_secret_access_key": creds.get("SecretAccessKey"), - "aws_session_token": creds.get("SessionToken"), - } + sts.close() + return target_filepath def get_connection_string(secret: dict, as_uri: bool = False) -> str: if as_uri: return f"postgresql://{secret['username']}:{secret['password']}@{secret['host']}:5432/{secret['dbname']}" else: - #return f"PG:host=localhost port=5432 dbname=postgis user=username password=password" return f"PG:host={secret['host']} dbname={secret['dbname']} user={secret['username']} password={secret['password']}" +def get_gdf_schema(gdf, target_projection): + """map GeoDataFrame columns into a table schema + + :param gdf: GeoDataFrame from geopandas + :param target_projection: srid for the target table geometry column + :return: + """ + # map geodatafrome dtypes to sqlalchemy types + dtype_map = { + "int64": INTEGER, + "float64": DOUBLE_PRECISION, + "object": VARCHAR, + "datetime64": TIMESTAMP, + } + schema = [] + for column, dtype in zip(gdf.columns, gdf.dtypes): + if str(dtype) == "geometry": + # do not inpsect to retrieve geom type, just use generic GEOMETRY + # geom_type = str(gdf[column].geom_type.unique()[0]).upper() + geom_type = str(dtype).upper() + # do not taKe SRID from existing file for target table + # we always want to transform from file EPSG to Table EPSG() + column_type = Geometry(geometry_type=geom_type, srid=target_projection) + else: + dtype_str = str(dtype) + column_type = dtype_map.get(dtype_str.split("[")[0], VARCHAR) + + if column == "primarykey": + schema.append(Column(column.lower(), column_type, unique=True)) + else: + schema.append(Column(column.lower(), column_type)) + return schema + + +def ensure_table_exists( + db_metadata: MetaData, gpkg_file: str, target_projection: int, table_name: str +): + """create a table if it doesn't exist or just + validate GeoDataFrame columns against existing table + + :param db_metadata: instance of sqlalchemy.MetaData + :param gpkg_file: file path to GPKG + :param target_projection: srid for target DB table geometry column + :param table_name: name of table to create + :return: None + """ + gdf = gpd.read_file(gpkg_file) + gdf_schema = get_gdf_schema(gdf, target_projection) + engine = db_metadata.bind + try: + Table(table_name, db_metadata, autoload_with=engine) + except sqlalchemy.exc.NoSuchTableError: + Table(table_name, db_metadata, *gdf_schema) + db_metadata.create_all(engine) + + # validate gdf schema against existing table schema + insp = inspect(engine) + existing_columns = insp.get_columns(table_name) + existing_column_names = [col["name"] for col in existing_columns] + for column in gdf_schema: + if column.name not in existing_column_names: + raise ValueError( + f"your .gpkg seems to have a column={column.name} that does not exist in the existing table columns={existing_column_names}" + ) + + +def delete_region( + engine, + gpkg_path: str, + table_name: 
str, +): + """delete all existing records by region name""" + gdf = gpd.read_file(gpkg_path) + region_name = gdf["region"].iloc[0] + with engine.connect() as conn: + with conn.begin(): + delete_sql = sqlalchemy.text( + f""" + DELETE FROM {table_name} WHERE region='{region_name}' + """ + ) + conn.execute(delete_sql) + + +def upsert_to_postgis( + engine, + gpkg_path: str, + target_projection: int, + table_name: str, + batch_size: int = 10000, +): + """batch the GPKG file and upsert via threads + + :param engine: instance of sqlalchemy.Engine + :param gpkg_path: file path to GPKG + :param table_name: name of the target table + :param batch_size: upper limit of batch size + :return: + """ + gdf = gpd.read_file(gpkg_path) + source_epsg_code = gdf.crs.to_epsg() + if not source_epsg_code: + # assume NAD27 Equal Area for now :shrug: + # since that's what the default is for Fire Atlas team exports + # that's what PROJ4 does under the hood for 9311 :wethinksmirk: + source_epsg_code = 2163 + + # convert the `t` column to something suitable for sql insertion otherwise we get 'Timestamp()' + gdf["t"] = gdf["t"].dt.strftime("%Y-%m-%d %H:%M:%S") + # convert to WKB + gdf["geometry"] = gdf["geometry"].apply(lambda geom: wkb.dumps(geom, hex=True)) + + def upsert_batch(batch): + with engine.connect() as conn: + with conn.begin(): + for row in batch.to_dict(orient="records"): + # make sure all column names are lower case for keys and values + row = {k.lower(): v for k, v in row.items()} + columns = [col.lower() for col in batch.columns] + + non_geom_placeholders = ", ".join( + [f":{col}" for col in columns[:-1]] + ) + # NOTE: we need to escape `::geometry` so parameterized statements don't try to replace it + # because parametrized statements in sqlalchemy are `:` + geom_placeholder = f"ST_Transform(ST_SetSRID(ST_GeomFromWKB(:geometry\:\:geometry), {source_epsg_code}), {target_projection})" # noqa: W605 + upsert_sql = sqlalchemy.text( + f""" + INSERT INTO {table_name} ({', '.join([col for col in columns])}) + VALUES ({non_geom_placeholders},{geom_placeholder}) + ON CONFLICT (primarykey) + DO UPDATE SET {', '.join(f"{col}=EXCLUDED.{col}" for col in columns if col != 'primarykey')} + """ + ) + + # logging.debug(f"[ UPSERT SQL ]:\n{str(upsert_sql)}") + conn.execute(upsert_sql, row) + + batches = [gdf.iloc[i : i + batch_size] for i in range(0, len(gdf), batch_size)] + # set `max_workers` to something below max concurrent connections for postgresql + # https://www.postgresql.org/docs/14/runtime-config-connection.html + with concurrent.futures.ThreadPoolExecutor(max_workers=75) as executor: + executor.map(upsert_batch, batches) + + def get_secret(secret_name: str) -> None: """Retrieve secrets from AWS Secrets Manager @@ -92,38 +225,32 @@ def get_secret(secret_name: str) -> None: return json.loads(base64.b64decode(get_secret_value_response["SecretBinary"])) - def load_to_featuresdb( filename: str, - layer_name: str, - x_possible: str = "longitude", - y_possible: str = "latitude", - source_projection : str ="EPSG:4326", - target_projection : str ="EPSG:4326", - extra_flags: list = ["-overwrite", "-progress"] + collection: str, + extra_flags: list = None, + target_projection: str = "EPSG:4326", ): + if extra_flags is None: + extra_flags = ["-overwrite", "-progress"] + secret_name = os.environ.get("VECTOR_SECRET_NAME") + con_secrets = get_secret(secret_name) connection = get_connection_string(con_secrets) - print(f"running ogr2ogr import for collection/file: {layer_name}") + print(f"running ogr2ogr import for 
collection: {collection}") options = [ "ogr2ogr", "-f", "PostgreSQL", connection, - filename, - "-oo", - f"X_POSSIBLE_NAMES={x_possible}", - "-oo", - f"Y_POSSIBLE_NAMES={y_possible}", - "-nln", - layer_name, - "-s_srs", - source_projection, "-t_srs", target_projection, - *extra_flags + filename, + "-nln", + collection, + *extra_flags, ] out = subprocess.run( options, @@ -138,8 +265,79 @@ def load_to_featuresdb( return {"status": "success"} -def handler(event, context): - print("------Vector ingestion for Features API started------") + +def load_to_featuresdb_eis( + filename: str, + collection: str, + target_projection: int = 4326, +): + """create table if not exists and upload GPKG + + :param filename: the file path to the downloaded GPKG + :param collection: the name of the collection + :param target_projection: srid for the target table + :return: None + """ + secret_name = os.environ.get("VECTOR_SECRET_NAME") + conn_secrets = get_secret(secret_name) + connection_string = get_connection_string(conn_secrets, as_uri=True) + + # NOTE: about `collection.rsplit` below: + # + # EIS Fire team naming convention for outputs + # Snapshots: "snapshot_{layer_name}_nrt_{region_name}.gpkg" + # Lf_archive: "lf_{layer_name}_archive_{region_name}.gpkg" + # Lf_nrt: "lf_{layer_name}_nrt_{region_name}.gpkg" + # + # Insert/Alter on table call everything except the region name: + # e.g. `snapshot_perimeter_nrt_conus` this gets inserted into the table `eis_fire_snapshot_perimeter_nrt` + collection = collection.rsplit("_", 1)[0] + target_table_name = f"eis_fire_{collection}" + + engine = create_engine(connection_string) + metadata = MetaData() + metadata.bind = engine + + ensure_table_exists(metadata, filename, target_projection, target_table_name) + delete_region(engine, filename, target_table_name) + upsert_to_postgis(engine, filename, target_projection, target_table_name) + return {"status": "success"} + + +def alter_datetime_add_indexes_eis(collection: str): + # NOTE: about `collection.rsplit` below: + # + # EIS Fire team naming convention for outputs + # Snapshots: "snapshot_{layer_name}_nrt_{region_name}.gpkg" + # Lf_archive: "lf_{layer_name}_archive_{region_name}.gpkg" + # Lf_nrt: "lf_{layer_name}_nrt_{region_name}.gpkg" + # + # Insert/Alter on table call everything except the region name: + # e.g. 
`snapshot_perimeter_nrt_conus` this gets inserted into the table `eis_fire_snapshot_perimeter_nrt` + collection = collection.rsplit("_", 1)[0] + + secret_name = os.environ.get("VECTOR_SECRET_NAME") + conn_secrets = get_secret(secret_name) + conn = psycopg2.connect( + host=conn_secrets["host"], + dbname=conn_secrets["dbname"], + user=conn_secrets["username"], + password=conn_secrets["password"], + ) + + cur = conn.cursor() + cur.execute( + f"ALTER table eis_fire_{collection} " + f"ALTER COLUMN t TYPE TIMESTAMP USING t::timestamp without time zone; " + f"CREATE INDEX IF NOT EXISTS idx_eis_fire_{collection}_datetime ON eis_fire_{collection}(t);" + f"CREATE INDEX IF NOT EXISTS idx_eis_fire_{collection}_primarykey ON eis_fire_{collection}(primarykey);" + f"CREATE INDEX IF NOT EXISTS idx_eis_fire_{collection}_region ON eis_fire_{collection}(region);" + ) + conn.commit() + + +def handler(): + print("Vector ingest started") parser = ArgumentParser( prog="vector_ingest", description="Ingest Vector", @@ -150,48 +348,37 @@ def handler(event, context): ) args = parser.parse_args() - # Extracting the payload passed from upstream task/dag or conf payload_event = ast.literal_eval(args.payload) s3_event = payload_event.pop("payload") - - # Extracting configs for ingestion - x_possible = payload_event["x_possible"] - y_possible = payload_event["y_possible"] - source_projection = payload_event["source_projection"] - target_projection = payload_event["target_projection"] - extra_flags = payload_event["extra_flags"] - - layer_name = payload_event["collection"] - - # Read the json to extract the discovered file paths with smart_open.open(s3_event, "r") as _file: s3_event_read = _file.read() - event_received = json.loads(s3_event_read) s3_objects = event_received["objects"] status = list() - for s3_object in s3_objects: href = s3_object["assets"]["default"]["href"] collection = s3_object["collection"] downloaded_filepath = download_file(href) - print(f"[ COLLECTION ]: {layer_name}, [ DOWNLOAD FILEPATH ]: {downloaded_filepath}") - - coll_status = load_to_featuresdb(downloaded_filepath, layer_name, - x_possible, y_possible, - source_projection, target_projection, - extra_flags) - status.append(coll_status) + print(f"[ DOWNLOAD FILEPATH ]: {downloaded_filepath}") + print(f"[ COLLECTION ]: {collection}") - # Delete file after ingest + s3_object_prefix = event_received["prefix"] + if s3_object_prefix.startswith("EIS/"): + coll_status = load_to_featuresdb_eis(downloaded_filepath, collection) + else: + coll_status = load_to_featuresdb(downloaded_filepath, collection) + + status.append(coll_status) + # delete file after ingest os.remove(downloaded_filepath) - if coll_status["status"] != "success": - # Bubble exception so Airflow shows it as a failure + if coll_status["status"] == "success" and s3_object_prefix.startswith("EIS/"): + alter_datetime_add_indexes_eis(collection) + elif coll_status["status"] != "success": + # bubble exception so Airflow shows it as a failure raise Exception(coll_status["reason"]) - - print("------Overall Status------\n", f"Done for {len(status)} discovered files\n",status) + print(status) if __name__ == "__main__": - handler({}, {}) + handler() \ No newline at end of file From a9665f9c13d95a3501854ba57e0dbd648be86b41 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Mon, 12 Aug 2024 15:59:55 -0500 Subject: [PATCH 44/97] Add generic vector ingest ecs --- infrastructure/main.tf | 6 ++++++ infrastructure/task_definition.tf | 29 +++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) 
diff --git a/infrastructure/main.tf b/infrastructure/main.tf index 116e7e8a..a8e49d3f 100644 --- a/infrastructure/main.tf +++ b/infrastructure/main.tf @@ -32,6 +32,12 @@ module "mwaa" { docker_file_path = "${path.module}/../docker_tasks/vector_ingest/Dockerfile" ecs_container_folder_path = "${path.module}/../docker_tasks/vector_ingest" ecr_repo_name = "${var.prefix}-veda-vector_ingest" + }, + { + handler_file_path = "${path.module}/../docker_tasks/generic_vector_ingest/handler.py" + docker_file_path = "${path.module}/../docker_tasks/generic_vector_ingest/Dockerfile" + ecs_container_folder_path = "${path.module}/../docker_tasks/generic_vector_ingest" + ecr_repo_name = "${var.prefix}-veda-generic_vector_ingest" } ] } diff --git a/infrastructure/task_definition.tf b/infrastructure/task_definition.tf index 12d47615..46895217 100644 --- a/infrastructure/task_definition.tf +++ b/infrastructure/task_definition.tf @@ -57,6 +57,35 @@ resource "aws_ecs_task_definition" "veda_vector_task_definition" { memory = var.ecs_task_memory } +resource "aws_ecs_task_definition" "veda_generic_vector_task_definition" { + + + container_definitions = jsonencode([ + + { + name = "${var.prefix}-veda-generic_vector_ingest" + image = "${local.account_id}.dkr.ecr.${local.aws_region}.amazonaws.com/${var.prefix}-veda-generic_vector_ingest" + essential = true, + logConfiguration = { + "logDriver" : "awslogs", + "options" : { + "awslogs-group" : module.mwaa.log_group_name, + "awslogs-region" : local.aws_region, + "awslogs-stream-prefix" : "ecs" + } + } + } + + ]) + family = "${var.prefix}-vector-tasks" + requires_compatibilities = ["FARGATE"] + network_mode = "awsvpc" + execution_role_arn = module.mwaa.mwaa_role_arn + task_role_arn = module.mwaa.mwaa_role_arn + cpu = var.ecs_task_cpu + memory = var.ecs_task_memory +} + resource "aws_ecs_task_definition" "veda_transfer_task_definition" { From c685426c25d683e337d68248733bdd029cb9fed1 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Mon, 12 Aug 2024 16:00:43 -0500 Subject: [PATCH 45/97] Add dag for generic vector ingest --- .../veda_process_generic_vector_pipeline.py | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 dags/veda_data_pipeline/veda_process_generic_vector_pipeline.py diff --git a/dags/veda_data_pipeline/veda_process_generic_vector_pipeline.py b/dags/veda_data_pipeline/veda_process_generic_vector_pipeline.py new file mode 100644 index 00000000..aac9a151 --- /dev/null +++ b/dags/veda_data_pipeline/veda_process_generic_vector_pipeline.py @@ -0,0 +1,108 @@ +import pendulum +from airflow import DAG +from airflow.models.variable import Variable +from airflow.providers.amazon.aws.operators.ecs import EcsRunTaskOperator +from airflow.operators.dummy_operator import DummyOperator +from airflow.utils.trigger_rule import TriggerRule + +from datetime import timedelta + +dag_doc_md = """ +### Build and submit stac +#### Purpose +This DAG is supposed to be triggered by `veda_discover`. But you still can trigger this DAG manually or through an API + +#### Notes +- This DAG can run with the following configuration
+```json +{ + "collection": "geoglam", + "prefix": "geoglam/", + "bucket": "veda-data-store-staging", + "filename_regex": "^(.*).tif$", + "discovery": "s3", + "datetime_range": "month", + "upload": false, + "cogify": false, + "discovered": 33, + "payload": "s3://veda-uah-sit-mwaa-853558080719/events/geoglam/s3_discover_output_6c46b57a-7474-41fe-977a-19d164531cdc.json" +} +``` +- [Supports linking to external content](https://github.com/NASA-IMPACT/veda-data-pipelines) +""" + +templat_dag_run_conf = { + "collection": "", + "prefix": "/", + "bucket": "", + "filename_regex": "", + "discovery": "|cmr", + "datetime_range": "|", + "upload": " | true", + "cogify": "false | true", + "payload": "> generic_ingest_vector >> end From 487fe15e62eae74aa52d3c48adf99ca704657d0a Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Mon, 12 Aug 2024 16:01:14 -0500 Subject: [PATCH 46/97] Modify to take generic vector ingest --- dags/veda_data_pipeline/groups/discover_group.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/veda_data_pipeline/groups/discover_group.py b/dags/veda_data_pipeline/groups/discover_group.py index 38b754fb..8134a0f3 100644 --- a/dags/veda_data_pipeline/groups/discover_group.py +++ b/dags/veda_data_pipeline/groups/discover_group.py @@ -97,7 +97,7 @@ def subdag_discover(event={}): run_process_vector = TriggerMultiDagRunOperator( task_id="parallel_run_process_vectors", - trigger_dag_id="veda_ingest_vector", + trigger_dag_id="veda_generic_ingest_vector", python_callable=get_files_to_process, ) From 2b6ef7ee1690d9be2242e18ec9b958872c0e580d Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Tue, 13 Aug 2024 09:15:32 -0500 Subject: [PATCH 47/97] modify family in task definition --- infrastructure/task_definition.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infrastructure/task_definition.tf b/infrastructure/task_definition.tf index 46895217..14a7a6ad 100644 --- a/infrastructure/task_definition.tf +++ b/infrastructure/task_definition.tf @@ -77,7 +77,7 @@ resource "aws_ecs_task_definition" "veda_generic_vector_task_definition" { } ]) - family = "${var.prefix}-vector-tasks" + family = "${var.prefix}-generic-vector-tasks" requires_compatibilities = ["FARGATE"] network_mode = "awsvpc" execution_role_arn = module.mwaa.mwaa_role_arn From 9ff8137e66bd964bab27c55a10cf6ab244e2134a Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Tue, 13 Aug 2024 12:05:53 -0500 Subject: [PATCH 48/97] Create a dir to deploy --- .../actions/terraform-deploy-sm2a/action.yml | 25 +++++++------------ .github/workflows/cicd.yml | 1 + 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/.github/actions/terraform-deploy-sm2a/action.yml b/.github/actions/terraform-deploy-sm2a/action.yml index d3ad6426..1cf4aa02 100644 --- a/.github/actions/terraform-deploy-sm2a/action.yml +++ b/.github/actions/terraform-deploy-sm2a/action.yml @@ -1,28 +1,21 @@ name: Deploy SM2A inputs: - environment: - type: string + env_aws_secret_name: required: true - aws-region: type: string - required: false - default: us-west-2 env-file: type: string - required: true - role-session-name: + default: ".env" + dir: required: false type: string - default: github-actions-deployment - env_aws_secret_name: + default: "." 
+ script_path: type: string - required: true - sm2a_dir: - required: false + backend_stack_name: type: string - default: "./sm2a" - script_path: + auth_stack_name: type: string @@ -43,7 +36,7 @@ runs: - name: Get relevant environment configuration from aws secrets shell: bash - working-directory: ${{ inputs.sm2a_dir }} + working-directory: ${{ inputs.dir }} env: AWS_DEFAULT_REGION: ${{ inputs.aws-region }} AWS_REGION: ${{ inputs.aws-region }} @@ -57,7 +50,7 @@ runs: - name: Deploy shell: bash - working-directory: ${{ inputs.sm2a_dir }} + working-directory: ${{ inputs.dir }} env: AWS_DEFAULT_REGION: ${{ inputs.aws-region }} AWS_REGION: ${{ inputs.aws-region }} diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index 2e47f7ae..088a11ec 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -81,6 +81,7 @@ jobs: if: ${{ vars.DEPLOY_SM2A }} = "true" uses: "./.github/actions/terraform-deploy-sm2a" with: + dir: ./sm2a env_aws_secret_name: ${{ vars.SM2A_ENVS_DEPLOYMENT_SECRET_NAME }} env-file: .env aws-region: us-west-2 From 7d5c84669bf6da5a1a41e2b3d816eb8ead3ab509 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Tue, 13 Aug 2024 12:08:12 -0500 Subject: [PATCH 49/97] Add deploy requirements --- .github/actions/terraform-deploy-sm2a/action.yml | 2 +- deploy_requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/actions/terraform-deploy-sm2a/action.yml b/.github/actions/terraform-deploy-sm2a/action.yml index 1cf4aa02..f633b2c0 100644 --- a/.github/actions/terraform-deploy-sm2a/action.yml +++ b/.github/actions/terraform-deploy-sm2a/action.yml @@ -32,7 +32,7 @@ runs: - name: Install python dependencies shell: bash working-directory: ${{ inputs.dir }} - run: pip install -r deploy_requirements.txt + run: pip install -r ../deploy_requirements.txt - name: Get relevant environment configuration from aws secrets shell: bash diff --git a/deploy_requirements.txt b/deploy_requirements.txt index b1f6c3bc..e0c90230 100644 --- a/deploy_requirements.txt +++ b/deploy_requirements.txt @@ -1,2 +1,2 @@ boto3==1.26.62 -requests==2.28.2 \ No newline at end of file +requests==2.28.2 From 8f81d40cdd5e4fd022986db841ba48267e224a04 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Tue, 13 Aug 2024 12:19:38 -0500 Subject: [PATCH 50/97] Add region to deployment --- .github/actions/terraform-deploy-sm2a/action.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/actions/terraform-deploy-sm2a/action.yml b/.github/actions/terraform-deploy-sm2a/action.yml index f633b2c0..d99034aa 100644 --- a/.github/actions/terraform-deploy-sm2a/action.yml +++ b/.github/actions/terraform-deploy-sm2a/action.yml @@ -11,6 +11,10 @@ inputs: required: false type: string default: "." + aws-region: + required: false + type: string + default: "us-west-2" script_path: type: string backend_stack_name: From 5484d646c69386b276479b1da99ad4283afdae50 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Tue, 13 Aug 2024 12:24:50 -0500 Subject: [PATCH 51/97] Add workflow api to github output --- .github/actions/terraform-deploy-sm2a/action.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/actions/terraform-deploy-sm2a/action.yml b/.github/actions/terraform-deploy-sm2a/action.yml index d99034aa..59f53ab9 100644 --- a/.github/actions/terraform-deploy-sm2a/action.yml +++ b/.github/actions/terraform-deploy-sm2a/action.yml @@ -62,3 +62,12 @@ runs: cp -r ../dags . 
./scripts/deploy.sh ${{ inputs.env-file }} <<< init ./scripts/deploy.sh ${{ inputs.env-file }} <<< deploy + + - name: Output workflows API endpoint + id: output_sm2a_workflows_endpoint + shell: bash + working-directory: ${{ inputs.dir }} + run: | + cd ./infrastructure + terraform output -json Airflow_url > ${HOME}/output_sm2a_workflows_endpoint.json + From d86bcd00083aeb850aedc000807e8bcce4830107 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Tue, 13 Aug 2024 14:03:26 -0500 Subject: [PATCH 52/97] Fix log group --- dags/veda_data_pipeline/veda_process_generic_vector_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/veda_data_pipeline/veda_process_generic_vector_pipeline.py b/dags/veda_data_pipeline/veda_process_generic_vector_pipeline.py index aac9a151..d88c28fe 100644 --- a/dags/veda_data_pipeline/veda_process_generic_vector_pipeline.py +++ b/dags/veda_data_pipeline/veda_process_generic_vector_pipeline.py @@ -102,7 +102,7 @@ }, }, awslogs_group=mwaa_stack_conf.get("LOG_GROUP_NAME"), - awslogs_stream_prefix=f"ecs/{mwaa_stack_conf.get('PREFIX')}-veda-vector_ingest", # prefix with container name + awslogs_stream_prefix=f"ecs/{mwaa_stack_conf.get('PREFIX')}-veda-generic-vector_ingest", # prefix with container name ) start >> generic_ingest_vector >> end From 2acf30af0b9a6a2c4071586cddedbfd81acd5905 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Tue, 13 Aug 2024 14:30:13 -0500 Subject: [PATCH 53/97] debug --- docker_tasks/generic_vector_ingest/handler.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker_tasks/generic_vector_ingest/handler.py b/docker_tasks/generic_vector_ingest/handler.py index e0ee3daa..99d7f604 100644 --- a/docker_tasks/generic_vector_ingest/handler.py +++ b/docker_tasks/generic_vector_ingest/handler.py @@ -103,8 +103,10 @@ def load_to_featuresdb( extra_flags: list = ["-overwrite", "-progress"] ): secret_name = os.environ.get("VECTOR_SECRET_NAME") + print(f"Secret name {secret_name}") con_secrets = get_secret(secret_name) connection = get_connection_string(con_secrets) + print(f"{connection=}") print(f"running ogr2ogr import for collection/file: {layer_name}") options = [ From 53c351162df103d4b9d81db4bafbc5186f4e73dd Mon Sep 17 00:00:00 2001 From: smohiudd Date: Tue, 13 Aug 2024 13:45:36 -0600 Subject: [PATCH 54/97] check length of assumr role arns --- infrastructure/main.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/infrastructure/main.tf b/infrastructure/main.tf index c7a39ff3..c938e099 100644 --- a/infrastructure/main.tf +++ b/infrastructure/main.tf @@ -95,8 +95,8 @@ resource "local_file" "mwaa_variables" { ecs_cluster_name = module.mwaa.cluster_name log_group_name = module.mwaa.log_group_name mwaa_execution_role_arn = module.mwaa.mwaa_role_arn - assume_role_read_arn = var.assume_role_arns[0] - assume_role_write_arn = var.assume_role_arns[0] + assume_role_read_arn = length(var.assume_role_arns) > 0 ? var.assume_role_arns[0] : "" + assume_role_write_arn = length(var.assume_role_arns) > 0 ? 
var.assume_role_arns[0] : "" account_id = local.account_id aws_region = local.aws_region cognito_app_secret = var.workflows_client_secret From 0f2dc1345bf87f0c64c08756cb17b39f3b4c2c38 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Tue, 13 Aug 2024 15:27:41 -0500 Subject: [PATCH 55/97] Add vector_ecs_conf --- .../veda_process_generic_vector_pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dags/veda_data_pipeline/veda_process_generic_vector_pipeline.py b/dags/veda_data_pipeline/veda_process_generic_vector_pipeline.py index d88c28fe..06ae4a9e 100644 --- a/dags/veda_data_pipeline/veda_process_generic_vector_pipeline.py +++ b/dags/veda_data_pipeline/veda_process_generic_vector_pipeline.py @@ -97,8 +97,8 @@ }, network_configuration={ "awsvpcConfiguration": { - "securityGroups": mwaa_stack_conf.get("SECURITYGROUPS"), - "subnets": mwaa_stack_conf.get("SUBNETS"), + "securityGroups": vector_ecs_conf.get("SECURITYGROUPS") + mwaa_stack_conf.get("SECURITYGROUPS"), + "subnets": vector_ecs_conf.get("SUBNETS"), }, }, awslogs_group=mwaa_stack_conf.get("LOG_GROUP_NAME"), From 30ee61c248a660e8aad28211c2660164dd82dbe2 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Tue, 13 Aug 2024 15:36:47 -0500 Subject: [PATCH 56/97] change to vector subnet --- .../veda_process_generic_vector_pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dags/veda_data_pipeline/veda_process_generic_vector_pipeline.py b/dags/veda_data_pipeline/veda_process_generic_vector_pipeline.py index 06ae4a9e..92226abf 100644 --- a/dags/veda_data_pipeline/veda_process_generic_vector_pipeline.py +++ b/dags/veda_data_pipeline/veda_process_generic_vector_pipeline.py @@ -97,8 +97,8 @@ }, network_configuration={ "awsvpcConfiguration": { - "securityGroups": vector_ecs_conf.get("SECURITYGROUPS") + mwaa_stack_conf.get("SECURITYGROUPS"), - "subnets": vector_ecs_conf.get("SUBNETS"), + "securityGroups": vector_ecs_conf.get("VECTOR_SECURITY_GROUP") + mwaa_stack_conf.get("SECURITYGROUPS"), + "subnets": vector_ecs_conf.get("VECTOR_SUBNETS"), }, }, awslogs_group=mwaa_stack_conf.get("LOG_GROUP_NAME"), From 62e3b8640eb10cfd3ed051d8435a128447da0b08 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Tue, 13 Aug 2024 16:22:35 -0500 Subject: [PATCH 57/97] handle empty collection field --- docker_tasks/generic_vector_ingest/handler.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker_tasks/generic_vector_ingest/handler.py b/docker_tasks/generic_vector_ingest/handler.py index 99d7f604..192538cf 100644 --- a/docker_tasks/generic_vector_ingest/handler.py +++ b/docker_tasks/generic_vector_ingest/handler.py @@ -103,10 +103,8 @@ def load_to_featuresdb( extra_flags: list = ["-overwrite", "-progress"] ): secret_name = os.environ.get("VECTOR_SECRET_NAME") - print(f"Secret name {secret_name}") con_secrets = get_secret(secret_name) connection = get_connection_string(con_secrets) - print(f"{connection=}") print(f"running ogr2ogr import for collection/file: {layer_name}") options = [ @@ -140,7 +138,7 @@ def load_to_featuresdb( return {"status": "success"} -def handler(event, context): +def handler(): print("------Vector ingestion for Features API started------") parser = ArgumentParser( prog="vector_ingest", @@ -164,6 +162,8 @@ def handler(event, context): extra_flags = payload_event["extra_flags"] layer_name = payload_event["collection"] + collection_not_provided = layer_name == "" + # Read the json to extract the discovered file paths with smart_open.open(s3_event, 
"r") as _file: @@ -178,7 +178,7 @@ def handler(event, context): filename = href.split("/")[-1].split(".")[0] # Use id template when collection is not provided in the conf - if layer_name == "": + if collection_not_provided: layer_name = payload_event["id_template"].format(filename) downloaded_filepath = download_file(href) @@ -201,4 +201,4 @@ def handler(event, context): if __name__ == "__main__": - handler({}, {}) + handler() From 205db09a9a053724c6ef34bd4016285d4e52dd34 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Wed, 14 Aug 2024 10:39:40 -0500 Subject: [PATCH 58/97] revert to vecto ecs conf --- dags/veda_data_pipeline/veda_process_vector_pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dags/veda_data_pipeline/veda_process_vector_pipeline.py b/dags/veda_data_pipeline/veda_process_vector_pipeline.py index de061630..f4ac4b4f 100644 --- a/dags/veda_data_pipeline/veda_process_vector_pipeline.py +++ b/dags/veda_data_pipeline/veda_process_vector_pipeline.py @@ -97,8 +97,8 @@ }, network_configuration={ "awsvpcConfiguration": { - "securityGroups": mwaa_stack_conf.get("SECURITYGROUPS"), - "subnets": mwaa_stack_conf.get("SUBNETS"), + "securityGroups": vector_ecs_conf.get("SECURITYGROUPS"), + "subnets": vector_ecs_conf.get("SUBNETS"), }, }, awslogs_group=mwaa_stack_conf.get("LOG_GROUP_NAME"), From ede6200126f4b42722720311f0123763af12eca3 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Wed, 14 Aug 2024 10:43:44 -0500 Subject: [PATCH 59/97] add vector vpc conf --- dags/veda_data_pipeline/veda_process_vector_pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dags/veda_data_pipeline/veda_process_vector_pipeline.py b/dags/veda_data_pipeline/veda_process_vector_pipeline.py index f4ac4b4f..909d6469 100644 --- a/dags/veda_data_pipeline/veda_process_vector_pipeline.py +++ b/dags/veda_data_pipeline/veda_process_vector_pipeline.py @@ -97,8 +97,8 @@ }, network_configuration={ "awsvpcConfiguration": { - "securityGroups": vector_ecs_conf.get("SECURITYGROUPS"), - "subnets": vector_ecs_conf.get("SUBNETS"), + "securityGroups": vector_ecs_conf.get("VECTOR_SECURITY_GROUP"), + "subnets": vector_ecs_conf.get("VECTOR_SUBNETS"), }, }, awslogs_group=mwaa_stack_conf.get("LOG_GROUP_NAME"), From 0e7cdd88657ec9aca47fafaa592f29db4ed43758 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Wed, 14 Aug 2024 10:51:07 -0500 Subject: [PATCH 60/97] Add generic ingest to branching choices --- dags/veda_data_pipeline/groups/discover_group.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/dags/veda_data_pipeline/groups/discover_group.py b/dags/veda_data_pipeline/groups/discover_group.py index 8134a0f3..eef5bbdc 100644 --- a/dags/veda_data_pipeline/groups/discover_group.py +++ b/dags/veda_data_pipeline/groups/discover_group.py @@ -71,6 +71,8 @@ def vector_raster_choice(ti): dynamic_group_id = ti.task_id.split(".")[0] if payload.get("vector"): + return f"{dynamic_group_id}.parallel_run_process_generic_vectors" + if payload.get("vector_eis"): return f"{dynamic_group_id}.parallel_run_process_vectors" return f"{dynamic_group_id}.parallel_run_process_rasters" @@ -97,6 +99,12 @@ def subdag_discover(event={}): run_process_vector = TriggerMultiDagRunOperator( task_id="parallel_run_process_vectors", + trigger_dag_id="veda_ingest_vector", + python_callable=get_files_to_process, + ) + + run_process_vector = TriggerMultiDagRunOperator( + task_id="parallel_run_process_generic_vectors", trigger_dag_id="veda_generic_ingest_vector", 
python_callable=get_files_to_process, ) From 893c7b4fa41cb49bd82c8c47b26960a00ec89537 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Wed, 14 Aug 2024 10:53:35 -0500 Subject: [PATCH 61/97] Add to dag flow --- dags/veda_data_pipeline/groups/discover_group.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dags/veda_data_pipeline/groups/discover_group.py b/dags/veda_data_pipeline/groups/discover_group.py index eef5bbdc..6fd14e90 100644 --- a/dags/veda_data_pipeline/groups/discover_group.py +++ b/dags/veda_data_pipeline/groups/discover_group.py @@ -103,7 +103,7 @@ def subdag_discover(event={}): python_callable=get_files_to_process, ) - run_process_vector = TriggerMultiDagRunOperator( + run_process_generic_vector = TriggerMultiDagRunOperator( task_id="parallel_run_process_generic_vectors", trigger_dag_id="veda_generic_ingest_vector", python_callable=get_files_to_process, @@ -112,7 +112,8 @@ def subdag_discover(event={}): # extra no-op, needed to run in dynamic mapping context end_discover = EmptyOperator(task_id="end_discover", trigger_rule=TriggerRule.NONE_FAILED_MIN_ONE_SUCCESS,) - discover_from_s3 >> raster_vector_branching >> [run_process_raster, run_process_vector] + discover_from_s3 >> raster_vector_branching >> [run_process_raster, run_process_vector,run_process_generic_vector] run_process_raster >> end_discover run_process_vector >> end_discover + run_process_generic_vector >> end_discover From 9f703218500bc1aaae58fa34ded7e02d789ef12e Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Wed, 14 Aug 2024 11:45:45 -0500 Subject: [PATCH 62/97] Adjust spaces --- dags/veda_data_pipeline/veda_process_vector_pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dags/veda_data_pipeline/veda_process_vector_pipeline.py b/dags/veda_data_pipeline/veda_process_vector_pipeline.py index 909d6469..57099cc5 100644 --- a/dags/veda_data_pipeline/veda_process_vector_pipeline.py +++ b/dags/veda_data_pipeline/veda_process_vector_pipeline.py @@ -97,8 +97,8 @@ }, network_configuration={ "awsvpcConfiguration": { - "securityGroups": vector_ecs_conf.get("VECTOR_SECURITY_GROUP"), - "subnets": vector_ecs_conf.get("VECTOR_SUBNETS"), + "securityGroups": vector_ecs_conf.get("VECTOR_SECURITY_GROUP"), + "subnets": vector_ecs_conf.get("VECTOR_SUBNETS"), }, }, awslogs_group=mwaa_stack_conf.get("LOG_GROUP_NAME"), From 9258aeca93cfad1f3c862a2d122c2821434830df Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Wed, 14 Aug 2024 11:50:01 -0500 Subject: [PATCH 63/97] Adjust space --- docker_tasks/vector_ingest/handler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker_tasks/vector_ingest/handler.py b/docker_tasks/vector_ingest/handler.py index 0938d8ce..628f6912 100644 --- a/docker_tasks/vector_ingest/handler.py +++ b/docker_tasks/vector_ingest/handler.py @@ -381,4 +381,5 @@ def handler(): if __name__ == "__main__": - handler() \ No newline at end of file + handler() + \ No newline at end of file From 1944a7a4ab645161b65d40fb592b0a0cda2bda37 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Wed, 14 Aug 2024 11:51:47 -0500 Subject: [PATCH 64/97] remove space --- docker_tasks/vector_ingest/handler.py | 1 - 1 file changed, 1 deletion(-) diff --git a/docker_tasks/vector_ingest/handler.py b/docker_tasks/vector_ingest/handler.py index 628f6912..35ff27cb 100644 --- a/docker_tasks/vector_ingest/handler.py +++ b/docker_tasks/vector_ingest/handler.py @@ -382,4 +382,3 @@ def handler(): if __name__ == "__main__": handler() - \ No newline at end of 
file From a6039becf579fd15377f024800e7d2729a1d6014 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Wed, 14 Aug 2024 12:15:30 -0500 Subject: [PATCH 65/97] Update with dag info --- README.md | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/README.md b/README.md index ec95dddd..a235c3a8 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,67 @@ Currently, the client id and domain of an existing Cognito user pool programmati # Gitflow Model [VEDA pipeline gitflow](./GITFLOW.md) + +# Ingestion Pipeline Overview + +This pipeline is designed to handle the ingestion of both vector and raster data. The ingestion can be performed using the `veda-discover` DAG. Below are examples of configurations for both vector and raster data. + +## Ingestion Configuration + +### Vector Data Ingestion +```json +{ + "collection": "", + "bucket": "", + "prefix": "", + "filename_regex": ".*.csv$", + "id_regex": "", + "id_template": "-{}", + "datetime_range": "", + "vector": true, + "x_possible": "longitude", + "y_possible": "latitude", + "source_projection": "EPSG:4326", + "target_projection": "EPSG:4326", + "extra_flags": ["-overwrite", "-lco", "OVERWRITE=YES"] +} +``` + +### Raster Data Ingestion +```json +{ + "collection": "", + "bucket": "", + "prefix": "", + "filename_regex": ".*.tif$", + "datetime_range": "", + "assets": { + "co2": { + "title": "", + "description": ".", + "regex": ".*.tif$" + } + }, + "id_regex": ".*_(.*).tif$", + "id_template": "-{}" +} + +``` +## Configuration Fields Description +- collection: The collection_id of the raster or vector data. +- bucket: The name of the S3 bucket where the data is stored. +- prefix: The location within the bucket where the files are to be discovered. +- filename_regex: A regex expression used to filter files based on naming patterns. +- id_template: The format used to create item identifiers in the system. +- vector: Set to true to trigger the generic vector ingestion pipeline. +- vector_eis: Set to true to trigger the vector ingestion pipeline. + + +## Pipeline Behaviour +Since this pipeline can ingest both raster and vector data, the configuration can be modified accordingly. The `"vector": true` triggers the `generic_ingest_vector` dag. If the `collection` is provided, it uses the collection name as the table name for ingestion (recommended to use `append` extra_flag when the collection is provided). When no `collection` is provided, it uses the `id_template` and generates a table name by appending the actual ingested filename to the id_template (recommended to use `overwrite` extra flag). + +Setting `"vector_eis": true `will trigger the `ingest_vector` dag. If neither of these flags is set, the raster ingestion will be triggered, with the configuration typically looking like the raster ingestion example above. + # License This project is licensed under **Apache 2**, see the [LICENSE](LICENSE) file for more details. From 2a01479e4c12746a326dc753eb75ab0dd5347d83 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Wed, 14 Aug 2024 13:07:30 -0500 Subject: [PATCH 66/97] Update readme --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index a235c3a8..2a597902 100644 --- a/README.md +++ b/README.md @@ -125,13 +125,13 @@ This pipeline is designed to handle the ingestion of both vector and raster data ``` ## Configuration Fields Description -- collection: The collection_id of the raster or vector data. 
-- bucket: The name of the S3 bucket where the data is stored. -- prefix: The location within the bucket where the files are to be discovered. -- filename_regex: A regex expression used to filter files based on naming patterns. -- id_template: The format used to create item identifiers in the system. -- vector: Set to true to trigger the generic vector ingestion pipeline. -- vector_eis: Set to true to trigger the vector ingestion pipeline. +- `collection`: The collection_id of the raster or vector data. +- `bucket`: The name of the S3 bucket where the data is stored. +- `prefix`: The location within the bucket where the files are to be discovered. +- `filename_regex`: A regex expression used to filter files based on naming patterns. +- `id_template`: The format used to create item identifiers in the system. +- `vector`: Set to true to trigger the generic vector ingestion pipeline. +- `vector_eis`: Set to true to trigger the vector ingestion pipeline. ## Pipeline Behaviour From f28150bda4085b1e17312d0568a992e2ad64370f Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Wed, 14 Aug 2024 13:22:55 -0500 Subject: [PATCH 67/97] address sugesstions --- .../groups/discover_group.py | 2 +- .../veda_process_generic_vector_pipeline.py | 36 ++++++++++++------- docker_tasks/generic_vector_ingest/handler.py | 2 +- 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/dags/veda_data_pipeline/groups/discover_group.py b/dags/veda_data_pipeline/groups/discover_group.py index 6fd14e90..798f5f10 100644 --- a/dags/veda_data_pipeline/groups/discover_group.py +++ b/dags/veda_data_pipeline/groups/discover_group.py @@ -112,7 +112,7 @@ def subdag_discover(event={}): # extra no-op, needed to run in dynamic mapping context end_discover = EmptyOperator(task_id="end_discover", trigger_rule=TriggerRule.NONE_FAILED_MIN_ONE_SUCCESS,) - discover_from_s3 >> raster_vector_branching >> [run_process_raster, run_process_vector,run_process_generic_vector] + discover_from_s3 >> raster_vector_branching >> [run_process_raster, run_process_vector, run_process_generic_vector] run_process_raster >> end_discover run_process_vector >> end_discover run_process_generic_vector >> end_discover diff --git a/dags/veda_data_pipeline/veda_process_generic_vector_pipeline.py b/dags/veda_data_pipeline/veda_process_generic_vector_pipeline.py index 92226abf..226c3314 100644 --- a/dags/veda_data_pipeline/veda_process_generic_vector_pipeline.py +++ b/dags/veda_data_pipeline/veda_process_generic_vector_pipeline.py @@ -8,7 +8,7 @@ from datetime import timedelta dag_doc_md = """ -### Build and submit stac +### Generic Ingest Vector #### Purpose This DAG is supposed to be triggered by `veda_discover`. But you still can trigger this DAG manually or through an API @@ -16,16 +16,24 @@ - This DAG can run with the following configuration
```json { - "collection": "geoglam", - "prefix": "geoglam/", - "bucket": "veda-data-store-staging", - "filename_regex": "^(.*).tif$", + "collection": "", + "prefix": "transformed_csv/", + "bucket": "ghgc-data-store-develop", + "filename_regex": ".*.csv$", "discovery": "s3", "datetime_range": "month", - "upload": false, - "cogify": false, + "vector": true, + "id_regex": "", + "id_template": "NIST_Urban_Testbed_test-{}", + "datetime_range": "", + "vector": true, + "x_possible": "longitude", + "y_possible": "latitude", + "source_projection": "EPSG:4326", + "target_projection": "EPSG:4326", + "extra_flags": ["-overwrite", "-lco", "OVERWRITE=YES"] "discovered": 33, - "payload": "s3://veda-uah-sit-mwaa-853558080719/events/geoglam/s3_discover_output_6c46b57a-7474-41fe-977a-19d164531cdc.json" + "payload": "s3://data-pipeline-ghgc-dev-mwaa-597746869805/events/test_layer_name2/s3_discover_output_f88257e8-ee50-4a14-ace4-5612ae6ebf38.jsonn" } ``` - [Supports linking to external content](https://github.com/NASA-IMPACT/veda-data-pipelines) @@ -36,11 +44,15 @@ "prefix": "/", "bucket": "", "filename_regex": "", - "discovery": "|cmr", + "id_template": "-{}", "datetime_range": "|", - "upload": " | true", - "cogify": "false | true", - "payload": "", + "y_possible": "", + "source_projection": "", + "target_projection": "", + "extra_flags": "", + "payload": "", } dag_args = { "start_date": pendulum.today("UTC").add(days=-1), diff --git a/docker_tasks/generic_vector_ingest/handler.py b/docker_tasks/generic_vector_ingest/handler.py index 192538cf..fe056623 100644 --- a/docker_tasks/generic_vector_ingest/handler.py +++ b/docker_tasks/generic_vector_ingest/handler.py @@ -179,7 +179,7 @@ def handler(): # Use id template when collection is not provided in the conf if collection_not_provided: - layer_name = payload_event["id_template"].format(filename) + layer_name = payload_event.get("id_template", "{}").format(filename) downloaded_filepath = download_file(href) print(f"[ COLLECTION ]: {layer_name}, [ DOWNLOAD FILEPATH ]: {downloaded_filepath}") From 3d5b32dfb2efe56f235ce245604b0f3b728fde51 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Wed, 14 Aug 2024 13:24:46 -0500 Subject: [PATCH 68/97] Add space --- docker_tasks/generic_vector_ingest/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_tasks/generic_vector_ingest/requirements.txt b/docker_tasks/generic_vector_ingest/requirements.txt index 38263eed..cc585e05 100644 --- a/docker_tasks/generic_vector_ingest/requirements.txt +++ b/docker_tasks/generic_vector_ingest/requirements.txt @@ -4,4 +4,4 @@ requests==2.30.0 boto3==1.26.129 GeoAlchemy2==0.14.2 geopandas==0.14.0 -SQLAlchemy==2.0.23 \ No newline at end of file +SQLAlchemy==2.0.23 From 6230a336cc88d4cfb17a5e99d218497f4bd258c4 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Wed, 14 Aug 2024 13:30:53 -0500 Subject: [PATCH 69/97] update readme --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 2a597902..14cc19ff 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,6 @@ This pipeline is designed to handle the ingestion of both vector and raster data "bucket": "", "prefix": "", "filename_regex": ".*.csv$", - "id_regex": "", "id_template": "-{}", "datetime_range": "", "vector": true, From 8ee612987f135d5d3513c938eec95bd43e482212 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Wed, 14 Aug 2024 13:34:36 -0500 Subject: [PATCH 70/97] Update content --- README.md | 59 -------------------- 
docker_tasks/generic_vector_ingest/README.md | 59 ++++++++++++++++++++ 2 files changed, 59 insertions(+), 59 deletions(-) create mode 100644 docker_tasks/generic_vector_ingest/README.md diff --git a/README.md b/README.md index 14cc19ff..614caccc 100644 --- a/README.md +++ b/README.md @@ -79,65 +79,6 @@ Currently, the client id and domain of an existing Cognito user pool programmati # Gitflow Model [VEDA pipeline gitflow](./GITFLOW.md) -# Ingestion Pipeline Overview - -This pipeline is designed to handle the ingestion of both vector and raster data. The ingestion can be performed using the `veda-discover` DAG. Below are examples of configurations for both vector and raster data. - -## Ingestion Configuration - -### Vector Data Ingestion -```json -{ - "collection": "", - "bucket": "", - "prefix": "", - "filename_regex": ".*.csv$", - "id_template": "-{}", - "datetime_range": "", - "vector": true, - "x_possible": "longitude", - "y_possible": "latitude", - "source_projection": "EPSG:4326", - "target_projection": "EPSG:4326", - "extra_flags": ["-overwrite", "-lco", "OVERWRITE=YES"] -} -``` - -### Raster Data Ingestion -```json -{ - "collection": "", - "bucket": "", - "prefix": "", - "filename_regex": ".*.tif$", - "datetime_range": "", - "assets": { - "co2": { - "title": "", - "description": ".", - "regex": ".*.tif$" - } - }, - "id_regex": ".*_(.*).tif$", - "id_template": "-{}" -} - -``` -## Configuration Fields Description -- `collection`: The collection_id of the raster or vector data. -- `bucket`: The name of the S3 bucket where the data is stored. -- `prefix`: The location within the bucket where the files are to be discovered. -- `filename_regex`: A regex expression used to filter files based on naming patterns. -- `id_template`: The format used to create item identifiers in the system. -- `vector`: Set to true to trigger the generic vector ingestion pipeline. -- `vector_eis`: Set to true to trigger the vector ingestion pipeline. - - -## Pipeline Behaviour -Since this pipeline can ingest both raster and vector data, the configuration can be modified accordingly. The `"vector": true` triggers the `generic_ingest_vector` dag. If the `collection` is provided, it uses the collection name as the table name for ingestion (recommended to use `append` extra_flag when the collection is provided). When no `collection` is provided, it uses the `id_template` and generates a table name by appending the actual ingested filename to the id_template (recommended to use `overwrite` extra flag). - -Setting `"vector_eis": true `will trigger the `ingest_vector` dag. If neither of these flags is set, the raster ingestion will be triggered, with the configuration typically looking like the raster ingestion example above. - # License This project is licensed under **Apache 2**, see the [LICENSE](LICENSE) file for more details. diff --git a/docker_tasks/generic_vector_ingest/README.md b/docker_tasks/generic_vector_ingest/README.md new file mode 100644 index 00000000..db1e2cf4 --- /dev/null +++ b/docker_tasks/generic_vector_ingest/README.md @@ -0,0 +1,59 @@ + +# Ingestion Pipeline Overview + +This pipeline is designed to handle the ingestion of both vector and raster data. The ingestion can be performed using the `veda-discover` DAG. Below are examples of configurations for both vector and raster data. 
+ +## Ingestion Configuration + +### Vector Data Ingestion +```json +{ + "collection": "", + "bucket": "", + "prefix": "", + "filename_regex": ".*.csv$", + "id_template": "-{}", + "datetime_range": "", + "vector": true, + "x_possible": "longitude", + "y_possible": "latitude", + "source_projection": "EPSG:4326", + "target_projection": "EPSG:4326", + "extra_flags": ["-overwrite", "-lco", "OVERWRITE=YES"] +} +``` + +### Raster Data Ingestion +```json +{ + "collection": "", + "bucket": "", + "prefix": "", + "filename_regex": ".*.tif$", + "datetime_range": "", + "assets": { + "co2": { + "title": "", + "description": ".", + "regex": ".*.tif$" + } + }, + "id_regex": ".*_(.*).tif$", + "id_template": "-{}" +} + +``` +## Configuration Fields Description +- `collection`: The collection_id of the raster or vector data. +- `bucket`: The name of the S3 bucket where the data is stored. +- `prefix`: The location within the bucket where the files are to be discovered. +- `filename_regex`: A regex expression used to filter files based on naming patterns. +- `id_template`: The format used to create item identifiers in the system. +- `vector`: Set to true to trigger the generic vector ingestion pipeline. +- `vector_eis`: Set to true to trigger the vector ingestion pipeline. + + +## Pipeline Behaviour +Since this pipeline can ingest both raster and vector data, the configuration can be modified accordingly. The `"vector": true` triggers the `generic_ingest_vector` dag. If the `collection` is provided, it uses the collection name as the table name for ingestion (recommended to use `append` extra_flag when the collection is provided). When no `collection` is provided, it uses the `id_template` and generates a table name by appending the actual ingested filename to the id_template (recommended to use `overwrite` extra flag). + +Setting `"vector_eis": true `will trigger the `ingest_vector` dag. If neither of these flags is set, the raster ingestion will be triggered, with the configuration typically looking like the raster ingestion example above. From d22fef5e9b96302d5eb4624c4387ed4531a5d46f Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Wed, 14 Aug 2024 13:37:57 -0500 Subject: [PATCH 71/97] Add pipeline info --- README.md | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/README.md b/README.md index 614caccc..14cc19ff 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,65 @@ Currently, the client id and domain of an existing Cognito user pool programmati # Gitflow Model [VEDA pipeline gitflow](./GITFLOW.md) +# Ingestion Pipeline Overview + +This pipeline is designed to handle the ingestion of both vector and raster data. The ingestion can be performed using the `veda-discover` DAG. Below are examples of configurations for both vector and raster data. 
+ +## Ingestion Configuration + +### Vector Data Ingestion +```json +{ + "collection": "", + "bucket": "", + "prefix": "", + "filename_regex": ".*.csv$", + "id_template": "-{}", + "datetime_range": "", + "vector": true, + "x_possible": "longitude", + "y_possible": "latitude", + "source_projection": "EPSG:4326", + "target_projection": "EPSG:4326", + "extra_flags": ["-overwrite", "-lco", "OVERWRITE=YES"] +} +``` + +### Raster Data Ingestion +```json +{ + "collection": "", + "bucket": "", + "prefix": "", + "filename_regex": ".*.tif$", + "datetime_range": "", + "assets": { + "co2": { + "title": "", + "description": ".", + "regex": ".*.tif$" + } + }, + "id_regex": ".*_(.*).tif$", + "id_template": "-{}" +} + +``` +## Configuration Fields Description +- `collection`: The collection_id of the raster or vector data. +- `bucket`: The name of the S3 bucket where the data is stored. +- `prefix`: The location within the bucket where the files are to be discovered. +- `filename_regex`: A regex expression used to filter files based on naming patterns. +- `id_template`: The format used to create item identifiers in the system. +- `vector`: Set to true to trigger the generic vector ingestion pipeline. +- `vector_eis`: Set to true to trigger the vector ingestion pipeline. + + +## Pipeline Behaviour +Since this pipeline can ingest both raster and vector data, the configuration can be modified accordingly. The `"vector": true` triggers the `generic_ingest_vector` dag. If the `collection` is provided, it uses the collection name as the table name for ingestion (recommended to use `append` extra_flag when the collection is provided). When no `collection` is provided, it uses the `id_template` and generates a table name by appending the actual ingested filename to the id_template (recommended to use `overwrite` extra flag). + +Setting `"vector_eis": true `will trigger the `ingest_vector` dag. If neither of these flags is set, the raster ingestion will be triggered, with the configuration typically looking like the raster ingestion example above. + # License This project is licensed under **Apache 2**, see the [LICENSE](LICENSE) file for more details. From 3d6dcd6473236f0ce68dce76db166dc640776dd5 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Wed, 14 Aug 2024 13:38:48 -0500 Subject: [PATCH 72/97] deleted the other readme --- docker_tasks/generic_vector_ingest/README.md | 59 -------------------- 1 file changed, 59 deletions(-) delete mode 100644 docker_tasks/generic_vector_ingest/README.md diff --git a/docker_tasks/generic_vector_ingest/README.md b/docker_tasks/generic_vector_ingest/README.md deleted file mode 100644 index db1e2cf4..00000000 --- a/docker_tasks/generic_vector_ingest/README.md +++ /dev/null @@ -1,59 +0,0 @@ - -# Ingestion Pipeline Overview - -This pipeline is designed to handle the ingestion of both vector and raster data. The ingestion can be performed using the `veda-discover` DAG. Below are examples of configurations for both vector and raster data. 
- -## Ingestion Configuration - -### Vector Data Ingestion -```json -{ - "collection": "", - "bucket": "", - "prefix": "", - "filename_regex": ".*.csv$", - "id_template": "-{}", - "datetime_range": "", - "vector": true, - "x_possible": "longitude", - "y_possible": "latitude", - "source_projection": "EPSG:4326", - "target_projection": "EPSG:4326", - "extra_flags": ["-overwrite", "-lco", "OVERWRITE=YES"] -} -``` - -### Raster Data Ingestion -```json -{ - "collection": "", - "bucket": "", - "prefix": "", - "filename_regex": ".*.tif$", - "datetime_range": "", - "assets": { - "co2": { - "title": "", - "description": ".", - "regex": ".*.tif$" - } - }, - "id_regex": ".*_(.*).tif$", - "id_template": "-{}" -} - -``` -## Configuration Fields Description -- `collection`: The collection_id of the raster or vector data. -- `bucket`: The name of the S3 bucket where the data is stored. -- `prefix`: The location within the bucket where the files are to be discovered. -- `filename_regex`: A regex expression used to filter files based on naming patterns. -- `id_template`: The format used to create item identifiers in the system. -- `vector`: Set to true to trigger the generic vector ingestion pipeline. -- `vector_eis`: Set to true to trigger the vector ingestion pipeline. - - -## Pipeline Behaviour -Since this pipeline can ingest both raster and vector data, the configuration can be modified accordingly. The `"vector": true` triggers the `generic_ingest_vector` dag. If the `collection` is provided, it uses the collection name as the table name for ingestion (recommended to use `append` extra_flag when the collection is provided). When no `collection` is provided, it uses the `id_template` and generates a table name by appending the actual ingested filename to the id_template (recommended to use `overwrite` extra flag). - -Setting `"vector_eis": true `will trigger the `ingest_vector` dag. If neither of these flags is set, the raster ingestion will be triggered, with the configuration typically looking like the raster ingestion example above. From b91871d3de0ebad1b01e9da9fd18636f9864563b Mon Sep 17 00:00:00 2001 From: smohiudd Date: Wed, 14 Aug 2024 12:51:56 -0600 Subject: [PATCH 73/97] change write arn index --- infrastructure/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infrastructure/main.tf b/infrastructure/main.tf index c938e099..c6abc320 100644 --- a/infrastructure/main.tf +++ b/infrastructure/main.tf @@ -96,7 +96,7 @@ resource "local_file" "mwaa_variables" { log_group_name = module.mwaa.log_group_name mwaa_execution_role_arn = module.mwaa.mwaa_role_arn assume_role_read_arn = length(var.assume_role_arns) > 0 ? var.assume_role_arns[0] : "" - assume_role_write_arn = length(var.assume_role_arns) > 0 ? var.assume_role_arns[0] : "" + assume_role_write_arn = length(var.assume_role_arns) > 0 ? 
var.assume_role_arns[1] : "" account_id = local.account_id aws_region = local.aws_region cognito_app_secret = var.workflows_client_secret From 0005a32944c8b8cc8525220f80a9b46712d335f6 Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Wed, 14 Aug 2024 14:53:04 -0500 Subject: [PATCH 74/97] Modify readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 14cc19ff..1d7fe3a4 100644 --- a/README.md +++ b/README.md @@ -136,7 +136,7 @@ This pipeline is designed to handle the ingestion of both vector and raster data ## Pipeline Behaviour Since this pipeline can ingest both raster and vector data, the configuration can be modified accordingly. The `"vector": true` triggers the `generic_ingest_vector` dag. If the `collection` is provided, it uses the collection name as the table name for ingestion (recommended to use `append` extra_flag when the collection is provided). When no `collection` is provided, it uses the `id_template` and generates a table name by appending the actual ingested filename to the id_template (recommended to use `overwrite` extra flag). -Setting `"vector_eis": true `will trigger the `ingest_vector` dag. If neither of these flags is set, the raster ingestion will be triggered, with the configuration typically looking like the raster ingestion example above. +Setting `"vector_eis": true` will trigger the EIS Fire specific `ingest_vector` dag. If neither of these flags is set, the raster ingestion will be triggered, with the configuration typically looking like the raster ingestion example above. # License This project is licensed under **Apache 2**, see the [LICENSE](LICENSE) file for more details. From 8bcfbade3b1daf98a75fc207e73a21121300c3ba Mon Sep 17 00:00:00 2001 From: Paridhi Parajuli Date: Wed, 14 Aug 2024 14:57:33 -0500 Subject: [PATCH 75/97] Modify readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1d7fe3a4..f2654a8e 100644 --- a/README.md +++ b/README.md @@ -130,7 +130,7 @@ This pipeline is designed to handle the ingestion of both vector and raster data - `filename_regex`: A regex expression used to filter files based on naming patterns. - `id_template`: The format used to create item identifiers in the system. - `vector`: Set to true to trigger the generic vector ingestion pipeline. -- `vector_eis`: Set to true to trigger the vector ingestion pipeline. +- `vector_eis`: Set to true to trigger the EIS Fire specific vector ingestion pipeline. 
## Pipeline Behaviour From 98446d3a52e5c02ce023288ad12ef4664d830c5d Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Mon, 19 Aug 2024 11:18:09 -0500 Subject: [PATCH 76/97] Adding makefile to data-airflow --- .../actions/terraform-deploy-sm2a/action.yml | 19 +------------- README.md | 25 +++++++++++++++++++ 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/.github/actions/terraform-deploy-sm2a/action.yml b/.github/actions/terraform-deploy-sm2a/action.yml index 59f53ab9..4f146a95 100644 --- a/.github/actions/terraform-deploy-sm2a/action.yml +++ b/.github/actions/terraform-deploy-sm2a/action.yml @@ -33,20 +33,6 @@ runs: python-version: "3.11" cache: "pip" - - name: Install python dependencies - shell: bash - working-directory: ${{ inputs.dir }} - run: pip install -r ../deploy_requirements.txt - - - name: Get relevant environment configuration from aws secrets - shell: bash - working-directory: ${{ inputs.dir }} - env: - AWS_DEFAULT_REGION: ${{ inputs.aws-region }} - AWS_REGION: ${{ inputs.aws-region }} - run: | - python scripts/generate_env_file.py --secret-id ${{ inputs.env_aws_secret_name }} --env-file ${{ inputs.env-file }} - - name: Setup Terraform uses: hashicorp/setup-terraform@v1 with: @@ -54,14 +40,11 @@ runs: - name: Deploy shell: bash - working-directory: ${{ inputs.dir }} env: AWS_DEFAULT_REGION: ${{ inputs.aws-region }} AWS_REGION: ${{ inputs.aws-region }} run: | - cp -r ../dags . - ./scripts/deploy.sh ${{ inputs.env-file }} <<< init - ./scripts/deploy.sh ${{ inputs.env-file }} <<< deploy + make sm2a-deploy ENV_FILE=${{ inputs.env-file }} SECRET_NAME=${{ inputs.env_aws_secret_name }} - name: Output workflows API endpoint id: output_sm2a_workflows_endpoint diff --git a/README.md b/README.md index f2654a8e..51a4338e 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,31 @@ See [terraform-getting-started](https://developer.hashicorp.com/terraform/tutori See [getting-started-install](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) + + +### Setup a local SM2A development environment + +1. Initialize the metadata db (only needed once) + +```shell +make sm2a-local-init +``` +This will create an airflow username: `airflow` with password `airflow` + +2. Start all services + +```shell +make sm2a-local-run +``` +This will start SM2A services and will be running on http://localhost:8080 + +3. Stop all services + +```shell +make sm2a-local-stop +``` + + ## Deployment This project uses Terraform modules to deploy Apache Airflow and related AWS resources using Amazon's managed Airflow provider. 
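
To make the branching described in the README's "Pipeline Behaviour" section concrete, below is a minimal, standalone sketch of how a `dag_run` payload maps to a downstream ingest DAG. It mirrors the logic the patches above add to `dags/veda_data_pipeline/groups/discover_group.py`, but the function name `choose_ingest_branch` and the plain-dict interface are illustrative only; the repository's actual implementation is `vector_raster_choice`, which runs inside the Airflow task context and pulls the payload from the dag run.

```python
# Minimal sketch (not the repository's exact code) of the payload-to-pipeline
# branching: "vector" -> generic vector ingest, "vector_eis" -> EIS Fire vector
# ingest, otherwise raster ingest. Task/DAG ids mirror those in discover_group.py.

def choose_ingest_branch(payload: dict, group_id: str = "discover") -> str:
    """Return the branch task id the discover DAG would follow for this payload."""
    if payload.get("vector"):
        # triggers the veda_generic_ingest_vector DAG
        return f"{group_id}.parallel_run_process_generic_vectors"
    if payload.get("vector_eis"):
        # triggers the EIS Fire specific veda_ingest_vector DAG
        return f"{group_id}.parallel_run_process_vectors"
    # default: raster ingestion
    return f"{group_id}.parallel_run_process_rasters"


if __name__ == "__main__":
    print(choose_ingest_branch({"collection": "demo", "vector": True}))
    print(choose_ingest_branch({"collection": "demo", "vector_eis": True}))
    print(choose_ingest_branch({"collection": "demo"}))
```
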
From 903614f0b43c31857d71e7e5d5f6200f91f36311 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Mon, 19 Aug 2024 12:24:09 -0500 Subject: [PATCH 77/97] Add Makefile --- .gitignore | 1 - Makefile | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 Makefile diff --git a/.gitignore b/.gitignore index 547284f8..cc646d8d 100644 --- a/.gitignore +++ b/.gitignore @@ -44,6 +44,5 @@ cdk.context.json env.sh .hypothesis -Makefile .env_sit terraform.tfstate diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..a4212110 --- /dev/null +++ b/Makefile @@ -0,0 +1,75 @@ +SECRET_NAME="" +ENV_FILE=".env" +SM2A_FOLDER="sm2a" + +CHDIR_SHELL := $(SHELL) +define chdir + $(eval _D=$(firstword $(1) $(@D))) + $(info $(MAKE): cd $(_D)) $(eval SHELL = cd $(_D); $(CHDIR_SHELL)) +endef + + +important_message = \ + @echo "\033[0;31m$(1) \033[0m" + +info_message = \ + @echo "\033[0;32m$(1) \033[0m" + + +count_down = \ + @echo "Spinning up the system please wait..."; \ + secs=40 ;\ + while [ $$secs -gt 0 ]; do \ + printf "%d\033[0K\r" $$secs; \ + sleep 1; \ + : $$((secs--)); \ + done; + + +.PHONY: + clean + all + test + +all: switch-to-sm2a sm2a-local-init sm2a-local-run + +test: + $(call count_down, 10) + +switch-to-sm2a: + $(call chdir,${SM2A_FOLDER}) + +sm2a-local-run: switch-to-sm2a sm2a-local-stop sm2a-cp-dags + @echo "Running SM2A" + docker compose up -d + $(call important_message, "Give the resources a minute to be healthy 💪") + $(count_down) + $(call info_message, "Please visit http://localhost:8080") + echo "username:airflow | password:airflow" + echo "To use local SM2A with AWS update ${SM2A_FOLDER}/sm2a-local-config/.env AWS credentials" + +sm2a-local-init: switch-to-sm2a sm2a-cp-dags + cp sm2a-local-config/env_example sm2a-local-config/.env + docker compose run --rm airflow-cli db init + docker compose run --rm airflow-cli users create --email airflow@example.com --firstname airflow --lastname airflow --password airflow --username airflow --role Admin + +sm2a-local-stop: switch-to-sm2a + docker compose down + +sm2a-cp-dags: + cp -r ../dags dags + +sm2a-deploy: switch-to-sm2a sm2a-cp-dags + @echo "Installing the deployment dependency" + pip install -r deploy_requirements.txt + echo "Deploying SM2A" + python scripts/generate_env_file.py --secret-id ${SECRET_NAME} --env-file ${ENV_FILE} + ./scripts/deploy.sh ${ENV_FILE} <<< init + ./scripts/deploy.sh ${ENV_FILE} <<< deploy + +clean: switch-to-sm2a sm2a-local-stop + @echo "Cleaning local env" + docker container prune <<< y + docker image prune <<< y + docker volume prune <<< y + From 365fa06f69e7319f4a1d0648469baa14fb191f72 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Mon, 19 Aug 2024 17:03:31 -0500 Subject: [PATCH 78/97] fix: vector subnet id reference (#223) --- infrastructure/main.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/infrastructure/main.tf b/infrastructure/main.tf index ca016191..0f22eb64 100644 --- a/infrastructure/main.tf +++ b/infrastructure/main.tf @@ -109,8 +109,8 @@ resource "local_file" "mwaa_variables" { stac_ingestor_api_url = var.stac_ingestor_api_url stac_url = var.stac_url vector_secret_name = var.vector_secret_name - vector_subnet_1 = length(data.aws_subnets.private.ids) > 0 ? data.aws_subnets.private.ids[0] : "" - vector_subnet_2 = length(data.aws_subnets.private.ids) > 0 ? data.aws_subnets.private.ids[1] : "" + vector_subnet_1 = length(data.aws_subnets.subnet_ids.ids) > 0 ? 
data.aws_subnets.subnet_ids.ids[0] : "" + vector_subnet_2 = length(data.aws_subnets.subnet_ids.ids) > 0 ? data.aws_subnets.subnet_ids.ids[1] : "" vector_security_group = length(aws_security_group.vector_sg) > 0 ? aws_security_group.vector_sg[0].id : "" vector_vpc = var.vector_vpc }) From 64bd0e607c5d5719e46b0c4c51d14a2cfda9a59d Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Tue, 20 Aug 2024 10:14:19 -0500 Subject: [PATCH 79/97] Updating REDME and makefile to include test --- Makefile | 9 +++++---- README.md | 6 +++++- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index a4212110..55d2c02c 100644 --- a/Makefile +++ b/Makefile @@ -31,10 +31,11 @@ count_down = \ all test + all: switch-to-sm2a sm2a-local-init sm2a-local-run test: - $(call count_down, 10) + pytest tests switch-to-sm2a: $(call chdir,${SM2A_FOLDER}) @@ -69,7 +70,7 @@ sm2a-deploy: switch-to-sm2a sm2a-cp-dags clean: switch-to-sm2a sm2a-local-stop @echo "Cleaning local env" - docker container prune <<< y - docker image prune <<< y - docker volume prune <<< y + docker container prune -f + docker image prune -f + docker volume prune -f diff --git a/README.md b/README.md index 51a4338e..3f9857b4 100644 --- a/README.md +++ b/README.md @@ -39,11 +39,15 @@ See [getting-started-install](https://docs.aws.amazon.com/cli/latest/userguide/g ### Setup a local SM2A development environment -1. Initialize the metadata db (only needed once) +1. Initialize the metadata db ```shell make sm2a-local-init ``` +🚨 NOTE: This command is typically required only once at the beginning. +After running it, you generally do not need to run it again unless you run `make clean`, +which will require you to reinitialize SM2A with `make sm2a-local-init` + This will create an airflow username: `airflow` with password `airflow` 2. 
Start all services From 5bef3915bc6e88d8412349d8886852fbc1af249d Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Tue, 20 Aug 2024 13:52:18 -0500 Subject: [PATCH 80/97] Fix persistant cd command line --- Makefile | 40 +++++++++++++++++----------------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/Makefile b/Makefile index 55d2c02c..a42e487e 100644 --- a/Makefile +++ b/Makefile @@ -2,11 +2,6 @@ SECRET_NAME="" ENV_FILE=".env" SM2A_FOLDER="sm2a" -CHDIR_SHELL := $(SHELL) -define chdir - $(eval _D=$(firstword $(1) $(@D))) - $(info $(MAKE): cd $(_D)) $(eval SHELL = cd $(_D); $(CHDIR_SHELL)) -endef important_message = \ @@ -32,16 +27,15 @@ count_down = \ test -all: switch-to-sm2a sm2a-local-init sm2a-local-run +all: sm2a-local-init sm2a-local-run test: pytest tests -switch-to-sm2a: - $(call chdir,${SM2A_FOLDER}) -sm2a-local-run: switch-to-sm2a sm2a-local-stop sm2a-cp-dags +sm2a-local-run: sm2a-local-stop sm2a-cp-dags @echo "Running SM2A" + cd ${SM2A_FOLDER} && \ docker compose up -d $(call important_message, "Give the resources a minute to be healthy 💪") $(count_down) @@ -49,28 +43,28 @@ sm2a-local-run: switch-to-sm2a sm2a-local-stop sm2a-cp-dags echo "username:airflow | password:airflow" echo "To use local SM2A with AWS update ${SM2A_FOLDER}/sm2a-local-config/.env AWS credentials" -sm2a-local-init: switch-to-sm2a sm2a-cp-dags - cp sm2a-local-config/env_example sm2a-local-config/.env - docker compose run --rm airflow-cli db init - docker compose run --rm airflow-cli users create --email airflow@example.com --firstname airflow --lastname airflow --password airflow --username airflow --role Admin +sm2a-local-init: sm2a-cp-dags + cp ${SM2A_FOLDER}/sm2a-local-config/env_example ${SM2A_FOLDER}/sm2a-local-config/.env + docker compose -f ${SM2A_FOLDER}/docker-compose.yml run --rm airflow-cli db init + docker compose -f ${SM2A_FOLDER}/docker-compose.yml run --rm airflow-cli users create --email airflow@example.com --firstname airflow --lastname airflow --password airflow --username airflow --role Admin -sm2a-local-stop: switch-to-sm2a - docker compose down +sm2a-local-stop: + docker compose -f ${SM2A_FOLDER}/docker-compose.yml down sm2a-cp-dags: - cp -r ../dags dags + cp -r dags ${SM2A_FOLDER}/. 
-sm2a-deploy: switch-to-sm2a sm2a-cp-dags +sm2a-deploy: sm2a-cp-dags @echo "Installing the deployment dependency" - pip install -r deploy_requirements.txt - echo "Deploying SM2A" - python scripts/generate_env_file.py --secret-id ${SECRET_NAME} --env-file ${ENV_FILE} - ./scripts/deploy.sh ${ENV_FILE} <<< init + cd ${SM2A_FOLDER} && \ + pip install -r deploy_requirements.txt && \ + echo "Deploying SM2A" && \ + python ./scripts/generate_env_file.py --secret-id ${SECRET_NAME} --env-file ${ENV_FILE} && \ + ./scripts/deploy.sh ${ENV_FILE} <<< init && \ ./scripts/deploy.sh ${ENV_FILE} <<< deploy -clean: switch-to-sm2a sm2a-local-stop +clean: sm2a-local-stop @echo "Cleaning local env" docker container prune -f docker image prune -f docker volume prune -f - From 73ba204723c557fd992c21a662ff5b471049e679 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Fri, 23 Aug 2024 10:30:00 -0500 Subject: [PATCH 81/97] Move Makefile to SM2A folder --- Makefile | 68 +++++++----------------------- infrastructure/.terraform.lock.hcl | 19 +++++++++ sm2a/Makefile | 65 ++++++++++++++++++++++++++++ 3 files changed, 99 insertions(+), 53 deletions(-) create mode 100644 sm2a/Makefile diff --git a/Makefile b/Makefile index a42e487e..4e271d1e 100644 --- a/Makefile +++ b/Makefile @@ -1,24 +1,3 @@ -SECRET_NAME="" -ENV_FILE=".env" -SM2A_FOLDER="sm2a" - - - -important_message = \ - @echo "\033[0;31m$(1) \033[0m" - -info_message = \ - @echo "\033[0;32m$(1) \033[0m" - - -count_down = \ - @echo "Spinning up the system please wait..."; \ - secs=40 ;\ - while [ $$secs -gt 0 ]; do \ - printf "%d\033[0K\r" $$secs; \ - sleep 1; \ - : $$((secs--)); \ - done; .PHONY: @@ -27,44 +6,27 @@ count_down = \ test -all: sm2a-local-init sm2a-local-run +all: + $(MAKE) -C sm2a all -test: - pytest tests +sm2a-local-run: + $(MAKE) -C sm2a sm2a-local-run -sm2a-local-run: sm2a-local-stop sm2a-cp-dags - @echo "Running SM2A" - cd ${SM2A_FOLDER} && \ - docker compose up -d - $(call important_message, "Give the resources a minute to be healthy 💪") - $(count_down) - $(call info_message, "Please visit http://localhost:8080") - echo "username:airflow | password:airflow" - echo "To use local SM2A with AWS update ${SM2A_FOLDER}/sm2a-local-config/.env AWS credentials" +sm2a-local-init: + $(MAKE) -C sm2a sm2a-local-init -sm2a-local-init: sm2a-cp-dags - cp ${SM2A_FOLDER}/sm2a-local-config/env_example ${SM2A_FOLDER}/sm2a-local-config/.env - docker compose -f ${SM2A_FOLDER}/docker-compose.yml run --rm airflow-cli db init - docker compose -f ${SM2A_FOLDER}/docker-compose.yml run --rm airflow-cli users create --email airflow@example.com --firstname airflow --lastname airflow --password airflow --username airflow --role Admin sm2a-local-stop: - docker compose -f ${SM2A_FOLDER}/docker-compose.yml down + $(MAKE) -C sm2a sm2a-local-stop + -sm2a-cp-dags: - cp -r dags ${SM2A_FOLDER}/. 
-sm2a-deploy: sm2a-cp-dags - @echo "Installing the deployment dependency" - cd ${SM2A_FOLDER} && \ - pip install -r deploy_requirements.txt && \ - echo "Deploying SM2A" && \ - python ./scripts/generate_env_file.py --secret-id ${SECRET_NAME} --env-file ${ENV_FILE} && \ - ./scripts/deploy.sh ${ENV_FILE} <<< init && \ - ./scripts/deploy.sh ${ENV_FILE} <<< deploy +sm2a-deploy: + $(MAKE) -C sm2a sm2a-deploy -clean: sm2a-local-stop - @echo "Cleaning local env" - docker container prune -f - docker image prune -f - docker volume prune -f +clean: + $(MAKE) -C sm2a clean + +test: + pytest tests diff --git a/infrastructure/.terraform.lock.hcl b/infrastructure/.terraform.lock.hcl index 4fa29add..1dde3073 100644 --- a/infrastructure/.terraform.lock.hcl +++ b/infrastructure/.terraform.lock.hcl @@ -107,3 +107,22 @@ provider "registry.terraform.io/hashicorp/null" { "zh:fca01a623d90d0cad0843102f9b8b9fe0d3ff8244593bd817f126582b52dd694", ] } + +provider "registry.terraform.io/hashicorp/random" { + version = "3.6.2" + hashes = [ + "h1:VavG5unYCa3SYISMKF9pzc3718M0bhPlcbUZZGl7wuo=", + "zh:0ef01a4f81147b32c1bea3429974d4d104bbc4be2ba3cfa667031a8183ef88ec", + "zh:1bcd2d8161e89e39886119965ef0f37fcce2da9c1aca34263dd3002ba05fcb53", + "zh:37c75d15e9514556a5f4ed02e1548aaa95c0ecd6ff9af1119ac905144c70c114", + "zh:4210550a767226976bc7e57d988b9ce48f4411fa8a60cd74a6b246baf7589dad", + "zh:562007382520cd4baa7320f35e1370ffe84e46ed4e2071fdc7e4b1a9b1f8ae9b", + "zh:5efb9da90f665e43f22c2e13e0ce48e86cae2d960aaf1abf721b497f32025916", + "zh:6f71257a6b1218d02a573fc9bff0657410404fb2ef23bc66ae8cd968f98d5ff6", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + "zh:9647e18f221380a85f2f0ab387c68fdafd58af6193a932417299cdcae4710150", + "zh:bb6297ce412c3c2fa9fec726114e5e0508dd2638cad6a0cb433194930c97a544", + "zh:f83e925ed73ff8a5ef6e3608ad9225baa5376446349572c2449c0c0b3cf184b7", + "zh:fbef0781cb64de76b1df1ca11078aecba7800d82fd4a956302734999cfd9a4af", + ] +} diff --git a/sm2a/Makefile b/sm2a/Makefile new file mode 100644 index 00000000..35560e33 --- /dev/null +++ b/sm2a/Makefile @@ -0,0 +1,65 @@ +SECRET_NAME="" +ENV_FILE=".env" + + +important_message = \ + @echo "\033[0;31m$(1) \033[0m" + +info_message = \ + @echo "\033[0;32m$(1) \033[0m" + + +count_down = \ + @echo "Spinning up the system please wait..."; \ + secs=40 ;\ + while [ $$secs -gt 0 ]; do \ + printf "%d\033[0K\r" $$secs; \ + sleep 1; \ + : $$((secs--)); \ + done; + + +.PHONY: + clean + all + + + +all: sm2a-local-init sm2a-local-run + + +sm2a-local-run: sm2a-local-stop sm2a-cp-dags + @echo "Running SM2A" + docker compose up -d + $(call important_message, "Give the resources a minute to be healthy 💪") + $(count_down) + $(call info_message, "Please visit http://localhost:8080") + echo "username:airflow | password:airflow" + echo "To use local SM2A with AWS update ${SM2A_FOLDER}/sm2a-local-config/.env AWS credentials" + +sm2a-local-init: sm2a-cp-dags + cp sm2a-local-config/env_example sm2a-local-config/.env + docker compose run --rm airflow-cli db init + docker compose run --rm airflow-cli users create --email airflow@example.com --firstname airflow --lastname airflow --password airflow --username airflow --role Admin + +sm2a-local-stop: + docker compose down + +sm2a-cp-dags: + cp -r ../dags . 
+ +sm2a-deploy: sm2a-cp-dags + @echo "Installing the deployment dependency" + pip install -r deploy_requirements.txt + echo "Deploying SM2A" + pwd + ls -al + python scripts/generate_env_file.py --secret-id ${SECRET_NAME} --env-file ${ENV_FILE} + ./scripts/deploy.sh ${ENV_FILE} <<< init + ./scripts/deploy.sh ${ENV_FILE} <<< deploy + +clean: sm2a-local-stop + @echo "Cleaning local env" + docker container prune -f + docker image prune -f + docker volume prune -f From 4d690d690d7a5a021a312c15530ad685c73dd1f7 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Fri, 23 Aug 2024 10:32:10 -0500 Subject: [PATCH 82/97] Move Makefile to SM2A folder --- sm2a/infrastructure/.terraform.lock.hcl | 113 +++++++++++++++--------- 1 file changed, 70 insertions(+), 43 deletions(-) diff --git a/sm2a/infrastructure/.terraform.lock.hcl b/sm2a/infrastructure/.terraform.lock.hcl index b9eab3ab..4fa29add 100644 --- a/sm2a/infrastructure/.terraform.lock.hcl +++ b/sm2a/infrastructure/.terraform.lock.hcl @@ -1,11 +1,32 @@ # This file is maintained automatically by "terraform init". # Manual edits may be lost in future updates. +provider "registry.terraform.io/hashicorp/archive" { + version = "2.4.0" + hashes = [ + "h1:EtN1lnoHoov3rASpgGmh6zZ/W6aRCTgKC7iMwvFY1yc=", + "h1:cJokkjeH1jfpG4QEHdRx0t2j8rr52H33A7C/oX73Ok4=", + "zh:18e408596dd53048f7fc8229098d0e3ad940b92036a24287eff63e2caec72594", + "zh:392d4216ecd1a1fd933d23f4486b642a8480f934c13e2cae3c13b6b6a7e34a7b", + "zh:655dd1fa5ca753a4ace21d0de3792d96fff429445717f2ce31c125d19c38f3ff", + "zh:70dae36c176aa2b258331ad366a471176417a94dd3b4985a911b8be9ff842b00", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + "zh:7d8c8e3925f1e21daf73f85983894fbe8868e326910e6df3720265bc657b9c9c", + "zh:a032ec0f0aee27a789726e348e8ad20778c3a1c9190ef25e7cff602c8d175f44", + "zh:b8e50de62ba185745b0fe9713755079ad0e9f7ac8638d204de6762cc36870410", + "zh:c8ad0c7697a3d444df21ff97f3473a8604c8639be64afe3f31b8ec7ad7571e18", + "zh:df736c5a2a7c3a82c5493665f659437a22f0baf8c2d157e45f4dd7ca40e739fc", + "zh:e8ffbf578a0977074f6d08aa8734e36c726e53dc79894cfc4f25fadc4f45f1df", + "zh:efea57ff23b141551f92b2699024d356c7ffd1a4ad62931da7ed7a386aef7f1f", + ] +} + provider "registry.terraform.io/hashicorp/aws" { version = "4.67.0" - constraints = "~> 4.0" + constraints = "~> 4.0, >= 4.54.0" hashes = [ "h1:5Zfo3GfRSWBaXs4TGQNOflr1XaYj6pRnVJLX5VAjFX4=", + "h1:dCRc4GqsyfqHEMjgtlM1EympBcgTmcTkWaJmtd91+KA=", "zh:0843017ecc24385f2b45f2c5fce79dc25b258e50d516877b3affee3bef34f060", "zh:19876066cfa60de91834ec569a6448dab8c2518b8a71b5ca870b2444febddac6", "zh:24995686b2ad88c1ffaa242e36eee791fc6070e6144f418048c4ce24d0ba5183", @@ -24,59 +45,65 @@ provider "registry.terraform.io/hashicorp/aws" { ] } -provider "registry.terraform.io/hashicorp/local" { - version = "2.4.1" +provider "registry.terraform.io/hashicorp/external" { + version = "2.3.1" + constraints = ">= 1.0.0" hashes = [ - "h1:gpp25uNkYJYzJVnkyRr7RIBVfwLs9GSq2HNnFpTRBg0=", - "zh:244b445bf34ddbd167731cc6c6b95bbed231dc4493f8cc34bd6850cfe1f78528", - "zh:3c330bdb626123228a0d1b1daa6c741b4d5d484ab1c7ae5d2f48d4c9885cc5e9", - "zh:5ff5f9b791ddd7557e815449173f2db38d338e674d2d91800ac6e6d808de1d1d", - "zh:70206147104f4bf26ae67d730c995772f85bf23e28c2c2e7612c74f4dae3c46f", - "zh:75029676993accd6bef933c196b2fad51a9ec8a69a847dbbe96ec8ebf7926cdc", + "h1:bROCw6g5D/3fFnWeJ01L4IrdnJl1ILU8DGDgXCtYzaY=", + "h1:gznGscVJ0USxy4CdihpjRKPsKvyGr/zqPvBoFLJTQDc=", + "zh:001e2886dc81fc98cf17cf34c0d53cb2dae1e869464792576e11b0f34ee92f54", + 
"zh:2eeac58dd75b1abdf91945ac4284c9ccb2bfb17fa9bdb5f5d408148ff553b3ee", + "zh:2fc39079ba61411a737df2908942e6970cb67ed2f4fb19090cd44ce2082903dd", + "zh:472a71c624952cff7aa98a7b967f6c7bb53153dbd2b8f356ceb286e6743bb4e2", + "zh:4cff06d31272aac8bc35e9b7faec42cf4554cbcbae1092eaab6ab7f643c215d9", "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:7d48d5999fe1fcdae9295a7c3448ac1541f5a24c474bd82df6d4fa3732483f2b", - "zh:b766b38b027f0f84028244d1c2f990431a37d4fc3ac645962924554016507e77", - "zh:bfc7ad301dada204cf51c59d8bd6a9a87de5fddb42190b4d6ba157d6e08a1f10", - "zh:c902b527702a8c5e2c25a6637d07bbb1690cb6c1e63917a5f6dc460efd18d43f", - "zh:d68ae0e1070cf429c46586bc87580c3ed113f76241da2b6e4f1a8348126b3c46", - "zh:f4903fd89f7c92a346ae9e666c2d0b6884c4474ae109e9b4bd15e7efaa4bfc29", + "zh:7ed16ccd2049fa089616b98c0bd57219f407958f318f3c697843e2397ddf70df", + "zh:842696362c92bf2645eb85c739410fd51376be6c488733efae44f4ce688da50e", + "zh:8985129f2eccfd7f1841ce06f3bf2bbede6352ec9e9f926fbaa6b1a05313b326", + "zh:a5f0602d8ec991a5411ef42f872aa90f6347e93886ce67905c53cfea37278e05", + "zh:bf4ab82cbe5256dcef16949973bf6aa1a98c2c73a98d6a44ee7bc40809d002b8", + "zh:e70770be62aa70198fa899526d671643ff99eecf265bf1a50e798fc3480bd417", ] } -provider "registry.terraform.io/hashicorp/null" { - version = "3.2.2" +provider "registry.terraform.io/hashicorp/local" { + version = "2.4.0" + constraints = ">= 1.0.0" hashes = [ - "h1:IMVAUHKoydFrlPrl9OzasDnw/8ntZFerCC9iXw1rXQY=", - "zh:3248aae6a2198f3ec8394218d05bd5e42be59f43a3a7c0b71c66ec0df08b69e7", - "zh:32b1aaa1c3013d33c245493f4a65465eab9436b454d250102729321a44c8ab9a", - "zh:38eff7e470acb48f66380a73a5c7cdd76cc9b9c9ba9a7249c7991488abe22fe3", - "zh:4c2f1faee67af104f5f9e711c4574ff4d298afaa8a420680b0cb55d7bbc65606", - "zh:544b33b757c0b954dbb87db83a5ad921edd61f02f1dc86c6186a5ea86465b546", - "zh:696cf785090e1e8cf1587499516b0494f47413b43cb99877ad97f5d0de3dc539", - "zh:6e301f34757b5d265ae44467d95306d61bef5e41930be1365f5a8dcf80f59452", + "h1:R97FTYETo88sT2VHfMgkPU3lzCsZLunPftjSI5vfKe8=", + "h1:ZUEYUmm2t4vxwzxy1BvN1wL6SDWrDxfH7pxtzX8c6d0=", + "zh:53604cd29cb92538668fe09565c739358dc53ca56f9f11312b9d7de81e48fab9", + "zh:66a46e9c508716a1c98efbf793092f03d50049fa4a83cd6b2251e9a06aca2acf", + "zh:70a6f6a852dd83768d0778ce9817d81d4b3f073fab8fa570bff92dcb0824f732", "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:913a929070c819e59e94bb37a2a253c228f83921136ff4a7aa1a178c7cce5422", - "zh:aa9015926cd152425dbf86d1abdbc74bfe0e1ba3d26b3db35051d7b9ca9f72ae", - "zh:bb04798b016e1e1d49bcc76d62c53b56c88c63d6f2dfe38821afef17c416a0e1", - "zh:c23084e1b23577de22603cff752e59128d83cfecc2e6819edadd8cf7a10af11e", + "zh:82a803f2f484c8b766e2e9c32343e9c89b91997b9f8d2697f9f3837f62926b35", + "zh:9708a4e40d6cc4b8afd1352e5186e6e1502f6ae599867c120967aebe9d90ed04", + "zh:973f65ce0d67c585f4ec250c1e634c9b22d9c4288b484ee2a871d7fa1e317406", + "zh:c8fa0f98f9316e4cfef082aa9b785ba16e36ff754d6aba8b456dab9500e671c6", + "zh:cfa5342a5f5188b20db246c73ac823918c189468e1382cb3c48a9c0c08fc5bf7", + "zh:e0e2b477c7e899c63b06b38cd8684a893d834d6d0b5e9b033cedc06dd7ffe9e2", + "zh:f62d7d05ea1ee566f732505200ab38d94315a4add27947a60afa29860822d3fc", + "zh:fa7ce69dde358e172bd719014ad637634bbdabc49363104f4fca759b4b73f2ce", ] } -provider "registry.terraform.io/hashicorp/random" { - version = "3.6.0" +provider "registry.terraform.io/hashicorp/null" { + version = "3.2.1" + constraints = ">= 2.0.0" hashes = [ - "h1:I8MBeauYA8J8yheLJ8oSMWqB0kovn16dF/wKZ1QTdkk=", - 
"zh:03360ed3ecd31e8c5dac9c95fe0858be50f3e9a0d0c654b5e504109c2159287d", - "zh:1c67ac51254ba2a2bb53a25e8ae7e4d076103483f55f39b426ec55e47d1fe211", - "zh:24a17bba7f6d679538ff51b3a2f378cedadede97af8a1db7dad4fd8d6d50f829", - "zh:30ffb297ffd1633175d6545d37c2217e2cef9545a6e03946e514c59c0859b77d", - "zh:454ce4b3dbc73e6775f2f6605d45cee6e16c3872a2e66a2c97993d6e5cbd7055", + "h1:FbGfc+muBsC17Ohy5g806iuI1hQc4SIexpYCrQHQd8w=", + "h1:ydA0/SNRVB1o95btfshvYsmxA+jZFRZcvKzZSB+4S1M=", + "zh:58ed64389620cc7b82f01332e27723856422820cfd302e304b5f6c3436fb9840", + "zh:62a5cc82c3b2ddef7ef3a6f2fedb7b9b3deff4ab7b414938b08e51d6e8be87cb", + "zh:63cff4de03af983175a7e37e52d4bd89d990be256b16b5c7f919aff5ad485aa5", + "zh:74cb22c6700e48486b7cabefa10b33b801dfcab56f1a6ac9b6624531f3d36ea3", "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:91df0a9fab329aff2ff4cf26797592eb7a3a90b4a0c04d64ce186654e0cc6e17", - "zh:aa57384b85622a9f7bfb5d4512ca88e61f22a9cea9f30febaa4c98c68ff0dc21", - "zh:c4a3e329ba786ffb6f2b694e1fd41d413a7010f3a53c20b432325a94fa71e839", - "zh:e2699bc9116447f96c53d55f2a00570f982e6f9935038c3810603572693712d0", - "zh:e747c0fd5d7684e5bfad8aa0ca441903f15ae7a98a737ff6aca24ba223207e2c", - "zh:f1ca75f417ce490368f047b63ec09fd003711ae48487fba90b4aba2ccf71920e", + "zh:79e553aff77f1cfa9012a2218b8238dd672ea5e1b2924775ac9ac24d2a75c238", + "zh:a1e06ddda0b5ac48f7e7c7d59e1ab5a4073bbcf876c73c0299e4610ed53859dc", + "zh:c37a97090f1a82222925d45d84483b2aa702ef7ab66532af6cbcfb567818b970", + "zh:e4453fbebf90c53ca3323a92e7ca0f9961427d2f0ce0d2b65523cc04d5d999c2", + "zh:e80a746921946d8b6761e77305b752ad188da60688cfd2059322875d363be5f5", + "zh:fbdb892d9822ed0e4cb60f2fedbdbb556e4da0d88d3b942ae963ed6ff091e48f", + "zh:fca01a623d90d0cad0843102f9b8b9fe0d3ff8244593bd817f126582b52dd694", ] } From 50cc5e51fce94f7926b9e51101157abd09091302 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Fri, 23 Aug 2024 10:34:03 -0500 Subject: [PATCH 83/97] Move Makefile to SM2A folder --- infrastructure/.terraform.lock.hcl | 19 ---- sm2a/infrastructure/.terraform.lock.hcl | 113 +++++++++--------------- 2 files changed, 43 insertions(+), 89 deletions(-) diff --git a/infrastructure/.terraform.lock.hcl b/infrastructure/.terraform.lock.hcl index 1dde3073..4fa29add 100644 --- a/infrastructure/.terraform.lock.hcl +++ b/infrastructure/.terraform.lock.hcl @@ -107,22 +107,3 @@ provider "registry.terraform.io/hashicorp/null" { "zh:fca01a623d90d0cad0843102f9b8b9fe0d3ff8244593bd817f126582b52dd694", ] } - -provider "registry.terraform.io/hashicorp/random" { - version = "3.6.2" - hashes = [ - "h1:VavG5unYCa3SYISMKF9pzc3718M0bhPlcbUZZGl7wuo=", - "zh:0ef01a4f81147b32c1bea3429974d4d104bbc4be2ba3cfa667031a8183ef88ec", - "zh:1bcd2d8161e89e39886119965ef0f37fcce2da9c1aca34263dd3002ba05fcb53", - "zh:37c75d15e9514556a5f4ed02e1548aaa95c0ecd6ff9af1119ac905144c70c114", - "zh:4210550a767226976bc7e57d988b9ce48f4411fa8a60cd74a6b246baf7589dad", - "zh:562007382520cd4baa7320f35e1370ffe84e46ed4e2071fdc7e4b1a9b1f8ae9b", - "zh:5efb9da90f665e43f22c2e13e0ce48e86cae2d960aaf1abf721b497f32025916", - "zh:6f71257a6b1218d02a573fc9bff0657410404fb2ef23bc66ae8cd968f98d5ff6", - "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:9647e18f221380a85f2f0ab387c68fdafd58af6193a932417299cdcae4710150", - "zh:bb6297ce412c3c2fa9fec726114e5e0508dd2638cad6a0cb433194930c97a544", - "zh:f83e925ed73ff8a5ef6e3608ad9225baa5376446349572c2449c0c0b3cf184b7", - "zh:fbef0781cb64de76b1df1ca11078aecba7800d82fd4a956302734999cfd9a4af", - ] -} diff --git 
a/sm2a/infrastructure/.terraform.lock.hcl b/sm2a/infrastructure/.terraform.lock.hcl index 4fa29add..b9eab3ab 100644 --- a/sm2a/infrastructure/.terraform.lock.hcl +++ b/sm2a/infrastructure/.terraform.lock.hcl @@ -1,32 +1,11 @@ # This file is maintained automatically by "terraform init". # Manual edits may be lost in future updates. -provider "registry.terraform.io/hashicorp/archive" { - version = "2.4.0" - hashes = [ - "h1:EtN1lnoHoov3rASpgGmh6zZ/W6aRCTgKC7iMwvFY1yc=", - "h1:cJokkjeH1jfpG4QEHdRx0t2j8rr52H33A7C/oX73Ok4=", - "zh:18e408596dd53048f7fc8229098d0e3ad940b92036a24287eff63e2caec72594", - "zh:392d4216ecd1a1fd933d23f4486b642a8480f934c13e2cae3c13b6b6a7e34a7b", - "zh:655dd1fa5ca753a4ace21d0de3792d96fff429445717f2ce31c125d19c38f3ff", - "zh:70dae36c176aa2b258331ad366a471176417a94dd3b4985a911b8be9ff842b00", - "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:7d8c8e3925f1e21daf73f85983894fbe8868e326910e6df3720265bc657b9c9c", - "zh:a032ec0f0aee27a789726e348e8ad20778c3a1c9190ef25e7cff602c8d175f44", - "zh:b8e50de62ba185745b0fe9713755079ad0e9f7ac8638d204de6762cc36870410", - "zh:c8ad0c7697a3d444df21ff97f3473a8604c8639be64afe3f31b8ec7ad7571e18", - "zh:df736c5a2a7c3a82c5493665f659437a22f0baf8c2d157e45f4dd7ca40e739fc", - "zh:e8ffbf578a0977074f6d08aa8734e36c726e53dc79894cfc4f25fadc4f45f1df", - "zh:efea57ff23b141551f92b2699024d356c7ffd1a4ad62931da7ed7a386aef7f1f", - ] -} - provider "registry.terraform.io/hashicorp/aws" { version = "4.67.0" - constraints = "~> 4.0, >= 4.54.0" + constraints = "~> 4.0" hashes = [ "h1:5Zfo3GfRSWBaXs4TGQNOflr1XaYj6pRnVJLX5VAjFX4=", - "h1:dCRc4GqsyfqHEMjgtlM1EympBcgTmcTkWaJmtd91+KA=", "zh:0843017ecc24385f2b45f2c5fce79dc25b258e50d516877b3affee3bef34f060", "zh:19876066cfa60de91834ec569a6448dab8c2518b8a71b5ca870b2444febddac6", "zh:24995686b2ad88c1ffaa242e36eee791fc6070e6144f418048c4ce24d0ba5183", @@ -45,65 +24,59 @@ provider "registry.terraform.io/hashicorp/aws" { ] } -provider "registry.terraform.io/hashicorp/external" { - version = "2.3.1" - constraints = ">= 1.0.0" +provider "registry.terraform.io/hashicorp/local" { + version = "2.4.1" hashes = [ - "h1:bROCw6g5D/3fFnWeJ01L4IrdnJl1ILU8DGDgXCtYzaY=", - "h1:gznGscVJ0USxy4CdihpjRKPsKvyGr/zqPvBoFLJTQDc=", - "zh:001e2886dc81fc98cf17cf34c0d53cb2dae1e869464792576e11b0f34ee92f54", - "zh:2eeac58dd75b1abdf91945ac4284c9ccb2bfb17fa9bdb5f5d408148ff553b3ee", - "zh:2fc39079ba61411a737df2908942e6970cb67ed2f4fb19090cd44ce2082903dd", - "zh:472a71c624952cff7aa98a7b967f6c7bb53153dbd2b8f356ceb286e6743bb4e2", - "zh:4cff06d31272aac8bc35e9b7faec42cf4554cbcbae1092eaab6ab7f643c215d9", + "h1:gpp25uNkYJYzJVnkyRr7RIBVfwLs9GSq2HNnFpTRBg0=", + "zh:244b445bf34ddbd167731cc6c6b95bbed231dc4493f8cc34bd6850cfe1f78528", + "zh:3c330bdb626123228a0d1b1daa6c741b4d5d484ab1c7ae5d2f48d4c9885cc5e9", + "zh:5ff5f9b791ddd7557e815449173f2db38d338e674d2d91800ac6e6d808de1d1d", + "zh:70206147104f4bf26ae67d730c995772f85bf23e28c2c2e7612c74f4dae3c46f", + "zh:75029676993accd6bef933c196b2fad51a9ec8a69a847dbbe96ec8ebf7926cdc", "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:7ed16ccd2049fa089616b98c0bd57219f407958f318f3c697843e2397ddf70df", - "zh:842696362c92bf2645eb85c739410fd51376be6c488733efae44f4ce688da50e", - "zh:8985129f2eccfd7f1841ce06f3bf2bbede6352ec9e9f926fbaa6b1a05313b326", - "zh:a5f0602d8ec991a5411ef42f872aa90f6347e93886ce67905c53cfea37278e05", - "zh:bf4ab82cbe5256dcef16949973bf6aa1a98c2c73a98d6a44ee7bc40809d002b8", - "zh:e70770be62aa70198fa899526d671643ff99eecf265bf1a50e798fc3480bd417", + 
"zh:7d48d5999fe1fcdae9295a7c3448ac1541f5a24c474bd82df6d4fa3732483f2b", + "zh:b766b38b027f0f84028244d1c2f990431a37d4fc3ac645962924554016507e77", + "zh:bfc7ad301dada204cf51c59d8bd6a9a87de5fddb42190b4d6ba157d6e08a1f10", + "zh:c902b527702a8c5e2c25a6637d07bbb1690cb6c1e63917a5f6dc460efd18d43f", + "zh:d68ae0e1070cf429c46586bc87580c3ed113f76241da2b6e4f1a8348126b3c46", + "zh:f4903fd89f7c92a346ae9e666c2d0b6884c4474ae109e9b4bd15e7efaa4bfc29", ] } -provider "registry.terraform.io/hashicorp/local" { - version = "2.4.0" - constraints = ">= 1.0.0" +provider "registry.terraform.io/hashicorp/null" { + version = "3.2.2" hashes = [ - "h1:R97FTYETo88sT2VHfMgkPU3lzCsZLunPftjSI5vfKe8=", - "h1:ZUEYUmm2t4vxwzxy1BvN1wL6SDWrDxfH7pxtzX8c6d0=", - "zh:53604cd29cb92538668fe09565c739358dc53ca56f9f11312b9d7de81e48fab9", - "zh:66a46e9c508716a1c98efbf793092f03d50049fa4a83cd6b2251e9a06aca2acf", - "zh:70a6f6a852dd83768d0778ce9817d81d4b3f073fab8fa570bff92dcb0824f732", + "h1:IMVAUHKoydFrlPrl9OzasDnw/8ntZFerCC9iXw1rXQY=", + "zh:3248aae6a2198f3ec8394218d05bd5e42be59f43a3a7c0b71c66ec0df08b69e7", + "zh:32b1aaa1c3013d33c245493f4a65465eab9436b454d250102729321a44c8ab9a", + "zh:38eff7e470acb48f66380a73a5c7cdd76cc9b9c9ba9a7249c7991488abe22fe3", + "zh:4c2f1faee67af104f5f9e711c4574ff4d298afaa8a420680b0cb55d7bbc65606", + "zh:544b33b757c0b954dbb87db83a5ad921edd61f02f1dc86c6186a5ea86465b546", + "zh:696cf785090e1e8cf1587499516b0494f47413b43cb99877ad97f5d0de3dc539", + "zh:6e301f34757b5d265ae44467d95306d61bef5e41930be1365f5a8dcf80f59452", "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:82a803f2f484c8b766e2e9c32343e9c89b91997b9f8d2697f9f3837f62926b35", - "zh:9708a4e40d6cc4b8afd1352e5186e6e1502f6ae599867c120967aebe9d90ed04", - "zh:973f65ce0d67c585f4ec250c1e634c9b22d9c4288b484ee2a871d7fa1e317406", - "zh:c8fa0f98f9316e4cfef082aa9b785ba16e36ff754d6aba8b456dab9500e671c6", - "zh:cfa5342a5f5188b20db246c73ac823918c189468e1382cb3c48a9c0c08fc5bf7", - "zh:e0e2b477c7e899c63b06b38cd8684a893d834d6d0b5e9b033cedc06dd7ffe9e2", - "zh:f62d7d05ea1ee566f732505200ab38d94315a4add27947a60afa29860822d3fc", - "zh:fa7ce69dde358e172bd719014ad637634bbdabc49363104f4fca759b4b73f2ce", + "zh:913a929070c819e59e94bb37a2a253c228f83921136ff4a7aa1a178c7cce5422", + "zh:aa9015926cd152425dbf86d1abdbc74bfe0e1ba3d26b3db35051d7b9ca9f72ae", + "zh:bb04798b016e1e1d49bcc76d62c53b56c88c63d6f2dfe38821afef17c416a0e1", + "zh:c23084e1b23577de22603cff752e59128d83cfecc2e6819edadd8cf7a10af11e", ] } -provider "registry.terraform.io/hashicorp/null" { - version = "3.2.1" - constraints = ">= 2.0.0" +provider "registry.terraform.io/hashicorp/random" { + version = "3.6.0" hashes = [ - "h1:FbGfc+muBsC17Ohy5g806iuI1hQc4SIexpYCrQHQd8w=", - "h1:ydA0/SNRVB1o95btfshvYsmxA+jZFRZcvKzZSB+4S1M=", - "zh:58ed64389620cc7b82f01332e27723856422820cfd302e304b5f6c3436fb9840", - "zh:62a5cc82c3b2ddef7ef3a6f2fedb7b9b3deff4ab7b414938b08e51d6e8be87cb", - "zh:63cff4de03af983175a7e37e52d4bd89d990be256b16b5c7f919aff5ad485aa5", - "zh:74cb22c6700e48486b7cabefa10b33b801dfcab56f1a6ac9b6624531f3d36ea3", + "h1:I8MBeauYA8J8yheLJ8oSMWqB0kovn16dF/wKZ1QTdkk=", + "zh:03360ed3ecd31e8c5dac9c95fe0858be50f3e9a0d0c654b5e504109c2159287d", + "zh:1c67ac51254ba2a2bb53a25e8ae7e4d076103483f55f39b426ec55e47d1fe211", + "zh:24a17bba7f6d679538ff51b3a2f378cedadede97af8a1db7dad4fd8d6d50f829", + "zh:30ffb297ffd1633175d6545d37c2217e2cef9545a6e03946e514c59c0859b77d", + "zh:454ce4b3dbc73e6775f2f6605d45cee6e16c3872a2e66a2c97993d6e5cbd7055", "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - 
"zh:79e553aff77f1cfa9012a2218b8238dd672ea5e1b2924775ac9ac24d2a75c238", - "zh:a1e06ddda0b5ac48f7e7c7d59e1ab5a4073bbcf876c73c0299e4610ed53859dc", - "zh:c37a97090f1a82222925d45d84483b2aa702ef7ab66532af6cbcfb567818b970", - "zh:e4453fbebf90c53ca3323a92e7ca0f9961427d2f0ce0d2b65523cc04d5d999c2", - "zh:e80a746921946d8b6761e77305b752ad188da60688cfd2059322875d363be5f5", - "zh:fbdb892d9822ed0e4cb60f2fedbdbb556e4da0d88d3b942ae963ed6ff091e48f", - "zh:fca01a623d90d0cad0843102f9b8b9fe0d3ff8244593bd817f126582b52dd694", + "zh:91df0a9fab329aff2ff4cf26797592eb7a3a90b4a0c04d64ce186654e0cc6e17", + "zh:aa57384b85622a9f7bfb5d4512ca88e61f22a9cea9f30febaa4c98c68ff0dc21", + "zh:c4a3e329ba786ffb6f2b694e1fd41d413a7010f3a53c20b432325a94fa71e839", + "zh:e2699bc9116447f96c53d55f2a00570f982e6f9935038c3810603572693712d0", + "zh:e747c0fd5d7684e5bfad8aa0ca441903f15ae7a98a737ff6aca24ba223207e2c", + "zh:f1ca75f417ce490368f047b63ec09fd003711ae48487fba90b4aba2ccf71920e", ] } From 0301e6d18474725c9d935c51fd212007280874ef Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Fri, 23 Aug 2024 13:41:21 -0500 Subject: [PATCH 84/97] Remove debug lines --- sm2a/Makefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/sm2a/Makefile b/sm2a/Makefile index 35560e33..62a04bd3 100644 --- a/sm2a/Makefile +++ b/sm2a/Makefile @@ -52,8 +52,6 @@ sm2a-deploy: sm2a-cp-dags @echo "Installing the deployment dependency" pip install -r deploy_requirements.txt echo "Deploying SM2A" - pwd - ls -al python scripts/generate_env_file.py --secret-id ${SECRET_NAME} --env-file ${ENV_FILE} ./scripts/deploy.sh ${ENV_FILE} <<< init ./scripts/deploy.sh ${ENV_FILE} <<< deploy From 23435b51b1bc9a8325b4e221a4a71cc8ade03028 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Tue, 27 Aug 2024 15:33:05 -0500 Subject: [PATCH 85/97] Making @echo consistent --- sm2a/Makefile | 15 +++++++-------- sm2a/docker-compose.yml | 4 ++-- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/sm2a/Makefile b/sm2a/Makefile index 62a04bd3..287f8812 100644 --- a/sm2a/Makefile +++ b/sm2a/Makefile @@ -28,16 +28,16 @@ count_down = \ all: sm2a-local-init sm2a-local-run -sm2a-local-run: sm2a-local-stop sm2a-cp-dags +sm2a-local-run: sm2a-local-stop @echo "Running SM2A" docker compose up -d $(call important_message, "Give the resources a minute to be healthy 💪") $(count_down) $(call info_message, "Please visit http://localhost:8080") - echo "username:airflow | password:airflow" - echo "To use local SM2A with AWS update ${SM2A_FOLDER}/sm2a-local-config/.env AWS credentials" + @echo "username:airflow | password:airflow" + @echo "To use local SM2A with AWS update ${SM2A_FOLDER}/sm2a-local-config/.env AWS credentials" -sm2a-local-init: sm2a-cp-dags +sm2a-local-init: cp sm2a-local-config/env_example sm2a-local-config/.env docker compose run --rm airflow-cli db init docker compose run --rm airflow-cli users create --email airflow@example.com --firstname airflow --lastname airflow --password airflow --username airflow --role Admin @@ -45,13 +45,12 @@ sm2a-local-init: sm2a-cp-dags sm2a-local-stop: docker compose down -sm2a-cp-dags: - cp -r ../dags . -sm2a-deploy: sm2a-cp-dags +sm2a-deploy: @echo "Installing the deployment dependency" pip install -r deploy_requirements.txt - echo "Deploying SM2A" + @echo "Deploying SM2A" + cp -r ../dags . 
python scripts/generate_env_file.py --secret-id ${SECRET_NAME} --env-file ${ENV_FILE} ./scripts/deploy.sh ${ENV_FILE} <<< init ./scripts/deploy.sh ${ENV_FILE} <<< deploy diff --git a/sm2a/docker-compose.yml b/sm2a/docker-compose.yml index c441b6c7..b84a99c0 100644 --- a/sm2a/docker-compose.yml +++ b/sm2a/docker-compose.yml @@ -22,7 +22,7 @@ x-airflow-common: # Gotcha: Even though we set this to "True" in airflow.cfg, an environment variable overrides it AIRFLOW__CORE__LOAD_EXAMPLES: false volumes: - - ./dags:/opt/airflow/dags + - ../dags:/opt/airflow/dags - ./plugins:/opt/airflow/plugins - ./sm2a-local-config/local_airflow.cfg:/opt/airflow/airflow.cfg - ./sm2a-local-config/local_webserver_config.py:/opt/airflow/webserver_config.py @@ -47,7 +47,7 @@ x-airflow-worker: environment: <<: *airflow-common-env volumes: - - ./dags:/opt/airflow/dags + - ../dags:/opt/airflow/dags - ./plugins:/opt/airflow/plugins - ./sm2a-local-config/local_airflow.cfg:/opt/airflow/airflow.cfg - ./infrastructure/configuration:/opt/airflow/configuration From f268f847af080a90657efe2dd2a27cc6e46f954b Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 28 Aug 2024 10:28:47 -0500 Subject: [PATCH 86/97] Fix installing the deployment dependencies --- sm2a/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sm2a/Makefile b/sm2a/Makefile index 287f8812..64d1d82c 100644 --- a/sm2a/Makefile +++ b/sm2a/Makefile @@ -48,7 +48,7 @@ sm2a-local-stop: sm2a-deploy: @echo "Installing the deployment dependency" - pip install -r deploy_requirements.txt + pip install -r ../deploy_requirements.txt @echo "Deploying SM2A" cp -r ../dags . python scripts/generate_env_file.py --secret-id ${SECRET_NAME} --env-file ${ENV_FILE} From 8000675239c3677df289c17a4f5c0584086e905b Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Wed, 28 Aug 2024 14:28:17 -0500 Subject: [PATCH 87/97] Adding extra targets --- Makefile | 6 ++++++ sm2a/Makefile | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/Makefile b/Makefile index 4e271d1e..9ff9a4ee 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,11 @@ clean all test + list +list: + $(MAKE) -C sm2a list + all: $(MAKE) -C sm2a all @@ -25,6 +29,8 @@ sm2a-local-stop: sm2a-deploy: $(MAKE) -C sm2a sm2a-deploy +sm2a-local-build: + $(MAKE) -C sm2a sm2a-local-build clean: $(MAKE) -C sm2a clean diff --git a/sm2a/Makefile b/sm2a/Makefile index 64d1d82c..9bccf0b4 100644 --- a/sm2a/Makefile +++ b/sm2a/Makefile @@ -22,6 +22,7 @@ count_down = \ .PHONY: clean all + list @@ -46,6 +47,11 @@ sm2a-local-stop: docker compose down +sm2a-local-build: + cp -r ../dags . 
+ docker compose build + rm -rf dags + sm2a-deploy: @echo "Installing the deployment dependency" pip install -r ../deploy_requirements.txt @@ -60,3 +66,6 @@ clean: sm2a-local-stop docker container prune -f docker image prune -f docker volume prune -f + +list: + @grep '^[^#[:space:]].*:' Makefile \ No newline at end of file From 50b8600c1ad7deefe4920b0b6a6c6ff9f405f0f2 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Thu, 29 Aug 2024 10:11:06 -0500 Subject: [PATCH 88/97] Add missing documentation to README --- Makefile | 8 +------- README.md | 10 +++++++--- sm2a/Makefile | 11 +++-------- sm2a/README.md | 33 +++++++++++++++++++++++++-------- 4 files changed, 36 insertions(+), 26 deletions(-) diff --git a/Makefile b/Makefile index 9ff9a4ee..5732fdb9 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,3 @@ - - .PHONY: clean all @@ -8,29 +6,25 @@ list: $(MAKE) -C sm2a list - all: $(MAKE) -C sm2a all - sm2a-local-run: $(MAKE) -C sm2a sm2a-local-run sm2a-local-init: $(MAKE) -C sm2a sm2a-local-init - sm2a-local-stop: $(MAKE) -C sm2a sm2a-local-stop - - sm2a-deploy: $(MAKE) -C sm2a sm2a-deploy sm2a-local-build: $(MAKE) -C sm2a sm2a-local-build + clean: $(MAKE) -C sm2a clean diff --git a/README.md b/README.md index 3f9857b4..d0d1bf4c 100644 --- a/README.md +++ b/README.md @@ -38,8 +38,12 @@ See [getting-started-install](https://docs.aws.amazon.com/cli/latest/userguide/g ### Setup a local SM2A development environment +1. Build services +```shell +make sm2a-local-build +``` -1. Initialize the metadata db +2. Initialize the metadata db ```shell make sm2a-local-init @@ -50,14 +54,14 @@ which will require you to reinitialize SM2A with `make sm2a-local-init` This will create an airflow username: `airflow` with password `airflow` -2. Start all services +3. Start all services ```shell make sm2a-local-run ``` This will start SM2A services and will be running on http://localhost:8080 -3. Stop all services +4. Stop all services ```shell make sm2a-local-stop diff --git a/sm2a/Makefile b/sm2a/Makefile index 9bccf0b4..e092e577 100644 --- a/sm2a/Makefile +++ b/sm2a/Makefile @@ -1,14 +1,12 @@ SECRET_NAME="" ENV_FILE=".env" - important_message = \ @echo "\033[0;31m$(1) \033[0m" info_message = \ @echo "\033[0;32m$(1) \033[0m" - count_down = \ @echo "Spinning up the system please wait..."; \ secs=40 ;\ @@ -18,17 +16,13 @@ count_down = \ : $$((secs--)); \ done; - .PHONY: clean all list - - all: sm2a-local-init sm2a-local-run - sm2a-local-run: sm2a-local-stop @echo "Running SM2A" docker compose up -d @@ -40,13 +34,14 @@ sm2a-local-run: sm2a-local-stop sm2a-local-init: cp sm2a-local-config/env_example sm2a-local-config/.env + cp -r ../dags . docker compose run --rm airflow-cli db init docker compose run --rm airflow-cli users create --email airflow@example.com --firstname airflow --lastname airflow --password airflow --username airflow --role Admin + rm -rf dags sm2a-local-stop: docker compose down - sm2a-local-build: cp -r ../dags . docker compose build @@ -68,4 +63,4 @@ clean: sm2a-local-stop docker volume prune -f list: - @grep '^[^#[:space:]].*:' Makefile \ No newline at end of file + @grep '^[^#[:space:]].*:' Makefile diff --git a/sm2a/README.md b/sm2a/README.md index b64bf5ae..56536384 100644 --- a/sm2a/README.md +++ b/sm2a/README.md @@ -80,17 +80,34 @@ This project uses Terraform modules to deploy Apache Airflow and related AWS res - [gitflow.yml](./.github/workflows/gitflow.yml) provides a structured way to manage the development, testing, and deployment of terraform modules. 
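The setup steps mirrored into `sm2a/README.md` below follow the same build / init / run / stop sequence described in the root `README.md` changes above. As a usage illustration, one way to confirm that the stack actually came up after `make sm2a-local-run` is to poll the webserver before logging in; this is only a sketch, assuming the compose stack publishes the Airflow webserver on `localhost:8080` (as the README states) and that Airflow's standard `/health` endpoint is reachable:

```shell
# Sketch: wait for the local SM2A webserver after `make sm2a-local-run`.
# Assumes the Airflow webserver is published on localhost:8080 and exposes
# the standard /health endpoint.
for attempt in $(seq 1 30); do
  if curl -fsS http://localhost:8080/health >/dev/null; then
    echo "Airflow webserver is up (login: airflow / airflow)"
    exit 0
  fi
  echo "Waiting for the webserver (attempt $attempt)..."
  sleep 5
done
echo "Webserver did not become healthy in time" >&2
exit 1
```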
For more info refer to [gitflow](https://github.com/NASA-IMPACT/csda-data-pipelines/blob/dev/GITFLOW.md) +### Setup a local SM2A development environment +1. Build services +```shell +make sm2a-local-build +``` -### Deployment via local machine -You can deploy SM2A from your local machine by running: -```bash -$python scripts/generate_env_file.py --secret-id $AWS_SECRET_NAME --env-file .env +2. Initialize the metadata db + +```shell +make sm2a-local-init ``` -Assuming you have access to [AWS Secrets Manager](https://aws.amazon.com/secrets-manager/) where the deployment variables are stored. +🚨 NOTE: This command is typically required only once at the beginning. +After running it, you generally do not need to run it again unless you run `make clean`, +which will require you to reinitialize SM2A with `make sm2a-local-init` -```bash -./scripts/deploy.sh .env <<< init -./scripts/deploy.sh .env <<< deploy +This will create an airflow username: `airflow` with password `airflow` + +3. Start all services + +```shell +make sm2a-local-run +``` +This will start SM2A services and will be running on http://localhost:8080 + +4. Stop all services + +```shell +make sm2a-local-stop ``` ### Login to UI From adada8325c6c3f57927e64b2a74ffb8b624e8d0f Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Thu, 29 Aug 2024 10:26:55 -0500 Subject: [PATCH 89/97] Fix sh syntax error by switching to bash --- sm2a/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/sm2a/Makefile b/sm2a/Makefile index e092e577..2e37423e 100644 --- a/sm2a/Makefile +++ b/sm2a/Makefile @@ -1,5 +1,6 @@ SECRET_NAME="" ENV_FILE=".env" +SHELL=/bin/bash important_message = \ @echo "\033[0;31m$(1) \033[0m" From 205a2ad95f64488a7c1980f4da7beb89d2422eb3 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Mon, 30 Sep 2024 10:21:41 -0500 Subject: [PATCH 90/97] Prep the repo for MWAA 2 SM2A migration --- dags/generate_dags.py | 18 +++++++++++++++--- sm2a/Makefile | 13 ++++++++----- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/dags/generate_dags.py b/dags/generate_dags.py index f20db9f3..9eeab714 100644 --- a/dags/generate_dags.py +++ b/dags/generate_dags.py @@ -11,6 +11,7 @@ def generate_dags(): import boto3 import json + from botocore.exceptions import ClientError, NoCredentialsError from pathlib import Path @@ -18,9 +19,20 @@ def generate_dags(): mwaa_stac_conf = Variable.get("MWAA_STACK_CONF", deserialize_json=True) bucket = mwaa_stac_conf["EVENT_BUCKET"] - client = boto3.client("s3") - response = client.list_objects_v2(Bucket=bucket, Prefix="collections/") - + try: + client = boto3.client("s3") + response = client.list_objects_v2(Bucket=bucket, Prefix="collections/") + except ClientError as e: + # Handle general AWS service errors (e.g., wrong bucket name) + print(f"ClientError: {e}") + return + except NoCredentialsError: + # Handle missing credentials + print("Credentials not found.") + return + except Exception as ex: + print(f"An unexpected error occurred: {ex}") + return for file_ in response.get("Contents", []): key = file_["Key"] if key.endswith("/"): diff --git a/sm2a/Makefile b/sm2a/Makefile index 2e37423e..e1fb2f96 100644 --- a/sm2a/Makefile +++ b/sm2a/Makefile @@ -1,6 +1,6 @@ SECRET_NAME="" ENV_FILE=".env" -SHELL=/bin/bash + important_message = \ @echo "\033[0;31m$(1) \033[0m" @@ -49,13 +49,16 @@ sm2a-local-build: rm -rf dags sm2a-deploy: +ifeq ($(GITHUB_ACTIONS_ENV),true) @echo "Installing the deployment dependency" - pip install -r ../deploy_requirements.txt + pip install -r deploy_requirements.txt @echo 
"Deploying SM2A" - cp -r ../dags . python scripts/generate_env_file.py --secret-id ${SECRET_NAME} --env-file ${ENV_FILE} - ./scripts/deploy.sh ${ENV_FILE} <<< init - ./scripts/deploy.sh ${ENV_FILE} <<< deploy + @bash -c './scripts/deploy.sh ${ENV_FILE} <<< init' + @bash -c './scripts/deploy.sh ${ENV_FILE} <<< deploy' +else + $(call important_message, "Wait a minute you are not github 😡") +endif clean: sm2a-local-stop @echo "Cleaning local env" From e2a9d139c0688eba639902597203aa106f042512 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Mon, 30 Sep 2024 10:29:22 -0500 Subject: [PATCH 91/97] Upgrade base SM2A module --- sm2a/infrastructure/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sm2a/infrastructure/main.tf b/sm2a/infrastructure/main.tf index 5c8f3940..f5c4e749 100644 --- a/sm2a/infrastructure/main.tf +++ b/sm2a/infrastructure/main.tf @@ -19,7 +19,7 @@ resource "random_password" "password" { module "sma-base" { - source = "https://github.com/NASA-IMPACT/self-managed-apache-airflow/releases/download/v1.1.3/self-managed-apache-airflow.zip" + source = "https://github.com/NASA-IMPACT/self-managed-apache-airflow/releases/download/v1.1.4/self-managed-apache-airflow.zip" airflow_db = var.airflow_db fernet_key = var.fernet_key prefix = var.prefix From e0e9f4183be8bf7642decb806e110a7b2d2b0c34 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Mon, 30 Sep 2024 10:43:22 -0500 Subject: [PATCH 92/97] Fix deploy --- sm2a/Makefile | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/sm2a/Makefile b/sm2a/Makefile index e1fb2f96..4b207636 100644 --- a/sm2a/Makefile +++ b/sm2a/Makefile @@ -1,6 +1,6 @@ SECRET_NAME="" ENV_FILE=".env" - +GITHUB_ACTIONS_ENV := $(shell test -n "$$GITHUB_ACTIONS" && echo "true" || echo "false") important_message = \ @echo "\033[0;31m$(1) \033[0m" @@ -49,16 +49,16 @@ sm2a-local-build: rm -rf dags sm2a-deploy: -ifeq ($(GITHUB_ACTIONS_ENV),true) - @echo "Installing the deployment dependency" - pip install -r deploy_requirements.txt - @echo "Deploying SM2A" - python scripts/generate_env_file.py --secret-id ${SECRET_NAME} --env-file ${ENV_FILE} - @bash -c './scripts/deploy.sh ${ENV_FILE} <<< init' - @bash -c './scripts/deploy.sh ${ENV_FILE} <<< deploy' -else - $(call important_message, "Wait a minute you are not github 😡") -endif + ifeq ($(GITHUB_ACTIONS_ENV),true) + @echo "Installing the deployment dependency" + pip install -r deploy_requirements.txt + @echo "Deploying SM2A" + python scripts/generate_env_file.py --secret-id ${SECRET_NAME} --env-file ${ENV_FILE} + @bash -c './scripts/deploy.sh ${ENV_FILE} <<< init' + @bash -c './scripts/deploy.sh ${ENV_FILE} <<< deploy' + else + $(call important_message, "Wait a minute you are not github 😡") + endif clean: sm2a-local-stop @echo "Cleaning local env" From a36a15b2c5ac9f80b7aa8fb3add534a268141802 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Mon, 30 Sep 2024 10:49:20 -0500 Subject: [PATCH 93/97] Fix deploy --- .github/actions/terraform-deploy-sm2a/action.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/actions/terraform-deploy-sm2a/action.yml b/.github/actions/terraform-deploy-sm2a/action.yml index 4f146a95..aad783da 100644 --- a/.github/actions/terraform-deploy-sm2a/action.yml +++ b/.github/actions/terraform-deploy-sm2a/action.yml @@ -39,6 +39,7 @@ runs: terraform_version: 1.3.3 - name: Deploy + working-directory: ${{ inputs.dir }} shell: bash env: AWS_DEFAULT_REGION: ${{ inputs.aws-region }} From 
3860b9b9622c19625c93671f0cbb62e30435a030 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Mon, 30 Sep 2024 10:54:05 -0500 Subject: [PATCH 94/97] Fix deploy --- sm2a/Makefile | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sm2a/Makefile b/sm2a/Makefile index 4b207636..bd86c5f0 100644 --- a/sm2a/Makefile +++ b/sm2a/Makefile @@ -49,16 +49,16 @@ sm2a-local-build: rm -rf dags sm2a-deploy: - ifeq ($(GITHUB_ACTIONS_ENV),true) - @echo "Installing the deployment dependency" - pip install -r deploy_requirements.txt - @echo "Deploying SM2A" - python scripts/generate_env_file.py --secret-id ${SECRET_NAME} --env-file ${ENV_FILE} - @bash -c './scripts/deploy.sh ${ENV_FILE} <<< init' - @bash -c './scripts/deploy.sh ${ENV_FILE} <<< deploy' - else - $(call important_message, "Wait a minute you are not github 😡") - endif +ifeq ($(GITHUB_ACTIONS_ENV),true) + @echo "Installing the deployment dependency" + pip install -r deploy_requirements.txt + @echo "Deploying SM2A" + python scripts/generate_env_file.py --secret-id ${SECRET_NAME} --env-file ${ENV_FILE} + @bash -c './scripts/deploy.sh ${ENV_FILE} <<< init' + @bash -c './scripts/deploy.sh ${ENV_FILE} <<< deploy' +else + $(call important_message, "Wait a minute you are not github 😡") +endif clean: sm2a-local-stop @echo "Cleaning local env" From 31a34801646a281e3cffb815bb36696f35eb5f9c Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Mon, 30 Sep 2024 11:00:58 -0500 Subject: [PATCH 95/97] Fixing install dependencies --- .github/actions/terraform-deploy-sm2a/action.yml | 4 ++++ sm2a/Makefile | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/actions/terraform-deploy-sm2a/action.yml b/.github/actions/terraform-deploy-sm2a/action.yml index aad783da..282b7fa8 100644 --- a/.github/actions/terraform-deploy-sm2a/action.yml +++ b/.github/actions/terraform-deploy-sm2a/action.yml @@ -38,6 +38,10 @@ runs: with: terraform_version: 1.3.3 + - name: Install python dependencies + shell: bash + run: pip install -r deploy_requirements.txt + - name: Deploy working-directory: ${{ inputs.dir }} shell: bash diff --git a/sm2a/Makefile b/sm2a/Makefile index bd86c5f0..3dd33b8a 100644 --- a/sm2a/Makefile +++ b/sm2a/Makefile @@ -51,7 +51,6 @@ sm2a-local-build: sm2a-deploy: ifeq ($(GITHUB_ACTIONS_ENV),true) @echo "Installing the deployment dependency" - pip install -r deploy_requirements.txt @echo "Deploying SM2A" python scripts/generate_env_file.py --secret-id ${SECRET_NAME} --env-file ${ENV_FILE} @bash -c './scripts/deploy.sh ${ENV_FILE} <<< init' From 58d12254b1136ae53f29ce02ed64382e9469e4b6 Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Mon, 30 Sep 2024 11:10:25 -0500 Subject: [PATCH 96/97] Fixing dependency deployment --- .github/actions/terraform-deploy-sm2a/action.yml | 4 ---- sm2a/Makefile | 2 ++ 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/actions/terraform-deploy-sm2a/action.yml b/.github/actions/terraform-deploy-sm2a/action.yml index 282b7fa8..aad783da 100644 --- a/.github/actions/terraform-deploy-sm2a/action.yml +++ b/.github/actions/terraform-deploy-sm2a/action.yml @@ -38,10 +38,6 @@ runs: with: terraform_version: 1.3.3 - - name: Install python dependencies - shell: bash - run: pip install -r deploy_requirements.txt - - name: Deploy working-directory: ${{ inputs.dir }} shell: bash diff --git a/sm2a/Makefile b/sm2a/Makefile index 3dd33b8a..57a4e020 100644 --- a/sm2a/Makefile +++ b/sm2a/Makefile @@ -51,7 +51,9 @@ sm2a-local-build: sm2a-deploy: ifeq 
($(GITHUB_ACTIONS_ENV),true) @echo "Installing the deployment dependency" + pip install -r ../deploy_requirements.txt @echo "Deploying SM2A" + cp -r ../dags . python scripts/generate_env_file.py --secret-id ${SECRET_NAME} --env-file ${ENV_FILE} @bash -c './scripts/deploy.sh ${ENV_FILE} <<< init' @bash -c './scripts/deploy.sh ${ENV_FILE} <<< deploy' From 12eab30ed2a87a7db8adbcf947a0abd842a4e3bf Mon Sep 17 00:00:00 2001 From: Abdelhak Marouane Date: Mon, 30 Sep 2024 16:36:31 -0500 Subject: [PATCH 97/97] Add project name --- sm2a/infrastructure/main.tf | 5 +++-- sm2a/infrastructure/variables.tf | 12 ++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/sm2a/infrastructure/main.tf b/sm2a/infrastructure/main.tf index f5c4e749..f7b6aa22 100644 --- a/sm2a/infrastructure/main.tf +++ b/sm2a/infrastructure/main.tf @@ -19,7 +19,8 @@ resource "random_password" "password" { module "sma-base" { - source = "https://github.com/NASA-IMPACT/self-managed-apache-airflow/releases/download/v1.1.4/self-managed-apache-airflow.zip" + source = "https://github.com/NASA-IMPACT/self-managed-apache-airflow/releases/download/v1.1.4/self-managed-apache-airflow.zip" + project = var.project_name airflow_db = var.airflow_db fernet_key = var.fernet_key prefix = var.prefix @@ -43,7 +44,7 @@ module "sma-base" { rds_allocated_storage = var.rds_configuration[var.stage].rds_allocated_storage rds_max_allocated_storage = var.rds_configuration[var.stage].rds_max_allocated_storage workers_logs_retention_days = var.workers_configuration[var.stage].workers_logs_retention_days - airflow_custom_variables = var.airflow_custom_variables + airflow_custom_variables = var.airflow_custom_variables extra_airflow_task_common_environment = [ { diff --git a/sm2a/infrastructure/variables.tf b/sm2a/infrastructure/variables.tf index 378958e8..e2b7f54a 100644 --- a/sm2a/infrastructure/variables.tf +++ b/sm2a/infrastructure/variables.tf @@ -181,9 +181,9 @@ variable "custom_worker_policy_statement" { { Effect = "Allow" Action = [ - "sts:AssumeRole", - "iam:PassRole", - "logs:GetLogEvents" + "sts:AssumeRole", + "iam:PassRole", + "logs:GetLogEvents" ] "Resource" : [ "*" @@ -197,6 +197,10 @@ variable "custom_worker_policy_statement" { variable "airflow_custom_variables" { description = "Airflow custom variables" type = map(string) - default = {} + default = {} +} +variable "project_name" { + type = string + default = "SM2A" }
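Two details in these last commits are worth spelling out. The `sm2a-deploy` guard is driven by `GITHUB_ACTIONS_ENV := $(shell test -n "$$GITHUB_ACTIONS" && echo "true" || echo "false")`, which leans on the `GITHUB_ACTIONS` environment variable that GitHub-hosted runners export; the sketch below reproduces that check outside of make so it can be verified locally (only the variable name comes from GitHub, the rest is illustrative). The new `project_name` variable is simply threaded through to the `sma-base` module as `project`, defaulting to `SM2A`.

```shell
#!/usr/bin/env bash
# Sketch of the CI guard behind the sm2a-deploy target: GitHub Actions
# runners export GITHUB_ACTIONS=true, so a non-empty check is enough to
# tell CI apart from a local shell.
if [ -n "${GITHUB_ACTIONS:-}" ]; then
  echo "Running under GitHub Actions, deployment steps may proceed."
else
  echo "Not running under GitHub Actions, refusing to deploy." >&2
  exit 1
fi
```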