Merge pull request #235 from NASA-IMPACT/dev
Promote dev to main
smohiudd authored Oct 17, 2024
2 parents 574a173 + 2af4a7c commit 7e3e134
Showing 48 changed files with 2,872 additions and 13 deletions.
57 changes: 57 additions & 0 deletions .github/actions/terraform-deploy-sm2a/action.yml
@@ -0,0 +1,57 @@
name: Deploy SM2A

inputs:
env_aws_secret_name:
required: true
type: string
env-file:
type: string
default: ".env"
dir:
required: false
type: string
default: "."
aws-region:
required: false
type: string
default: "us-west-2"
script_path:
type: string
backend_stack_name:
type: string
auth_stack_name:
type: string


runs:
using: "composite"

steps:
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
cache: "pip"

- name: Setup Terraform
uses: hashicorp/setup-terraform@v1
with:
terraform_version: 1.3.3

- name: Deploy
working-directory: ${{ inputs.dir }}
shell: bash
env:
AWS_DEFAULT_REGION: ${{ inputs.aws-region }}
AWS_REGION: ${{ inputs.aws-region }}
run: |
make sm2a-deploy ENV_FILE=${{ inputs.env-file }} SECRET_NAME=${{ inputs.env_aws_secret_name }}
- name: Output workflows API endpoint
id: output_sm2a_workflows_endpoint
shell: bash
working-directory: ${{ inputs.dir }}
run: |
cd ./infrastructure
terraform output -json Airflow_url > ${HOME}/output_sm2a_workflows_endpoint.json
1 change: 1 addition & 0 deletions .github/actions/terraform-deploy/action.yml
@@ -70,3 +70,4 @@ runs:
run: |
cd ./infrastructure
terraform output -json workflows_api > ${HOME}/terraform_outputs.json
14 changes: 12 additions & 2 deletions .github/workflows/cicd.yml
@@ -69,9 +69,19 @@ jobs:
with:
role-to-assume: ${{ secrets.DEPLOYMENT_ROLE_ARN }}
role-session-name: "veda-airflow-github-${{ needs.define-environment.outputs.env_name }}-deployment"
aws-region: "us-west-2"
aws-region: us-west-2

- name: Run deployment
- name: Run MWAA deployment
uses: "./.github/actions/terraform-deploy"
with:
env_aws_secret_name: ${{ secrets.ENV_AWS_SECRET_NAME }}

- name: Run SM2A deployment
# Flag to deploy SM2A
if: ${{ vars.DEPLOY_SM2A == 'true' }}
uses: "./.github/actions/terraform-deploy-sm2a"
with:
dir: ./sm2a
env_aws_secret_name: ${{ vars.SM2A_ENVS_DEPLOYMENT_SECRET_NAME }}
env-file: .env
aws-region: us-west-2
2 changes: 1 addition & 1 deletion .gitignore
@@ -14,6 +14,7 @@ __pycache__
# Ignore Terraform
.terraform
.env
.env_ghg_dev
terraform.tf
terraform.tfvars
# Ignore data files which are downloaded for local testing
@@ -43,6 +44,5 @@ cdk.context.json
env.sh

.hypothesis
Makefile
.env_sit
terraform.tfstate
32 changes: 32 additions & 0 deletions Makefile
@@ -0,0 +1,32 @@
.PHONY:
clean
all
test
list

list:
$(MAKE) -C sm2a list

all:
$(MAKE) -C sm2a all

sm2a-local-run:
$(MAKE) -C sm2a sm2a-local-run

sm2a-local-init:
$(MAKE) -C sm2a sm2a-local-init

sm2a-local-stop:
$(MAKE) -C sm2a sm2a-local-stop

sm2a-deploy:
$(MAKE) -C sm2a sm2a-deploy

sm2a-local-build:
$(MAKE) -C sm2a sm2a-local-build

clean:
$(MAKE) -C sm2a clean

test:
pytest tests
93 changes: 93 additions & 0 deletions README.md
@@ -35,6 +35,39 @@ See [terraform-getting-started](https://developer.hashicorp.com/terraform/tutori

See [getting-started-install](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)



### Set up a local SM2A development environment
1. Build services
```shell
make sm2a-local-build
```

2. Initialize the metadata db

```shell
make sm2a-local-init
```
🚨 NOTE: This command typically needs to be run only once, at the beginning.
After that, you do not need to run it again unless you run `make clean`,
which requires you to reinitialize SM2A with `make sm2a-local-init`.

This creates an Airflow user with username `airflow` and password `airflow`.

3. Start all services

```shell
make sm2a-local-run
```
This starts the SM2A services; the web UI will be available at http://localhost:8080

4. Stop all services

```shell
make sm2a-local-stop
```


## Deployment

This project uses Terraform modules to deploy Apache Airflow and related AWS resources using Amazon's managed Airflow provider.
@@ -78,6 +111,66 @@ Currently, the client id and domain of an existing Cognito user pool programmati
# Gitflow Model
[VEDA pipeline gitflow](./GITFLOW.md)
# Ingestion Pipeline Overview
This pipeline is designed to handle the ingestion of both vector and raster data. The ingestion can be performed using the `veda-discover` DAG. Below are examples of configurations for both vector and raster data.
## Ingestion Configuration
### Vector Data Ingestion
```json
{
"collection": "",
"bucket": "",
"prefix": "",
"filename_regex": ".*.csv$",
"id_template": "-{}",
"datetime_range": "",
"vector": true,
"x_possible": "longitude",
"y_possible": "latitude",
"source_projection": "EPSG:4326",
"target_projection": "EPSG:4326",
"extra_flags": ["-overwrite", "-lco", "OVERWRITE=YES"]
}
```

### Raster Data Ingestion
```json
{
"collection": "",
"bucket": "",
"prefix": "",
"filename_regex": ".*.tif$",
"datetime_range": "",
"assets": {
"co2": {
"title": "",
"description": ".",
"regex": ".*.tif$"
}
},
"id_regex": ".*_(.*).tif$",
"id_template": "-{}"
}

```
## Configuration Fields Description
- `collection`: The collection_id of the raster or vector data.
- `bucket`: The name of the S3 bucket where the data is stored.
- `prefix`: The location within the bucket where the files are to be discovered.
- `filename_regex`: A regular expression used to filter files based on their naming pattern.
- `id_template`: The format used to create item identifiers in the system.
- `vector`: Set to true to trigger the generic vector ingestion pipeline.
- `vector_eis`: Set to true to trigger the EIS Fire specific vector ingestion pipeline.
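To make the naming fields concrete, the sketch below (illustrative only, not code from this repository) shows how `filename_regex`, `id_regex`, and `id_template` could combine to turn a discovered S3 key into an item identifier; the key and template values are hypothetical.

```python
import re

# Hypothetical values, loosely following the raster example above
filename_regex = r".*.tif$"          # keep only .tif files
id_regex = r".*_(.*).tif$"           # capture the trailing part of the filename
id_template = "my-collection-{}"     # hypothetical identifier template

key = "prefix/co2_20240101.tif"      # hypothetical discovered S3 key

if re.match(filename_regex, key):
    match = re.match(id_regex, key)
    suffix = match.group(1) if match else key.rsplit("/", 1)[-1]
    item_id = id_template.format(suffix)
    print(item_id)  # my-collection-20240101
```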


## Pipeline Behaviour
Since this pipeline can ingest both raster and vector data, the configuration is adjusted accordingly. Setting `"vector": true` triggers the `generic_ingest_vector` DAG. If a `collection` is provided, the collection name is used as the table name for ingestion (the `append` extra flag is recommended in this case). When no `collection` is provided, a table name is generated by appending the ingested filename to the `id_template` (the `overwrite` extra flag is recommended).

Setting `"vector_eis": true` triggers the EIS Fire-specific `ingest_vector` DAG. If neither flag is set, raster ingestion is triggered, with a configuration like the raster ingestion example above.

# License
This project is licensed under **Apache 2**, see the [LICENSE](LICENSE) file for more details.

5 changes: 5 additions & 0 deletions dags/example_dag.py
@@ -43,13 +43,16 @@ def push_to_cmr_task(text):
schedule_interval=None,
tags=["example"],
) as dag:

start = EmptyOperator(task_id="start", dag=dag)

discover_from_cmr = PythonOperator(
task_id="discover_from_cmr",
python_callable=discover_from_cmr_task,
op_kwargs={"text": "Discover from CMR"},
dag=dag,
)

discover_from_s3 = PythonOperator(
task_id="discover_from_s3",
python_callable=discover_from_s3_task,
@@ -79,7 +82,9 @@ def push_to_cmr_task(text):
)

end = EmptyOperator(task_id="end", dag=dag)

start >> discover_from_cmr

start >> discover_from_s3 >> move_files_to_maap_store
(
[discover_from_cmr, move_files_to_maap_store]
18 changes: 15 additions & 3 deletions dags/generate_dags.py
@@ -11,16 +11,28 @@
def generate_dags():
import boto3
import json
from botocore.exceptions import ClientError, NoCredentialsError

from pathlib import Path


mwaa_stac_conf = Variable.get("MWAA_STACK_CONF", deserialize_json=True)
bucket = mwaa_stac_conf["EVENT_BUCKET"]

client = boto3.client("s3")
response = client.list_objects_v2(Bucket=bucket, Prefix="collections/")

try:
client = boto3.client("s3")
response = client.list_objects_v2(Bucket=bucket, Prefix="collections/")
except ClientError as e:
# Handle general AWS service errors (e.g., wrong bucket name)
print(f"ClientError: {e}")
return
except NoCredentialsError:
# Handle missing credentials
print("Credentials not found.")
return
except Exception as ex:
print(f"An unexpected error occurred: {ex}")
return
for file_ in response.get("Contents", []):
key = file_["Key"]
if key.endswith("/"):
11 changes: 10 additions & 1 deletion dags/veda_data_pipeline/groups/discover_group.py
@@ -71,6 +71,8 @@ def vector_raster_choice(ti):
dynamic_group_id = ti.task_id.split(".")[0]

if payload.get("vector"):
return f"{dynamic_group_id}.parallel_run_process_generic_vectors"
if payload.get("vector_eis"):
return f"{dynamic_group_id}.parallel_run_process_vectors"
return f"{dynamic_group_id}.parallel_run_process_rasters"

@@ -101,10 +103,17 @@ def subdag_discover(event={}):
python_callable=get_files_to_process,
)

run_process_generic_vector = TriggerMultiDagRunOperator(
task_id="parallel_run_process_generic_vectors",
trigger_dag_id="veda_generic_ingest_vector",
python_callable=get_files_to_process,
)

# extra no-op, needed to run in dynamic mapping context
end_discover = EmptyOperator(task_id="end_discover", trigger_rule=TriggerRule.NONE_FAILED_MIN_ONE_SUCCESS,)

discover_from_s3 >> raster_vector_branching >> [run_process_raster, run_process_vector]
discover_from_s3 >> raster_vector_branching >> [run_process_raster, run_process_vector, run_process_generic_vector]
run_process_raster >> end_discover
run_process_vector >> end_discover
run_process_generic_vector >> end_discover

