Merge pull request #235 from NASA-IMPACT/dev
Promote dev to main
smohiudd authored Oct 17, 2024
2 parents 574a173 + 2af4a7c commit 7e3e134
Showing 48 changed files with 2,872 additions and 13 deletions.
57 changes: 57 additions & 0 deletions .github/actions/terraform-deploy-sm2a/action.yml
@@ -0,0 +1,57 @@
name: Deploy SM2A

inputs:
env_aws_secret_name:
required: true
type: string
env-file:
type: string
default: ".env"
dir:
required: false
type: string
default: "."
aws-region:
required: false
type: string
default: "us-west-2"
script_path:
type: string
backend_stack_name:
type: string
auth_stack_name:
type: string


runs:
using: "composite"

steps:
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
cache: "pip"

- name: Setup Terraform
uses: hashicorp/setup-terraform@v1
with:
terraform_version: 1.3.3

- name: Deploy
working-directory: ${{ inputs.dir }}
shell: bash
env:
AWS_DEFAULT_REGION: ${{ inputs.aws-region }}
AWS_REGION: ${{ inputs.aws-region }}
run: |
make sm2a-deploy ENV_FILE=${{ inputs.env-file }} SECRET_NAME=${{ inputs.env_aws_secret_name }}
- name: Output workflows API endpoint
id: output_sm2a_workflows_endpoint
shell: bash
working-directory: ${{ inputs.dir }}
run: |
cd ./infrastructure
terraform output -json Airflow_url > ${HOME}/output_sm2a_workflows_endpoint.json
1 change: 1 addition & 0 deletions .github/actions/terraform-deploy/action.yml
@@ -70,3 +70,4 @@ runs:
run: |
cd ./infrastructure
terraform output -json workflows_api > ${HOME}/terraform_outputs.json
14 changes: 12 additions & 2 deletions .github/workflows/cicd.yml
@@ -69,9 +69,19 @@ jobs:
with:
role-to-assume: ${{ secrets.DEPLOYMENT_ROLE_ARN }}
role-session-name: "veda-airflow-github-${{ needs.define-environment.outputs.env_name }}-deployment"
aws-region: "us-west-2"
aws-region: us-west-2

- name: Run deployment
- name: Run MWAA deployment
uses: "./.github/actions/terraform-deploy"
with:
env_aws_secret_name: ${{ secrets.ENV_AWS_SECRET_NAME }}

- name: Run SM2A deployment
# Flag to deploy SM2A
if: ${{ vars.DEPLOY_SM2A == 'true' }}
uses: "./.github/actions/terraform-deploy-sm2a"
with:
dir: ./sm2a
env_aws_secret_name: ${{ vars.SM2A_ENVS_DEPLOYMENT_SECRET_NAME }}
env-file: .env
aws-region: us-west-2
2 changes: 1 addition & 1 deletion .gitignore
@@ -14,6 +14,7 @@ __pycache__
# Ignore Terraform
.terraform
.env
.env_ghg_dev
terraform.tf
terraform.tfvars
# Ignore data files which are downloaded for local testing
@@ -43,6 +44,5 @@ cdk.context.json
env.sh

.hypothesis
Makefile
.env_sit
terraform.tfstate
32 changes: 32 additions & 0 deletions Makefile
@@ -0,0 +1,32 @@
.PHONY:
clean
all
test
list

list:
$(MAKE) -C sm2a list

all:
$(MAKE) -C sm2a all

sm2a-local-run:
$(MAKE) -C sm2a sm2a-local-run

sm2a-local-init:
$(MAKE) -C sm2a sm2a-local-init

sm2a-local-stop:
$(MAKE) -C sm2a sm2a-local-stop

sm2a-deploy:
$(MAKE) -C sm2a sm2a-deploy

sm2a-local-build:
$(MAKE) -C sm2a sm2a-local-build

clean:
$(MAKE) -C sm2a clean

test:
pytest tests
93 changes: 93 additions & 0 deletions README.md
@@ -35,6 +35,39 @@ See [terraform-getting-started](https://developer.hashicorp.com/terraform/tutori

See [getting-started-install](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)



### Set up a local SM2A development environment
1. Build services
```shell
make sm2a-local-build
```

2. Initialize the metadata db

```shell
make sm2a-local-init
```
🚨 NOTE: This command typically needs to be run only once, at the beginning.
After that, you do not need to run it again unless you run `make clean`,
which requires you to reinitialize SM2A with `make sm2a-local-init`.

This creates an Airflow user with username `airflow` and password `airflow`.

3. Start all services

```shell
make sm2a-local-run
```
This starts the SM2A services; the web UI will be available at http://localhost:8080

4. Stop all services

```shell
make sm2a-local-stop
```


## Deployment

This project uses Terraform modules to deploy Apache Airflow and related AWS resources using Amazon's managed Airflow provider.
@@ -78,6 +111,66 @@ Currently, the client id and domain of an existing Cognito user pool programmati
# Gitflow Model
[VEDA pipeline gitflow](./GITFLOW.md)
# Ingestion Pipeline Overview
This pipeline is designed to handle the ingestion of both vector and raster data. The ingestion can be performed using the `veda-discover` DAG. Below are examples of configurations for both vector and raster data.
## Ingestion Configuration
### Vector Data Ingestion
```json
{
"collection": "",
"bucket": "",
"prefix": "",
"filename_regex": ".*.csv$",
"id_template": "-{}",
"datetime_range": "",
"vector": true,
"x_possible": "longitude",
"y_possible": "latitude",
"source_projection": "EPSG:4326",
"target_projection": "EPSG:4326",
"extra_flags": ["-overwrite", "-lco", "OVERWRITE=YES"]
}
```

### Raster Data Ingestion
```json
{
"collection": "",
"bucket": "",
"prefix": "",
"filename_regex": ".*.tif$",
"datetime_range": "",
"assets": {
"co2": {
"title": "",
"description": ".",
"regex": ".*.tif$"
}
},
"id_regex": ".*_(.*).tif$",
"id_template": "-{}"
}

```
## Configuration Fields Description
- `collection`: The collection_id of the raster or vector data.
- `bucket`: The name of the S3 bucket where the data is stored.
- `prefix`: The location within the bucket where the files are to be discovered.
- `filename_regex`: A regular expression used to filter files based on their naming pattern.
- `id_template`: The format used to create item identifiers in the system.
- `vector`: Set to true to trigger the generic vector ingestion pipeline.
- `vector_eis`: Set to true to trigger the EIS Fire specific vector ingestion pipeline.
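To make the naming fields concrete, the sketch below (illustrative only, not code from this repository) shows how `filename_regex`, `id_regex`, and `id_template` could combine to turn a discovered S3 key into an item identifier; the key and template values are hypothetical.

```python
import re

# Hypothetical values, loosely following the raster example above
filename_regex = r".*.tif$"          # keep only .tif files
id_regex = r".*_(.*).tif$"           # capture the trailing part of the filename
id_template = "my-collection-{}"     # hypothetical identifier template

key = "prefix/co2_20240101.tif"      # hypothetical discovered S3 key

if re.match(filename_regex, key):
    match = re.match(id_regex, key)
    suffix = match.group(1) if match else key.rsplit("/", 1)[-1]
    item_id = id_template.format(suffix)
    print(item_id)  # my-collection-20240101
```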


## Pipeline Behaviour
Since this pipeline can ingest both raster and vector data, the configuration is adjusted accordingly. Setting `"vector": true` triggers the `generic_ingest_vector` DAG. If a `collection` is provided, the collection name is used as the table name for ingestion (the `append` extra flag is recommended in this case). When no `collection` is provided, a table name is generated by appending the ingested filename to the `id_template` (the `overwrite` extra flag is recommended).

Setting `"vector_eis": true` triggers the EIS Fire-specific `ingest_vector` DAG. If neither flag is set, raster ingestion is triggered, with a configuration like the raster ingestion example above.

# License
This project is licensed under **Apache 2**, see the [LICENSE](LICENSE) file for more details.

5 changes: 5 additions & 0 deletions dags/example_dag.py
@@ -43,13 +43,16 @@ def push_to_cmr_task(text):
schedule_interval=None,
tags=["example"],
) as dag:

start = EmptyOperator(task_id="start", dag=dag)

discover_from_cmr = PythonOperator(
task_id="discover_from_cmr",
python_callable=discover_from_cmr_task,
op_kwargs={"text": "Discover from CMR"},
dag=dag,
)

discover_from_s3 = PythonOperator(
task_id="discover_from_s3",
python_callable=discover_from_s3_task,
@@ -79,7 +82,9 @@ def push_to_cmr_task(text):
)

end = EmptyOperator(task_id="end", dag=dag)

start >> discover_from_cmr

start >> discover_from_s3 >> move_files_to_maap_store
(
[discover_from_cmr, move_files_to_maap_store]
18 changes: 15 additions & 3 deletions dags/generate_dags.py
@@ -11,16 +11,28 @@
def generate_dags():
import boto3
import json
from botocore.exceptions import ClientError, NoCredentialsError

from pathlib import Path


mwaa_stac_conf = Variable.get("MWAA_STACK_CONF", deserialize_json=True)
bucket = mwaa_stac_conf["EVENT_BUCKET"]

client = boto3.client("s3")
response = client.list_objects_v2(Bucket=bucket, Prefix="collections/")

try:
client = boto3.client("s3")
response = client.list_objects_v2(Bucket=bucket, Prefix="collections/")
except ClientError as e:
# Handle general AWS service errors (e.g., wrong bucket name)
print(f"ClientError: {e}")
return
except NoCredentialsError:
# Handle missing credentials
print("Credentials not found.")
return
except Exception as ex:
print(f"An unexpected error occurred: {ex}")
return
for file_ in response.get("Contents", []):
key = file_["Key"]
if key.endswith("/"):
11 changes: 10 additions & 1 deletion dags/veda_data_pipeline/groups/discover_group.py
@@ -71,6 +71,8 @@ def vector_raster_choice(ti):
dynamic_group_id = ti.task_id.split(".")[0]

if payload.get("vector"):
return f"{dynamic_group_id}.parallel_run_process_generic_vectors"
if payload.get("vector_eis"):
return f"{dynamic_group_id}.parallel_run_process_vectors"
return f"{dynamic_group_id}.parallel_run_process_rasters"

@@ -101,10 +103,17 @@ def subdag_discover(event={}):
python_callable=get_files_to_process,
)

run_process_generic_vector = TriggerMultiDagRunOperator(
task_id="parallel_run_process_generic_vectors",
trigger_dag_id="veda_generic_ingest_vector",
python_callable=get_files_to_process,
)

# extra no-op, needed to run in dynamic mapping context
end_discover = EmptyOperator(task_id="end_discover", trigger_rule=TriggerRule.NONE_FAILED_MIN_ONE_SUCCESS,)

discover_from_s3 >> raster_vector_branching >> [run_process_raster, run_process_vector]
discover_from_s3 >> raster_vector_branching >> [run_process_raster, run_process_vector, run_process_generic_vector]
run_process_raster >> end_discover
run_process_vector >> end_discover
run_process_generic_vector >> end_discover

