Add/generic vector ingests #220

Merged · 39 commits · Aug 14, 2024

Commits
b93d79f
Add generic vector ingest
Aug 6, 2024
474f42b
Resolve arm role bug
Aug 6, 2024
bdceef1
fix smart open bug
Aug 6, 2024
991e764
logging few things
Aug 6, 2024
02faaac
passing params
Aug 7, 2024
3ac1f40
Try alternate download
Aug 7, 2024
ff307d6
Add test tables
Aug 7, 2024
8406751
add extra flags
Aug 8, 2024
3e062a3
Add comments
Aug 8, 2024
86884e8
Refactor layer naming and add IAM role assumption
Aug 9, 2024
f93a640
merge with new changes in dev
Aug 12, 2024
5624a5a
Add generic vector ingest
Aug 12, 2024
66921c5
revert back changes
Aug 12, 2024
a9665f9
Add generic vector ingest ecs
Aug 12, 2024
c685426
Add dag for generic vector ingest
Aug 12, 2024
487fe15
Modify to take generic vector ingest
Aug 12, 2024
2b6ef7e
modify family in task definition
Aug 13, 2024
d86bcd0
Fix log group
Aug 13, 2024
2acf30a
debug
Aug 13, 2024
0f2dc13
Add vector_ecs_conf
Aug 13, 2024
30ee61c
change to vector subnet
Aug 13, 2024
62e3b86
handle empty collection field
Aug 13, 2024
205db09
revert to vector ecs conf
Aug 14, 2024
ede6200
add vector vpc conf
Aug 14, 2024
0e7cdd8
Add generic ingest to branching choices
Aug 14, 2024
893c7b4
Add to dag flow
Aug 14, 2024
9f70321
Adjust spaces
Aug 14, 2024
9258aec
Adjust space
Aug 14, 2024
1944a7a
remove space
Aug 14, 2024
a6039be
Update with dag info
Aug 14, 2024
2a01479
Update readme
Aug 14, 2024
f28150b
address suggestions
Aug 14, 2024
3d5b32d
Add space
Aug 14, 2024
6230a33
update readme
Aug 14, 2024
8ee6129
Update content
Aug 14, 2024
d22fef5
Add pipeline info
Aug 14, 2024
3d6dcd6
deleted the other readme
Aug 14, 2024
0005a32
Modify readme
Aug 14, 2024
8bcfbad
Modify readme
Aug 14, 2024
11 changes: 10 additions & 1 deletion dags/veda_data_pipeline/groups/discover_group.py
@@ -71,6 +71,8 @@ def vector_raster_choice(ti):
dynamic_group_id = ti.task_id.split(".")[0]

if payload.get("vector"):
return f"{dynamic_group_id}.parallel_run_process_generic_vectors"
if payload.get("vector_eis"):
return f"{dynamic_group_id}.parallel_run_process_vectors"
return f"{dynamic_group_id}.parallel_run_process_rasters"

@@ -101,10 +103,17 @@ def subdag_discover(event={}):
python_callable=get_files_to_process,
)

run_process_generic_vector = TriggerMultiDagRunOperator(
task_id="parallel_run_process_generic_vectors",
trigger_dag_id="veda_generic_ingest_vector",
python_callable=get_files_to_process,
)

# extra no-op, needed to run in dynamic mapping context
end_discover = EmptyOperator(task_id="end_discover", trigger_rule=TriggerRule.NONE_FAILED_MIN_ONE_SUCCESS,)

discover_from_s3 >> raster_vector_branching >> [run_process_raster, run_process_vector]
    discover_from_s3 >> raster_vector_branching >> [run_process_raster, run_process_vector, run_process_generic_vector]
run_process_raster >> end_discover
run_process_vector >> end_discover
run_process_generic_vector >> end_discover

108 changes: 108 additions & 0 deletions dags/veda_data_pipeline/veda_process_generic_vector_pipeline.py
@@ -0,0 +1,108 @@
import pendulum
from airflow import DAG
from airflow.models.variable import Variable
from airflow.providers.amazon.aws.operators.ecs import EcsRunTaskOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.trigger_rule import TriggerRule

from datetime import timedelta

dag_doc_md = """
### Build and submit STAC
#### Purpose
This DAG is designed to be triggered by `veda_discover`, but it can also be triggered manually or through the Airflow API.

#### Notes
- This DAG can run with a configuration like the following: <br>
```json
{
"collection": "geoglam",
"prefix": "geoglam/",
"bucket": "veda-data-store-staging",
"filename_regex": "^(.*).tif$",
"discovery": "s3",
"datetime_range": "month",
"upload": false,
"cogify": false,
"discovered": 33,
"payload": "s3://veda-uah-sit-mwaa-853558080719/events/geoglam/s3_discover_output_6c46b57a-7474-41fe-977a-19d164531cdc.json"
}
```
- [Supports linking to external content](https://github.com/NASA-IMPACT/veda-data-pipelines)
"""

template_dag_run_conf = {
    "collection": "<collection_name>",
    "prefix": "<prefix>/",
    "bucket": "<bucket>",
    "filename_regex": "<filename_regex>",
    "discovery": "<s3|cmr>",
    "datetime_range": "<month|day>",
    "upload": "<false|true>",
    "cogify": "<false|true>",
    "payload": "<s3_uri_event_payload>",
}
dag_args = {
"start_date": pendulum.today("UTC").add(days=-1),
"schedule_interval": None,
"catchup": False,
"doc_md": dag_doc_md,
}

with DAG(dag_id="veda_generic_ingest_vector", params=template_dag_run_conf, **dag_args) as dag:
start = DummyOperator(task_id="Start", dag=dag)
end = DummyOperator(task_id="End", trigger_rule=TriggerRule.ONE_SUCCESS, dag=dag)

mwaa_stack_conf = Variable.get(
"MWAA_STACK_CONF", default_var={}, deserialize_json=True
)
vector_ecs_conf = Variable.get("VECTOR_ECS_CONF", deserialize_json=True)

generic_ingest_vector = EcsRunTaskOperator(
task_id="generic_ingest_vector",
trigger_rule=TriggerRule.NONE_FAILED,
cluster=f"{mwaa_stack_conf.get('PREFIX')}-cluster",
task_definition=f"{mwaa_stack_conf.get('PREFIX')}-generic-vector-tasks",
launch_type="FARGATE",
do_xcom_push=True,
execution_timeout=timedelta(minutes=120),
overrides={
"containerOverrides": [
{
"name": f"{mwaa_stack_conf.get('PREFIX')}-veda-generic_vector_ingest",
"command": [
"/var/lang/bin/python",
"handler.py",
"--payload",
"{}".format("{{ task_instance.dag_run.conf }}"),
],
"environment": [
{
"name": "EXTERNAL_ROLE_ARN",
"value": Variable.get(
"ASSUME_ROLE_READ_ARN", default_var=""
),
},
{
"name": "AWS_REGION",
"value": mwaa_stack_conf.get("AWS_REGION"),
},
{
"name": "VECTOR_SECRET_NAME",
"value": Variable.get("VECTOR_SECRET_NAME"),
},
],
},
],
},
network_configuration={
"awsvpcConfiguration": {
"securityGroups": vector_ecs_conf.get("VECTOR_SECURITY_GROUP") + mwaa_stack_conf.get("SECURITYGROUPS"),
"subnets": vector_ecs_conf.get("VECTOR_SUBNETS"),
},
},
awslogs_group=mwaa_stack_conf.get("LOG_GROUP_NAME"),
awslogs_stream_prefix=f"ecs/{mwaa_stack_conf.get('PREFIX')}-veda-generic-vector_ingest", # prefix with container name
)

start >> generic_ingest_vector >> end
2 changes: 1 addition & 1 deletion dags/veda_data_pipeline/veda_process_vector_pipeline.py
@@ -80,7 +80,7 @@
{
"name": "EXTERNAL_ROLE_ARN",
"value": Variable.get(
"ASSUME_ROLE_READ_ARN", default_var=None
"ASSUME_ROLE_READ_ARN", default_var=""
),
},
{
10 changes: 10 additions & 0 deletions docker_tasks/generic_vector_ingest/Dockerfile
@@ -0,0 +1,10 @@
FROM --platform=linux/amd64 ghcr.io/lambgeo/lambda-gdal:3.6-python3.9
RUN yum update -y

WORKDIR /app
ENTRYPOINT []
RUN pip install --upgrade pip
COPY requirements.txt requirements.txt
RUN pip install -r requirements.txt

COPY handler.py handler.py
204 changes: 204 additions & 0 deletions docker_tasks/generic_vector_ingest/handler.py
@@ -0,0 +1,204 @@
import base64
from argparse import ArgumentParser
import boto3
import os
import ast
import subprocess
import json
import smart_open
from urllib.parse import urlparse

def download_file(file_uri: str):
"""Downloads file from s3

Args:
file_uri (str): s3 URL of the file to be downloaded

Returns:
target_filepath (str): filepath of the downloaded file
"""
role_arn = os.environ.get("EXTERNAL_ROLE_ARN")
kwargs = assume_role(role_arn=role_arn) if role_arn else {}

s3 = boto3.client("s3", **kwargs)
url_parse = urlparse(file_uri)

bucket = url_parse.netloc
path = url_parse.path[1:]
filename = url_parse.path.split("/")[-1]
target_filepath = os.path.join("/tmp", filename)

s3.download_file(bucket, path, target_filepath)

s3.close()
return target_filepath

def assume_role(role_arn, session_name="veda-data-pipelines_vector-ingest"):
"""Assumes an AWS IAM role and returns temporary credentials.

Args:
role_arn (str): The ARN of the role to assume.
session_name (str): A name for the assumed session.

Returns:
dict: Temporary AWS credentials.
"""
sts = boto3.client("sts")
credentials = sts.assume_role(
RoleArn=role_arn,
RoleSessionName=session_name,
)
creds = credentials["Credentials"]
return {
"aws_access_key_id": creds["AccessKeyId"],
"aws_secret_access_key": creds.get("SecretAccessKey"),
"aws_session_token": creds.get("SessionToken"),
}


def get_connection_string(secret: dict, as_uri: bool = False) -> str:
if as_uri:
return f"postgresql://{secret['username']}:{secret['password']}@{secret['host']}:5432/{secret['dbname']}"
else:
#return f"PG:host=localhost port=5432 dbname=postgis user=username password=password"
return f"PG:host={secret['host']} dbname={secret['dbname']} user={secret['username']} password={secret['password']}"


def get_secret(secret_name: str) -> dict:
"""Retrieve secrets from AWS Secrets Manager

Args:
secret_name (str): name of aws secrets manager secret containing database connection secrets

Returns:
secrets (dict): decrypted secrets in dict
"""

# Create a Secrets Manager client
session = boto3.session.Session(region_name=os.environ.get("AWS_REGION"))
client = session.client(service_name="secretsmanager")

    # Exceptions raised by the 'GetSecretValue' call are propagated to the caller.
    # See https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html

get_secret_value_response = client.get_secret_value(SecretId=secret_name)

# Decrypts secret using the associated KMS key.
# Depending on whether the secret is a string or binary, one of these fields will be populated.
if "SecretString" in get_secret_value_response:
return json.loads(get_secret_value_response["SecretString"])
else:
return json.loads(base64.b64decode(get_secret_value_response["SecretBinary"]))



def load_to_featuresdb(
    filename: str,
    layer_name: str,
    x_possible: str = "longitude",
    y_possible: str = "latitude",
    source_projection: str = "EPSG:4326",
    target_projection: str = "EPSG:4326",
    extra_flags: list = ["-overwrite", "-progress"],
):
secret_name = os.environ.get("VECTOR_SECRET_NAME")
con_secrets = get_secret(secret_name)
connection = get_connection_string(con_secrets)

print(f"running ogr2ogr import for collection/file: {layer_name}")
options = [
"ogr2ogr",
"-f",
"PostgreSQL",
connection,
filename,
"-oo",
f"X_POSSIBLE_NAMES={x_possible}",
"-oo",
f"Y_POSSIBLE_NAMES={y_possible}",
"-nln",
layer_name,
"-s_srs",
source_projection,
"-t_srs",
target_projection,
*extra_flags
]
out = subprocess.run(
options,
check=False,
capture_output=True,
)

if out.stderr:
error_description = f"Error: {out.stderr}"
print(error_description)
return {"status": "failure", "reason": error_description}

return {"status": "success"}

def handler():
print("------Vector ingestion for Features API started------")
parser = ArgumentParser(
prog="vector_ingest",
description="Ingest Vector",
epilog="Running the code as ECS task",
)
parser.add_argument(
"--payload", dest="payload", help="event passed to stac_handler function"
)
args = parser.parse_args()

# Extracting the payload passed from upstream task/dag or conf
payload_event = ast.literal_eval(args.payload)
s3_event = payload_event.pop("payload")

# Extracting configs for ingestion
x_possible = payload_event["x_possible"]
y_possible = payload_event["y_possible"]
source_projection = payload_event["source_projection"]
target_projection = payload_event["target_projection"]
extra_flags = payload_event["extra_flags"]

layer_name = payload_event["collection"]
collection_not_provided = layer_name == ""


# Read the json to extract the discovered file paths
with smart_open.open(s3_event, "r") as _file:
s3_event_read = _file.read()

event_received = json.loads(s3_event_read)
s3_objects = event_received["objects"]
status = list()

for s3_object in s3_objects:
href = s3_object["assets"]["default"]["href"]
filename = href.split("/")[-1].split(".")[0]

# Use id template when collection is not provided in the conf
if collection_not_provided:
layer_name = payload_event["id_template"].format(filename)

downloaded_filepath = download_file(href)
print(f"[ COLLECTION ]: {layer_name}, [ DOWNLOAD FILEPATH ]: {downloaded_filepath}")

        coll_status = load_to_featuresdb(
            downloaded_filepath,
            layer_name,
            x_possible,
            y_possible,
            source_projection,
            target_projection,
            extra_flags,
        )
status.append(coll_status)

# Delete file after ingest
os.remove(downloaded_filepath)

if coll_status["status"] != "success":
# Bubble exception so Airflow shows it as a failure
raise Exception(coll_status["reason"])

print("------Overall Status------\n", f"Done for {len(status)} discovered files\n",status)


if __name__ == "__main__":
handler()
7 changes: 7 additions & 0 deletions docker_tasks/generic_vector_ingest/requirements.txt
@@ -0,0 +1,7 @@
smart-open==6.3.0
psycopg2-binary==2.9.9
requests==2.30.0
boto3==1.26.129
GeoAlchemy2==0.14.2
geopandas==0.14.0
SQLAlchemy==2.0.23
6 changes: 6 additions & 0 deletions infrastructure/main.tf
@@ -32,6 +32,12 @@ module "mwaa" {
docker_file_path = "${path.module}/../docker_tasks/vector_ingest/Dockerfile"
ecs_container_folder_path = "${path.module}/../docker_tasks/vector_ingest"
ecr_repo_name = "${var.prefix}-veda-vector_ingest"
},
{
handler_file_path = "${path.module}/../docker_tasks/generic_vector_ingest/handler.py"
docker_file_path = "${path.module}/../docker_tasks/generic_vector_ingest/Dockerfile"
ecs_container_folder_path = "${path.module}/../docker_tasks/generic_vector_ingest"
ecr_repo_name = "${var.prefix}-veda-generic_vector_ingest"
}
]
}