Generate avro files from bq tables #507

Merged · 13 commits · Oct 8, 2024

Changes from 6 commits
5 changes: 4 additions & 1 deletion airflow_variables_dev.json
@@ -1,5 +1,6 @@
 {
   "api_key_path": "/home/airflow/gcs/data/apiKey.json",
+  "avro_gcs_bucket": "test_dune_bucket_sdf",
   "bq_dataset": "test_crypto_stellar_internal",
   "bq_dataset_audit_log": "audit_log",
   "bq_project": "test-hubble-319619",
@@ -339,6 +340,7 @@
"build_export_task": 840,
"build_gcs_to_bq_task": 960,
"build_time_task": 480,
"build_bq_generate_avro_job": 600,
"cleanup_metadata": 60,
"create_sandbox": 2400,
"current_state": 720,
@@ -373,7 +375,8 @@
"build_delete_data_task": 180,
"build_export_task": 420,
"build_gcs_to_bq_task": 300,
"build_time_task": 480
"build_time_task": 480,
"build_bq_generate_avro_job": 600
},
"txmeta_datastore_path": "sdf-ledger-close-meta/ledgers",
"use_captive_core": "False",
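Note: these timeout and SLA keys are looked up by function name in the new task factory (build_bq_generate_avro_job_task.py, below), so the "build_bq_generate_avro_job" entry has to exist in both maps. A minimal sketch of that lookup, using the standard Airflow Variable API:

from airflow.models import Variable

# With the dev variables above, this returns 600 (seconds).
timeout_seconds = Variable.get("task_timeout", deserialize_json=True)["build_bq_generate_avro_job"]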
67 changes: 67 additions & 0 deletions dags/generate_avro_files_dag.py
@@ -0,0 +1,67 @@
from datetime import datetime

from airflow import DAG
from kubernetes.client import models as k8s
from stellar_etl_airflow.build_bq_generate_avro_job_task import (
    build_bq_generate_avro_job,
)
from stellar_etl_airflow.build_cross_dependency_task import build_cross_deps
from stellar_etl_airflow.default import (
    alert_sla_miss,
    get_default_dag_args,
    init_sentry,
)

init_sentry()

dag = DAG(
    "generate_avro_files",
    default_args=get_default_dag_args(),
    start_date=datetime(2024, 10, 1, 0, 0),
Review comment (Contributor Author):
AVRO files for < 10/1/24 are already generated

Reply (@chowbao, Contributor Author, Oct 7, 2024):
Hmm we can. I wonder if that would be confusing for Dune though. Like 10/1 wouldn't be the full month but instead there would be 10/1 and then another folder for 10/8, 10/9, 10/10, etc...

Any concerns about catching the last 7 days up?

Nope

    description="This DAG generates AVRO files from BQ tables",
    schedule_interval="0 * * * *",  # Runs every hour
    user_defined_filters={
        "container_resources": lambda s: k8s.V1ResourceRequirements(requests=s),
    },
    max_active_runs=5,
    catchup=True,
    sla_miss_callback=alert_sla_miss,
)

public_project = "{{ var.value.public_project }}"
public_dataset = "{{ var.value.public_dataset }}"
gcs_bucket = "{{ var.value.avro_gcs_bucket }}"

# Wait on ingestion DAGs
wait_on_history_table = build_cross_deps(
    dag, "wait_on_ledgers_txs", "history_table_export"
)
wait_on_state_table = build_cross_deps(dag, "wait_on_state_table", "state_table_export")

# Generate AVRO files
avro_tables = [
    "accounts",
    "contract_data",
    "history_contract_events",
    "history_ledgers",
    "history_trades",
    "history_transactions",
    "liquidity_pools",
    "offers",
    "trust_lines",
    "ttl",
    # "history_effects",
    # "history_operations",
]

for table in avro_tables:
    task = build_bq_generate_avro_job(
        dag=dag,
        project=public_project,
        dataset=public_dataset,
        table=table,
        gcs_bucket=gcs_bucket,
    )

    wait_on_history_table >> task
    wait_on_state_table >> task
17 changes: 17 additions & 0 deletions dags/queries/generate_avro/accounts.sql
@@ -0,0 +1,17 @@
export data
options (
    uri = '{uri}',
    format = 'avro',
    overwrite=true
)
as (
    select
        * except(sequence_ledger, batch_id, batch_insert_ts, batch_run_date),
        sequence_ledger as account_sequence_last_modified_ledger
    from {project_id}.{dataset_id}.accounts
    where true
        and closed_at >= '{batch_run_date}'
        and closed_at < '{next_batch_run_date}'
    order by closed_at asc
)
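For illustration, a sketch of how the task factory (further down in this PR) fills this template with str.format; every value below is hypothetical, and the date strings are shown in one plausible rendering of batch_run_date_as_datetime_string:

params = {
    "project_id": "my-project",  # hypothetical
    "dataset_id": "crypto_stellar",  # hypothetical
    "batch_run_date": "2024-10-01T00:00:00",
    "next_batch_run_date": "2024-10-01T01:00:00",
    "uri": "gs://my-avro-bucket/avro/accounts/2024/10/1/0:0:0/*.avro",  # hypothetical bucket
}

with open("dags/queries/generate_avro/accounts.sql") as f:
    print(f.read().format(**params))  # the EXPORT DATA statement sent to BigQuery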
16 changes: 16 additions & 0 deletions dags/queries/generate_avro/contract_data.sql
@@ -0,0 +1,16 @@
export data
options (
    uri = '{uri}',
    format = 'avro',
    overwrite=true
)
as (
    select
        * except(batch_id, batch_insert_ts, batch_run_date)
    from {project_id}.{dataset_id}.contract_data
    where true
        and closed_at >= '{batch_run_date}'
        and closed_at < '{next_batch_run_date}'
    order by closed_at asc
)
16 changes: 16 additions & 0 deletions dags/queries/generate_avro/history_contract_events.sql
@@ -0,0 +1,16 @@
export data
options (
    uri = '{uri}',
    format = 'avro',
    overwrite=true
)
as (
    select
        * except(batch_id, batch_insert_ts, batch_run_date)
    from {project_id}.{dataset_id}.history_contract_events
    where true
        and closed_at >= '{batch_run_date}'
        and closed_at < '{next_batch_run_date}'
    order by closed_at asc
)
18 changes: 18 additions & 0 deletions dags/queries/generate_avro/history_effects.sql
@@ -0,0 +1,18 @@
export data
options (
    uri = '{uri}',
    format = 'avro',
    overwrite=true
)
as (
    select
        * except(details, batch_id, batch_insert_ts, batch_run_date),
        details.* except(predicate)
    from {project_id}.{dataset_id}.history_effects
    where true
        and closed_at >= '{batch_run_date}'
        and closed_at < '{next_batch_run_date}'
    order by closed_at asc
)
16 changes: 16 additions & 0 deletions dags/queries/generate_avro/history_ledgers.sql
@@ -0,0 +1,16 @@
export data
options (
    uri = '{uri}',
    format = 'avro',
    overwrite=true
)
as (
    select
        * except(batch_id, batch_insert_ts, batch_run_date)
    from {project_id}.{dataset_id}.history_ledgers
    where true
        and closed_at >= '{batch_run_date}'
        and closed_at < '{next_batch_run_date}'
    order by closed_at asc
)
19 changes: 19 additions & 0 deletions dags/queries/generate_avro/history_operations.sql
@@ -0,0 +1,19 @@
export data
options (
    uri = '{uri}',
    format = 'avro',
    overwrite=true
)
as (
    select
        * except(details, details_json, batch_id, batch_insert_ts, batch_run_date),
        details.* except(claimants, type),
        details.type as details_type
Review comment (Contributor):
is this soroban_operation_type? Do you think we need to rename this column to be clearer? Might be worth discussing with Andre

Reply (@chowbao, Contributor Author, Oct 7, 2024):
> is this soroban_operation_type?

Yes it is

> Do you think we need to rename this column to be clearer?

Yeah we can. I'll regenerate the files and mention it to Andre

    from {project_id}.{dataset_id}.history_operations
    where true
        and closed_at >= '{batch_run_date}'
        and closed_at < '{next_batch_run_date}'
    order by closed_at asc
)
17 changes: 17 additions & 0 deletions dags/queries/generate_avro/history_trades.sql
@@ -0,0 +1,17 @@
export data
options (
    uri = '{uri}',
    format = 'avro',
    overwrite=true
)
as (
    select
        * except(ledger_closed_at, batch_id, batch_insert_ts, batch_run_date),
        ledger_closed_at as closed_at
    from {project_id}.{dataset_id}.history_trades
    where true
        and closed_at >= '{batch_run_date}'
        and closed_at < '{next_batch_run_date}'
Review comment (Contributor):
closed_at is not on the table, i don't think this will run

Reply (Contributor Author):
Updated to ledger_closed_at

    order by closed_at asc
)
16 changes: 16 additions & 0 deletions dags/queries/generate_avro/history_transactions.sql
@@ -0,0 +1,16 @@
export data
options (
    uri = '{uri}',
    format = 'avro',
    overwrite=true
)
as (
    select
        * except(batch_id, batch_insert_ts, batch_run_date)
    from {project_id}.{dataset_id}.history_transactions
    where true
        and closed_at >= '{batch_run_date}'
        and closed_at < '{next_batch_run_date}'
    order by closed_at asc
)
16 changes: 16 additions & 0 deletions dags/queries/generate_avro/liquidity_pools.sql
@@ -0,0 +1,16 @@
export data
options (
    uri = '{uri}',
    format = 'avro',
    overwrite=true
)
as (
    select
        * except(batch_id, batch_insert_ts, batch_run_date)
    from {project_id}.{dataset_id}.liquidity_pools
    where true
        and closed_at >= '{batch_run_date}'
        and closed_at < '{next_batch_run_date}'
    order by closed_at asc
)
16 changes: 16 additions & 0 deletions dags/queries/generate_avro/offers.sql
@@ -0,0 +1,16 @@
export data
options (
    uri = '{uri}',
    format = 'avro',
    overwrite=true
)
as (
    select
        * except(batch_id, batch_insert_ts, batch_run_date)
    from {project_id}.{dataset_id}.offers
    where true
        and closed_at >= '{batch_run_date}'
        and closed_at < '{next_batch_run_date}'
    order by closed_at asc
)
16 changes: 16 additions & 0 deletions dags/queries/generate_avro/trust_lines.sql
@@ -0,0 +1,16 @@
export data
options (
    uri = '{uri}',
    format = 'avro',
    overwrite=true
)
as (
    select
        * except(batch_id, batch_insert_ts, batch_run_date)
    from {project_id}.{dataset_id}.trust_lines
    where true
        and closed_at >= '{batch_run_date}'
        and closed_at < '{next_batch_run_date}'
    order by closed_at asc
)
16 changes: 16 additions & 0 deletions dags/queries/generate_avro/ttl.sql
@@ -0,0 +1,16 @@
export data
options (
    uri = '{uri}',
    format = 'avro',
    overwrite=true
)
as (
    select
        * except(batch_id, batch_insert_ts, batch_run_date)
    from {project_id}.{dataset_id}.ttl
    where true
        and closed_at >= '{batch_run_date}'
        and closed_at < '{next_batch_run_date}'
    order by closed_at asc
)
66 changes: 66 additions & 0 deletions dags/stellar_etl_airflow/build_bq_generate_avro_job_task.py
@@ -0,0 +1,66 @@
import os
from datetime import timedelta

from airflow.models import Variable
from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator
from stellar_etl_airflow import macros
from stellar_etl_airflow.build_bq_insert_job_task import file_to_string
from stellar_etl_airflow.default import alert_after_max_retries


def get_query_filepath(query_name):
    root = os.path.dirname(os.path.dirname(__file__))
    return os.path.join(root, f"queries/generate_avro/{query_name}.sql")


def build_bq_generate_avro_job(
    dag,
    project,
    dataset,
    table,
    gcs_bucket,
):
    query_path = get_query_filepath(table)
    query = file_to_string(query_path)
    batch_run_date = "{{ batch_run_date_as_datetime_string(dag, data_interval_start) }}"
    prev_batch_run_date = (
        "{{ batch_run_date_as_datetime_string(dag, prev_data_interval_start_success) }}"
    )
    next_batch_run_date = (
        "{{ batch_run_date_as_datetime_string(dag, data_interval_end) }}"
    )
    uri_datetime = (
        "{{ batch_run_date_as_directory_string(dag, data_interval_start) }}"
    )
    uri = f"gs://{gcs_bucket}/avro/{table}/{uri_datetime}/*.avro"
    sql_params = {
        "project_id": project,
        "dataset_id": dataset,
        "batch_run_date": batch_run_date,
        "prev_batch_run_date": prev_batch_run_date,
Review comment (Contributor):
Is this used in the SQL query? I didn't see the parameter anywhere in the SQLs

Reply (Contributor Author):
prev_batch_run_date is not. I'll remove it

"next_batch_run_date": next_batch_run_date,
"uri": uri,
}
query = query.format(**sql_params)
configuration = {
"query": {
"query": query,
"useLegacySql": False,
}
}

return BigQueryInsertJobOperator(
task_id=f"generate_avro_{table}",
execution_timeout=timedelta(
seconds=Variable.get("task_timeout", deserialize_json=True)[
build_bq_generate_avro_job.__name__
]
),
on_failure_callback=alert_after_max_retries,
configuration=configuration,
sla=timedelta(
seconds=Variable.get("task_sla", deserialize_json=True)[
build_bq_generate_avro_job.__name__
]
),
)
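Worth noting: the query is templated twice. str.format fills {project_id}, {dataset_id}, {uri}, and the date placeholders at DAG parse time, but the date values it inserts are themselves Jinja strings that Airflow renders at run time (configuration is a templated field of BigQueryInsertJobOperator). A quick local sanity check, hypothetical and not part of the PR, that renders every template with dummy params to catch stray placeholders:

import glob

dummy = {
    "project_id": "p",
    "dataset_id": "d",
    "batch_run_date": "2024-10-01T00:00:00",
    "prev_batch_run_date": "2024-09-30T23:00:00",
    "next_batch_run_date": "2024-10-01T01:00:00",
    "uri": "gs://bucket/avro/t/2024/10/1/0:0:0/*.avro",
}

for path in sorted(glob.glob("dags/queries/generate_avro/*.sql")):
    with open(path) as f:
        f.read().format(**dummy)  # raises KeyError on any unknown placeholder
    print(f"{path}: ok")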
4 changes: 4 additions & 0 deletions dags/stellar_etl_airflow/macros.py
@@ -9,3 +9,7 @@ def batch_run_date_as_datetime_string(dag, start_time):

def get_batch_id():
    return "{}-{}".format("{{ run_id }}", "{{ params.alias }}")

def batch_run_date_as_directory_string(dag, start_time):
    time = subtract_data_interval(dag, start_time)
    return f"{time.year}/{time.month}/{time.day}/{time.hour}:{time.minute}:{time.second}"