From 106802547d4bb69d807e7cdba806fa33f3127b36 Mon Sep 17 00:00:00 2001
From: Pankaj Koti
Date: Mon, 30 Sep 2024 12:02:13 +0530
Subject: [PATCH] Add docs

---
 dev/dags/simple_dag_async.py             |  2 ++
 docs/configuration/cosmos-conf.rst       | 31 ++++++++++++++++++++
 docs/getting_started/execution-modes.rst | 66 ++++++++++++++++++++++++++++++-
 3 files changed, 98 insertions(+), 1 deletion(-)

diff --git a/dev/dags/simple_dag_async.py b/dev/dags/simple_dag_async.py
index cf5f57e02..787461236 100644
--- a/dev/dags/simple_dag_async.py
+++ b/dev/dags/simple_dag_async.py
@@ -18,6 +18,7 @@
     ),
 )
 
+# [START airflow_async_execution_mode_example]
 simple_dag_async = DbtDag(
     # dbt/cosmos-specific parameters
     project_config=ProjectConfig(
@@ -35,3 +36,4 @@
     tags=["simple"],
     operator_args={"install_deps": True},
 )
+# [END airflow_async_execution_mode_example]
diff --git a/docs/configuration/cosmos-conf.rst b/docs/configuration/cosmos-conf.rst
index 95a4adcad..8b4e9fed9 100644
--- a/docs/configuration/cosmos-conf.rst
+++ b/docs/configuration/cosmos-conf.rst
@@ -126,6 +126,37 @@ This page lists all available Airflow configurations that affect ``astronomer-co
     - Default: ``None``
     - Environment Variable: ``AIRFLOW__COSMOS__REMOTE_CACHE_DIR_CONN_ID``
 
+.. _remote_target_path:
+
+`remote_target_path`_:
+    (Introduced in Cosmos 1.7.0) The path to the remote target directory, to which Cosmos copies and stores the
+    files that dbt generates in the project's local ``target`` directory. The value can use any of the schemes
+    supported by the
+    `Airflow Object Store <https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/objectstorage.html>`_
+    feature introduced in Airflow 2.8.0 (e.g. ``s3://your_s3_bucket/target_dir/``, ``gs://your_gs_bucket/target_dir/``,
+    ``abfs://your_azure_container/target_dir``, etc.)
+
+    - Default: ``None``
+    - Environment Variable: ``AIRFLOW__COSMOS__REMOTE_TARGET_PATH``
+
+.. _remote_target_path_conn_id:
+
+`remote_target_path_conn_id`_:
+    (Introduced in Cosmos 1.7.0) The connection ID used to access the remote target path. If this is not set, the
+    default Airflow connection ID for the path's scheme will be used.
+
+    - Default: ``None``
+    - Environment Variable: ``AIRFLOW__COSMOS__REMOTE_TARGET_PATH_CONN_ID``
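+
+For example, assuming a hypothetical S3 bucket named ``my-bucket`` and an Airflow connection ID ``my_aws_conn``
+(placeholders to substitute with your own), these settings could be configured in the ``[cosmos]`` section of your
+``airflow.cfg`` as follows:
+
+.. code-block:: ini
+
+    [cosmos]
+    remote_target_path = s3://my-bucket/target_dir/
+    remote_target_path_conn_id = my_aws_conn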
 
 [openlineage]
 ~~~~~~~~~~~~~
diff --git a/docs/getting_started/execution-modes.rst b/docs/getting_started/execution-modes.rst
index f80c3da9d..ecd60c0b0 100644
--- a/docs/getting_started/execution-modes.rst
+++ b/docs/getting_started/execution-modes.rst
@@ -12,12 +12,13 @@ Cosmos can run ``dbt`` commands using five different approaches, called ``execut
 5. **aws_eks**: Run ``dbt`` commands from AWS EKS Pods managed by Cosmos (requires a pre-existing Docker image)
 6. **azure_container_instance**: Run ``dbt`` commands from Azure Container Instances managed by Cosmos (requires a pre-existing Docker image)
 7. **gcp_cloud_run_job**: Run ``dbt`` commands from GCP Cloud Run Job instances managed by Cosmos (requires a pre-existing Docker image)
+8. **airflow_async**: (Introduced in Cosmos 1.7.0) Run the dbt resources from your dbt project asynchronously by submitting the compiled SQL to Apache Airflow's `Deferrable operators <https://airflow.apache.org/docs/apache-airflow/stable/authoring-and-scheduling/deferring.html>`__
 
 The choice of the ``execution mode`` can vary based on each user's needs and concerns. For more details, check each
 execution mode described below.
 
 .. list-table:: Execution Modes Comparison
-   :widths: 20 20 20 20 20
+   :widths: 25 25 25 25
    :header-rows: 1
 
    * - Execution Mode
@@ -52,6 +53,10 @@ The choice of the ``execution mode`` can vary based on each user's needs and con
      - Slow
      - High
      - No
+   * - Airflow Async
+     - Medium
+     - None
+     - Yes
 
 Local
 -----
@@ -238,6 +243,65 @@ Each task will create a new Cloud Run Job execution, giving full isolation. The
         },
     )
 
+Airflow Async
+-------------
+
+.. versionadded:: 1.7.0
+
+(**Experimental**)
+
+The ``airflow_async`` execution mode runs the dbt resources from your dbt project using Apache Airflow's
+`Deferrable operators <https://airflow.apache.org/docs/apache-airflow/stable/authoring-and-scheduling/deferring.html>`__.
+This execution mode can be preferred when you have long-running resources and want to run them asynchronously by
+leveraging Airflow's deferrable operators. It can potentially yield higher task throughput, as more dbt nodes
+run in parallel without blocking Airflow's worker slots.
+
+In this mode, Cosmos adds a new operator, ``DbtCompileAirflowAsyncOperator``, as a root task in the DAG. This task runs
+the ``dbt compile`` command on your dbt project, which writes the compiled SQL files to the project's target directory.
+As part of the same task run, these compiled SQL files are then uploaded to the remote path set via the
+:ref:`remote_target_path` configuration. The subsequent tasks in the DAG then fetch the compiled SQL files from the
+remote path and run them asynchronously using, e.g., the ``DbtRunAirflowAsyncOperator``.
+You may observe that the compile task takes a bit longer due to the latency of uploading the compiled SQL files;
+however, this is a one-time overhead, and the subsequent tasks run asynchronously via Airflow's deferrable
+operators, each supplied with its compiled SQL.
+
+Note that the ``airflow_async`` execution mode currently has the following limitations and is released as Experimental:
+
+1. Only dbt resources of type ``model`` are run asynchronously using Airflow deferrable operators. All other resource types are executed synchronously using dbt commands, as in the ``local`` execution mode.
+2. Only BigQuery is supported as the target database. If a profile target other than BigQuery is specified, Cosmos will raise an error stating that the target database is not supported with this execution mode.
+
+Example DAG:
+
+.. literalinclude:: ../../dev/dags/simple_dag_async.py
+   :language: python
+   :start-after: [START airflow_async_execution_mode_example]
+   :end-before: [END airflow_async_execution_mode_example]
+
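+The example DAG above ships with the Cosmos repository. As a minimal sketch, enabling this mode on an existing
+``DbtDag`` comes down to setting the execution mode via ``ExecutionConfig`` (the paths and names below are
+placeholders, and the ``profiles.yml`` must target BigQuery, per the limitations above):
+
+.. code-block:: python
+
+    from datetime import datetime
+
+    from cosmos import DbtDag, ExecutionConfig, ProfileConfig, ProjectConfig
+    from cosmos.constants import ExecutionMode
+
+    my_async_dag = DbtDag(
+        project_config=ProjectConfig("/path/to/my_dbt_project"),
+        profile_config=ProfileConfig(
+            profile_name="my_profile",
+            target_name="dev",
+            profiles_yml_filepath="/path/to/profiles.yml",
+        ),
+        # Compiled SQL statements are executed via Airflow's deferrable operators
+        execution_config=ExecutionConfig(execution_mode=ExecutionMode.AIRFLOW_ASYNC),
+        schedule_interval="@daily",
+        start_date=datetime(2024, 1, 1),
+        dag_id="my_async_dag",
+    )
+
 .. _invocation_modes:
 Invocation Modes
 ================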