From 53dbb38ad567fb26fc87d37afc8a77a64e5c2492 Mon Sep 17 00:00:00 2001 From: Tatiana Al-Chueyr Date: Tue, 29 Aug 2023 15:22:07 +0100 Subject: [PATCH] Refactor dbt ls to run from tempdir with symbolic links (#488) Since Cosmos 1.0, `load_method.DBT_LS` is the default dbt project parsing method, unless the user gives a manifest. Using the original dbt project path has been a source of issues when that path is Read-Only. This issue was faced when running commands that generate `{project-dir}/target/` and `{project-dir}/logs/`, which was solved as part of #414. This issue is particularly problematic if we want to run `dbt deps` from the original project directory since dbt 1.6 saves adaptors to `{project_dir}/dbt_packages` unless specified in the user's `dbt_project.yml`. To our knowledge, dbt currently does not allow users to define this directory via flags or environment variables, as discussed in #481. This change aims to solve these issues, by creating a temporary directory and creating symbolic links to the original directory. Finally, during the development of this task, it was observed that when running dbt ls in a project with `packages.yml`, dbt raises a 'Compilation Error'. Since dbt may raise other errors in stdout, this PR captures "Errors" more generically - making it more evident potential issues to the end-users. --- .github/workflows/test.yml | 14 +++++++++++-- cosmos/constants.py | 2 ++ cosmos/dbt/graph.py | 43 ++++++++++++++++++++++++-------------- tests/dbt/test_graph.py | 1 + 4 files changed, 42 insertions(+), 18 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 806cbd06b..55bff7384 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -115,6 +115,12 @@ jobs: DATABRICKS_WAREHOUSE_ID: ${{ secrets.DATABRICKS_WAREHOUSE_ID }} DATABRICKS_CLUSTER_ID: ${{ secrets.DATABRICKS_CLUSTER_ID }} COSMOS_CONN_POSTGRES_PASSWORD: ${{ secrets.COSMOS_CONN_POSTGRES_PASSWORD }} + POSTGRES_HOST: localhost + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: postgres + POSTGRES_SCHEMA: public + POSTGRES_PORT: 5432 - name: Upload coverage to Github uses: actions/upload-artifact@v2 @@ -122,8 +128,6 @@ jobs: name: coverage-integration-test-${{ matrix.python-version }}-${{ matrix.airflow-version }} path: .coverage - - Run-Integration-Tests-Expensive: needs: Authorize runs-on: ubuntu-latest @@ -180,6 +184,12 @@ jobs: DATABRICKS_WAREHOUSE_ID: ${{ secrets.DATABRICKS_WAREHOUSE_ID }} DATABRICKS_CLUSTER_ID: ${{ secrets.DATABRICKS_CLUSTER_ID }} COSMOS_CONN_POSTGRES_PASSWORD: ${{ secrets.COSMOS_CONN_POSTGRES_PASSWORD }} + POSTGRES_HOST: localhost + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: postgres + POSTGRES_SCHEMA: public + POSTGRES_PORT: 5432 - name: Upload coverage to Github uses: actions/upload-artifact@v2 diff --git a/cosmos/constants.py b/cosmos/constants.py index 0f1cd6780..03553f8c8 100644 --- a/cosmos/constants.py +++ b/cosmos/constants.py @@ -7,7 +7,9 @@ DEFAULT_DBT_PROFILE_NAME = "cosmos_profile" DEFAULT_DBT_TARGET_NAME = "cosmos_target" DBT_LOG_PATH_ENVVAR = "DBT_LOG_PATH" +DBT_LOG_DIR_NAME = "logs" DBT_TARGET_PATH_ENVVAR = "DBT_TARGET_PATH" +DBT_TARGET_DIR_NAME = "target" DBT_LOG_FILENAME = "dbt.log" diff --git a/cosmos/dbt/graph.py b/cosmos/dbt/graph.py index 28219aef0..644930496 100644 --- a/cosmos/dbt/graph.py +++ b/cosmos/dbt/graph.py @@ -17,6 +17,8 @@ DBT_LOG_FILENAME, DBT_LOG_PATH_ENVVAR, DBT_TARGET_PATH_ENVVAR, + DBT_LOG_DIR_NAME, + DBT_TARGET_DIR_NAME, ) from cosmos.dbt.executable import get_system_dbt from cosmos.dbt.parser.project import DbtProject as LegacyDbtProject @@ -151,27 +153,36 @@ def load_via_dbt_ls(self) -> None: with self.profile_config.ensure_profile(use_mock_values=True) as profile_values: (profile_path, env_vars) = profile_values - command.extend( - [ - "--project-dir", - str(self.project.dir), - "--profiles-dir", - str(profile_path.parent), - "--profile", - self.profile_config.profile_name, - "--target", - self.profile_config.target_name, - ] - ) - env = os.environ.copy() env.update(env_vars) with tempfile.TemporaryDirectory() as tmpdir: + logger.info("Creating symlinks from %s to `%s`", self.project.dir, tmpdir) + # We create symbolic links to the original directory files and directories. + # This allows us to run the dbt command from within the temporary directory, outputting any necessary + # artifact and also allow us to run `dbt deps` + tmpdir_path = Path(tmpdir) + ignore_paths = (DBT_LOG_DIR_NAME, DBT_TARGET_DIR_NAME, "profiles.yml") + for child_name in os.listdir(self.project.dir): + if child_name not in ignore_paths: + os.symlink(self.project.dir / child_name, tmpdir_path / child_name) + + command.extend( + [ + "--project-dir", + str(tmpdir), + "--profiles-dir", + str(profile_path.parent), + "--profile", + self.profile_config.profile_name, + "--target", + self.profile_config.target_name, + ] + ) logger.info("Running command: `%s`", " ".join(command)) logger.info("Environment variable keys: %s", env.keys()) - log_dir = Path(env.get(DBT_LOG_PATH_ENVVAR) or tmpdir) - target_dir = Path(env.get(DBT_TARGET_PATH_ENVVAR) or tmpdir) + log_dir = Path(env.get(DBT_LOG_PATH_ENVVAR) or tmpdir_path / DBT_LOG_DIR_NAME) + target_dir = Path(env.get(DBT_TARGET_PATH_ENVVAR) or tmpdir_path / DBT_TARGET_DIR_NAME) env[DBT_LOG_PATH_ENVVAR] = str(log_dir) env[DBT_TARGET_PATH_ENVVAR] = str(target_dir) @@ -194,7 +205,7 @@ def load_via_dbt_ls(self) -> None: for line in logfile: logger.debug(line.strip()) - if stderr or "Runtime Error" in stdout: + if stderr or "Error" in stdout: details = stderr or stdout raise CosmosLoadDbtException(f"Unable to run the command due to the error:\n{details}") diff --git a/tests/dbt/test_graph.py b/tests/dbt/test_graph.py index 497e189cd..e0f279204 100644 --- a/tests/dbt/test_graph.py +++ b/tests/dbt/test_graph.py @@ -172,6 +172,7 @@ def test_load_via_dbt_ls_with_exclude(): ), ), ) + dbt_graph.load_via_dbt_ls() assert dbt_graph.nodes == dbt_graph.filtered_nodes assert len(dbt_graph.nodes) == 7