Skip to content

Commit

Permalink
Refactor dbt ls to run from tempdir with symbolic links (#488)
Browse files Browse the repository at this point in the history
Since Cosmos 1.0, `load_method.DBT_LS` is the default dbt project
parsing method, unless the user gives a manifest.

Using the original dbt project path has been a source of issues when
that path is Read-Only. This issue was faced when running commands that
generate `{project-dir}/target/` and `{project-dir}/logs/`, which was
solved as part of #414.

This issue is particularly problematic if we want to run `dbt deps` from
the original project directory since dbt 1.6 saves adaptors to
`{project_dir}/dbt_packages` unless specified in the user's
`dbt_project.yml`. To our knowledge, dbt currently does not allow users
to define this directory via flags or environment variables, as
discussed in #481.

This change aims to solve these issues, by creating a temporary
directory and creating symbolic links to the original directory.

Finally, during the development of this task, it was observed that when
running dbt ls in a project with `packages.yml`, dbt raises a
'Compilation Error'. Since dbt may raise other errors in stdout, this PR
captures "Errors" more generically - making it more evident potential
issues to the end-users.
  • Loading branch information
tatiana authored Aug 29, 2023
1 parent 56d38b9 commit 53dbb38
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 18 deletions.
14 changes: 12 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,15 +115,19 @@ jobs:
DATABRICKS_WAREHOUSE_ID: ${{ secrets.DATABRICKS_WAREHOUSE_ID }}
DATABRICKS_CLUSTER_ID: ${{ secrets.DATABRICKS_CLUSTER_ID }}
COSMOS_CONN_POSTGRES_PASSWORD: ${{ secrets.COSMOS_CONN_POSTGRES_PASSWORD }}
POSTGRES_HOST: localhost
POSTGRES_USER: postgres
POSTGRES_PASSWORD: postgres
POSTGRES_DB: postgres
POSTGRES_SCHEMA: public
POSTGRES_PORT: 5432

- name: Upload coverage to Github
uses: actions/upload-artifact@v2
with:
name: coverage-integration-test-${{ matrix.python-version }}-${{ matrix.airflow-version }}
path: .coverage



Run-Integration-Tests-Expensive:
needs: Authorize
runs-on: ubuntu-latest
Expand Down Expand Up @@ -180,6 +184,12 @@ jobs:
DATABRICKS_WAREHOUSE_ID: ${{ secrets.DATABRICKS_WAREHOUSE_ID }}
DATABRICKS_CLUSTER_ID: ${{ secrets.DATABRICKS_CLUSTER_ID }}
COSMOS_CONN_POSTGRES_PASSWORD: ${{ secrets.COSMOS_CONN_POSTGRES_PASSWORD }}
POSTGRES_HOST: localhost
POSTGRES_USER: postgres
POSTGRES_PASSWORD: postgres
POSTGRES_DB: postgres
POSTGRES_SCHEMA: public
POSTGRES_PORT: 5432

- name: Upload coverage to Github
uses: actions/upload-artifact@v2
Expand Down
2 changes: 2 additions & 0 deletions cosmos/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
DEFAULT_DBT_PROFILE_NAME = "cosmos_profile"
DEFAULT_DBT_TARGET_NAME = "cosmos_target"
DBT_LOG_PATH_ENVVAR = "DBT_LOG_PATH"
DBT_LOG_DIR_NAME = "logs"
DBT_TARGET_PATH_ENVVAR = "DBT_TARGET_PATH"
DBT_TARGET_DIR_NAME = "target"
DBT_LOG_FILENAME = "dbt.log"


Expand Down
43 changes: 27 additions & 16 deletions cosmos/dbt/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
DBT_LOG_FILENAME,
DBT_LOG_PATH_ENVVAR,
DBT_TARGET_PATH_ENVVAR,
DBT_LOG_DIR_NAME,
DBT_TARGET_DIR_NAME,
)
from cosmos.dbt.executable import get_system_dbt
from cosmos.dbt.parser.project import DbtProject as LegacyDbtProject
Expand Down Expand Up @@ -151,27 +153,36 @@ def load_via_dbt_ls(self) -> None:

with self.profile_config.ensure_profile(use_mock_values=True) as profile_values:
(profile_path, env_vars) = profile_values
command.extend(
[
"--project-dir",
str(self.project.dir),
"--profiles-dir",
str(profile_path.parent),
"--profile",
self.profile_config.profile_name,
"--target",
self.profile_config.target_name,
]
)

env = os.environ.copy()
env.update(env_vars)

with tempfile.TemporaryDirectory() as tmpdir:
logger.info("Creating symlinks from %s to `%s`", self.project.dir, tmpdir)
# We create symbolic links to the original directory files and directories.
# This allows us to run the dbt command from within the temporary directory, outputting any necessary
# artifact and also allow us to run `dbt deps`
tmpdir_path = Path(tmpdir)
ignore_paths = (DBT_LOG_DIR_NAME, DBT_TARGET_DIR_NAME, "profiles.yml")
for child_name in os.listdir(self.project.dir):
if child_name not in ignore_paths:
os.symlink(self.project.dir / child_name, tmpdir_path / child_name)

command.extend(
[
"--project-dir",
str(tmpdir),
"--profiles-dir",
str(profile_path.parent),
"--profile",
self.profile_config.profile_name,
"--target",
self.profile_config.target_name,
]
)
logger.info("Running command: `%s`", " ".join(command))
logger.info("Environment variable keys: %s", env.keys())
log_dir = Path(env.get(DBT_LOG_PATH_ENVVAR) or tmpdir)
target_dir = Path(env.get(DBT_TARGET_PATH_ENVVAR) or tmpdir)
log_dir = Path(env.get(DBT_LOG_PATH_ENVVAR) or tmpdir_path / DBT_LOG_DIR_NAME)
target_dir = Path(env.get(DBT_TARGET_PATH_ENVVAR) or tmpdir_path / DBT_TARGET_DIR_NAME)
env[DBT_LOG_PATH_ENVVAR] = str(log_dir)
env[DBT_TARGET_PATH_ENVVAR] = str(target_dir)

Expand All @@ -194,7 +205,7 @@ def load_via_dbt_ls(self) -> None:
for line in logfile:
logger.debug(line.strip())

if stderr or "Runtime Error" in stdout:
if stderr or "Error" in stdout:
details = stderr or stdout
raise CosmosLoadDbtException(f"Unable to run the command due to the error:\n{details}")

Expand Down
1 change: 1 addition & 0 deletions tests/dbt/test_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ def test_load_via_dbt_ls_with_exclude():
),
),
)

dbt_graph.load_via_dbt_ls()
assert dbt_graph.nodes == dbt_graph.filtered_nodes
assert len(dbt_graph.nodes) == 7
Expand Down

0 comments on commit 53dbb38

Please sign in to comment.