From ebae8efee2a20a152f11c4a66c8fe66714a91c07 Mon Sep 17 00:00:00 2001 From: Pankaj Koti Date: Fri, 10 May 2024 22:28:52 +0530 Subject: [PATCH] Correct stale `root_path` in partial parse file (#950) After partial parsing was enabled in PR #904, testing the implementation showed that the seed files could not be located because the partial parse file contained a stale `root_path` from previous command runs. This issue is observed on specific earlier versions of dbt-core such as `1.5.4` and `1.6.5`, but not on more recent versions of dbt-core such as `1.5.8`, `1.6.6` and `1.7.0`. I suspect that PR https://github.com/dbt-labs/dbt-core/pull/8762 is likely the fix, and that the fix appears to have been backported to later releases of `1.5.x` and `1.6.x`. However, irrespective of the dbt-core version, this PR attempts to correct the `root_path` in the partial parse file by replacing it with the project directory where the project files are located, and thus ensures that the feature runs correctly on both older and newer versions of dbt-core. 
closes: #937 --------- Co-authored-by: Tatiana Al-Chueyr --- .github/workflows/test.yml | 73 +++++++++++++++++++++++++++ cosmos/cache.py | 18 +++++++ docs/requirements.txt | 11 ++-- pyproject.toml | 21 ++++---- scripts/test/integration-dbt-1-5-4.sh | 12 +++++ 5 files changed, 121 insertions(+), 14 deletions(-) create mode 100644 scripts/test/integration-dbt-1-5-4.sh diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f6e3701a8..dc0cfd055 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -292,6 +292,79 @@ jobs: AIRFLOW_CONN_AIRFLOW_DB: postgres://postgres:postgres@0.0.0.0:5432/postgres PYTHONPATH: /home/runner/work/astronomer-cosmos/astronomer-cosmos/:$PYTHONPATH + Run-Integration-Tests-DBT-1-5-4: + needs: Authorize + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [ "3.11" ] + airflow-version: [ "2.7" ] + services: + postgres: + image: postgres + env: + POSTGRES_PASSWORD: postgres + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 5432:5432 + + steps: + - uses: actions/checkout@v3 + with: + ref: ${{ github.event.pull_request.head.sha || github.ref }} + - uses: actions/cache@v3 + with: + path: | + ~/.cache/pip + .nox + key: integration-dbt-1-5-4-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.airflow-version }}-${{ hashFiles('pyproject.toml') }}-${{ hashFiles('cosmos/__init__.py') }} + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install packages and dependencies + run: | + python -m pip install hatch + hatch -e tests.py${{ matrix.python-version }}-${{ matrix.airflow-version }} run pip freeze + + - name: Test Cosmos against Airflow ${{ matrix.airflow-version }}, Python ${{ matrix.python-version }} and dbt 1.5.4 + run: | + hatch run tests.py${{ matrix.python-version }}-${{ matrix.airflow-version 
}}:test-integration-dbt-1-5-4 + env: + AIRFLOW_HOME: /home/runner/work/astronomer-cosmos/astronomer-cosmos/ + AIRFLOW_CONN_AIRFLOW_DB: postgres://postgres:postgres@0.0.0.0:5432/postgres + AIRFLOW__CORE__DAGBAG_IMPORT_TIMEOUT: 90.0 + PYTHONPATH: /home/runner/work/astronomer-cosmos/astronomer-cosmos/:$PYTHONPATH + AIRFLOW_CONN_DATABRICKS_DEFAULT: ${{ secrets.AIRFLOW_CONN_DATABRICKS_DEFAULT }} + DATABRICKS_HOST: ${{ secrets.DATABRICKS_HOST }} + DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }} + DATABRICKS_WAREHOUSE_ID: ${{ secrets.DATABRICKS_WAREHOUSE_ID }} + DATABRICKS_CLUSTER_ID: ${{ secrets.DATABRICKS_CLUSTER_ID }} + COSMOS_CONN_POSTGRES_PASSWORD: ${{ secrets.COSMOS_CONN_POSTGRES_PASSWORD }} + POSTGRES_HOST: localhost + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: postgres + POSTGRES_SCHEMA: public + POSTGRES_PORT: 5432 + + - name: Upload coverage to Github + uses: actions/upload-artifact@v2 + with: + name: coverage-integration-dbt-1-5-4-test-${{ matrix.python-version }}-${{ matrix.airflow-version }} + path: .coverage + + env: + AIRFLOW_HOME: /home/runner/work/astronomer-cosmos/astronomer-cosmos/ + AIRFLOW_CONN_AIRFLOW_DB: postgres://postgres:postgres@0.0.0.0:5432/postgres + PYTHONPATH: /home/runner/work/astronomer-cosmos/astronomer-cosmos/:$PYTHONPATH + Run-Performance-Tests: needs: Authorize runs-on: ubuntu-latest diff --git a/cosmos/cache.py b/cosmos/cache.py index 3c2086c7a..7d136a127 100644 --- a/cosmos/cache.py +++ b/cosmos/cache.py @@ -3,6 +3,7 @@ import shutil from pathlib import Path +import msgpack from airflow.models.dag import DAG from airflow.utils.task_group import TaskGroup @@ -121,4 +122,21 @@ def _copy_partial_parse_to_project(partial_parse_filepath: Path, project_path: P source_manifest_filepath = partial_parse_filepath.parent / DBT_MANIFEST_FILE_NAME target_manifest_filepath = target_partial_parse_file.parent / DBT_MANIFEST_FILE_NAME shutil.copy(str(partial_parse_filepath), str(target_partial_parse_file)) + + # Update 
root_path in partial parse file to point to the needed project directory. This is necessary because + # an issue is observed where on specific earlier versions of dbt-core like 1.5.4 and 1.6.5, the commands fail to + # locate project files as they are pointed to a stale directory by the root_path in the partial parse file. + # This issue was not observed on recent versions of dbt-core 1.5.8, 1.6.6, 1.7.0 and 1.8.0 as tested on. + # It is suspected that PR dbt-labs/dbt-core#8762 is likely the fix and the fix appears to be backported to later + # version releases of 1.5.x and 1.6.x. However, the below modification is applied to ensure that the root_path is + # correctly set to the needed project directory and the feature is compatible across all dbt-core versions. + with target_partial_parse_file.open("rb") as f: + data = msgpack.unpack(f) + for node in data["nodes"].values(): + if node.get("root_path"): + node["root_path"] = str(project_path) + with target_partial_parse_file.open("wb") as f: + packed = msgpack.packb(data) + f.write(packed) + shutil.copy(str(source_manifest_filepath), str(target_manifest_filepath)) diff --git a/docs/requirements.txt b/docs/requirements.txt index 430993ff8..81a7084e4 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,10 +1,11 @@ -google-re2==1.1 aenum -sphinx -pydata-sphinx-theme -sphinx-autobuild -sphinx-autoapi apache-airflow apache-airflow-providers-cncf-kubernetes>=5.1.1 +google-re2==1.1 +msgpack openlineage-airflow pydantic +pydata-sphinx-theme +sphinx +sphinx-autoapi +sphinx-autobuild diff --git a/pyproject.toml b/pyproject.toml index 5f0e5ee0e..f740f2071 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ dependencies = [ "apache-airflow>=2.3.0", "importlib-metadata; python_version < '3.8'", "Jinja2>=3.0.0", + "msgpack", "pydantic>=1.10.0", "typing-extensions; python_version < '3.8'", "virtualenv", @@ -141,16 +142,17 @@ matrix.airflow.dependencies = [ [tool.hatch.envs.tests.scripts] freeze = 
"pip freeze" -type-check = "mypy cosmos" test = 'sh scripts/test/unit.sh' test-cov = 'sh scripts/test/unit-cov.sh' -test-integration-setup = 'sh scripts/test/integration-setup.sh' test-integration = 'sh scripts/test/integration.sh' +test-integration-dbt-1-5-4 = 'sh scripts/test/integration-dbt-1-5-4.sh' test-integration-expensive = 'sh scripts/test/integration-expensive.sh' -test-integration-sqlite-setup = 'sh scripts/test/integration-sqlite-setup.sh' +test-integration-setup = 'sh scripts/test/integration-setup.sh' test-integration-sqlite = 'sh scripts/test/integration-sqlite.sh' -test-performance-setup = 'sh scripts/test/performance-setup.sh' +test-integration-sqlite-setup = 'sh scripts/test/integration-sqlite-setup.sh' test-performance = 'sh scripts/test/performance.sh' +test-performance-setup = 'sh scripts/test/performance-setup.sh' +type-check = "mypy cosmos" [tool.pytest.ini_options] filterwarnings = ["ignore::DeprecationWarning"] @@ -164,13 +166,14 @@ markers = ["integration", "sqlite", "perf"] [tool.hatch.envs.docs] dependencies = [ "aenum", - "sphinx", - "pydata-sphinx-theme", - "sphinx-autobuild", - "sphinx-autoapi", - "openlineage-airflow", "apache-airflow-providers-cncf-kubernetes>=5.1.1", + "msgpack", + "openlineage-airflow", "pydantic>=1.10.0", + "pydata-sphinx-theme", + "sphinx", + "sphinx-autoapi", + "sphinx-autobuild", ] [tool.hatch.envs.docs.scripts] diff --git a/scripts/test/integration-dbt-1-5-4.sh b/scripts/test/integration-dbt-1-5-4.sh new file mode 100644 index 000000000..087533082 --- /dev/null +++ b/scripts/test/integration-dbt-1-5-4.sh @@ -0,0 +1,12 @@ +pip uninstall dbt-adapters dbt-common dbt-core dbt-extractor dbt-postgres dbt-semantic-interfaces -y +pip install dbt-postgres==1.5.4 dbt-databricks==1.5.4 +rm -rf airflow.*; \ +airflow db init; \ +pytest -vv \ + --cov=cosmos \ + --cov-report=term-missing \ + --cov-report=xml \ + --durations=0 \ + -m integration \ + --ignore=tests/perf \ + -k 'basic_cosmos_task_group'