diff --git a/.github/actions/build-ci-images/action.yml b/.github/actions/build-ci-images/action.yml index e23a67222f2e..ed613c29ce0c 100644 --- a/.github/actions/build-ci-images/action.yml +++ b/.github/actions/build-ci-images/action.yml @@ -48,13 +48,6 @@ runs: cat "files/constraints-${PYTHON_VERSION}/*.md" >> $GITHUB_STEP_SUMMARY || true done if: env.UPGRADE_TO_NEWER_DEPENDENCIES != 'false' - - name: Push empty CI image ${{ env.PYTHON_MAJOR_MINOR_VERSION }}:${{ env.IMAGE }} - if: failure() || cancelled() - shell: bash - run: breeze ci-image build --push --empty-image --run-in-parallel - env: - IMAGE_TAG: ${{ env.IMAGE_TAG }} - COMMIT_SHA: ${{ github.sha }} - name: "Fix ownership" shell: bash run: breeze ci fix-ownership diff --git a/.github/actions/build-prod-images/action.yml b/.github/actions/build-prod-images/action.yml index afbf58445f97..0086345b6977 100644 --- a/.github/actions/build-prod-images/action.yml +++ b/.github/actions/build-prod-images/action.yml @@ -63,10 +63,6 @@ runs: --install-packages-from-context --upgrade-on-failure env: COMMIT_SHA: ${{ github.sha }} - - name: Push empty PROD images ${{ env.IMAGE_TAG }} - shell: bash - run: breeze prod-image build --cleanup-context --push --empty-image --run-in-parallel - if: failure() || cancelled() - name: "Fix ownership" shell: bash run: breeze ci fix-ownership diff --git a/.github/workflows/build-images.yml b/.github/workflows/build-images.yml index daa94ab29082..96b4108ab770 100644 --- a/.github/workflows/build-images.yml +++ b/.github/workflows/build-images.yml @@ -194,6 +194,7 @@ jobs: DOCKER_CACHE: ${{ needs.build-info.outputs.cache-directive }} PYTHON_VERSIONS: ${{needs.build-info.outputs.all-python-versions-list-as-string}} DEBUG_RESOURCES: ${{ needs.build-info.outputs.debug-resources }} + BUILD_TIMEOUT_MINUTES: 70 build-prod-images: permissions: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0e3b91e33d03..e36724d5815c 100644 --- a/.github/workflows/ci.yml +++ 
b/.github/workflows/ci.yml @@ -324,6 +324,7 @@ jobs: DOCKER_CACHE: ${{ needs.build-info.outputs.cache-directive }} PYTHON_VERSIONS: ${{needs.build-info.outputs.all-python-versions-list-as-string}} DEBUG_RESOURCES: ${{needs.build-info.outputs.debug-resources}} + BUILD_TIMEOUT_MINUTES: 70 build-prod-images: timeout-minutes: 80 diff --git a/BREEZE.rst b/BREEZE.rst index b2694a9fdee4..5213e08acaaa 100644 --- a/BREEZE.rst +++ b/BREEZE.rst @@ -1741,6 +1741,23 @@ These are all available flags of ``get-workflow-info`` command: :width: 100% :alt: Breeze ci get-workflow-info +Finding backtracking candidates +............................... + +Sometimes the CI build fails because ``pip`` times out when trying to resolve the latest set of dependencies; +for that we have the ``find-backtracking-candidates`` command. This command will try to find the +backtracking candidates that might cause the backtracking. + +The details on how to use that command are explained in +`Figuring out backtracking dependencies <dev/MANUALLY_GENERATING_IMAGE_CACHE_AND_CONSTRAINTS.md#figuring-out-backtracking-dependencies>`_. + +These are all available flags of ``find-backtracking-candidates`` command: + +..
image:: ./images/breeze/output_ci_find-backtracking-candidates.svg + :target: https://raw.githubusercontent.com/apache/airflow/main/images/breeze/output_ci_find-backtracking-candidates.svg + :width: 100% + :alt: Breeze ci find-backtracking-candidates + Release management tasks ------------------------ diff --git a/Dockerfile b/Dockerfile index b92a5d2849c2..050b4a22921f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -594,7 +594,7 @@ function install_airflow_and_providers_from_docker_context_files(){ pip install "${pip_flags[@]}" --root-user-action ignore --upgrade --upgrade-strategy eager \ ${ADDITIONAL_PIP_INSTALL_FLAGS} \ ${reinstalling_apache_airflow_package} ${reinstalling_apache_airflow_providers_packages} \ - ${EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS} + ${EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS=} set +x common::install_pip_version @@ -665,7 +665,7 @@ function install_airflow() { pip install --root-user-action ignore --upgrade --upgrade-strategy eager \ ${ADDITIONAL_PIP_INSTALL_FLAGS} \ "${AIRFLOW_INSTALLATION_METHOD}[${AIRFLOW_EXTRAS}]${AIRFLOW_VERSION_SPECIFICATION}" \ - ${EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS} + ${EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS=} if [[ -n "${AIRFLOW_INSTALL_EDITABLE_FLAG}" ]]; then # Remove airflow and reinstall it using editable flag # We can only do it when we install airflow from sources @@ -734,7 +734,7 @@ function install_additional_dependencies() { set -x pip install --root-user-action ignore --upgrade --upgrade-strategy eager \ ${ADDITIONAL_PIP_INSTALL_FLAGS} \ - ${ADDITIONAL_PYTHON_DEPS} ${EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS} + ${ADDITIONAL_PYTHON_DEPS} ${EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS=} common::install_pip_version set +x echo @@ -1290,17 +1290,11 @@ COPY --chown=airflow:0 ${AIRFLOW_SOURCES_FROM} ${AIRFLOW_SOURCES_TO} # Add extra python dependencies ARG ADDITIONAL_PYTHON_DEPS="" -# Those are additional constraints that are needed for some extras but we do not want to -# force them on the main Airflow package. 
Currently we need no extra limits as PIP 23.1+ has much better -# dependency resolution and we do not need to limit the versions of the dependencies -# !!! MAKE SURE YOU SYNCHRONIZE THE LIST BETWEEN: Dockerfile, Dockerfile.ci -ARG EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS="" ARG VERSION_SUFFIX_FOR_PYPI="" ENV ADDITIONAL_PYTHON_DEPS=${ADDITIONAL_PYTHON_DEPS} \ INSTALL_PACKAGES_FROM_CONTEXT=${INSTALL_PACKAGES_FROM_CONTEXT} \ - EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS=${EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS} \ VERSION_SUFFIX_FOR_PYPI=${VERSION_SUFFIX_FOR_PYPI} WORKDIR ${AIRFLOW_HOME} diff --git a/Dockerfile.ci b/Dockerfile.ci index def7c6345b09..a7df2f8c11f6 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -534,7 +534,7 @@ function install_airflow() { pip install --root-user-action ignore --upgrade --upgrade-strategy eager \ ${ADDITIONAL_PIP_INSTALL_FLAGS} \ "${AIRFLOW_INSTALLATION_METHOD}[${AIRFLOW_EXTRAS}]${AIRFLOW_VERSION_SPECIFICATION}" \ - ${EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS} + ${EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS=} if [[ -n "${AIRFLOW_INSTALL_EDITABLE_FLAG}" ]]; then # Remove airflow and reinstall it using editable flag # We can only do it when we install airflow from sources @@ -603,7 +603,7 @@ function install_additional_dependencies() { set -x pip install --root-user-action ignore --upgrade --upgrade-strategy eager \ ${ADDITIONAL_PIP_INSTALL_FLAGS} \ - ${ADDITIONAL_PYTHON_DEPS} ${EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS} + ${ADDITIONAL_PYTHON_DEPS} ${EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS=} common::install_pip_version set +x echo @@ -1334,7 +1334,8 @@ ARG CASS_DRIVER_NO_CYTHON="1" # Build cassandra driver on multiple CPUs ARG CASS_DRIVER_BUILD_CONCURRENCY="8" -ARG AIRFLOW_VERSION="2.5.0.dev0" +# This value should be set by the CI image build system to the current timestamp +ARG AIRFLOW_VERSION="" # Additional PIP flags passed to all pip install commands except reinstalling pip itself ARG ADDITIONAL_PIP_INSTALL_FLAGS="" diff --git 
a/dev/MANUALLY_GENERATING_IMAGE_CACHE_AND_CONSTRAINTS.md b/dev/MANUALLY_GENERATING_IMAGE_CACHE_AND_CONSTRAINTS.md index 503830d2940d..d8f8d4c2c654 100644 --- a/dev/MANUALLY_GENERATING_IMAGE_CACHE_AND_CONSTRAINTS.md +++ b/dev/MANUALLY_GENERATING_IMAGE_CACHE_AND_CONSTRAINTS.md @@ -23,6 +23,10 @@ - [Purpose of the document](#purpose-of-the-document) - [Automated image cache and constraints refreshing in CI](#automated-image-cache-and-constraints-refreshing-in-ci) +- [Figuring out backtracking dependencies](#figuring-out-backtracking-dependencies) + - [Why we need to figure out backtracking dependencies](#why-we-need-to-figure-out-backtracking-dependencies) + - [How to figure out backtracking dependencies](#how-to-figure-out-backtracking-dependencies) + - [Example backtracking session](#example-backtracking-session) - [Manually refreshing the image cache](#manually-refreshing-the-image-cache) - [Why we need to update image cache manually](#why-we-need-to-update-image-cache-manually) - [Prerequisites](#prerequisites) @@ -80,6 +84,199 @@ rebuilding of [Breeze](../BREEZE.rst) images for development purpose. This is al step makes sure that constraints are committed and pushed just before the cache is refreshed, so there is no problem with conflicting dependencies. + +# Figuring out backtracking dependencies + +## Why we need to figure out backtracking dependencies + +Sometimes, very rarely, the CI image in `canary` builds takes a very long time to build. This is usually +caused by `pip` trying to figure out the latest set of dependencies (`eager upgrade`). +The resolution of dependencies is a very complex problem and sometimes it takes a long time to figure out +the best set of dependencies. This is especially true when we have a lot of dependencies and they all have +to be found compatible with each other.
In case new dependencies are released, sometimes `pip` enters +a long loop trying to figure out if the newly released dependency can be used, but due to some other +dependencies of ours it is impossible, and it will take `pip` a very long time to figure that out. + +This is visible in the "build output" as `pip` attempting to continuously backtrack and download many new +versions of various dependencies, trying to find a good match. + +This is why we sometimes need to help `pip` to skip newer versions of those dependencies, until the +condition that caused the backtracking is solved. + +We do it by adding `dependency<=version` to the `EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS` variable in +`Dockerfile.ci`. The trick is to find the dependency that is causing the backtracking. + +Here is how. We use `bisecting` methodology to try out candidates for backtrack triggering among the +candidates that have been released in PyPI since the last time we successfully ran +``--upgrade-to-newer-dependencies`` and committed the constraints in the `canary` build. + +## How to figure out backtracking dependencies + +First - we have a breeze command that can help us with that: + +```bash +breeze ci find-backtracking-candidates +``` + +This command should be run rather quickly after we notice that the CI build is taking a long time and failing, +because it is based on the fact that eager upgrade produced valid constraints at some point in time and +it tries to find out what dependencies have been added since then and limit them to the version that +was used in the constraints. + +You can also - instead of running the command manually - rely on the failing CI builds. We run the +`find-backtracking-candidates` command in the `canary` build when it times out, so the +easiest way to find backtracking candidates is to find the first build that failed with timeout - it +will likely have the smallest number of backtracking candidates.
The command outputs the limitations +for those backtracking candidates that are guaranteed to work (because they are taken from the latest +constraints and they already succeeded in the past when the constraints were updated). + +Then we run ``breeze ci-image build --upgrade-to-newer-dependencies --eager-upgrade-additional-requirements "REQUIREMENTS"`` +to check which of the candidates causes the long builds. Initially you put there the whole list of +candidates that you got from the `find-backtracking-candidates` command. This **should** succeed. Now, +the next step is to narrow down the list of candidates to the one that is causing the backtracking. + +We narrow down the list by "bisecting" the list. We remove half of the dependency limits and see if it +still works or not. If it works - we continue. If it does not work, we restore the removed half and remove +the other half. Rinse and repeat until there is only one dependency left - hopefully +(sometimes you will need to leave a few of them). + +This way we can relatively quickly narrow down the dependency that is causing the backtracking. Once we +figure out which dependency is causing it, we can attempt to figure out why it is causing the backtracking +by specifying the latest released version of the dependency as `== <latest released version>` in the +`--eager-upgrade-additional-requirements`. This should rather quickly fail and `pip` should show us what +the dependency is conflicting with. There might be multiple reasons for that. Most often it is simply +a dependency that has a requirement that is limited and we need to wait until a new version of that +dependency is released. + +Note that such a build **might** even succeed - surprisingly. Then this is simply a sign that the `pip` +algorithm for `--eager-upgrade` was not perfect and the solution could be found given sufficient time. +In such a case it might also be that removing the limit in the next few days will not cause the backtracking.
+ +Finally, in order to make the change permanent in our CI builds, we should add the limitation to the +`EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS` arg in `Dockerfile.ci` and commit the change. We usually commit +the limits with `