From a8f5ca884e00c711993e26d5ee92e847fbf091c0 Mon Sep 17 00:00:00 2001
From: Raymond Kim
Date: Tue, 9 Jan 2024 20:11:15 +0000
Subject: [PATCH] #3007: Add initial 2x2 config to a new slow dispatch unit tests workflow that runs the multi-chip tests with schedule runs, no push trigger

---
 ...ce-slow-dispatch-build-and-unit-tests.yaml | 41 +++++++++++++++++++
 CODEOWNERS                                    |  1 +
 ...re_post_commit_regressions_multi_device.sh | 26 ++++++++++++
 tests/scripts/run_tests.sh                    | 17 ++++++++
 4 files changed, 85 insertions(+)
 create mode 100644 .github/workflows/multi-device-slow-dispatch-build-and-unit-tests.yaml
 create mode 100755 tests/scripts/run_pre_post_commit_regressions_multi_device.sh

diff --git a/.github/workflows/multi-device-slow-dispatch-build-and-unit-tests.yaml b/.github/workflows/multi-device-slow-dispatch-build-and-unit-tests.yaml
new file mode 100644
index 00000000000..75db32dd38e
--- /dev/null
+++ b/.github/workflows/multi-device-slow-dispatch-build-and-unit-tests.yaml
@@ -0,0 +1,41 @@
+name: "[post-commit] all - Slow Dispatch multi-Nebula post-commit main build and unit tests"
+# Runs only on a schedule or when dispatched/called; there is intentionally no push trigger.
+on:
+  workflow_dispatch:
+  workflow_call:
+  schedule:
+    - cron: "0 1,5,7,9,14,17,21 * * *"
+
+jobs:
+  build-and-unit-tests:
+    strategy:
+      # Do not fail-fast because we need to ensure all tests go to completion
+      # so we try not to get hanging machines
+      fail-fast: false
+      matrix:
+        runner-info: [
+          # N300 2x2
+          {arch: wormhole_b0, runs-on: ["wormhole_b0", "multi-chip-num-pcie-2", "multi-chip-num-chips-4"]},
+        ]
+    env:
+      TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
+      ARCH_NAME: ${{ matrix.runner-info.arch }}
+      TT_METAL_SLOW_DISPATCH_MODE: 1
+    environment: dev
+    runs-on: ${{ matrix.runner-info.runs-on }}
+    steps:
+      - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@main
+        with:
+          token: ${{ secrets.CHECKOUT_TOKEN }}
+      - name: Set up dynamic env vars for build
+        run: |
+          echo "TT_METAL_HOME=$(pwd)" >> "$GITHUB_ENV"
+      - name: Build tt-metal and libs
+        run: make build
+      - name: Build tt-metal CPP tests
+        run: make tests
+      - name: Run pre/post regression tests
+        timeout-minutes: 30
+        run: |
+          source build/python_env/bin/activate
+          ./tests/scripts/run_tests.sh --tt-arch "$ARCH_NAME" --pipeline-type post_commit_multi_device --dispatch-mode slow
diff --git a/CODEOWNERS b/CODEOWNERS
index 4f44a3cfc1e..00c801c1246 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -32,6 +32,7 @@ conftest.py @tt-rkim
 
 tests/scripts/run_pre_post_commit_regressions.sh @tt-rkim
 tests/scripts/run_tests.sh @tt-rkim
+tests/scripts/run_pre_post_commit_regressions_multi_device.sh @tt-rkim @aliuTT @tt-aho @TT-BrianLiu
 
 # metal - base
 # tt_metal/tt_metal.cpp @abhullar-tt @TT-billteng
diff --git a/tests/scripts/run_pre_post_commit_regressions_multi_device.sh b/tests/scripts/run_pre_post_commit_regressions_multi_device.sh
new file mode 100755
index 00000000000..ac00cdca942
--- /dev/null
+++ b/tests/scripts/run_pre_post_commit_regressions_multi_device.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Runs the DeviceFixture.EthKernels* unit tests; needs TT_METAL_HOME, ARCH_NAME and TT_METAL_SLOW_DISPATCH_MODE set.
+set -eo pipefail
+
+if [[ -z "$TT_METAL_HOME" ]]; then
+  echo "Must provide TT_METAL_HOME in environment" 1>&2
+  exit 1
+fi
+
+if [[ -z "$ARCH_NAME" ]]; then
+  echo "Must provide ARCH_NAME in environment" 1>&2
+  exit 1
+fi
+
+if [[ -z "$TT_METAL_SLOW_DISPATCH_MODE" ]]; then
+  echo "Only Slow Dispatch mode allowed - Must have TT_METAL_SLOW_DISPATCH_MODE set" 1>&2
+  exit 1
+fi
+
+cd "$TT_METAL_HOME"
+export PYTHONPATH="$TT_METAL_HOME"
+# One unit_tests invocation per gtest filter; set -e aborts the run at the first failing one.
+TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectSendAllConnectedChips"
+TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsSendInterleavedBufferAllConnectedChips"
+TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectRingGatherAllChips"
+TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsInterleavedRingGatherAllChips"
diff --git a/tests/scripts/run_tests.sh b/tests/scripts/run_tests.sh
index 75271befba4..91b60b74735 100755
--- a/tests/scripts/run_tests.sh
+++ b/tests/scripts/run_tests.sh
@@ -150,6 +150,21 @@ run_stress_post_commit_pipeline_tests() {
     done
 }
 
+# Multi-device post-commit pipeline. Args: tt_arch, pipeline_type, dispatch_mode (only "slow" runs; others exit 1).
+run_post_commit_multi_device_pipeline_tests() {
+    local tt_arch=$1
+    local pipeline_type=$2
+    local dispatch_mode=$3
+    # Switch to modules only soon
+    # run_module_tests "$tt_arch" "llrt" "$pipeline_type"
+    if [[ "$dispatch_mode" == "slow" ]]; then
+        ./tests/scripts/run_pre_post_commit_regressions_multi_device.sh
+    else
+        echo "Only slow dispatch mode is currently supported with multi-device" 1>&2
+        exit 1
+    fi
+}
+
 run_microbenchmarks_pipeline_tests() {
     local tt_arch=$1
     local pipeline_type=$2
@@ -184,6 +199,8 @@ run_pipeline_tests() {
         run_models_performance_virtual_machine_pipeline_tests "$tt_arch" "$pipeline_type"
     elif [[ $pipeline_type == "stress_post_commit" ]]; then
         run_stress_post_commit_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode"
+    elif [[ $pipeline_type == "post_commit_multi_device" ]]; then
+        run_post_commit_multi_device_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode"
     elif [[ $pipeline_type == "microbenchmarks" ]]; then
         run_microbenchmarks_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode"
     else