From b5e7914b6f9c98cec1bfbcbc97049dede497ba9a Mon Sep 17 00:00:00 2001 From: Aswinmcw Date: Wed, 6 Nov 2024 15:48:32 +0000 Subject: [PATCH 1/6] #14406: Initial commit to test ccl perf in pipeline --- .../t3000-model-perf-tests-impl.yaml | 10 ++++++- .github/workflows/t3000-model-perf-tests.yaml | 8 ++++- .../t3000/run_t3000_model_perf_tests.sh | 29 ++++++++++++++++++- 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/.github/workflows/t3000-model-perf-tests-impl.yaml b/.github/workflows/t3000-model-perf-tests-impl.yaml index 91e208c214b..489994bf9f0 100644 --- a/.github/workflows/t3000-model-perf-tests-impl.yaml +++ b/.github/workflows/t3000-model-perf-tests-impl.yaml @@ -22,6 +22,7 @@ jobs: { name: "t3k LLM llama3 model perf tests", model: "llama3", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_llama3_tests, timeout: 60, owner_id: U03PUAKE719}, # Miguel Tairum { name: "t3k LLM falcon40b model perf tests", model: "falcon40b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 75, owner_id: U053W15B6JF}, # Djordje Ivanovic { name: "t3k CNN resnet50 model perf tests", model: "resnet50", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_resnet50_tests, timeout: 75, owner_id: U013121KDH9}, # Austin Ho + { name: "t3k CCL perf tests", arch: wormhole_b0, cmd: run_t3000_ccl_perf_tests, timeout: 75, tracy: true, owner_id: ULMEPM2MA}, # Sean Nijjar #{ name: "t3k CNN model perf tests ", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_cnn_tests, timeout: 120, owner_id: }, #No tests are being run? 
] name: ${{ matrix.test-group.name }} @@ -45,7 +46,14 @@ jobs: run: | echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV - - uses: actions/download-artifact@v4 + - name: Download profiler build artifact + if: ${{ matrix.test-group.tracy }} + uses: actions/download-artifact@v4 + with: + name: TTMetal_build_${{ matrix.test-group.arch }}_profiler + - name: Download regular build artifact + if: ${{ !matrix.test-group.tracy }} + uses: actions/download-artifact@v4 with: name: TTMetal_build_${{ matrix.test-group.arch }} - name: Extract files diff --git a/.github/workflows/t3000-model-perf-tests.yaml b/.github/workflows/t3000-model-perf-tests.yaml index 0a8759af27c..15d96746889 100644 --- a/.github/workflows/t3000-model-perf-tests.yaml +++ b/.github/workflows/t3000-model-perf-tests.yaml @@ -11,7 +11,13 @@ jobs: with: arch: '["wormhole_b0"]' secrets: inherit + build-artifact-profiler: + uses: ./.github/workflows/build-artifact.yaml + with: + arch: '["wormhole_b0"]' + tracy: true + secrets: inherit t3000-model-perf-tests: - needs: build-artifact + needs: [build-artifact, build-artifact-profiler] secrets: inherit uses: ./.github/workflows/t3000-model-perf-tests-impl.yaml diff --git a/tests/scripts/t3000/run_t3000_model_perf_tests.sh b/tests/scripts/t3000/run_t3000_model_perf_tests.sh index 70baaa85ae3..7c0d0757a09 100755 --- a/tests/scripts/t3000/run_t3000_model_perf_tests.sh +++ b/tests/scripts/t3000/run_t3000_model_perf_tests.sh @@ -142,6 +142,25 @@ run_t3000_resnet50_tests() { fi } +run_t3000_ccl_perf_tests() { + # Record the start time + fail=0 + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_ccl_perf_tests" + + tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh -t t3000 + fail+=$? 
+ + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_ccl_perf_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi +} + run_t3000_llm_tests() { # Run falcon7b tests run_t3000_falcon7b_tests @@ -173,6 +192,12 @@ run_t3000_cnn_tests() { env python models/perf/merge_perf_results.py } +run_t3000_ccl_tests() { + # Run ccl performance tests + run_t3000_ccl_perf_tests + +} + fail=0 main() { # For CI pipeline - source func commands but don't execute tests if not invoked directly @@ -219,8 +244,10 @@ main() { run_t3000_llm_tests elif [[ "$pipeline_type" == "cnn_model_perf_t3000_device" ]]; then run_t3000_cnn_tests + elif [[ "$pipeline_type" == "ccl_perf_t3000_device" ]]; then + run_t3000_ccl_tests else - echo "$pipeline_type is invalid (supported: [cnn_model_perf_t3000_device, cnn_model_perf_t3000_device])" 2>&1 + echo "$pipeline_type is invalid (supported: [llm_model_perf_t3000_device, cnn_model_perf_t3000_device, ccl_perf_t3000_device])" >&2 exit 1 fi From 33baf8ab8a39b9131b962ae99305a5236c0ff7d7 Mon Sep 17 00:00:00 2001 From: Aswinmcw Date: Thu, 7 Nov 2024 06:20:13 +0000 Subject: [PATCH 2/6] #14406: Upload perf report to GH --- .../t3000-model-perf-tests-impl.yaml | 20 +++++++++++++++---- .../operations/ccl/perf/perf_csv.py | 9 ++++++--- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/.github/workflows/t3000-model-perf-tests-impl.yaml b/.github/workflows/t3000-model-perf-tests-impl.yaml index 489994bf9f0..f1d0090bda3 100644 --- a/.github/workflows/t3000-model-perf-tests-impl.yaml +++ b/.github/workflows/t3000-model-perf-tests-impl.yaml @@ -51,7 +51,7 @@ jobs: uses: actions/download-artifact@v4 with: name: TTMetal_build_${{ matrix.test-group.arch }}_profiler - - name: Download regular build artifact + - name: Download build artifact if: ${{ !matrix.test-group.tracy }} uses: actions/download-artifact@v4 with: name: TTMetal_build_${{ matrix.test-group.arch }} - name: Extract files @@ -74,9 +74,21 @@ jobs: if: ${{ !cancelled() }} run: | ls 
-hal - export PERF_REPORT_FILENAME="Models_Perf_$(date +%Y_%m_%d).csv" - ls -hal $PERF_REPORT_FILENAME - echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT" + TODAY=$(date +%Y_%m_%d) + PERF_REPORT_FILENAME_MODELS="Models_Perf_${TODAY}.csv" + PERF_REPORT_FILENAME_CCL="CCL_Perf_${TODAY}.csv" + if [ -f "$PERF_REPORT_FILENAME_MODELS" ]; then + echo "Found Models Perf report: $PERF_REPORT_FILENAME_MODELS" + echo "perf_report_filename=$PERF_REPORT_FILENAME_MODELS" >> "$GITHUB_OUTPUT" + ls -hal "$PERF_REPORT_FILENAME_MODELS" + elif [ -f "$PERF_REPORT_FILENAME_CCL" ]; then + echo "Found CCL Perf report: $PERF_REPORT_FILENAME_CCL" + echo "perf_report_filename=$PERF_REPORT_FILENAME_CCL" >> "$GITHUB_OUTPUT" + ls -hal "$PERF_REPORT_FILENAME_CCL" + else + echo "No perf report found." + exit 1 + fi - name: Upload perf report if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }} uses: actions/upload-artifact@v4 diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py b/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py index 31f4636aa66..3d5cc2aaeb5 100644 --- a/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py +++ b/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py @@ -5,6 +5,7 @@ import pandas as pd import os import re +import time def perf_report(file_path): @@ -214,10 +215,12 @@ def calculate_bandwidth(row): averages_df = pd.DataFrame(averages_data) - averages_file_path = file_path.replace(".csv", "_averages.csv") + today = time.strftime("%Y_%m_%d") + ccl_perf_file_path = f"CCL_Perf_{today}.csv" + os.rename(file_path, ccl_perf_file_path) - averages_df.to_csv(averages_file_path, index=False) + averages_df.to_csv(ccl_perf_file_path, index=False) - print(f"Averages CSV saved to: {averages_file_path}") + print(f"CCL Perf report CSV saved to: {ccl_perf_file_path}") return averages_df From 6819ad76a2f54608a74d11036eced7edf5a3593b Mon Sep 17 00:00:00 2001 From: Aswinmcw Date: Thu, 7 Nov 2024 06:39:11 +0000 Subject: [PATCH 
3/6] #14406: Upload perf report to GH --- .../t3000-model-perf-tests-impl.yaml | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/.github/workflows/t3000-model-perf-tests-impl.yaml b/.github/workflows/t3000-model-perf-tests-impl.yaml index f1d0090bda3..3c0bd44cdcf 100644 --- a/.github/workflows/t3000-model-perf-tests-impl.yaml +++ b/.github/workflows/t3000-model-perf-tests-impl.yaml @@ -73,21 +73,25 @@ jobs: id: check-perf-report if: ${{ !cancelled() }} run: | - ls -hal TODAY=$(date +%Y_%m_%d) PERF_REPORT_FILENAME_MODELS="Models_Perf_${TODAY}.csv" PERF_REPORT_FILENAME_CCL="CCL_Perf_${TODAY}.csv" - if [ -f "$PERF_REPORT_FILENAME_MODELS" ]; then - echo "Found Models Perf report: $PERF_REPORT_FILENAME_MODELS" - echo "perf_report_filename=$PERF_REPORT_FILENAME_MODELS" >> "$GITHUB_OUTPUT" - ls -hal "$PERF_REPORT_FILENAME_MODELS" - elif [ -f "$PERF_REPORT_FILENAME_CCL" ]; then - echo "Found CCL Perf report: $PERF_REPORT_FILENAME_CCL" - echo "perf_report_filename=$PERF_REPORT_FILENAME_CCL" >> "$GITHUB_OUTPUT" - ls -hal "$PERF_REPORT_FILENAME_CCL" + if [ "${{ matrix.test-group.tracy }}" == "true" ]; then + if [ -f "$PERF_REPORT_FILENAME_CCL" ]; then + echo "Found CCL Perf report: $PERF_REPORT_FILENAME_CCL" + echo "perf_report_filename=$PERF_REPORT_FILENAME_CCL" >> "$GITHUB_OUTPUT" + else + echo "No CCL perf report found for today." + exit 1 + fi else - echo "No perf report found." - exit 1 + if [ -f "$PERF_REPORT_FILENAME_MODELS" ]; then + echo "Found Models Perf report: $PERF_REPORT_FILENAME_MODELS" + echo "perf_report_filename=$PERF_REPORT_FILENAME_MODELS" >> "$GITHUB_OUTPUT" + else + echo "No Models perf report found for today." 
+ exit 1 + fi fi - name: Upload perf report if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }} From c8881cf3998ac3db929eb9f36c672624f7f1d9df Mon Sep 17 00:00:00 2001 From: Aswinmcw Date: Thu, 7 Nov 2024 07:37:17 +0000 Subject: [PATCH 4/6] #14406: Mark fail if csv not generated --- .../unit_tests/operations/ccl/perf/run_all_gather_profile.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh b/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh index 8422bde56d0..69e34a86b22 100755 --- a/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh +++ b/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh @@ -90,6 +90,7 @@ print(tabulate(average_df, headers='keys', tablefmt='pretty')) echo "$average_values" else echo "CSV path not found in the command output." + exit 1 fi } From b835deb3e8ff2dedb2d584d860b66da86acf1b9e Mon Sep 17 00:00:00 2001 From: Aswinmcw Date: Fri, 8 Nov 2024 08:08:56 +0000 Subject: [PATCH 5/6] #14406: Improved print messages --- .../t3000-model-perf-tests-impl.yaml | 2 +- .../t3000/run_t3000_model_perf_tests.sh | 10 +++---- .../ccl/perf/run_all_gather_profile.sh | 29 +++++++++++------- .../ccl/perf/run_reduce_scatter_profile.sh | 30 ++++++++++++------- 4 files changed, 45 insertions(+), 26 deletions(-) diff --git a/.github/workflows/t3000-model-perf-tests-impl.yaml b/.github/workflows/t3000-model-perf-tests-impl.yaml index 3c0bd44cdcf..f092abf18d1 100644 --- a/.github/workflows/t3000-model-perf-tests-impl.yaml +++ b/.github/workflows/t3000-model-perf-tests-impl.yaml @@ -22,7 +22,7 @@ jobs: { name: "t3k LLM llama3 model perf tests", model: "llama3", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_llama3_tests, timeout: 60, owner_id: U03PUAKE719}, # Miguel Tairum { name: "t3k LLM falcon40b model perf tests", model: "falcon40b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 75, 
owner_id: U053W15B6JF}, # Djordje Ivanovic { name: "t3k CNN resnet50 model perf tests", model: "resnet50", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_resnet50_tests, timeout: 75, owner_id: U013121KDH9}, # Austin Ho - { name: "t3k CCL perf tests", arch: wormhole_b0, cmd: run_t3000_ccl_perf_tests, timeout: 75, tracy: true, owner_id: ULMEPM2MA}, # Sean Nijjar + { name: "t3k CCL all_gather perf tests", arch: wormhole_b0, cmd: run_t3000_ccl_all_gather_perf_tests, timeout: 75, tracy: true, owner_id: ULMEPM2MA}, # Sean Nijjar #{ name: "t3k CNN model perf tests ", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_cnn_tests, timeout: 120, owner_id: }, #No tests are being run? ] name: ${{ matrix.test-group.name }} diff --git a/tests/scripts/t3000/run_t3000_model_perf_tests.sh b/tests/scripts/t3000/run_t3000_model_perf_tests.sh index 7c0d0757a09..19a54d710b1 100755 --- a/tests/scripts/t3000/run_t3000_model_perf_tests.sh +++ b/tests/scripts/t3000/run_t3000_model_perf_tests.sh @@ -142,20 +142,20 @@ run_t3000_resnet50_tests() { fi } -run_t3000_ccl_perf_tests() { +run_t3000_ccl_all_gather_perf_tests() { # Record the start time fail=0 start_time=$(date +%s) - echo "LOG_METAL: Running run_t3000_ccl_perf_tests" + echo "LOG_METAL: Running run_t3000_ccl_all_gather_perf_tests" - tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh -t t3000 + tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh -t t3000 fail+=$? 
# Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) - echo "LOG_METAL: run_t3000_ccl_perf_tests $duration seconds to complete" + echo "LOG_METAL: run_t3000_ccl_all_gather_perf_tests $duration seconds to complete" if [[ $fail -ne 0 ]]; then exit 1 fi @@ -194,7 +194,7 @@ run_t3000_cnn_tests() { run_t3000_ccl_tests() { # Run ccl performance tests - run_t3000_ccl_perf_tests + run_t3000_ccl_all_gather_perf_tests } diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh b/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh index 69e34a86b22..0e714429b88 100755 --- a/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh +++ b/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh @@ -72,22 +72,31 @@ run_profile_and_extract_csv() { if [ -n "$csv_path" ]; then echo "CSV path found: $csv_path" + echo "Generating performance report..." - # Run the Python script to generate performance report - average_values=$(PYTHONPATH="$MODULE_DIR" python3 -c " + tmp_file="/tmp/perf_report_output.log" + PYTHONPATH="$MODULE_DIR" python3 -c " +import sys import pandas as pd from perf_csv import perf_report from tabulate import tabulate -# Generate the report and convert it to a DataFrame -average_df = perf_report('$csv_path') -# Print the DataFrame in a pretty table format -print(tabulate(average_df, headers='keys', tablefmt='pretty')) -") +try: + # Generate the report and convert it to a DataFrame + average_df = perf_report('$csv_path') + # Print the DataFrame in a pretty table format + print('Min - Avg - Max by Common Runs:') + print(tabulate(average_df, headers='keys', tablefmt='pretty')) +except Exception as e: + print(f'Error in performance report generation: {e}', file=sys.stderr) + sys.exit(1) +" 2>&1 | tee "$tmp_file" + + if grep -q "Error in performance report generation" "$tmp_file"; then + echo "Error: Performance report generation failed." 
+ exit 1 + fi - # Print the output - echo "Min - Avg - Max by Common Runs:" - echo "$average_values" else echo "CSV path not found in the command output." exit 1 diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh b/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh index 23071225ac1..2f054ca348c 100755 --- a/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh +++ b/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh @@ -72,24 +72,34 @@ run_profile_and_extract_csv() { if [ -n "$csv_path" ]; then echo "CSV path found: $csv_path" + echo "Generating performance report..." - # Run the Python script to generate performance report - average_values=$(PYTHONPATH="$MODULE_DIR" python3 -c " + tmp_file="/tmp/perf_report_output.log" + PYTHONPATH="$MODULE_DIR" python3 -c " +import sys import pandas as pd from perf_csv import perf_report from tabulate import tabulate -# Generate the report and convert it to a DataFrame -average_df = perf_report('$csv_path') -# Print the DataFrame in a pretty table format -print(tabulate(average_df, headers='keys', tablefmt='pretty')) -") +try: + # Generate the report and convert it to a DataFrame + average_df = perf_report('$csv_path') + # Print the DataFrame in a pretty table format + print('Min - Avg - Max by Common Runs:') + print(tabulate(average_df, headers='keys', tablefmt='pretty')) +except Exception as e: + print(f'Error in performance report generation: {e}', file=sys.stderr) + sys.exit(1) +" 2>&1 | tee "$tmp_file" + + if grep -q "Error in performance report generation" "$tmp_file"; then + echo "Error: Performance report generation failed." + exit 1 + fi - # Print the output - echo "Min - Avg - Max by Common Runs:" - echo "$average_values" else echo "CSV path not found in the command output." 
+ exit 1 fi } From 97cae032ef1e570ca75b57e99d0720eb33bc0e0c Mon Sep 17 00:00:00 2001 From: Aswinmcw Date: Fri, 8 Nov 2024 10:13:57 +0000 Subject: [PATCH 6/6] #0: ci check --- .github/workflows/t3000-model-perf-tests-impl.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/t3000-model-perf-tests-impl.yaml b/.github/workflows/t3000-model-perf-tests-impl.yaml index f092abf18d1..c104d01fbaa 100644 --- a/.github/workflows/t3000-model-perf-tests-impl.yaml +++ b/.github/workflows/t3000-model-perf-tests-impl.yaml @@ -47,19 +47,24 @@ jobs: echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV - name: Download profiler build artifact + id: download-profiler-artifact if: ${{ matrix.test-group.tracy }} uses: actions/download-artifact@v4 with: name: TTMetal_build_${{ matrix.test-group.arch }}_profiler + continue-on-error: true - name: Download build artifact + id: download-artifact if: ${{ !matrix.test-group.tracy }} uses: actions/download-artifact@v4 with: name: TTMetal_build_${{ matrix.test-group.arch }} - name: Extract files + if: ${{ matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy }} run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar - uses: ./.github/actions/install-python-deps - name: Run model perf regression tests + if: ${{ matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy }} shell: bash {0} timeout-minutes: ${{ matrix.test-group.timeout }} run: | @@ -71,7 +76,7 @@ jobs: env python models/perf/merge_perf_results.py - name: Check perf report exists id: check-perf-report - if: ${{ !cancelled() }} + if: ${{ !cancelled() && (matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy) }} run: | TODAY=$(date +%Y_%m_%d) PERF_REPORT_FILENAME_MODELS="Models_Perf_${TODAY}.csv"