diff --git a/.github/workflows/t3000-model-perf-tests-impl.yaml b/.github/workflows/t3000-model-perf-tests-impl.yaml
index 91e208c214b..c104d01fbaa 100644
--- a/.github/workflows/t3000-model-perf-tests-impl.yaml
+++ b/.github/workflows/t3000-model-perf-tests-impl.yaml
@@ -22,6 +22,7 @@ jobs:
           { name: "t3k LLM llama3 model perf tests", model: "llama3", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_llama3_tests, timeout: 60, owner_id: U03PUAKE719}, # Miguel Tairum
           { name: "t3k LLM falcon40b model perf tests", model: "falcon40b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 75, owner_id: U053W15B6JF}, # Djordje Ivanovic
           { name: "t3k CNN resnet50 model perf tests", model: "resnet50", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_resnet50_tests, timeout: 75, owner_id: U013121KDH9}, # Austin Ho
+          { name: "t3k CCL all_gather perf tests", arch: wormhole_b0, cmd: run_t3000_ccl_all_gather_perf_tests, timeout: 75, tracy: true, owner_id: ULMEPM2MA}, # Sean Nijjar
           #{ name: "t3k CNN model perf tests ", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_cnn_tests, timeout: 120, owner_id: }, #No tests are being run?
         ]
     name: ${{ matrix.test-group.name }}
@@ -45,13 +46,25 @@ jobs:
         run: |
           echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
           echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
-      - uses: actions/download-artifact@v4
+      - name: Download profiler build artifact
+        id: download-profiler-artifact
+        if: ${{ matrix.test-group.tracy }}
+        uses: actions/download-artifact@v4
+        with:
+          name: TTMetal_build_${{ matrix.test-group.arch }}_profiler
+        continue-on-error: true
+      - name: Download build artifact
+        id: download-artifact
+        if: ${{ !matrix.test-group.tracy }}
+        uses: actions/download-artifact@v4
         with:
           name: TTMetal_build_${{ matrix.test-group.arch }}
       - name: Extract files
+        if: ${{ matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy }}
        run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
       - uses: ./.github/actions/install-python-deps
       - name: Run model perf regression tests
+        if: ${{ matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy }}
         shell: bash {0}
         timeout-minutes: ${{ matrix.test-group.timeout }}
         run: |
@@ -63,12 +76,28 @@ jobs:
           env python models/perf/merge_perf_results.py
       - name: Check perf report exists
         id: check-perf-report
-        if: ${{ !cancelled() }}
+        if: ${{ !cancelled() && (matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy) }}
         run: |
-          ls -hal
-          export PERF_REPORT_FILENAME="Models_Perf_$(date +%Y_%m_%d).csv"
-          ls -hal $PERF_REPORT_FILENAME
-          echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT"
+          TODAY=$(date +%Y_%m_%d)
+          PERF_REPORT_FILENAME_MODELS="Models_Perf_${TODAY}.csv"
+          PERF_REPORT_FILENAME_CCL="CCL_Perf_${TODAY}.csv"
+          if [ "${{ matrix.test-group.tracy }}" == "true" ]; then
+            if [ -f "$PERF_REPORT_FILENAME_CCL" ]; then
+              echo "Found CCL Perf report: $PERF_REPORT_FILENAME_CCL"
+              echo "perf_report_filename=$PERF_REPORT_FILENAME_CCL" >> "$GITHUB_OUTPUT"
+            else
+              echo "No CCL perf report found for today."
+              exit 1
+            fi
+          else
+            if [ -f "$PERF_REPORT_FILENAME_MODELS" ]; then
+              echo "Found Models Perf report: $PERF_REPORT_FILENAME_MODELS"
+              echo "perf_report_filename=$PERF_REPORT_FILENAME_MODELS" >> "$GITHUB_OUTPUT"
+            else
+              echo "No Models perf report found for today."
+              exit 1
+            fi
+          fi
       - name: Upload perf report
         if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }}
         uses: actions/upload-artifact@v4
diff --git a/.github/workflows/t3000-model-perf-tests.yaml b/.github/workflows/t3000-model-perf-tests.yaml
index 0a8759af27c..15d96746889 100644
--- a/.github/workflows/t3000-model-perf-tests.yaml
+++ b/.github/workflows/t3000-model-perf-tests.yaml
@@ -11,7 +11,13 @@ jobs:
     with:
       arch: '["wormhole_b0"]'
     secrets: inherit
+  build-artifact-profiler:
+    uses: ./.github/workflows/build-artifact.yaml
+    with:
+      arch: '["wormhole_b0"]'
+      tracy: true
+    secrets: inherit
   t3000-model-perf-tests:
-    needs: build-artifact
+    needs: [build-artifact, build-artifact-profiler]
     secrets: inherit
     uses: ./.github/workflows/t3000-model-perf-tests-impl.yaml
diff --git a/tests/scripts/t3000/run_t3000_model_perf_tests.sh b/tests/scripts/t3000/run_t3000_model_perf_tests.sh
index 70baaa85ae3..19a54d710b1 100755
--- a/tests/scripts/t3000/run_t3000_model_perf_tests.sh
+++ b/tests/scripts/t3000/run_t3000_model_perf_tests.sh
@@ -142,6 +142,25 @@ run_t3000_resnet50_tests() {
   fi
 }
 
+run_t3000_ccl_all_gather_perf_tests() {
+  # Record the start time
+  fail=0
+  start_time=$(date +%s)
+
+  echo "LOG_METAL: Running run_t3000_ccl_all_gather_perf_tests"
+
+  tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh -t t3000
+  fail+=$?
+
+  # Record the end time
+  end_time=$(date +%s)
+  duration=$((end_time - start_time))
+  echo "LOG_METAL: run_t3000_ccl_all_gather_perf_tests $duration seconds to complete"
+  if [[ $fail -ne 0 ]]; then
+    exit 1
+  fi
+}
+
 run_t3000_llm_tests() {
   # Run falcon7b tests
   run_t3000_falcon7b_tests
@@ -173,6 +192,12 @@ run_t3000_cnn_tests() {
   env python models/perf/merge_perf_results.py
 }
 
+run_t3000_ccl_tests() {
+  # Run ccl performance tests
+  run_t3000_ccl_all_gather_perf_tests
+
+}
+
 fail=0
 main() {
   # For CI pipeline - source func commands but don't execute tests if not invoked directly
@@ -219,8 +244,10 @@ main() {
     run_t3000_llm_tests
   elif [[ "$pipeline_type" == "cnn_model_perf_t3000_device" ]]; then
     run_t3000_cnn_tests
+  elif [[ "$pipeline_type" == "ccl_perf_t3000_device" ]]; then
+    run_t3000_ccl_tests
   else
-    echo "$pipeline_type is invalid (supported: [cnn_model_perf_t3000_device, cnn_model_perf_t3000_device])" 2>&1
+    echo "$pipeline_type is invalid (supported: [cnn_model_perf_t3000_device, cnn_model_perf_t3000_device, ccl_perf_t3000_device])" 2>&1
     exit 1
   fi
 
diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py b/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py
index 31f4636aa66..3d5cc2aaeb5 100644
--- a/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py
+++ b/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py
@@ -5,6 +5,7 @@
 import pandas as pd
 import os
 import re
+import time
 
 
 def perf_report(file_path):
@@ -214,10 +215,12 @@ def calculate_bandwidth(row):
 
     averages_df = pd.DataFrame(averages_data)
 
-    averages_file_path = file_path.replace(".csv", "_averages.csv")
+    today = time.strftime("%Y_%m_%d")
+    ccl_perf_file_path = f"CCL_Perf_{today}.csv"
+    os.rename(file_path, ccl_perf_file_path)
 
-    averages_df.to_csv(averages_file_path, index=False)
+    averages_df.to_csv(ccl_perf_file_path, index=False)
 
-    print(f"Averages CSV saved to: {averages_file_path}")
+    print(f"CCL Perf report CSV saved to: {ccl_perf_file_path}")
 
     return averages_df
diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh b/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh
index 8422bde56d0..0e714429b88 100755
--- a/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh
+++ b/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh
@@ -72,24 +72,34 @@ run_profile_and_extract_csv() {
 
   if [ -n "$csv_path" ]; then
     echo "CSV path found: $csv_path"
+    echo "Generating performance report..."
 
-    # Run the Python script to generate performance report
-    average_values=$(PYTHONPATH="$MODULE_DIR" python3 -c "
+    tmp_file="/tmp/perf_report_output.log"
+    PYTHONPATH="$MODULE_DIR" python3 -c "
+import sys
 import pandas as pd
 from perf_csv import perf_report
 from tabulate import tabulate
 
-# Generate the report and convert it to a DataFrame
-average_df = perf_report('$csv_path')
-# Print the DataFrame in a pretty table format
-print(tabulate(average_df, headers='keys', tablefmt='pretty'))
-")
+try:
+    # Generate the report and convert it to a DataFrame
+    average_df = perf_report('$csv_path')
+    # Print the DataFrame in a pretty table format
+    print('Min - Avg - Max by Common Runs:')
+    print(tabulate(average_df, headers='keys', tablefmt='pretty'))
+except Exception as e:
+    print(f'Error in performance report generation: {e}', file=sys.stderr)
+    sys.exit(1)
+" 2>&1 | tee "$tmp_file"
+
+    if grep -q "Error in performance report generation" "$tmp_file"; then
+      echo "Error: Performance report generation failed."
+      exit 1
+    fi
 
-    # Print the output
-    echo "Min - Avg - Max by Common Runs:"
-    echo "$average_values"
   else
     echo "CSV path not found in the command output."
+    exit 1
   fi
 }
 
diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh b/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh
index 23071225ac1..2f054ca348c 100755
--- a/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh
+++ b/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh
@@ -72,24 +72,34 @@ run_profile_and_extract_csv() {
 
   if [ -n "$csv_path" ]; then
     echo "CSV path found: $csv_path"
+    echo "Generating performance report..."
 
-    # Run the Python script to generate performance report
-    average_values=$(PYTHONPATH="$MODULE_DIR" python3 -c "
+    tmp_file="/tmp/perf_report_output.log"
+    PYTHONPATH="$MODULE_DIR" python3 -c "
+import sys
 import pandas as pd
 from perf_csv import perf_report
 from tabulate import tabulate
 
-# Generate the report and convert it to a DataFrame
-average_df = perf_report('$csv_path')
-# Print the DataFrame in a pretty table format
-print(tabulate(average_df, headers='keys', tablefmt='pretty'))
-")
+try:
+    # Generate the report and convert it to a DataFrame
+    average_df = perf_report('$csv_path')
+    # Print the DataFrame in a pretty table format
+    print('Min - Avg - Max by Common Runs:')
+    print(tabulate(average_df, headers='keys', tablefmt='pretty'))
+except Exception as e:
+    print(f'Error in performance report generation: {e}', file=sys.stderr)
+    sys.exit(1)
+" 2>&1 | tee "$tmp_file"
+
+    if grep -q "Error in performance report generation" "$tmp_file"; then
+      echo "Error: Performance report generation failed."
+      exit 1
+    fi
 
-    # Print the output
-    echo "Min - Avg - Max by Common Runs:"
-    echo "$average_values"
   else
     echo "CSV path not found in the command output."
+    exit 1
   fi
 }
 
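Note on the error handling added to both profile scripts: because the inline Python is piped through `tee`, the pipeline's exit status is `tee`'s rather than `python3`'s, so the scripts detect failure by grepping the teed log for the sentinel message instead of checking `$?`. A minimal standalone sketch of that pattern (the log path and the deliberately failing call are illustrative only, not part of the patch):

```bash
#!/usr/bin/env bash
# Sketch of the sentinel-based failure detection used in the profile scripts.
tmp_file="/tmp/perf_report_output.log"

python3 -c "
import sys
try:
    raise RuntimeError('demo failure')  # stand-in for perf_report('<csv_path>')
except Exception as e:
    print(f'Error in performance report generation: {e}', file=sys.stderr)
    sys.exit(1)
" 2>&1 | tee "$tmp_file"
# The pipeline's exit code above is tee's (0), so inspect the captured output instead.

if grep -q "Error in performance report generation" "$tmp_file"; then
  echo "Error: Performance report generation failed."
  exit 1
fi
```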