diff --git a/.github/workflows/t3000-model-perf-tests-impl.yaml b/.github/workflows/t3000-model-perf-tests-impl.yaml
index bdee29373ce..0a2bd744f14 100644
--- a/.github/workflows/t3000-model-perf-tests-impl.yaml
+++ b/.github/workflows/t3000-model-perf-tests-impl.yaml
@@ -22,7 +22,7 @@ jobs:
           { name: "t3k LLM llama3 model perf tests", model: "llama3", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_llama3_tests, timeout: 60, owner_id: U03PUAKE719}, # Miguel Tairum
           { name: "t3k LLM falcon40b model perf tests", model: "falcon40b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 75, owner_id: U053W15B6JF}, # Djordje Ivanovic
           { name: "t3k CNN resnet50 model perf tests", model: "resnet50", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_resnet50_tests, timeout: 75, owner_id: U013121KDH9}, # Austin Ho
-          { name: "t3k CCL perf tests", arch: wormhole_b0, cmd: run_t3000_ccl_perf_tests, timeout: 75, tracy: true, owner_id: ULMEPM2MA}, # Sean Nijjar
+          { name: "t3k CCL all_gather perf tests", arch: wormhole_b0, cmd: run_t3000_ccl_all_gather_perf_tests, timeout: 75, tracy: true, owner_id: ULMEPM2MA}, # Sean Nijjar
           #{ name: "t3k CNN model perf tests ", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_cnn_tests, timeout: 120, owner_id: }, #No tests are being run?
         ]
     name: ${{ matrix.test-group.name }}
diff --git a/tests/scripts/t3000/run_t3000_model_perf_tests.sh b/tests/scripts/t3000/run_t3000_model_perf_tests.sh
index 7c0d0757a09..19a54d710b1 100755
--- a/tests/scripts/t3000/run_t3000_model_perf_tests.sh
+++ b/tests/scripts/t3000/run_t3000_model_perf_tests.sh
@@ -142,20 +142,20 @@ run_t3000_resnet50_tests() {
   fi
 }
 
-run_t3000_ccl_perf_tests() {
+run_t3000_ccl_all_gather_perf_tests() {
   # Record the start time
   fail=0
   start_time=$(date +%s)
 
-  echo "LOG_METAL: Running run_t3000_ccl_perf_tests"
+  echo "LOG_METAL: Running run_t3000_ccl_all_gather_perf_tests"
 
-  tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh -t t3000
+  tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh -t t3000
   fail+=$?
 
   # Record the end time
   end_time=$(date +%s)
   duration=$((end_time - start_time))
-  echo "LOG_METAL: run_t3000_ccl_perf_tests $duration seconds to complete"
+  echo "LOG_METAL: run_t3000_ccl_all_gather_perf_tests $duration seconds to complete"
   if [[ $fail -ne 0 ]]; then
     exit 1
   fi
@@ -194,7 +194,7 @@ run_t3000_cnn_tests() {
 
 run_t3000_ccl_tests() {
   # Run ccl performance tests
-  run_t3000_ccl_perf_tests
+  run_t3000_ccl_all_gather_perf_tests
 }
 
 
diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh b/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh
index 69e34a86b22..0e714429b88 100755
--- a/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh
+++ b/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh
@@ -72,22 +72,31 @@ run_profile_and_extract_csv() {
 
     if [ -n "$csv_path" ]; then
         echo "CSV path found: $csv_path"
+        echo "Generating performance report..."
 
-        # Run the Python script to generate performance report
-        average_values=$(PYTHONPATH="$MODULE_DIR" python3 -c "
+        tmp_file="/tmp/perf_report_output.log"
+        PYTHONPATH="$MODULE_DIR" python3 -c "
+import sys
 import pandas as pd
 from perf_csv import perf_report
 from tabulate import tabulate
 
-# Generate the report and convert it to a DataFrame
-average_df = perf_report('$csv_path')
-# Print the DataFrame in a pretty table format
-print(tabulate(average_df, headers='keys', tablefmt='pretty'))
-")
+try:
+    # Generate the report and convert it to a DataFrame
+    average_df = perf_report('$csv_path')
+    # Print the DataFrame in a pretty table format
+    print('Min - Avg - Max by Common Runs:')
+    print(tabulate(average_df, headers='keys', tablefmt='pretty'))
+except Exception as e:
+    print(f'Error in performance report generation: {e}', file=sys.stderr)
+    sys.exit(1)
+" 2>&1 | tee "$tmp_file"
+
+        if grep -q "Error in performance report generation" "$tmp_file"; then
+            echo "Error: Performance report generation failed."
+            exit 1
+        fi
 
-        # Print the output
-        echo "Min - Avg - Max by Common Runs:"
-        echo "$average_values"
     else
         echo "CSV path not found in the command output."
         exit 1
diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh b/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh
index 23071225ac1..2f054ca348c 100755
--- a/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh
+++ b/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh
@@ -72,24 +72,34 @@ run_profile_and_extract_csv() {
 
     if [ -n "$csv_path" ]; then
         echo "CSV path found: $csv_path"
+        echo "Generating performance report..."
 
-        # Run the Python script to generate performance report
-        average_values=$(PYTHONPATH="$MODULE_DIR" python3 -c "
+        tmp_file="/tmp/perf_report_output.log"
+        PYTHONPATH="$MODULE_DIR" python3 -c "
+import sys
 import pandas as pd
 from perf_csv import perf_report
 from tabulate import tabulate
 
-# Generate the report and convert it to a DataFrame
-average_df = perf_report('$csv_path')
-# Print the DataFrame in a pretty table format
-print(tabulate(average_df, headers='keys', tablefmt='pretty'))
-")
+try:
+    # Generate the report and convert it to a DataFrame
+    average_df = perf_report('$csv_path')
+    # Print the DataFrame in a pretty table format
+    print('Min - Avg - Max by Common Runs:')
+    print(tabulate(average_df, headers='keys', tablefmt='pretty'))
+except Exception as e:
+    print(f'Error in performance report generation: {e}', file=sys.stderr)
+    sys.exit(1)
+" 2>&1 | tee "$tmp_file"
+
+        if grep -q "Error in performance report generation" "$tmp_file"; then
+            echo "Error: Performance report generation failed."
+            exit 1
+        fi
 
-        # Print the output
-        echo "Min - Avg - Max by Common Runs:"
-        echo "$average_values"
     else
         echo "CSV path not found in the command output."
+        exit 1
     fi
 }
 