#14406: Add CCL Perf tests to pipeline (#14836)
### Ticket
#14406 

### Problem description
CCL perf tests need to be triggered as part of the CI pipeline.

### What's changed
Adds CCL perf tests to the T3K model perf pipeline.

T3K Model Perf -
https://github.com/tenstorrent/tt-metal/actions/runs/11718145347/job/32639080900
Perf Artifact -
https://github.com/tenstorrent/tt-metal/actions/runs/11718145347/job/32639080900#step:13:33
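
For local verification, something like the sketch below should exercise the new entry point. It assumes `pipeline_type` is read from the environment by the dispatch in `run_t3000_model_perf_tests.sh`, and that the repo-root environment variables are set the same way the workflow sets them; adjust for your setup.

```bash
# Hypothetical local run of the new CCL perf pipeline type on a T3K machine.
# Assumes the dispatch picks pipeline_type up from the environment, as CI appears to do.
export TT_METAL_HOME=$(pwd)
export PYTHONPATH=$(pwd)
export pipeline_type="ccl_perf_t3000_device"
./tests/scripts/t3000/run_t3000_model_perf_tests.sh
```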

### Checklist
- [ ] Post commit CI passes
- [ ] Blackhole Post commit (if applicable)
- [ ] Model regression CI testing passes (if applicable)
- [ ] Device performance regression CI testing passes (if applicable)
- [ ] New/Existing tests provide coverage for changes
Aswinmcw authored Nov 12, 2024
1 parent ef71901 commit 9387e7b
Showing 6 changed files with 116 additions and 31 deletions.
41 changes: 35 additions & 6 deletions .github/workflows/t3000-model-perf-tests-impl.yaml
@@ -22,6 +22,7 @@ jobs:
{ name: "t3k LLM llama3 model perf tests", model: "llama3", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_llama3_tests, timeout: 60, owner_id: U03PUAKE719}, # Miguel Tairum
{ name: "t3k LLM falcon40b model perf tests", model: "falcon40b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 75, owner_id: U053W15B6JF}, # Djordje Ivanovic
{ name: "t3k CNN resnet50 model perf tests", model: "resnet50", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_resnet50_tests, timeout: 75, owner_id: U013121KDH9}, # Austin Ho
{ name: "t3k CCL all_gather perf tests", arch: wormhole_b0, cmd: run_t3000_ccl_all_gather_perf_tests, timeout: 75, tracy: true, owner_id: ULMEPM2MA}, # Sean Nijjar
#{ name: "t3k CNN model perf tests ", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_cnn_tests, timeout: 120, owner_id: }, #No tests are being run?
]
name: ${{ matrix.test-group.name }}
@@ -45,13 +46,25 @@ jobs:
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
- name: Download profiler build artifact
id: download-profiler-artifact
if: ${{ matrix.test-group.tracy }}
uses: actions/download-artifact@v4
with:
name: TTMetal_build_${{ matrix.test-group.arch }}_profiler
continue-on-error: true
- name: Download build artifact
id: download-artifact
if: ${{ !matrix.test-group.tracy }}
uses: actions/download-artifact@v4
with:
name: TTMetal_build_${{ matrix.test-group.arch }}
- name: Extract files
if: ${{ matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy }}
run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
- uses: ./.github/actions/install-python-deps
- name: Run model perf regression tests
if: ${{ matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy }}
shell: bash {0}
timeout-minutes: ${{ matrix.test-group.timeout }}
run: |
Expand All @@ -63,12 +76,28 @@ jobs:
env python models/perf/merge_perf_results.py
- name: Check perf report exists
id: check-perf-report
if: ${{ !cancelled() }}
if: ${{ !cancelled() && (matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy) }}
run: |
ls -hal
export PERF_REPORT_FILENAME="Models_Perf_$(date +%Y_%m_%d).csv"
ls -hal $PERF_REPORT_FILENAME
echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT"
TODAY=$(date +%Y_%m_%d)
PERF_REPORT_FILENAME_MODELS="Models_Perf_${TODAY}.csv"
PERF_REPORT_FILENAME_CCL="CCL_Perf_${TODAY}.csv"
if [ "${{ matrix.test-group.tracy }}" == "true" ]; then
if [ -f "$PERF_REPORT_FILENAME_CCL" ]; then
echo "Found CCL Perf report: $PERF_REPORT_FILENAME_CCL"
echo "perf_report_filename=$PERF_REPORT_FILENAME_CCL" >> "$GITHUB_OUTPUT"
else
echo "No CCL perf report found for today."
exit 1
fi
else
if [ -f "$PERF_REPORT_FILENAME_MODELS" ]; then
echo "Found Models Perf report: $PERF_REPORT_FILENAME_MODELS"
echo "perf_report_filename=$PERF_REPORT_FILENAME_MODELS" >> "$GITHUB_OUTPUT"
else
echo "No Models perf report found for today."
exit 1
fi
fi
- name: Upload perf report
if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }}
uses: actions/upload-artifact@v4
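
For context on the `perf_report_filename` output written by the "Check perf report exists" step: it publishes the chosen CSV name via `$GITHUB_OUTPUT`, and the "Upload perf report" step (truncated above) presumably consumes it as a step output. A minimal sketch of that handoff; the consumer expression is an assumption, since the upload step's body is not shown here:

```bash
# Producer side (matches the check step above): write the chosen report name as a step output.
echo "perf_report_filename=CCL_Perf_$(date +%Y_%m_%d).csv" >> "$GITHUB_OUTPUT"
# Consumer side (assumption): the upload step would reference it roughly as
#   path: ${{ steps.check-perf-report.outputs.perf_report_filename }}
```
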
8 changes: 7 additions & 1 deletion .github/workflows/t3000-model-perf-tests.yaml
@@ -11,7 +11,13 @@ jobs:
with:
arch: '["wormhole_b0"]'
secrets: inherit
build-artifact-profiler:
uses: ./.github/workflows/build-artifact.yaml
with:
arch: '["wormhole_b0"]'
tracy: true
secrets: inherit
t3000-model-perf-tests:
needs: build-artifact
needs: [build-artifact, build-artifact-profiler]
secrets: inherit
uses: ./.github/workflows/t3000-model-perf-tests-impl.yaml
29 changes: 28 additions & 1 deletion tests/scripts/t3000/run_t3000_model_perf_tests.sh
@@ -142,6 +142,25 @@ run_t3000_resnet50_tests() {
fi
}

run_t3000_ccl_all_gather_perf_tests() {
# Record the start time
fail=0
start_time=$(date +%s)

echo "LOG_METAL: Running run_t3000_ccl_all_gather_perf_tests"

tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh -t t3000
fail+=$?

# Record the end time
end_time=$(date +%s)
duration=$((end_time - start_time))
echo "LOG_METAL: run_t3000_ccl_all_gather_perf_tests $duration seconds to complete"
if [[ $fail -ne 0 ]]; then
exit 1
fi
}

run_t3000_llm_tests() {
# Run falcon7b tests
run_t3000_falcon7b_tests
@@ -173,6 +192,12 @@ run_t3000_cnn_tests() {
env python models/perf/merge_perf_results.py
}

run_t3000_ccl_tests() {
# Run ccl performance tests
run_t3000_ccl_all_gather_perf_tests

}

fail=0
main() {
# For CI pipeline - source func commands but don't execute tests if not invoked directly
@@ -219,8 +244,10 @@ main() {
run_t3000_llm_tests
elif [[ "$pipeline_type" == "cnn_model_perf_t3000_device" ]]; then
run_t3000_cnn_tests
elif [[ "$pipeline_type" == "ccl_perf_t3000_device" ]]; then
run_t3000_ccl_tests
else
echo "$pipeline_type is invalid (supported: [cnn_model_perf_t3000_device, cnn_model_perf_t3000_device])" 2>&1
echo "$pipeline_type is invalid (supported: [cnn_model_perf_t3000_device, cnn_model_perf_t3000_device, ccl_perf_t3000_device])" 2>&1
exit 1
fi

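
Because run_t3000_model_perf_tests.sh only dispatches on `pipeline_type` when invoked directly, the new function can also be exercised on its own. A rough sketch, assuming a T3K machine with the usual tt-metal environment already exported:

```bash
# Source the helper functions without triggering the dispatch, then run only the new CCL perf test.
source tests/scripts/t3000/run_t3000_model_perf_tests.sh
run_t3000_ccl_all_gather_perf_tests
```
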
9 changes: 6 additions & 3 deletions tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py
@@ -5,6 +5,7 @@
import pandas as pd
import os
import re
import time


def perf_report(file_path):
@@ -214,10 +215,12 @@ def calculate_bandwidth(row):

averages_df = pd.DataFrame(averages_data)

averages_file_path = file_path.replace(".csv", "_averages.csv")
today = time.strftime("%Y_%m_%d")
ccl_perf_file_path = f"CCL_Perf_{today}.csv"
os.rename(file_path, ccl_perf_file_path)

averages_df.to_csv(averages_file_path, index=False)
averages_df.to_csv(ccl_perf_file_path, index=False)

print(f"Averages CSV saved to: {averages_file_path}")
print(f"CCL Perf report CSV saved to: {ccl_perf_file_path}")

return averages_df
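
The rename above is what lets the workflow's "Check perf report exists" step find the CCL report by date. A quick sanity check after a tracy run might look like this (sketch; run from the directory where the report is written):

```bash
# Confirm the renamed report exists under the name the workflow checks for today.
ls -hal "CCL_Perf_$(date +%Y_%m_%d).csv"
```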
30 changes: 20 additions & 10 deletions tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh
@@ -72,24 +72,34 @@ run_profile_and_extract_csv() {

if [ -n "$csv_path" ]; then
echo "CSV path found: $csv_path"
echo "Generating performance report..."

# Run the Python script to generate performance report
average_values=$(PYTHONPATH="$MODULE_DIR" python3 -c "
tmp_file="/tmp/perf_report_output.log"
PYTHONPATH="$MODULE_DIR" python3 -c "
import sys
import pandas as pd
from perf_csv import perf_report
from tabulate import tabulate
# Generate the report and convert it to a DataFrame
average_df = perf_report('$csv_path')
# Print the DataFrame in a pretty table format
print(tabulate(average_df, headers='keys', tablefmt='pretty'))
")
try:
# Generate the report and convert it to a DataFrame
average_df = perf_report('$csv_path')
# Print the DataFrame in a pretty table format
print('Min - Avg - Max by Common Runs:')
print(tabulate(average_df, headers='keys', tablefmt='pretty'))
except Exception as e:
print(f'Error in performance report generation: {e}', file=sys.stderr)
sys.exit(1)
" 2>&1 | tee "$tmp_file"

if grep -q "Error in performance report generation" "$tmp_file"; then
echo "Error: Performance report generation failed."
exit 1
fi

# Print the output
echo "Min - Avg - Max by Common Runs:"
echo "$average_values"
else
echo "CSV path not found in the command output."
exit 1
fi
}

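
A note on the tee-plus-grep pattern above: piping the inline `python3 -c` through `tee` means the pipeline's exit status is tee's, so a failing report generation would otherwise go unnoticed; grepping the captured log is what surfaces the failure. An illustrative sketch (not from this PR):

```bash
# Without pipefail, a failure on the left side of the pipe is masked by tee's exit status.
false | tee /tmp/demo.log
echo "pipeline exit status: $?"   # prints 0
# Alternative design: `set -o pipefail` would propagate the python3 exit status through the pipe.
```
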
(Sixth changed file; filename not shown in this view. The diff below mirrors the run_all_gather_profile.sh changes above.)
@@ -72,24 +72,34 @@ run_profile_and_extract_csv() {

if [ -n "$csv_path" ]; then
echo "CSV path found: $csv_path"
echo "Generating performance report..."

# Run the Python script to generate performance report
average_values=$(PYTHONPATH="$MODULE_DIR" python3 -c "
tmp_file="/tmp/perf_report_output.log"
PYTHONPATH="$MODULE_DIR" python3 -c "
import sys
import pandas as pd
from perf_csv import perf_report
from tabulate import tabulate
# Generate the report and convert it to a DataFrame
average_df = perf_report('$csv_path')
# Print the DataFrame in a pretty table format
print(tabulate(average_df, headers='keys', tablefmt='pretty'))
")
try:
# Generate the report and convert it to a DataFrame
average_df = perf_report('$csv_path')
# Print the DataFrame in a pretty table format
print('Min - Avg - Max by Common Runs:')
print(tabulate(average_df, headers='keys', tablefmt='pretty'))
except Exception as e:
print(f'Error in performance report generation: {e}', file=sys.stderr)
sys.exit(1)
" 2>&1 | tee "$tmp_file"

if grep -q "Error in performance report generation" "$tmp_file"; then
echo "Error: Performance report generation failed."
exit 1
fi

# Print the output
echo "Min - Avg - Max by Common Runs:"
echo "$average_values"
else
echo "CSV path not found in the command output."
exit 1
fi
}

