#14406: Add CCL Perf tests to pipeline #14836

Merged: 6 commits, Nov 12, 2024

41 changes: 35 additions & 6 deletions .github/workflows/t3000-model-perf-tests-impl.yaml
@@ -22,6 +22,7 @@ jobs:
{ name: "t3k LLM llama3 model perf tests", model: "llama3", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_llama3_tests, timeout: 60, owner_id: U03PUAKE719}, # Miguel Tairum
{ name: "t3k LLM falcon40b model perf tests", model: "falcon40b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 75, owner_id: U053W15B6JF}, # Djordje Ivanovic
{ name: "t3k CNN resnet50 model perf tests", model: "resnet50", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_resnet50_tests, timeout: 75, owner_id: U013121KDH9}, # Austin Ho
{ name: "t3k CCL all_gather perf tests", arch: wormhole_b0, cmd: run_t3000_ccl_all_gather_perf_tests, timeout: 75, tracy: true, owner_id: ULMEPM2MA}, # Sean Nijjar
#{ name: "t3k CNN model perf tests ", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_cnn_tests, timeout: 120, owner_id: }, #No tests are being run?
]
name: ${{ matrix.test-group.name }}
@@ -45,13 +46,25 @@ jobs:
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
- name: Download profiler build artifact
id: download-profiler-artifact
if: ${{ matrix.test-group.tracy }}
uses: actions/download-artifact@v4
with:
name: TTMetal_build_${{ matrix.test-group.arch }}_profiler
continue-on-error: true
- name: Download build artifact
id: download-artifact
if: ${{ !matrix.test-group.tracy }}
uses: actions/download-artifact@v4
with:
name: TTMetal_build_${{ matrix.test-group.arch }}
- name: Extract files
if: ${{ matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy }}
run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
- uses: ./.github/actions/install-python-deps
- name: Run model perf regression tests
if: ${{ matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy }}
shell: bash {0}
timeout-minutes: ${{ matrix.test-group.timeout }}
run: |
@@ -63,12 +76,28 @@ jobs:
env python models/perf/merge_perf_results.py
- name: Check perf report exists
id: check-perf-report
if: ${{ !cancelled() }}
if: ${{ !cancelled() && (matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy) }}
run: |
ls -hal
export PERF_REPORT_FILENAME="Models_Perf_$(date +%Y_%m_%d).csv"
ls -hal $PERF_REPORT_FILENAME
echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT"
TODAY=$(date +%Y_%m_%d)
PERF_REPORT_FILENAME_MODELS="Models_Perf_${TODAY}.csv"
PERF_REPORT_FILENAME_CCL="CCL_Perf_${TODAY}.csv"
if [ "${{ matrix.test-group.tracy }}" == "true" ]; then
if [ -f "$PERF_REPORT_FILENAME_CCL" ]; then
echo "Found CCL Perf report: $PERF_REPORT_FILENAME_CCL"
echo "perf_report_filename=$PERF_REPORT_FILENAME_CCL" >> "$GITHUB_OUTPUT"
else
echo "No CCL perf report found for today."
exit 1
fi
else
if [ -f "$PERF_REPORT_FILENAME_MODELS" ]; then
echo "Found Models Perf report: $PERF_REPORT_FILENAME_MODELS"
echo "perf_report_filename=$PERF_REPORT_FILENAME_MODELS" >> "$GITHUB_OUTPUT"
else
echo "No Models perf report found for today."
exit 1
fi
fi
- name: Upload perf report
if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }}
uses: actions/upload-artifact@v4
8 changes: 7 additions & 1 deletion .github/workflows/t3000-model-perf-tests.yaml
@@ -11,7 +11,13 @@ jobs:
with:
arch: '["wormhole_b0"]'
secrets: inherit
build-artifact-profiler:
uses: ./.github/workflows/build-artifact.yaml
with:
arch: '["wormhole_b0"]'
tracy: true
secrets: inherit
t3000-model-perf-tests:
needs: build-artifact
needs: [build-artifact, build-artifact-profiler]
Contributor:

Can we just use only the build-artifact-profiler here?
I don't think we need build-artifact anymore if we want to use profiler tools here.

Contributor:

Scratch that, I realized I misread the impl.

Is it possible to introduce an if statement to determine the correct preceding step?

Contributor Author:

I have introduced an if statement in the impl file to choose which build to use for that job
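
For readers following this thread, a condensed sketch of the gating pattern the impl file now uses (illustrative only; the step names, the tracy flag, and the artifact names are copied from the diff above, and unrelated settings such as continue-on-error are omitted):

    # Profiler build, only for test groups that set tracy: true
    - name: Download profiler build artifact
      if: ${{ matrix.test-group.tracy }}
      uses: actions/download-artifact@v4
      with:
        name: TTMetal_build_${{ matrix.test-group.arch }}_profiler
    # Regular build for every other test group
    - name: Download build artifact
      if: ${{ !matrix.test-group.tracy }}
      uses: actions/download-artifact@v4
      with:
        name: TTMetal_build_${{ matrix.test-group.arch }}

Test groups with tracy: true, such as the new CCL all_gather entry, pull the profiler build; every other entry falls back to the regular build artifact, and the later extract, run, and report-check steps are guarded by the same condition.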

secrets: inherit
uses: ./.github/workflows/t3000-model-perf-tests-impl.yaml
29 changes: 28 additions & 1 deletion tests/scripts/t3000/run_t3000_model_perf_tests.sh
@@ -142,6 +142,25 @@ run_t3000_resnet50_tests() {
fi
}

run_t3000_ccl_all_gather_perf_tests() {
# Record the start time
fail=0
start_time=$(date +%s)

echo "LOG_METAL: Running run_t3000_ccl_all_gather_perf_tests"

tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh -t t3000
fail+=$?

# Record the end time
end_time=$(date +%s)
duration=$((end_time - start_time))
echo "LOG_METAL: run_t3000_ccl_all_gather_perf_tests $duration seconds to complete"
if [[ $fail -ne 0 ]]; then
exit 1
fi
}

run_t3000_llm_tests() {
# Run falcon7b tests
run_t3000_falcon7b_tests
@@ -173,6 +192,12 @@ run_t3000_cnn_tests() {
env python models/perf/merge_perf_results.py
}

run_t3000_ccl_tests() {
# Run ccl performance tests
run_t3000_ccl_all_gather_perf_tests

}

fail=0
main() {
# For CI pipeline - source func commands but don't execute tests if not invoked directly
@@ -219,8 +244,10 @@ main() {
run_t3000_llm_tests
elif [[ "$pipeline_type" == "cnn_model_perf_t3000_device" ]]; then
run_t3000_cnn_tests
elif [[ "$pipeline_type" == "ccl_perf_t3000_device" ]]; then
run_t3000_ccl_tests
else
echo "$pipeline_type is invalid (supported: [cnn_model_perf_t3000_device, cnn_model_perf_t3000_device])" 2>&1
echo "$pipeline_type is invalid (supported: [cnn_model_perf_t3000_device, cnn_model_perf_t3000_device, ccl_perf_t3000_device])" 2>&1
exit 1
fi

9 changes: 6 additions & 3 deletions tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py
@@ -5,6 +5,7 @@
import pandas as pd
import os
import re
import time


def perf_report(file_path):
@@ -214,10 +215,12 @@ def calculate_bandwidth(row):

averages_df = pd.DataFrame(averages_data)

averages_file_path = file_path.replace(".csv", "_averages.csv")
today = time.strftime("%Y_%m_%d")
ccl_perf_file_path = f"CCL_Perf_{today}.csv"
os.rename(file_path, ccl_perf_file_path)

averages_df.to_csv(averages_file_path, index=False)
averages_df.to_csv(ccl_perf_file_path, index=False)

print(f"Averages CSV saved to: {averages_file_path}")
print(f"CCL Perf report CSV saved to: {ccl_perf_file_path}")

return averages_df
@@ -72,24 +72,34 @@ run_profile_and_extract_csv() {

if [ -n "$csv_path" ]; then
echo "CSV path found: $csv_path"
echo "Generating performance report..."

# Run the Python script to generate performance report
average_values=$(PYTHONPATH="$MODULE_DIR" python3 -c "
tmp_file="/tmp/perf_report_output.log"
PYTHONPATH="$MODULE_DIR" python3 -c "
import sys
import pandas as pd
from perf_csv import perf_report
from tabulate import tabulate

# Generate the report and convert it to a DataFrame
average_df = perf_report('$csv_path')
# Print the DataFrame in a pretty table format
print(tabulate(average_df, headers='keys', tablefmt='pretty'))
")
try:
# Generate the report and convert it to a DataFrame
average_df = perf_report('$csv_path')
# Print the DataFrame in a pretty table format
print('Min - Avg - Max by Common Runs:')
print(tabulate(average_df, headers='keys', tablefmt='pretty'))
except Exception as e:
print(f'Error in performance report generation: {e}', file=sys.stderr)
sys.exit(1)
" 2>&1 | tee "$tmp_file"

if grep -q "Error in performance report generation" "$tmp_file"; then
echo "Error: Performance report generation failed."
exit 1
fi

# Print the output
echo "Min - Avg - Max by Common Runs:"
echo "$average_values"
else
echo "CSV path not found in the command output."
exit 1
fi
}

@@ -72,24 +72,34 @@ run_profile_and_extract_csv() {

if [ -n "$csv_path" ]; then
echo "CSV path found: $csv_path"
echo "Generating performance report..."

# Run the Python script to generate performance report
average_values=$(PYTHONPATH="$MODULE_DIR" python3 -c "
tmp_file="/tmp/perf_report_output.log"
PYTHONPATH="$MODULE_DIR" python3 -c "
import sys
import pandas as pd
from perf_csv import perf_report
from tabulate import tabulate

# Generate the report and convert it to a DataFrame
average_df = perf_report('$csv_path')
# Print the DataFrame in a pretty table format
print(tabulate(average_df, headers='keys', tablefmt='pretty'))
")
try:
# Generate the report and convert it to a DataFrame
average_df = perf_report('$csv_path')
# Print the DataFrame in a pretty table format
print('Min - Avg - Max by Common Runs:')
print(tabulate(average_df, headers='keys', tablefmt='pretty'))
except Exception as e:
print(f'Error in performance report generation: {e}', file=sys.stderr)
sys.exit(1)
" 2>&1 | tee "$tmp_file"

if grep -q "Error in performance report generation" "$tmp_file"; then
echo "Error: Performance report generation failed."
exit 1
fi

# Print the output
echo "Min - Avg - Max by Common Runs:"
echo "$average_values"
else
echo "CSV path not found in the command output."
exit 1
fi
}
