# TorchBench Optim Regression Detector on A100 #638
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Nightly workflow: installs a subset of TorchBench models on an A100 runner,
# runs the optim userbenchmark, feeds the resulting metrics JSON files to
# regression_detector.py, and uploads the results (Scribe, S3, and a workflow
# artifact).
name: TorchBench Optim Regression Detector on A100
on:
  schedule:
    - cron: '0 4 * * *'  # run at 4 AM UTC = midnight ET
  workflow_dispatch:
    inputs:
      # NOTE(review): no step in this workflow reads userbenchmark_name.
      userbenchmark_name:
        description: "Name of the user benchmark to run"
      userbenchmark_options:
        description: "Option of the user benchmark to run"
jobs:
  run-userbenchmark:
    runs-on: [a100-runner]
    timeout-minutes: 1440  # 24 hours
    environment: docker-s3-upload
    env:
      BASE_CONDA_ENV: "torchbench"
      CONDA_ENV: "optim"
      PLATFORM_NAME: "gcp_a100"
      TORCHBENCH_USERBENCHMARK_SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.TORCHBENCH_USERBENCHMARK_SCRIBE_GRAPHQL_ACCESS_TOKEN }}
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      SETUP_SCRIPT: "/workspace/setup_instance.sh"
      HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
    steps:
      - name: Checkout TorchBench
        uses: actions/checkout@v4
        with:
          path: benchmark
      - name: Tune Nvidia GPU
        run: |
          # Enable persistence mode and pin the application clocks so that
          # successive runs are measured under the same GPU settings.
          sudo nvidia-smi -pm 1
          sudo nvidia-smi -ac 1215,1410
          nvidia-smi
      - name: Clone and setup Conda env
        run: |
          # Source the setup script against the base env, then clone the base
          # env into the job-specific one named by CONDA_ENV.
          CONDA_ENV=${BASE_CONDA_ENV} . "${SETUP_SCRIPT}"
          # --yes: never wait for a confirmation prompt on the CI runner.
          conda create --yes --name "${CONDA_ENV}" --clone "${BASE_CONDA_ENV}"
      - name: Install TorchBench
        run: |
          set -x
          . "${SETUP_SCRIPT}"
          pushd benchmark
          # only install the subset of models currently running.
          python install.py BERT_pytorch DALLE2_pytorch hf_GPT2_large hf_T5_large \
            resnet50 timm_vision_transformer_large yolov3
      - name: Print torch.version.git_version
        run: |
          set -x
          . "${SETUP_SCRIPT}"
          python -c "import torch; print(torch.version.git_version)"
      - name: Run optim user benchmark
        env:
          # Pass the user-supplied input through the environment instead of
          # expanding it inside the script text, so it cannot inject shell code.
          USERBENCHMARK_OPTIONS: ${{ github.event.inputs.userbenchmark_options }}
        run: |
          set -x
          . "${SETUP_SCRIPT}"
          # remove old results
          if [ -d benchmark-output ]; then rm -Rf benchmark-output; fi
          pushd benchmark
          if [ -d .userbenchmark ]; then rm -Rf .userbenchmark; fi
          # Split the options on whitespace into separate arguments; the array
          # is empty on scheduled runs, where the input is not set.
          read -r -a extra_args <<< "${USERBENCHMARK_OPTIONS:-}"
          # TODO: scale this to run other benchmarks, but let's start with optim
          # Only run a subset, see if this fixes the workflow
          python -m userbenchmark.optim.run_optim_benchmarks -s -c "${extra_args[@]}"
          cp -r ./.userbenchmark/optim ../benchmark-output
      - name: Detect potential regressions
        continue-on-error: true
        run: |
          . "${SETUP_SCRIPT}"
          pushd benchmark
          # Collect the metrics files in reverse-sorted order, one array element
          # per path. -maxdepth must precede -name for GNU find.
          mapfile -t RESULTS < <(find "${PWD}/../benchmark-output" -maxdepth 2 -name "metrics-*.json" | sort -r)
          # TODO: the following assumes only one metrics-*.json is found. It will keep
          # overwriting gh-issue.md if multiple are found. Scaling this up is a potential next step.
          for r in "${RESULTS[@]}"; do
            python regression_detector.py --platform "${PLATFORM_NAME}" --treatment "${r}" --owner @janeyx99 \
              --gh-issue-path gh-issue.md --errors-path errors.txt
          done
      # This step is currently disabled via `if: false`.
      # NOTE(review): TORCHBENCH_REGRESSION_DETECTED is not set anywhere in this
      # file; presumably regression_detector.py exports it - confirm before
      # re-enabling.
      - name: Create the github issue
        continue-on-error: true
        if: false
        uses: peter-evans/create-issue-from-file@v4
        with:
          title: Optim Perf Signal Detected by TorchBench CI on ${{ env.TORCHBENCH_REGRESSION_DETECTED }}
          token: ${{ secrets.TORCHBENCH_ACCESS_TOKEN }}
          content-filepath: ./benchmark/gh-issue.md
          labels: |
            torchbench-perf-report
      - name: Upload result jsons to Scribe and S3
        run: |
          . "${SETUP_SCRIPT}"
          pushd benchmark
          mapfile -t RESULTS < <(find "${PWD}/../benchmark-output" -maxdepth 2 -name "metrics-*.json" | sort -r)
          # [*] prints every collected path; a bare ${RESULTS} would print only the first.
          echo "Uploading result jsons: ${RESULTS[*]}"
          for r in "${RESULTS[@]}"; do
            python ./scripts/userbenchmark/upload_scribe.py --userbenchmark_json "${r}" --userbenchmark_platform "${PLATFORM_NAME}"
            python ./scripts/userbenchmark/upload_s3.py --upload-file "${r}" --userbenchmark_platform "${PLATFORM_NAME}"
          done
      - name: Upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: TorchBench result
          path: benchmark-output/
      - name: Finally, error if errors.txt exists
        if: always()
        run: |
          # Do not error earlier as we want all artifacts and regressions to be reported first
          # TODO: potentially move errors.txt to benchmark-output so it gets uploaded to S3
          pushd benchmark
          if [ -e errors.txt ]; then cat errors.txt && exit 1; fi
      - name: Remove conda environment
        if: always()
        run: |
          . "${SETUP_SCRIPT}"
          conda deactivate && conda deactivate
          # --yes: never wait for a confirmation prompt on the CI runner.
          conda remove --yes -n "${CONDA_ENV}" --all