# TorchBench Userbenchmark on T4 Metal #524
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Runs TorchBench userbenchmarks on the self-hosted `bm-runner` machine.
# Triggered daily by cron, or manually via workflow_dispatch for one named
# userbenchmark. Results are uploaded to Scribe and S3 and attached to the run
# as an artifact.
name: TorchBench Userbenchmark on T4 Metal
on:
  schedule:
    - cron: '0 17 * * *'  # run at 5 PM UTC
  workflow_dispatch:
    inputs:
      userbenchmark_name:
        description: "Name of the user benchmark to run"
      userbenchmark_options:
        description: "Option of the user benchmark to run"
jobs:
  run-userbenchmark:
    runs-on: [self-hosted, bm-runner]
    timeout-minutes: 1440  # 24 hours
    environment: docker-s3-upload
    env:
      CONDA_ENV_NAME: "userbenchmarks-ci"
      PLATFORM_NAME: "aws_t4_metal"
      TORCHBENCH_USERBENCHMARK_SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.TORCHBENCH_USERBENCHMARK_SCRIBE_GRAPHQL_ACCESS_TOKEN }}
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      SETUP_SCRIPT: "/data/nvme/bin/setup_instance.sh"
    steps:
      # The repository is checked out into ./benchmark; every later step
      # either references that prefix or does `pushd benchmark` first.
      - name: Checkout TorchBench
        uses: actions/checkout@v4
        with:
          path: benchmark
      - name: Create conda environment
        run: |
          python benchmark/utils/python_utils.py --create-conda-env "${CONDA_ENV_NAME}"
          sudo python benchmark/utils/cuda_utils.py --setup-cuda-softlink
      - name: Install PyTorch nightly
        run: |
          . "${SETUP_SCRIPT}" && conda activate "${CONDA_ENV_NAME}"
          pushd benchmark
          # Install dependencies
          python utils/cuda_utils.py --install-torch-deps
          # check the machine is tuned
          pip install -U py-cpuinfo psutil distro boto3
          sudo "${HOME}/miniconda3/envs/${CONDA_ENV_NAME}/bin/python3" torchbenchmark/util/machine_config.py
          # Check if nightly builds are available
          NIGHTLIES=$(python torchbenchmark/util/torch_nightly.py --packages torch)
          # If failed, the script will generate empty result.
          # The expansion is quoted so that multi-word output cannot break
          # the test expression.
          if [ -z "${NIGHTLIES}" ]; then
            echo "Torch nightly build failed. Cancel the workflow."
            exit 1
          fi
          # Install PyTorch and torchvision nightly from pip
          python utils/cuda_utils.py --install-torch-nightly
          python utils/cuda_utils.py --check-torch-nightly-version
          # make sure pytorch+cuda works
          python -c "import torch; torch.cuda.init()"
      - name: Install TorchBench
        run: |
          set -x
          . "${SETUP_SCRIPT}" && conda activate "${CONDA_ENV_NAME}"
          pushd benchmark
          python install.py
      - name: Run user benchmark
        env:
          # Dispatch inputs are handed to the script through the environment
          # instead of being expanded into the script text, so their contents
          # are never parsed as shell code. Both are empty on scheduled runs.
          USERBENCHMARK_NAME: ${{ github.event.inputs.userbenchmark_name }}
          USERBENCHMARK_OPTIONS: ${{ github.event.inputs.userbenchmark_options }}
        run: |
          set -x
          . "${SETUP_SCRIPT}" && conda activate "${CONDA_ENV_NAME}"
          # remove old results
          rm -rf benchmark-output
          pushd benchmark
          rm -rf .userbenchmark
          if [ -z "${USERBENCHMARK_NAME}" ]; then
            # No benchmark was named: figure out what userbenchmarks we
            # should run, and run them
            python ./.github/scripts/userbenchmark/schedule-benchmarks.py --platform "${PLATFORM_NAME}"
            if [ -d ./.userbenchmark ]; then
              cp -r ./.userbenchmark ../benchmark-output
            else
              # Keep the output directory present for the later steps even
              # when nothing was produced.
              mkdir ../benchmark-output
            fi
          else
            # USERBENCHMARK_OPTIONS is deliberately unquoted: it is split on
            # whitespace into separate arguments. Quote characters inside the
            # value are passed through literally, not interpreted.
            python run_benchmark.py "${USERBENCHMARK_NAME}" ${USERBENCHMARK_OPTIONS}
            cp -r ./.userbenchmark/"${USERBENCHMARK_NAME}" ../benchmark-output
          fi
      - name: Upload result jsons to Scribe
        run: |
          . "${SETUP_SCRIPT}" && conda activate "${CONDA_ENV_NAME}"
          pushd benchmark
          # -maxdepth is a global find option and has to precede tests such
          # as -name. mapfile keeps each path as one array element.
          mapfile -t RESULTS < <(find "${PWD}/../benchmark-output" -maxdepth 2 -name "metrics-*.json" | sort -r)
          echo "Uploading result jsons: ${RESULTS[*]}"
          for r in "${RESULTS[@]}"; do
            python ./scripts/userbenchmark/upload_scribe.py --userbenchmark_json "${r}" --userbenchmark_platform "${PLATFORM_NAME}"
            python ./scripts/userbenchmark/upload_s3.py --upload-file "${r}" --userbenchmark_platform "${PLATFORM_NAME}"
          done
      - name: Upload artifact
        # v3 of the artifact actions has been retired on github.com.
        uses: actions/upload-artifact@v4
        with:
          name: TorchBench result
          path: benchmark-output/
      - name: Remove conda environment
        # Clean up even when an earlier step failed, so the environment does
        # not linger on the self-hosted runner between runs.
        if: always()
        run: |
          conda env remove --name "${CONDA_ENV_NAME}"