Self-hosted runner (scheduled) #114
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Self-hosted runner (scheduled) | |
# Note that each job's dependencies go into a corresponding docker file. | |
# | |
# For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is | |
# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at | |
# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile` | |
on: | |
repository_dispatch: | |
schedule: | |
- cron: "17 2 * * *" | |
push: | |
branches: | |
- run_scheduled_ci* | |
env: | |
HF_HOME: /mnt/cache | |
TRANSFORMERS_IS_CI: yes | |
OMP_NUM_THREADS: 8 | |
MKL_NUM_THREADS: 8 | |
RUN_SLOW: yes | |
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} | |
TF_FORCE_GPU_ALLOW_GROWTH: true | |
RUN_PT_TF_CROSS_TESTS: 1 | |
jobs: | |
check_runner_status: | |
name: Check Runner Status | |
runs-on: ubuntu-latest | |
steps: | |
- name: Checkout transformers | |
uses: actions/checkout@v3 | |
with: | |
fetch-depth: 2 | |
- name: Check Runner Status | |
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} | |
check_runners: | |
name: Check Runners | |
needs: check_runner_status | |
strategy: | |
matrix: | |
machine_type: [single-gpu, multi-gpu] | |
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] | |
container: | |
image: huggingface/transformers-all-latest-gpu | |
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | |
steps: | |
- name: NVIDIA-SMI | |
run: | | |
nvidia-smi | |
setup: | |
name: Setup | |
needs: check_runners | |
strategy: | |
matrix: | |
machine_type: [single-gpu, multi-gpu] | |
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] | |
container: | |
image: huggingface/transformers-all-latest-gpu | |
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | |
outputs: | |
matrix: ${{ steps.set-matrix.outputs.matrix }} | |
steps: | |
- name: Update clone | |
working-directory: /transformers | |
run: | | |
git fetch && git checkout ${{ github.sha }} | |
- name: Cleanup | |
working-directory: /transformers | |
run: | | |
rm -rf tests/__pycache__ | |
rm -rf tests/models/__pycache__ | |
rm -rf reports | |
- name: Show installed libraries and their versions | |
working-directory: /transformers | |
run: pip freeze | |
- id: set-matrix | |
name: Identify models to test | |
working-directory: /transformers/tests | |
run: | | |
echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')" >> $GITHUB_OUTPUT | |
- name: NVIDIA-SMI | |
run: | | |
nvidia-smi | |
run_tests_single_gpu: | |
name: Model tests | |
strategy: | |
fail-fast: false | |
matrix: | |
folders: ${{ fromJson(needs.setup.outputs.matrix) }} | |
machine_type: [single-gpu] | |
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] | |
container: | |
image: huggingface/transformers-all-latest-gpu | |
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | |
needs: setup | |
steps: | |
- name: Echo folder ${{ matrix.folders }} | |
shell: bash | |
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to | |
# set the artifact folder names (because the character `/` is not allowed). | |
run: | | |
echo "${{ matrix.folders }}" | |
matrix_folders=${{ matrix.folders }} | |
matrix_folders=${matrix_folders/'models/'/'models_'} | |
echo "$matrix_folders" | |
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV | |
- name: Update clone | |
working-directory: /transformers | |
run: git fetch && git checkout ${{ github.sha }} | |
- name: Reinstall transformers in edit mode (remove the one installed during docker image build) | |
working-directory: /transformers | |
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . | |
- name: NVIDIA-SMI | |
run: | | |
nvidia-smi | |
- name: Environment | |
working-directory: /transformers | |
run: | | |
python3 utils/print_env.py | |
- name: Show installed libraries and their versions | |
working-directory: /transformers | |
run: pip freeze | |
- name: Run all tests on GPU | |
working-directory: /transformers | |
run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} | |
- name: Failure short reports | |
if: ${{ failure() }} | |
continue-on-error: true | |
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt | |
- name: Test suite reports artifacts | |
if: ${{ always() }} | |
uses: actions/upload-artifact@v3 | |
with: | |
name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports | |
path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} | |
run_tests_multi_gpu: | |
name: Model tests | |
strategy: | |
fail-fast: false | |
matrix: | |
folders: ${{ fromJson(needs.setup.outputs.matrix) }} | |
machine_type: [multi-gpu] | |
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] | |
container: | |
image: huggingface/transformers-all-latest-gpu | |
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | |
needs: setup | |
steps: | |
- name: Echo folder ${{ matrix.folders }} | |
shell: bash | |
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to | |
# set the artifact folder names (because the character `/` is not allowed). | |
run: | | |
echo "${{ matrix.folders }}" | |
matrix_folders=${{ matrix.folders }} | |
matrix_folders=${matrix_folders/'models/'/'models_'} | |
echo "$matrix_folders" | |
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV | |
- name: Update clone | |
working-directory: /transformers | |
run: git fetch && git checkout ${{ github.sha }} | |
- name: Reinstall transformers in edit mode (remove the one installed during docker image build) | |
working-directory: /transformers | |
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . | |
- name: NVIDIA-SMI | |
run: | | |
nvidia-smi | |
- name: Environment | |
working-directory: /transformers | |
run: | | |
python3 utils/print_env.py | |
- name: Show installed libraries and their versions | |
working-directory: /transformers | |
run: pip freeze | |
- name: Run all tests on GPU | |
working-directory: /transformers | |
run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} | |
- name: Failure short reports | |
if: ${{ failure() }} | |
continue-on-error: true | |
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt | |
- name: Test suite reports artifacts | |
if: ${{ always() }} | |
uses: actions/upload-artifact@v3 | |
with: | |
name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports | |
path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} | |
run_examples_gpu: | |
name: Examples directory | |
strategy: | |
fail-fast: false | |
matrix: | |
machine_type: [single-gpu] | |
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] | |
container: | |
image: huggingface/transformers-all-latest-gpu | |
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | |
needs: setup | |
steps: | |
- name: Update clone | |
working-directory: /transformers | |
run: git fetch && git checkout ${{ github.sha }} | |
- name: Reinstall transformers in edit mode (remove the one installed during docker image build) | |
working-directory: /transformers | |
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . | |
- name: NVIDIA-SMI | |
run: | | |
nvidia-smi | |
- name: Environment | |
working-directory: /transformers | |
run: | | |
python3 utils/print_env.py | |
- name: Show installed libraries and their versions | |
working-directory: /transformers | |
run: pip freeze | |
- name: Run examples tests on GPU | |
working-directory: /transformers | |
run: | | |
pip install -r examples/pytorch/_tests_requirements.txt | |
python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch | |
- name: Failure short reports | |
if: ${{ failure() }} | |
continue-on-error: true | |
run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt | |
- name: Test suite reports artifacts | |
if: ${{ always() }} | |
uses: actions/upload-artifact@v3 | |
with: | |
name: ${{ matrix.machine_type }}_run_examples_gpu | |
path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu | |
run_pipelines_torch_gpu: | |
name: PyTorch pipelines | |
strategy: | |
fail-fast: false | |
matrix: | |
machine_type: [single-gpu, multi-gpu] | |
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] | |
container: | |
image: huggingface/transformers-pytorch-gpu | |
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | |
needs: setup | |
steps: | |
- name: Update clone | |
working-directory: /transformers | |
run: git fetch && git checkout ${{ github.sha }} | |
- name: Reinstall transformers in edit mode (remove the one installed during docker image build) | |
working-directory: /transformers | |
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . | |
- name: NVIDIA-SMI | |
run: | | |
nvidia-smi | |
- name: Environment | |
working-directory: /transformers | |
run: | | |
python3 utils/print_env.py | |
- name: Show installed libraries and their versions | |
working-directory: /transformers | |
run: pip freeze | |
- name: Run all pipeline tests on GPU | |
working-directory: /transformers | |
run: | | |
python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines | |
- name: Failure short reports | |
if: ${{ failure() }} | |
continue-on-error: true | |
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt | |
- name: Test suite reports artifacts | |
if: ${{ always() }} | |
uses: actions/upload-artifact@v3 | |
with: | |
name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu | |
path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu | |
run_pipelines_tf_gpu: | |
name: TensorFlow pipelines | |
strategy: | |
fail-fast: false | |
matrix: | |
machine_type: [single-gpu, multi-gpu] | |
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] | |
container: | |
image: huggingface/transformers-tensorflow-gpu | |
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | |
needs: setup | |
steps: | |
- name: Update clone | |
working-directory: /transformers | |
run: | | |
git fetch && git checkout ${{ github.sha }} | |
- name: Reinstall transformers in edit mode (remove the one installed during docker image build) | |
working-directory: /transformers | |
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . | |
- name: NVIDIA-SMI | |
run: | | |
nvidia-smi | |
- name: Environment | |
working-directory: /transformers | |
run: | | |
python3 utils/print_env.py | |
- name: Show installed libraries and their versions | |
working-directory: /transformers | |
run: pip freeze | |
- name: Run all pipeline tests on GPU | |
working-directory: /transformers | |
run: | | |
python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_tf_pipeline_gpu tests/pipelines | |
- name: Failure short reports | |
if: ${{ always() }} | |
run: | | |
cat /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu/failures_short.txt | |
- name: Test suite reports artifacts | |
if: ${{ always() }} | |
uses: actions/upload-artifact@v3 | |
with: | |
name: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu | |
path: /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu | |
run_all_tests_torch_cuda_extensions_gpu: | |
name: Torch CUDA extension tests | |
strategy: | |
fail-fast: false | |
matrix: | |
machine_type: [single-gpu, multi-gpu] | |
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] | |
needs: setup | |
container: | |
image: huggingface/transformers-pytorch-deepspeed-latest-gpu | |
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | |
steps: | |
- name: Update clone | |
working-directory: /workspace/transformers | |
run: git fetch && git checkout ${{ github.sha }} | |
- name: Reinstall transformers in edit mode (remove the one installed during docker image build) | |
working-directory: /workspace/transformers | |
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . | |
- name: Remove cached torch extensions | |
run: rm -rf /github/home/.cache/torch_extensions/ | |
# To avoid unknown test failures | |
- name: Pre build DeepSpeed *again* | |
working-directory: /workspace | |
run: | | |
python3 -m pip uninstall -y deepspeed | |
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check | |
- name: NVIDIA-SMI | |
run: | | |
nvidia-smi | |
- name: Environment | |
working-directory: /workspace/transformers | |
run: | | |
python utils/print_env.py | |
- name: Show installed libraries and their versions | |
working-directory: /workspace/transformers | |
run: pip freeze | |
- name: Run all tests on GPU | |
working-directory: /workspace/transformers | |
run: | | |
python -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended | |
- name: Failure short reports | |
if: ${{ failure() }} | |
continue-on-error: true | |
run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt | |
- name: Test suite reports artifacts | |
if: ${{ always() }} | |
uses: actions/upload-artifact@v3 | |
with: | |
name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports | |
path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu | |
run_extract_warnings: | |
name: Extract warnings in CI artifacts | |
runs-on: ubuntu-latest | |
if: always() | |
needs: [ | |
check_runner_status, | |
check_runners, | |
setup, | |
run_tests_single_gpu, | |
run_tests_multi_gpu, | |
run_examples_gpu, | |
run_pipelines_tf_gpu, | |
run_pipelines_torch_gpu, | |
run_all_tests_torch_cuda_extensions_gpu | |
] | |
steps: | |
- name: Checkout transformers | |
uses: actions/checkout@v3 | |
with: | |
fetch-depth: 2 | |
- name: Install transformers | |
run: pip install transformers | |
- name: Show installed libraries and their versions | |
run: pip freeze | |
- name: Create output directory | |
run: mkdir warnings_in_ci | |
- uses: actions/download-artifact@v3 | |
with: | |
path: warnings_in_ci | |
- name: Show artifacts | |
run: echo "$(python3 -c 'import os; d = os.listdir(); print(d)')" | |
working-directory: warnings_in_ci | |
- name: Extract warnings in CI artifacts | |
run: | | |
python3 utils/extract_warnings.py --workflow_run_id ${{ github.run_id }} --output_dir warnings_in_ci --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} --from_gh | |
echo "$(python3 -c 'import os; import json; fp = open("warnings_in_ci/selected_warnings.json"); d = json.load(fp); d = "\n".join(d) ;print(d)')" | |
- name: Upload artifact | |
if: ${{ always() }} | |
uses: actions/upload-artifact@v3 | |
with: | |
name: warnings_in_ci | |
path: warnings_in_ci/selected_warnings.json | |
send_results: | |
name: Send results to webhook | |
runs-on: ubuntu-latest | |
if: always() | |
needs: [ | |
check_runner_status, | |
check_runners, | |
setup, | |
run_tests_single_gpu, | |
run_tests_multi_gpu, | |
run_examples_gpu, | |
run_pipelines_tf_gpu, | |
run_pipelines_torch_gpu, | |
run_all_tests_torch_cuda_extensions_gpu, | |
run_extract_warnings | |
] | |
steps: | |
- name: Preliminary job status | |
shell: bash | |
# For the meaning of these environment variables, see the job `Setup` | |
run: | | |
echo "Runner availability: ${{ needs.check_runner_status.result }}" | |
echo "Runner status: ${{ needs.check_runners.result }}" | |
echo "Setup status: ${{ needs.setup.result }}" | |
- uses: actions/checkout@v3 | |
- uses: actions/download-artifact@v3 | |
- name: Send message to Slack | |
env: | |
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} | |
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} | |
CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} | |
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} | |
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} | |
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} | |
CI_EVENT: scheduled | |
CI_SHA: ${{ github.sha }} | |
CI_WORKFLOW_REF: ${{ github.workflow_ref }} | |
RUNNER_STATUS: ${{ needs.check_runner_status.result }} | |
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }} | |
SETUP_STATUS: ${{ needs.setup.result }} | |
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change | |
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. | |
run: | | |
sudo apt-get install -y curl | |
pip install slack_sdk | |
pip show slack_sdk | |
python utils/notification_service.py "${{ needs.setup.outputs.matrix }}" | |
# Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack. | |
- name: Failure table artifacts | |
if: ${{ always() }} | |
uses: actions/upload-artifact@v3 | |
with: | |
name: test_failure_tables | |
path: test_failure_tables |