-
Notifications
You must be signed in to change notification settings - Fork 231
95 lines (89 loc) · 3.08 KB
/
periodic.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# Workflow name and the events that trigger it: a cron schedule, pushes of
# ciflow/periodic/* tags, and manual dispatch.
name: periodic

on:
  schedule:
    # Cron fields are: minute hour day-of-month month day-of-week (UTC).
    # Fires at minute 0 of hours 0, 6, 12 and 18, i.e. at midnight UTC and
    # every 6 hours after that. (The minute and hour fields were previously
    # swapped, which fired four times between 00:00 and 00:18 instead.)
    - cron: '0 0,6,12,18 * * *'
  push:
    tags:
      - ciflow/periodic/*
  workflow_dispatch:
jobs:
  # Builds the test matrix. The `models` job output is read back with
  # fromJSON() by the test jobs below, so the script is expected to publish a
  # JSON matrix as the `models` output of the `gather-models` step.
  gather-models:
    runs-on: ubuntu-22.04
    outputs:
      models: ${{ steps.gather-models.outputs.models }}
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: 'false'
      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Extract the list of models to test
        id: gather-models
        run: |
          set -eux
          PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "periodic"

  # Runs checkpoint download, conversion and validation for every matrix entry
  # on the runner named by that entry.
  test-cpu:
    name: test-cpu (${{ matrix.platform }}, ${{ matrix.repo_name }})
    needs: gather-models
    strategy:
      matrix: ${{ fromJSON(needs.gather-models.outputs.models) }}
      fail-fast: false
    runs-on: ${{ matrix.runner }}
    env:
      TORCHCHAT_ROOT: ${{ github.workspace }}
      REPO_NAME: ${{ matrix.repo_name }}
      # Evaluates to the string 'false' on the macos-14 runner and 'true' on
      # every other runner; passed as the first argument to install_et.sh.
      ENABLE_ET_PYBIND: ${{ matrix.runner == 'macos-14' && 'false' || 'true' }}
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Print machine info
        run: |
          echo "$(uname -a)"
      - name: Install dependencies
        run: |
          bash "${TORCHCHAT_ROOT}/scripts/install_et.sh" "${ENABLE_ET_PYBIND}"
      - name: Download checkpoints
        run: |
          bash "${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh" "${REPO_NAME}" "${{ matrix.resources }}"
      - name: Run validation
        run: |
          pushd "${TORCHCHAT_ROOT}"
          export CHECKPOINT_PATH="./checkpoints/${REPO_NAME}/model.pth"
          bash .ci/scripts/convert_checkpoint.sh "${REPO_NAME}"
          bash .ci/scripts/validate.sh "${CHECKPOINT_PATH}"

  # Same download / convert / validate sequence, executed through the reusable
  # linux_job workflow on a fixed GPU runner with `cuda` passed to validate.sh.
  # NOTE(review): this job expands the same matrix as test-cpu but ignores
  # matrix.runner; if the matrix holds one entry per runner for a given
  # repo_name, identically named duplicate jobs would be started. Confirm
  # against gather_test_models.py.
  test-cuda:
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    name: test-cuda (linux, ${{ matrix.repo_name }})
    needs: gather-models
    strategy:
      matrix: ${{ fromJSON(needs.gather-models.outputs.models) }}
      fail-fast: false
    with:
      runner: linux.g5.4xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.1"
      script: |
        echo "::group::Print machine info"
        nvidia-smi
        echo "::endgroup::"

        echo "::group::Install required packages"
        pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
        pip install -r ./requirements.txt
        pip list
        echo "::endgroup::"

        echo "::group::Download checkpoint"
        export REPO_NAME="${{ matrix.repo_name }}"
        # Quoted so the resources value reaches the script as one argument,
        # matching how test-cpu invokes the same script.
        bash .ci/scripts/wget_checkpoint.sh "${REPO_NAME}" "${{ matrix.resources }}"
        echo "::endgroup::"

        echo "::group::Convert checkpoint"
        export CHECKPOINT_PATH="./checkpoints/${REPO_NAME}/model.pth"
        bash .ci/scripts/convert_checkpoint.sh "${REPO_NAME}"
        echo "::endgroup::"

        echo "::group::Run inference"
        bash .ci/scripts/validate.sh "${CHECKPOINT_PATH}" cuda
        echo "::endgroup::"