From 4be298277b62e03779726e3322da36c6dc271296 Mon Sep 17 00:00:00 2001 From: Guang Yang Date: Thu, 11 Apr 2024 21:46:24 -0700 Subject: [PATCH] Fix CI and validation scripts --- .ci/scripts/gather_test_models.py | 37 +++++++++++++ .github/workflows/periodic.yml | 86 +++++++++++++++++++++++++++++++ .github/workflows/pull.yml | 57 ++++++++++++++++---- scripts/install_et.sh | 3 +- 4 files changed, 172 insertions(+), 11 deletions(-) diff --git a/.ci/scripts/gather_test_models.py b/.ci/scripts/gather_test_models.py index 51401d0c5..fc52ee3ab 100644 --- a/.ci/scripts/gather_test_models.py +++ b/.ci/scripts/gather_test_models.py @@ -27,6 +27,37 @@ } +def parse_args() -> Any: + from argparse import ArgumentParser + + parser = ArgumentParser("Gather all models to test on CI for the target OS") + parser.add_argument( + "-e", + "--event", + type=str, + choices=["pull_request", "push", "periodic"], + required=True, + help="GitHub CI Event. See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#on", + ) + + return parser.parse_args() + + +def model_should_run_on_event(model: str, event: str) -> bool: + """ + A helper function to decide whether a model should be tested on an event (pull_request/push/periodic). + Fast, high-priority models are tested on pull requests; other models are assigned to push or periodic events. 
+ """ + if event == "pull_request": + return model in ["tinyllamas/stories15M"] + elif event == "push": + return model in [] + elif event == "periodic": + return model in ["mistralai/Mistral-7B-v0.1"] + else: + return False + + def set_output(name: str, val: Any) -> None: """ Set the GitHb output so that it can be accessed by other jobs @@ -45,6 +76,9 @@ def export_models_for_ci() -> dict[str, dict]: This gathers all the models that we want to test on GitHub OSS CI """ + args = parse_args() + event = args.event + # This is the JSON syntax for configuration matrix used by GitHub # https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs models = {"include": []} @@ -53,6 +87,9 @@ def export_models_for_ci() -> dict[str, dict]: MODEL_REPOS.keys(), JOB_RUNNERS.keys(), ): + if not model_should_run_on_event(repo_name, event): + continue + record = { "repo_name": repo_name, "resources": MODEL_REPOS[repo_name], diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index b0bb8d8ab..b1ab33ec8 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -7,3 +7,89 @@ on: tags: - ciflow/periodic/* workflow_dispatch: + +jobs: + gather-models: + runs-on: ubuntu-22.04 + outputs: + models: ${{ steps.gather-models.outputs.models }} + steps: + - uses: actions/checkout@v3 + with: + submodules: 'false' + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + - name: Extract the list of models to test + id: gather-models + run: | + set -eux + PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "periodic" + test-cpu: + name: test-cpu (${{ matrix.platform }}, ${{ matrix.repo_name }}) + needs: gather-models + strategy: + matrix: ${{ fromJSON(needs.gather-models.outputs.models) }} + fail-fast: false + runs-on: ${{ matrix.runner }} + env: + TORCHCHAT_ROOT: ${{ github.workspace }} + REPO_NAME: ${{ matrix.repo_name }} + ENABKE_ET_PYBIND: ${{ matrix.runner == 'macos-14' && 'false' || 'true' }} + steps: 
+ - name: Checkout repo + uses: actions/checkout@v3 + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + - name: Print machine info + run: | + echo "$(uname -a)" + - name: Install dependencies + run: | + bash ${TORCHCHAT_ROOT}/scripts/install_et.sh $ENABKE_ET_PYBIND + - name: Download checkpoints + run: | + bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}" + - name: Run validation + run: | + pushd ${TORCHCHAT_ROOT} + export CHECKPOINT_PATH=./checkpoints/${REPO_NAME}/model.pth + bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME} + bash .ci/scripts/validate.sh ${CHECKPOINT_PATH} + test-cuda: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + name: test-cuda (linux, ${{ matrix.repo_name }}) + needs: gather-models + strategy: + matrix: ${{ fromJSON(needs.gather-models.outputs.models) }} + fail-fast: false + with: + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.1" + script: | + echo "::group::Print machine info" + nvidia-smi + echo "::endgroup::" + + echo "::group::Install required packages" + pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121 + pip install -r ./requirements.txt + pip list + echo "::endgroup::" + + echo "::group::Download checkpoint" + export REPO_NAME=${{ matrix.repo_name }} + bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }} + echo "::endgroup::" + + echo "::group::Convert checkpoint" + export CHECKPOINT_PATH=./checkpoints/${REPO_NAME}/model.pth + bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME} + echo "::endgroup::" + + echo "::group::Run inference" + bash .ci/scripts/validate.sh ${CHECKPOINT_PATH} cuda + echo "::endgroup::" diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index cf1e38550..59d19ecdf 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -1,8 +1,10 @@ name: pull on: - 
schedule: - - cron: '0,6,12,18 0 * * *' # Runs at midnight UTC and every 6 hours + pull_request: + push: + branches: + - main workflow_dispatch: jobs: @@ -21,7 +23,7 @@ jobs: id: gather-models run: | set -eux - PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py + PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "pull_request" test-cpu: name: test-cpu (${{ matrix.platform }}, ${{ matrix.repo_name }}) needs: gather-models @@ -30,7 +32,7 @@ jobs: fail-fast: false runs-on: ${{ matrix.runner }} env: - TORCHAT_ROOT: ${{ github.workspace }} + TORCHCHAT_ROOT: ${{ github.workspace }} REPO_NAME: ${{ matrix.repo_name }} ENABKE_ET_PYBIND: ${{ matrix.runner == 'macos-14' && 'false' || 'true' }} steps: @@ -45,13 +47,48 @@ jobs: echo "$(uname -a)" - name: Install dependencies run: | - bash ${TORCHAT_ROOT}/scripts/install_et.sh $ENABKE_ET_PYBIND + bash ${TORCHCHAT_ROOT}/scripts/install_et.sh $ENABKE_ET_PYBIND - name: Download checkpoints run: | - bash ${TORCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}" + bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}" - name: Run validation run: | - pushd ${TORCHAT_ROOT} - export CHECKPOINT_PATH=${TORCHAT_ROOT}/checkpoints/${REPO_NAME}/model.pth - bash ${TORCHAT_ROOT}/.ci/scripts/convert_checkpoint.sh ${REPO_NAME} - bash ${TORCHAT_ROOT}/.ci/scripts/validate.sh ${CHECKPOINT_PATH} + pushd ${TORCHCHAT_ROOT} + export CHECKPOINT_PATH=./checkpoints/${REPO_NAME}/model.pth + bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME} + bash .ci/scripts/validate.sh ${CHECKPOINT_PATH} + test-cuda: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + name: test-cuda (linux, ${{ matrix.repo_name }}) + needs: gather-models + strategy: + matrix: ${{ fromJSON(needs.gather-models.outputs.models) }} + fail-fast: false + with: + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.1" + script: | + echo 
"::group::Print machine info" + nvidia-smi + echo "::endgroup::" + + echo "::group::Install required packages" + pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121 + pip install -r ./requirements.txt + pip list + echo "::endgroup::" + + echo "::group::Download checkpoint" + export REPO_NAME=${{ matrix.repo_name }} + bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }} + echo "::endgroup::" + + echo "::group::Convert checkpoint" + export CHECKPOINT_PATH=./checkpoints/${REPO_NAME}/model.pth + bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME} + echo "::endgroup::" + + echo "::group::Run inference" + bash .ci/scripts/validate.sh ${CHECKPOINT_PATH} cuda + echo "::endgroup::" diff --git a/scripts/install_et.sh b/scripts/install_et.sh index d230736dd..b7247ca64 100755 --- a/scripts/install_et.sh +++ b/scripts/install_et.sh @@ -11,7 +11,7 @@ install_pip_dependencies() { echo "Intalling common pip packages" pip install wheel - pip install cmake + pip install "cmake>=3.19" pip install ninja pip install zstd pushd ${TORCHCHAT_ROOT} @@ -26,6 +26,7 @@ install_executorch() { pushd ${TORCHCHAT_ROOT}/build/src git clone https://github.com/pytorch/executorch.git cd executorch + git checkout viable/strict echo "Install executorch: submodule update" git submodule sync git submodule update --init