From 4be298277b62e03779726e3322da36c6dc271296 Mon Sep 17 00:00:00 2001
From: Guang Yang <guangyang@meta.com>
Date: Thu, 11 Apr 2024 21:46:24 -0700
Subject: [PATCH] Fix CI and validation scripts

---
 .ci/scripts/gather_test_models.py | 37 +++++++++++++
 .github/workflows/periodic.yml    | 86 +++++++++++++++++++++++++++++++
 .github/workflows/pull.yml        | 57 ++++++++++++++++----
 scripts/install_et.sh             |  3 +-
 4 files changed, 172 insertions(+), 11 deletions(-)

diff --git a/.ci/scripts/gather_test_models.py b/.ci/scripts/gather_test_models.py
index 51401d0c5..fc52ee3ab 100644
--- a/.ci/scripts/gather_test_models.py
+++ b/.ci/scripts/gather_test_models.py
@@ -27,6 +27,37 @@
 }
 
 
+def parse_args() -> Any:
+    """Parse the command line; the required -e/--event flag names the CI event to gather models for."""
+    from argparse import ArgumentParser
+    # ArgumentParser's first positional parameter is prog, so pass the summary as description=.
+    parser = ArgumentParser(description="Gather all models to test on CI for the target OS")
+    parser.add_argument(
+        "-e",
+        "--event",
+        type=str,
+        choices=["pull_request", "push", "periodic"],
+        required=True,
+        help="GitHub CI Event. See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#on",
+    )
+    return parser.parse_args()
+
+
+def model_should_run_on_event(model: str, event: str) -> bool:
+    """
+    Decide whether a model should be tested on a given CI event (pull_request/push/periodic).
+    Higher-priority, fast models go to pull_request; the rest are split between push and periodic.
+    """
+    if event == "pull_request":
+        return model in ["tinyllamas/stories15M"]
+    elif event == "push":
+        return model in []
+    elif event == "periodic":
+        return model in ["mistralai/Mistral-7B-v0.1"]
+    else:
+        return False
+
+
 def set_output(name: str, val: Any) -> None:
     """
     Set the GitHb output so that it can be accessed by other jobs
@@ -45,6 +76,9 @@ def export_models_for_ci() -> dict[str, dict]:
     This gathers all the models that we want to test on GitHub OSS CI
     """
 
+    args = parse_args()
+    event = args.event
+
     # This is the JSON syntax for configuration matrix used by GitHub
     # https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs
     models = {"include": []}
@@ -53,6 +87,9 @@ def export_models_for_ci() -> dict[str, dict]:
         MODEL_REPOS.keys(),
         JOB_RUNNERS.keys(),
     ):
+        if not model_should_run_on_event(repo_name, event):
+            continue
+
         record = {
             "repo_name": repo_name,
             "resources": MODEL_REPOS[repo_name],
diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml
index b0bb8d8ab..b1ab33ec8 100644
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@@ -7,3 +7,89 @@ on:
     tags:
       - ciflow/periodic/*
   workflow_dispatch:
+
+jobs:
+  gather-models:
+    runs-on: ubuntu-22.04
+    outputs:
+      models: ${{ steps.gather-models.outputs.models }}
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: 'false'
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+      - name: Extract the list of models to test
+        id: gather-models
+        run: |
+          set -eux
+          PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "periodic"
+  test-cpu:
+    name: test-cpu (${{ matrix.platform }}, ${{ matrix.repo_name }})
+    needs: gather-models
+    strategy:
+      matrix: ${{ fromJSON(needs.gather-models.outputs.models) }}
+      fail-fast: false
+    runs-on: ${{ matrix.runner }}
+    env:
+      TORCHCHAT_ROOT: ${{ github.workspace }}
+      REPO_NAME: ${{ matrix.repo_name }}
+      ENABKE_ET_PYBIND: ${{ matrix.runner == 'macos-14' && 'false' || 'true' }}
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v3
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+      - name: Print machine info
+        run: |
+          echo "$(uname -a)"
+      - name: Install dependencies
+        run: |
+          bash ${TORCHCHAT_ROOT}/scripts/install_et.sh $ENABKE_ET_PYBIND
+      - name: Download checkpoints
+        run: |
+          bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"
+      - name: Run validation
+        run: |
+          pushd ${TORCHCHAT_ROOT}
+          export CHECKPOINT_PATH=./checkpoints/${REPO_NAME}/model.pth
+          bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
+          bash .ci/scripts/validate.sh ${CHECKPOINT_PATH}
+  test-cuda:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    name: test-cuda (linux, ${{ matrix.repo_name }})
+    needs: gather-models
+    strategy:
+      matrix: ${{ fromJSON(needs.gather-models.outputs.models) }}
+      fail-fast: false
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.1"
+      script: |
+        echo "::group::Print machine info"
+        nvidia-smi
+        echo "::endgroup::"
+
+        echo "::group::Install required packages"
+        pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
+        pip install -r ./requirements.txt
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Download checkpoint"
+        export REPO_NAME=${{ matrix.repo_name }}
+        bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }}
+        echo "::endgroup::"
+
+        echo "::group::Convert checkpoint"
+        export CHECKPOINT_PATH=./checkpoints/${REPO_NAME}/model.pth
+        bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
+        echo "::endgroup::"
+
+        echo "::group::Run inference"
+        bash .ci/scripts/validate.sh ${CHECKPOINT_PATH} cuda
+        echo "::endgroup::"
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index cf1e38550..59d19ecdf 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -1,8 +1,10 @@
 name: pull
 
 on:
-  schedule:
-    - cron: '0,6,12,18 0 * * *'  # Runs at midnight UTC and every 6 hours
+  pull_request:
+  push:
+    branches:
+      - main
   workflow_dispatch:
 
 jobs:
@@ -21,7 +23,7 @@ jobs:
         id: gather-models
         run: |
           set -eux
-          PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py
+          PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "pull_request"
   test-cpu:
     name: test-cpu (${{ matrix.platform }}, ${{ matrix.repo_name }})
     needs: gather-models
@@ -30,7 +32,7 @@ jobs:
       fail-fast: false
     runs-on: ${{ matrix.runner }}
     env:
-      TORCHAT_ROOT: ${{ github.workspace }}
+      TORCHCHAT_ROOT: ${{ github.workspace }}
       REPO_NAME: ${{ matrix.repo_name }}
       ENABKE_ET_PYBIND: ${{ matrix.runner == 'macos-14' && 'false' || 'true' }}
     steps:
@@ -45,13 +47,48 @@ jobs:
           echo "$(uname -a)"
       - name: Install dependencies
         run: |
-          bash ${TORCHAT_ROOT}/scripts/install_et.sh $ENABKE_ET_PYBIND
+          bash ${TORCHCHAT_ROOT}/scripts/install_et.sh $ENABKE_ET_PYBIND
       - name: Download checkpoints
         run: |
-          bash ${TORCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"
+          bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"
       - name: Run validation
         run: |
-          pushd ${TORCHAT_ROOT}
-          export CHECKPOINT_PATH=${TORCHAT_ROOT}/checkpoints/${REPO_NAME}/model.pth
-          bash ${TORCHAT_ROOT}/.ci/scripts/convert_checkpoint.sh ${REPO_NAME}
-          bash ${TORCHAT_ROOT}/.ci/scripts/validate.sh ${CHECKPOINT_PATH}
+          pushd ${TORCHCHAT_ROOT}
+          export CHECKPOINT_PATH=./checkpoints/${REPO_NAME}/model.pth
+          bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
+          bash .ci/scripts/validate.sh ${CHECKPOINT_PATH}
+  test-cuda:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    name: test-cuda (linux, ${{ matrix.repo_name }})
+    needs: gather-models
+    strategy:
+      matrix: ${{ fromJSON(needs.gather-models.outputs.models) }}
+      fail-fast: false
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.1"
+      script: |
+        echo "::group::Print machine info"
+        nvidia-smi
+        echo "::endgroup::"
+
+        echo "::group::Install required packages"
+        pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
+        pip install -r ./requirements.txt
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Download checkpoint"
+        export REPO_NAME=${{ matrix.repo_name }}
+        bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }}
+        echo "::endgroup::"
+
+        echo "::group::Convert checkpoint"
+        export CHECKPOINT_PATH=./checkpoints/${REPO_NAME}/model.pth
+        bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
+        echo "::endgroup::"
+
+        echo "::group::Run inference"
+        bash .ci/scripts/validate.sh ${CHECKPOINT_PATH} cuda
+        echo "::endgroup::"
diff --git a/scripts/install_et.sh b/scripts/install_et.sh
index d230736dd..b7247ca64 100755
--- a/scripts/install_et.sh
+++ b/scripts/install_et.sh
@@ -11,7 +11,7 @@ install_pip_dependencies() {
   echo "Intalling common pip packages"
 
   pip install wheel
-  pip install cmake
+  pip install "cmake>=3.19"
   pip install ninja
   pip install zstd
   pushd ${TORCHCHAT_ROOT}
@@ -26,6 +26,7 @@ install_executorch() {
   pushd ${TORCHCHAT_ROOT}/build/src
   git clone https://github.com/pytorch/executorch.git
   cd executorch
+  git checkout viable/strict
   echo "Install executorch: submodule update"
   git submodule sync
   git submodule update --init