From f4278a96cd032990de24fc64a52493ab77a510af Mon Sep 17 00:00:00 2001
From: Chen Lai
Date: Fri, 22 Nov 2024 14:23:21 -0800
Subject: [PATCH] Add qnn 16a16w quantization test (#7039)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/7039

Differential Revision: D66390212
---
 .ci/scripts/test_llama.sh   |   8 +
 .github/workflows/pull.yml  | 834 ++++++++++++++++++------------------
 .github/workflows/trunk.yml | 782 ++++++++++++++++-----------------
 3 files changed, 817 insertions(+), 807 deletions(-)

diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh
index dad3e1101f..bfb21a9880 100644
--- a/.ci/scripts/test_llama.sh
+++ b/.ci/scripts/test_llama.sh
@@ -27,6 +27,10 @@ while [[ $# -gt 0 ]]; do
       MODE="$2" # portable or xnnpack+custom or xnnpack+custom+qe
       shift 2
       ;;
+    -pt2e_quantize)
+      PT2E_QUANTIZE="$2" # pt2e quantization scheme, e.g. qnn_16a16w
+      shift 2
+      ;;
     -upload)
       UPLOAD_DIR="$2"
       shift 2
       ;;
@@ -234,6 +238,10 @@ if [[ "${COREML}" == "ON" ]]; then
 fi
 if [[ "${QNN}" == "ON" ]]; then
   EXPORT_ARGS="${EXPORT_ARGS} -kv -v --qnn --disable_dynamic_shape"
+  echo "PT2E_QUANTIZE is ${PT2E_QUANTIZE}"
+  if [[ "${PT2E_QUANTIZE}" == "qnn_16a16w" ]]; then
+    EXPORT_ARGS+=" --tokenizer_path tokenizer.model --pt2e_quantize qnn_16a16w --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --calibration_data Once "
+  fi
 fi
 # Add dynamically linked library location
 $PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS}
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 6fc8ca9185..a66400d600 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -53,313 +53,313 @@ jobs:
       # Build and test ExecuTorch with the add model on portable backend.
       PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "add" "${BUILD_TOOL}" "portable"
 
-  test-models-linux:
-    name: test-models-linux
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
-    needs: gather-models
-    strategy:
-      matrix: ${{ fromJSON(needs.gather-models.outputs.models) }}
-      fail-fast: false
-    with:
-      runner: ${{ matrix.runner }}
-      docker-image: executorch-ubuntu-22.04-clang12
-      submodules: 'true'
-      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      timeout: ${{ matrix.timeout }}
-      script: |
-        # The generic Linux job chooses to use base env, not the one setup by the image
-        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
-        conda activate "${CONDA_ENV}"
-
-        MODEL_NAME=${{ matrix.model }}
-        BUILD_TOOL=${{ matrix.build-tool }}
-        BACKEND=${{ matrix.backend }}
-        DEMO_BACKEND_DELEGATION=${{ matrix.demo_backend_delegation }}
-
-        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
-        # Build and test ExecuTorch
-        PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}"
-
-  test-llama-runner-linux:
-    name: test-llama-runner-linux
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
-    strategy:
-      matrix:
-        dtype: [fp32]
-        mode: [portable, xnnpack+custom, xnnpack+custom+qe]
-        include:
-          - dtype: bf16
-            mode: portable
-          - dtype: bf16
-            mode: custom
-      fail-fast: false
-    with:
-      runner: linux.2xlarge
-      docker-image: executorch-ubuntu-22.04-clang12
-      submodules: 'true'
-      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      timeout: 900
-      upload-artifact: android-models
-      upload-artifact-to-s3: true
-      script: |
-        # The generic Linux job 
chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - DTYPE=${{ matrix.dtype }} - BUILD_TOOL="cmake" - MODE=${{ matrix.mode }} - ARTIFACTS_DIR_NAME="artifacts-to-be-uploaded/${DTYPE}-${MODE}" - ARTIFACTS_DIR_NAME="${ARTIFACTS_DIR_NAME/+/-}" - - # Setup executorch - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" - # Install requirements for export_llama - PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh - # Test llama2 - PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -dtype "${DTYPE}" -mode "${MODE}" -upload "${ARTIFACTS_DIR_NAME}" - - test-llama-runner-linux-android: - name: test-llama-runner-linux-android - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - strategy: - fail-fast: false - with: - runner: linux.2xlarge - docker-image: executorch-ubuntu-22.04-clang12-android - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 90 - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - BUILD_TOOL="cmake" - PYTHON_EXECUTABLE=python \ - bash .ci/scripts/build_llama_android.sh "${BUILD_TOOL}" - - test-custom-ops-linux: - name: test-custom-ops-linux - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - strategy: - fail-fast: false - with: - runner: linux.2xlarge - docker-image: executorch-ubuntu-22.04-clang12 - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 90 - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - BUILD_TOOL="cmake" - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" - # Test custom ops - PYTHON_EXECUTABLE=python bash examples/portable/custom_ops/test_custom_ops.sh "${BUILD_TOOL}" - - test-selective-build-linux: - name: test-selective-build-linux - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - strategy: - fail-fast: false - with: - runner: linux.2xlarge - docker-image: executorch-ubuntu-22.04-clang12 - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 90 - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - BUILD_TOOL="cmake" - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" - # Test selective build - PYTHON_EXECUTABLE=python bash examples/selective_build/test_selective_build.sh "${BUILD_TOOL}" - - test-llava-runner-linux: - name: test-llava-runner-linux - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - strategy: - fail-fast: false - with: - runner: linux.24xlarge - docker-image: executorch-ubuntu-22.04-clang12 - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 90 - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate 
"${CONDA_ENV}" - - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" - - # install pybind - bash install_requirements.sh --pybind xnnpack - - # install Llava requirements - bash examples/models/llama/install_requirements.sh - bash examples/models/llava/install_requirements.sh - - # run python unittest - python -m unittest examples.models.llava.test.test_llava - - # run e2e (export, tokenizer and runner) - PYTHON_EXECUTABLE=python bash .ci/scripts/test_llava.sh - - test-quantized-aot-lib-linux: - name: test-quantized-aot-lib-linux - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - strategy: - fail-fast: false - with: - runner: linux.2xlarge - docker-image: executorch-ubuntu-22.04-clang12 - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 90 - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - BUILD_TOOL="cmake" - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" - PYTHON_EXECUTABLE=python bash examples/xnnpack/quantization/test_quantize.sh "${BUILD_TOOL}" mv2 - - test-pybind-build-linux: - name: test-pybind-build-linux - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - strategy: - fail-fast: false - with: - runner: linux.2xlarge - docker-image: executorch-ubuntu-22.04-clang12 - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 90 - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - # build module for executorch.extension.pybindings.portable_lib - BUILD_TOOL="cmake" - PYTHON_EXECUTABLE=python \ - EXECUTORCH_BUILD_XNNPACK=ON \ - EXECUTORCH_BUILD_PYBIND=ON \ - bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" - - # see if we can import the module successfully - python -c "from executorch.extension.pybindings import portable_lib; print('success!')" - - test-binary-size-linux-gcc: - name: test-binary-size-linux-gcc - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - strategy: - fail-fast: false - with: - runner: linux.2xlarge - docker-image: executorch-ubuntu-22.04-gcc9 - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 90 - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - # build module for executorch.extension.pybindings.portable_lib - bash test/build_size_test.sh - strip cmake-out/test/size_test - output=$(ls -la cmake-out/test/size_test) - arr=($output) - size=${arr[4]} - # threshold=48120 on devserver with gcc11.4 - # todo(lfq): update once binary size is below 50kb. 
- threshold="51504" - if [[ "$size" -le "$threshold" ]]; then - echo "Success $size <= $threshold" - else - echo "Fail $size > $threshold" - exit 1 - fi - - test-binary-size-linux: - name: test-binary-size-linux - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - strategy: - fail-fast: false - with: - runner: linux.2xlarge - docker-image: executorch-ubuntu-22.04-clang12 - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 90 - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - # build module for executorch.extension.pybindings.portable_lib - bash test/build_size_test.sh - strip cmake-out/test/size_test - output=$(ls -la cmake-out/test/size_test) - arr=($output) - size=${arr[4]} - # threshold=48120 on devserver with gcc11.4 - # todo(lfq): update once binary size is below 50kb. - threshold="51784" - if [[ "$size" -le "$threshold" ]]; then - echo "Success $size <= $threshold" - else - echo "Fail $size > $threshold" - exit 1 - fi - - android: - uses: ./.github/workflows/_android.yml - needs: test-llama-runner-linux - - unittest: - uses: ./.github/workflows/_unittest.yml - with: - docker-image: executorch-ubuntu-22.04-clang12 - - unittest-arm: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - with: - runner: linux.2xlarge - docker-image: executorch-ubuntu-22.04-arm-sdk - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 90 - script: | - set -eux - - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - BUILD_TOOL="cmake" - - # Setup MacOS dependencies as there is no Docker support on MacOS atm - PYTHON_EXECUTABLE=python \ - EXECUTORCH_BUILD_PYBIND=ON \ - EXECUTORCH_BUILD_ARM_BAREMETAL=ON \ - .ci/scripts/setup-linux.sh "${BUILD_TOOL}" - - source .ci/scripts/utils.sh - # Install Arm dependencies - install_arm - - # Run pytest with coverage - pytest -c /dev/null -v -n auto --cov=./ --cov-report=xml backends/arm/test + # test-models-linux: + # name: test-models-linux + # uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + # needs: gather-models + # strategy: + # matrix: ${{ fromJSON(needs.gather-models.outputs.models) }} + # fail-fast: false + # with: + # runner: ${{ matrix.runner }} + # docker-image: executorch-ubuntu-22.04-clang12 + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: ${{ matrix.timeout }} + # script: | + # # The generic Linux job chooses to use base env, not the one setup by the image + # CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + # conda activate "${CONDA_ENV}" + + # MODEL_NAME=${{ matrix.model }} + # BUILD_TOOL=${{ matrix.build-tool }} + # BACKEND=${{ matrix.backend }} + # DEMO_BACKEND_DELEGATION=${{ matrix.demo_backend_delegation }} + + # PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" + # # Build and test ExecuTorch + # PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" + + # test-llama-runner-linux: + # name: test-llama-runner-linux + # uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + # strategy: + 
# matrix: + # dtype: [fp32] + # mode: [portable, xnnpack+custom, xnnpack+custom+qe] + # include: + # - dtype: bf16 + # mode: portable + # - dtype: bf16 + # mode: custom + # fail-fast: false + # with: + # runner: linux.2xlarge + # docker-image: executorch-ubuntu-22.04-clang12 + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 900 + # upload-artifact: android-models + # upload-artifact-to-s3: true + # script: | + # # The generic Linux job chooses to use base env, not the one setup by the image + # CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + # conda activate "${CONDA_ENV}" + + # DTYPE=${{ matrix.dtype }} + # BUILD_TOOL="cmake" + # MODE=${{ matrix.mode }} + # ARTIFACTS_DIR_NAME="artifacts-to-be-uploaded/${DTYPE}-${MODE}" + # ARTIFACTS_DIR_NAME="${ARTIFACTS_DIR_NAME/+/-}" + + # # Setup executorch + # PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" + # # Install requirements for export_llama + # PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh + # # Test llama2 + # PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -dtype "${DTYPE}" -mode "${MODE}" -upload "${ARTIFACTS_DIR_NAME}" + + # test-llama-runner-linux-android: + # name: test-llama-runner-linux-android + # uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + # strategy: + # fail-fast: false + # with: + # runner: linux.2xlarge + # docker-image: executorch-ubuntu-22.04-clang12-android + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 90 + # script: | + # # The generic Linux job chooses to use base env, not the one setup by the image + # CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + # conda activate "${CONDA_ENV}" + + # BUILD_TOOL="cmake" + # PYTHON_EXECUTABLE=python \ + # bash .ci/scripts/build_llama_android.sh "${BUILD_TOOL}" + + # test-custom-ops-linux: + # name: test-custom-ops-linux + # uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + # strategy: + # fail-fast: false + # with: + # runner: linux.2xlarge + # docker-image: executorch-ubuntu-22.04-clang12 + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 90 + # script: | + # # The generic Linux job chooses to use base env, not the one setup by the image + # CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + # conda activate "${CONDA_ENV}" + + # BUILD_TOOL="cmake" + # PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" + # # Test custom ops + # PYTHON_EXECUTABLE=python bash examples/portable/custom_ops/test_custom_ops.sh "${BUILD_TOOL}" + + # test-selective-build-linux: + # name: test-selective-build-linux + # uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + # strategy: + # fail-fast: false + # with: + # runner: linux.2xlarge + # docker-image: executorch-ubuntu-22.04-clang12 + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 90 + # script: | + # # The generic Linux job chooses to use base env, not the one setup by the image + # CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + # conda activate "${CONDA_ENV}" + + # BUILD_TOOL="cmake" + # PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" + # # Test 
selective build + # PYTHON_EXECUTABLE=python bash examples/selective_build/test_selective_build.sh "${BUILD_TOOL}" + + # test-llava-runner-linux: + # name: test-llava-runner-linux + # uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + # strategy: + # fail-fast: false + # with: + # runner: linux.24xlarge + # docker-image: executorch-ubuntu-22.04-clang12 + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 90 + # script: | + # # The generic Linux job chooses to use base env, not the one setup by the image + # CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + # conda activate "${CONDA_ENV}" + + # PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" + + # # install pybind + # bash install_requirements.sh --pybind xnnpack + + # # install Llava requirements + # bash examples/models/llama/install_requirements.sh + # bash examples/models/llava/install_requirements.sh + + # # run python unittest + # python -m unittest examples.models.llava.test.test_llava + + # # run e2e (export, tokenizer and runner) + # PYTHON_EXECUTABLE=python bash .ci/scripts/test_llava.sh + + # test-quantized-aot-lib-linux: + # name: test-quantized-aot-lib-linux + # uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + # strategy: + # fail-fast: false + # with: + # runner: linux.2xlarge + # docker-image: executorch-ubuntu-22.04-clang12 + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 90 + # script: | + # # The generic Linux job chooses to use base env, not the one setup by the image + # CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + # conda activate "${CONDA_ENV}" + + # BUILD_TOOL="cmake" + # PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" + # PYTHON_EXECUTABLE=python bash examples/xnnpack/quantization/test_quantize.sh "${BUILD_TOOL}" mv2 + + # test-pybind-build-linux: + # name: test-pybind-build-linux + # uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + # strategy: + # fail-fast: false + # with: + # runner: linux.2xlarge + # docker-image: executorch-ubuntu-22.04-clang12 + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 90 + # script: | + # # The generic Linux job chooses to use base env, not the one setup by the image + # CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + # conda activate "${CONDA_ENV}" + + # # build module for executorch.extension.pybindings.portable_lib + # BUILD_TOOL="cmake" + # PYTHON_EXECUTABLE=python \ + # EXECUTORCH_BUILD_XNNPACK=ON \ + # EXECUTORCH_BUILD_PYBIND=ON \ + # bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" + + # # see if we can import the module successfully + # python -c "from executorch.extension.pybindings import portable_lib; print('success!')" + + # test-binary-size-linux-gcc: + # name: test-binary-size-linux-gcc + # uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + # strategy: + # fail-fast: false + # with: + # runner: linux.2xlarge + # docker-image: executorch-ubuntu-22.04-gcc9 + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 90 + # script: | + # # The generic Linux job chooses to use base env, not the one setup by the image + # CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + # conda activate 
"${CONDA_ENV}" + + # # build module for executorch.extension.pybindings.portable_lib + # bash test/build_size_test.sh + # strip cmake-out/test/size_test + # output=$(ls -la cmake-out/test/size_test) + # arr=($output) + # size=${arr[4]} + # # threshold=48120 on devserver with gcc11.4 + # # todo(lfq): update once binary size is below 50kb. + # threshold="51504" + # if [[ "$size" -le "$threshold" ]]; then + # echo "Success $size <= $threshold" + # else + # echo "Fail $size > $threshold" + # exit 1 + # fi + + # test-binary-size-linux: + # name: test-binary-size-linux + # uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + # strategy: + # fail-fast: false + # with: + # runner: linux.2xlarge + # docker-image: executorch-ubuntu-22.04-clang12 + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 90 + # script: | + # # The generic Linux job chooses to use base env, not the one setup by the image + # CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + # conda activate "${CONDA_ENV}" + + # # build module for executorch.extension.pybindings.portable_lib + # bash test/build_size_test.sh + # strip cmake-out/test/size_test + # output=$(ls -la cmake-out/test/size_test) + # arr=($output) + # size=${arr[4]} + # # threshold=48120 on devserver with gcc11.4 + # # todo(lfq): update once binary size is below 50kb. + # threshold="51784" + # if [[ "$size" -le "$threshold" ]]; then + # echo "Success $size <= $threshold" + # else + # echo "Fail $size > $threshold" + # exit 1 + # fi + + # android: + # uses: ./.github/workflows/_android.yml + # needs: test-llama-runner-linux + + # unittest: + # uses: ./.github/workflows/_unittest.yml + # with: + # docker-image: executorch-ubuntu-22.04-clang12 + + # unittest-arm: + # uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + # with: + # runner: linux.2xlarge + # docker-image: executorch-ubuntu-22.04-arm-sdk + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 90 + # script: | + # set -eux + + # # The generic Linux job chooses to use base env, not the one setup by the image + # CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + # conda activate "${CONDA_ENV}" + + # BUILD_TOOL="cmake" + + # # Setup MacOS dependencies as there is no Docker support on MacOS atm + # PYTHON_EXECUTABLE=python \ + # EXECUTORCH_BUILD_PYBIND=ON \ + # EXECUTORCH_BUILD_ARM_BAREMETAL=ON \ + # .ci/scripts/setup-linux.sh "${BUILD_TOOL}" + + # source .ci/scripts/utils.sh + # # Install Arm dependencies + # install_arm + + # # Run pytest with coverage + # pytest -c /dev/null -v -n auto --cov=./ --cov-report=xml backends/arm/test test-llama-runner-qnn-linux: @@ -368,6 +368,7 @@ jobs: strategy: matrix: dtype: [fp32] + pt2e_quantize: [qnn_16a16w] mode: [qnn] fail-fast: false with: @@ -384,6 +385,7 @@ jobs: DTYPE=${{ matrix.dtype }} BUILD_TOOL="cmake" MODE=${{ matrix.mode }} + PT2E_QUANTIZE=${{ matrix.pt2e_quantize }} PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh @@ -393,112 +395,112 @@ jobs: # Install requirements for export_llama PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh # Test llama2 - PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -dtype "${DTYPE}" -mode "${MODE}" - - test-phi-3-mini-runner-linux: - name: test-phi-3-mini-runner-linux - 
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - strategy: - fail-fast: false - with: - runner: linux.24xlarge - docker-image: executorch-ubuntu-22.04-clang12 - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 90 - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" - - # install pybind - bash install_requirements.sh --pybind xnnpack - - # install phi-3-mini requirements - bash examples/models/phi-3-mini/install_requirements.sh - - # run e2e (export, tokenizer and runner) - PYTHON_EXECUTABLE=python bash .ci/scripts/test_phi_3_mini.sh - - test-eval_llama-wikitext-linux: - name: test-eval_llama-wikitext-linux - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - strategy: - fail-fast: false - with: - runner: linux.24xlarge - docker-image: executorch-ubuntu-22.04-clang12 - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 90 - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" - - # install pybind - bash install_requirements.sh --pybind xnnpack - - # install llama requirements - bash examples/models/llama/install_requirements.sh - - # run eval_llama wikitext task - PYTHON_EXECUTABLE=python bash .ci/scripts/test_eval_llama_wikitext.sh - - test-eval_llama-mmlu-linux: - name: test-eval_llama-mmlu-linux - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - strategy: - fail-fast: false - with: - runner: linux.24xlarge - docker-image: executorch-ubuntu-22.04-clang12 - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 90 - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" - - # install pybind - bash install_requirements.sh --pybind xnnpack - - # install llama requirements - bash examples/models/llama/install_requirements.sh - - # run eval_llama mmlu task - PYTHON_EXECUTABLE=python bash .ci/scripts/test_eval_llama_mmlu.sh - - test-llama_runner_eager-linux: - name: test-llama_runner_eager-linux - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - strategy: - fail-fast: false - with: - runner: linux.24xlarge - docker-image: executorch-ubuntu-22.04-clang12 - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 90 - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" - - # install pybind - bash install_requirements.sh --pybind xnnpack - - # install llama requirements - bash examples/models/llama/install_requirements.sh - - # run llama runner in eager mode - PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama_runner_eager.sh + 
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -dtype "${DTYPE}" -mode "${MODE}" -pt2e_quantize "${PT2E_QUANTIZE}" + + # test-phi-3-mini-runner-linux: + # name: test-phi-3-mini-runner-linux + # uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + # strategy: + # fail-fast: false + # with: + # runner: linux.24xlarge + # docker-image: executorch-ubuntu-22.04-clang12 + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 90 + # script: | + # # The generic Linux job chooses to use base env, not the one setup by the image + # CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + # conda activate "${CONDA_ENV}" + + # PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" + + # # install pybind + # bash install_requirements.sh --pybind xnnpack + + # # install phi-3-mini requirements + # bash examples/models/phi-3-mini/install_requirements.sh + + # # run e2e (export, tokenizer and runner) + # PYTHON_EXECUTABLE=python bash .ci/scripts/test_phi_3_mini.sh + + # test-eval_llama-wikitext-linux: + # name: test-eval_llama-wikitext-linux + # uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + # strategy: + # fail-fast: false + # with: + # runner: linux.24xlarge + # docker-image: executorch-ubuntu-22.04-clang12 + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 90 + # script: | + # # The generic Linux job chooses to use base env, not the one setup by the image + # CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + # conda activate "${CONDA_ENV}" + + # PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" + + # # install pybind + # bash install_requirements.sh --pybind xnnpack + + # # install llama requirements + # bash examples/models/llama/install_requirements.sh + + # # run eval_llama wikitext task + # PYTHON_EXECUTABLE=python bash .ci/scripts/test_eval_llama_wikitext.sh + + # test-eval_llama-mmlu-linux: + # name: test-eval_llama-mmlu-linux + # uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + # strategy: + # fail-fast: false + # with: + # runner: linux.24xlarge + # docker-image: executorch-ubuntu-22.04-clang12 + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 90 + # script: | + # # The generic Linux job chooses to use base env, not the one setup by the image + # CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + # conda activate "${CONDA_ENV}" + + # PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" + + # # install pybind + # bash install_requirements.sh --pybind xnnpack + + # # install llama requirements + # bash examples/models/llama/install_requirements.sh + + # # run eval_llama mmlu task + # PYTHON_EXECUTABLE=python bash .ci/scripts/test_eval_llama_mmlu.sh + + # test-llama_runner_eager-linux: + # name: test-llama_runner_eager-linux + # uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + # strategy: + # fail-fast: false + # with: + # runner: linux.24xlarge + # docker-image: executorch-ubuntu-22.04-clang12 + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 90 + # script: | + # # The generic Linux job chooses to use base env, not the one setup by the image + # CONDA_ENV=$(conda env list 
--json | jq -r ".envs | .[-1]") + # conda activate "${CONDA_ENV}" + + # PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" + + # # install pybind + # bash install_requirements.sh --pybind xnnpack + + # # install llama requirements + # bash examples/models/llama/install_requirements.sh + + # # run llama runner in eager mode + # PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama_runner_eager.sh diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 7afc385a19..5f5d638ba4 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -36,408 +36,408 @@ jobs: PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --target-os macos --event "${GITHUB_EVENT_NAME}" - test-models-macos: - name: test-models-macos - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main - needs: gather-models - strategy: - matrix: ${{ fromJSON(needs.gather-models.outputs.models) }} - fail-fast: false - with: - runner: ${{ matrix.runner }} - python-version: '3.11' - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: ${{ matrix.timeout }} - script: | - MODEL_NAME=${{ matrix.model }} - BUILD_TOOL=${{ matrix.build-tool }} - BACKEND=${{ matrix.backend }} - DEMO_BACKEND_DELEGATION=${{ matrix.demo_backend_delegation }} - - bash .ci/scripts/setup-conda.sh - # Setup MacOS dependencies as there is no Docker support on MacOS atm - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" - # Build and test executorch - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" - - test-custom-ops-macos: - name: test-custom-ops-macos - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main - strategy: - matrix: - include: - - build-tool: cmake - fail-fast: false - with: - runner: macos-m1-stable - python-version: '3.11' - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - script: | - BUILD_TOOL=${{ matrix.build-tool }} - - bash .ci/scripts/setup-conda.sh - # Setup MacOS dependencies as there is no Docker support on MacOS atm - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" - # Build and test custom ops - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/portable/custom_ops/test_custom_ops.sh "${BUILD_TOOL}" - - test-selective-build-macos: - name: test-selective-build-macos - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main - strategy: - matrix: - include: - - build-tool: cmake - fail-fast: false - with: - runner: macos-m1-stable - python-version: '3.11' - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - script: | - BUILD_TOOL=${{ matrix.build-tool }} - - bash .ci/scripts/setup-conda.sh - # Setup MacOS dependencies as there is no Docker support on MacOS atm - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" - # Build and test selective build - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/selective_build/test_selective_build.sh "${BUILD_TOOL}" - - test-demo-backend-delegation: - name: test-demo-backend-delegation - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - strategy: - matrix: - include: - - build-tool: buck2 - - build-tool: cmake - fail-fast: false - with: - runner: linux.2xlarge - docker-image: 
executorch-ubuntu-22.04-clang12 - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - BUILD_TOOL=${{ matrix.build-tool }} - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" - # Test selective build - PYTHON_EXECUTABLE=python bash examples/portable/scripts/test_demo_backend_delegation.sh "${BUILD_TOOL}" - - test-arm-backend-delegation: - name: test-arm-backend-delegation - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - with: - runner: linux.2xlarge - docker-image: executorch-ubuntu-22.04-arm-sdk - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 90 - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - source .ci/scripts/utils.sh - install_executorch - - install_arm - - # Increase number of files user can monitor to bypass buck failures. - # Hopefully this is high enough for this setup. - sudo sysctl fs.inotify.max_user_watches=1048576 # 1024 * 1024 - - # Test ethos-u delegate examples with run.sh - PYTHON_EXECUTABLE=python bash examples/arm/run.sh examples/arm/ethos-u-scratch/ - - test-arm-reference-delegation: - name: test-arm-reference-delegation - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - with: - runner: linux.2xlarge - docker-image: executorch-ubuntu-22.04-arm-sdk - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 90 - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - source .ci/scripts/utils.sh - install_executorch - - install_arm - - # Run arm unit tests - pytest -c /dev/null -v -n auto --cov=./ --cov-report=xml backends/arm/test - - test-coreml-delegate: - name: test-coreml-delegate - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main - with: - runner: macos-13-xlarge - python-version: '3.11' - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 90 - script: | - BUILD_TOOL=cmake - - bash .ci/scripts/setup-conda.sh - # Setup MacOS dependencies as there is no Docker support on MacOS atm - GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" - # Build and test coreml delegate - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash backends/apple/coreml/scripts/build_all.sh - - test-pybind-build-macos: - name: test-pybind-build-macos - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main - strategy: - matrix: - include: - - build-tool: cmake - fail-fast: false - with: - runner: macos-m1-stable - python-version: '3.11' - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 180 - script: | - bash .ci/scripts/setup-conda.sh - - # build module for executorch.extension.pybindings.portable_lib - BUILD_TOOL=${{ matrix.build-tool }} - EXECUTORCH_BUILD_PYBIND=ON PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh 
"${BUILD_TOOL}" - - # see if we can import the module successfully - ${CONDA_RUN} python -c "from executorch.extension.pybindings import portable_lib; print('success!')" - - test-llama-runner-macos: - name: test-llama-runner-mac - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main - strategy: - matrix: - dtype: [fp32] - mode: [portable, xnnpack+kv+custom, mps, coreml] - include: - - dtype: bf16 - mode: portable - - dtype: bf16 - mode: custom - fail-fast: false - with: - runner: macos-m1-stable - python-version: '3.11' - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 900 - script: | - - DTYPE=${{ matrix.dtype }} - MODE=${{ matrix.mode }} - - bash .ci/scripts/setup-conda.sh - - # Setup executorch - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh cmake - - if [[ "${MODE}" == "mps" ]]; then - # Install mps delegate - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash backends/apple/mps/install_requirements.sh - echo "Finishing installing mps." - elif [[ "${MODE}" == "coreml" ]]; then - # Install coreml delegate - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash backends/apple/coreml/scripts/install_requirements.sh - echo "Finishing installing coreml." - fi - - # Install requirements for export_llama - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama/install_requirements.sh - # Test llama2 - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh -model stories110M -build_tool cmake -dtype "${DTYPE}" -mode "${MODE}" - - # # TODO(jackzhxng): Runner consistently runs out of memory before test finishes. Try to find a more powerful runner. - # test-llava-runner-macos: - # name: test-llava-runner-macos + # test-models-macos: + # name: test-models-macos # uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + # needs: gather-models # strategy: + # matrix: ${{ fromJSON(needs.gather-models.outputs.models) }} # fail-fast: false # with: - # runner: macos-14-xlarge + # runner: ${{ matrix.runner }} + # python-version: '3.11' + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: ${{ matrix.timeout }} + # script: | + # MODEL_NAME=${{ matrix.model }} + # BUILD_TOOL=${{ matrix.build-tool }} + # BACKEND=${{ matrix.backend }} + # DEMO_BACKEND_DELEGATION=${{ matrix.demo_backend_delegation }} + + # bash .ci/scripts/setup-conda.sh + # # Setup MacOS dependencies as there is no Docker support on MacOS atm + # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + # # Build and test executorch + # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" + + # test-custom-ops-macos: + # name: test-custom-ops-macos + # uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + # strategy: + # matrix: + # include: + # - build-tool: cmake + # fail-fast: false + # with: + # runner: macos-m1-stable + # python-version: '3.11' + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # script: | + # BUILD_TOOL=${{ matrix.build-tool }} + + # bash .ci/scripts/setup-conda.sh + # # Setup MacOS dependencies as there is no Docker support on MacOS atm + # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + # # Build and test custom ops + # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash 
examples/portable/custom_ops/test_custom_ops.sh "${BUILD_TOOL}" + + # test-selective-build-macos: + # name: test-selective-build-macos + # uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + # strategy: + # matrix: + # include: + # - build-tool: cmake + # fail-fast: false + # with: + # runner: macos-m1-stable # python-version: '3.11' # submodules: 'true' # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # timeout: 900 + # script: | + # BUILD_TOOL=${{ matrix.build-tool }} + + # bash .ci/scripts/setup-conda.sh + # # Setup MacOS dependencies as there is no Docker support on MacOS atm + # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + # # Build and test selective build + # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/selective_build/test_selective_build.sh "${BUILD_TOOL}" + + # test-demo-backend-delegation: + # name: test-demo-backend-delegation + # uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + # strategy: + # matrix: + # include: + # - build-tool: buck2 + # - build-tool: cmake + # fail-fast: false + # with: + # runner: linux.2xlarge + # docker-image: executorch-ubuntu-22.04-clang12 + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # script: | + # # The generic Linux job chooses to use base env, not the one setup by the image + # CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + # conda activate "${CONDA_ENV}" + + # BUILD_TOOL=${{ matrix.build-tool }} + # PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" + # # Test selective build + # PYTHON_EXECUTABLE=python bash examples/portable/scripts/test_demo_backend_delegation.sh "${BUILD_TOOL}" + + # test-arm-backend-delegation: + # name: test-arm-backend-delegation + # uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + # with: + # runner: linux.2xlarge + # docker-image: executorch-ubuntu-22.04-arm-sdk + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 90 + # script: | + # # The generic Linux job chooses to use base env, not the one setup by the image + # CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + # conda activate "${CONDA_ENV}" + + # source .ci/scripts/utils.sh + # install_executorch + + # install_arm + + # # Increase number of files user can monitor to bypass buck failures. + # # Hopefully this is high enough for this setup. 
+ # sudo sysctl fs.inotify.max_user_watches=1048576 # 1024 * 1024 + + # # Test ethos-u delegate examples with run.sh + # PYTHON_EXECUTABLE=python bash examples/arm/run.sh examples/arm/ethos-u-scratch/ + + # test-arm-reference-delegation: + # name: test-arm-reference-delegation + # uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + # with: + # runner: linux.2xlarge + # docker-image: executorch-ubuntu-22.04-arm-sdk + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 90 + # script: | + # # The generic Linux job chooses to use base env, not the one setup by the image + # CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + # conda activate "${CONDA_ENV}" + + # source .ci/scripts/utils.sh + # install_executorch + + # install_arm + + # # Run arm unit tests + # pytest -c /dev/null -v -n auto --cov=./ --cov-report=xml backends/arm/test + + # test-coreml-delegate: + # name: test-coreml-delegate + # uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + # with: + # runner: macos-13-xlarge + # python-version: '3.11' + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 90 # script: | # BUILD_TOOL=cmake # bash .ci/scripts/setup-conda.sh # # Setup MacOS dependencies as there is no Docker support on MacOS atm # GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + # # Build and test coreml delegate + # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash backends/apple/coreml/scripts/build_all.sh + + # test-pybind-build-macos: + # name: test-pybind-build-macos + # uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + # strategy: + # matrix: + # include: + # - build-tool: cmake + # fail-fast: false + # with: + # runner: macos-m1-stable + # python-version: '3.11' + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 180 + # script: | + # bash .ci/scripts/setup-conda.sh + + # # build module for executorch.extension.pybindings.portable_lib + # BUILD_TOOL=${{ matrix.build-tool }} + # EXECUTORCH_BUILD_PYBIND=ON PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + + # # see if we can import the module successfully + # ${CONDA_RUN} python -c "from executorch.extension.pybindings import portable_lib; print('success!')" + + # test-llama-runner-macos: + # name: test-llama-runner-mac + # uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + # strategy: + # matrix: + # dtype: [fp32] + # mode: [portable, xnnpack+kv+custom, mps, coreml] + # include: + # - dtype: bf16 + # mode: portable + # - dtype: bf16 + # mode: custom + # fail-fast: false + # with: + # runner: macos-m1-stable + # python-version: '3.11' + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 900 + # script: | + + # DTYPE=${{ matrix.dtype }} + # MODE=${{ matrix.mode }} + + # bash .ci/scripts/setup-conda.sh - # # install Llava requirements - # ${CONDA_RUN} bash examples/models/llama/install_requirements.sh - # ${CONDA_RUN} bash examples/models/llava/install_requirements.sh - - # # run python unittest - # ${CONDA_RUN} python -m unittest examples.models.llava.test.test_llava - - # # run e2e (export, tokenizer and runner) - # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llava.sh 
Release - - test-qnn-model: - name: test-qnn-model - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - strategy: - matrix: - dtype: [fp32] - model: [dl3, mv3, mv2, ic4, ic3, vit] - fail-fast: false - with: - runner: linux.2xlarge - docker-image: executorch-ubuntu-22.04-qnn-sdk - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 900 - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh - PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh - PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh ${{ matrix.model }} "cmake" "qnn" - - test-apple-model: - name: test-apple-model - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main - strategy: - fail-fast: false - with: - runner: macos-m1-stable - python-version: '3.11' - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 90 - script: | - BUILD_TOOL=cmake - - bash .ci/scripts/setup-conda.sh - - # Setup MacOS dependencies as there is no Docker support on MacOS atm - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash backends/apple/coreml/scripts/install_requirements.sh - echo "Finishing installing coreml." - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash backends/apple/mps/install_requirements.sh - echo "Finishing installing mps." - - # Build and test coreml model - MODELS=(mv3 ic4 resnet50 edsr mobilebert w2l) - for MODEL_NAME in "${MODELS[@]}"; do - echo "::group::Exporting coreml model: $MODEL_NAME" - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "coreml" - echo "::endgroup::" - - echo "::group::Exporting mps model: $MODEL_NAME" - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "mps" - echo "::endgroup::" - done - - test-huggingface-transformers: - name: test-huggingface-transformers - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - secrets: inherit - strategy: - matrix: - hf_model_repo: [google/gemma-2b] - fail-fast: false - with: - secrets-env: EXECUTORCH_HF_TOKEN - runner: linux.12xlarge - docker-image: executorch-ubuntu-22.04-clang12 - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 90 - script: | - echo "::group::Set up ExecuTorch" - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake - - echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a" - rm -rf cmake-out - cmake \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DPYTHON_EXECUTABLE=python \ - -Bcmake-out . 
- cmake --build cmake-out -j9 --target install --config Release - - echo "Build llama runner" - dir="examples/models/llama" - cmake \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DPYTHON_EXECUTABLE=python \ - -Bcmake-out/${dir} \ - ${dir} - cmake --build cmake-out/${dir} -j9 --config Release - echo "::endgroup::" - - echo "::group::Set up HuggingFace Dependencies" - if [ -z "$SECRET_EXECUTORCH_HF_TOKEN" ]; then - echo "::error::SECRET_EXECUTORCH_HF_TOKEN is empty. For security reason secrets won't be accessible on forked PRs. Please make sure you submit a non-forked PR." - exit 1 - fi - pip install -U "huggingface_hub[cli]" - huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN - pip install accelerate sentencepiece - pip list - echo "::endgroup::" - - echo "::group::Export to ExecuTorch" - TOKENIZER_FILE=tokenizer.model - TOKENIZER_BIN_FILE=tokenizer.bin - ET_MODEL_NAME=et_model - # Fetch the file using a Python one-liner - DOWNLOADED_TOKENIZER_FILE_PATH=$(python -c " - from huggingface_hub import hf_hub_download - # Download the file from the Hugging Face Hub - downloaded_path = hf_hub_download( - repo_id='${{ matrix.hf_model_repo }}', - filename='${TOKENIZER_FILE}' - ) - print(downloaded_path) - ") - if [ -f "$DOWNLOADED_TOKENIZER_FILE_PATH" ]; then - echo "${TOKENIZER_FILE} downloaded successfully at: $DOWNLOADED_TOKENIZER_FILE_PATH" - python -m extension.llm.tokenizer.tokenizer -t $DOWNLOADED_TOKENIZER_FILE_PATH -o ./${TOKENIZER_BIN_FILE} - ls ./tokenizer.bin - else - echo "Failed to download ${TOKENIZER_FILE} from ${{ matrix.hf_model_repo }}." - exit 1 - fi - - python -m extension.export_util.export_hf_model -hfm=${{ matrix.hf_model_repo }} -o ${ET_MODEL_NAME} - - cmake-out/examples/models/llama/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is" - echo "::endgroup::" + # # Setup executorch + # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh cmake + + # if [[ "${MODE}" == "mps" ]]; then + # # Install mps delegate + # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash backends/apple/mps/install_requirements.sh + # echo "Finishing installing mps." + # elif [[ "${MODE}" == "coreml" ]]; then + # # Install coreml delegate + # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash backends/apple/coreml/scripts/install_requirements.sh + # echo "Finishing installing coreml." + # fi + + # # Install requirements for export_llama + # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama/install_requirements.sh + # # Test llama2 + # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh -model stories110M -build_tool cmake -dtype "${DTYPE}" -mode "${MODE}" + + # # # TODO(jackzhxng): Runner consistently runs out of memory before test finishes. Try to find a more powerful runner. 
+ # # test-llava-runner-macos: + # # name: test-llava-runner-macos + # # uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + # # strategy: + # # fail-fast: false + # # with: + # # runner: macos-14-xlarge + # # python-version: '3.11' + # # submodules: 'true' + # # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # # timeout: 900 + # # script: | + # # BUILD_TOOL=cmake + + # # bash .ci/scripts/setup-conda.sh + # # # Setup MacOS dependencies as there is no Docker support on MacOS atm + # # GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + + # # # install Llava requirements + # # ${CONDA_RUN} bash examples/models/llama/install_requirements.sh + # # ${CONDA_RUN} bash examples/models/llava/install_requirements.sh + + # # # run python unittest + # # ${CONDA_RUN} python -m unittest examples.models.llava.test.test_llava + + # # # run e2e (export, tokenizer and runner) + # # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llava.sh Release + + # test-qnn-model: + # name: test-qnn-model + # uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + # strategy: + # matrix: + # dtype: [fp32] + # model: [dl3, mv3, mv2, ic4, ic3, vit] + # fail-fast: false + # with: + # runner: linux.2xlarge + # docker-image: executorch-ubuntu-22.04-qnn-sdk + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 900 + # script: | + # # The generic Linux job chooses to use base env, not the one setup by the image + # CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + # conda activate "${CONDA_ENV}" + # PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake + # PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh + # PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh + # PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh ${{ matrix.model }} "cmake" "qnn" + + # test-apple-model: + # name: test-apple-model + # uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + # strategy: + # fail-fast: false + # with: + # runner: macos-m1-stable + # python-version: '3.11' + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 90 + # script: | + # BUILD_TOOL=cmake + + # bash .ci/scripts/setup-conda.sh + + # # Setup MacOS dependencies as there is no Docker support on MacOS atm + # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash backends/apple/coreml/scripts/install_requirements.sh + # echo "Finishing installing coreml." + # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash backends/apple/mps/install_requirements.sh + # echo "Finishing installing mps." 
+ + # # Build and test coreml model + # MODELS=(mv3 ic4 resnet50 edsr mobilebert w2l) + # for MODEL_NAME in "${MODELS[@]}"; do + # echo "::group::Exporting coreml model: $MODEL_NAME" + # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "coreml" + # echo "::endgroup::" + + # echo "::group::Exporting mps model: $MODEL_NAME" + # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "mps" + # echo "::endgroup::" + # done + + # test-huggingface-transformers: + # name: test-huggingface-transformers + # uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + # secrets: inherit + # strategy: + # matrix: + # hf_model_repo: [google/gemma-2b] + # fail-fast: false + # with: + # secrets-env: EXECUTORCH_HF_TOKEN + # runner: linux.12xlarge + # docker-image: executorch-ubuntu-22.04-clang12 + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 90 + # script: | + # echo "::group::Set up ExecuTorch" + # # The generic Linux job chooses to use base env, not the one setup by the image + # CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + # conda activate "${CONDA_ENV}" + # PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake + + # echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a" + # rm -rf cmake-out + # cmake \ + # -DCMAKE_INSTALL_PREFIX=cmake-out \ + # -DCMAKE_BUILD_TYPE=Release \ + # -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + # -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + # -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + # -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + # -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + # -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + # -DEXECUTORCH_BUILD_XNNPACK=ON \ + # -DPYTHON_EXECUTABLE=python \ + # -Bcmake-out . + # cmake --build cmake-out -j9 --target install --config Release + + # echo "Build llama runner" + # dir="examples/models/llama" + # cmake \ + # -DCMAKE_INSTALL_PREFIX=cmake-out \ + # -DCMAKE_BUILD_TYPE=Release \ + # -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + # -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + # -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + # -DEXECUTORCH_BUILD_XNNPACK=ON \ + # -DPYTHON_EXECUTABLE=python \ + # -Bcmake-out/${dir} \ + # ${dir} + # cmake --build cmake-out/${dir} -j9 --config Release + # echo "::endgroup::" + + # echo "::group::Set up HuggingFace Dependencies" + # if [ -z "$SECRET_EXECUTORCH_HF_TOKEN" ]; then + # echo "::error::SECRET_EXECUTORCH_HF_TOKEN is empty. For security reason secrets won't be accessible on forked PRs. Please make sure you submit a non-forked PR." 
+ # exit 1 + # fi + # pip install -U "huggingface_hub[cli]" + # huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + # pip install accelerate sentencepiece + # pip list + # echo "::endgroup::" + + # echo "::group::Export to ExecuTorch" + # TOKENIZER_FILE=tokenizer.model + # TOKENIZER_BIN_FILE=tokenizer.bin + # ET_MODEL_NAME=et_model + # # Fetch the file using a Python one-liner + # DOWNLOADED_TOKENIZER_FILE_PATH=$(python -c " + # from huggingface_hub import hf_hub_download + # # Download the file from the Hugging Face Hub + # downloaded_path = hf_hub_download( + # repo_id='${{ matrix.hf_model_repo }}', + # filename='${TOKENIZER_FILE}' + # ) + # print(downloaded_path) + # ") + # if [ -f "$DOWNLOADED_TOKENIZER_FILE_PATH" ]; then + # echo "${TOKENIZER_FILE} downloaded successfully at: $DOWNLOADED_TOKENIZER_FILE_PATH" + # python -m extension.llm.tokenizer.tokenizer -t $DOWNLOADED_TOKENIZER_FILE_PATH -o ./${TOKENIZER_BIN_FILE} + # ls ./tokenizer.bin + # else + # echo "Failed to download ${TOKENIZER_FILE} from ${{ matrix.hf_model_repo }}." + # exit 1 + # fi + + # python -m extension.export_util.export_hf_model -hfm=${{ matrix.hf_model_repo }} -o ${ET_MODEL_NAME} + + # cmake-out/examples/models/llama/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is" + # echo "::endgroup::"
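
Usage sketch: the new -pt2e_quantize flag flows from the test-llama-runner-qnn-linux
matrix into .ci/scripts/test_llama.sh, which appends the 16a16w calibration arguments
to export_llama only when the value is qnn_16a16w. A minimal local reproduction of
what the CI job runs, assuming a Linux checkout of pytorch/executorch with the QNN
setup scripts working as in the workflow above:

    # Same setup sequence the workflow performs before invoking test_llama.sh
    PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
    PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
    PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake
    PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh

    # Export and run stories110M with 16-bit activations and 16-bit weights;
    # test_llama.sh adds --pt2e_quantize qnn_16a16w plus the wikitext calibration flags
    PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh \
      -model stories110M \
      -build_tool cmake \
      -dtype fp32 \
      -mode qnn \
      -pt2e_quantize qnn_16a16w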