Merge branch 'eval-ci-timeout' of https://github.com/nod-ai/SHARK-Platform into eval-ci-timeout
archana-ramalingam committed Oct 23, 2024
2 parents 6b5274a + f92260b commit 41afc31
Showing 7 changed files with 1,161 additions and 0 deletions.
78 changes: 78 additions & 0 deletions .github/workflows/ci-llama.yaml
@@ -0,0 +1,78 @@
name: Llama Benchmarking Tests

on:
  workflow_dispatch:
  schedule:
    # Weekdays at 9:00 AM UTC = 2:00 AM PST.
    - cron: "0 9 * * 1-5"

concurrency:
  # A PR number if a pull request and otherwise the commit hash. This cancels
  # queued and in-progress runs for the same PR (presubmit) or commit
  # (postsubmit). The workflow name is prepended to avoid conflicts between
  # different workflows.
  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
  cancel-in-progress: true

jobs:
  test_llama:
    name: "Llama Benchmarking Tests"
    strategy:
      matrix:
        version: [3.11]
      fail-fast: false
    runs-on: llama-mi300
    defaults:
      run:
        shell: bash
    env:
      PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
      VENV_DIR: ${{ github.workspace }}/.venv
    steps:
      - name: Get Current Date
        id: date
        run: echo "::set-output name=date::$(date +'%Y-%m-%d')"

      - name: "Setting up Python"
        id: setup_python
        uses: actions/setup-python@v3
        with:
          python-version: ${{matrix.version}}

      - name: "Checkout Code"
        uses: actions/checkout@v3

      - name: Cache Pip Packages
        uses: actions/cache@v4
        id: cache-pip
        with:
          path: ${{ env.PIP_CACHE_DIR }}
          key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }}

      - name: Install pip deps
        run: |
          python -m pip install --no-compile --upgrade pip
          # Note: We install in three steps in order to satisfy requirements
          # from non default locations first. Installing the PyTorch CPU
          # wheels saves multiple minutes and a lot of bandwidth on runner setup.
          pip install --no-compile -r pytorch-cpu-requirements.txt
          pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
            -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
          pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/
          # Try with the latest nightly releases, not what iree-turbine pins.
          # We could also pin to a known working or stable version.
          # This should eventually stabilize. Do the best we can for now.
          pip install -f https://iree.dev/pip-release-links.html --upgrade \
            iree-compiler \
            iree-runtime \
            "numpy<2.0"
      - name: Run llama test
        run: pytest sharktank/tests/models/llama/benchmark_amdgpu_tests.py -v -s --longrun

      - name: Upload llama executable files
        uses: actions/upload-artifact@v4
        with:
          name: llama-files
          path: ${{ github.workspace }}/${{ steps.date.outputs.date }}
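
Note: the benchmark step above passes a custom --longrun flag to pytest. Custom options must be registered via pytest_addoption, presumably in the same conftest.py touched below (that part is not shown in this diff). As a rough, hypothetical sketch of how such a gate is commonly wired — names and behavior here are illustrative assumptions, not this repository's code:

    # Hypothetical conftest.py sketch: gate slow benchmark tests behind --longrun.
    # Tests opt in with @pytest.mark.longrun.
    import pytest


    def pytest_addoption(parser):
        parser.addoption(
            "--longrun",
            action="store_true",
            default=False,
            help="Also run tests marked as long-running benchmarks.",
        )


    def pytest_collection_modifyitems(config, items):
        if config.getoption("--longrun"):
            return  # Flag given: keep long-running benchmarks in the run.
        skip_long = pytest.mark.skip(reason="needs --longrun option to run")
        for item in items:
            if "longrun" in item.keywords:
                item.add_marker(skip_long)

Under that pattern, the workflow's `pytest ... -v -s --longrun` opts the benchmarks in, while a plain `pytest` invocation skips them.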
14 changes: 14 additions & 0 deletions sharktank/conftest.py
@@ -128,6 +128,13 @@ def pytest_addoption(parser):
        help="Llama3.1 8B & 405B model baseline perplexity scores json",
    )

    parser.addoption(
        "--iree-hip-target",
        action="store",
        default="gfx942",
        help="Specify the iree-hip target version (e.g., gfx942)",
    )


def set_fixture_from_cli_option(
    request: FixtureRequest,
@@ -168,6 +175,13 @@ def caching(request: FixtureRequest) -> Optional[bool]:
    return set_fixture_from_cli_option(request, "caching")


@pytest.fixture(scope="class")
def iree_hip_target_type(request: FixtureRequest) -> Optional[str]:
    return set_fixture_from_cli_option(
        request, "iree_hip_target", "iree_hip_target_type"
    )


@pytest.fixture(scope="class")
def get_model_path(request: FixtureRequest):
    model_path = {}
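
This hunk adds an --iree-hip-target pytest option (default gfx942) and a class-scoped iree_hip_target_type fixture exposing its value to tests. A minimal usage sketch, assuming the fixture simply returns the CLI value via set_fixture_from_cli_option; the test name and assertions are illustrative, not part of this commit:

    # Hypothetical test consuming the new fixture.
    def test_build_hip_compile_flag(iree_hip_target_type):
        # Defaults to "gfx942"; override with `pytest ... --iree-hip-target=gfx90a`.
        flag = f"--iree-hip-target={iree_hip_target_type}"
        assert flag.endswith(iree_hip_target_type)

Because the fixture is class-scoped, the option is resolved once per test class rather than once per test function.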
15 changes: 15 additions & 0 deletions sharktank/sharktank/examples/export_paged_llm_v1.py
@@ -16,6 +16,7 @@

# TODO: Should be using a base class with the protocol supported.
from ..models.llama.llama import LlamaModelConfig, PagedLlamaModelV1
from ..models.llama.sharding import shard_theta
from ..models.mixtral.mixtral import *
from ..models.grok.grok import *

@@ -51,6 +52,18 @@ def main():
        help="Enables strictness during export",
        action="store_true",
    )
    parser.add_argument(
        "--attention-kernel",
        type=str,
        default="decomposed",
        choices=["decomposed", "torch_sdpa"],
    )
    parser.add_argument(
        "--tensor-parallelism-size",
        type=int,
        default=1,
        help="How many devices are involved for tensor parallel sharding.",
    )

    args = cli.parse(parser)
    dataset_type = cli.get_input_data_files(args)
@@ -59,6 +72,8 @@

    hp = configs.LlamaHParams.from_gguf_props(dataset.properties)
    llama_config = LlamaModelConfig(hp)
    if args.tensor_parallelism_size > 1:
        dataset.root_theta = shard_theta(dataset.root_theta, llama_config)
    llama_config.use_hf = False
    llama_config.static_tables = False  # Rely on the compiler for hoisting tables.
    llama_config.kv_cache_type = "direct" if args.bs == [1] else "paged"
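
The export entry point gains two options: --attention-kernel (restricted to decomposed or torch_sdpa) and --tensor-parallelism-size, which gates a shard_theta call on the dataset's root theta when set above 1. A self-contained sketch of just that argparse surface (the standalone parser below is illustrative; the real script builds its parser through the project's cli helpers):

    # Standalone argparse sketch mirroring the two new options (illustrative only).
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--attention-kernel",
        type=str,
        default="decomposed",
        choices=["decomposed", "torch_sdpa"],
    )
    parser.add_argument(
        "--tensor-parallelism-size",
        type=int,
        default=1,
        help="How many devices are involved for tensor parallel sharding.",
    )

    args = parser.parse_args(["--tensor-parallelism-size", "8"])
    assert args.attention_kernel == "decomposed"  # falls back to the default
    assert args.tensor_parallelism_size > 1       # this condition gates shard_theta()

In the hunk above, crossing that threshold replaces dataset.root_theta with the sharded theta before the rest of the Llama config is populated.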
(Diffs for the remaining 4 of the 7 changed files are not rendered in this view.)
