Move phi_1_5 to canary as it does not install on the docker build (#2095

) Summary: The docker build runner will CPU OOM when installing phi_1_5. Moving the model to canary. https://github.com/pytorch/benchmark/actions/runs/7263599720 Pull Request resolved: #2095 Test Plan: Nightly docker build: https://github.com/pytorch/benchmark/actions/runs/7301484235 Reviewed By: aaronenyeshi Differential Revision: D52391171 Pulled By: xuzhao9 fbshipit-source-id: 4866098292cbca7459d632c5c05fe620e638077e
pytorch · Dec 23, 2023 · 79c236a · 79c236a
1 parent ac77055
commit 79c236a
Show file tree

Hide file tree

Showing 7 changed files with 6 additions and 1 deletion.
diff --git a/scripts/torchbench_install.sh b/scripts/torchbench_install.sh
@@ -16,4 +16,5 @@ conda activate "${CONDA_ENV}"
 parent_dir=$(dirname "$(readlink -f "$0")")/..
 cd ${parent_dir}
 
+python -c "import torch; print(torch.__version__); print(torch.version.git_version)"
 python install.py
diff --git a/test.py b/test.py
@@ -61,7 +61,7 @@ def example_fn(self):
             try:
                 _create_example_model_instance(task, device)
                 accuracy = task.get_model_attribute("accuracy")
-                assert accuracy == "pass" or accuracy == "eager_1st_run_OOM", f"Expected accuracy pass, get {accuracy}"
+                assert accuracy == "pass" or accuracy == "eager_1st_run_OOM" or accuracy == "eager_2nd_run_OOM", f"Expected accuracy pass, get {accuracy}"
                 task.del_model_instance()
             except NotImplementedError as e:
                 self.skipTest(f'Method `get_module()` on {device} is not implemented because "{e}", skipping...')

diff --git a/torchbenchmark/models/phi_1_5/__init__.py → ...nchmark/canary_models/phi_1_5/__init__.py b/torchbenchmark/models/phi_1_5/__init__.py → ...nchmark/canary_models/phi_1_5/__init__.py
diff --git a/torchbenchmark/models/phi_1_5/install.py → ...enchmark/canary_models/phi_1_5/install.py b/torchbenchmark/models/phi_1_5/install.py → ...enchmark/canary_models/phi_1_5/install.py
diff --git a/torchbenchmark/models/phi_1_5/metadata.yaml → ...hmark/canary_models/phi_1_5/metadata.yaml b/torchbenchmark/models/phi_1_5/metadata.yaml → ...hmark/canary_models/phi_1_5/metadata.yaml
diff --git a/...benchmark/models/phi_1_5/requirements.txt → ...rk/canary_models/phi_1_5/requirements.txt b/...benchmark/models/phi_1_5/requirements.txt → ...rk/canary_models/phi_1_5/requirements.txt
diff --git a/torchbenchmark/models/llama_v2_7b_16h/metadata.yaml b/torchbenchmark/models/llama_v2_7b_16h/metadata.yaml
@@ -7,5 +7,9 @@ eval_nograd: true
 not_implemented:
 - device: cpu
 - device: NVIDIA A10G
+# TODO: llama_v2_7b_16h accuracy test will cause "CUBLAS_STATUS_NOT_INITIALIZED" Error
+# https://github.com/pytorch/benchmark/issues/2064
+- device: NVIDIA A100-SXM4-40GB
+  test: eval
 train_benchmark: false
 train_deterministic: false