From 79c236aed69907988b941e730965e6bfc9fd8c21 Mon Sep 17 00:00:00 2001
From: Xu Zhao <i@xuzhao.net>
Date: Fri, 22 Dec 2023 19:36:48 -0800
Subject: [PATCH] Move phi_1_5 to canary as it does not install on the docker
 build (#2095)

Summary:
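The docker build runner runs out of CPU memory (OOM) when installing phi_1_5, so this change moves the model to canary_models.
It also prints the torch version before running install.py, accepts "eager_2nd_run_OOM" in the example accuracy check, and marks the llama_v2_7b_16h eval test as not implemented on NVIDIA A100-SXM4-40GB.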

https://github.com/pytorch/benchmark/actions/runs/7263599720

Pull Request resolved: https://github.com/pytorch/benchmark/pull/2095

Test Plan:
Nightly docker build:
https://github.com/pytorch/benchmark/actions/runs/7301484235

Reviewed By: aaronenyeshi

Differential Revision: D52391171

Pulled By: xuzhao9

fbshipit-source-id: 4866098292cbca7459d632c5c05fe620e638077e
---
 scripts/torchbench_install.sh                                 | 1 +
 test.py                                                       | 2 +-
 torchbenchmark/{models => canary_models}/phi_1_5/__init__.py  | 0
 torchbenchmark/{models => canary_models}/phi_1_5/install.py   | 0
 .../{models => canary_models}/phi_1_5/metadata.yaml           | 0
 .../{models => canary_models}/phi_1_5/requirements.txt        | 0
 torchbenchmark/models/llama_v2_7b_16h/metadata.yaml           | 4 ++++
 7 files changed, 6 insertions(+), 1 deletion(-)
 rename torchbenchmark/{models => canary_models}/phi_1_5/__init__.py (100%)
 rename torchbenchmark/{models => canary_models}/phi_1_5/install.py (100%)
 rename torchbenchmark/{models => canary_models}/phi_1_5/metadata.yaml (100%)
 rename torchbenchmark/{models => canary_models}/phi_1_5/requirements.txt (100%)

diff --git a/scripts/torchbench_install.sh b/scripts/torchbench_install.sh
index 0cc3612c22..5c1d5217a9 100644
--- a/scripts/torchbench_install.sh
+++ b/scripts/torchbench_install.sh
@@ -16,4 +16,5 @@ conda activate "${CONDA_ENV}"
 parent_dir=$(dirname "$(readlink -f "$0")")/..
 cd ${parent_dir}
 
+python -c "import torch; print(torch.__version__); print(torch.version.git_version)"
 python install.py
diff --git a/test.py b/test.py
index 439617bb69..2821b68744 100644
--- a/test.py
+++ b/test.py
@@ -61,7 +61,7 @@ def example_fn(self):
             try:
                 _create_example_model_instance(task, device)
                 accuracy = task.get_model_attribute("accuracy")
-                assert accuracy == "pass" or accuracy == "eager_1st_run_OOM", f"Expected accuracy pass, get {accuracy}"
+                assert accuracy == "pass" or accuracy == "eager_1st_run_OOM" or accuracy == "eager_2nd_run_OOM", f"Expected accuracy pass, get {accuracy}"
                 task.del_model_instance()
             except NotImplementedError as e:
                 self.skipTest(f'Method `get_module()` on {device} is not implemented because "{e}", skipping...')
diff --git a/torchbenchmark/models/phi_1_5/__init__.py b/torchbenchmark/canary_models/phi_1_5/__init__.py
similarity index 100%
rename from torchbenchmark/models/phi_1_5/__init__.py
rename to torchbenchmark/canary_models/phi_1_5/__init__.py
diff --git a/torchbenchmark/models/phi_1_5/install.py b/torchbenchmark/canary_models/phi_1_5/install.py
similarity index 100%
rename from torchbenchmark/models/phi_1_5/install.py
rename to torchbenchmark/canary_models/phi_1_5/install.py
diff --git a/torchbenchmark/models/phi_1_5/metadata.yaml b/torchbenchmark/canary_models/phi_1_5/metadata.yaml
similarity index 100%
rename from torchbenchmark/models/phi_1_5/metadata.yaml
rename to torchbenchmark/canary_models/phi_1_5/metadata.yaml
diff --git a/torchbenchmark/models/phi_1_5/requirements.txt b/torchbenchmark/canary_models/phi_1_5/requirements.txt
similarity index 100%
rename from torchbenchmark/models/phi_1_5/requirements.txt
rename to torchbenchmark/canary_models/phi_1_5/requirements.txt
diff --git a/torchbenchmark/models/llama_v2_7b_16h/metadata.yaml b/torchbenchmark/models/llama_v2_7b_16h/metadata.yaml
index c8072d38c0..ec4d5d8f76 100644
--- a/torchbenchmark/models/llama_v2_7b_16h/metadata.yaml
+++ b/torchbenchmark/models/llama_v2_7b_16h/metadata.yaml
@@ -7,5 +7,9 @@ eval_nograd: true
 not_implemented:
 - device: cpu
 - device: NVIDIA A10G
+# TODO: the llama_v2_7b_16h accuracy test triggers a "CUBLAS_STATUS_NOT_INITIALIZED" error
+# https://github.com/pytorch/benchmark/issues/2064
+- device: NVIDIA A100-SXM4-40GB
+  test: eval
 train_benchmark: false
 train_deterministic: false