From 79c236aed69907988b941e730965e6bfc9fd8c21 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Fri, 22 Dec 2023 19:36:48 -0800 Subject: [PATCH] Move phi_1_5 to canary as it does not install on the docker build (#2095) Summary: The docker build runner will CPU OOM when installing phi_1_5. Moving the model to canary. https://github.com/pytorch/benchmark/actions/runs/7263599720 Pull Request resolved: https://github.com/pytorch/benchmark/pull/2095 Test Plan: Nightly docker build: https://github.com/pytorch/benchmark/actions/runs/7301484235 Reviewed By: aaronenyeshi Differential Revision: D52391171 Pulled By: xuzhao9 fbshipit-source-id: 4866098292cbca7459d632c5c05fe620e638077e --- scripts/torchbench_install.sh | 1 + test.py | 2 +- torchbenchmark/{models => canary_models}/phi_1_5/__init__.py | 0 torchbenchmark/{models => canary_models}/phi_1_5/install.py | 0 .../{models => canary_models}/phi_1_5/metadata.yaml | 0 .../{models => canary_models}/phi_1_5/requirements.txt | 0 torchbenchmark/models/llama_v2_7b_16h/metadata.yaml | 4 ++++ 7 files changed, 6 insertions(+), 1 deletion(-) rename torchbenchmark/{models => canary_models}/phi_1_5/__init__.py (100%) rename torchbenchmark/{models => canary_models}/phi_1_5/install.py (100%) rename torchbenchmark/{models => canary_models}/phi_1_5/metadata.yaml (100%) rename torchbenchmark/{models => canary_models}/phi_1_5/requirements.txt (100%) diff --git a/scripts/torchbench_install.sh b/scripts/torchbench_install.sh index 0cc3612c22..5c1d5217a9 100644 --- a/scripts/torchbench_install.sh +++ b/scripts/torchbench_install.sh @@ -16,4 +16,5 @@ conda activate "${CONDA_ENV}" parent_dir=$(dirname "$(readlink -f "$0")")/.. cd ${parent_dir} +python -c "import torch; print(torch.__version__); print(torch.version.git_version)" python install.py diff --git a/test.py b/test.py index 439617bb69..2821b68744 100644 --- a/test.py +++ b/test.py @@ -61,7 +61,7 @@ def example_fn(self): try: _create_example_model_instance(task, device) accuracy = task.get_model_attribute("accuracy") - assert accuracy == "pass" or accuracy == "eager_1st_run_OOM", f"Expected accuracy pass, get {accuracy}" + assert accuracy == "pass" or accuracy == "eager_1st_run_OOM" or accuracy == "eager_2nd_run_OOM", f"Expected accuracy pass, get {accuracy}" task.del_model_instance() except NotImplementedError as e: self.skipTest(f'Method `get_module()` on {device} is not implemented because "{e}", skipping...') diff --git a/torchbenchmark/models/phi_1_5/__init__.py b/torchbenchmark/canary_models/phi_1_5/__init__.py similarity index 100% rename from torchbenchmark/models/phi_1_5/__init__.py rename to torchbenchmark/canary_models/phi_1_5/__init__.py diff --git a/torchbenchmark/models/phi_1_5/install.py b/torchbenchmark/canary_models/phi_1_5/install.py similarity index 100% rename from torchbenchmark/models/phi_1_5/install.py rename to torchbenchmark/canary_models/phi_1_5/install.py diff --git a/torchbenchmark/models/phi_1_5/metadata.yaml b/torchbenchmark/canary_models/phi_1_5/metadata.yaml similarity index 100% rename from torchbenchmark/models/phi_1_5/metadata.yaml rename to torchbenchmark/canary_models/phi_1_5/metadata.yaml diff --git a/torchbenchmark/models/phi_1_5/requirements.txt b/torchbenchmark/canary_models/phi_1_5/requirements.txt similarity index 100% rename from torchbenchmark/models/phi_1_5/requirements.txt rename to torchbenchmark/canary_models/phi_1_5/requirements.txt diff --git a/torchbenchmark/models/llama_v2_7b_16h/metadata.yaml b/torchbenchmark/models/llama_v2_7b_16h/metadata.yaml index c8072d38c0..ec4d5d8f76 100644 --- a/torchbenchmark/models/llama_v2_7b_16h/metadata.yaml +++ b/torchbenchmark/models/llama_v2_7b_16h/metadata.yaml @@ -7,5 +7,9 @@ eval_nograd: true not_implemented: - device: cpu - device: NVIDIA A10G +# TODO: llama_v2_7b_16h accuracy test will cause "CUBLAS_STATUS_NOT_INITIALIZED" Error +# https://github.com/pytorch/benchmark/issues/2064 +- device: NVIDIA A100-SXM4-40GB + test: eval train_benchmark: false train_deterministic: false