From d61b74a26d82942f25611d1b36f9a08ff16ede7f Mon Sep 17 00:00:00 2001
From: MaanavD
Date: Tue, 12 Dec 2023 08:58:14 -0800
Subject: [PATCH] Added distil_whisper. (#2074)

Summary:
Should work! Added distil whisper to torchbench. Local time is within normal CI requirements.

![image](https://github.com/pytorch/benchmark/assets/24942306/0f7e7cae-ae90-44e6-bbff-4eede4a4f730)

Pull Request resolved: https://github.com/pytorch/benchmark/pull/2074

Reviewed By: aaronenyeshi

Differential Revision: D52053930

Pulled By: xuzhao9

fbshipit-source-id: f0d20a821c5de916ca174b0033d9d79b5d6cafa0
---
 .../models/hf_distil_whisper/__init__.py      | 30 +++++++++++++++++++
 .../models/hf_distil_whisper/install.py       | 14 +++++++++
 .../models/hf_distil_whisper/metadata.yaml    |  8 +++++
 .../models/hf_distil_whisper/requirements.txt |  2 ++
 .../framework/huggingface/model_factory.py    |  2 ++
 5 files changed, 56 insertions(+)
 create mode 100644 torchbenchmark/models/hf_distil_whisper/__init__.py
 create mode 100644 torchbenchmark/models/hf_distil_whisper/install.py
 create mode 100644 torchbenchmark/models/hf_distil_whisper/metadata.yaml
 create mode 100644 torchbenchmark/models/hf_distil_whisper/requirements.txt

diff --git a/torchbenchmark/models/hf_distil_whisper/__init__.py b/torchbenchmark/models/hf_distil_whisper/__init__.py
new file mode 100644
index 0000000000..62be026bfd
--- /dev/null
+++ b/torchbenchmark/models/hf_distil_whisper/__init__.py
@@ -0,0 +1,30 @@
+from torchbenchmark.tasks import SPEECH
+from torchbenchmark.util.framework.huggingface.model_factory import HuggingFaceModel
+import torch
+
+class Model(HuggingFaceModel):
+    task = SPEECH.RECOGNITION
+    DEFAULT_TRAIN_BSIZE = 8
+    DEFAULT_EVAL_BSIZE = 1
+
+    def __init__(self, test, device, batch_size=None, extra_args=[]):
+        if test == "train":
+            raise NotImplementedError("Training is not implemented.")
+        super().__init__(name="hf_distil_whisper", test=test, device=device, batch_size=batch_size, extra_args=extra_args)
+        self.feature_size = 80
+        self.sequence_length = 3000
+        self.input_features = torch.randn(size=(self.batch_size, self.feature_size, self.sequence_length),device=self.device)
+        self.example_inputs = {"input_features": self.input_features.to(self.device), "input_ids" : self.input_features.to(self.device)}
+        self.model.to(self.device)
+
+    def train(self):
+        raise NotImplementedError("Training is not implemented.")
+
+    def eval(self):
+        self.model.eval()
+        with torch.no_grad():
+            self.model(self.example_inputs["input_ids"])
+
+    def enable_fp16(self):
+        self.model.half()
+        self.example_inputs = {"input_features": self.input_features.half().to(self.device), "input_ids" : self.input_features.half().to(self.device)}
diff --git a/torchbenchmark/models/hf_distil_whisper/install.py b/torchbenchmark/models/hf_distil_whisper/install.py
new file mode 100644
index 0000000000..c7855f3fef
--- /dev/null
+++ b/torchbenchmark/models/hf_distil_whisper/install.py
@@ -0,0 +1,14 @@
+
+import subprocess
+import sys
+import os
+from torchbenchmark.util.framework.huggingface.patch_hf import patch_transformers, cache_model
+
+def pip_install_requirements():
+    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', '-r', 'requirements.txt'])
+
+if __name__ == '__main__':
+    pip_install_requirements()
+    patch_transformers()
+    model_name = os.path.basename(os.path.dirname(os.path.abspath(__file__)))
+    cache_model(model_name)
diff --git a/torchbenchmark/models/hf_distil_whisper/metadata.yaml b/torchbenchmark/models/hf_distil_whisper/metadata.yaml
new file mode 100644
index 0000000000..b72884cca4
--- /dev/null
+++ b/torchbenchmark/models/hf_distil_whisper/metadata.yaml
@@ -0,0 +1,8 @@
+devices:
+  NVIDIA A100-SXM4-40GB:
+    eval_batch_size: 16
+eval_benchmark: false
+eval_deterministic: false
+eval_nograd: true
+train_benchmark: false
+train_deterministic: false
diff --git a/torchbenchmark/models/hf_distil_whisper/requirements.txt b/torchbenchmark/models/hf_distil_whisper/requirements.txt
new file mode 100644
index 0000000000..8e54d540dd
--- /dev/null
+++ b/torchbenchmark/models/hf_distil_whisper/requirements.txt
@@ -0,0 +1,2 @@
+sentencepiece
+datasets
diff --git a/torchbenchmark/util/framework/huggingface/model_factory.py b/torchbenchmark/util/framework/huggingface/model_factory.py
index 995d5bec91..4700cfe4d8 100644
--- a/torchbenchmark/util/framework/huggingface/model_factory.py
+++ b/torchbenchmark/util/framework/huggingface/model_factory.py
@@ -28,6 +28,7 @@
     # see https://huggingface.co/bert-large-cased
     'hf_Bert_large': (512, 512, 'BertConfig(hidden_size=1024, num_hidden_layers=24, num_attention_heads=16)', 'AutoModelForMaskedLM'),
     'hf_Whisper': (1024, 1024, 'WhisperConfig()', 'AutoModelForAudioClassification'),
+    'hf_distil_whisper': (1024, 1024, 'AutoConfig.from_pretrained("distil-whisper/distil-medium.en")', 'AutoModelForAudioClassification'),
     # default num_hidden_layers=32 but that OOMs, feel free to change this config to something more real
     'llama_v2_7b_16h' : (128,512, 'LlamaConfig(num_hidden_layers=16)', 'AutoModelForCausalLM'),
     'hf_MPT_7b_instruct': (512, 512, 'AutoConfig.from_pretrained("mosaicml/mpt-7b-instruct", trust_remote_code=True)', 'AutoModelForCausalLM'),
@@ -36,6 +37,7 @@
     'llama_v2_70b' : (512, 512, 'AutoConfig.from_pretrained("meta-llama/Llama-2-70b-hf")', 'AutoModelForMaskedLM'),
     'phi_1_5' : (512, 512, 'AutoConfig.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)', 'AutoModelForCausalLM'),
     'hf_Yi' : (512, 512, 'AutoConfig.from_pretrained("01-ai/Yi-6B", trust_remote_code=True)', 'AutoModelForCausalLM'),
+
 }
 
 cpu_input_slice = {
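
Note (not part of the patch): a minimal smoke-test sketch for the new benchmark entry, assuming a torchbench checkout where the model's dependencies have already been installed via its `install.py`. The direct `Model` instantiation below simply mirrors the constructor added in `__init__.py`; nothing here is introduced by this change.

```python
# Hypothetical smoke test -- not part of this patch. Assumes install.py has
# been run for hf_distil_whisper and that a CUDA device is available.
from torchbenchmark.models.hf_distil_whisper import Model

# Training is explicitly unsupported in this model, so only the "eval" test applies.
m = Model(test="eval", device="cuda", batch_size=1)
m.eval()  # one forward pass under torch.no_grad() over random (batch, 80, 3000) features
```

In CI the same path would normally be exercised through the benchmark harness (typically something like `python run.py hf_distil_whisper -d cuda -t eval`), which is presumably the local timing referred to in the summary.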