From cf7e4109ae6c42a04b0ac443c243539cff45ee31 Mon Sep 17 00:00:00 2001
From: atila <atiorh@icloud.com>
Date: Tue, 19 Mar 2024 10:56:59 -0700
Subject: [PATCH 1/5] Generate README improvements

---
 scripts/generate_readme.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/scripts/generate_readme.py b/scripts/generate_readme.py
index 5fc1089..e40003d 100644
--- a/scripts/generate_readme.py
+++ b/scripts/generate_readme.py
@@ -15,15 +15,16 @@
 
 logger = get_logger(__name__)
 
-QOI_KEY = "QoI (%)"
+QOI_KEY = "QoI (↑)"
 FILE_SIZE_KEY = "File Size (MB)"
-WER_KEY = "WER"
+WER_KEY = "WER (↓)"
 COMMIT_KEY = "Commit Hash"
 
 HF_HUB_DATASET_CARD_YAML_PREFIX = """
 ---
 pretty_name: "WhisperKit ASR Evaluation Results"
 viewer: false
+library_name: whisperkit
 tags:
 - whisper
 - whisperkit
@@ -109,6 +110,8 @@
     "WhisperKit/openai_whisper-tiny.en": 66,              # MB
     "whisper.cpp/openai_whisper-large-v2-q5_0": 1080,     # MB
     "whisper.cpp/openai_whisper-large-v3-q5_0": 1080,     # MB
+    "whisper.cpp/openai_whisper-large-v3": 3100,          # MB
+    "whisper.cpp/openai_whisper-large-v2": 3100,          # MB
     "WhisperOpenAIAPI/openai_whisper-large-v2": 3100,     # MB
 }
 
@@ -172,8 +175,13 @@ def cli():
             # Fill optimized model version values
             for optimized in optimized_csv.split(","):
                 optimized_code_repo, optimized_model = parse_name(optimized)
-                optimized_eval, optimized_link = get_latest_eval(
-                    optimized_code_repo, dataset_name, optimized_model)
+                try:
+                    optimized_eval, optimized_link = get_latest_eval(
+                        optimized_code_repo, dataset_name, optimized_model)
+                except Exception as e:
+                    logger.warning(f"Could not fetch eval JSON for {optimized}: {e}")
+                    continue
+
                 optimized_key = f"[{optimized}]({optimized_link})"
 
                 # Verify fetched evals are comparable
@@ -270,10 +278,11 @@ def parse_name(result, default_code_repo="WhisperKit"):
     return code_repo, model
 
 
-def get_latest_eval(code_repo, dataset_name, model_version, local_dir="/tmp"):
+def get_latest_eval(code_repo, dataset_name, model_version, local_dir="external"):
     f""" Fetch the latest eval from hf.co/datasets/{EVALS_REPO_ID}
     for given code repo, model version and dataset
     """
+    os.makedirs(local_dir, exist_ok=True)
     repo_rel_dir = os.path.join(code_repo, model_version, dataset_name)
     _ = snapshot_download(
         repo_id=EVALS_REPO_ID,

From bab883350916c196c943542ea2ea0faae1541127 Mon Sep 17 00:00:00 2001
From: atila <atiorh@icloud.com>
Date: Tue, 19 Mar 2024 16:17:06 -0700
Subject: [PATCH 2/5] Adopt per-word averaging convention for consistency in
 WER

---
 scripts/generate_readme.py | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/scripts/generate_readme.py b/scripts/generate_readme.py
index e40003d..5b2b095 100644
--- a/scripts/generate_readme.py
+++ b/scripts/generate_readme.py
@@ -3,6 +3,7 @@
 # Copyright (C) 2024 Argmax, Inc. All Rights Reserved.
 #
 import argparse
+import evaluate
 import json
 import os
 from collections import defaultdict
@@ -13,6 +14,8 @@
 
 from whisperkit._constants import EVALS_REPO_ID, MODEL_REPO_ID
 
+wer_metric = evaluate.load("wer")
+
 logger = get_logger(__name__)
 
 QOI_KEY = "QoI (↑)"
@@ -42,12 +45,14 @@
 implementations and benchmark them using a consistent evaluation harness:
 
 Server-side:
-- `WhisperOpenAIAPI`: [OpenAI's Whisper API](https://platform.openai.com/docs/guides/speech-to-text) ($0.36 per hour of audio as of 02/29/24, 25MB file size limit per request)
+- `WhisperOpenAIAPI`: [OpenAI's Whisper API](https://platform.openai.com/docs/guides/speech-to-text)
+($0.36 per hour of audio as of 02/29/24, 25MB file size limit per request)
 
 On-device:
 - `WhisperKit`: Argmax's implementation [[Eval Harness]](https://github.com/argmaxinc/whisperkittools/blob/main/whisperkit/pipelines.py#L100) [[Repo]](https://github.com/argmaxinc/WhisperKit)
 - `whisper.cpp`: A C++ implementation form ggerganov [[Eval Harness]](https://github.com/argmaxinc/whisperkittools/blob/main/whisperkit/pipelines.py#L212) [[Repo]](https://github.com/ggerganov/whisper.cpp)
 - `WhisperMLX`: A Python implementation from Apple MLX [[Eval Harness]](https://github.com/argmaxinc/whisperkittools/blob/main/whisperkit/pipelines.py#L338) [[Repo]](https://github.com/ml-explore/mlx-examples/blob/main/whisper/whisper/transcribe.py)
+(All on-device implementations are available for free under MIT license as of 03/19/2024)
 
 `WhisperOpenAIAPI` sets the reference and we assume that it is using the equivalent of [openai/whisper-large-v2](https://huggingface.co/openai/whisper-large-v2)
 in float16 precision along with additional undisclosed optimizations from OpenAI. In all measurements, we care primarily about per-example no-regressions (quantified as `qoi` below)
@@ -115,6 +120,11 @@
     "WhisperOpenAIAPI/openai_whisper-large-v2": 3100,     # MB
 }
 
+DATASET_CAPTIONS = {
+    "librispeech": "Short-form Audio (<30s/clip) - 5 hours of English audiobook clips",
+    "earnings22": "Long-Form Audio (>1hr/clip) - 120 hours of earnings call recordings in English with various accents",
+}
+
 
 def cli():
     f""" Generates the README for hf.co/datasets/{EVALS_REPO_ID} which contains
@@ -144,7 +154,7 @@ def cli():
     readme = ""
 
     for dataset_name in args.dataset_names:
-        readme += f"\n## Dataset: `{dataset_name}`\n"
+        readme += f"\n## Dataset: `{dataset_name}`\n{DATASET_CAPTIONS[dataset_name]}\n"
         "-------------------------------------------------"
 
         # Quality-of-Inference (QoI) certifications for Whisper models
@@ -168,9 +178,7 @@ def cli():
                 REFERENCE_MODEL_FILE_SIZES[reference]
 
             # Sample average WER for reference model
-            results_dict[WER_KEY][reference_key] = round(
-                sum([sample["wer"] for sample in reference_eval["results"]]) /
-                len(reference_eval["results"]) * 100., 2)
+            results_dict[WER_KEY][reference_key] = compute_average_wer(reference_eval["results"])
 
             # Fill optimized model version values
             for optimized in optimized_csv.split(","):
@@ -192,9 +200,7 @@ def cli():
                     optimized_eval["results"]
                 )
                 results_dict[QOI_KEY][optimized_key] = qoi["no_regression"]
-                results_dict[WER_KEY][optimized_key] = round(
-                    sum([sample["wer"] for sample in optimized_eval["results"]]) /
-                    len(optimized_eval["results"]) * 100., 2)
+                results_dict[WER_KEY][optimized_key] = compute_average_wer(optimized_eval["results"])
 
                 # TODO(atiorh): Read remote git file size
                 if optimized in REFERENCE_MODEL_FILE_SIZES:
@@ -328,3 +334,10 @@ def verify_apples_to_apples(reference_eval, optimized_eval):
         logger.warning(
             "Reference and optimized evals weren't generated with the same "
             "whisperkittools commit")
+
+
+def compute_average_wer(results):
+    return round(wer_metric.compute(
+        references=[result["reference"] for result in results],
+        predictions=[result["prediction"] for result in results],
+    ) * 100., 2)

From ca624ce11231a6ebc2438638101e2e353627989b Mon Sep 17 00:00:00 2001
From: atila <atiorh@icloud.com>
Date: Tue, 19 Mar 2024 17:01:22 -0700
Subject: [PATCH 3/5] Add code commit info to README generator

---
 scripts/generate_readme.py | 40 +++++++++++++++++++++++++++++++-------
 1 file changed, 33 insertions(+), 7 deletions(-)

diff --git a/scripts/generate_readme.py b/scripts/generate_readme.py
index 5b2b095..3a4825a 100644
--- a/scripts/generate_readme.py
+++ b/scripts/generate_readme.py
@@ -21,7 +21,7 @@
 QOI_KEY = "QoI (↑)"
 FILE_SIZE_KEY = "File Size (MB)"
 WER_KEY = "WER (↓)"
-COMMIT_KEY = "Commit Hash"
+COMMIT_KEY = "Code Commit"
 
 HF_HUB_DATASET_CARD_YAML_PREFIX = """
 ---
@@ -46,13 +46,13 @@
 
 Server-side:
 - `WhisperOpenAIAPI`: [OpenAI's Whisper API](https://platform.openai.com/docs/guides/speech-to-text)
-($0.36 per hour of audio as of 02/29/24, 25MB file size limit per request)
+\n($0.36 per hour of audio as of 02/29/24, 25MB file size limit per request)
 
 On-device:
 - `WhisperKit`: Argmax's implementation [[Eval Harness]](https://github.com/argmaxinc/whisperkittools/blob/main/whisperkit/pipelines.py#L100) [[Repo]](https://github.com/argmaxinc/WhisperKit)
 - `whisper.cpp`: A C++ implementation form ggerganov [[Eval Harness]](https://github.com/argmaxinc/whisperkittools/blob/main/whisperkit/pipelines.py#L212) [[Repo]](https://github.com/ggerganov/whisper.cpp)
 - `WhisperMLX`: A Python implementation from Apple MLX [[Eval Harness]](https://github.com/argmaxinc/whisperkittools/blob/main/whisperkit/pipelines.py#L338) [[Repo]](https://github.com/ml-explore/mlx-examples/blob/main/whisper/whisper/transcribe.py)
-(All on-device implementations are available for free under MIT license as of 03/19/2024)
+\n(All on-device implementations are available for free under MIT license as of 03/19/2024)
 
 `WhisperOpenAIAPI` sets the reference and we assume that it is using the equivalent of [openai/whisper-large-v2](https://huggingface.co/openai/whisper-large-v2)
 in float16 precision along with additional undisclosed optimizations from OpenAI. In all measurements, we care primarily about per-example no-regressions (quantified as `qoi` below)
@@ -164,13 +164,17 @@ def cli():
             results_dict[WER_KEY] = defaultdict(float)
             results_dict[QOI_KEY] = defaultdict(float)
             results_dict[FILE_SIZE_KEY] = defaultdict(int)
+            results_dict[COMMIT_KEY] = defaultdict(str)
 
             # Fetch the reference eval results
             reference_code_repo, reference_model = parse_name(reference)
 
             reference_eval, reference_link = get_latest_eval(
                 reference_code_repo, dataset_name, reference_model)
-            reference_key = f"[{reference}]({reference_link})"
+            if reference_code_repo == "WhisperKit":
+                reference_key = f"[{reference}]({get_model_link(reference_model)})"
+            else:
+                reference_key = reference
 
             # Fill reference model version values
             results_dict[QOI_KEY][reference_key] = 100.  # By definition of QoI
@@ -178,7 +182,15 @@ def cli():
                 REFERENCE_MODEL_FILE_SIZES[reference]
 
             # Sample average WER for reference model
-            results_dict[WER_KEY][reference_key] = compute_average_wer(reference_eval["results"])
+            results_dict[WER_KEY][reference_key] = \
+                f"[{compute_average_wer(reference_eval['results'])}]({reference_link})"
+
+            # Add commit hash for reference results
+            commit_hash = reference_eval["metadata"]["inference_context"]["code_spec"]["code_commit_hash"]
+            if commit_hash is not None:
+                results_dict[COMMIT_KEY][reference_key] = commit_hash[:7]
+            else:
+                results_dict[COMMIT_KEY][reference_key] = "N/A"
 
             # Fill optimized model version values
             for optimized in optimized_csv.split(","):
@@ -190,7 +202,10 @@ def cli():
                     logger.warning(f"Could not fetch eval JSON for {optimized}: {e}")
                     continue
 
-                optimized_key = f"[{optimized}]({optimized_link})"
+                if optimized_code_repo == "WhisperKit":
+                    optimized_key = f"[{optimized}]({get_model_link(optimized_model)})"
+                else:
+                    optimized_key = optimized
 
                 # Verify fetched evals are comparable
                 logger.info(f"Compare {optimized_link} vs {reference_link}")
@@ -200,7 +215,14 @@ def cli():
                     optimized_eval["results"]
                 )
                 results_dict[QOI_KEY][optimized_key] = qoi["no_regression"]
-                results_dict[WER_KEY][optimized_key] = compute_average_wer(optimized_eval["results"])
+                results_dict[WER_KEY][optimized_key] = f"[{compute_average_wer(optimized_eval['results'])}]({optimized_link})"
+
+                # Add commit hash for reference results
+                commit_hash = optimized_eval["metadata"]["inference_context"]["code_spec"]["code_commit_hash"]
+                if commit_hash is not None:
+                    results_dict[COMMIT_KEY][optimized_key] = commit_hash[:7]
+                else:
+                    results_dict[COMMIT_KEY][optimized_key] = "N/A"
 
                 # TODO(atiorh): Read remote git file size
                 if optimized in REFERENCE_MODEL_FILE_SIZES:
@@ -341,3 +363,7 @@ def compute_average_wer(results):
         references=[result["reference"] for result in results],
         predictions=[result["prediction"] for result in results],
     ) * 100., 2)
+
+
+def get_model_link(model_version):
+    return f"https://hf.co/{MODEL_REPO_ID}/tree/main/{model_version}"

From 3567284d0f3f68b2600be4f40d7ed26ab333bc85 Mon Sep 17 00:00:00 2001
From: atila <atiorh@icloud.com>
Date: Thu, 21 Mar 2024 19:37:43 -0700
Subject: [PATCH 4/5] Optionally disable token timestamps for models without
 alignment_heads

---
 scripts/generate_model.py       | 10 ++++++++++
 scripts/generate_readme.py      | 17 ++++++++++++-----
 tests/test_evaluate.py          |  3 ++-
 tests/test_text_decoder.py      | 16 ++++++++++++++--
 tests/test_word_timestamps.py   |  2 +-
 whisperkit/evaluate/evaluate.py |  4 ++--
 6 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/scripts/generate_model.py b/scripts/generate_model.py
index bcd602a..9b171c6 100644
--- a/scripts/generate_model.py
+++ b/scripts/generate_model.py
@@ -13,6 +13,7 @@
 
 from argmaxtools import _sdpa
 from argmaxtools.utils import get_logger
+from argmaxtools import test_utils
 from huggingface_hub import HfApi, hf_hub_download
 
 from tests import test_audio_encoder, test_text_decoder
@@ -20,6 +21,7 @@
 
 logger = get_logger(__name__)
 
+test_utils.TEST_MIN_SPEEDUP_VS_CPU = 0.3
 
 def cli():
     f""" Generates Whisper models and publishes them to hf.co/{MODEL_REPO_ID} """
@@ -93,6 +95,14 @@ def cli():
 
     logger.info(f"Generating {args.model_version} files")
 
+    # FIXME(atiorh): Remove this once distil-whisper-* models are updated
+    args.disable_token_timestamps = False
+    if "distil-whisper" in args.model_version:
+        logger.info(
+            "Disabling token-level timestamps due to missing alignment_heads in distil-whisper-* models"
+        )
+        args.disable_token_timestamps = True
+
     # Generate WhisperTextDecoder
     args.test_seq_len = args.text_decoder_max_sequence_length
     args.sdpa_implementation = args.text_decoder_sdpa_implementation
diff --git a/scripts/generate_readme.py b/scripts/generate_readme.py
index 3a4825a..c4128f1 100644
--- a/scripts/generate_readme.py
+++ b/scripts/generate_readme.py
@@ -81,11 +81,11 @@
 - [earnings22](https://huggingface.co/datasets/argmaxinc/earnings22): ~120 hours of English audio clips from earnings calls with various accents, tests long-form transcription quality
 
 ### Reproducing Results
-Results in this page are generated by our cluster of Apple Silicon Macs. We use them as self-hosted runners on
-Github Actions as our CI infrastructure. Due to [security concerns](https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#hardening-for-self-hosted-runners),
+Benchmark results on this page were automatically generated by [whisperkittools](https://github.com/argmaxinc/whisperkittools). We use our cluster of Apple Silicon Macs as self-hosted runners on
+Github Actions as our CI infrastructure to periodically recompute these benchmarks. Due to [security concerns](https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#hardening-for-self-hosted-runners),
 we are unable to open up the cluster to the public. However, any Apple Silicon Mac (even with 8GB RAM) can be used to
 run identical [evaluation jobs](#evaluation) locally. For reference, our M2 Ultra devices complete a `librispeech` + `openai/whisper-large-v3`
-evaluation in under 1 hour regardless of the Whisper implementation. Older Apple Silicon Macs should take less than 1 day to complete the same evaluation.
+evaluation in under 1 hour regardless of the Whisper implementation. Oldest Apple Silicon Macs should take less than 1 day to complete the same evaluation.
 
 """  # noqa: E501
 
@@ -125,6 +125,11 @@
     "earnings22": "Long-Form Audio (>1hr/clip) - 120 hours of earnings call recordings in English with various accents",
 }
 
+REPO_URLS = {
+    "whisper.cpp": "https://github.com/ggerganov/whisper.cpp",
+    "WhisperKit": "https://github.com/argmaxinc/WhisperKit"
+}
+
 
 def cli():
     f""" Generates the README for hf.co/datasets/{EVALS_REPO_ID} which contains
@@ -188,7 +193,8 @@ def cli():
             # Add commit hash for reference results
             commit_hash = reference_eval["metadata"]["inference_context"]["code_spec"]["code_commit_hash"]
             if commit_hash is not None:
-                results_dict[COMMIT_KEY][reference_key] = commit_hash[:7]
+                results_dict[COMMIT_KEY][reference_key] = \
+                    f"[{commit_hash[:7]}]({REPO_URLS[reference_code_repo]}/commit/{commit_hash[:7]})"
             else:
                 results_dict[COMMIT_KEY][reference_key] = "N/A"
 
@@ -220,7 +226,8 @@ def cli():
                 # Add commit hash for reference results
                 commit_hash = optimized_eval["metadata"]["inference_context"]["code_spec"]["code_commit_hash"]
                 if commit_hash is not None:
-                    results_dict[COMMIT_KEY][optimized_key] = commit_hash[:7]
+                    results_dict[COMMIT_KEY][optimized_key] = \
+                        f"[{commit_hash[:7]}]({REPO_URLS[optimized_code_repo]}/commit/{commit_hash[:7]})"
                 else:
                     results_dict[COMMIT_KEY][optimized_key] = "N/A"
 
diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py
index 86deb32..45aa095 100644
--- a/tests/test_evaluate.py
+++ b/tests/test_evaluate.py
@@ -14,7 +14,7 @@
 from argmaxtools.utils import get_logger
 from huggingface_hub import HfApi
 
-from whisperkit._constants import EVALS_REPO_ID
+from whisperkit._constants import EVALS_REPO_ID, MODEL_REPO_ID
 from whisperkit.evaluate.datasets import EVAL_DATASETS
 from whisperkit.evaluate.evaluate import evaluate
 from whisperkit.pipelines import get_pipeline_cls
@@ -75,6 +75,7 @@ def setUpClass(cls) -> None:
                 "model_version": TEST_MODEL_VERSION,
                 "whisperkittools_commit_hash": wkt_commit_hash,
                 "inference_context": cls.inference_context.spec_dict(),
+                "model_repo_id": MODEL_REPO_ID
             }
         }
 
diff --git a/tests/test_text_decoder.py b/tests/test_text_decoder.py
index f682b92..6ec3ac8 100644
--- a/tests/test_text_decoder.py
+++ b/tests/test_text_decoder.py
@@ -39,6 +39,7 @@
     "logits", "key_cache_updates", "value_cache_updates", "alignment_heads_weights"]
 TEST_CONTEXT_PREFILL_OUTPUT_NAMES = ["key_cache_prefill", "value_cache_prefill"]
 TEST_DEC_KV_SEQ_LEN = None
+TEST_TOKEN_TIMESTAMPS = True
 
 
 class TestWhisperTextDecoder(argmaxtools_test_utils.CoreMLTestsMixin, unittest.TestCase):
@@ -48,6 +49,9 @@ def setUpClass(cls):
         cls.test_cache_dir = TEST_CACHE_DIR
         cls.model_name = "TextDecoder"
 
+        if not TEST_TOKEN_TIMESTAMPS:
+            cls.test_output_names.pop(cls.test_output_names.index("alignment_heads_weights"))
+
         # Original model
         orig_torch_model = (
             modeling_whisper.WhisperForConditionalGeneration.from_pretrained(
@@ -68,7 +72,9 @@ def setUpClass(cls):
             cls.test_torch_model.to(TEST_DEV).to(TEST_TORCH_DTYPE).eval()
         )
         cls.gen_cfg = orig_torch_model.generation_config
-        cls.test_torch_model.configure_for_token_timestamps(cls.gen_cfg)
+
+        if TEST_TOKEN_TIMESTAMPS:
+            cls.test_torch_model.configure_for_token_timestamps(cls.gen_cfg)
 
         # Elaboration: I/O and architecture config
         cfg = cls.orig_torch_model.config
@@ -347,6 +353,9 @@ class TestWhisperTextDecoderPalettizer(
     def setUpClass(cls):
         cls.model_name = "TextDecoder"
         cls.output_names = TEST_OUTPUT_NAMES
+        if not TEST_TOKEN_TIMESTAMPS:  # filter by name: list.pop() takes an index
+            cls.output_names = [n for n in cls.output_names if n != "alignment_heads_weights"]
+
         cls.palettizer = palettize.WhisperTextDecoderPalettizer(
             model_version=TEST_WHISPER_VERSION,
             cache_dir=os.path.join(
@@ -370,9 +379,11 @@ def place(t):
 
 
 def main(args):
-    global TEST_WHISPER_VERSION, TEST_CACHE_DIR, TEST_DEC_KV_SEQ_LEN
+    global TEST_WHISPER_VERSION, TEST_CACHE_DIR, TEST_DEC_KV_SEQ_LEN, TEST_TOKEN_TIMESTAMPS
 
     TEST_WHISPER_VERSION = args.test_model_version
+    TEST_TOKEN_TIMESTAMPS = not args.disable_token_timestamps
+
     logger.info(f"Testing {TEST_WHISPER_VERSION}")
 
     text_decoder.SDPA_IMPL = getattr(_sdpa, args.sdpa_implementation)
@@ -422,6 +433,7 @@ def main(args):
     parser.add_argument("--palettizer-tests", action="store_true")
     parser.add_argument("--disable-default-tests", action="store_true")
     parser.add_argument("--context-prefill-tests", action="store_true")
+    parser.add_argument("--disable-token-timestamps", action="store_true")
     parser.add_argument(
         "--sdpa-implementation", default="Cat", choices=tuple(_sdpa.__all__)
     )
diff --git a/tests/test_word_timestamps.py b/tests/test_word_timestamps.py
index 8b3218d..4284a89 100644
--- a/tests/test_word_timestamps.py
+++ b/tests/test_word_timestamps.py
@@ -1,6 +1,6 @@
 #
 # For licensing see accompanying LICENSE.md file.
-# Copyright (C) 2023 Argmax, Inc. All Rights Reserved.
+# Copyright (C) 2024 Argmax, Inc. All Rights Reserved.
 #
 
 import json
diff --git a/whisperkit/evaluate/evaluate.py b/whisperkit/evaluate/evaluate.py
index 09d9c81..f3f3ea3 100644
--- a/whisperkit/evaluate/evaluate.py
+++ b/whisperkit/evaluate/evaluate.py
@@ -90,8 +90,8 @@ def evaluate(whisper_pipeline: Union[pipelines.WhisperPipeline, pipelines.Whispe
             int(bool(_num_fallbacks)) for _num_fallbacks in num_fallbacks
         ]) / len(num_fallbacks)
         fallback_str = "-------------------------------------------------------"
-        fallback_str += f"\nTotal fallbacks: {total_fallbacks}"
-        fallback_str += "\nSamples with fallback: "
+        fallback_str += f"\n    Total fallbacks: {total_fallbacks}"
+        fallback_str += "\n    Samples with fallback: "
         fallback_str += f"{samples_with_fallback_percent * 100.:.3g}%"
 
     # Failed example bookkeeping

From b5c1e50eb611240025c158784ec34bbfce26de09 Mon Sep 17 00:00:00 2001
From: atila <atiorh@icloud.com>
Date: Fri, 22 Mar 2024 17:37:24 -0700
Subject: [PATCH 5/5] Update README and generate_readme.py

---
 README.md                  | 93 +++++++++++++++++++++-----------------
 scripts/generate_model.py  |  1 +
 scripts/generate_readme.py | 69 ++++++++++++++++++----------
 3 files changed, 97 insertions(+), 66 deletions(-)

diff --git a/README.md b/README.md
index fd6e88d..d6ec120 100644
--- a/README.md
+++ b/README.md
@@ -117,25 +117,43 @@ Note that the app is in beta and we are actively seeking feedback to improve it
 
 ## <a name="qoi"></a> WhisperKit Evaluation Results
 
-### Dataset: `librispeech`
-
-|                                                                                                                                                                            |   WER |   QoI (%) |   File Size (MB) |
-|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------:|----------:|-----------------:|
-| [WhisperOpenAIAPI/openai_whisper-large-v2](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperOpenAIAPI/openai_whisper-large-v2/librispeech)               |  2.85 |     100   |             3100 |
-| [WhisperKit/openai_whisper-large-v3](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-large-v3/librispeech)                           |  2.48 |      95.2 |             3100 |
-| [WhisperKit/openai_whisper-large-v3_turbo](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-large-v3_turbo/librispeech)               |  2.44 |      95.4 |             3100 |
-| [WhisperKit/openai_whisper-large-v3_turbo_1018MB](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-large-v3_turbo_1018MB/librispeech) |  2.49 |      94.8 |             1018 |
-| [WhisperKit/openai_whisper-large-v2](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-large-v2/librispeech)                           |  3.28 |      96.6 |             3100 |
-| [WhisperKit/openai_whisper-large-v2_1050MB](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-large-v2_1050MB/librispeech)             |  3.32 |      95   |             1050 |
-| [WhisperKit/openai_whisper-large-v2_turbo](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-large-v2_turbo/librispeech)               |  3.24 |      96.6 |             3100 |
-| [WhisperKit/openai_whisper-large-v2_turbo_1022MB](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-large-v2_turbo_1022MB/librispeech) |  3.33 |      94.9 |             1022 |
-| [WhisperKit/openai_whisper-small.en](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-small.en/librispeech)                           |  4.31 |      85.9 |              483 |
-| [WhisperKit/openai_whisper-small](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-small/librispeech)                                 |  3.98 |      82.9 |              483 |
-| [WhisperKit/openai_whisper-base.en](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-base.en/librispeech)                             |  4.76 |      75.5 |              145 |
-| [WhisperKit/openai_whisper-base](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-base/librispeech)                                   |  6.11 |      67.1 |              145 |
-| [WhisperKit/openai_whisper-tiny.en](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-tiny.en/librispeech)                             |  6.72 |      64   |               66 |
-| [WhisperKit/openai_whisper-tiny](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-tiny/librispeech)                                   |  8.94 |      52.4 |               66 |
-
+## Dataset: `librispeech`
+Short-form Audio (<30s/clip) - 5 hours of English audiobook clips
+
+|                                                                                                                               | WER (↓)                                                                                                                               |   QoI (↑) |   File Size (MB) | Code Commit                                                    |
+|:------------------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------|----------:|-----------------:|:---------------------------------------------------------------|
+| large-v2 (WhisperOpenAIAPI)                                                                                                   | [2.35](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperOpenAIAPI/openai_whisper-large-v2/librispeech)              |     100   |             3100 | N/A                                                            |
+| [large-v2](https://hf.co/argmaxinc/whisperkit-coreml/tree/main/openai_whisper-large-v2)                                       | [2.77](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-large-v2/librispeech)                    |      96.6 |             3100 | [Link](https://github.com/argmaxinc/WhisperKit/commit/2846fd9) |
+| [large-v2_949MB](https://hf.co/argmaxinc/whisperkit-coreml/tree/main/openai_whisper-large-v2_949MB)                           | [2.4](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-large-v2_949MB/librispeech)               |      94.6 |              949 | [Link](https://github.com/argmaxinc/WhisperKit/commit/eca4a2e) |
+| [large-v2_turbo](https://hf.co/argmaxinc/whisperkit-coreml/tree/main/openai_whisper-large-v2_turbo)                           | [2.76](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-large-v2_turbo/librispeech)              |      96.6 |             3100 | [Link](https://github.com/argmaxinc/WhisperKit/commit/2846fd9) |
+| [large-v2_turbo_955MB](https://hf.co/argmaxinc/whisperkit-coreml/tree/main/openai_whisper-large-v2_turbo_955MB)               | [2.41](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-large-v2_turbo_955MB/librispeech)        |      94.6 |              955 | [Link](https://github.com/argmaxinc/WhisperKit/commit/cf75348) |
+| [large-v3](https://hf.co/argmaxinc/whisperkit-coreml/tree/main/openai_whisper-large-v3)                                       | [2.04](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-large-v3/librispeech)                    |      95.2 |             3100 | [Link](https://github.com/argmaxinc/WhisperKit/commit/2846fd9) |
+| [large-v3_947MB](https://hf.co/argmaxinc/whisperkit-coreml/tree/main/openai_whisper-large-v3_947MB)                           | [2.46](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-large-v3_947MB/librispeech)              |      93.9 |              947 | [Link](https://github.com/argmaxinc/WhisperKit/commit/eca4a2e) |
+| [large-v3_turbo](https://hf.co/argmaxinc/whisperkit-coreml/tree/main/openai_whisper-large-v3_turbo)                           | [2.03](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-large-v3_turbo/librispeech)              |      95.4 |             3100 | [Link](https://github.com/argmaxinc/WhisperKit/commit/2846fd9) |
+| [large-v3_turbo_954MB](https://hf.co/argmaxinc/whisperkit-coreml/tree/main/openai_whisper-large-v3_turbo_954MB)               | [2.47](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-large-v3_turbo_954MB/librispeech)        |      93.9 |              954 | [Link](https://github.com/argmaxinc/WhisperKit/commit/cf75348) |
+| [distil-large-v3](https://hf.co/argmaxinc/whisperkit-coreml/tree/main/distil-whisper_distil-large-v3)                         | [2.47](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/distil-whisper_distil-large-v3/librispeech)             |      89.7 |             1510 | [Link](https://github.com/argmaxinc/WhisperKit/commit/cf75348) |
+| [distil-large-v3_594MB](https://hf.co/argmaxinc/whisperkit-coreml/tree/main/distil-whisper_distil-large-v3_594MB)             | [2.96](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/distil-whisper_distil-large-v3_594MB/librispeech)       |      85.4 |              594 | [Link](https://github.com/argmaxinc/WhisperKit/commit/508240f) |
+| [distil-large-v3_turbo](https://hf.co/argmaxinc/whisperkit-coreml/tree/main/distil-whisper_distil-large-v3_turbo)             | [2.47](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/distil-whisper_distil-large-v3_turbo/librispeech)       |      89.7 |             1510 | [Link](https://github.com/argmaxinc/WhisperKit/commit/508240f) |
+| [distil-large-v3_turbo_600MB](https://hf.co/argmaxinc/whisperkit-coreml/tree/main/distil-whisper_distil-large-v3_turbo_600MB) | [2.78](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/distil-whisper_distil-large-v3_turbo_600MB/librispeech) |      86.2 |              600 | [Link](https://github.com/argmaxinc/WhisperKit/commit/ae1cf96) |
+| [small.en](https://hf.co/argmaxinc/whisperkit-coreml/tree/main/openai_whisper-small.en)                                       | [3.12](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-small.en/librispeech)                    |      85.8 |              483 | [Link](https://github.com/argmaxinc/WhisperKit/commit/228630c) |
+| [small](https://hf.co/argmaxinc/whisperkit-coreml/tree/main/openai_whisper-small)                                             | [3.45](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-small/librispeech)                       |      83   |              483 | [Link](https://github.com/argmaxinc/WhisperKit/commit/228630c) |
+| [base.en](https://hf.co/argmaxinc/whisperkit-coreml/tree/main/openai_whisper-base.en)                                         | [3.98](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-base.en/librispeech)                     |      75.3 |              145 | [Link](https://github.com/argmaxinc/WhisperKit/commit/228630c) |
+| [base](https://hf.co/argmaxinc/whisperkit-coreml/tree/main/openai_whisper-base)                                               | [4.97](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-base/librispeech)                        |      67.2 |              145 | [Link](https://github.com/argmaxinc/WhisperKit/commit/228630c) |
+| [tiny.en](https://hf.co/argmaxinc/whisperkit-coreml/tree/main/openai_whisper-tiny.en)                                         | [5.61](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-tiny.en/librispeech)                     |      63.9 |               66 | [Link](https://github.com/argmaxinc/WhisperKit/commit/228630c) |
+| [tiny](https://hf.co/argmaxinc/whisperkit-coreml/tree/main/openai_whisper-tiny)                                               | [7.47](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-tiny/librispeech)                        |      52.5 |               66 | [Link](https://github.com/argmaxinc/WhisperKit/commit/228630c) |
+
+## Dataset: `earnings22`
+Long-Form Audio (>1hr/clip) - 120 hours of earnings call recordings in English with various accents
+
+|                                                                                         | WER (↓)                                                                                                                  |   QoI (↑) |   File Size (MB) | Code Commit                                                    |
+|:----------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------|----------:|-----------------:|:---------------------------------------------------------------|
+| large-v2 (WhisperOpenAIAPI)                                                             | [16.27](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperOpenAIAPI/openai_whisper-large-v2/earnings22) |     100   |             3100 | N/A                                                            |
+| [large-v3](https://hf.co/argmaxinc/whisperkit-coreml/tree/main/openai_whisper-large-v3) | [15.17](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-large-v3/earnings22)       |      58.5 |             3100 | [Link](https://github.com/argmaxinc/WhisperKit/commit/2846fd9) |
+| [base.en](https://hf.co/argmaxinc/whisperkit-coreml/tree/main/openai_whisper-base.en)   | [23.49](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-base.en/earnings22)        |       6.5 |              145 | [Link](https://github.com/argmaxinc/WhisperKit/commit/dda6571) |
+| [tiny.en](https://hf.co/argmaxinc/whisperkit-coreml/tree/main/openai_whisper-tiny.en)   | [28.64](https://hf.co/datasets/argmaxinc/whisperkit-evals/tree/main/WhisperKit/openai_whisper-tiny.en/earnings22)        |       5.7 |               66 | [Link](https://github.com/argmaxinc/WhisperKit/commit/dda6571) |
+
+
+### Explanation
 
 We believe that rigorously measuring the quality of inference is necessary for developers and
 enterprises to make informed decisions when opting to use optimized or compressed variants of
@@ -143,13 +161,17 @@ any machine learning model in production. To contextualize `WhisperKit`, we take
 implementations and benchmark them using a consistent evaluation harness:
 
 Server-side:
-- `WhisperOpenAIAPI`: [OpenAI's Whisper API](https://platform.openai.com/docs/guides/speech-to-text) ($0.36 per hour of audio as of 02/29/24, 25MB file size limit per request)
+- `WhisperOpenAIAPI`: [OpenAI's Whisper API](https://platform.openai.com/docs/guides/speech-to-text)
+
+($0.36 per hour of audio as of 02/29/24, 25MB file size limit per request)
 
 On-device:
 - `WhisperKit`: Argmax's implementation [[Eval Harness]](https://github.com/argmaxinc/whisperkittools/blob/main/whisperkit/pipelines.py#L100) [[Repo]](https://github.com/argmaxinc/WhisperKit)
 - `whisper.cpp`: A C++ implementation form ggerganov [[Eval Harness]](https://github.com/argmaxinc/whisperkittools/blob/main/whisperkit/pipelines.py#L212) [[Repo]](https://github.com/ggerganov/whisper.cpp)
 - `WhisperMLX`: A Python implementation from Apple MLX [[Eval Harness]](https://github.com/argmaxinc/whisperkittools/blob/main/whisperkit/pipelines.py#L338) [[Repo]](https://github.com/ml-explore/mlx-examples/blob/main/whisper/whisper/transcribe.py)
 
+(All on-device implementations are available for free under the MIT license as of 03/19/2024)
+
 `WhisperOpenAIAPI` sets the reference and we assume that it is using the equivalent of [openai/whisper-large-v2](https://huggingface.co/openai/whisper-large-v2)
 in float16 precision along with additional undisclosed optimizations from OpenAI. In all measurements, we care primarily about per-example no-regressions (quantified as `qoi` below)
 which is a stricter metric compared to dataset average [Word Error RATE (WER)](https://en.wikipedia.org/wiki/Word_error_rate). A 100% `qoi` preserves perfect backwards-compatibility on the test distribution and avoids "perceived regressions", the phenomenon
@@ -172,20 +194,25 @@ where the production behavior is established by the reference results and the go
 We anticipate developers that use Whisper (or similar models) in production to have their own Quality Assurance test sets and [whisperkittools](https://github.com/argmaxinc/whisperkittools) offers
 the tooling necessary to run the same measurements on such custom test sets, please see the [Model Evaluation on Custom Dataset]((https://github.com/argmaxinc/whisperkittools)) for details.
 
-#### Datasets
+### Why are there so many Whisper versions?
+WhisperKit is an SDK for building speech-to-text features in apps across a wide range of Apple devices. We are working towards abstracting away the model versioning from the developer so WhisperKit
+"just works" by deploying the highest-quality model version that a particular device can execute. In the interim, we leave the choice to the developer by providing quality and size trade-offs.
+
+
+### Datasets
 - [librispeech](https://huggingface.co/datasets/argmaxinc/librispeech): ~5 hours of short English audio clips, tests short-form transcription quality
 - [earnings22](https://huggingface.co/datasets/argmaxinc/earnings22): ~120 hours of English audio clips from earnings calls with various accents, tests long-form transcription quality
 
 ### Reproducing Results
-Results in this page are generated by our cluster of Apple Silicon Macs. We use them as self-hosted runners on
-Github Actions as our CI infrastructure. Due to [security concerns](https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#hardening-for-self-hosted-runners),
+Benchmark results on this page were automatically generated by [whisperkittools](https://github.com/argmaxinc/whisperkittools) using our cluster of Apple Silicon Macs as self-hosted runners on
+GitHub Actions. We periodically recompute these benchmarks as part of our CI pipeline. Due to [security concerns](https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#hardening-for-self-hosted-runners),
 we are unable to open up the cluster to the public. However, any Apple Silicon Mac (even with 8GB RAM) can be used to
 run identical [evaluation jobs](#evaluation) locally. For reference, our M2 Ultra devices complete a `librispeech` + `openai/whisper-large-v3`
-evaluation in under 1 hour regardless of the Whisper implementation. Older Apple Silicon Macs should take less than 1 day to complete the same evaluation.
+evaluation in under 1 hour regardless of the Whisper implementation. Even the oldest Apple Silicon Macs should take less than 1 day to complete the same evaluation.
 
 
 
-#### Glossary
+### Glossary
 
 - `_turbo`: Indicates the presence of additional optimizations (not compression) to unlock streaming transcription
 as described in our [Blog Post](https://www.takeargmax.com/blog/whisperkit).
@@ -194,22 +221,6 @@ as described in our [Blog Post](https://www.takeargmax.com/blog/whisperkit).
 `_AudioEncoder-5.8bits_TextDecoder-6.1bits_QLoRA-rank=16`, we choose to summarize the compression spec as the
 resulting total file size since this is what matters to developers in production.
 
-
-
-#### Different Projects + `openai_whisper-large-v3`
-
-|                                                                                                  |   WER | Commit Hash   | Model Format   |
-|:-------------------------------------------------------------------------------------------------|------:|:--------------|:---------------|
-| [WhisperKit](https://github.com/argmaxinc/whisperkit)                                            |  2.44 | 14e705e       | Core ML        |
-| [WhisperCpp](https://github.com/ggerganov/whisper.cpp)                                           |  2.57 | 4bbb60e       | Core ML + GGUF |
-| [WhisperMLX](https://github.com/ml-explore/mlx-examples/blob/main/whisper/whisper/transcribe.py) |  2.57 | 854ad87       | MLX (Numpy)    |
-
-
-- `_turbo`: Indicates the presence of additional optimizations (not compression) to unlock streaming transcription as described in our [Blog Post](https://www.takeargmax.com/blog/whisperkit).
-
-- `_*MB`: Indicates the presence of mixed-bit quantization. Instead of cluttering the filename with details like `_AudioEncoder-5.8bits_TextDecoder-6.1bits`, we choose to summarize the compression spec as the resulting total file size since this is what matters to developers in production.
-
-
 ## FAQ
 
 **Q1**: `xcrun: error: unable to find utility "coremlcompiler", not a developer tool or in PATH`
diff --git a/scripts/generate_model.py b/scripts/generate_model.py
index 9b171c6..7987d98 100644
--- a/scripts/generate_model.py
+++ b/scripts/generate_model.py
@@ -23,6 +23,7 @@
 
 test_utils.TEST_MIN_SPEEDUP_VS_CPU = 0.3
 
+
 def cli():
     f""" Generates Whisper models and publishes them to hf.co/{MODEL_REPO_ID} """
     parser = argparse.ArgumentParser()
diff --git a/scripts/generate_readme.py b/scripts/generate_readme.py
index c4128f1..1753734 100644
--- a/scripts/generate_readme.py
+++ b/scripts/generate_readme.py
@@ -35,10 +35,12 @@
 - asr
 - quantized
 ---
-# WhisperKit Evaluation Results\n
+# WhisperKit Transcription Quality\n
 """
 
 HF_HUB_METRIC_EXPLANATION = """
+### Explanation
+
 We believe that rigorously measuring the quality of inference is necessary for developers and
 enterprises to make informed decisions when opting to use optimized or compressed variants of
 any machine learning model in production. To contextualize `WhisperKit`, we take the following Whisper
@@ -76,13 +78,18 @@
 We anticipate developers that use Whisper (or similar models) in production to have their own Quality Assurance test sets and [whisperkittools](https://github.com/argmaxinc/whisperkittools) offers
 the tooling necessary to run the same measurements on such custom test sets, please see the [Model Evaluation on Custom Dataset]((https://github.com/argmaxinc/whisperkittools)) for details.
 
+### Why are there so many Whisper versions?
+WhisperKit is an SDK for building speech-to-text features in apps across a wide range of Apple devices. We are working towards abstracting away the model versioning from the developer so WhisperKit
+"just works" by deploying the highest-quality model version that a particular device can execute. In the interim, we leave the choice to the developer by providing quality and size trade-offs.
+
+
 ### Datasets
 - [librispeech](https://huggingface.co/datasets/argmaxinc/librispeech): ~5 hours of short English audio clips, tests short-form transcription quality
 - [earnings22](https://huggingface.co/datasets/argmaxinc/earnings22): ~120 hours of English audio clips from earnings calls with various accents, tests long-form transcription quality
 
 ### Reproducing Results
-Benchmark results on this page were automatically generated by [whisperkittools](https://github.com/argmaxinc/whisperkittools). We use our cluster of Apple Silicon Macs as self-hosted runners on
-Github Actions as our CI infrastructure to periodically recompute these benchmarks. Due to [security concerns](https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#hardening-for-self-hosted-runners),
+Benchmark results on this page were automatically generated by [whisperkittools](https://github.com/argmaxinc/whisperkittools) using our cluster of Apple Silicon Macs as self-hosted runners on
+Github Actions. We periodically recompute these benchmarks as part of our CI pipeline. Due to [security concerns](https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#hardening-for-self-hosted-runners),
 we are unable to open up the cluster to the public. However, any Apple Silicon Mac (even with 8GB RAM) can be used to
 run identical [evaluation jobs](#evaluation) locally. For reference, our M2 Ultra devices complete a `librispeech` + `openai/whisper-large-v3`
 evaluation in under 1 hour regardless of the Whisper implementation. Oldest Apple Silicon Macs should take less than 1 day to complete the same evaluation.
@@ -103,21 +110,23 @@
 
 # TODO(atiorh): Read remote git file size
 REFERENCE_MODEL_FILE_SIZES = {
-    "WhisperKit/openai_whisper-large-v2": 3100,           # MB
-    "WhisperKit/openai_whisper-large-v2_turbo": 3100,     # MB
-    "WhisperKit/openai_whisper-large-v3": 3100,           # MB
-    "WhisperKit/openai_whisper-large-v3_turbo": 3100,     # MB
-    "WhisperKit/openai_whisper-small": 483,               # MB
-    "WhisperKit/openai_whisper-small.en": 483,            # MB
-    "WhisperKit/openai_whisper-base": 145,                # MB
-    "WhisperKit/openai_whisper-base.en": 145,             # MB
-    "WhisperKit/openai_whisper-tiny": 66,                 # MB
-    "WhisperKit/openai_whisper-tiny.en": 66,              # MB
-    "whisper.cpp/openai_whisper-large-v2-q5_0": 1080,     # MB
-    "whisper.cpp/openai_whisper-large-v3-q5_0": 1080,     # MB
-    "whisper.cpp/openai_whisper-large-v3": 3100,          # MB
-    "whisper.cpp/openai_whisper-large-v2": 3100,          # MB
-    "WhisperOpenAIAPI/openai_whisper-large-v2": 3100,     # MB
+    "WhisperKit/openai_whisper-large-v2": 3100,                 # MB
+    "WhisperKit/openai_whisper-large-v2_turbo": 3100,           # MB
+    "WhisperKit/openai_whisper-large-v3": 3100,                 # MB
+    "WhisperKit/openai_whisper-large-v3_turbo": 3100,           # MB
+    "WhisperKit/openai_whisper-small": 483,                     # MB
+    "WhisperKit/openai_whisper-small.en": 483,                  # MB
+    "WhisperKit/openai_whisper-base": 145,                      # MB
+    "WhisperKit/openai_whisper-base.en": 145,                   # MB
+    "WhisperKit/openai_whisper-tiny": 66,                       # MB
+    "WhisperKit/openai_whisper-tiny.en": 66,                    # MB
+    "whisper.cpp/openai_whisper-large-v2-q5_0": 1080,           # MB
+    "whisper.cpp/openai_whisper-large-v3-q5_0": 1080,           # MB
+    "whisper.cpp/openai_whisper-large-v3": 3100,                # MB
+    "whisper.cpp/openai_whisper-large-v2": 3100,                # MB
+    "WhisperOpenAIAPI/openai_whisper-large-v2": 3100,           # MB
+    "WhisperKit/distil-whisper_distil-large-v3": 1510,          # MB
+    "WhisperKit/distil-whisper_distil-large-v3_turbo": 1510,    # MB
 }
 
 DATASET_CAPTIONS = {
@@ -176,10 +185,15 @@ def cli():
 
             reference_eval, reference_link = get_latest_eval(
                 reference_code_repo, dataset_name, reference_model)
+
+            reference_key = reference.rsplit('/')[
+                -1].replace('openai_whisper-', '').replace('distil-whisper_', '')
             if reference_code_repo == "WhisperKit":
-                reference_key = f"[{reference}]({get_model_link(reference_model)})"
+                reference_key = \
+                    f"[{reference_key}]" \
+                    f"({get_model_link(reference_model)}) "
             else:
-                reference_key = reference
+                reference_key = reference_key + f" ({reference_code_repo})"
 
             # Fill reference model version values
             results_dict[QOI_KEY][reference_key] = 100.  # By definition of QoI
@@ -194,7 +208,7 @@ def cli():
             commit_hash = reference_eval["metadata"]["inference_context"]["code_spec"]["code_commit_hash"]
             if commit_hash is not None:
                 results_dict[COMMIT_KEY][reference_key] = \
-                    f"[{commit_hash[:7]}]({REPO_URLS[reference_code_repo]}/commit/{commit_hash[:7]})"
+                    f"[Link]({REPO_URLS[reference_code_repo]}/commit/{commit_hash[:7]})"
             else:
                 results_dict[COMMIT_KEY][reference_key] = "N/A"
 
@@ -208,10 +222,14 @@ def cli():
                     logger.warning(f"Could not fetch eval JSON for {optimized}: {e}")
                     continue
 
+                optimized_key = optimized.rsplit('/')[
+                    -1].replace('openai_whisper-', '').replace('distil-whisper_', '')
                 if optimized_code_repo == "WhisperKit":
-                    optimized_key = f"[{optimized}]({get_model_link(optimized_model)})"
+                    optimized_key = \
+                        f"[{optimized_key}]" \
+                        f"({get_model_link(optimized_model)}) "
                 else:
-                    optimized_key = optimized
+                    optimized_key = optimized_key + f" ({optimized_code_repo})"
 
                 # Verify fetched evals are comparable
                 logger.info(f"Compare {optimized_link} vs {reference_link}")
@@ -221,13 +239,14 @@ def cli():
                     optimized_eval["results"]
                 )
                 results_dict[QOI_KEY][optimized_key] = qoi["no_regression"]
-                results_dict[WER_KEY][optimized_key] = f"[{compute_average_wer(optimized_eval['results'])}]({optimized_link})"
+                results_dict[WER_KEY][optimized_key] = \
+                    f"[{compute_average_wer(optimized_eval['results'])}]({optimized_link})"
 
                 # Add commit hash for reference results
                 commit_hash = optimized_eval["metadata"]["inference_context"]["code_spec"]["code_commit_hash"]
                 if commit_hash is not None:
                     results_dict[COMMIT_KEY][optimized_key] = \
-                        f"[{commit_hash[:7]}]({REPO_URLS[optimized_code_repo]}/commit/{commit_hash[:7]})"
+                        f"[Link]({REPO_URLS[optimized_code_repo]}/commit/{commit_hash[:7]})"
                 else:
                     results_dict[COMMIT_KEY][optimized_key] = "N/A"