From 11c7769d74ba4de9fe8374d424f1f32373be56a9 Mon Sep 17 00:00:00 2001
From: Gerardo Roa <gerardo.roa@gmail.com>
Date: Tue, 10 Sep 2024 13:22:30 +0100
Subject: [PATCH 1/3] sample rate in config

---
 recipes/cad2/task1/baseline/config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes/cad2/task1/baseline/config.yaml b/recipes/cad2/task1/baseline/config.yaml
index 8803f9a8..04768847 100644
--- a/recipes/cad2/task1/baseline/config.yaml
+++ b/recipes/cad2/task1/baseline/config.yaml
@@ -10,7 +10,7 @@ path:
   scene_listeners_file: ${path.metadata_dir}/scene_listeners.valid.json
   exp_folder: ./exp_${separator.causality} # folder to store enhanced signals and final results
 
-input_sample_rate: 44100   # sample rate of the input mixture
+input_sample_rate: 44100  # sample rate of the input mixture
 remix_sample_rate: 44100  # sample rate for the output remixed signal
 HAAQI_sample_rate: 24000  # sample rate for computing HAAQI score
 

From 27d14f2fb58782ff1e5f2746dbb2aa977d85a70b Mon Sep 17 00:00:00 2001
From: Gerardo Roa <gerardo.roa@gmail.com>
Date: Tue, 10 Sep 2024 11:05:19 +0100
Subject: [PATCH 2/3] correct use of whisper

Signed-off-by: Gerardo Roa <gerardo.roa@gmail.com>
---
 recipes/cad2/task1/baseline/evaluate.py | 46 +++++++++++++++++--------
 1 file changed, 32 insertions(+), 14 deletions(-)

diff --git a/recipes/cad2/task1/baseline/evaluate.py b/recipes/cad2/task1/baseline/evaluate.py
index e45e29e9..4bee312a 100644
--- a/recipes/cad2/task1/baseline/evaluate.py
+++ b/recipes/cad2/task1/baseline/evaluate.py
@@ -58,7 +58,7 @@ def compute_intelligibility(
     save_intermediate: bool = False,
     path_intermediate: str | Path | None = None,
     equiv_0db_spl: float = 100,
-) -> tuple[float, float]:
+) -> tuple[float, float, dict]:
     """
     Compute the Intelligibility score for the enhanced signal
     using the Whisper model.
@@ -79,6 +79,9 @@ def compute_intelligibility(
     Returns:
         The intelligibility score for the left and right channels
     """
+
+    lyrics = {}
+
     if path_intermediate is None:
         path_intermediate = Path.cwd()
     if isinstance(path_intermediate, str):
@@ -90,6 +93,7 @@ def compute_intelligibility(
     )
 
     reference = segment_metadata["text"]
+    lyrics["reference"] = reference
 
     # Compute left ear
     ear.set_audiogram(listener.audiogram_left)
@@ -101,8 +105,10 @@ def compute_intelligibility(
         44100,
         sample_rate,
     )
-    hipothesis = scorer.transcribe(left_path, fp16=False)["text"]
-    left_results = compute_measures(reference, hipothesis)
+    hypothesis = scorer.transcribe(left_path.as_posix(), fp16=False)["text"]
+    lyrics["hypothesis_left"] = hypothesis
+
+    left_results = compute_measures(reference, hypothesis)
 
     # Compute right ear
     ear.set_audiogram(listener.audiogram_right)
@@ -114,8 +120,10 @@ def compute_intelligibility(
         44100,
         sample_rate,
     )
-    hipothesis = scorer.transcribe(right_path, fp16=False)["text"]
-    right_results = compute_measures(reference, hipothesis)
+    hypothesis = scorer.transcribe(right_path.as_posix(), fp16=False)["text"]
+    lyrics["hypothesis_right"] = hypothesis
+
+    right_results = compute_measures(reference, hypothesis)
 
     # Compute the average score for both ears
     total_words = (
@@ -136,7 +144,11 @@ def compute_intelligibility(
     Path(left_path).unlink()
     Path(right_path).unlink()
 
-    return left_results["hits"] / total_words, right_results["hits"] / total_words
+    return (
+        left_results["hits"] / total_words,
+        right_results["hits"] / total_words,
+        lyrics,
+    )
 
 
 def compute_quality(
@@ -203,14 +215,14 @@ def load_reference_signal(
 
 
 def normalise_luft(
-    signal: np.ndarray, sample_rate: float, target_luft=-40
+    signal: np.ndarray, sample_rate: float, target_luft: float = -40.0
 ) -> np.ndarray:
     """
     Normalise the signal to a target loudness level.
     Args:
         signal: input signal to normalise
         sample_rate: sample rate of the signal
-        target_luft: target loudness level in LUFS
+        target_luft: target loudness level in LUFS.
 
     Returns:
         np.ndarray: normalised signal
@@ -254,6 +266,9 @@ def run_compute_scores(config: DictConfig) -> None:
         "scene",
         "song",
         "listener",
+        "lyrics",
+        "hypothesis_left",
+        "hypothesis_right",
         "haaqi_left",
         "haaqi_right",
         "haaqi_avg",
@@ -363,7 +378,7 @@ def run_compute_scores(config: DictConfig) -> None:
 
         # Compute the HAAQI and Whisper scores
         haaqi_scores = compute_quality(reference, enhanced_signal, listener, config)
-        whisper_scores = compute_intelligibility(
+        whisper_left, whisper_right, lyrics_text = compute_intelligibility(
             enhanced_signal=enhanced_signal,
             segment_metadata=songs[scene["segment_id"]],
             scorer=intelligibility_scorer,
@@ -375,20 +390,23 @@ def run_compute_scores(config: DictConfig) -> None:
             equiv_0db_spl=config.evaluate.equiv_0db_spl,
         )
 
+        max_whisper = np.max([whisper_left, whisper_right])
         results_file.add_result(
             {
                 "scene": scene_id,
                 "song": songs[scene["segment_id"]]["track_name"],
                 "listener": listener_id,
+                "lyrics": lyrics_text["reference"],
+                "hypothesis_left": lyrics_text["hypothesis_left"],
+                "hypothesis_right": lyrics_text["hypothesis_right"],
                 "haaqi_left": haaqi_scores[0],
                 "haaqi_right": haaqi_scores[1],
                 "haaqi_avg": np.mean(haaqi_scores),
-                "whisper_left": whisper_scores[0],
-                "whisper_rigth": whisper_scores[1],
-                "whisper_be": np.max(whisper_scores),
+                "whisper_left": whisper_left,
+                "whisper_rigth": whisper_right,
+                "whisper_be": max_whisper,
                 "alpha": alpha,
-                "score": alpha * np.max(whisper_scores)
-                + (1 - alpha) * np.mean(haaqi_scores),
+                "score": alpha * max_whisper + (1 - alpha) * np.mean(haaqi_scores),
             }
         )
 

From 73ea12153020b03d2b0d504ecfb3685ffb89ef73 Mon Sep 17 00:00:00 2001
From: Gerardo Roa <gerardo.roa@gmail.com>
Date: Tue, 10 Sep 2024 15:26:22 +0100
Subject: [PATCH 3/3] config

Signed-off-by: Gerardo Roa <gerardo.roa@gmail.com>
---
 recipes/cad2/task2/baseline/config.yaml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/recipes/cad2/task2/baseline/config.yaml b/recipes/cad2/task2/baseline/config.yaml
index 950bc042..4311c5cc 100644
--- a/recipes/cad2/task2/baseline/config.yaml
+++ b/recipes/cad2/task2/baseline/config.yaml
@@ -1,7 +1,5 @@
-# Zenodo download path: path to the zenodo download folder.
-# root: root path of the dataset. This path will contain the audio and metadata folders
 path:
-  root: /media/gerardoroadabike/Extreme SSD1/Challenges/CAD2/cadenza_data/cad2/task2
+  root: ???  # Set to the root of the dataset
   metadata_dir: ${path.root}/metadata
   music_dir: ${path.root}/audio
   gains_file: ${path.metadata_dir}/gains.json