Make build warning messages more explicit (#238)

When the build detects anomalies (e.g. audio files that are not listed in timestamp.csv, or files listed in timestamp.csv that cannot be found), the warning message should list every affected file rather than only one.
Project-OSmOSE · Jan 14, 2025 · e93ca93 · e93ca93
1 parent e498634
commit e93ca93
Show file tree

Hide file tree

Showing 3 changed files with 45 additions and 20 deletions.
diff --git a/src/OSmOSE/Dataset.py b/src/OSmOSE/Dataset.py
@@ -426,9 +426,9 @@ def _build_audio(
                     exc_info=e,
                 )
                 raise
+            message = f"Timestamp.csv and audio files didn't match. Creating new timestamp.csv files from audio. Detail: \n\t{e}"
             self.logger.warning(
-                "Timestamp.csv and audio files didn't match. Creating new timestamp.csv files from audio. Detail: \n",
-                exc_info=e,
+                message,
             )
             timestamps = parse_timestamps_csv(
                 filenames=[file.name for file in audio_files],
@@ -442,9 +442,9 @@ def _build_audio(
                     exc_info=e,
                 )
                 raise
+            message = f"Your audio files failed the following test(s):\n\t{e}"
             self.logger.warning(
-                "Your audio files failed the following test(s):\n",
-                exc_info=e,
+                message,
             )
 
         file_metadata = self._create_file_metadata(audio_metadata, timestamps)

diff --git a/src/OSmOSE/utils/audio_utils.py b/src/OSmOSE/utils/audio_utils.py
@@ -90,24 +90,42 @@ def check_audio(
         - Large duration differences (> 5% of the mean duration) among audio files
 
     """
-    if any(
-        (unlisted_file := file) not in timestamps["filename"].unique()
-        for file in audio_metadata["filename"]
+    if (
+        len(
+            unlisted_files := [
+                file
+                for file in audio_metadata["filename"]
+                if file not in timestamps["filename"].unique()
+            ],
+        )
+        > 0
     ):
-        message = f"{unlisted_file} has not been found in timestamp.csv"
+        message = (
+            "The following files have not been found in timestamp.csv:\n\t"
+            + "\n\t".join(unlisted_files)
+        )
         raise FileNotFoundError(message)
 
-    if any(
-        (missing_file := filename) not in audio_metadata["filename"].unique()
-        for filename in timestamps["filename"]
+    if (
+        len(
+            missing_files := [
+                file
+                for file in timestamps["filename"]
+                if file not in audio_metadata["filename"].unique()
+            ],
+        )
+        > 0
     ):
-        message = f"{missing_file} is listed in timestamp.csv but hasn't be found."
+        message = (
+            "The following files are listed in timestamp.csv but hasn't be found:\n\t"
+            + "\n\t".join(missing_files)
+        )
         raise FileNotFoundError(message)
 
-    if len(audio_metadata["origin_sr"].unique()) > 1:
+    if len(sample_rates := audio_metadata["origin_sr"].unique()) > 1:
         message = (
-            "Your files do not have all the same sampling rate. "
-            f"Found sampling rates: {', '.join(str(sr) + ' Hz' for sr in audio_metadata['origin_sr'].unique())}."
+            "Your files do not have all the same sampling rate.\n"
+            f"Found sampling rates: {', '.join(str(sr) + ' Hz' for sr in sample_rates)}."
         )
         raise ValueError(message)
 

diff --git a/tests/test_audio_utils.py b/tests/test_audio_utils.py
@@ -65,6 +65,8 @@ def test_supported_audio_formats(filepath: Path, expected_output: bool) -> None:
                     ["file_1.wav", 128_000, 3_600],
                     ["file_2.wav", 128_000, 3_600],
                     ["file_3.wav", 128_000, 3_600],
+                    ["file_4.wav", 128_000, 3_600],
+                    ["file_5.wav", 128_000, 3_600],
                 ],
                 columns=["filename", "origin_sr", "duration"],
             ),
@@ -78,7 +80,7 @@ def test_supported_audio_formats(filepath: Path, expected_output: bool) -> None:
             ),
             pytest.raises(
                 FileNotFoundError,
-                match="file_3.wav has not been found in timestamp.csv",
+                match="The following files have not been found in timestamp.csv:\n\tfile_3.wav\n\tfile_5.wav",
             ),
             id="missing_file_in_timestamp_csv",
         ),
@@ -89,6 +91,7 @@ def test_supported_audio_formats(filepath: Path, expected_output: bool) -> None:
             ),
             pd.DataFrame(
                 [
+                    ["file_0.wav", pd.Timestamp("2024-01-01 12:11:00")],
                     ["file_1.wav", pd.Timestamp("2024-01-01 12:12:00")],
                     ["file_2.wav", pd.Timestamp("2024-01-01 12:13:00")],
                     ["file_3.wav", pd.Timestamp("2024-01-01 12:14:00")],
@@ -97,7 +100,7 @@ def test_supported_audio_formats(filepath: Path, expected_output: bool) -> None:
             ),
             pytest.raises(
                 FileNotFoundError,
-                match="file_3.wav is listed in timestamp.csv but hasn't be found.",
+                match="The following files are listed in timestamp.csv but hasn't be found:\n\tfile_0.wav\n\tfile_3.wav",
             ),
             id="missing_audio_file",
         ),
@@ -119,7 +122,8 @@ def test_supported_audio_formats(filepath: Path, expected_output: bool) -> None:
                 columns=["filename", "timestamp"],
             ),
             pytest.raises(
-                ValueError, match="Your files do not have all the same sampling rate."
+                ValueError,
+                match="Your files do not have all the same sampling rate.\nFound sampling rates: 128000 Hz, 128001 Hz.",
             ),
             id="mismatching_sr",
         ),
@@ -141,7 +145,8 @@ def test_supported_audio_formats(filepath: Path, expected_output: bool) -> None:
                 columns=["filename", "timestamp"],
             ),
             pytest.raises(
-                ValueError, match="Your audio files have large duration discrepancies."
+                ValueError,
+                match="Your audio files have large duration discrepancies.",
             ),
             id="mismatching_duration",
         ),
@@ -168,7 +173,9 @@ def test_supported_audio_formats(filepath: Path, expected_output: bool) -> None:
     ],
 )
 def test_check_audio(
-    audio_metadata: pd.DataFrame, timestamps: pd.DataFrame, expectation: None
+    audio_metadata: pd.DataFrame,
+    timestamps: pd.DataFrame,
+    expectation: None,
 ) -> None:
     with expectation as e:
         assert check_audio(audio_metadata, timestamps) == e