diff --git a/src/OSmOSE/Dataset.py b/src/OSmOSE/Dataset.py index 8b097e53..9f8fa5f2 100755 --- a/src/OSmOSE/Dataset.py +++ b/src/OSmOSE/Dataset.py @@ -426,9 +426,9 @@ def _build_audio( exc_info=e, ) raise + message = f"Timestamp.csv and audio files didn't match. Creating new timestamp.csv files from audio. Detail: \n\t{e}" self.logger.warning( - "Timestamp.csv and audio files didn't match. Creating new timestamp.csv files from audio. Detail: \n", - exc_info=e, + message, ) timestamps = parse_timestamps_csv( filenames=[file.name for file in audio_files], @@ -442,9 +442,9 @@ def _build_audio( exc_info=e, ) raise + message = f"Your audio files failed the following test(s):\n\t{e}" self.logger.warning( - "Your audio files failed the following test(s):\n", - exc_info=e, + message, ) file_metadata = self._create_file_metadata(audio_metadata, timestamps) diff --git a/src/OSmOSE/utils/audio_utils.py b/src/OSmOSE/utils/audio_utils.py index fb40d8d7..a163b7bf 100644 --- a/src/OSmOSE/utils/audio_utils.py +++ b/src/OSmOSE/utils/audio_utils.py @@ -90,24 +90,42 @@ def check_audio( - Large duration differences (> 5% of the mean duration) among audio files """ - if any( - (unlisted_file := file) not in timestamps["filename"].unique() - for file in audio_metadata["filename"] + if ( + len( + unlisted_files := [ + file + for file in audio_metadata["filename"] + if file not in timestamps["filename"].unique() + ], + ) + > 0 ): - message = f"{unlisted_file} has not been found in timestamp.csv" + message = ( + "The following files have not been found in timestamp.csv:\n\t" + + "\n\t".join(unlisted_files) + ) raise FileNotFoundError(message) - if any( - (missing_file := filename) not in audio_metadata["filename"].unique() - for filename in timestamps["filename"] + if ( + len( + missing_files := [ + file + for file in timestamps["filename"] + if file not in audio_metadata["filename"].unique() + ], + ) + > 0 ): - message = f"{missing_file} is listed in timestamp.csv but hasn't be found." + message = ( + "The following files are listed in timestamp.csv but hasn't be found:\n\t" + + "\n\t".join(missing_files) + ) raise FileNotFoundError(message) - if len(audio_metadata["origin_sr"].unique()) > 1: + if len(sample_rates := audio_metadata["origin_sr"].unique()) > 1: message = ( - "Your files do not have all the same sampling rate. " - f"Found sampling rates: {', '.join(str(sr) + ' Hz' for sr in audio_metadata['origin_sr'].unique())}." + "Your files do not have all the same sampling rate.\n" + f"Found sampling rates: {', '.join(str(sr) + ' Hz' for sr in sample_rates)}." ) raise ValueError(message) diff --git a/tests/test_audio_utils.py b/tests/test_audio_utils.py index 888ba474..72d5d5c5 100644 --- a/tests/test_audio_utils.py +++ b/tests/test_audio_utils.py @@ -65,6 +65,8 @@ def test_supported_audio_formats(filepath: Path, expected_output: bool) -> None: ["file_1.wav", 128_000, 3_600], ["file_2.wav", 128_000, 3_600], ["file_3.wav", 128_000, 3_600], + ["file_4.wav", 128_000, 3_600], + ["file_5.wav", 128_000, 3_600], ], columns=["filename", "origin_sr", "duration"], ), @@ -78,7 +80,7 @@ def test_supported_audio_formats(filepath: Path, expected_output: bool) -> None: ), pytest.raises( FileNotFoundError, - match="file_3.wav has not been found in timestamp.csv", + match="The following files have not been found in timestamp.csv:\n\tfile_3.wav\n\tfile_5.wav", ), id="missing_file_in_timestamp_csv", ), @@ -89,6 +91,7 @@ def test_supported_audio_formats(filepath: Path, expected_output: bool) -> None: ), pd.DataFrame( [ + ["file_0.wav", pd.Timestamp("2024-01-01 12:11:00")], ["file_1.wav", pd.Timestamp("2024-01-01 12:12:00")], ["file_2.wav", pd.Timestamp("2024-01-01 12:13:00")], ["file_3.wav", pd.Timestamp("2024-01-01 12:14:00")], @@ -97,7 +100,7 @@ def test_supported_audio_formats(filepath: Path, expected_output: bool) -> None: ), pytest.raises( FileNotFoundError, - match="file_3.wav is listed in timestamp.csv but hasn't be found.", + match="The following files are listed in timestamp.csv but hasn't be found:\n\tfile_0.wav\n\tfile_3.wav", ), id="missing_audio_file", ), @@ -119,7 +122,8 @@ def test_supported_audio_formats(filepath: Path, expected_output: bool) -> None: columns=["filename", "timestamp"], ), pytest.raises( - ValueError, match="Your files do not have all the same sampling rate." + ValueError, + match="Your files do not have all the same sampling rate.\nFound sampling rates: 128000 Hz, 128001 Hz.", ), id="mismatching_sr", ), @@ -141,7 +145,8 @@ def test_supported_audio_formats(filepath: Path, expected_output: bool) -> None: columns=["filename", "timestamp"], ), pytest.raises( - ValueError, match="Your audio files have large duration discrepancies." + ValueError, + match="Your audio files have large duration discrepancies.", ), id="mismatching_duration", ), @@ -168,7 +173,9 @@ def test_supported_audio_formats(filepath: Path, expected_output: bool) -> None: ], ) def test_check_audio( - audio_metadata: pd.DataFrame, timestamps: pd.DataFrame, expectation: None + audio_metadata: pd.DataFrame, + timestamps: pd.DataFrame, + expectation: None, ) -> None: with expectation as e: assert check_audio(audio_metadata, timestamps) == e