Skip to content

Commit

Permalink
more explicit build warning messages (#238)
Browse files (browse the repository at this point in the history)
Warning messages when the build has anomalies (e.g. missing audio files in the timestamp.csv file) should list all missing files.
  • Loading branch information
Gautzilla authored Jan 14, 2025
1 parent e498634 commit e93ca93
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 20 deletions.
8 changes: 4 additions & 4 deletions src/OSmOSE/Dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,9 +426,9 @@ def _build_audio(
exc_info=e,
)
raise
message = f"Timestamp.csv and audio files didn't match. Creating new timestamp.csv files from audio. Detail: \n\t{e}"
self.logger.warning(
"Timestamp.csv and audio files didn't match. Creating new timestamp.csv files from audio. Detail: \n",
exc_info=e,
message,
)
timestamps = parse_timestamps_csv(
filenames=[file.name for file in audio_files],
Expand All @@ -442,9 +442,9 @@ def _build_audio(
exc_info=e,
)
raise
message = f"Your audio files failed the following test(s):\n\t{e}"
self.logger.warning(
"Your audio files failed the following test(s):\n",
exc_info=e,
message,
)

file_metadata = self._create_file_metadata(audio_metadata, timestamps)
Expand Down
40 changes: 29 additions & 11 deletions src/OSmOSE/utils/audio_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,24 +90,42 @@ def check_audio(
- Large duration differences (> 5% of the mean duration) among audio files
"""
if any(
(unlisted_file := file) not in timestamps["filename"].unique()
for file in audio_metadata["filename"]
if (
len(
unlisted_files := [
file
for file in audio_metadata["filename"]
if file not in timestamps["filename"].unique()
],
)
> 0
):
message = f"{unlisted_file} has not been found in timestamp.csv"
message = (
"The following files have not been found in timestamp.csv:\n\t"
+ "\n\t".join(unlisted_files)
)
raise FileNotFoundError(message)

if any(
(missing_file := filename) not in audio_metadata["filename"].unique()
for filename in timestamps["filename"]
if (
len(
missing_files := [
file
for file in timestamps["filename"]
if file not in audio_metadata["filename"].unique()
],
)
> 0
):
message = f"{missing_file} is listed in timestamp.csv but hasn't be found."
message = (
"The following files are listed in timestamp.csv but hasn't be found:\n\t"
+ "\n\t".join(missing_files)
)
raise FileNotFoundError(message)

if len(audio_metadata["origin_sr"].unique()) > 1:
if len(sample_rates := audio_metadata["origin_sr"].unique()) > 1:
message = (
"Your files do not have all the same sampling rate. "
f"Found sampling rates: {', '.join(str(sr) + ' Hz' for sr in audio_metadata['origin_sr'].unique())}."
"Your files do not have all the same sampling rate.\n"
f"Found sampling rates: {', '.join(str(sr) + ' Hz' for sr in sample_rates)}."
)
raise ValueError(message)

Expand Down
17 changes: 12 additions & 5 deletions tests/test_audio_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ def test_supported_audio_formats(filepath: Path, expected_output: bool) -> None:
["file_1.wav", 128_000, 3_600],
["file_2.wav", 128_000, 3_600],
["file_3.wav", 128_000, 3_600],
["file_4.wav", 128_000, 3_600],
["file_5.wav", 128_000, 3_600],
],
columns=["filename", "origin_sr", "duration"],
),
Expand All @@ -78,7 +80,7 @@ def test_supported_audio_formats(filepath: Path, expected_output: bool) -> None:
),
pytest.raises(
FileNotFoundError,
match="file_3.wav has not been found in timestamp.csv",
match="The following files have not been found in timestamp.csv:\n\tfile_3.wav\n\tfile_5.wav",
),
id="missing_file_in_timestamp_csv",
),
Expand All @@ -89,6 +91,7 @@ def test_supported_audio_formats(filepath: Path, expected_output: bool) -> None:
),
pd.DataFrame(
[
["file_0.wav", pd.Timestamp("2024-01-01 12:11:00")],
["file_1.wav", pd.Timestamp("2024-01-01 12:12:00")],
["file_2.wav", pd.Timestamp("2024-01-01 12:13:00")],
["file_3.wav", pd.Timestamp("2024-01-01 12:14:00")],
Expand All @@ -97,7 +100,7 @@ def test_supported_audio_formats(filepath: Path, expected_output: bool) -> None:
),
pytest.raises(
FileNotFoundError,
match="file_3.wav is listed in timestamp.csv but hasn't be found.",
match="The following files are listed in timestamp.csv but hasn't be found:\n\tfile_0.wav\n\tfile_3.wav",
),
id="missing_audio_file",
),
Expand All @@ -119,7 +122,8 @@ def test_supported_audio_formats(filepath: Path, expected_output: bool) -> None:
columns=["filename", "timestamp"],
),
pytest.raises(
ValueError, match="Your files do not have all the same sampling rate."
ValueError,
match="Your files do not have all the same sampling rate.\nFound sampling rates: 128000 Hz, 128001 Hz.",
),
id="mismatching_sr",
),
Expand All @@ -141,7 +145,8 @@ def test_supported_audio_formats(filepath: Path, expected_output: bool) -> None:
columns=["filename", "timestamp"],
),
pytest.raises(
ValueError, match="Your audio files have large duration discrepancies."
ValueError,
match="Your audio files have large duration discrepancies.",
),
id="mismatching_duration",
),
Expand All @@ -168,7 +173,9 @@ def test_supported_audio_formats(filepath: Path, expected_output: bool) -> None:
],
)
def test_check_audio(
audio_metadata: pd.DataFrame, timestamps: pd.DataFrame, expectation: None
audio_metadata: pd.DataFrame,
timestamps: pd.DataFrame,
expectation: None,
) -> None:
with expectation as e:
assert check_audio(audio_metadata, timestamps) == e

0 comments on commit e93ca93

Please sign in to comment.