From 4138c80df705c458ea90a1228bc6e18452ec3694 Mon Sep 17 00:00:00 2001 From: Gautzilla <72027971+Gautzilla@users.noreply.github.com> Date: Tue, 22 Oct 2024 11:48:42 +0200 Subject: [PATCH] refactor timestamps managing (#210) * rename build files test * add unit tests for audio file check util * move audio file tests to specific test module * add datetime template validation function * add datetime regex builder * add extract timestamp from filename with integration tests * add tests ids and check exception messages * move regex dictionary to module constants * add docstrings * add associate_timestamps function * add strftime to osmose format * use string interpolation in test result * backwards compatibility of timestamp.csv labels * use pandas.to_datetime for strptime operations * support timezone in strptime * use new timestamps utils in build method * remove obsolete test module * format with black * rename filename to text in strptime parser util --------- Co-authored-by: Mathieu Dupont <108517594+mathieudpnt@users.noreply.github.com> --- src/OSmOSE/Dataset.py | 43 ++-- src/OSmOSE/__init__.py | 2 - src/OSmOSE/timestamps.py | 182 --------------- src/OSmOSE/utils/audio_utils.py | 4 +- src/OSmOSE/utils/timestamp_utils.py | 183 ++++++++++++++- tests/test_audio_utils.py | 22 ++ tests/test_create_timestamps.py | 20 -- tests/test_timestamp_utils.py | 340 ++++++++++++++++++++++++++++ 8 files changed, 574 insertions(+), 222 deletions(-) delete mode 100644 src/OSmOSE/timestamps.py create mode 100644 tests/test_audio_utils.py delete mode 100755 tests/test_create_timestamps.py create mode 100644 tests/test_timestamp_utils.py diff --git a/src/OSmOSE/Dataset.py b/src/OSmOSE/Dataset.py index fc8fc669..99d57115 100755 --- a/src/OSmOSE/Dataset.py +++ b/src/OSmOSE/Dataset.py @@ -19,10 +19,14 @@ import pandas as pd import numpy as np from tqdm import tqdm -from OSmOSE.utils.timestamp_utils import check_epoch +from OSmOSE.utils.audio_utils import get_all_audio_files +from OSmOSE.utils.timestamp_utils import ( + check_epoch, + associate_timestamps, + strftime_osmose_format, +) from OSmOSE.utils.core_utils import check_n_files, set_umask, chmod_if_needed from OSmOSE.utils.path_utils import make_path -from OSmOSE.timestamps import write_timestamp from OSmOSE.config import DPDEFAULT, FPDEFAULT, OSMOSE_PATH, TIMESTAMP_FORMAT_AUDIO_FILE @@ -354,20 +358,18 @@ def build( if original_folder is not None else self._find_or_create_original_folder() ) - path_timestamp_formatted = path_raw_audio.joinpath("timestamp.csv") - resume_test_anomalies = path_raw_audio.joinpath("resume_test_anomalies.txt") + path_timestamp_formatted = path_raw_audio.joinpath("timestamp.csv") + user_timestamp = ( + path_timestamp_formatted.exists() + ) # TODO: Formatting audio files beforehand will make this obsolete - if not path_timestamp_formatted.exists(): - user_timestamp = False - write_timestamp( - audio_path=path_raw_audio, - date_template=date_template, - timezone=self.timezone, - verbose=False, + if not user_timestamp: + self._write_timestamp_csv_from_audio_files( + audio_path=path_raw_audio, date_template=date_template ) - else: - user_timestamp = True + + resume_test_anomalies = path_raw_audio.joinpath("resume_test_anomalies.txt") # read the timestamp.csv file timestamp_csv = pd.read_csv(path_timestamp_formatted)["timestamp"].values @@ -655,6 +657,21 @@ def build( print("\n DONE ! your dataset is on OSmOSE platform !") + def _write_timestamp_csv_from_audio_files( + self, audio_path: Path, date_template: str + ): + supported_audio_files = [file.name for file in get_all_audio_files(audio_path)] + filenames_with_timestamps = associate_timestamps( + audio_files=supported_audio_files, datetime_template=date_template + ) + filenames_with_timestamps["timestamp"] = filenames_with_timestamps[ + "timestamp" + ].apply(lambda t: strftime_osmose_format(t)) + filenames_with_timestamps.to_csv( + audio_path / "timestamp.csv", index=False, na_rep="NaN" + ) + os.chmod(audio_path / "timestamp.csv", mode=FPDEFAULT) + def _format_timestamp( self, cur_timestamp_not_formatted: str, diff --git a/src/OSmOSE/__init__.py b/src/OSmOSE/__init__.py index 2605ec71..98d00e35 100755 --- a/src/OSmOSE/__init__.py +++ b/src/OSmOSE/__init__.py @@ -1,5 +1,4 @@ from OSmOSE.Dataset import Dataset -from OSmOSE.timestamps import write_timestamp from OSmOSE.job import Job_builder from OSmOSE.Spectrogram import Spectrogram import OSmOSE.utils as utils @@ -8,7 +7,6 @@ __all__ = [ "Auxiliary", "Dataset", - "write_timestamp", "Job_builder", "Spectrogram", "utils", diff --git a/src/OSmOSE/timestamps.py b/src/OSmOSE/timestamps.py deleted file mode 100644 index 7dbe9992..00000000 --- a/src/OSmOSE/timestamps.py +++ /dev/null @@ -1,182 +0,0 @@ -import re -import os -import datetime -import argparse -import pandas as pd -from pathlib import Path - -from OSmOSE.config import * -from OSmOSE.utils.core_utils import get_files, chmod_if_needed - -__converter = { - "%Y": r"[12][0-9]{3}", - "%y": r"[0-9]{2}", - "%m": r"(0[1-9]|1[0-2])", - "%d": r"([0-2][0-9]|3[0-1])", - "%H": r"([0-1][0-9]|2[0-4])", - "%I": r"(0[1-9]|1[0-2])", - "%p": r"(AM|PM)", - "%M": r"[0-5][0-9]", - "%S": r"[0-5][0-9]", - "%f": r"[0-9]{6}", -} - - -def convert_template_to_re(date_template: str) -> str: - """Converts a template in strftime format to a matching regular expression - - Parameter: - date_template: the template in strftime format - - Returns: - The regular expression matching the template""" - - res = "" - i = 0 - while i < len(date_template): - if date_template[i : i + 2] in __converter: - res += __converter[date_template[i : i + 2]] - i += 1 - else: - res += date_template[i] - i += 1 - - return res - - -def write_timestamp( - *, - audio_path: str, - date_template: str, - timezone: str, - offset: tuple = None, - verbose: bool = False, -): - """Read the dates in the filenames of audio files in the `audio_path` folder, - according to the date template in strftime format or the offsets from the beginning and end of the date. - - The result is written in a file named `timestamp.csv` with no header and two columns in this format : [filename],[timestamp]. - The output date is in the template `'%Y-%m-%dT%H:%M:%S.%fZ'. - - Parameters - ---------- - audio_path: `str` - the path of the folder containing audio files - date_template: `str` - the date template in strftime format. For example, `2017/02/24` has the template `%Y/%m/%d` - For more information on strftime template, see https://strftime.org/ - offsets: `tuple(int,int)`, optional - a tuple containing the beginning and end offset of the date. - The first element is the first character of the date, and the second is the last. - verbose: `bool`, optional, keyword-only - If set to True, print all messages. default is False - """ - list_audio_file = [] - msg = "" - for ext in SUPPORTED_AUDIO_FORMAT: - list_audio_file_ext = sorted(Path(audio_path).glob(f"*{ext}")) - [list_audio_file.append(file) for file in list_audio_file_ext] - if len(list_audio_file_ext) > 0: - msg = msg + f"{len(list_audio_file_ext)} {ext[1:]}, " - print(f"{len(list_audio_file)} audio files found:", msg[:-2]) - - if len(list_audio_file) == 0: - list_audio_file_WAV = sorted([file for file in Path(audio_path).glob("*.WAV")]) - list_audio_file_FLAC = sorted( - [file for file in Path(audio_path).glob("*.FLAC")] - ) - - if len(list_audio_file_WAV) > 0: - print( - "Your audio files have a .WAV extension, we are changing it to the standard .wav extension." - ) - - for file_name in list_audio_file_WAV: - os.rename(file_name, Path(audio_path).joinpath(file_name.stem + ".wav")) - if len(list_audio_file_FLAC) > 0: - print( - "Your audio files have a .FLAC extension, we are changing it to the standard .flac extension." - ) - - for file_name in list_audio_file_FLAC: - os.rename( - file_name, Path(audio_path).joinpath(file_name.stem + ".flac") - ) - elif len(get_files(Path(audio_path), ("*.mp3",))) > 0: - raise FileNotFoundError( - "Your audio files do not have the right extension, we only accept wav and flac audio files for the moment." - ) - - else: - raise FileNotFoundError( - f"No audio files found in the {audio_path} directory." - ) - - timestamp = [] - filename_raw_audio = [] - - converted = convert_template_to_re(date_template) - for i, filename in enumerate(list_audio_file): - try: - if offset: - date_extracted = re.search( - converted, filename.stem[offset[0] : offset[1] + 1] - )[0] - else: - date_extracted = re.search(converted, str(filename))[0] - except TypeError: - raise ValueError( - f"The date template does not match any set of character in the file name {filename}\nMake sure you are not forgetting separator characters, or use the offset parameter." - ) - - date_obj = datetime.datetime.strptime( - date_extracted + timezone, date_template + "%z" - ) - dates_final = datetime.datetime.strftime(date_obj, TIMESTAMP_FORMAT_AUDIO_FILE) - - if i == 10: - print( - f"Timestamp extraction seems OK, here is an example: {filename.name} -> {dates_final} \n" - ) - elif verbose: - print("filename->", filename) - print("extracted timestamp->", dates_final, "\n") - - timestamp.append(dates_final) - - filename_raw_audio.append(filename.name) - - df = pd.DataFrame({"filename": filename_raw_audio, "timestamp": timestamp}) - df.sort_values(by=["timestamp"], inplace=True) - df.to_csv(Path(audio_path, "timestamp.csv"), index=False, na_rep="NaN") - chmod_if_needed(path=Path(audio_path) / "timestamp.csv", mode=FPDEFAULT) - - -if __name__ == "__main__": - argparser = argparse.ArgumentParser() - argparser.add_argument("--dataset-name", "-n", help="Name of the dataset.") - argparser.add_argument( - "--offset", - "-s", - help="Offset of the first date character in the dataset names. If the date is not immediately followed by the extension, please provide the offset between the end of the date and the extension of the file, separated by a hyphen (-).", - ) - argparser.add_argument( - "--date-template", - "-d", - help="The date template in strftime format. If not sure, input the whole file name.", - ) - args = argparser.parse_args() - - if args.offset and "-" in args.offset: - split = args.offset.split("-") - offset = (int(split[0]), int(split[1])) - elif args.offset: - offset = int(args.offset) - else: - offset = None - - write_timestamp( - audio_path=args.dataset_name, - date_template=args.date_template, - offsets=offset, - ) diff --git a/src/OSmOSE/utils/audio_utils.py b/src/OSmOSE/utils/audio_utils.py index ec2f07f0..40c56688 100644 --- a/src/OSmOSE/utils/audio_utils.py +++ b/src/OSmOSE/utils/audio_utils.py @@ -2,7 +2,7 @@ from pathlib import Path -def is_audio(filename: Path) -> bool: +def is_supported_audio_format(filename: Path) -> bool: """ Check if a given file is a supported audio file based on its extension. @@ -17,7 +17,7 @@ def is_audio(filename: Path) -> bool: True if the file has an extension that matches a supported audio format, False otherwise. """ - return filename.suffix in SUPPORTED_AUDIO_FORMAT + return filename.suffix.lower() in SUPPORTED_AUDIO_FORMAT def get_all_audio_files(directory: Path) -> list[Path]: diff --git a/src/OSmOSE/utils/timestamp_utils.py b/src/OSmOSE/utils/timestamp_utils.py index dd3cce2a..f63215a3 100644 --- a/src/OSmOSE/utils/timestamp_utils.py +++ b/src/OSmOSE/utils/timestamp_utils.py @@ -1,8 +1,28 @@ +from collections.abc import Generator from datetime import datetime, timedelta import pandas as pd -from typing import List +from typing import List, Iterable import os +from OSmOSE.config import TIMESTAMP_FORMAT_AUDIO_FILE +from pandas import Timestamp +import re + +_REGEX_BUILDER = { + "%Y": "([12][0-9]{3})", + "%y": "([0-9]{2})", + "%m": "(0[1-9]|1[0-2])", + "%d": "([0-2][0-9]|3[0-1])", + "%H": "([0-1][0-9]|2[0-4])", + "%I": "(0[1-9]|1[0-2])", + "%p": "(AM|PM)", + "%M": "([0-5][0-9])", + "%S": "([0-5][0-9])", + "%f": "([0-9]{6})", + "%Z": "((?:\\w+)(?:[-/]\\w+)*(?:[\\+-]\\d+)?)", + "%z": "([\\+-]\\d{4})", +} + def check_epoch(df): "Function that adds epoch column to dataframe" @@ -62,8 +82,165 @@ def to_timestamp(string: str) -> pd.Timestamp: ) -def from_timestamp(date: pd.Timestamp) -> str: - return date.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + date.strftime("%z") +def strftime_osmose_format(date: pd.Timestamp) -> str: + """ + Format a pandas Timestamp using strftime() and the OSmOSE time format %Y-%m-%dT%H:%M:%S.%f%z, with %f limited to a millisecond precision. + If the input Timestamp is not localized, its localization will be defaulted as UTC. + + Parameters + ---------- + date: pandas.Timestamp + The Timestamp to format + + Returns + ------- + str: + The formatted Timestamp + + Examples + -------- + >>> strftime_osmose_format(Timestamp('2024-10-17 10:14:11.933634', tz="US/Eastern")) + '2024-10-17T10:14:11.933-0400' + """ + if date.tz is None: + date = date.tz_localize("UTC") + + str_time = date.strftime(TIMESTAMP_FORMAT_AUDIO_FILE) + str_time = ( + str_time[:-8] + str_time[-5:] + ) # Changes microsecond precision to millisecond precision + + return str_time + + +def build_regex_from_datetime_template(datetime_template: str) -> str: + """ + Builds the regular expression that is used to parse a Timestamp from a string following the given datetime strftime template + + Parameters + ---------- + datetime_template: str + A datetime template string using strftime codes + + Returns + ------- + str + A regex that can be used to parse a Timestamp from a string following the given datetime strftime template + + Examples + -------- + >>> build_regex_from_datetime_template('year_%Y_hour_%H') + 'year_([12][0-9]{3})_hour_([0-1][0-9]|2[0-4])' + """ + + escaped_characters = "()" + for escaped in escaped_characters: + datetime_template = datetime_template.replace(escaped, f"\\{escaped}") + for key, value in _REGEX_BUILDER.items(): + datetime_template = datetime_template.replace(key, value) + return datetime_template + + +def is_datetime_template_valid(datetime_template: str) -> bool: + """ + Checks the validity of a datetime template string. A datetame template string is used to extract a timestamp from a given string: 'year_%Y' is a valid datetame template for extracting '2016' from the 'year_2016' string. + A datetime template string should use valid strftime codes (see https://strftime.org/). + + Parameters + ---------- + datetime_template: str + The datetime template + + Returns + ------- + bool: + True if datetime_template is valid (only uses supported strftime codes), False otherwise + + Examples + -------- + >>> is_datetime_template_valid('year_%Y_hour_%H') + True + >>> is_datetime_template_valid('unsupported_code_%Z_hour_%H') + False + """ + strftime_identifiers = [key.lstrip("%") for key in _REGEX_BUILDER.keys()] + percent_sign_indexes = ( + index for index, char in enumerate(datetime_template) if char == "%" + ) + for index in percent_sign_indexes: + if index == len(datetime_template) - 1: + return False + if datetime_template[index + 1] not in strftime_identifiers: + return False + return True + + +def strptime_from_text(text: str, datetime_template: str) -> Timestamp: + """ + Extract a pandas.Timestamp from the text input string following the datetime_template specified. + + Parameters + ---------- + text: str + The text in which the timestamp should be extracted, ex '2016_06_13_14:12.txt' + datetime_template: str + The datetime template used in the text, using strftime codes (https://strftime.org/). Example: '%y%m%d_%H:%M:%S' + + Returns + ------- + pandas.Timestamp: + The timestamp extracted from the text according to datetime_template + + Examples + -------- + >>> strptime_from_text('2016_06_13_14:12.txt', '%Y_%m_%d_%H:%M') + Timestamp('2016-06-13 14:12:00') + >>> strptime_from_text('date_12_03_21_hour_11:45:10_PM.wav', '%y_%m_%d_hour_%I:%M:%S_%p') + Timestamp('2012-03-21 23:45:10') + """ + + if not is_datetime_template_valid(datetime_template): + raise ValueError(f"{datetime_template} is not a supported strftime template") + + regex_pattern = build_regex_from_datetime_template(datetime_template) + regex_result = re.findall(regex_pattern, text) + + if not regex_result: + raise ValueError(f"{text} did not match the given {datetime_template} template") + + date_string = "".join(regex_result[0]) + cleaned_date_template = "".join( + c + datetime_template[i + 1] + for i, c in enumerate(datetime_template) + if c == "%" + ) + return pd.to_datetime(date_string, format=cleaned_date_template) + + +def associate_timestamps( + audio_files: Iterable[str], datetime_template: str +) -> pd.Series: + """ + Returns a chronologically sorted pandas series containing the audio files as indexes and the extracted timestamp as values. + + Parameters + ---------- + audio_files: Iterable[str] + files from which the timestamps should be extracted. They must share a same datetime format. + datetime_template: str + The datetime template used in filename, using strftime codes (https://strftime.org/). Example: '%y%m%d_%H:%M:%S' + + Returns + ------- + pandas.Series + A series with the audio files names as index and the extracted timestamps as values. + """ + files_with_timestamps = { + file: strptime_from_text(file, datetime_template) for file in audio_files + } + series = pd.Series(data=files_with_timestamps, name="timestamp") + series.index.name = "filename" + return series.sort_values().reset_index() def get_timestamps( diff --git a/tests/test_audio_utils.py b/tests/test_audio_utils.py new file mode 100644 index 00000000..e725ffb0 --- /dev/null +++ b/tests/test_audio_utils.py @@ -0,0 +1,22 @@ +import pytest +from pathlib import Path +from OSmOSE.utils.audio_utils import * + + +@pytest.mark.unit +@pytest.mark.parametrize( + "filepath, expected_output", + [ + (Path("audio.wav"), True), + (Path("audio_with_date_2024_02_14.wav"), True), + (Path("parent_folder/audio.wav"), True), + (Path("audio.flac"), True), + (Path("audio.WAV"), True), + (Path("audio.FLAC"), True), + (Path("audio.mp3"), False), + (Path("audio.MP3"), False), + (Path("audio.pdf"), False), + ], +) +def test_supported_audio_formats(filepath: Path, expected_output: bool): + assert is_supported_audio_format(filepath) == expected_output diff --git a/tests/test_create_timestamps.py b/tests/test_create_timestamps.py deleted file mode 100755 index 75c4d988..00000000 --- a/tests/test_create_timestamps.py +++ /dev/null @@ -1,20 +0,0 @@ -import OSmOSE.timestamps as tm -import re -import pytest - - -@pytest.mark.unit -def test_convert_template_to_re(): - raw_all = "".join(tm.__converter.keys()) - simple_template = "%Y/%m/%d" - simple_text = "sample_file_2017/02/24.txt" - invalid_simple_text = "sample_file_2049/25/01" - complex_template = "y_%Y-m_%m, %I%p." - complex_text = " y_2017-m_02, 11AM%" - - assert tm.convert_template_to_re(raw_all) == "".join(tm.__converter.values()) - simple_res = tm.convert_template_to_re(simple_template) - assert re.search(simple_res, simple_text)[0] == "2017/02/24" - assert re.search(simple_res, invalid_simple_text) == None - complex_res = tm.convert_template_to_re(complex_template) - assert re.search(complex_res, complex_text)[0] == "y_2017-m_02, 11AM%" diff --git a/tests/test_timestamp_utils.py b/tests/test_timestamp_utils.py new file mode 100644 index 00000000..9271b934 --- /dev/null +++ b/tests/test_timestamp_utils.py @@ -0,0 +1,340 @@ +import pytest +from OSmOSE.utils.timestamp_utils import * + + +@pytest.mark.unit +@pytest.mark.parametrize( + "datetime_template, expected", + [ + pytest.param("%y%m%d%H%M%S", True, id="simple_pattern"), + pytest.param("%Y%y%m%d%H%M%S%I%p%f", True, id="all_strftime_codes"), + pytest.param("%y %m %d %H %M %S", True, id="spaces_separated_codes"), + pytest.param("%y:%m:%d%H.%M%S", True, id="special_chars_separated_codes"), + pytest.param("%y%z%d%H%X%S", False, id="%X_is_wrong_strftime_code"), + pytest.param("%y%z%d%H%%%S", False, id="%%_is_wrong_strftime_code"), + pytest.param("%y%m%d_at_%H%M%S", True, id="alpha_letters_separated_codes"), + pytest.param("%y%m%d%H%M%S%", False, id="trailing_%_is_wrong_strftime_code"), + pytest.param("%y%m%d%H%M%S%z", True, id="utc_offset"), + pytest.param("%y%m%d%H%M%S_%Z", True, id="timezone_name"), + ], +) +def test_is_datetime_template_valid(datetime_template, expected): + assert is_datetime_template_valid(datetime_template) == expected + + +@pytest.mark.unit +@pytest.mark.parametrize( + "datetime_template, expected", + [ + pytest.param( + "%y%m%d%H%M%S", + "([0-9]{2})(0[1-9]|1[0-2])([0-2][0-9]|3[0-1])([0-1][0-9]|2[0-4])([0-5][0-9])([0-5][0-9])", + id="simple_pattern", + ), + pytest.param( + "%Y%y%m%d%H%M%S%I%p%f", + "([12][0-9]{3})([0-9]{2})(0[1-9]|1[0-2])([0-2][0-9]|3[0-1])([0-1][0-9]|2[0-4])([0-5][0-9])([0-5][0-9])(0[1-9]|1[0-2])(AM|PM)([0-9]{6})", + id="all_strftime_codes", + ), + pytest.param( + "%y %m %d %H %M %S", + "([0-9]{2}) (0[1-9]|1[0-2]) ([0-2][0-9]|3[0-1]) ([0-1][0-9]|2[0-4]) ([0-5][0-9]) ([0-5][0-9])", + id="spaces_separated_codes", + ), + pytest.param( + "%y:%m:%d%H.%M%S", + "([0-9]{2}):(0[1-9]|1[0-2]):([0-2][0-9]|3[0-1])([0-1][0-9]|2[0-4]).([0-5][0-9])([0-5][0-9])", + id="special_chars_separated_codes", + ), + pytest.param( + "%y%m%d_at_%H%M%S", + "([0-9]{2})(0[1-9]|1[0-2])([0-2][0-9]|3[0-1])_at_([0-1][0-9]|2[0-4])([0-5][0-9])([0-5][0-9])", + id="alpha_letters_separated_codes", + ), + pytest.param( + "{%y}%m%d(%H%M%S)", + "{([0-9]{2})}(0[1-9]|1[0-2])([0-2][0-9]|3[0-1])\(([0-1][0-9]|2[0-4])([0-5][0-9])([0-5][0-9])\)", + id="parentheses_separated_codes", + ), + pytest.param( + "%y%m%d%z", + "([0-9]{2})(0[1-9]|1[0-2])([0-2][0-9]|3[0-1])([\+-]\d{4})", + id="utc_offset", + ), + pytest.param( + "%y%m%d%Z", + "([0-9]{2})(0[1-9]|1[0-2])([0-2][0-9]|3[0-1])((?:\w+)(?:[-/]\w+)*(?:[\+-]\d+)?)", + id="timezone_name", + ), + ], +) +def test_build_regex_from_datetime_template(datetime_template: str, expected: str): + assert build_regex_from_datetime_template(datetime_template) == expected + + +@pytest.mark.integ +@pytest.mark.parametrize( + "text, datetime_template, expected", + [ + pytest.param( + "7189.230405144906.wav", + "%y%m%d%H%M%S", + Timestamp("2023-04-05 14:49:06"), + id="plain_pattern", + ), + pytest.param( + "7189.(230405)/(144906).wav", + "(%y%m%d)/(%H%M%S)", + Timestamp("2023-04-05 14:49:06"), + id="escaped_parentheses", + ), + pytest.param( + "7189.23_04_05_14:49:06.wav", + "%y_%m_%d_%H:%M:%S", + Timestamp("2023-04-05 14:49:06"), + id="special_characters", + ), + pytest.param( + "7189.230405_at_144906.wav", + "%y%m%d_at_%H%M%S", + Timestamp("2023-04-05 14:49:06"), + id="alpha_letters", + ), + pytest.param( + "7189.202323040514490602PM000010.wav", + "%Y%y%m%d%H%M%S%I%p%f", + Timestamp("2023-04-05 14:49:06.000010"), + id="full_pattern", + ), + pytest.param( + "7189.230405144906+0200.wav", + "%y%m%d%H%M%S%z", + Timestamp("2023-04-05 14:49:06+0200"), + id="utc_positive_offset", + ), + pytest.param( + "7189.230405144906-0200.wav", + "%y%m%d%H%M%S%z", + Timestamp("2023-04-05 14:49:06-0200"), + id="utc_negative_offset", + ), + pytest.param( + "7189.230405144906+020012.wav", + "%y%m%d%H%M%S%z", + Timestamp("2023-04-05 14:49:06+0200"), + id="utc_offset_with_seconds", + ), + pytest.param( + "7189.230405144906+020012.123456.wav", + "%y%m%d%H%M%S%z", + Timestamp("2023-04-05 14:49:06+0200"), + id="utc_offset_with_microseconds", + ), + pytest.param( + "7189.230405144906_Japan.wav", + "%y%m%d%H%M%S_%Z", + Timestamp("2023-04-05 14:49:06+0900", tz="UTC+09:00"), + id="timezone_name", + ), + pytest.param( + "7189.230405144906_Japan.wav", + "%y%m%d%H%M%S", + Timestamp("2023-04-05 14:49:06"), + id="unspecified_timezone_name_doesnt_count", + ), + pytest.param( + "7189.230405144906_Europe/Isle_of_Man.wav", + "%y%m%d%H%M%S_%Z", + Timestamp("2023-04-05 14:49:06+0100", tz="UTC+01:00"), + id="timezone_name_with_special_chars", + ), + pytest.param( + "7189.230405144906_America/North_Dakota/New_Salem.wav", + "%y%m%d%H%M%S_%Z", + Timestamp("2023-04-05 14:49:06-0500", tz="UTC-05:00"), + id="timezone_name_with_double_slash", + ), + pytest.param( + "7189.230405144906_Etc/GMT+2.wav", + "%y%m%d%H%M%S_%Z", + Timestamp("2023-04-05 14:49:06-0200", tz="UTC-02:00"), + id="timezone_name_with_offset", + ), + ], +) +def test_strptime_from_text(text: str, datetime_template: str, expected: Timestamp): + assert strptime_from_text(text, datetime_template) == expected + + +@pytest.mark.integ +@pytest.mark.parametrize( + "text, datetime_template, expected", + [ + pytest.param( + "7189.230405144906.wav", + "%y%m%d%H%M%%S", + pytest.raises( + ValueError, match="%y%m%d%H%M%%S is not a supported strftime template" + ), + id="%%_is_wrong_strftime_code", + ), + pytest.param( + "7189.230405144906.wav", + "%y%m%d%H%M%P%S", + pytest.raises( + ValueError, match="%y%m%d%H%M%P%S is not a supported strftime template" + ), + id="%P_is_wrong_strftime_code", + ), + pytest.param( + "7189.230405144906.wav", + "%y%m%d%H%M%52%S", + pytest.raises( + ValueError, match="%y%m%d%H%M%52%S is not a supported strftime template" + ), + id="%5_is_wrong_strftime_code", + ), + pytest.param( + "7189.230405_144906.wav", + "%y%m%d%H%M%S", + pytest.raises( + ValueError, + match="7189.230405_144906.wav did not match the given %y%m%d%H%M%S template", + ), + id="unmatching_pattern", + ), + pytest.param( + "7189.230405144906.wav", + "%Y%m%d%H%M%S", + pytest.raises( + ValueError, + match="7189.230405144906.wav did not match the given %Y%m%d%H%M%S template", + ), + id="%Y_should_have_4_digits", + ), + pytest.param( + "7189.230405146706.wav", + "%y%m%d%H%M%S", + pytest.raises( + ValueError, + match="7189.230405146706.wav did not match the given %y%m%d%H%M%S template", + ), + id="%M_should_be_lower_than_60", + ), + pytest.param( + "7189.230405146706_0200.wav", + "%y%m%d%H%M%S_%z", + pytest.raises(ValueError), + id="incorrect_timezone_offset", + ), + pytest.param( + "7189.230405146706_+2500.wav", + "%y%m%d%H%M%S_%z", + pytest.raises(ValueError), + id="out_of_range_timezone_offset", + ), + pytest.param( + "7189.230405146706_Brest.wav", + "%y%m%d%H%M%S_%Z", + pytest.raises(ValueError), + id="incorrect_timezone_name", + ), + ], +) +def test_strptime_from_text(text: str, datetime_template: str, expected: Timestamp): + with expected as e: + assert strptime_from_text(text, datetime_template) == e + + +@pytest.fixture +def correct_series(): + series = pd.Series( + { + "something2345_2012_06_24__16:32:10.wav": pd.Timestamp( + "2012-06-24 16:32:10" + ), + "something2345_2023_07_28__08:24:50.flac": pd.Timestamp( + "2023-07-28 08:24:50" + ), + "something2345_2024_01_01__23:12:11.WAV": pd.Timestamp( + "2024-01-01 23:12:11" + ), + "something2345_2024_02_02__02:02:02.FLAC": pd.Timestamp( + "2024-02-02 02:02:02" + ), + }, + name="timestamp", + ) + series.index.name = "filename" + return series.reset_index() + + +@pytest.mark.integ +def test_associate_timestamps(correct_series): + input_files = list(correct_series["filename"]) + assert associate_timestamps((i for i in input_files), "%Y_%m_%d__%H:%M:%S").equals( + correct_series + ) + + +@pytest.mark.integ +def test_associate_timestamps_error_with_incorrect_datetime_format(correct_series): + input_files = list(correct_series["filename"]) + mismatching_datetime_format = "%Y%m%d__%H:%M:%S" + incorrect_datetime_format = "%y%m%d%H%M%P%S" + + with pytest.raises( + ValueError, + match=f"{input_files[0]} did not match the given {mismatching_datetime_format} template", + ) as e: + assert e == associate_timestamps( + (i for i in input_files), mismatching_datetime_format + ) + + with pytest.raises( + ValueError, + match=f"{incorrect_datetime_format} is not a supported strftime template", + ): + assert e == associate_timestamps( + (i for i in input_files), incorrect_datetime_format + ) + + +@pytest.mark.unit +@pytest.mark.parametrize( + "timestamp, expected", + [ + pytest.param( + Timestamp("2024-10-17 10:14:11.933+0000"), + "2024-10-17T10:14:11.933+0000", + id="timestamp_with_timezone", + ), + pytest.param( + Timestamp("2024-10-17 10:14:11+0000"), + "2024-10-17T10:14:11.000+0000", + id="increase_precision_to_millisecond", + ), + pytest.param( + Timestamp("2024-10-17 10:14:11.933384+0000"), + "2024-10-17T10:14:11.933+0000", + id="reduce_precision_to_millisecond", + ), + pytest.param( + Timestamp("2024-10-17 10:14:11.933293"), + "2024-10-17T10:14:11.933+0000", + id="no_timezone_defaults_to_utc", + ), + pytest.param( + Timestamp("2024-10-17 10:14:11.933-0400"), + "2024-10-17T10:14:11.933-0400", + id="delta_timezone", + ), + pytest.param( + Timestamp("2024-10-17 10:14:11.933", tz="US/Eastern"), + "2024-10-17T10:14:11.933-0400", + id="str_timezone", + ), + ], +) +def test_strftime_osmose_format(timestamp: Timestamp, expected: str): + assert strftime_osmose_format(timestamp) == expected