def find_date_in_path(
    path,
    regex_date_formats=(
        r'\d{4}\d{2}\d{2}',  # 20220203
        r'\d{4}\D\d{2}\D\d{2}',  # 2022_02_03
        r'\d{2}\D\d{2}\D\d{4}',  # 02_03_2022
        r'\d{1}\D\d{1}\D\d{4}',  # 2_3_2022
        r'\d{1}\D\d{2}\D\d{4}',  # 2_03_2022
        r'\d{2}\D\d{1}\D\d{4}',  # 02_3_2022
        r'\d{2}\D\d{2}\D\d{2}',  # 02_03_22
        r'\d{1}\D\d{1}\D\d{2}',  # 2_3_22
        r'\d{1}\D\d{2}\D\d{2}',  # 2_03_22
        r'\d{2}\D\d{1}\D\d{2}',  # 02_3_22
    ),
    reverse_path_order=True,
):
    """
    Searches a file or directory path for a date string matching one of several
    regex patterns and returns the first match.

    The path is split into its components (``pathlib.Path(path).parts``). The
    components are visited in order (last component first when
    ``reverse_path_order`` is True), and within each component the patterns
    are tried in the order given. The first pattern that matches a component
    decides the result; if it matches several times in that component, the
    last of those matches is returned.

    RH 2024

    Args:
        path (str):
            The file or directory path in which to search for a date.
        regex_date_formats (Union[str, Sequence[str]]):
            A regex pattern or a sequence of regex patterns to match against
            parts of the path.\n
            Search goes in order of the sequence and stops at the first match.\n
            A pattern with exactly one capture group yields that group (the
            ``re.findall`` convention); any other pattern yields the full
            matched text.
            (Default is a tuple of common date formats)
        reverse_path_order (bool):
            If True, search from the end of the path backwards.

    Returns:
        str or None:
            The first matching date string found, or None if no match is found.
    """
    ## Function-scope imports so the function does not depend on what the
    ## enclosing module happens to import.
    import re
    from pathlib import Path

    ## Accept a single pattern string as well as any iterable of patterns.
    if isinstance(regex_date_formats, str):
        regex_date_formats = [regex_date_formats]
    else:
        regex_date_formats = list(regex_date_formats)

    ## Split the path into components and optionally reverse the order of search.
    parts = Path(path).parts
    if reverse_path_order:
        parts = parts[::-1]

    ## (prefix, suffix) padding added around every pattern, tried in this order.
    ## r'\D' requires one non-digit character directly before/after the date;
    ## that extra character is sliced off again before returning.
    ## NOTE(review): the unpadded pass runs first, so for ordinary patterns the
    ## padded passes look unable to find anything new -- confirm whether the
    ## intended order was strictest-first.
    paddings = (
        (r'', r''),
        (r'\D', r''),
        (r'', r'\D'),
        (r'\D', r'\D'),
    )

    for prefix, suffix in paddings:
        ## Compile once per padding instead of once per path component.
        patterns = [re.compile(prefix + regex + suffix) for regex in regex_date_formats]
        ## Each r'\D' consumes exactly one character.
        n_lead = 1 if prefix else 0
        n_trail = 1 if suffix else 0

        for part in parts:
            for pattern in patterns:
                if pattern.groups == 1:
                    ## Single capture group: keep re.findall semantics and
                    ## return the captured text untouched.
                    hits = pattern.findall(part)
                    if hits:
                        return hits[-1]
                    continue

                ## Zero or several groups: use the full match. (re.findall
                ## would return tuples for multi-group patterns, which cannot
                ## be returned as a date string.)
                hits = [match.group(0) for match in pattern.finditer(part)]
                if hits:
                    ## Return the last match found in the current part, minus
                    ## the padding characters.
                    hit = hits[-1]
                    return hit[n_lead:len(hit) - n_trail]

    return None
from ..path_helpers import find_date_in_path


## Each case is (path, expected_date, reverse_path_order).
_KNOWN_DATE_CASES = [
    (r"/home/user/documents/20220203/report.txt", "20220203", True),
    (r"/home/user/docs/2022_02_03/report.pdf", "2022_02_03", True),
    (r"/home/02_03_22/data.txt", "02_03_22", True),
    (r"/2022-02-03/home/data.txt", "2022-02-03", True),
    (r"/home/data_2_3_2022.txt", "2_3_2022", True),
    (r"/home/docs/data_2-03-22.txt", "2-03-22", True),
    (r"C:\home\docs\data_2-03-22.txt", "2-03-22", True),
    (r"/home/docs/19900101/data_2-03-22.txt", "2-03-22", True),
    (r"/home/docs/19900101/data_2-03-22.txt", "19900101", False),
    (r"/home/docs/_19900101/data_2-03-22.txt", "19900101", False),
    (r"/home/docs/19900101_/data_2-03-22.txt", "19900101", False),
    (r"/home/docs/_19900101_/data_2-03-22.txt", "19900101", False),
    (r"/home/docs/_19900101_/data_020322.txt", "19900101", False),
    (r"/home/docs/_19900101_/data_20230322.txt", "19900101", False),
    (r"/home/docs/_19900101_/data_20230322.txt", "20230322", True),
    (r"/home/docs/_1990010_/data_2023032.txt", None, True),
    (r"/home/docs/_1990010_/data_0 32.txt", None, True),
    (r"/home/docs/_/data_.txt", None, True),
]


@pytest.mark.parametrize("path,expected_date,reverse_path_order", _KNOWN_DATE_CASES)
def test_known_dates_in_path(path, expected_date, reverse_path_order):
    """
    Checks find_date_in_path against hand-written paths, covering both search
    directions and paths for which no date is expected (None).
    """
    found = find_date_in_path(path, reverse_path_order=reverse_path_order)
    assert found == expected_date