Skip to content

Commit

Permalink
Add find_date_in_path function to path_helpers.py
Browse files Browse the repository at this point in the history
  • Loading branch information
RichieHakim committed Feb 23, 2024
1 parent 9fda388 commit e88c288
Show file tree
Hide file tree
Showing 2 changed files with 113 additions and 1 deletion.
85 changes: 84 additions & 1 deletion bnpm/path_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,4 +251,87 @@ def fix_spaces_in_unix_path(path):
Unix path with spaces replaced by r'\ '
"""
from pathlib import Path
return Path(path).as_posix().replace(' ', r'\ ')
return Path(path).as_posix().replace(' ', r'\ ')


def find_date_in_path(
    path,
    regex_date_formats=[
        r'\d{4}\d{2}\d{2}',  # 20220203
        r'\d{4}\D\d{2}\D\d{2}',  # 2022_02_03
        r'\d{2}\D\d{2}\D\d{4}',  # 02_03_2022
        r'\d{1}\D\d{1}\D\d{4}',  # 2_3_2022
        r'\d{1}\D\d{2}\D\d{4}',  # 2_03_2022
        r'\d{2}\D\d{1}\D\d{4}',  # 02_3_2022
        r'\d{2}\D\d{2}\D\d{2}',  # 02_03_22
        r'\d{1}\D\d{1}\D\d{2}',  # 2_3_22
        r'\d{1}\D\d{2}\D\d{2}',  # 2_03_22
        r'\d{2}\D\d{1}\D\d{2}',  # 02_3_22
    ],
    reverse_path_order=True,
):
    """
    Searches the components of a file or directory path for a date string
    matching one of several regex patterns and returns the first hit.
    RH 2024

    The path is split with ``pathlib.Path(path).parts``. Components are
    visited one at a time; within a component the patterns are tried in list
    order and the first pattern that matches wins. If that pattern matches
    several times inside the component, the LAST occurrence is returned.
    Patterns are searched unanchored, so the date may be surrounded by other
    characters (e.g. the component ``_19900101_`` yields ``19900101``).

    Args:
        path (str or Path):
            The file or directory path in which to search for a date.
        regex_date_formats (List[str] or str):
            Regex pattern(s) to match against each component of the path.
            A single string is treated as a one-element list. Search goes in
            order of the list and stops at the first match. If a pattern
            contains exactly one capture group, the text of that group is
            returned; otherwise the whole match is returned.
            (Default is a list of common date formats. The default list is
            only read, never mutated.)
        reverse_path_order (bool):
            If True, search from the last component of the path backwards.
            If False, search from the first component forwards.

    Returns:
        str or None:
            The matching date string, or None if no component matches any
            pattern.
    """
    ## Imported locally so the function does not depend on module-level
    ## imports (same approach as fix_spaces_in_unix_path).
    import re
    from pathlib import Path

    ## Allow a single pattern string in place of a list of patterns.
    if isinstance(regex_date_formats, str):
        regex_date_formats = [regex_date_formats]

    ## Split the path into components and optionally reverse the search order.
    parts = Path(path).parts
    if reverse_path_order:
        parts = parts[::-1]

    for part in parts:
        for regex in regex_date_formats:
            ## re.compile is cached by the re module, so compiling lazily here
            ## is cheap and a later invalid pattern only raises if reached.
            pattern = re.compile(regex)

            ## Walk all non-overlapping matches and keep the last one.
            match = None
            for match in pattern.finditer(part):
                pass
            if match is None:
                continue

            ## Work with the match object rather than re.findall output, so
            ## patterns with several capture groups still yield a string.
            if pattern.groups == 1:
                ## Single capture group: return the group text ('' when the
                ## group did not participate), as re.findall would.
                return match.group(1) or ''
            return match.group(0)

    return None
29 changes: 29 additions & 0 deletions bnpm/tests/test_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
import numpy as np
import scipy.signal

import pytest
import hypothesis

from .. import math_functions

from ..optimization import Convergence_checker
Expand All @@ -12,6 +15,8 @@

from ..clustering import cluster_similarity_matrices

from ..path_helpers import find_date_in_path



def test_Convergence_checker():
Expand Down Expand Up @@ -199,3 +204,27 @@ def test_cluster_similarity_matrices():
assert np.allclose(cs_mean, cs_mean_expected, atol=1e-6)
assert np.allclose(cs_max, cs_max_expected, atol=1e-6)
assert np.allclose(cs_min, cs_min_expected, atol=1e-6)


@pytest.mark.parametrize(
    ("path", "expected_date", "reverse_path_order"),
    [
        ## One date in the path, in various formats and positions.
        (r"/home/user/documents/20220203/report.txt", "20220203", True),
        (r"/home/user/docs/2022_02_03/report.pdf", "2022_02_03", True),
        (r"/home/02_03_22/data.txt", "02_03_22", True),
        (r"/2022-02-03/home/data.txt", "2022-02-03", True),
        (r"/home/data_2_3_2022.txt", "2_3_2022", True),
        (r"/home/docs/data_2-03-22.txt", '2-03-22', True),
        (r"C:\home\docs\data_2-03-22.txt", '2-03-22', True),
        ## Two dates in the path: the search direction decides which one wins.
        (r"/home/docs/19900101/data_2-03-22.txt", '2-03-22', True),
        (r"/home/docs/19900101/data_2-03-22.txt", '19900101', False),
        ## Non-digit characters next to the date are not part of the result.
        (r"/home/docs/_19900101/data_2-03-22.txt", '19900101', False),
        (r"/home/docs/19900101_/data_2-03-22.txt", '19900101', False),
        (r"/home/docs/_19900101_/data_2-03-22.txt", '19900101', False),
        (r"/home/docs/_19900101_/data_020322.txt", '19900101', False),
        (r"/home/docs/_19900101_/data_20230322.txt", '19900101', False),
        (r"/home/docs/_19900101_/data_20230322.txt", '20230322', True),
        ## Nothing in the path matches any default pattern.
        (r"/home/docs/_1990010_/data_2023032.txt", None, True),
        (r"/home/docs/_1990010_/data_0 32.txt", None, True),
        (r"/home/docs/_/data_.txt", None, True),
    ],
)
def test_known_dates_in_path(path, expected_date, reverse_path_order):
    """
    Checks that find_date_in_path, using its default patterns, returns the
    expected date string (or None) for each known path and search direction.
    """
    found_date = find_date_in_path(path, reverse_path_order=reverse_path_order)
    assert found_date == expected_date

0 comments on commit e88c288

Please sign in to comment.