diff --git a/CHANGELOG.md b/CHANGELOG.md index 9aa9d944..691cf9b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog -## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.10.1] +## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.11.0] +This version deprecates one.alf.files in preparation for replacing with one.alf.path in version 3. + +### Modified + +- one.alf.files has been deprecated and moved to one.alf.path + +## [2.10.1] ### Modified @@ -9,7 +16,7 @@ - HOTFIX: include Subject/lab part in destination path when downloading from S3 ## [2.10.0] -This version improves behaviour of loading revisions and loading datasets from list_datasets output. +This version improves behaviour of loading revisions and loading datasets from list_datasets output. ### Modified @@ -42,7 +49,7 @@ This version improves behaviour of loading revisions and loading datasets from l ## [2.9.0] This version adds a couple of new ALF functions. -### Added +### Added - one.alf.io.find_variants allows one to find similar datasets on disk, such as revisions - one.alf.files.without_revision returns a file path without the revision folder @@ -64,7 +71,7 @@ This version of ONE adds support for loading .npz files. - one.alf.io.load_file_content loads npz files and returns only array if single compressed array with default name of 'arr_0'. - log warning when instantiating RegistrationClient with AlyxClient REST cache active -- bugfix in load_collection when one or more files missing +- bugfix in load_collection when one or more files missing ## [2.7.0] This version of ONE adds support for Alyx 2.0.0 and pandas 3.0.0 with dataset QC filters. This version no longer supports 'data' search filter. 
@@ -238,7 +245,7 @@ This version of ONE adds support for Alyx 2.0.0 and pandas 3.0.0 with dataset QC ### Modified -- HOTFIX: AWS S3 resource returned unsigned if 'public' in bucket name and no credentials on Alyx +- HOTFIX: AWS S3 resource returned unsigned if 'public' in bucket name and no credentials on Alyx ## [1.21.2] @@ -320,7 +327,7 @@ This version of ONE adds support for Alyx 2.0.0 and pandas 3.0.0 with dataset QC ### Added -- registration client checks for protected datasets and moves them to new revision when registering files +- registration client checks for protected datasets and moves them to new revision when registering files ### Removed @@ -380,7 +387,7 @@ This version of ONE adds support for Alyx 2.0.0 and pandas 3.0.0 with dataset QC - bugfix: username now passed to parameter setup routine - in silent mode if token invalid the user is logged out to remove old token from param file - root_dir now optional input for session_record2path -- do not check for recent cache upon instantiation in remote mode +- do not check for recent cache upon instantiation in remote mode ## [1.13.0] @@ -480,7 +487,7 @@ This version of ONE adds support for Alyx 2.0.0 and pandas 3.0.0 with dataset QC ### Modified - HOTFIX: OneAlyx._download_datasets deals gracefully with empty datasets frame -- removed try-assert-catch logic from One._download_datasets to improve error stack +- removed try-assert-catch logic from One._download_datasets to improve error stack ## [1.8.0] @@ -602,7 +609,7 @@ This version of ONE adds support for Alyx 2.0.0 and pandas 3.0.0 with dataset QC - runtime warning when remote list count changes ### Added -- alf.io function to return valid session paths within a directory +- alf.io function to return valid session paths within a directory ## [1.2.1] @@ -785,7 +792,7 @@ This version of ONE adds support for Alyx 2.0.0 and pandas 3.0.0 with dataset QC ### Removed - removed load method - + ### Modified - support for multiple configured databases in params 
diff --git a/one/__init__.py b/one/__init__.py index 442d51f1..71fa4429 100644 --- a/one/__init__.py +++ b/one/__init__.py @@ -1,2 +1,2 @@ """The Open Neurophysiology Environment (ONE) API.""" -__version__ = '2.10.1' +__version__ = '2.11.0' diff --git a/one/alf/cache.py b/one/alf/cache.py index 8fccaa95..69cff605 100644 --- a/one/alf/cache.py +++ b/one/alf/cache.py @@ -28,7 +28,7 @@ from iblutil.io.hashfile import md5 from one.alf.io import iter_sessions, iter_datasets -from one.alf.files import session_path_parts, get_alf_path +from one.alf.path import session_path_parts, get_alf_path from one.converters import session_record2path from one.util import QC_TYPE, patch_cache diff --git a/one/alf/files.py b/one/alf/files.py index bf13ad4a..3bd442a1 100644 --- a/one/alf/files.py +++ b/one/alf/files.py @@ -1,513 +1,11 @@ """ -Module for identifying and parsing ALF file names. - -An ALF file has the following components (those in brackets are optional): - (_namespace_)object.attribute(_timescale)(.extra.parts).ext - -Note the following: - Object attributes may not contain an underscore unless followed by 'times' or 'intervals'. - A namespace must not contain extra underscores (i.e. `name_space` and `__namespace__` are not - valid). - ALF files must always have an extension. - -For more information, see the following documentation: - https://int-brain-lab.github.io/ONE/alf_intro.html +(DEPRECATED) Module for identifying and parsing ALF file names. +This module has moved to :mod:`one.alf.path`. """ -from collections import OrderedDict -from datetime import datetime -from typing import Union, Optional -from pathlib import Path -import logging - -from . import spec -from .spec import SESSION_SPEC, COLLECTION_SPEC, FILE_SPEC, REL_PATH_SPEC - -_logger = logging.getLogger(__name__) - - -def rel_path_parts(rel_path, as_dict=False, assert_valid=True): - """Parse a relative path into the relevant parts. 
- - A relative path follows the pattern - (collection/)(#revision#/)_namespace_object.attribute_timescale.extra.extension - - Parameters - ---------- - rel_path : str, pathlib.Path - A relative path string. - as_dict : bool - If true, an OrderedDict of parts are returned with the keys ('lab', 'subject', 'date', - 'number'), otherwise a tuple of values are returned. - assert_valid : bool - If true a ValueError is raised when the session cannot be parsed, otherwise an empty - dict of tuple of Nones is returned. - - Returns - ------- - OrderedDict, tuple - A dict if as_dict is true, or a tuple of parsed values. - """ - return _path_parts(rel_path, REL_PATH_SPEC, True, as_dict, assert_valid) - - -def session_path_parts(session_path, as_dict=False, assert_valid=True): - """Parse a session path into the relevant parts. - - Return keys: - - lab - - subject - - date - - number - - Parameters - ---------- - session_path : str, pathlib.Path - A session path string. - as_dict : bool - If true, an OrderedDict of parts are returned with the keys ('lab', 'subject', 'date', - 'number'), otherwise a tuple of values are returned. - assert_valid : bool - If true a ValueError is raised when the session cannot be parsed, otherwise an empty - dict of tuple of Nones is returned. - - Returns - ------- - OrderedDict, tuple - A dict if as_dict is true, or a tuple of parsed values. - - Raises - ------ - ValueError - Invalid ALF session path (assert_valid is True). - """ - return _path_parts(session_path, SESSION_SPEC, False, as_dict, assert_valid) - - -def _path_parts(path, spec_str, match=True, as_dict=False, assert_valid=True): - """Given a ALF and a spec string, parse into parts. - - Parameters - ---------- - path : str, pathlib.Path - An ALF path or dataset. - match : bool - If True, string must match exactly, otherwise search for expression within path. - as_dict : bool - When true a dict of matches is returned. 
- assert_valid : bool - When true an exception is raised when the filename cannot be parsed. - - Returns - ------- - OrderedDict, tuple - A dict if as_dict is true, or a tuple of parsed values. - - Raises - ------ - ValueError - Invalid ALF path (assert_valid is True). - """ - if hasattr(path, 'as_posix'): - path = path.as_posix() - pattern = spec.regex(spec_str) - empty = OrderedDict.fromkeys(pattern.groupindex.keys()) - parsed = (pattern.match if match else pattern.search)(path) - if parsed: # py3.8 - parsed_dict = parsed.groupdict() - return OrderedDict(parsed_dict) if as_dict else tuple(parsed_dict.values()) - elif assert_valid: - raise ValueError(f'Invalid ALF: "{path}"') - else: - return empty if as_dict else tuple(empty.values()) - - -def filename_parts(filename, as_dict=False, assert_valid=True) -> Union[dict, tuple]: - """ - Return the parsed elements of a given ALF filename. - - Parameters - ---------- - filename : str - The name of the file. - as_dict : bool - When true a dict of matches is returned. - assert_valid : bool - When true an exception is raised when the filename cannot be parsed. - - Returns - ------- - namespace : str - The _namespace_ or None if not present. - object : str - ALF object. - attribute : str - The ALF attribute. - timescale : str - The ALF _timescale or None if not present. - extra : str - Any extra parts to the filename, or None if not present. - extension : str - The file extension. 
- - Examples - -------- - >>> filename_parts('_namespace_obj.times_timescale.extra.foo.ext') - ('namespace', 'obj', 'times', 'timescale', 'extra.foo', 'ext') - >>> filename_parts('spikes.clusters.npy', as_dict=True) - {'namespace': None, - 'object': 'spikes', - 'attribute': 'clusters', - 'timescale': None, - 'extra': None, - 'extension': 'npy'} - >>> filename_parts('spikes.times_ephysClock.npy') - (None, 'spikes', 'times', 'ephysClock', None, 'npy') - >>> filename_parts('_iblmic_audioSpectrogram.frequencies.npy') - ('iblmic', 'audioSpectrogram', 'frequencies', None, None, 'npy') - >>> filename_parts('_spikeglx_ephysData_g0_t0.imec.wiring.json') - ('spikeglx', 'ephysData_g0_t0', 'imec', None, 'wiring', 'json') - >>> filename_parts('_spikeglx_ephysData_g0_t0.imec0.lf.bin') - ('spikeglx', 'ephysData_g0_t0', 'imec0', None, 'lf', 'bin') - >>> filename_parts('_ibl_trials.goCue_times_bpod.csv') - ('ibl', 'trials', 'goCue_times', 'bpod', None, 'csv') - - Raises - ------ - ValueError - Invalid ALF dataset (assert_valid is True). - """ - return _path_parts(filename, FILE_SPEC, True, as_dict, assert_valid) - - -def full_path_parts(path, as_dict=False, assert_valid=True) -> Union[dict, tuple]: - """Parse all filename and folder parts. - - Parameters - ---------- - path : str, pathlib.Path. - The ALF path - as_dict : bool - When true a dict of matches is returned. - assert_valid : bool - When true an exception is raised when the filename cannot be parsed. - - Returns - ------- - OrderedDict, tuple - A dict if as_dict is true, or a tuple of parsed values. - - Examples - -------- - >>> full_path_parts( - ... 'lab/Subjects/subject/2020-01-01/001/collection/#revision#/' - ... 
'_namespace_obj.times_timescale.extra.foo.ext') - ('lab', 'subject', '2020-01-01', '001', 'collection', 'revision', - 'namespace', 'obj', 'times','timescale', 'extra.foo', 'ext') - >>> full_path_parts('spikes.clusters.npy', as_dict=True) - {'lab': None, - 'subject': None, - 'date': None, - 'number': None, - 'collection': None, - 'revision': None, - 'namespace': None, - 'object': 'spikes', - 'attribute': 'clusters', - 'timescale': None, - 'extra': None, - 'extension': 'npy'} - - Raises - ------ - ValueError - Invalid ALF path (assert_valid is True). - """ - path = Path(path) - # NB We try to determine whether we have a folder or filename path. Filenames contain at - # least two periods, however it is currently permitted to have any number of periods in a - # collection, making the ALF path ambiguous. - if sum(x == '.' for x in path.name) < 2: # folder only - folders = folder_parts(path, as_dict, assert_valid) - dataset = filename_parts('', as_dict, assert_valid=False) - elif '/' not in path.as_posix(): # filename only - folders = folder_parts('', as_dict, assert_valid=False) - dataset = filename_parts(path.name, as_dict, assert_valid) - else: # full filepath - folders = folder_parts(path.parent, as_dict, assert_valid) - dataset = filename_parts(path.name, as_dict, assert_valid) - if as_dict: - return OrderedDict(**folders, **dataset) - else: - return folders + dataset - - -def folder_parts(folder_path, as_dict=False, assert_valid=True) -> Union[dict, tuple]: - """Parse all folder parts, including session, collection and revision. - - Parameters - ---------- - folder_path : str, pathlib.Path - The ALF folder path. - as_dict : bool - When true a dict of matches is returned. - assert_valid : bool - When true an exception is raised when the filename cannot be parsed. - - Returns - ------- - OrderedDict, tuple - A dict if as_dict is true, or a tuple of parsed values. 
- - Examples - -------- - >>> folder_parts('lab/Subjects/subject/2020-01-01/001/collection/#revision#') - ('lab', 'subject', '2020-01-01', '001', 'collection', 'revision') - >>> folder_parts(Path('lab/Subjects/subject/2020-01-01/001'), as_dict=True) - {'lab': 'lab', - 'subject': 'subject', - 'date': '2020-01-01', - 'number': '001', - 'collection': None, - 'revision': None} - - Raises - ------ - ValueError - Invalid ALF path (assert_valid is True). - """ - if hasattr(folder_path, 'as_posix'): - folder_path = folder_path.as_posix() - if folder_path and folder_path[-1] != '/': # Slash required for regex pattern - folder_path = folder_path + '/' - spec_str = f'{SESSION_SPEC}/{COLLECTION_SPEC}' - return _path_parts(folder_path, spec_str, False, as_dict, assert_valid) - - -def _isdatetime(s: str) -> bool: - """Returns True if input is valid ISO date string.""" - try: - datetime.strptime(s, '%Y-%m-%d') - return True - except ValueError: - return False - - -def get_session_path(path: Union[str, Path]) -> Optional[Path]: - """ - Returns the session path from any filepath if the date/number pattern is found, - including the root directory. - - Returns - ------- - pathlib.Path - The session path part of the input path or None if path invalid. - - Examples - -------- - >>> get_session_path('/mnt/sd0/Data/lab/Subjects/subject/2020-01-01/001') - Path('/mnt/sd0/Data/lab/Subjects/subject/2020-01-01/001') - - >>> get_session_path('C:\\Data\\subject\\2020-01-01\\1\\trials.intervals.npy') - Path('C:/Data/subject/2020-01-01/1') - """ - if path is None: - return - if isinstance(path, str): - path = Path(path) - sess = None - for i, p in enumerate(path.parts): - if p.isdigit() and _isdatetime(path.parts[i - 1]): - sess = Path().joinpath(*path.parts[:i + 1]) - - return sess - - -def get_alf_path(path: Union[str, Path]) -> str: - """Returns the ALF part of a path or filename. 
- Attempts to return the first valid part of the path, first searching for a session path, - then relative path (collection/revision/filename), then just the filename. If all invalid, - None is returned. - - Parameters - ---------- - path : str, pathlib.Path - A path to parse. - - Returns - ------- - str - A string containing the full ALF path, session path, relative path or filename. - - Examples - -------- - >>> get_alf_path('etc/etc/lab/Subjects/subj/2021-01-21/001') - 'lab/Subjects/subj/2021-01-21/001/collection/file.attr.ext' - - >>> get_alf_path('etc/etc/subj/2021-01-21/001/collection/file.attr.ext') - 'subj/2021-01-21/001/collection/file.attr.ext' - - >>> get_alf_path('collection/file.attr.ext') - 'collection/file.attr.ext' - """ - if not isinstance(path, str): - path = Path(path).as_posix() - path = path.strip('/') - - # Check if session path - match_session = spec.regex(SESSION_SPEC).search(path) - if match_session: - return path[match_session.start():] - - # Check if filename / relative path (i.e. collection + filename) - parts = path.rsplit('/', 1) - match_filename = spec.regex(FILE_SPEC).match(parts[-1]) - if match_filename: - return path if spec.regex(f'{COLLECTION_SPEC}{FILE_SPEC}').match(path) else parts[-1] - - -def add_uuid_string(file_path, uuid): - """ - Add a UUID to the filename of an ALF path. - - Adds a UUID to an ALF filename as an extra part, e.g. - 'obj.attr.ext' -> 'obj.attr.a976e418-c8b8-4d24-be47-d05120b18341.ext'. - - Parameters - ---------- - file_path : str, pathlib.Path, pathlib.PurePath - An ALF path to add the UUID to. - uuid : str, uuid.UUID - The UUID to add. - - Returns - ------- - pathlib.Path, pathlib.PurePath - A new Path or PurePath object with a UUID in the filename. 
- - Examples - -------- - >>> add_uuid_string('/path/to/trials.intervals.npy', 'a976e418-c8b8-4d24-be47-d05120b18341') - Path('/path/to/trials.intervals.a976e418-c8b8-4d24-be47-d05120b18341.npy') - - Raises - ------ - ValueError - `uuid` must be a valid hyphen-separated hexadecimal UUID. - - See Also - -------- - one.alf.files.remove_uuid_string - one.alf.spec.is_uuid - """ - if isinstance(uuid, str) and not spec.is_uuid_string(uuid): - raise ValueError('Should provide a valid UUID v4') - uuid = str(uuid) - # NB: Only instantiate as Path if not already a Path, otherwise we risk changing the class - if isinstance(file_path, str): - file_path = Path(file_path) - name_parts = file_path.stem.split('.') - if spec.is_uuid(name_parts[-1]): - *name_parts, old_uuid = name_parts - if old_uuid == uuid: - _logger.warning(f'UUID already found in file name: {file_path.name}: IGNORE') - return file_path - else: - _logger.debug('Replacing %s with %s in %s', old_uuid, uuid, file_path) - return file_path.parent.joinpath(f"{'.'.join(name_parts)}.{uuid}{file_path.suffix}") - - -def remove_uuid_string(file_path): - """ - Remove UUID from a filename of an ALF path. - - Parameters - ---------- - file_path : str, pathlib.Path, pathlib.PurePath - An ALF path to add the UUID to. - - Returns - ------- - pathlib.Path, pathlib.PurePath - A new Path or PurePath object without a UUID in the filename. 
- - Examples - -------- - >>> add_uuid_string('/path/to/trials.intervals.a976e418-c8b8-4d24-be47-d05120b18341.npy') - Path('/path/to/trials.intervals.npy') - - >>> add_uuid_string('/path/to/trials.intervals.npy') - Path('/path/to/trials.intervals.npy') - - See Also - -------- - one.alf.files.add_uuid_string - """ - if isinstance(file_path, str): - file_path = Path(file_path) - name_parts = file_path.stem.split('.') - - if spec.is_uuid_string(name_parts[-1]): - file_path = file_path.with_name('.'.join(name_parts[:-1]) + file_path.suffix) - return file_path - - -def padded_sequence(file_path): - """ - Ensures a file path contains a zero-padded experiment sequence folder. - - Parameters - ---------- - file_path : str, pathlib.Path, pathlib.PurePath - A session or file path to convert. - - Returns - ------- - pathlib.Path, pathlib.PurePath - The same path but with the experiment sequence folder zero-padded. If a PurePath was - passed, a PurePath will be returned, otherwise a Path object is returned. - - Examples - -------- - >>> file_path = '/iblrigdata/subject/2023-01-01/1/_ibl_experiment.description.yaml' - >>> padded_sequence(file_path) - pathlib.Path('/iblrigdata/subject/2023-01-01/001/_ibl_experiment.description.yaml') - - Supports folders and will not affect already padded paths - - >>> session_path = pathlib.PurePosixPath('subject/2023-01-01/001') - >>> padded_sequence(file_path) - pathlib.PurePosixPath('subject/2023-01-01/001') - """ - if isinstance(file_path, str): - file_path = Path(file_path) - if (session_path := get_session_path(file_path)) is None: - raise ValueError('path must include a valid ALF session path, e.g. subject/YYYY-MM-DD/N') - idx = len(file_path.parts) - len(session_path.parts) - sequence = str(int(session_path.parts[-1])).zfill(3) # zero-pad if necessary - return file_path.parents[idx].joinpath(sequence, file_path.relative_to(session_path)) - - -def without_revision(file_path): - """ - Return file path without a revision folder. 
- - Parameters - ---------- - file_path : str, pathlib.Path - A valid ALF dataset path. +import warnings - Returns - ------- - pathlib.Path - The input file path without a revision folder. +from .path import * # noqa - Examples - -------- - >>> without_revision('/lab/Subjects/subject/2023-01-01/001/collection/#revision#/obj.attr.ext') - Path('/lab/Subjects/subject/2023-01-01/001/collection/obj.attr.ext') - """ - if isinstance(file_path, str): - file_path = Path(file_path) - *_, collection, revision = folder_parts(file_path.parent) - return get_session_path(file_path).joinpath(*filter(None, (collection, file_path.name))) +warnings.warn( + '`one.alf.files` will be removed in version 3.0. Use `one.alf.path` instead.', FutureWarning) diff --git a/one/alf/io.py b/one/alf/io.py index ed7dab7c..b1ee1ab8 100644 --- a/one/alf/io.py +++ b/one/alf/io.py @@ -26,7 +26,7 @@ from iblutil.io import parquet from iblutil.io import jsonable from .exceptions import ALFObjectNotFound -from . import files, spec +from . import spec, path as files from .spec import FILE_SPEC _logger = logging.getLogger(__name__) diff --git a/one/alf/path.py b/one/alf/path.py new file mode 100644 index 00000000..bf13ad4a --- /dev/null +++ b/one/alf/path.py @@ -0,0 +1,513 @@ +""" +Module for identifying and parsing ALF file names. + +An ALF file has the following components (those in brackets are optional): + (_namespace_)object.attribute(_timescale)(.extra.parts).ext + +Note the following: + Object attributes may not contain an underscore unless followed by 'times' or 'intervals'. + A namespace must not contain extra underscores (i.e. `name_space` and `__namespace__` are not + valid). + ALF files must always have an extension. + +For more information, see the following documentation: + https://int-brain-lab.github.io/ONE/alf_intro.html + +""" +from collections import OrderedDict +from datetime import datetime +from typing import Union, Optional +from pathlib import Path +import logging + +from . 
import spec +from .spec import SESSION_SPEC, COLLECTION_SPEC, FILE_SPEC, REL_PATH_SPEC + +_logger = logging.getLogger(__name__) + + +def rel_path_parts(rel_path, as_dict=False, assert_valid=True): + """Parse a relative path into the relevant parts. + + A relative path follows the pattern + (collection/)(#revision#/)_namespace_object.attribute_timescale.extra.extension + + Parameters + ---------- + rel_path : str, pathlib.Path + A relative path string. + as_dict : bool + If true, an OrderedDict is returned with keys ('collection', 'revision', 'namespace', + 'object', 'attribute', 'timescale', 'extra', 'extension'); otherwise a tuple of values. + assert_valid : bool + If true a ValueError is raised when the path cannot be parsed, otherwise an empty + dict or tuple of Nones is returned. + + Returns + ------- + OrderedDict, tuple + A dict if as_dict is true, or a tuple of parsed values. + """ + return _path_parts(rel_path, REL_PATH_SPEC, True, as_dict, assert_valid) + + +def session_path_parts(session_path, as_dict=False, assert_valid=True): + """Parse a session path into the relevant parts. + + Return keys: + - lab + - subject + - date + - number + + Parameters + ---------- + session_path : str, pathlib.Path + A session path string. + as_dict : bool + If true, an OrderedDict of parts are returned with the keys ('lab', 'subject', 'date', + 'number'), otherwise a tuple of values are returned. + assert_valid : bool + If true a ValueError is raised when the session cannot be parsed, otherwise an empty + dict or tuple of Nones is returned. + + Returns + ------- + OrderedDict, tuple + A dict if as_dict is true, or a tuple of parsed values. + + Raises + ------ + ValueError + Invalid ALF session path (assert_valid is True). + """ + return _path_parts(session_path, SESSION_SPEC, False, as_dict, assert_valid) + + +def _path_parts(path, spec_str, match=True, as_dict=False, assert_valid=True): + """Given an ALF path and a spec string, parse into parts. 
+ + Parameters + ---------- + path : str, pathlib.Path + An ALF path or dataset. + match : bool + If True, string must match exactly, otherwise search for expression within path. + as_dict : bool + When true a dict of matches is returned. + assert_valid : bool + When true an exception is raised when the filename cannot be parsed. + + Returns + ------- + OrderedDict, tuple + A dict if as_dict is true, or a tuple of parsed values. + + Raises + ------ + ValueError + Invalid ALF path (assert_valid is True). + """ + if hasattr(path, 'as_posix'): + path = path.as_posix() + pattern = spec.regex(spec_str) + empty = OrderedDict.fromkeys(pattern.groupindex.keys()) + parsed = (pattern.match if match else pattern.search)(path) + if parsed: # py3.8 + parsed_dict = parsed.groupdict() + return OrderedDict(parsed_dict) if as_dict else tuple(parsed_dict.values()) + elif assert_valid: + raise ValueError(f'Invalid ALF: "{path}"') + else: + return empty if as_dict else tuple(empty.values()) + + +def filename_parts(filename, as_dict=False, assert_valid=True) -> Union[dict, tuple]: + """ + Return the parsed elements of a given ALF filename. + + Parameters + ---------- + filename : str + The name of the file. + as_dict : bool + When true a dict of matches is returned. + assert_valid : bool + When true an exception is raised when the filename cannot be parsed. + + Returns + ------- + namespace : str + The _namespace_ or None if not present. + object : str + ALF object. + attribute : str + The ALF attribute. + timescale : str + The ALF _timescale or None if not present. + extra : str + Any extra parts to the filename, or None if not present. + extension : str + The file extension. 
+ + Examples + -------- + >>> filename_parts('_namespace_obj.times_timescale.extra.foo.ext') + ('namespace', 'obj', 'times', 'timescale', 'extra.foo', 'ext') + >>> filename_parts('spikes.clusters.npy', as_dict=True) + {'namespace': None, + 'object': 'spikes', + 'attribute': 'clusters', + 'timescale': None, + 'extra': None, + 'extension': 'npy'} + >>> filename_parts('spikes.times_ephysClock.npy') + (None, 'spikes', 'times', 'ephysClock', None, 'npy') + >>> filename_parts('_iblmic_audioSpectrogram.frequencies.npy') + ('iblmic', 'audioSpectrogram', 'frequencies', None, None, 'npy') + >>> filename_parts('_spikeglx_ephysData_g0_t0.imec.wiring.json') + ('spikeglx', 'ephysData_g0_t0', 'imec', None, 'wiring', 'json') + >>> filename_parts('_spikeglx_ephysData_g0_t0.imec0.lf.bin') + ('spikeglx', 'ephysData_g0_t0', 'imec0', None, 'lf', 'bin') + >>> filename_parts('_ibl_trials.goCue_times_bpod.csv') + ('ibl', 'trials', 'goCue_times', 'bpod', None, 'csv') + + Raises + ------ + ValueError + Invalid ALF dataset (assert_valid is True). + """ + return _path_parts(filename, FILE_SPEC, True, as_dict, assert_valid) + + +def full_path_parts(path, as_dict=False, assert_valid=True) -> Union[dict, tuple]: + """Parse all filename and folder parts. + + Parameters + ---------- + path : str, pathlib.Path. + The ALF path + as_dict : bool + When true a dict of matches is returned. + assert_valid : bool + When true an exception is raised when the filename cannot be parsed. + + Returns + ------- + OrderedDict, tuple + A dict if as_dict is true, or a tuple of parsed values. + + Examples + -------- + >>> full_path_parts( + ... 'lab/Subjects/subject/2020-01-01/001/collection/#revision#/' + ... 
'_namespace_obj.times_timescale.extra.foo.ext') + ('lab', 'subject', '2020-01-01', '001', 'collection', 'revision', + 'namespace', 'obj', 'times','timescale', 'extra.foo', 'ext') + >>> full_path_parts('spikes.clusters.npy', as_dict=True) + {'lab': None, + 'subject': None, + 'date': None, + 'number': None, + 'collection': None, + 'revision': None, + 'namespace': None, + 'object': 'spikes', + 'attribute': 'clusters', + 'timescale': None, + 'extra': None, + 'extension': 'npy'} + + Raises + ------ + ValueError + Invalid ALF path (assert_valid is True). + """ + path = Path(path) + # NB We try to determine whether we have a folder or filename path. Filenames contain at + # least two periods, however it is currently permitted to have any number of periods in a + # collection, making the ALF path ambiguous. + if sum(x == '.' for x in path.name) < 2: # folder only + folders = folder_parts(path, as_dict, assert_valid) + dataset = filename_parts('', as_dict, assert_valid=False) + elif '/' not in path.as_posix(): # filename only + folders = folder_parts('', as_dict, assert_valid=False) + dataset = filename_parts(path.name, as_dict, assert_valid) + else: # full filepath + folders = folder_parts(path.parent, as_dict, assert_valid) + dataset = filename_parts(path.name, as_dict, assert_valid) + if as_dict: + return OrderedDict(**folders, **dataset) + else: + return folders + dataset + + +def folder_parts(folder_path, as_dict=False, assert_valid=True) -> Union[dict, tuple]: + """Parse all folder parts, including session, collection and revision. + + Parameters + ---------- + folder_path : str, pathlib.Path + The ALF folder path. + as_dict : bool + When true a dict of matches is returned. + assert_valid : bool + When true an exception is raised when the filename cannot be parsed. + + Returns + ------- + OrderedDict, tuple + A dict if as_dict is true, or a tuple of parsed values. 
+ + Examples + -------- + >>> folder_parts('lab/Subjects/subject/2020-01-01/001/collection/#revision#') + ('lab', 'subject', '2020-01-01', '001', 'collection', 'revision') + >>> folder_parts(Path('lab/Subjects/subject/2020-01-01/001'), as_dict=True) + {'lab': 'lab', + 'subject': 'subject', + 'date': '2020-01-01', + 'number': '001', + 'collection': None, + 'revision': None} + + Raises + ------ + ValueError + Invalid ALF path (assert_valid is True). + """ + if hasattr(folder_path, 'as_posix'): + folder_path = folder_path.as_posix() + if folder_path and folder_path[-1] != '/': # Slash required for regex pattern + folder_path = folder_path + '/' + spec_str = f'{SESSION_SPEC}/{COLLECTION_SPEC}' + return _path_parts(folder_path, spec_str, False, as_dict, assert_valid) + + +def _isdatetime(s: str) -> bool: + """Returns True if input is valid ISO date string.""" + try: + datetime.strptime(s, '%Y-%m-%d') + return True + except ValueError: + return False + + +def get_session_path(path: Union[str, Path]) -> Optional[Path]: + """ + Returns the session path from any filepath if the date/number pattern is found, + including the root directory. + + Returns + ------- + pathlib.Path + The session path part of the input path or None if path invalid. + + Examples + -------- + >>> get_session_path('/mnt/sd0/Data/lab/Subjects/subject/2020-01-01/001') + Path('/mnt/sd0/Data/lab/Subjects/subject/2020-01-01/001') + + >>> get_session_path('C:\\Data\\subject\\2020-01-01\\1\\trials.intervals.npy') + Path('C:/Data/subject/2020-01-01/1') + """ + if path is None: + return + if isinstance(path, str): + path = Path(path) + sess = None + for i, p in enumerate(path.parts): + if p.isdigit() and _isdatetime(path.parts[i - 1]): + sess = Path().joinpath(*path.parts[:i + 1]) + + return sess + + +def get_alf_path(path: Union[str, Path]) -> str: + """Returns the ALF part of a path or filename. 
+ Attempts to return the first valid part of the path, first searching for a session path, + then relative path (collection/revision/filename), then just the filename. If all invalid, + None is returned. + + Parameters + ---------- + path : str, pathlib.Path + A path to parse. + + Returns + ------- + str + A string containing the full ALF path, session path, relative path or filename. + + Examples + -------- + >>> get_alf_path('etc/etc/lab/Subjects/subj/2021-01-21/001/collection/file.attr.ext') + 'lab/Subjects/subj/2021-01-21/001/collection/file.attr.ext' + + >>> get_alf_path('etc/etc/subj/2021-01-21/001/collection/file.attr.ext') + 'subj/2021-01-21/001/collection/file.attr.ext' + + >>> get_alf_path('collection/file.attr.ext') + 'collection/file.attr.ext' + """ + if not isinstance(path, str): + path = Path(path).as_posix() + path = path.strip('/') + + # Check if session path + match_session = spec.regex(SESSION_SPEC).search(path) + if match_session: + return path[match_session.start():] + + # Check if filename / relative path (i.e. collection + filename) + parts = path.rsplit('/', 1) + match_filename = spec.regex(FILE_SPEC).match(parts[-1]) + if match_filename: + return path if spec.regex(f'{COLLECTION_SPEC}{FILE_SPEC}').match(path) else parts[-1] + + +def add_uuid_string(file_path, uuid): + """ + Add a UUID to the filename of an ALF path. + + Adds a UUID to an ALF filename as an extra part, e.g. + 'obj.attr.ext' -> 'obj.attr.a976e418-c8b8-4d24-be47-d05120b18341.ext'. + + Parameters + ---------- + file_path : str, pathlib.Path, pathlib.PurePath + An ALF path to add the UUID to. + uuid : str, uuid.UUID + The UUID to add. + + Returns + ------- + pathlib.Path, pathlib.PurePath + A new Path or PurePath object with a UUID in the filename. 
+ + Examples + -------- + >>> add_uuid_string('/path/to/trials.intervals.npy', 'a976e418-c8b8-4d24-be47-d05120b18341') + Path('/path/to/trials.intervals.a976e418-c8b8-4d24-be47-d05120b18341.npy') + + Raises + ------ + ValueError + `uuid` must be a valid hyphen-separated hexadecimal UUID. + + See Also + -------- + one.alf.files.remove_uuid_string + one.alf.spec.is_uuid + """ + if isinstance(uuid, str) and not spec.is_uuid_string(uuid): + raise ValueError('Should provide a valid UUID v4') + uuid = str(uuid) + # NB: Only instantiate as Path if not already a Path, otherwise we risk changing the class + if isinstance(file_path, str): + file_path = Path(file_path) + name_parts = file_path.stem.split('.') + if spec.is_uuid(name_parts[-1]): + *name_parts, old_uuid = name_parts + if old_uuid == uuid: + _logger.warning(f'UUID already found in file name: {file_path.name}: IGNORE') + return file_path + else: + _logger.debug('Replacing %s with %s in %s', old_uuid, uuid, file_path) + return file_path.parent.joinpath(f"{'.'.join(name_parts)}.{uuid}{file_path.suffix}") + + +def remove_uuid_string(file_path): + """ + Remove UUID from a filename of an ALF path. + + Parameters + ---------- + file_path : str, pathlib.Path, pathlib.PurePath + An ALF path to add the UUID to. + + Returns + ------- + pathlib.Path, pathlib.PurePath + A new Path or PurePath object without a UUID in the filename. 
+ + Examples + -------- + >>> add_uuid_string('/path/to/trials.intervals.a976e418-c8b8-4d24-be47-d05120b18341.npy') + Path('/path/to/trials.intervals.npy') + + >>> add_uuid_string('/path/to/trials.intervals.npy') + Path('/path/to/trials.intervals.npy') + + See Also + -------- + one.alf.files.add_uuid_string + """ + if isinstance(file_path, str): + file_path = Path(file_path) + name_parts = file_path.stem.split('.') + + if spec.is_uuid_string(name_parts[-1]): + file_path = file_path.with_name('.'.join(name_parts[:-1]) + file_path.suffix) + return file_path + + +def padded_sequence(file_path): + """ + Ensures a file path contains a zero-padded experiment sequence folder. + + Parameters + ---------- + file_path : str, pathlib.Path, pathlib.PurePath + A session or file path to convert. + + Returns + ------- + pathlib.Path, pathlib.PurePath + The same path but with the experiment sequence folder zero-padded. If a PurePath was + passed, a PurePath will be returned, otherwise a Path object is returned. + + Examples + -------- + >>> file_path = '/iblrigdata/subject/2023-01-01/1/_ibl_experiment.description.yaml' + >>> padded_sequence(file_path) + pathlib.Path('/iblrigdata/subject/2023-01-01/001/_ibl_experiment.description.yaml') + + Supports folders and will not affect already padded paths + + >>> session_path = pathlib.PurePosixPath('subject/2023-01-01/001') + >>> padded_sequence(file_path) + pathlib.PurePosixPath('subject/2023-01-01/001') + """ + if isinstance(file_path, str): + file_path = Path(file_path) + if (session_path := get_session_path(file_path)) is None: + raise ValueError('path must include a valid ALF session path, e.g. subject/YYYY-MM-DD/N') + idx = len(file_path.parts) - len(session_path.parts) + sequence = str(int(session_path.parts[-1])).zfill(3) # zero-pad if necessary + return file_path.parents[idx].joinpath(sequence, file_path.relative_to(session_path)) + + +def without_revision(file_path): + """ + Return file path without a revision folder. 
+ + Parameters + ---------- + file_path : str, pathlib.Path + A valid ALF dataset path. + + Returns + ------- + pathlib.Path + The input file path without a revision folder. + + Examples + -------- + >>> without_revision('/lab/Subjects/subject/2023-01-01/001/collection/#revision#/obj.attr.ext') + Path('/lab/Subjects/subject/2023-01-01/001/collection/obj.attr.ext') + """ + if isinstance(file_path, str): + file_path = Path(file_path) + *_, collection, revision = folder_parts(file_path.parent) + return get_session_path(file_path).joinpath(*filter(None, (collection, file_path.name))) diff --git a/one/api.py b/one/api.py index cfa9d3ea..d9f467a5 100644 --- a/one/api.py +++ b/one/api.py @@ -25,7 +25,7 @@ import one.params import one.webclient as wc import one.alf.io as alfio -import one.alf.files as alfiles +import one.alf.path as alfiles import one.alf.exceptions as alferr from .alf.cache import make_parquet_db, DATASETS_COLUMNS, SESSIONS_COLUMNS from .alf.spec import is_uuid_string, QC, to_alf diff --git a/one/converters.py b/one/converters.py index 20b80ea7..abbc8415 100644 --- a/one/converters.py +++ b/one/converters.py @@ -19,7 +19,7 @@ from iblutil.util import Bunch from one.alf.spec import is_session_path, is_uuid_string -from one.alf.files import get_session_path, add_uuid_string, session_path_parts, get_alf_path +from one.alf.path import get_session_path, add_uuid_string, session_path_parts, get_alf_path from .util import Listable diff --git a/one/registration.py b/one/registration.py index 900a813e..58588da4 100644 --- a/one/registration.py +++ b/one/registration.py @@ -26,7 +26,7 @@ from iblutil.util import Bunch, ensure_list import one.alf.io as alfio -from one.alf.files import session_path_parts, get_session_path, folder_parts, filename_parts +from one.alf.path import session_path_parts, get_session_path, folder_parts, filename_parts from one.alf.spec import is_valid import one.alf.exceptions as alferr from one.api import ONE diff --git 
class TestFilesDeprecation(unittest.TestCase):
    """Check that using the legacy one.alf.files module emits a FutureWarning."""

    @staticmethod
    def _some_function(p):
        """Import the legacy module and parse `p` with it; return None when `p` is falsy."""
        import one.alf.files
        if not p:
            return None
        return one.alf.files.full_path_parts(p)

    def test_deprecation(self):
        """Calling a function that imports one.alf.files should raise a FutureWarning."""
        # NOTE(review): presumably the warning is raised at import time; if so this only
        # triggers when one.alf.files has not already been imported - confirm.
        with self.assertWarns(FutureWarning):
            self._some_function(None)
import util, OFFLINE_ONLY, TEST_DB_2 diff --git a/one/util.py b/one/util.py index 056da194..d7a878a0 100644 --- a/one/util.py +++ b/one/util.py @@ -16,7 +16,7 @@ from packaging import version import one.alf.exceptions as alferr -from one.alf.files import rel_path_parts, get_session_path, get_alf_path, remove_uuid_string +from one.alf.path import rel_path_parts, get_session_path, get_alf_path, remove_uuid_string from one.alf.spec import QC, FILE_SPEC, regex as alf_regex logger = logging.getLogger(__name__)