diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2ccaba9a..82e1a380 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,17 @@
 # Changelog
-## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.8.1]
+## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.9.0]
+This version adds a couple of new ALF functions.
+
+### Added
+
+- one.alf.io.find_variants allows one to find similar datasets on disk, such as revisions
+- one.alf.files.without_revision returns a file path without the revision folder
+
+### Modified
+
+- one.alf.files.add_uuid_string will now replace a UUID in a filename if one is already present.
+
+## [2.8.1]
 
 ### Modified
 
diff --git a/one/__init__.py b/one/__init__.py
index 12e82f81..1fedcb5c 100644
--- a/one/__init__.py
+++ b/one/__init__.py
@@ -1,2 +1,2 @@
 """The Open Neurophysiology Environment (ONE) API."""
-__version__ = '2.8.1'
+__version__ = '2.9.0'
diff --git a/one/alf/files.py b/one/alf/files.py
index 73c040db..bf13ad4a 100644
--- a/one/alf/files.py
+++ b/one/alf/files.py
@@ -407,9 +407,13 @@ def add_uuid_string(file_path, uuid):
     if isinstance(file_path, str):
         file_path = Path(file_path)
     name_parts = file_path.stem.split('.')
-    if uuid == name_parts[-1]:
-        _logger.warning(f'UUID already found in file name: {file_path.name}: IGNORE')
-        return file_path
+    if spec.is_uuid(name_parts[-1]):
+        *name_parts, old_uuid = name_parts
+        if old_uuid == uuid:
+            _logger.warning(f'UUID already found in file name: {file_path.name}: IGNORE')
+            return file_path
+        else:
+            _logger.debug('Replacing %s with %s in %s', old_uuid, uuid, file_path)
     return file_path.parent.joinpath(f"{'.'.join(name_parts)}.{uuid}{file_path.suffix}")
 
 
@@ -448,13 +452,13 @@ def remove_uuid_string(file_path):
     return file_path
 
 
-def padded_sequence(filepath):
+def padded_sequence(file_path):
     """
     Ensures a file path contains a zero-padded experiment sequence folder.
 
     Parameters
     ----------
-    filepath : str, pathlib.Path, pathlib.PurePath
+    file_path : str, pathlib.Path, pathlib.PurePath
         A session or file path to convert.
 
     Returns
@@ -465,20 +469,45 @@ def padded_sequence(filepath):
 
     Examples
     --------
-    >>> filepath = '/iblrigdata/subject/2023-01-01/1/_ibl_experiment.description.yaml'
-    >>> padded_sequence(filepath)
+    >>> file_path = '/iblrigdata/subject/2023-01-01/1/_ibl_experiment.description.yaml'
+    >>> padded_sequence(file_path)
     pathlib.Path('/iblrigdata/subject/2023-01-01/001/_ibl_experiment.description.yaml')
 
     Supports folders and will not affect already padded paths
 
     >>> session_path = pathlib.PurePosixPath('subject/2023-01-01/001')
-    >>> padded_sequence(filepath)
+    >>> padded_sequence(file_path)
     pathlib.PurePosixPath('subject/2023-01-01/001')
     """
-    if isinstance(filepath, str):
-        filepath = Path(filepath)
-    if (session_path := get_session_path(filepath)) is None:
+    if isinstance(file_path, str):
+        file_path = Path(file_path)
+    if (session_path := get_session_path(file_path)) is None:
         raise ValueError('path must include a valid ALF session path, e.g. subject/YYYY-MM-DD/N')
-    idx = len(filepath.parts) - len(session_path.parts)
+    idx = len(file_path.parts) - len(session_path.parts)
     sequence = str(int(session_path.parts[-1])).zfill(3)  # zero-pad if necessary
-    return filepath.parents[idx].joinpath(sequence, filepath.relative_to(session_path))
+    return file_path.parents[idx].joinpath(sequence, file_path.relative_to(session_path))
+
+
+def without_revision(file_path):
+    """
+    Return file path without a revision folder.
+
+    Parameters
+    ----------
+    file_path : str, pathlib.Path
+        A valid ALF dataset path.
+
+    Returns
+    -------
+    pathlib.Path
+        The input file path without a revision folder.
+
+    Examples
+    --------
+    >>> without_revision('/lab/Subjects/subject/2023-01-01/001/collection/#revision#/obj.attr.ext')
+    Path('/lab/Subjects/subject/2023-01-01/001/collection/obj.attr.ext')
+    """
+    if isinstance(file_path, str):
+        file_path = Path(file_path)
+    *_, collection, revision = folder_parts(file_path.parent)
+    return get_session_path(file_path).joinpath(*filter(None, (collection, file_path.name)))
diff --git a/one/alf/io.py b/one/alf/io.py
index 174ddb5f..ed7dab7c 100644
--- a/one/alf/io.py
+++ b/one/alf/io.py
@@ -14,6 +14,8 @@
 from fnmatch import fnmatch
 from pathlib import Path
 from typing import Union
+from functools import partial
+from itertools import chain
 import warnings
 
 import numpy as np
@@ -342,7 +344,7 @@ def _ls(alfpath, object=None, **kwargs) -> (list, tuple):
         An ALF object name to filter by
     wildcards : bool
         If true uses unix shell style pattern matching, otherwise uses regular expressions
-    **kwargs
+    kwargs
        Other ALF parts to filter, including namespace, attribute, etc.
 
     Returns
@@ -446,7 +448,7 @@ def exists(alfpath, object, attributes=None, **kwargs) -> bool:
         Wanted attributes
     wildcards : bool
         If true uses unix shell style pattern matching, otherwise uses regular expressions
-    **kwargs
+    kwargs
        Other ALF parts to filter by
 
     Returns
@@ -496,7 +498,7 @@ def load_object(alfpath, object=None, short_keys=False, **kwargs):
         and timescale.
     wildcards : bool
         If true uses unix shell style pattern matching, otherwise uses regular expressions.
-    **kwargs
+    kwargs
         Other ALF parts to filter by.
 
     Returns
@@ -832,3 +834,85 @@ def _match(part, pattern, split=None):
             break
 
     return alf_files, [tuple(attr.values()) for attr in attributes]
+
+
+def find_variants(file_list, namespace=True, timescale=True, extra=True, extension=True):
+    """
+    Find variant datasets.
+
+    Finds any datasets on disk that are considered a variant of the input datasets. At minimum, a
+    dataset is uniquely defined by session path, collection, object and attribute. Therefore,
+    datasets with the same name and collection in a different revision folder are considered a
+    variant. If any of the keyword arguments are set to False, those parts are ignored when
+    comparing datasets.
+
+    Parameters
+    ----------
+    file_list : list of str, list of pathlib.Path
+        A list of ALF paths to find variants of.
+    namespace : bool
+        If true, treat datasets with a different namespace as unique.
+    timescale : bool
+        If true, treat datasets with a different timescale as unique.
+    extra : bool
+        If true, treat datasets with different extra parts as unique.
+    extension : bool
+        If true, treat datasets with a different extension as unique.
+
+    Returns
+    -------
+    Dict[pathlib.Path, list of pathlib.Path]
+        A map of input file paths to a list of variant dataset paths.
+
+    Raises
+    ------
+    ValueError
+        One or more input file paths are not valid ALF datasets.
+
+    Examples
+    --------
+    Find all datasets with an identical name and collection in a different revision folder
+
+    >>> find_variants(['/sub/2020-10-01/001/alf/#2020-01-01#/obj.attr.npy'])
+    {Path('/sub/2020-10-01/001/alf/#2020-01-01#/obj.attr.npy'): [
+        Path('/sub/2020-10-01/001/alf/obj.attr.npy')
+    ]}
+
+    Find all datasets with different namespace or revision
+
+    >>> find_variants(['/sub/2020-10-01/001/alf/#2020-01-01#/obj.attr.npy'], namespace=False)
+    {Path('/sub/2020-10-01/001/alf/#2020-01-01#/obj.attr.npy'): [
+        Path('/sub/2020-10-01/001/alf/#2020-01-01#/_ns_obj.attr.npy'),
+        Path('/sub/2020-10-01/001/alf/obj.attr.npy'),
+    ]}
+
+    """
+    # Parse into individual ALF parts
+    to_parts_dict = partial(files.full_path_parts, as_dict=True)
+    uParts = map(to_parts_dict, file_list)
+    # Initialize map of unique files to their duplicates
+    duplicates = {}
+    # Determine which parts to filter
+    variables = locals()
+    filters = {'namespace', 'timescale', 'extra', 'extension'}
+    to_compare = ('lab', 'subject', 'date', 'number', 'collection', 'object', 'attribute',
+                  *(arg for arg in filters if variables[arg]))
+
+    def parts_match(parts, file):
+        """Compare a file's unique parts to a given file."""
+        other = to_parts_dict(file)
+        return all(parts[k] == other[k] for k in to_compare)
+
+    # iterate over unique files and their parts
+    for f, parts in zip(map(Path, file_list), uParts):
+        # first glob for files matching object.attribute (including revisions)
+        pattern = f'*{parts["object"]}.{parts["attribute"]}*'
+        # this works because revision will always be the last folder;
+        # i.e. revisions can't contain collections
+        globbed = map(files.without_revision(f).parent.glob, (pattern, '#*#/' + pattern))
+        globbed = chain.from_iterable(globbed)  # unite revision and non-revision globs
+        # refine duplicates based on other parts (this also ensures we don't catch similar objects)
+        globbed = filter(partial(parts_match, parts), globbed)
+        # key = f.relative_to(one.alf.files.get_session_path(f)).as_posix()
+        duplicates[f] = [x for x in globbed if x != f]  # map file to list of its duplicates
+    return duplicates
diff --git a/one/registration.py b/one/registration.py
index 11fbce14..ae85eb31 100644
--- a/one/registration.py
+++ b/one/registration.py
@@ -410,13 +410,13 @@ def prepare_files(self, file_list, versions=None):
         Returns
         -------
         list of dicts
-            A dict containing a list of files for each session
+            A dict containing a list of files for each session.
         list of dicts
-            A dict containg a list of versions for each session
+            A dict containing a list of versions for each session.
         list
-            A list of files converted to paths
+            A list of files converted to paths.
         bool
-            A boolean indicating if input was a single file
+            A boolean indicating if input was a single file.
""" F = defaultdict(list) # empty map whose keys will be session paths diff --git a/one/tests/alf/test_alf_files.py b/one/tests/alf/test_alf_files.py index 87eee4f6..c0dbbcb5 100644 --- a/one/tests/alf/test_alf_files.py +++ b/one/tests/alf/test_alf_files.py @@ -160,6 +160,12 @@ def test_add_uuid(self): self.assertEqual(tup[1], files.add_uuid_string(tup[0], _uuid)) self.assertEqual(tup[1], files.add_uuid_string(tup[0], str(_uuid))) + _uuid2 = uuid.uuid4() + with self.assertLogs(files.__name__, level=10) as cm: + expected = Path(f'/titi/tutu.part1.part1.{_uuid2}.json') + self.assertEqual(expected, files.add_uuid_string(file_with_uuid, _uuid2)) + self.assertRegex(cm.output[0], 'Replacing [a-f0-9-]+ with [a-f0-9-]+') + with self.assertRaises(ValueError): files.add_uuid_string('/foo/bar.npy', 'fake') @@ -225,6 +231,18 @@ def test_get_alf_path(self): path = '/trials.intervals_bpod.npy' self.assertEqual(files.get_alf_path(path), 'trials.intervals_bpod.npy') + def test_without_revision(self): + """Test for one.alf.files.without_revision function.""" + path = '/mnt/s0/Data/Subjects/ZM_1368/2019-04-19/001/alf/#2020-01-01#/obj.attr.ext' + out = files.without_revision(path) + expected = Path(path.replace('/#2020-01-01#', '')) + self.assertIsInstance(out, Path) + self.assertEqual(expected, out, 'failed to remove revision folder') + self.assertEqual(expected, files.without_revision(out)) # should do nothing to path + with self.assertRaises(ValueError) as cm: + files.without_revision('foo/bar/baz.npy') + self.assertRegex(str(cm.exception), 'Invalid ALF') + if __name__ == "__main__": unittest.main(exit=False, verbosity=2) diff --git a/one/tests/alf/test_alf_io.py b/one/tests/alf/test_alf_io.py index f4f4b160..21e56e0d 100644 --- a/one/tests/alf/test_alf_io.py +++ b/one/tests/alf/test_alf_io.py @@ -711,5 +711,51 @@ def test_iter_datasets(self): self.assertEqual([Path(*dset.parts[-2:])], ses_files) +class TestFindVariants(unittest.TestCase): + + def setUp(self): + tmp = tempfile.TemporaryDirectory() + self.tmp = Path(tmp.name) + self.addCleanup(tmp.cleanup) + + # Create tree + self.session_path = self.tmp / 'subject' / '2020-01-01' / '001' + self.dsets = [ + self.session_path.joinpath('_x_foo.bar.npy'), + self.session_path.joinpath('#2021-01-01#', 'foo.bar.npy'), + self.session_path.joinpath(f'bar.baz.{uuid.uuid4()}.npy'), + self.session_path.joinpath(f'bar.baz_y.{uuid.uuid4()}.npy'), + self.session_path.joinpath('#2021-01-01#', f'bar.baz.{uuid.uuid4()}.npy'), + self.session_path.joinpath('task_00', 'x.y.z'), + self.session_path.joinpath('x.y.z'), + ] + for f in self.dsets: + f.parent.mkdir(exist_ok=True, parents=True) + f.touch() + + def test_unique(self): + """Test for one.alf.io.find_variants function.""" + dupes = alfio.find_variants(self.dsets) + self.assertCountEqual(self.dsets, dupes.keys(), 'expected keys to match input files') + self.assertFalse(any(map(any, dupes.values())), 'expected no duplicates') + + # With extra=False should treat files with extra parts as a variant + dupes = alfio.find_variants(self.dsets, extra=False) + # 'bar.baz.abc.npy' is a variant of '#revision#/bar.baz.def.npy' and vice versa + self.assertEqual(dupes[self.dsets[2]], [self.dsets[4]]) + self.assertEqual(dupes[self.dsets[4]], [self.dsets[2]]) + # Expect all other datasets to be considered unique + others = [v for k, v in dupes.items() if k not in (self.dsets[2], self.dsets[4])] + self.assertFalse(any(map(any, others))) + + # Treat other file parts as variants + files = [self.dsets[0], self.dsets[2], self.dsets[-1]] + 
+        dupes = alfio.find_variants(files, namespace=False, timescale=False, extra=False)
+        expected_files = (self.dsets[1:2], self.dsets[3:5], [])  # expected variants for each file
+        for key, expected in zip(files, expected_files):
+            with self.subTest(key=key):
+                self.assertCountEqual(dupes[self.session_path.joinpath(key)], expected)
+
 
 if __name__ == '__main__':
     unittest.main(exit=False, verbosity=2)
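
For reference, a minimal usage sketch of the helpers introduced by this diff; the session path, dataset name and revision below are hypothetical, and find_variants only reports variants that actually exist on disk:

    import uuid
    from pathlib import Path
    from one.alf.files import add_uuid_string, without_revision
    from one.alf.io import find_variants

    # Hypothetical ALF dataset path inside a revision folder
    dset = Path('/lab/Subjects/subject/2023-01-01/001/alf/#2023-06-01#/obj.attr.npy')

    # Strip the revision folder from a dataset path (new in 2.9.0)
    print(without_revision(dset))  # .../2023-01-01/001/alf/obj.attr.npy

    # add_uuid_string now swaps an existing UUID rather than warning and returning early
    renamed = add_uuid_string(dset.with_name(f'obj.attr.{uuid.uuid4()}.npy'), uuid.uuid4())

    # Map each input dataset to any revision (and, here, namespace) variants found on disk
    variants = find_variants([dset], namespace=False)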