diff --git a/CHANGELOG.md b/CHANGELOG.md
index af4e2530..ac88ec2b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,15 @@
 # Changelog
-## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.4.0]
+## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.5.0]
+
+### Added
+
+- One.uuid_filenames attribute supports local files containing the dataset UUID
+
+### Modified
+
+- One._save_cache lock file timeout changed from 30 to 5 seconds
+
+## [2.4.0]
 
 ### Added
diff --git a/one/__init__.py b/one/__init__.py
index 1810db5d..083e474f 100644
--- a/one/__init__.py
+++ b/one/__init__.py
@@ -1,2 +1,2 @@
 """The Open Neurophysiology Environment (ONE) API."""
-__version__ = '2.4.0'
+__version__ = '2.5.0'
diff --git a/one/api.py b/one/api.py
index f1c9fcf4..b76f3d8b 100644
--- a/one/api.py
+++ b/one/api.py
@@ -44,6 +44,9 @@ class One(ConversionMixin):
         'dataset', 'date_range', 'laboratory', 'number', 'projects', 'subject', 'task_protocol'
     )
 
+    uuid_filenames = None
+    """bool: whether datasets on disk have a UUID in their filename"""
+
     def __init__(self, cache_dir=None, mode='auto', wildcards=True, tables_dir=None):
         """An API for searching and loading data on a local filesystem
@@ -72,6 +75,8 @@ def __init__(self, cache_dir=None, mode='auto', wildcards=True, tables_dir=None)
         self.mode = mode
         self.wildcards = wildcards  # Flag indicating whether to use regex or wildcards
         self.record_loaded = False
+        # assign attribute here as different instances may work on separate filesystems
+        self.uuid_filenames = False
         # init the cache file
         self._reset_cache()
         self.load_cache()
@@ -177,7 +182,7 @@ def _save_cache(self, save_dir=None, force=False):
         force : bool
             If True, the cache is saved regardless of modification time.
         """
-        TIMEOUT = 30  # Delete lock file this many seconds after creation/modification or waiting
+        TIMEOUT = 5  # Delete lock file this many seconds after creation/modification or waiting
         lock_file = Path(self.cache_dir).joinpath('.cache.lock')
         save_dir = Path(save_dir or self.cache_dir)
         meta = self._cache['_meta']
@@ -569,6 +574,8 @@ def _check_filesystem(self, datasets, offline=None, update_exists=True):
         # First go through datasets and check if file exists and hash matches
         for i, rec in datasets.iterrows():
             file = Path(self.cache_dir, *rec[['session_path', 'rel_path']])
+            if self.uuid_filenames:
+                file = alfiles.add_uuid_string(file, i[1] if isinstance(i, tuple) else i)
             if file.exists():
                 # Check if there's a hash mismatch
                 # If so, add this index to list of datasets that need downloading
@@ -2138,23 +2145,29 @@ def _download_datasets(self, dsets, **kwargs) -> List[Path]:
         """
-        Download a single or multitude of datasets if stored on AWS, otherwise calls
-        OneAlyx._download_dataset.
+        Download one or more datasets from AWS if stored there, otherwise calls
+        OneAlyx._download_dataset.
 
-        NB: This will not skip files that are already present. Use check_filesystem instead.
+        NB: This will not skip files that are already present. Use check_filesystem instead.
 
-        Parameters
-        ----------
-        dset : dict, str, pd.Series
-            A single or multitude of dataset dictionaries.
+        Parameters
+        ----------
+        dsets : dict, str, pandas.Series, pandas.DataFrame
+            One or more dataset dictionaries or records. For AWS downloads the input must be a
+            data frame.
+
+        Returns
+        -------
+        list of pathlib.Path
+            A list of local file paths.
+        """
+        # determine whether to remove the UUID after download; this may be overridden by the user
+        kwargs['keep_uuid'] = kwargs.get('keep_uuid', self.uuid_filenames)
 
-        Returns
-        -------
-        pathlib.Path
-            A local file path or list of paths.
-        """
         # If all datasets exist on AWS, download from there.
         try:
+            if not isinstance(dsets, pd.DataFrame):
+                raise TypeError('Input datasets must be a pandas data frame for AWS download.')
             if 'exists_aws' in dsets and np.all(np.equal(dsets['exists_aws'].values, True)):
                 _logger.info('Downloading from AWS')
                 return self._download_aws(map(lambda x: x[1], dsets.iterrows()), **kwargs)
@@ -2162,7 +2175,31 @@ def _download_datasets(self, dsets, **kwargs) -> List[Path]:
             _logger.debug(ex)
         return self._download_dataset(dsets, **kwargs)
 
-    def _download_aws(self, dsets, update_exists=True, **_) -> List[Path]:
+    def _download_aws(self, dsets, update_exists=True, keep_uuid=None, **_) -> List[Path]:
+        """
+        Download datasets from an AWS S3 instance using boto3.
+
+        Parameters
+        ----------
+        dsets : list of pandas.Series
+            An iterable of dataset records, each a pandas.Series.
+        update_exists : bool
+            If true, the 'exists_aws' field of the cache table is set to False for any missing
+            datasets.
+        keep_uuid : bool
+            If false, the dataset UUID is removed from the downloaded filename. If None, the
+            `uuid_filenames` attribute determines whether the UUID is kept (False by default).
+
+        Returns
+        -------
+        list of pathlib.Path
+            A list of downloaded dataset file paths, the same length as `dsets`. Missing datasets
+            are returned as None.
+
+        See Also
+        --------
+        one.remote.aws.s3_download_file - The AWS download function.
+        """
         # Download datasets from AWS
         import one.remote.aws as aws
         s3, bucket_name = aws.get_s3_from_alyx(self.alyx)
@@ -2191,8 +2228,9 @@ def _download_aws(self, dsets, update_exists=True, **_) -> List[Path]:
                 continue
             source_path = PurePosixPath(record['data_repository_path'], record['relative_path'])
             source_path = alfiles.add_uuid_string(source_path, uuid)
-            local_path = alfiles.remove_uuid_string(
-                self.cache_dir.joinpath(dset['session_path'], dset['rel_path']))
+            local_path = self.cache_dir.joinpath(dset['session_path'], dset['rel_path'])
+            if keep_uuid is True or (keep_uuid is None and self.uuid_filenames is True):
+                local_path = alfiles.add_uuid_string(local_path, uuid)
             local_path.parent.mkdir(exist_ok=True, parents=True)
             out_files.append(aws.s3_download_file(
                 source_path, local_path, s3=s3, bucket_name=bucket_name, overwrite=update_exists))
@@ -2275,7 +2313,7 @@ def _download_dataset(self, dset, cache_dir=None, update_cache=True, **kwargs) -
 
         Returns
         -------
-        pathlib.Path, list
+        list of pathlib.Path
             A local file path or list of paths.
         """
         cache_dir = cache_dir or self.cache_dir
@@ -2317,7 +2355,7 @@ def _tag_mismatched_file_record(self, url):
                 f'Failed to tag remote file record mismatch: {ex}\n'
                 'Please contact the database administrator.')
 
-    def _download_file(self, url, target_dir, keep_uuid=False, file_size=None, hash=None):
+    def _download_file(self, url, target_dir, keep_uuid=None, file_size=None, hash=None):
         """
         Downloads a single file or multitude of files from an HTTP webserver. The webserver in
         question is set by the AlyxClient object.
@@ -2329,7 +2367,7 @@ def _download_file(self, url, target_dir, keep_uuid=False, file_size=None, hash=
         target_dir : str, list
             Absolute path of directory to download file to (including alf path).
         keep_uuid : bool
-            If true, the UUID is not removed from the file name (default is False).
+            If true, the UUID is not removed from the file name. See the `uuid_filenames` attribute.
         file_size : int, list
             The expected file size or list of file sizes to compare with downloaded file.
         hash : str, list
@@ -2337,7 +2375,7 @@ def _download_file(self, url, target_dir, keep_uuid=None, file_size=None, hash=
 
         Returns
         -------
-        pathlib.Path
+        pathlib.Path or list of pathlib.Path
            The file path of the downloaded file or files.
 
        Example
@@ -2360,7 +2398,7 @@ def _download_file(self, url, target_dir, keep_uuid=None, file_size=None, hash=
             self._check_hash_and_file_size_mismatch(*args)
 
         # check if we are keeping the uuid on the list of file names
-        if keep_uuid:
+        if keep_uuid is True or (keep_uuid is None and self.uuid_filenames):
             return local_path
 
         # remove uuids from list of file names
diff --git a/one/converters.py b/one/converters.py
index 5fb779d3..312409ef 100644
--- a/one/converters.py
+++ b/one/converters.py
@@ -340,6 +340,9 @@ def record2path(self, dataset) -> Optional[Path]:
         assert isinstance(dataset, pd.Series) or len(dataset) == 1
         session_path, rel_path = dataset[['session_path', 'rel_path']].to_numpy().flatten()
         file = Path(self.cache_dir, session_path, rel_path)
+        if self.uuid_filenames:
+            i = dataset.name if isinstance(dataset, pd.Series) else dataset.index[0]
+            file = add_uuid_string(file, i[1] if isinstance(i, tuple) else i)
         return file  # files[0] if len(datasets) == 1 else files
 
     @recurse
diff --git a/one/tests/test_converters.py b/one/tests/test_converters.py
index 856465c2..0a979c5f 100644
--- a/one/tests/test_converters.py
+++ b/one/tests/test_converters.py
@@ -264,13 +264,23 @@ def test_record2path(self):
         alf_path = ('hoferlab/Subjects/SWC_043/2020-09-21/001/'
                     'alf/probe00/_phy_spikes_subset.channels.npy')
         expected = Path(self.one.alyx.cache_dir).joinpath(*alf_path.split('/'))
-        path = self.one.record2path(rec.loc[(self.eid, '00c234a3-a4ff-4f97-a522-939d15528a45')])
+        data_id = '00c234a3-a4ff-4f97-a522-939d15528a45'
+        path = self.one.record2path(rec.loc[(self.eid, data_id)])
         self.assertIsInstance(path, Path)
         self.assertEqual(expected, path)
         # As pd.DataFrame
         idx = rec.rel_path == 'alf/probe00/_phy_spikes_subset.channels.npy'
         path = self.one.record2path(rec[idx])
         self.assertEqual(expected, path)
+        # With UUID in file name
+        try:
+            self.one.uuid_filenames = True
+            expected = expected.with_suffix(f'.{data_id}.npy')
+            self.assertEqual(expected, self.one.record2path(rec[idx]))  # as pd.DataFrame
+            self.assertEqual(expected, self.one.record2path(rec[idx].squeeze()))  # as pd.Series
+            self.assertEqual(expected, self.one.record2path(rec[idx].droplevel(0)))  # no eid
+        finally:
+            self.one.uuid_filenames = False
 
     def test_eid2path(self):
         """Test for OneAlyx.eid2path"""
diff --git a/one/tests/test_one.py b/one/tests/test_one.py
index 65fb8ad8..746c7ebb 100644
--- a/one/tests/test_one.py
+++ b/one/tests/test_one.py
@@ -447,6 +447,16 @@ def test_check_filesystem(self):
         # Attempt the same with the eid index missing
         datasets = datasets.droplevel(0).drop('session_path', axis=1)
         self.assertEqual(files, self.one._check_filesystem(datasets))
+        # Test with uuid_filenames as True
+        self.one.uuid_filenames = True
+        try:
+            for file, (uuid, _) in zip(files, datasets.iterrows()):
+                file.rename(file.with_suffix(f'.{uuid}{file.suffix}'))
+            files = self.one._check_filesystem(datasets)
+            self.assertTrue(all(files))
+            self.assertIn(datasets.index[0], files[0].name)
+        finally:
+            self.one.uuid_filenames = False
 
     def test_load_dataset(self):
         """Test One.load_dataset"""
@@ -735,6 +745,17 @@ def test_save_cache(self):
        raw_modified = One(cache_dir=tdir)._cache['_meta']['raw']['datasets']['date_modified']
        expected = self.one._cache['_meta']['modified_time'].strftime('%Y-%m-%d %H:%M')
        self.assertEqual(raw_modified, expected)
+        # Test file lock
+        t_now = time.time()
+        with mock.patch('one.api.time') as time_mock:
+            lock_file = Path(self.one.cache_dir).joinpath('.cache.lock')
+            lock_file.touch()
+            # We expect the process to sleep at first, then after skipping time,
+            # the stale lock file should be removed.
+            time_mock.time.side_effect = (t_now, t_now + 30)
+            self.one._save_cache(save_dir=tdir, force=True)  # force flag ignores modified time
+            self.assertFalse(lock_file.exists(), 'failed to remove stale lock file')
+            time_mock.sleep.assert_called()
 
     def test_update_cache_from_records(self):
         """Test One._update_cache_from_records"""
@@ -1525,6 +1546,14 @@ def test_download_aws(self):
                 mock.patch('one.remote.aws.s3_download_file', return_value=file) as method:
             self.one._download_datasets(dsets)
         self.assertEqual(len(dsets), method.call_count)
+        # Check output filename
+        _, local = method.call_args.args
+        self.assertTrue(local.as_posix().startswith(self.one.cache_dir.as_posix()))
+        self.assertTrue(local.as_posix().endswith(dsets.iloc[-1, -1]))
+        # Check keep_uuid = True
+        self.one._download_datasets(dsets, keep_uuid=True)
+        _, local = method.call_args.args
+        self.assertIn(dsets.iloc[-1].name, local.name)
 
         # Test behaviour when dataset not remotely accessible
         dsets = dsets[:1].copy()
@@ -1544,7 +1573,11 @@ def test_download_aws(self):
         with mock.patch('one.remote.aws.get_s3_from_alyx', side_effect=RuntimeError), \
                 mock.patch.object(self.one, '_download_dataset') as mock_method:
             self.one._download_datasets(dsets)
-            mock_method.assert_called_with(dsets)
+            mock_method.assert_called_with(dsets, keep_uuid=False)
+        # Test type check (download_aws only works with data frames)
+        with mock.patch.object(self.one, '_download_dataset') as mock_method:
+            self.one._download_datasets(dsets.to_dict('records'))
+            mock_method.assert_called()
 
     def test_tag_mismatched_file_record(self):
         """Test for OneAlyx._tag_mismatched_file_record.
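
Note on the naming convention exercised by the tests above: with One.uuid_filenames set to True, every dataset on disk is expected to carry its UUID between the file stem and the extension, which is what record2path, _check_filesystem and the download methods now account for. Below is a minimal sketch of that convention using the add_uuid_string/remove_uuid_string helpers (assumed here to be importable from one.alf.files, the module aliased as alfiles in api.py); the relative path and dataset UUID are illustrative values borrowed from test_record2path, not part of this change.

    from pathlib import Path

    from one.alf.files import add_uuid_string, remove_uuid_string  # assumed import path

    # Illustrative values taken from test_record2path above
    rel_path = Path('alf/probe00/_phy_spikes_subset.channels.npy')
    did = '00c234a3-a4ff-4f97-a522-939d15528a45'

    # With uuid_filenames enabled, the on-disk name carries the UUID before the extension
    with_uuid = add_uuid_string(rel_path, did)
    print(with_uuid.name)
    # _phy_spikes_subset.channels.00c234a3-a4ff-4f97-a522-939d15528a45.npy

    # Downloads only strip the UUID again when keep_uuid resolves to False
    assert remove_uuid_string(with_uuid).name == rel_path.name

This is also why test_check_filesystem above renames the existing files with file.with_suffix(f'.{uuid}{file.suffix}') before calling _check_filesystem again.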