diff --git a/CHANGELOG.md b/CHANGELOG.md index 81d06d8d..16b0b6e8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog -## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.5.5] +## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.6.0] + +### Modified +- `one.load_dataset` + - add an option (`check_hash=False`) to skip computing the hash of existing files when loading datasets + - check file size before computing the hash, for performance + + +## [2.5.5] ### Modified diff --git a/one/__init__.py b/one/__init__.py index d319c051..fddf4607 100644 --- a/one/__init__.py +++ b/one/__init__.py @@ -1,2 +1,2 @@ """The Open Neurophysiology Environment (ONE) API.""" -__version__ = '2.5.5' +__version__ = '2.6.0' diff --git a/one/api.py b/one/api.py index e896f83a..616ca999 100644 --- a/one/api.py +++ b/one/api.py @@ -525,7 +525,7 @@ def sort_fcn(itm): else: return eids - def _check_filesystem(self, datasets, offline=None, update_exists=True): + def _check_filesystem(self, datasets, offline=None, update_exists=True, check_hash=True): """Update the local filesystem for the given datasets. Given a set of datasets, check whether records correctly reflect the filesystem. 
@@ -581,15 +581,15 @@ def _check_filesystem(self, datasets, offline=None, update_exists=True): if file.exists(): # Check if there's a hash mismatch # If so, add this index to list of datasets that need downloading - if rec['hash'] is not None: + if rec['file_size'] and file.stat().st_size != rec['file_size']: + _logger.warning('local file size mismatch on dataset: %s', + PurePosixPath(rec.session_path, rec.rel_path)) + indices_to_download.append(i) + elif check_hash and rec['hash'] is not None: if hashfile.md5(file) != rec['hash']: _logger.warning('local md5 mismatch on dataset: %s', PurePosixPath(rec.session_path, rec.rel_path)) indices_to_download.append(i) - elif rec['file_size'] and file.stat().st_size != rec['file_size']: - _logger.warning('local file size mismatch on dataset: %s', - PurePosixPath(rec.session_path, rec.rel_path)) - indices_to_download.append(i) files.append(file) # File exists so add to file list else: # File doesn't exist so add None to output file list diff --git a/one/remote/aws.py b/one/remote/aws.py index dc696c60..9ae2792a 100644 --- a/one/remote/aws.py +++ b/one/remote/aws.py @@ -168,6 +168,23 @@ def get_s3_public(): return s3, S3_BUCKET_IBL +def get_s3_allen(): + """ + Retrieve the Allen public S3 service resource. + + Returns + ------- + s3.ServiceResource + An S3 ServiceResource instance configured for unsigned (anonymous) requests. + str + The name of the S3 bucket. + """ + S3_BUCKET_ALLEN = 'allen-brain-cell-atlas' + session = boto3.Session(region_name='us-west-2') + s3 = session.resource('s3', config=Config(signature_version=UNSIGNED)) + return s3, S3_BUCKET_ALLEN + + def get_s3_from_alyx(alyx, repo_name=REPO_DEFAULT): """ Create an S3 resource instance using credentials from an Alyx data repository. 
diff --git a/one/tests/remote/test_aws.py b/one/tests/remote/test_aws.py index fc820c89..151a9a4c 100644 --- a/one/tests/remote/test_aws.py +++ b/one/tests/remote/test_aws.py @@ -143,6 +143,20 @@ def test_url2uri(self): uri, loc = aws.url2uri(url, return_location=True) self.assertEqual(loc, 'eu-east-1') + @mock.patch('boto3.Session') + def test_get_ibl_s3(self, session_mock): + s3, bucket = aws.get_s3_public() + resource = session_mock().resource + self.assertIs(s3, resource()) + self.assertEqual(bucket, 'ibl-brain-wide-map-public') + + @mock.patch('boto3.Session') + def test_get_allen_s3(self, session_mock): + s3, bucket = aws.get_s3_allen() + resource = session_mock().resource + self.assertIs(s3, resource()) + self.assertEqual(bucket, 'allen-brain-cell-atlas') + if __name__ == '__main__': unittest.main(exit=False) diff --git a/one/tests/test_one.py b/one/tests/test_one.py index 86ce63cd..0507d4e7 100644 --- a/one/tests/test_one.py +++ b/one/tests/test_one.py @@ -68,6 +68,16 @@ def setUp(self) -> None: self.one = ONE(mode='local', cache_dir=self.tempdir.name) # Create dset files from cache util.create_file_tree(self.one) + # here we create some variations to get coverage over more cases + # the first 10 records will have the right file size (0) but the wrong hash + # the next 10 records will have the right file size (0) and the correct empty file md5 + # all remaining records have NaN in file_size and None in hash (default cache table) + cols = self.one._cache['datasets'].columns + self.one._cache['datasets'].iloc[:20, cols.get_loc('file_size')] = 0 + self.one._cache['datasets'].iloc[:20, cols.get_loc('hash')]\ + = 'd41d8cd98f00b204e9800998ecf8427e' # correct md5 of an empty file + self.one._cache['datasets'].iloc[:10, cols.get_loc('hash')]\ + = 'd41d8cda454aaaa4e9800998ecf8497e' # wrong hash def tearDown(self) -> None: while Path(self.one.cache_dir).joinpath('.cache.lock').exists(): diff --git a/one/webclient.py b/one/webclient.py index cd6a5a12..3e57f1ad 100644 --- 
a/one/webclient.py +++ b/one/webclient.py @@ -610,11 +610,12 @@ def _generic_request(self, reqfunction, rest_query, data=None, files=None): self.authenticate(username=username, force=True) return self._generic_request(reqfunction, rest_query, data=data, files=files) else: - _logger.debug('Response text: ' + r.text) + _logger.debug('Response text raw: ' + r.text) try: message = json.loads(r.text) message.pop('status_code', None) # Get status code from response object instead message = message.get('detail') or message # Get details if available + _logger.debug(message) except json.decoder.JSONDecodeError: message = r.text raise requests.HTTPError(r.status_code, rest_query, message, response=r)