
Commit 22f2972
Merge pull request #106 from int-brain-lab/olive
Olive
k1o0 authored Jan 24, 2024
2 parents 9377e7f + 647123f commit 22f2972
Showing 7 changed files with 59 additions and 9 deletions.
10 changes: 9 additions & 1 deletion CHANGELOG.md
```diff
@@ -1,5 +1,13 @@
 # Changelog
-## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.5.5]
+## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.6.0]
+
+### Modified
+- `one.load_dataset`
+  - add an option to skip computing the hash for existing files when loading datasets (`check_hash=False`)
+  - check the file size before computing the hash, for performance
+
+
+## [2.5.5]
 
 ### Modified
 
```
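As context for the changelog entry, a minimal usage sketch of the new option; the session ID and dataset name below are placeholders, and only the `check_hash=False` keyword itself is established by this commit:

```python
from one.api import ONE

one = ONE()
# Placeholder eid and dataset name, for illustration only
eid = 'aad23144-0e52-4eac-80c5-c4ee2decb198'
# Skip the md5 check for files already present on disk
spikes_times = one.load_dataset(eid, 'spikes.times.npy', check_hash=False)
```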
2 changes: 1 addition & 1 deletion one/__init__.py
```diff
@@ -1,2 +1,2 @@
 """The Open Neurophysiology Environment (ONE) API."""
-__version__ = '2.5.5'
+__version__ = '2.6.0'
```
12 changes: 6 additions & 6 deletions one/api.py
```diff
@@ -525,7 +525,7 @@ def sort_fcn(itm):
         else:
             return eids
 
-    def _check_filesystem(self, datasets, offline=None, update_exists=True):
+    def _check_filesystem(self, datasets, offline=None, update_exists=True, check_hash=True):
         """Update the local filesystem for the given datasets.
 
         Given a set of datasets, check whether records correctly reflect the filesystem.
@@ -581,15 +581,15 @@ def _check_filesystem(self, datasets, offline=None, update_exists=True):
             if file.exists():
                 # Check if there's a hash mismatch
                 # If so, add this index to list of datasets that need downloading
-                if rec['hash'] is not None:
+                if rec['file_size'] and file.stat().st_size != rec['file_size']:
+                    _logger.warning('local file size mismatch on dataset: %s',
+                                    PurePosixPath(rec.session_path, rec.rel_path))
+                    indices_to_download.append(i)
+                elif check_hash and rec['hash'] is not None:
                     if hashfile.md5(file) != rec['hash']:
                         _logger.warning('local md5 mismatch on dataset: %s',
                                         PurePosixPath(rec.session_path, rec.rel_path))
                         indices_to_download.append(i)
-                elif rec['file_size'] and file.stat().st_size != rec['file_size']:
-                    _logger.warning('local file size mismatch on dataset: %s',
-                                    PurePosixPath(rec.session_path, rec.rel_path))
-                    indices_to_download.append(i)
                 files.append(file)  # File exists so add to file list
             else:
                 # File doesn't exist so add None to output file list
```
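The reordering above runs the cheap size comparison (a single `stat` call) before the expensive md5 (which reads the whole file), and lets callers skip hashing entirely. A standalone sketch of that logic, with illustrative names rather than the actual `_check_filesystem` implementation:

```python
import hashlib
from pathlib import Path

def needs_download(file: Path, expected_size, expected_hash, check_hash=True):
    """Illustrative helper: decide whether a cached file looks stale."""
    # Cheap check first: one stat call, no file reads
    if expected_size and file.stat().st_size != expected_size:
        return True
    # Expensive check second, and only when requested
    if check_hash and expected_hash is not None:
        return hashlib.md5(file.read_bytes()).hexdigest() != expected_hash
    return False
```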
17 changes: 17 additions & 0 deletions one/remote/aws.py
```diff
@@ -168,6 +168,23 @@ def get_s3_public():
     return s3, S3_BUCKET_IBL
 
 
+def get_s3_allen():
+    """
+    Retrieve the Allen public S3 service resource.
+
+    Returns
+    -------
+    s3.ServiceResource
+        An S3 ServiceResource instance for the public Allen bucket.
+    str
+        The name of the S3 bucket.
+    """
+    S3_BUCKET_ALLEN = 'allen-brain-cell-atlas'
+    session = boto3.Session(region_name='us-west-2')
+    s3 = session.resource('s3', config=Config(signature_version=UNSIGNED))
+    return s3, S3_BUCKET_ALLEN
+
+
 def get_s3_from_alyx(alyx, repo_name=REPO_DEFAULT):
     """
     Create an S3 resource instance using credentials from an Alyx data repository.
```
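A possible usage sketch for the new helper, using the standard boto3 resource API; the listing loop is illustrative and iterates a large public bucket, hence the early break:

```python
from one.remote.aws import get_s3_allen

s3, bucket_name = get_s3_allen()  # anonymous, unsigned access
bucket = s3.Bucket(bucket_name)
# Print the first five object keys in the public Allen bucket
for i, obj in enumerate(bucket.objects.all()):
    print(obj.key)
    if i == 4:
        break
```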
14 changes: 14 additions & 0 deletions one/tests/remote/test_aws.py
```diff
@@ -143,6 +143,20 @@ def test_url2uri(self):
         uri, loc = aws.url2uri(url, return_location=True)
         self.assertEqual(loc, 'eu-east-1')
 
+    @mock.patch('boto3.Session')
+    def test_get_ibl_s3(self, session_mock):
+        s3, bucket = aws.get_s3_public()
+        resource = session_mock().resource
+        self.assertIs(s3, resource())
+        self.assertEqual(bucket, 'ibl-brain-wide-map-public')
+
+    @mock.patch('boto3.Session')
+    def test_get_allen_s3(self, session_mock):
+        s3, bucket = aws.get_s3_allen()
+        resource = session_mock().resource
+        self.assertIs(s3, resource())
+        self.assertEqual(bucket, 'allen-brain-cell-atlas')
+
 
 if __name__ == '__main__':
     unittest.main(exit=False)
```
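Both tests lean on the fact that a `MagicMock` caches its return values, so the resource created inside the patched helper is the very same object the test rebuilds via `session_mock().resource()`. A minimal demonstration of that property:

```python
from unittest import mock

m = mock.MagicMock()
# Repeated calls return the same cached child mock, so identity checks hold
assert m() is m()
assert m().resource() is m().resource()
```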
10 changes: 10 additions & 0 deletions one/tests/test_one.py
```diff
@@ -68,6 +68,16 @@ def setUp(self) -> None:
         self.one = ONE(mode='local', cache_dir=self.tempdir.name)
         # Create dset files from cache
         util.create_file_tree(self.one)
+        # Here we create some variations to get coverage over more cases:
+        # the first 10 records have the right file size (0) but the wrong hash;
+        # the next 10 records have the right file size (0) and the correct empty-file md5;
+        # all remaining records have NaN in file_size and None in hash (default cache table)
+        cols = self.one._cache['datasets'].columns
+        self.one._cache['datasets'].iloc[:20, cols.get_loc('file_size')] = 0
+        self.one._cache['datasets'].iloc[:20, cols.get_loc('hash')]\
+            = 'd41d8cd98f00b204e9800998ecf8427e'  # correct md5 of an empty file
+        self.one._cache['datasets'].iloc[:10, cols.get_loc('hash')]\
+            = 'd41d8cda454aaaa4e9800998ecf8497e'  # deliberately wrong hash
 
     def tearDown(self) -> None:
         while Path(self.one.cache_dir).joinpath('.cache.lock').exists():
```
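The "correct" hash in this fixture is the md5 of zero bytes, matching the empty files that `util.create_file_tree` writes; a quick check:

```python
import hashlib

# md5 of an empty byte string, i.e. the hash of any zero-byte file
print(hashlib.md5(b'').hexdigest())  # d41d8cd98f00b204e9800998ecf8427e
```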
3 changes: 2 additions & 1 deletion one/webclient.py
```diff
@@ -610,11 +610,12 @@ def _generic_request(self, reqfunction, rest_query, data=None, files=None):
             self.authenticate(username=username, force=True)
             return self._generic_request(reqfunction, rest_query, data=data, files=files)
         else:
-            _logger.debug('Response text: ' + r.text)
+            _logger.debug('Response text raw: ' + r.text)
             try:
                 message = json.loads(r.text)
                 message.pop('status_code', None)  # Get status code from response object instead
                 message = message.get('detail') or message  # Get details if available
+                _logger.debug(message)
             except json.decoder.JSONDecodeError:
                 message = r.text
             raise requests.HTTPError(r.status_code, rest_query, message, response=r)
```
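A standalone sketch of the message extraction performed in the block above (mirroring the logic, not the actual `AlyxClient` code):

```python
import json

def parse_error(text):
    """Extract a human-readable message from an error response body."""
    try:
        message = json.loads(text)
        message.pop('status_code', None)  # status code comes from the response object
        return message.get('detail') or message
    except json.JSONDecodeError:
        return text  # not JSON; fall back to the raw body

print(parse_error('{"detail": "Not found.", "status_code": 404}'))  # Not found.
print(parse_error('<html>Server Error</html>'))
```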
