Merge branch 'dev' into link_error
mavaylon1 authored Nov 18, 2024
2 parents 241bb50 + b69c3e0 commit 9d2f673
Showing 14 changed files with 205 additions and 53 deletions.
13 changes: 0 additions & 13 deletions .github/workflows/black.yml

This file was deleted.

4 changes: 2 additions & 2 deletions .readthedocs.yaml
@@ -6,9 +6,9 @@
version: 2

build:
- os: ubuntu-20.04
+ os: ubuntu-22.04
tools:
- python: '3.9'
+ python: '3.12'

# Build documentation in the docs/ directory with Sphinx
sphinx:
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -1,9 +1,15 @@
# HDMF-ZARR Changelog

## 1.0.0 (Upcoming)

### Enhancements
* Added initial refactor of export, supporting references and internal/external links from Zarr to Zarr. This introduces breaking changes that may render existing exported files invalid. This update removes '.' as the default object file source. @mavaylon1 [#194](https://github.com/hdmf-dev/hdmf-zarr/pull/194)
* Added support for Pathlib paths. @mavaylon1 [#212](https://github.com/hdmf-dev/hdmf-zarr/pull/212)
* Updated packages used for testing and readthedocs configuration. @mavaylon1, @rly [#214](https://github.com/hdmf-dev/hdmf-zarr/pull/214)
+ * Add `force_overwrite` parameter for `ZarrIO.__init__` to allow overwriting an existing file or directory. @oruebel [#229](https://github.com/hdmf-dev/hdmf-zarr/pull/229)

+ ### Bug Fixes
+ * Fix reading of cached specs and caching of specs during export. @rly [#232](https://github.com/hdmf-dev/hdmf-zarr/pull/232)

## 0.9.0 (September 16, 2024)
### Enhancements
@@ -12,6 +18,7 @@
* Added test for opening file with consolidated metadata from DANDI. @mavaylon1 [#206](https://github.com/hdmf-dev/hdmf-zarr/pull/206)
* Add dimension labels compatible with xarray. @mavaylon1 [#207](https://github.com/hdmf-dev/hdmf-zarr/pull/207)
* Added link_data --> clear_cache relationship to support repacking Zarr NWB files. [#215](https://github.com/hdmf-dev/hdmf-zarr/pull/215)
+ * Added `NWBZarrIO.read_nwb` convenience method to simplify reading an NWB file. @oruebel [#226](https://github.com/hdmf-dev/hdmf-zarr/pull/226)

## 0.8.0 (June 4, 2024)
### Bug Fixes
1 change: 0 additions & 1 deletion docs/source/installation.rst
@@ -32,7 +32,6 @@ in a Conda environment. Normally we don't need to install ``hdmf`` directly, but
conda create --name hdmf-zarr-test python=3.9
conda activate hdmf-zarr-test
conda install h5py
git clone --recurse-submodules https://github.com/hdmf-dev/hdmf.git
cd hdmf
2 changes: 1 addition & 1 deletion requirements-dev.txt
@@ -12,4 +12,4 @@ pytest==7.4.3
pytest-cov==4.1.0
python-dateutil==2.8.2
ruff==0.1.3
- tox==4.11.3
+ tox==4.11.3
6 changes: 3 additions & 3 deletions requirements-opt.txt
@@ -1,3 +1,3 @@
- tqdm==4.66.4
- fsspec==2024.6.0
- s3fs==2024.6.0
+ tqdm==4.67.0
+ fsspec==2024.10.0
+ s3fs==2024.10.0
12 changes: 6 additions & 6 deletions requirements.txt
@@ -1,7 +1,7 @@
# pinned dependencies to reproduce an entire development environment to use HDMF-ZARR
- hdmf==3.14.2
- zarr==2.16.1
- pynwb==2.5.0
- numpy==2.0.0
- numcodecs==0.12.1
- threadpoolctl==3.2.0
+ hdmf==3.14.5
+ zarr==2.18.3
+ pynwb==2.8.2
+ numpy==2.1.3
+ numcodecs==0.13.1
+ threadpoolctl==3.5.0
53 changes: 44 additions & 9 deletions src/hdmf_zarr/backend.py
@@ -1,6 +1,7 @@
"""Module with the Zarr-based I/O-backend for HDMF"""
# Python imports
import os
+ import shutil
import warnings
import numpy as np
import tempfile
@@ -91,7 +92,7 @@ def can_read(path):
'doc': 'the path to the Zarr file or a supported Zarr store'},
{'name': 'manager', 'type': BuildManager, 'doc': 'the BuildManager to use for I/O', 'default': None},
{'name': 'mode', 'type': str,
- 'doc': 'the mode to open the Zarr file with, one of ("w", "r", "r+", "a", "w-") '
+ 'doc': 'the mode to open the Zarr file with, one of ("w", "r", "r+", "a", "r-"). '
'the mode r- is used to force open without consolidated metadata in read only mode.'},
{'name': 'synchronizer', 'type': (zarr.ProcessSynchronizer, zarr.ThreadSynchronizer, bool),
'doc': 'Zarr synchronizer to use for parallel I/O. If set to True a ProcessSynchronizer is used.',
@@ -102,11 +103,18 @@ def can_read(path):
'default': None},
{'name': 'storage_options', 'type': dict,
'doc': 'Zarr storage options to read remote folders',
- 'default': None})
+ 'default': None},
+ {'name': 'force_overwrite',
+  'type': bool,
+  'doc': "force overwriting an existing object when in 'w' mode. The existing file or directory"
+         " will be deleted before opening (even if the object is not Zarr, e.g., an HDF5 file)",
+  'default': False}
+ )
def __init__(self, **kwargs):
self.logger = logging.getLogger('%s.%s' % (self.__class__.__module__, self.__class__.__qualname__))
- path, manager, mode, synchronizer, object_codec_class, storage_options = popargs(
-     'path', 'manager', 'mode', 'synchronizer', 'object_codec_class', 'storage_options', kwargs)
+ path, manager, mode, synchronizer, object_codec_class, storage_options, force_overwrite = popargs(
+     'path', 'manager', 'mode', 'synchronizer', 'object_codec_class',
+     'storage_options', 'force_overwrite', kwargs)
if manager is None:
manager = BuildManager(TypeMap(NamespaceCatalog()))
if isinstance(synchronizer, bool):
@@ -118,6 +126,7 @@ def __init__(self, **kwargs):
else:
self.__synchronizer = synchronizer
self.__mode = mode
+ self.__force_overwrite = force_overwrite
if isinstance(path, Path):
path = str(path)
self.__path = path
@@ -160,25 +169,44 @@ def synchronizer(self):
def object_codec_class(self):
return self.__codec_cls

+ @property
+ def mode(self):
+     """
+     The mode specified by the user when creating the ZarrIO instance.
+     NOTE: The Zarr library may not honor the mode. E.g., DirectoryStore in Zarr uses
+     append mode and does not allow setting a file to read-only mode.
+     """
+     return self.__mode

def open(self):
"""Open the Zarr file"""
if self.__file is None:
+ # Allow overwriting an existing file (e.g., an HDF5 file). Zarr will normally fail if the
+ # existing object at the path is a file. So if we are in `w` mode we need to delete the file first
+ if self.mode == 'w' and self.__force_overwrite:
+     if isinstance(self.path, (str, Path)) and os.path.exists(self.path):
+         if os.path.isdir(self.path):  # directory
+             shutil.rmtree(self.path)
+         else:  # File
+             os.remove(self.path)

# Within zarr, open_consolidated only allows the mode to be 'r' or 'r+'.
# As a result, when in other modes, the file will not use consolidated metadata.
- if self.__mode != 'r':
+ if self.mode != 'r':
# When we consolidate metadata, we use ConsolidatedMetadataStore.
# This interface does not allow for setting items.
# In the doc string, it says it is "read only". As a result, we cannot use r+ with consolidate_metadata.
# r- is only an internal mode in ZarrIO to force the use of regular open. For Zarr we need to
# use the regular mode r when r- is specified
- mode_to_use = self.__mode if self.__mode != 'r-' else 'r'
+ mode_to_use = self.mode if self.mode != 'r-' else 'r'
self.__file = zarr.open(store=self.path,
mode=mode_to_use,
synchronizer=self.__synchronizer,
storage_options=self.__storage_options)
else:
self.__file = self.__open_file_consolidated(store=self.path,
- mode=self.__mode,
+ mode=self.mode,
synchronizer=self.__synchronizer,
storage_options=self.__storage_options)
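
A minimal usage sketch of the new `force_overwrite` behavior (hypothetical path and example container, not part of this diff; `Path` objects are accepted per the Pathlib support noted in the changelog):

from pathlib import Path
from hdmf.common import DynamicTable, VectorData, get_manager
from hdmf_zarr import ZarrIO

# build a small example container to write
table = DynamicTable(
    name="example_table",
    description="demo table",
    columns=[VectorData(name="col1", description="demo column", data=[1, 2, 3])],
)

# mode='w' with force_overwrite=True deletes whatever already exists at the path,
# even a non-Zarr object such as an old HDF5 file, before opening
with ZarrIO(Path("demo.zarr"), manager=get_manager(), mode="w", force_overwrite=True) as io:
    io.write(table)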

@@ -343,9 +371,9 @@ def export(self, **kwargs):
"""Export data read from a file from any backend to Zarr.
See :py:meth:`hdmf.backends.io.HDMFIO.export` for more details.
"""
- if self.__mode != 'w':
+ if self.mode != 'w':
raise UnsupportedOperation("Cannot export to file %s in mode '%s'. Please use mode 'w'."
- % (self.source, self.__mode))
+ % (self.source, self.mode))

src_io = getargs('src_io', kwargs)
write_args, cache_spec = popargs('write_args', 'cache_spec', kwargs)
@@ -371,6 +399,13 @@ def export(self, **kwargs):
ckwargs['clear_cache'] = True
super().export(**ckwargs)
if cache_spec:
+ # add any namespaces from the src_io that have not yet been loaded
+ for namespace in src_io.manager.namespace_catalog.namespaces:
+     if namespace not in self.manager.namespace_catalog.namespaces:
+         self.manager.namespace_catalog.add_namespace(
+             name=namespace,
+             namespace=src_io.manager.namespace_catalog.get_namespace(namespace)
+         )
self.__cache_spec()

def get_written(self, builder, check_on_disk=False):
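Given these changes, a Zarr-to-Zarr export with spec caching might look roughly like the following sketch (hypothetical file names; `cache_spec` defaults to True):

from hdmf.common import get_manager
from hdmf_zarr import ZarrIO

with ZarrIO("source.zarr", manager=get_manager(), mode="r") as src_io:
    with ZarrIO("exported.zarr", mode="w") as export_io:
        # with cache_spec=True, namespaces loaded by src_io but not yet known
        # to the destination's BuildManager are now carried over as well
        export_io.export(src_io=src_io, cache_spec=True)

The destination must be opened with mode 'w'; any other mode raises UnsupportedOperation per the check above.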
28 changes: 27 additions & 1 deletion src/hdmf_zarr/nwb.py
@@ -1,6 +1,7 @@
"""Module with Zarr backend for NWB for integration with PyNWB"""
from warnings import warn
- from .backend import ZarrIO
+ from pathlib import Path
+ from .backend import ZarrIO, SUPPORTED_ZARR_STORES

from hdmf.utils import (docval,
popargs,
@@ -63,5 +64,30 @@ def export(self, **kwargs):
kwargs['container'] = nwbfile
super().export(**kwargs)

+ @staticmethod
+ @docval({'name': 'path',
+          'type': (str, Path, *SUPPORTED_ZARR_STORES),
+          'doc': 'the path to the Zarr file or a supported Zarr store'},
+         is_method=False)
+ def read_nwb(**kwargs):
+     """
+     Helper factory method for reading an NWB file and returning the NWBFile object
+     """
+     # Retrieve the filepath
+     path = popargs('path', kwargs)
+     if isinstance(path, Path):
+         path = str(path)
+     # determine default storage options to use when opening a file from S3
+     storage_options = {}
+     if isinstance(path, str) and path.startswith("s3://"):
+         storage_options = dict(anon=True)
+
+     # open the file with NWBZarrIO and read the file
+     io = NWBZarrIO(path=path, mode="r", load_namespaces=True, storage_options=storage_options)
+     nwbfile = io.read()
+
+     # return the NWBFile object
+     return nwbfile

except ImportError:
warn("PyNWB is not installed. Support for NWBZarrIO is disabled.")
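The new method can be used roughly as follows (the local path is hypothetical; the S3 URL is the one exercised by the tests below). Note that, as in the code above, the underlying NWBZarrIO is left open so data can still be read lazily:

from hdmf_zarr import NWBZarrIO

# local Zarr NWB file; a str or pathlib.Path works
nwbfile = NWBZarrIO.read_nwb("path/to/file.nwb.zarr")

# S3 URL; anonymous access (storage_options=dict(anon=True)) is applied automatically
nwbfile = NWBZarrIO.read_nwb(
    "s3://aind-open-data/ecephys_625749_2022-08-03_15-15-06_nwb_2023-05-16_16-34-55/"
    "ecephys_625749_2022-08-03_15-15-06_nwb/"
    "ecephys_625749_2022-08-03_15-15-06_experiment1_recording1.nwb.zarr/"
)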
16 changes: 9 additions & 7 deletions src/hdmf_zarr/utils.py
@@ -5,6 +5,7 @@
import math
import json
import logging
+ import os
from collections import deque
from collections.abc import Iterable
from typing import Optional, Union, Literal, Tuple, Dict, Any
@@ -373,12 +374,12 @@ class ZarrSpecReader(SpecReader):
Class to read format specs from Zarr
"""

- @docval({'name': 'group', 'type': Group, 'doc': 'the Zarr file to read specs from'},
-         {'name': 'source', 'type': str, 'doc': 'the path spec files are relative to', 'default': '.'})
+ @docval({'name': 'group', 'type': Group, 'doc': 'the Zarr file to read specs from'})
def __init__(self, **kwargs):
-     self.__group, source = getargs('group', 'source', kwargs)
-     super_kwargs = {'source': source}
-     super(ZarrSpecReader, self).__init__(**super_kwargs)
+     self.__group = getargs('group', kwargs)
+     source = "%s:%s" % (os.path.abspath(self.__group.store.path), self.__group.name)
+     super().__init__(source=source)
+     self.__cache = None

def __read(self, path):
s = self.__group[path][0]
@@ -391,8 +392,9 @@ def read_spec(self, spec_path):

def read_namespace(self, ns_path):
"""Read a namespace from the given path"""
- ret = self.__read(ns_path)
- ret = ret['namespaces']
+ if self.__cache is None:
+     self.__cache = self.__read(ns_path)
+ ret = self.__cache['namespaces']
return ret


9 changes: 6 additions & 3 deletions tests/unit/base_tests_zarrio.py
@@ -96,7 +96,10 @@ def setUp(self):

def tearDown(self):
if os.path.exists(self.store):
- shutil.rmtree(self.store)
+ if os.path.isdir(self.store):
+     shutil.rmtree(self.store)
+ else:  # in case a test created a file instead of a directory
+     os.remove(self.store)

def createReferenceBuilder(self):
data_1 = np.arange(100, 200, 10).reshape(2, 5)
@@ -116,9 +119,9 @@ def createReferenceBuilder(self):
'ref_dataset': dataset_ref})
return builder

- def create_zarr(self, consolidate_metadata=True):
+ def create_zarr(self, consolidate_metadata=True, force_overwrite=False, mode='a'):
builder = self.createReferenceBuilder()
- writer = ZarrIO(self.store, mode='a')
+ writer = ZarrIO(self.store, mode=mode, force_overwrite=force_overwrite)
writer.write_builder(builder, consolidate_metadata)
writer.close()

28 changes: 21 additions & 7 deletions tests/unit/test_fsspec_streaming.py
@@ -8,17 +8,21 @@


class TestFSSpecStreaming(unittest.TestCase):
- @unittest.skipIf(not HAVE_FSSPEC, "fsspec not installed")
- def test_fsspec_streaming(self):
+
+ def setUp(self):
# PLACEHOLDER test file from Allen Institute for Neural Dynamics
# TODO: store a small test file and use it to speed up testing
- remote_path = (
+ self.s3_aind_path = (
"s3://aind-open-data/ecephys_625749_2022-08-03_15-15-06_nwb_2023-05-16_16-34-55/"
"ecephys_625749_2022-08-03_15-15-06_nwb/"
"ecephys_625749_2022-08-03_15-15-06_experiment1_recording1.nwb.zarr/"
)
+ # DANDISET: 000719/icephys_9_27_2024
+ self.https_s3_path = "https://dandiarchive.s3.amazonaws.com/zarr/7515c603-9940-4598-aa1b-8bf32dc9b10c/"

- with NWBZarrIO(remote_path, mode="r", storage_options=dict(anon=True)) as io:
+ @unittest.skipIf(not HAVE_FSSPEC, "fsspec not installed")
+ def test_fsspec_streaming(self):
+     with NWBZarrIO(self.s3_aind_path, mode="r", storage_options=dict(anon=True)) as io:
nwbfile = io.read()

self.assertEqual(nwbfile.identifier, "ecephys_625749_2022-08-03_15-15-06")
@@ -32,10 +36,20 @@ def test_s3_open_with_consolidated_(self):
"""
The file is a Zarr file with consolidated metadata.
"""
s3_path = "https://dandiarchive.s3.amazonaws.com/zarr/ccefbc9f-30e7-4a4c-b044-5b59d300040b/"
with NWBZarrIO(s3_path, mode='r') as read_io:
with NWBZarrIO(self.https_s3_path, mode='r') as read_io:
read_io.open()
self.assertIsInstance(read_io.file.store, zarr.storage.ConsolidatedMetadataStore)
- with NWBZarrIO(s3_path, mode='-r') as read_io:
+ with NWBZarrIO(self.https_s3_path, mode='-r') as read_io:
read_io.open()
self.assertIsInstance(read_io.file.store, zarr.storage.FSStore)


+ @unittest.skipIf(not HAVE_FSSPEC, "fsspec not installed")
+ def test_fsspec_streaming_via_read_nwb(self):
+     """
+     Test reading from S3 using the convenience method NWBZarrIO.read_nwb
+     """
+     # Test with an s3:// URL
+     nwbfile = NWBZarrIO.read_nwb(self.s3_aind_path)
+     self.assertEqual(nwbfile.identifier, "ecephys_625749_2022-08-03_15-15-06")
+     self.assertEqual(nwbfile.institution, "AIND")
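
A rough local illustration of the consolidated-metadata behavior these tests exercise (hypothetical file, e.g., one written earlier with ZarrIO, which consolidates metadata on write by default; 'r-' is the mode documented in backend.py):

import zarr
from hdmf_zarr import ZarrIO

with ZarrIO("demo.zarr", mode="r") as io:   # consolidated metadata is used when present
    io.open()
    assert isinstance(io.file.store, zarr.storage.ConsolidatedMetadataStore)

with ZarrIO("demo.zarr", mode="r-") as io:  # force a regular, non-consolidated open
    io.open()
    assert not isinstance(io.file.store, zarr.storage.ConsolidatedMetadataStore)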