From 0f012815543eb681a0b79bcc2e1dc93f13bfe35b Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Thu, 7 Nov 2024 16:05:35 -0800 Subject: [PATCH 1/8] Fix #117 Fix #137 Support overwriting an existing file at the same location --- src/hdmf_zarr/backend.py | 39 ++++++++++++++++++++++++++------- tests/unit/base_tests_zarrio.py | 9 +++++--- tests/unit/test_zarrio.py | 13 +++++++++++ 3 files changed, 50 insertions(+), 11 deletions(-) diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py index cb0fea66..deda1351 100644 --- a/src/hdmf_zarr/backend.py +++ b/src/hdmf_zarr/backend.py @@ -102,11 +102,17 @@ def can_read(path): 'default': None}, {'name': 'storage_options', 'type': dict, 'doc': 'Zarr storage options to read remote folders', - 'default': None}) + 'default': None}, + {'name': 'force_overwrite', + 'type': bool, + 'doc': "force overwriting existing object when in 'w' mode even" + " if the existing object is a file (e.g., and HDF5 file)", + 'default': False} + ) def __init__(self, **kwargs): self.logger = logging.getLogger('%s.%s' % (self.__class__.__module__, self.__class__.__qualname__)) - path, manager, mode, synchronizer, object_codec_class, storage_options = popargs( - 'path', 'manager', 'mode', 'synchronizer', 'object_codec_class', 'storage_options', kwargs) + path, manager, mode, synchronizer, object_codec_class, storage_options, force_overwrite = popargs( + 'path', 'manager', 'mode', 'synchronizer', 'object_codec_class', 'storage_options', 'force_overwrite', kwargs) if manager is None: manager = BuildManager(TypeMap(NamespaceCatalog())) if isinstance(synchronizer, bool): @@ -118,6 +124,7 @@ def __init__(self, **kwargs): else: self.__synchronizer = synchronizer self.__mode = mode + self.__force_overwrite = force_overwrite if isinstance(path, Path): path = str(path) self.__path = path @@ -160,25 +167,41 @@ def synchronizer(self): def object_codec_class(self): return self.__codec_cls + @property + def mode(self): + """ + The mode specified by the user when creating the ZarrIO instance. + + NOTE: The Zarr library may not honor the mode. E.g., DirectoryStore in Zarr uses + append mode and does not allow setting a file to read-only mode. + """ + return self.__mode + def open(self): """Open the Zarr file""" if self.__file is None: + # Allow overwriting an exist file (e.g., and HDF5 file). Zarr will normally fail if the + # existing object at the path is a file. So if we are in `w` mode we need to delete the file first + if self.mode == 'w' and self.__force_overwrite: + if isinstance(self.path, (str, Path)) and os.path.exists(self.path) and os.path.isfile(self.path): + os.remove(self.path) + # Within zarr, open_consolidated only allows the mode to be 'r' or 'r+'. # As a result, when in other modes, the file will not use consolidated metadata. - if self.__mode != 'r': + if self.mode != 'r': # When we consolidate metadata, we use ConsolidatedMetadataStore. # This interface does not allow for setting items. # In the doc string, it says it is "read only". As a result, we cannot use r+ with consolidate_metadata. # r- is only an internal mode in ZarrIO to force the use of regular open. For Zarr we need to # use the regular mode r when r- is specified - mode_to_use = self.__mode if self.__mode != 'r-' else 'r' + mode_to_use = self.mode if self.mode != 'r-' else 'r' self.__file = zarr.open(store=self.path, mode=mode_to_use, synchronizer=self.__synchronizer, storage_options=self.__storage_options) else: self.__file = self.__open_file_consolidated(store=self.path, - mode=self.__mode, + mode=self.mode, synchronizer=self.__synchronizer, storage_options=self.__storage_options) @@ -343,9 +366,9 @@ def export(self, **kwargs): """Export data read from a file from any backend to Zarr. See :py:meth:`hdmf.backends.io.HDMFIO.export` for more details. """ - if self.__mode != 'w': + if self.mode != 'w': raise UnsupportedOperation("Cannot export to file %s in mode '%s'. Please use mode 'w'." - % (self.source, self.__mode)) + % (self.source, self.mode)) src_io = getargs('src_io', kwargs) write_args, cache_spec = popargs('write_args', 'cache_spec', kwargs) diff --git a/tests/unit/base_tests_zarrio.py b/tests/unit/base_tests_zarrio.py index b63bb1cf..c0324642 100644 --- a/tests/unit/base_tests_zarrio.py +++ b/tests/unit/base_tests_zarrio.py @@ -97,7 +97,10 @@ def setUp(self): def tearDown(self): if os.path.exists(self.store): - shutil.rmtree(self.store) + if os.path.isdir(self.store): + shutil.rmtree(self.store) + else: # in case a test created a file instead of a directory + os.remove(self.store) def createReferenceBuilder(self): data_1 = np.arange(100, 200, 10).reshape(2, 5) @@ -117,9 +120,9 @@ def createReferenceBuilder(self): 'ref_dataset': dataset_ref}) return builder - def create_zarr(self, consolidate_metadata=True): + def create_zarr(self, consolidate_metadata=True, force_overwrite=False, mode='a'): builder = self.createReferenceBuilder() - writer = ZarrIO(self.store, mode='a') + writer = ZarrIO(self.store, mode=mode, force_overwrite=force_overwrite) writer.write_builder(builder, consolidate_metadata) writer.close() diff --git a/tests/unit/test_zarrio.py b/tests/unit/test_zarrio.py index 824e5ae4..258eea41 100644 --- a/tests/unit/test_zarrio.py +++ b/tests/unit/test_zarrio.py @@ -198,6 +198,19 @@ def test_force_open_without_consolidated_fails(self): except ValueError as e: self.fail("ZarrIO.__open_file_consolidated raised an unexpected ValueError: {}".format(e)) +class TestOverwriteExistingFile(ZarrStoreTestCase): + def test_force_overwrite_when_file_exists(self): + """ + Test that we can overwrite a file when opening with `w` mode even if there is + an existing file. Zarr can write into a directory but not a file. + """ + # create a dummy text file + with open(self.store, "w") as file: + file.write("Just a test file used in TestOverwriteExistingFile") + # try to create a Zarr file at the same location (i.e., self.store) as the + # test text file to force overwriting the exsiting file. + self.create_zarr(force_overwrite=True, mode='w') + class TestDimensionLabels(BuildDatasetShapeMixin): """ From 6fb42daa6acd25ba0095be195f8f080f9e53d3fd Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Thu, 7 Nov 2024 16:10:27 -0800 Subject: [PATCH 2/8] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ecf61732..845a2509 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ ### Enhancements * Added support for Pathlib paths. @mavaylon1 [#212](https://github.com/hdmf-dev/hdmf-zarr/pull/212) * Updated packages used for testing and readthedocs configuration. @mavaylon1, @rly [#214](https://github.com/hdmf-dev/hdmf-zarr/pull/214) +* Add `force_overwite` parameter for `ZarrIO.__init__` to allow overwriting an existing file with a Zarr file, not just a directory. @oruebel [#229](https://github.com/hdmf-dev/hdmf-zarr/pull/229) ## 0.9.0 (September 16, 2024) ### Enhancements From 5685b28c8dfcacc44e0694c0a701cdeeb3fe9888 Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Thu, 7 Nov 2024 16:16:33 -0800 Subject: [PATCH 3/8] Support overwriting an existing directory --- src/hdmf_zarr/backend.py | 8 ++++++-- tests/unit/test_zarrio.py | 10 ++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py index deda1351..6aa95a29 100644 --- a/src/hdmf_zarr/backend.py +++ b/src/hdmf_zarr/backend.py @@ -1,6 +1,7 @@ """Module with the Zarr-based I/O-backend for HDMF""" # Python imports import os +import shutil import warnings import numpy as np import tempfile @@ -183,8 +184,11 @@ def open(self): # Allow overwriting an exist file (e.g., and HDF5 file). Zarr will normally fail if the # existing object at the path is a file. So if we are in `w` mode we need to delete the file first if self.mode == 'w' and self.__force_overwrite: - if isinstance(self.path, (str, Path)) and os.path.exists(self.path) and os.path.isfile(self.path): - os.remove(self.path) + if isinstance(self.path, (str, Path)) and os.path.exists(self.path): + if os.path.isdir(self.path): # directory + shutil.rmtree(self.path) + else: # File + os.remove(self.path) # Within zarr, open_consolidated only allows the mode to be 'r' or 'r+'. # As a result, when in other modes, the file will not use consolidated metadata. diff --git a/tests/unit/test_zarrio.py b/tests/unit/test_zarrio.py index 258eea41..b0c4429e 100644 --- a/tests/unit/test_zarrio.py +++ b/tests/unit/test_zarrio.py @@ -211,6 +211,16 @@ def test_force_overwrite_when_file_exists(self): # test text file to force overwriting the exsiting file. self.create_zarr(force_overwrite=True, mode='w') + def test_force_overwrite_when_dir_exists(self): + """ + Test that we can overwrite a directory when opening with `w` mode even if there is + an existing directory. + """ + # create a Zarr file + self.create_zarr() + # try to overwrite the existing Zarr file + self.create_zarr(force_overwrite=True, mode='w') + class TestDimensionLabels(BuildDatasetShapeMixin): """ From 8791dff88dd1dd255454c7ffc9cec5d6b7c5127f Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Thu, 7 Nov 2024 16:19:28 -0800 Subject: [PATCH 4/8] Update docstring --- CHANGELOG.md | 2 +- src/hdmf_zarr/backend.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 845a2509..675563b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ ### Enhancements * Added support for Pathlib paths. @mavaylon1 [#212](https://github.com/hdmf-dev/hdmf-zarr/pull/212) * Updated packages used for testing and readthedocs configuration. @mavaylon1, @rly [#214](https://github.com/hdmf-dev/hdmf-zarr/pull/214) -* Add `force_overwite` parameter for `ZarrIO.__init__` to allow overwriting an existing file with a Zarr file, not just a directory. @oruebel [#229](https://github.com/hdmf-dev/hdmf-zarr/pull/229) +* Add `force_overwite` parameter for `ZarrIO.__init__` to allow overwriting an existing file or directory. @oruebel [#229](https://github.com/hdmf-dev/hdmf-zarr/pull/229) ## 0.9.0 (September 16, 2024) ### Enhancements diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py index 6aa95a29..dbc26d4e 100644 --- a/src/hdmf_zarr/backend.py +++ b/src/hdmf_zarr/backend.py @@ -106,8 +106,8 @@ def can_read(path): 'default': None}, {'name': 'force_overwrite', 'type': bool, - 'doc': "force overwriting existing object when in 'w' mode even" - " if the existing object is a file (e.g., and HDF5 file)", + 'doc': "force overwriting existing object when in 'w' mode. The existing file or directory" + " will be deleted when before opening (even if the object is not Zarr, e.g,. an HDF5 file)", 'default': False} ) def __init__(self, **kwargs): From 54e0b6bcc6b733f5fa1cae4fb8cc79f7666c5540 Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Thu, 7 Nov 2024 16:22:55 -0800 Subject: [PATCH 5/8] Fix Ruff --- src/hdmf_zarr/backend.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py index dbc26d4e..aa2fb3d8 100644 --- a/src/hdmf_zarr/backend.py +++ b/src/hdmf_zarr/backend.py @@ -113,7 +113,8 @@ def can_read(path): def __init__(self, **kwargs): self.logger = logging.getLogger('%s.%s' % (self.__class__.__module__, self.__class__.__qualname__)) path, manager, mode, synchronizer, object_codec_class, storage_options, force_overwrite = popargs( - 'path', 'manager', 'mode', 'synchronizer', 'object_codec_class', 'storage_options', 'force_overwrite', kwargs) + 'path', 'manager', 'mode', 'synchronizer', 'object_codec_class', + 'storage_options', 'force_overwrite', kwargs) if manager is None: manager = BuildManager(TypeMap(NamespaceCatalog())) if isinstance(synchronizer, bool): From 91ab32784ea82b662ab55f2e32d70183cfe0881c Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Thu, 7 Nov 2024 16:24:53 -0800 Subject: [PATCH 6/8] Fix codespell --- tests/unit/test_zarrio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_zarrio.py b/tests/unit/test_zarrio.py index b0c4429e..260bf9a4 100644 --- a/tests/unit/test_zarrio.py +++ b/tests/unit/test_zarrio.py @@ -208,7 +208,7 @@ def test_force_overwrite_when_file_exists(self): with open(self.store, "w") as file: file.write("Just a test file used in TestOverwriteExistingFile") # try to create a Zarr file at the same location (i.e., self.store) as the - # test text file to force overwriting the exsiting file. + # test text file to force overwriting the existing file. self.create_zarr(force_overwrite=True, mode='w') def test_force_overwrite_when_dir_exists(self): From 0b72fc300e4c6e46812fa7bd0c57cdf9c97917b2 Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Fri, 8 Nov 2024 11:56:54 -0800 Subject: [PATCH 7/8] Update src/hdmf_zarr/backend.py --- src/hdmf_zarr/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py index aa2fb3d8..1eec3d36 100644 --- a/src/hdmf_zarr/backend.py +++ b/src/hdmf_zarr/backend.py @@ -182,7 +182,7 @@ def mode(self): def open(self): """Open the Zarr file""" if self.__file is None: - # Allow overwriting an exist file (e.g., and HDF5 file). Zarr will normally fail if the + # Allow overwriting an existing file (e.g., an HDF5 file). Zarr will normally fail if the # existing object at the path is a file. So if we are in `w` mode we need to delete the file first if self.mode == 'w' and self.__force_overwrite: if isinstance(self.path, (str, Path)) and os.path.exists(self.path): From 7df41ca0cc514020f4173131fd8ac2cc808ae842 Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Fri, 8 Nov 2024 12:01:09 -0800 Subject: [PATCH 8/8] Update backend.py --- src/hdmf_zarr/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py index 1eec3d36..f320683c 100644 --- a/src/hdmf_zarr/backend.py +++ b/src/hdmf_zarr/backend.py @@ -92,7 +92,7 @@ def can_read(path): 'doc': 'the path to the Zarr file or a supported Zarr store'}, {'name': 'manager', 'type': BuildManager, 'doc': 'the BuildManager to use for I/O', 'default': None}, {'name': 'mode', 'type': str, - 'doc': 'the mode to open the Zarr file with, one of ("w", "r", "r+", "a", "w-") ' + 'doc': 'the mode to open the Zarr file with, one of ("w", "r", "r+", "a", "r-"). ' 'the mode r- is used to force open without consolidated metadata in read only mode.'}, {'name': 'synchronizer', 'type': (zarr.ProcessSynchronizer, zarr.ThreadSynchronizer, bool), 'doc': 'Zarr synchronizer to use for parallel I/O. If set to True a ProcessSynchronizer is used.',