Skip to content

Commit

Permalink
Support overwriting an existing file at the same location (#229)
Browse files Browse the repository at this point in the history
Co-authored-by: Ryan Ly <[email protected]>
  • Loading branch information
oruebel and rly authored Nov 8, 2024
1 parent eae8fb5 commit 049402f
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 12 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
### Enhancements
* Added support for Pathlib paths. @mavaylon1 [#212](https://github.com/hdmf-dev/hdmf-zarr/pull/212)
* Updated packages used for testing and readthedocs configuration. @mavaylon1, @rly [#214](https://github.com/hdmf-dev/hdmf-zarr/pull/214)
* Add `force_overwite` parameter for `ZarrIO.__init__` to allow overwriting an existing file or directory. @oruebel [#229](https://github.com/hdmf-dev/hdmf-zarr/pull/229)

## 0.9.0 (September 16, 2024)
### Enhancements
Expand Down
46 changes: 37 additions & 9 deletions src/hdmf_zarr/backend.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Module with the Zarr-based I/O-backend for HDMF"""
# Python imports
import os
import shutil
import warnings
import numpy as np
import tempfile
Expand Down Expand Up @@ -91,7 +92,7 @@ def can_read(path):
'doc': 'the path to the Zarr file or a supported Zarr store'},
{'name': 'manager', 'type': BuildManager, 'doc': 'the BuildManager to use for I/O', 'default': None},
{'name': 'mode', 'type': str,
'doc': 'the mode to open the Zarr file with, one of ("w", "r", "r+", "a", "w-") '
'doc': 'the mode to open the Zarr file with, one of ("w", "r", "r+", "a", "r-"). '
'the mode r- is used to force open without consolidated metadata in read only mode.'},
{'name': 'synchronizer', 'type': (zarr.ProcessSynchronizer, zarr.ThreadSynchronizer, bool),
'doc': 'Zarr synchronizer to use for parallel I/O. If set to True a ProcessSynchronizer is used.',
Expand All @@ -102,11 +103,18 @@ def can_read(path):
'default': None},
{'name': 'storage_options', 'type': dict,
'doc': 'Zarr storage options to read remote folders',
'default': None})
'default': None},
{'name': 'force_overwrite',
'type': bool,
'doc': "force overwriting existing object when in 'w' mode. The existing file or directory"
" will be deleted when before opening (even if the object is not Zarr, e.g,. an HDF5 file)",
'default': False}
)
def __init__(self, **kwargs):
self.logger = logging.getLogger('%s.%s' % (self.__class__.__module__, self.__class__.__qualname__))
path, manager, mode, synchronizer, object_codec_class, storage_options = popargs(
'path', 'manager', 'mode', 'synchronizer', 'object_codec_class', 'storage_options', kwargs)
path, manager, mode, synchronizer, object_codec_class, storage_options, force_overwrite = popargs(
'path', 'manager', 'mode', 'synchronizer', 'object_codec_class',
'storage_options', 'force_overwrite', kwargs)
if manager is None:
manager = BuildManager(TypeMap(NamespaceCatalog()))
if isinstance(synchronizer, bool):
Expand All @@ -118,6 +126,7 @@ def __init__(self, **kwargs):
else:
self.__synchronizer = synchronizer
self.__mode = mode
self.__force_overwrite = force_overwrite
if isinstance(path, Path):
path = str(path)
self.__path = path
Expand Down Expand Up @@ -160,25 +169,44 @@ def synchronizer(self):
def object_codec_class(self):
return self.__codec_cls

@property
def mode(self):
"""
The mode specified by the user when creating the ZarrIO instance.
NOTE: The Zarr library may not honor the mode. E.g., DirectoryStore in Zarr uses
append mode and does not allow setting a file to read-only mode.
"""
return self.__mode

def open(self):
"""Open the Zarr file"""
if self.__file is None:
# Allow overwriting an existing file (e.g., an HDF5 file). Zarr will normally fail if the
# existing object at the path is a file. So if we are in `w` mode we need to delete the file first
if self.mode == 'w' and self.__force_overwrite:
if isinstance(self.path, (str, Path)) and os.path.exists(self.path):
if os.path.isdir(self.path): # directory
shutil.rmtree(self.path)
else: # File
os.remove(self.path)

# Within zarr, open_consolidated only allows the mode to be 'r' or 'r+'.
# As a result, when in other modes, the file will not use consolidated metadata.
if self.__mode != 'r':
if self.mode != 'r':
# When we consolidate metadata, we use ConsolidatedMetadataStore.
# This interface does not allow for setting items.
# In the doc string, it says it is "read only". As a result, we cannot use r+ with consolidate_metadata.
# r- is only an internal mode in ZarrIO to force the use of regular open. For Zarr we need to
# use the regular mode r when r- is specified
mode_to_use = self.__mode if self.__mode != 'r-' else 'r'
mode_to_use = self.mode if self.mode != 'r-' else 'r'
self.__file = zarr.open(store=self.path,
mode=mode_to_use,
synchronizer=self.__synchronizer,
storage_options=self.__storage_options)
else:
self.__file = self.__open_file_consolidated(store=self.path,
mode=self.__mode,
mode=self.mode,
synchronizer=self.__synchronizer,
storage_options=self.__storage_options)

Expand Down Expand Up @@ -343,9 +371,9 @@ def export(self, **kwargs):
"""Export data read from a file from any backend to Zarr.
See :py:meth:`hdmf.backends.io.HDMFIO.export` for more details.
"""
if self.__mode != 'w':
if self.mode != 'w':
raise UnsupportedOperation("Cannot export to file %s in mode '%s'. Please use mode 'w'."
% (self.source, self.__mode))
% (self.source, self.mode))

src_io = getargs('src_io', kwargs)
write_args, cache_spec = popargs('write_args', 'cache_spec', kwargs)
Expand Down
9 changes: 6 additions & 3 deletions tests/unit/base_tests_zarrio.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,10 @@ def setUp(self):

def tearDown(self):
if os.path.exists(self.store):
shutil.rmtree(self.store)
if os.path.isdir(self.store):
shutil.rmtree(self.store)
else: # in case a test created a file instead of a directory
os.remove(self.store)

def createReferenceBuilder(self):
data_1 = np.arange(100, 200, 10).reshape(2, 5)
Expand All @@ -117,9 +120,9 @@ def createReferenceBuilder(self):
'ref_dataset': dataset_ref})
return builder

def create_zarr(self, consolidate_metadata=True):
def create_zarr(self, consolidate_metadata=True, force_overwrite=False, mode='a'):
builder = self.createReferenceBuilder()
writer = ZarrIO(self.store, mode='a')
writer = ZarrIO(self.store, mode=mode, force_overwrite=force_overwrite)
writer.write_builder(builder, consolidate_metadata)
writer.close()

Expand Down
23 changes: 23 additions & 0 deletions tests/unit/test_zarrio.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,29 @@ def test_force_open_without_consolidated_fails(self):
except ValueError as e:
self.fail("ZarrIO.__open_file_consolidated raised an unexpected ValueError: {}".format(e))

class TestOverwriteExistingFile(ZarrStoreTestCase):
def test_force_overwrite_when_file_exists(self):
"""
Test that we can overwrite a file when opening with `w` mode even if there is
an existing file. Zarr can write into a directory but not a file.
"""
# create a dummy text file
with open(self.store, "w") as file:
file.write("Just a test file used in TestOverwriteExistingFile")
# try to create a Zarr file at the same location (i.e., self.store) as the
# test text file to force overwriting the existing file.
self.create_zarr(force_overwrite=True, mode='w')

def test_force_overwrite_when_dir_exists(self):
"""
Test that we can overwrite a directory when opening with `w` mode even if there is
an existing directory.
"""
# create a Zarr file
self.create_zarr()
# try to overwrite the existing Zarr file
self.create_zarr(force_overwrite=True, mode='w')


class TestDimensionLabels(BuildDatasetShapeMixin):
"""
Expand Down

0 comments on commit 049402f

Please sign in to comment.