From 03e410d297a0045b8d13ed62b7cfabc1d55aba1a Mon Sep 17 00:00:00 2001
From: mavaylon1
Date: Tue, 21 Nov 2023 10:21:47 -0800
Subject: [PATCH 01/25] Consolidate Metadata

---
 src/hdmf_zarr/backend.py | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py
index 34f963e1..9a8684e5 100644
--- a/src/hdmf_zarr/backend.py
+++ b/src/hdmf_zarr/backend.py
@@ -152,9 +152,7 @@ def object_codec_class(self):
     def open(self):
         """Open the Zarr file"""
         if self.__file is None:
-            self.__file = zarr.open(store=self.path,
-                                    mode=self.__mode,
-                                    synchronizer=self.__synchronizer)
+            self.__file = self.open_file_consolidated()

     def close(self):
         """Close the Zarr file"""
@@ -411,6 +409,35 @@ def write_builder(self, **kwargs):
         self.logger.debug("Done writing %s '%s' to path '%s'" %
                           (f_builder.__class__.__qualname__, f_builder.name, self.source))

+        # Consolidate metadata for the entire file after everything has been written
+        zarr.consolidate_metadata(self.path, metadata_key='.zmetadata')
+
+    def consolidate_metadata(self):
+        """
+        When a file is written, the metadata within the file is consolidated automatically.
+        If there are any metadata changes, the user needs to consolidate the metadata again
+        with this method in order for the metadata to be read correctly.
+
+        Consolidate all metadata for groups and arrays within the given store into a
+        single resource and put it under .zmetadata.
+        """
+        zarr.consolidate_metadata(self.path, metadata_key='.zmetadata')
+
+    def open_file_consolidated(self):
+        """
+        Check whether the metadata has been consolidated; if so, open with zarr.open_consolidated, otherwise warn and fall back to zarr.open.
+        """
+        if os.path.isfile(self.path+'/.zmetadata'):
+            zarr.open_consolidated(store=self.path,
+                                   mode=self.__mode,)
+        else:
+            msg = "Could not find consolidated metadata."
+            warnings.warn(msg)
+
+            zarr.open(store=self.path,
+                      mode=self.__mode,
+                      synchronizer=self.__synchronizer)
+
     @docval({'name': 'parent', 'type': Group, 'doc': 'the parent Zarr object'},
             {'name': 'builder', 'type': GroupBuilder, 'doc': 'the GroupBuilder to write'},
             {'name': 'link_data', 'type': bool,
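A minimal usage sketch of the consolidation workflow this patch introduces (hdmf-zarr's ZarrIO as patched above; `manager`, `container`, and the file path are hypothetical placeholders):

    from hdmf_zarr.backend import ZarrIO

    # Writing consolidates the metadata automatically at the end of write_builder()
    with ZarrIO('example.zarr', manager=manager, mode='w') as io:
        io.write(container)

    # After any later metadata changes, re-consolidate so reads stay correct
    with ZarrIO('example.zarr', manager=manager, mode='r+') as io:
        io.consolidate_metadata()

    # open() now goes through open_file_consolidated(), which uses
    # zarr.open_consolidated() when .zmetadata exists and warns and falls back
    # to zarr.open() otherwise
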
From b75420327a2bf586e54b4b66b826f5d8ce481164 Mon Sep 17 00:00:00 2001
From: mavaylon1
Date: Thu, 30 Nov 2023 15:43:15 -0800
Subject: [PATCH 02/25] checkpoint

---
 src/hdmf_zarr/backend.py  |   79 ++-
 src/hdmf_zarr/backend2.py | 1420 +++++++++++++++++++++++++++++++++++++
 2 files changed, 1480 insertions(+), 19 deletions(-)
 create mode 100644 src/hdmf_zarr/backend2.py

diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py
index 3dda6aa0..b3338f38 100644
--- a/src/hdmf_zarr/backend.py
+++ b/src/hdmf_zarr/backend.py
@@ -157,10 +157,22 @@ def object_codec_class(self):
     def open(self):
         """Open the Zarr file"""
         if self.__file is None:
-            self.__file = zarr.open(store=self.path,
-                                    mode=self.__mode,
-                                    synchronizer=self.__synchronizer,
-                                    storage_options=self.__storage_options)
+            # self.__file = zarr.open(store=self.path,
+            #                         mode=self.__mode,
+            #                         synchronizer=self.__synchronizer,
+            #                         storage_options=self.__storage_options)
+            # # breakpoint()
+            if self.__mode == 'w':
+                self.__file = zarr.open(store=self.path,
+                                        mode=self.__mode,
+                                        synchronizer=self.__synchronizer,
+                                        storage_options=self.__storage_options)
+            else:
+                self.__file = self.__open_file_consolidated(store=self.path,
+                                                            mode=self.__mode,
+                                                            synchronizer=self.__synchronizer,
+                                                            storage_options=self.__storage_options)
+

     def close(self):
         """Close the Zarr file"""
@@ -427,9 +439,10 @@ def write_builder(self, **kwargs):
                           (f_builder.__class__.__qualname__, f_builder.name, self.source))

         # Consolidate metadata for the entire file after everything has been written
-        zarr.consolidate_metadata(self.path, metadata_key='.zmetadata')
+        # breakpoint()
+        zarr.consolidate_metadata(store=self.path)

-    def consolidate_metadata(self):
+    def consolidate_metadata(self, store):
         """
         When a file is written, the metadata within the file is consolidated automatically.
         If there are any metadata changes, the user needs to consolidate the metadata again
@@ -438,22 +451,31 @@ def consolidate_metadata(self):
         Consolidate all metadata for groups and arrays within the given store into a
         single resource and put it under .zmetadata.
         """
-        zarr.consolidate_metadata(self.path, metadata_key='.zmetadata')
+        zarr.consolidate_metadata(store, metadata_key='.zmetadata')

-    def open_file_consolidated(self):
+    def __open_file_consolidated(self,
+                                 store,
+                                 mode,
+                                 synchronizer=None,
+                                 storage_options=None):
         """
         Check whether the metadata has been consolidated; if so, open with zarr.open_consolidated, otherwise warn and fall back to zarr.open.
         """
-        if os.path.isfile(self.path+'/.zmetadata'):
-            zarr.open_consolidated(store=self.path,
-                                   mode=self.__mode,)
+        try:
+            temp = os.path.isfile(self.path+'/.zmetadata')
+        except TypeError:
+            temp = os.path.isfile(self.path.path+'/.zmetadata')
+        if temp:
+            return zarr.open_consolidated(store=store,
+                                          mode=mode,
+                                          synchronizer=synchronizer,
+                                          storage_options=storage_options)
         else:
             msg = "Could not find consolidated metadata."
             warnings.warn(msg)
-
-            zarr.open(store=self.path,
-                      mode=self.__mode,
-                      synchronizer=self.__synchronizer)
+            return zarr.open(store=self.path,
+                             mode=self.__mode,
+                             synchronizer=self.__synchronizer)

     @docval({'name': 'parent', 'type': Group, 'doc': 'the parent Zarr object'},
             {'name': 'builder', 'type': GroupBuilder, 'doc': 'the GroupBuilder to write'},
             {'name': 'link_data', 'type': bool,
@@ -604,7 +626,11 @@ def get_zarr_paths(zarr_object):
     # In Zarr the path is a combination of the path of the store and the path of the object. So we first need to
     # merge those two paths, then remove the path of the file, add the missing leading "/" and then compute the
     # directory name to get the path of the parent
-    fullpath = os.path.normpath(os.path.join(zarr_object.store.path, zarr_object.path)).replace("\\", "/")
+    if isinstance(zarr_object.store, zarr.storage.ConsolidatedMetadataStore):
+        fpath = zarr_object.store.store.path
+    else:
+        fpath = zarr_object.store.path
+    fullpath = os.path.normpath(os.path.join(fpath, zarr_object.path)).replace("\\", "/")
     # To determine the filepath we now iterate over the path and check if the .zgroup object exists at
     # a level, indicating that we are still within the Zarr file. The first level we hit where the parent
     # directory does not have a .zgroup means we have found the main file
@@ -915,7 +941,11 @@ def write_dataset(self, **kwargs):  # noqa: C901
         if isinstance(data, Array):
             # copy the dataset
             if link_data:
-                self.__add_link__(parent, data.store.path, data.name, name)
+                if isinstance(data.store, zarr.storage.ConsolidatedMetadataStore):
+                    path = data.store.store.path
+                else:
+                    path = data.store.path
+                self.__add_link__(parent, path, data.name, name)
                 linked = True
                 dset = None
             else:
@@ -1231,7 +1261,11 @@ def read_builder(self):
         return f_builder

     def __set_built(self, zarr_obj, builder):
-        fpath = zarr_obj.store.path
+        # fpath = zarr_obj.store.path
+        if isinstance(zarr_obj.store, zarr.storage.ConsolidatedMetadataStore):
+            fpath = zarr_obj.store.store.path
+        else:
+            fpath = zarr_obj.store.path
         path = zarr_obj.path
         path = os.path.join(fpath, path)
         self.__built.setdefault(path, builder)
@@ -1271,12 +1305,19 @@ def __get_built(self, zarr_obj):
         :type zarr_obj: Zarr Group or Dataset
         :return: Builder in the self.__built cache or None
         """
-        fpath = zarr_obj.store.path
+
+        if isinstance(zarr_obj.store, zarr.storage.ConsolidatedMetadataStore):
+            fpath = zarr_obj.store.store.path
+        else:
+            fpath = zarr_obj.store.path
+
+        # fpath = zarr_obj.store.path
         path = zarr_obj.path
         path = os.path.join(fpath, path)
         return self.__built.get(path, None)

     def __read_group(self, zarr_obj, name=None):
+        # breakpoint()
         ret = self.__get_built(zarr_obj)
         if ret is not None:
             return ret
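The backend.py hunks above repeatedly unwrap zarr.storage.ConsolidatedMetadataStore to recover the on-disk store path. A standalone sketch of that idea (the helper name is ours, not part of the patch; `zarr_object` is any zarr Group or Array):

    import zarr

    def store_path(zarr_object):
        # zarr.open_consolidated() wraps the underlying store in a
        # ConsolidatedMetadataStore, so the filesystem path lives one level
        # deeper, on store.store; stores returned by zarr.open() are unwrapped
        if isinstance(zarr_object.store, zarr.storage.ConsolidatedMetadataStore):
            return zarr_object.store.store.path
        return zarr_object.store.path
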
diff --git a/src/hdmf_zarr/backend2.py b/src/hdmf_zarr/backend2.py
new file mode 100644
index 00000000..505031ea
--- /dev/null
+++ b/src/hdmf_zarr/backend2.py
@@ -0,0 +1,1420 @@
+"""Module with the Zarr-based I/O-backend for HDMF"""
+# Python imports
+import os
+import warnings
+import numpy as np
+import tempfile
+import logging
+
+# Zarr imports
+import zarr
+from zarr.hierarchy import Group
+from zarr.core import Array
+from zarr.storage import (DirectoryStore,
+                          TempStore,
+                          NestedDirectoryStore)
+import numcodecs
+
+# HDMF-ZARR imports
+from .utils import (ZarrDataIO,
+                    ZarrReference,
+                    ZarrSpecWriter,
+                    ZarrSpecReader,
+                    ZarrIODataChunkIteratorQueue)
+from .zarr_utils import BuilderZarrReferenceDataset, BuilderZarrTableDataset
+
+# HDMF imports
+from hdmf.backends.io import HDMFIO
+from hdmf.backends.errors import UnsupportedOperation
+from 
hdmf.backends.utils import (NamespaceToBuilderHelper, + WriteStatusTracker) +from hdmf.utils import (docval, + getargs, + popargs, + get_docval, + get_data_shape) +from hdmf.build import (Builder, + GroupBuilder, + DatasetBuilder, + LinkBuilder, + BuildManager, + RegionBuilder, + ReferenceBuilder, + TypeMap) +from hdmf.data_utils import AbstractDataChunkIterator +from hdmf.spec import (RefSpec, + DtypeSpec, + NamespaceCatalog) +from hdmf.query import HDMFDataset +from hdmf.container import Container + +# Module variables +ROOT_NAME = 'root' +""" +Name of the root builder for read/write +""" + +SPEC_LOC_ATTR = '.specloc' +""" +Reserved attribute storing the path to the Group where the schema for the file are cached +""" + +DEFAULT_SPEC_LOC_DIR = 'specifications' +""" +Default name of the group where specifications should be cached +""" + +SUPPORTED_ZARR_STORES = (DirectoryStore, + TempStore, + NestedDirectoryStore) +""" +Tuple listing all Zarr storage backends supported by ZarrIO +""" + + +class ZarrIO(HDMFIO): + + @staticmethod + def can_read(path): + try: + # TODO: how to use storage_options? Maybe easier to just check for ".zarr" suffix + zarr.open(path, mode="r") + self.__open_file_consolidated(source=path, mode='r') + return True + except Exception: + return False + + @docval({'name': 'path', + 'type': (str, *SUPPORTED_ZARR_STORES), + 'doc': 'the path to the Zarr file or a supported Zarr store'}, + {'name': 'manager', 'type': BuildManager, 'doc': 'the BuildManager to use for I/O', 'default': None}, + {'name': 'mode', 'type': str, + 'doc': 'the mode to open the Zarr file with, one of ("w", "r", "r+", "a", "w-")'}, + {'name': 'synchronizer', 'type': (zarr.ProcessSynchronizer, zarr.ThreadSynchronizer, bool), + 'doc': 'Zarr synchronizer to use for parallel I/O. If set to True a ProcessSynchronizer is used.', + 'default': None}, + {'name': 'object_codec_class', 'type': None, + 'doc': 'Set the numcodec object codec class to be used to encode objects.' + 'Use numcodecs.pickles.Pickle by default.', + 'default': None}, + {'name': 'storage_options', 'type': dict, + 'doc': 'Zarr storage options to read remote folders', + 'default': None}) + def __init__(self, **kwargs): + self.logger = logging.getLogger('%s.%s' % (self.__class__.__module__, self.__class__.__qualname__)) + path, manager, mode, synchronizer, object_codec_class, storage_options = popargs( + 'path', 'manager', 'mode', 'synchronizer', 'object_codec_class', 'storage_options', kwargs) + if manager is None: + manager = BuildManager(TypeMap(NamespaceCatalog())) + if isinstance(synchronizer, bool): + if synchronizer: + sync_path = tempfile.mkdtemp() + self.__synchronizer = zarr.ProcessSynchronizer(sync_path) + else: + self.__synchronizer = None + else: + self.__synchronizer = synchronizer + self.__mode = mode + self.__path = path + self.__file = None + self.__storage_options = storage_options + self.__built = dict() + self._written_builders = WriteStatusTracker() # track which builders were written (or read) by this IO object + self.__dci_queue = None # Will be initialized on call to io.write + # Codec class to be used. Alternates, e.g., =numcodecs.JSON + self.__codec_cls = numcodecs.pickles.Pickle if object_codec_class is None else object_codec_class + source_path = self.__path + if isinstance(self.__path, SUPPORTED_ZARR_STORES): + source_path = self.__path.path + super().__init__(manager, source=source_path) + + @property + def file(self): + """ + The Zarr zarr.hierarchy.Group (or zarr.core.Array) opened by the backend. 
+ May be None in case open has not been called yet, e.g., if no data has been + read or written yet via this instance. + """ + return self.__file + + @property + def path(self): + """The path to the Zarr file as set by the user""" + return self.__path + + @property + def abspath(self): + """The absolute path to the Zarr file""" + return os.path.abspath(self.source) + + @property + def synchronizer(self): + return self.__synchronizer + + @property + def object_codec_class(self): + return self.__codec_cls + + def open(self): + """Open the Zarr file""" + if self.__file is None: + self.__file = self.__open_file_consolidated(store=self.path, + mode=self.__mode, + synchronizer=self.__synchronizer, + storage_options=self.__storage_options) + + def close(self): + """Close the Zarr file""" + self.__file = None + return + + def is_remote(self): + """Return True if the file is remote, False otherwise""" + from zarr.storage import FSStore + if isinstance(self.file.store, FSStore): + return True + else: + return False + + @classmethod + @docval({'name': 'namespace_catalog', + 'type': (NamespaceCatalog, TypeMap), + 'doc': 'the NamespaceCatalog or TypeMap to load namespaces into'}, + {'name': 'path', + 'type': (str, *SUPPORTED_ZARR_STORES), + 'doc': 'the path to the Zarr file or a supported Zarr store'}, + {'name': 'namespaces', 'type': list, 'doc': 'the namespaces to load', 'default': None}) + def load_namespaces(cls, namespace_catalog, path, namespaces=None): + ''' + Load cached namespaces from a file. + ''' + # TODO: how to use storage_options here? + # f = self.__open_file_consolidated(source=path, mode='r') + f = zarr.open(path, mode='r') + if SPEC_LOC_ATTR not in f.attrs: + msg = "No cached namespaces found in %s" % path + warnings.warn(msg) + else: + spec_group = f[f.attrs[SPEC_LOC_ATTR]] + if namespaces is None: + namespaces = list(spec_group.keys()) + for ns in namespaces: + ns_group = spec_group[ns] + latest_version = list(ns_group.keys())[-1] + ns_group = ns_group[latest_version] + reader = ZarrSpecReader(ns_group) + namespace_catalog.load_namespaces('namespace', reader=reader) + + @docval( + {'name': 'container', 'type': Container, 'doc': 'the Container object to write'}, + {'name': 'cache_spec', 'type': bool, 'doc': 'cache specification to file', 'default': True}, + {'name': 'link_data', 'type': bool, + 'doc': 'If not specified otherwise link (True) or copy (False) Datasets', 'default': True}, + {'name': 'exhaust_dci', 'type': bool, + 'doc': 'exhaust DataChunkIterators one at a time. If False, add ' + + 'them to the internal queue self.__dci_queue and exhaust them concurrently at the end', + 'default': True}, + { + "name": "number_of_jobs", + "type": int, + "doc": ( + "Number of jobs to use in parallel during write " + "(only works with GenericDataChunkIterator-wrapped datasets)." + ), + "default": 1, + }, + { + "name": "max_threads_per_process", + "type": int, + "doc": ( + "Limits the number of threads used by each process. The default is None (no limits)." + ), + "default": None, + }, + { + "name": "multiprocessing_context", + "type": str, + "doc": ( + "Context for multiprocessing. It can be None (default), 'fork' or 'spawn'. " + "Note that 'fork' is only available on UNIX systems (not Windows)." 
+ ), + "default": None, + }, + ) + def write(self, **kwargs): + """Overwrite the write method to add support for caching the specification and parallelization.""" + cache_spec, number_of_jobs, max_threads_per_process, multiprocessing_context = popargs( + "cache_spec", "number_of_jobs", "max_threads_per_process", "multiprocessing_context", kwargs + ) + + self.__dci_queue = ZarrIODataChunkIteratorQueue( + number_of_jobs=number_of_jobs, + max_threads_per_process=max_threads_per_process, + multiprocessing_context=multiprocessing_context, + ) + + super(ZarrIO, self).write(**kwargs) + if cache_spec: + self.__cache_spec() + + def __cache_spec(self): + """Interanl function used to cache the spec in the current file""" + ref = self.__file.attrs.get(SPEC_LOC_ATTR) + spec_group = None + if ref is not None: + spec_group = self.__file[ref] + else: + path = DEFAULT_SPEC_LOC_DIR # do something to figure out where the specifications should go + spec_group = self.__file.require_group(path) + self.__file.attrs[SPEC_LOC_ATTR] = path + ns_catalog = self.manager.namespace_catalog + for ns_name in ns_catalog.namespaces: + ns_builder = NamespaceToBuilderHelper.convert_namespace(ns_catalog, ns_name) + namespace = ns_catalog.get_namespace(ns_name) + if namespace.version is None: + group_name = '%s/unversioned' % ns_name + else: + group_name = '%s/%s' % (ns_name, namespace.version) + ns_group = spec_group.require_group(group_name) + writer = ZarrSpecWriter(ns_group) + ns_builder.export('namespace', writer=writer) + + @docval( + *get_docval(HDMFIO.export), + {'name': 'cache_spec', 'type': bool, 'doc': 'whether to cache the specification to file', 'default': True}, + { + "name": "number_of_jobs", + "type": int, + "doc": ( + "Number of jobs to use in parallel during write " + "(only works with GenericDataChunkIterator-wrapped datasets)." + ), + "default": 1, + }, + { + "name": "max_threads_per_process", + "type": int, + "doc": ( + "Limits the number of threads used by each process. The default is None (no limits)." + ), + "default": None, + }, + { + "name": "multiprocessing_context", + "type": str, + "doc": ( + "Context for multiprocessing. It can be None (default), 'fork' or 'spawn'. " + "Note that 'fork' is only available on UNIX systems (not Windows)." + ), + "default": None, + }, + ) + def export(self, **kwargs): + """Export data read from a file from any backend to Zarr. + See :py:meth:`hdmf.backends.io.HDMFIO.export` for more details. + """ + if self.__mode != 'w': + raise UnsupportedOperation("Cannot export to file %s in mode '%s'. Please use mode 'w'." + % (self.source, self.__mode)) + + src_io = getargs('src_io', kwargs) + write_args, cache_spec = popargs('write_args', 'cache_spec', kwargs) + number_of_jobs, max_threads_per_process, multiprocessing_context = popargs( + "number_of_jobs", "max_threads_per_process", "multiprocessing_context", kwargs + ) + + self.__dci_queue = ZarrIODataChunkIteratorQueue( + number_of_jobs=number_of_jobs, + max_threads_per_process=max_threads_per_process, + multiprocessing_context=multiprocessing_context, + ) + + if not isinstance(src_io, ZarrIO) and write_args.get('link_data', True): + raise UnsupportedOperation("Cannot export from non-Zarr backend %s to Zarr with write argument " + "link_data=True." 
% src_io.__class__.__name__) + + write_args['export_source'] = src_io.source # pass export_source=src_io.source to write_builder + ckwargs = kwargs.copy() + ckwargs['write_args'] = write_args + super().export(**ckwargs) + if cache_spec: + self.__cache_spec() + + def get_written(self, builder, check_on_disk=False): + """ + Return True if this builder has been written to (or read from) disk by this IO object, False otherwise. + + :param builder: Builder object to get the written flag for + :type builder: Builder + :param check_on_disk: Check that the builder has been physically written to disk not just flagged as written + by this I/O backend + :type check_on_disk: bool + :return: True if the builder is found in self._written_builders using the builder ID, False otherwise. If + check_on_disk is enabled then the function cals get_builder_exists_on_disk in addtion to verify + that the builder has indeed been written to disk. + """ + written = self._written_builders.get_written(builder) + if written and check_on_disk: + written = written and self.get_builder_exists_on_disk(builder=builder) + return written + + @docval({'name': 'builder', 'type': Builder, 'doc': 'The builder of interest'}) + def get_builder_exists_on_disk(self, **kwargs): + """ + Convenience function to check whether a given builder exists on disk in this Zarr file. + """ + builder = getargs('builder', kwargs) + builder_path = self.get_builder_disk_path(builder=builder, filepath=None) + exists_on_disk = os.path.exists(builder_path) + return exists_on_disk + + @docval({'name': 'builder', 'type': Builder, 'doc': 'The builder of interest'}, + {'name': 'filepath', 'type': str, + 'doc': 'The path to the Zarr file or None for this file', 'default': None}) + def get_builder_disk_path(self, **kwargs): + builder, filepath = getargs('builder', 'filepath', kwargs) + basepath = filepath if filepath is not None else self.source + builder_path = os.path.join(basepath, self.__get_path(builder).lstrip("/")) + return builder_path + + @docval( + {'name': 'builder', 'type': GroupBuilder, 'doc': 'the GroupBuilder object representing the NWBFile'}, + { + 'name': 'link_data', + 'type': bool, + 'doc': 'If not specified otherwise link (True) or copy (False) Zarr Datasets', + 'default': True + }, + { + 'name': 'exhaust_dci', + 'type': bool, + 'doc': ( + 'Exhaust DataChunkIterators one at a time. 
If False, add ' + 'them to the internal queue self.__dci_queue and exhaust them concurrently at the end' + ), + 'default': True, + }, + { + 'name': 'export_source', + 'type': str, + 'doc': 'The source of the builders when exporting', + 'default': None, + }, + ) + def write_builder(self, **kwargs): + """Write a builder to disk.""" + f_builder, link_data, exhaust_dci, export_source = getargs( + 'builder', 'link_data', 'exhaust_dci', 'export_source', kwargs + ) + for name, gbldr in f_builder.groups.items(): + self.write_group( + parent=self.__file, + builder=gbldr, + link_data=link_data, + exhaust_dci=exhaust_dci, + export_source=export_source, + ) + for name, dbldr in f_builder.datasets.items(): + self.write_dataset( + parent=self.__file, + builder=dbldr, + link_data=link_data, + exhaust_dci=exhaust_dci, + export_source=export_source, + ) + self.write_attributes(self.__file, f_builder.attributes) # the same as set_attributes in HDMF + self.__dci_queue.exhaust_queue() # Write any remaining DataChunkIterators that have been queued + self._written_builders.set_written(f_builder) + self.logger.debug("Done writing %s '%s' to path '%s'" % + (f_builder.__class__.__qualname__, f_builder.name, self.source)) + + # Consolidate metadata for the entire file after everything has been written + zarr.consolidate_metadata(self.__file, metadata_key='.zmetadata') + + def consolidate_metadata(self): + """ + When a file is written, the metadata within the file is consolidated automatically. + If there are any metadata changes, the user needs to consolidate the metadata again + with this method in order for the metadata to be read correctly. + + Consolidate all metadata for groups and arrays within the given store into a + single resource and put it under .zmetadata. + """ + zarr.consolidate_metadata(self.__path, metadata_key='.zmetadata') + + def __open_file_consolidated(self, + store, + mode, + synchronizer = None, + storage_options = None): + """ + This method will check to see if the metadata has been consolidated, if so + """ + + if os.path.isfile(self.__path+'/.zmetadata'): + zarr.open_consolidated(store=store, + mode=mode, + synchronizer=synchronizer, + storage_options=storage_options) + else: + msg = "Could not find consolidated metadata." + warnings.warn(msg) + + zarr.open(store=store, + mode=mode, + synchronizer=synchronizer, + storage_options=storage_options) + + @docval({'name': 'parent', 'type': Group, 'doc': 'the parent Zarr object'}, + {'name': 'builder', 'type': GroupBuilder, 'doc': 'the GroupBuilder to write'}, + {'name': 'link_data', 'type': bool, + 'doc': 'If not specified otherwise link (True) or copy (False) Zarr Datasets', 'default': True}, + {'name': 'exhaust_dci', 'type': bool, + 'doc': 'exhaust DataChunkIterators one at a time. 
If False, add ' + + 'them to the internal queue self.__dci_queue and exhaust them concurrently at the end', + 'default': True}, + {'name': 'export_source', 'type': str, + 'doc': 'The source of the builders when exporting', 'default': None}, + returns='the Group that was created', rtype='Group') + def write_group(self, **kwargs): + """Write a GroupBuider to file""" + parent, builder, link_data, exhaust_dci, export_source = getargs( + 'parent', 'builder', 'link_data', 'exhaust_dci', 'export_source', kwargs + ) + + if self.get_written(builder): + group = parent[builder.name] + else: + group = parent.require_group(builder.name) + + subgroups = builder.groups + if subgroups: + for subgroup_name, sub_builder in subgroups.items(): + self.write_group( + parent=group, + builder=sub_builder, + link_data=link_data, + exhaust_dci=exhaust_dci, + ) + + datasets = builder.datasets + if datasets: + for dset_name, sub_builder in datasets.items(): + self.write_dataset( + parent=group, + builder=sub_builder, + link_data=link_data, + exhaust_dci=exhaust_dci, + export_source=export_source, + ) + + # write all links (haven implemented) + links = builder.links + if links: + for link_name, sub_builder in links.items(): + self.write_link(group, sub_builder) + + attributes = builder.attributes + self.write_attributes(group, attributes) + self._written_builders.set_written(builder) # record that the builder has been written + return group + + @docval({'name': 'obj', 'type': (Group, Array), 'doc': 'the Zarr object to add attributes to'}, + {'name': 'attributes', + 'type': dict, + 'doc': 'a dict containing the attributes on the Group or Dataset, indexed by attribute name'}, + {'name': 'export_source', 'type': str, + 'doc': 'The source of the builders when exporting', 'default': None}) + def write_attributes(self, **kwargs): + """Set (i.e., write) the attributes on a given Zarr Group or Array.""" + obj, attributes, export_source = getargs('obj', 'attributes', 'export_source', kwargs) + + for key, value in attributes.items(): + # Case 1: list, set, tuple type attributes + if isinstance(value, (set, list, tuple)) or (isinstance(value, np.ndarray) and np.ndim(value) != 0): + # Convert to tuple for writing (e.g., numpy arrays are not JSON serializable) + if isinstance(value, np.ndarray): + tmp = tuple(value.tolist()) + else: + tmp = tuple(value) + # Attempt write of the attribute + try: + obj.attrs[key] = tmp + # Numpy scalars and bytes are not JSON serializable. 
Try to convert to a serializable type instead + except TypeError as e: + try: + tmp = tuple([i.item() + if (isinstance(i, np.generic) and not isinstance(i, np.bytes_)) + else i.decode("utf-8") + if isinstance(i, (bytes, np.bytes_)) + else i + for i in value]) + obj.attrs[key] = tmp + except: # noqa: E722 + raise TypeError(str(e) + " type=" + str(type(value)) + " data=" + str(value)) from e + # Case 2: References + elif isinstance(value, (Container, Builder, ReferenceBuilder)): + # TODO: Region References are not yet supported + # if isinstance(value, RegionBuilder): + # type_str = 'region' + # refs = self.__get_ref(value.builder) + if isinstance(value, (ReferenceBuilder, Container, Builder)): + type_str = 'object' + if isinstance(value, Builder): + refs = self.__get_ref(value, export_source) + else: + refs = self.__get_ref(value.builder, export_source) + tmp = {'zarr_dtype': type_str, 'value': refs} + obj.attrs[key] = tmp + # Case 3: Scalar attributes + else: + # Attempt to write the attribute + try: + obj.attrs[key] = value + # Numpy scalars and bytes are not JSON serializable. Try to convert to a serializable type instead + except TypeError as e: + try: + val = value.item if isinstance(value, np.ndarray) else value + val = value.item() \ + if (isinstance(value, np.generic) and not isinstance(value, np.bytes_)) \ + else val.decode("utf-8") \ + if isinstance(value, (bytes, np.bytes_)) \ + else val + obj.attrs[key] = val + except: # noqa: E722 + msg = str(e) + "key=" + key + " type=" + str(type(value)) + " data=" + str(value) + raise TypeError(msg) from e + + def __get_path(self, builder): + """Get the path to the builder. + If builder.location is set then it is used as the path, otherwise the function + determines the path by constructing it iteratively from the parents of the + builder. + """ + if builder.location is not None: + path = os.path.normpath(os.path.join(builder.location, builder.name)).replace("\\", "/") + else: + curr = builder + names = list() + while curr is not None and curr.name != ROOT_NAME: + names.append(curr.name) + curr = curr.parent + delim = "/" + path = "%s%s" % (delim, delim.join(reversed(names))) + return path + + @staticmethod + def get_zarr_paths(zarr_object): + """ + For a Zarr object find 1) the path to the main zarr file it is in and 2) the path to the object within the file + :param zarr_object: Object for which we are looking up the path + :type zarr_object: Zarr Group or Array + :return: Tuple of two string with: 1) path of the Zarr file and 2) full path within the zarr file to the object + """ + # In Zarr the path is a combination of the path of the store and the path of the object. So we first need to + # merge those two paths, then remove the path of the file, add the missing leading "/" and then compute the + # directory name to get the path of the parent + fullpath = os.path.normpath(os.path.join(zarr_object.store.path, zarr_object.path)).replace("\\", "/") + # To determine the filepath we now iterate over the path and check if the .zgroup object exists at + # a level, indicating that we are still within the Zarr file. 
The first level we hit where the parent + # directory does not have a .zgroup means we have found the main file + filepath = fullpath + while os.path.exists(os.path.join(os.path.dirname(filepath), ".zgroup")): + filepath = os.path.dirname(filepath) + # From the fullpath and filepath we can now compute the objectpath within the zarr file as the relative + # path from the filepath to the object + objectpath = "/" + os.path.relpath(fullpath, filepath) + # return the result + return filepath, objectpath + + @staticmethod + def get_zarr_parent_path(zarr_object): + """ + Get the location of the parent of a zarr_object within the file + :param zarr_object: Object for which we are looking up the path + :type zarr_object: Zarr Group or Array + :return: String with the path + """ + filepath, objectpath = ZarrIO.get_zarr_paths(zarr_object) + parentpath = os.path.dirname(objectpath) + return parentpath + + @staticmethod + def is_zarr_file(path): + """ + Check if the given path defines a Zarr file + :param path: Full path to main directory + :return: Bool + """ + if os.path.exists(path): + if os.path.isdir(path): + if os.path.exists(os.path.join(path, ".zgroup")): + return True + return False + + def __is_ref(self, dtype): + if isinstance(dtype, DtypeSpec): + return self.__is_ref(dtype.dtype) + elif isinstance(dtype, RefSpec): + return True + elif isinstance(dtype, np.dtype): + return False + else: + return dtype == DatasetBuilder.OBJECT_REF_TYPE or dtype == DatasetBuilder.REGION_REF_TYPE + + def resolve_ref(self, zarr_ref): + """ + Get the full path to the object linked to by the zarr reference + + The function only constructs the links to the targe object, but it does not check if the object exists + + :param zarr_ref: Dict with `source` and `path` keys or a `ZarrReference` object + :return: 1) name of the target object + 2) the target zarr object within the target file + """ + # Extract the path as defined in the zarr_ref object + if zarr_ref.get('source', None) is None: + source_file = str(zarr_ref['path']) + else: + source_file = str(zarr_ref['source']) + # Resolve the path relative to the current file + if not self.is_remote(): + source_file = os.path.abspath(os.path.join(self.source, source_file)) + else: + # get rid of extra "/" and "./" in the path root and source_file + root_path = str(self.path).rstrip("/") + source_path = str(source_file).lstrip(".") + source_file = root_path + source_path + + object_path = zarr_ref.get('path', None) + if object_path: + target_name = os.path.basename(object_path) + else: + target_name = ROOT_NAME + + target_zarr_obj = zarr.open(source_file, mode='r', storage_options=self.__storage_options) + # target_zarr_obj = self.__open_file_consolidated(store=source_file, mode='r', storage_options=self.__storage_options) + if object_path is not None: + try: + target_zarr_obj = target_zarr_obj[object_path] + except Exception: + raise ValueError("Found bad link to object %s in file %s" % (object_path, source_file)) + # Return the create path + return target_name, target_zarr_obj + + def __get_ref(self, ref_object, export_source=None): + """ + Create a ZarrReference object that points to the given container + + :param ref_object: the object to be referenced + :type ref_object: Builder, Container, ReferenceBuilder + :returns: ZarrReference object + """ + if isinstance(ref_object, RegionBuilder): # or region is not None: TODO: Add to support regions + raise NotImplementedError("Region references are currently not supported by ZarrIO") + if isinstance(ref_object, Builder): + 
if isinstance(ref_object, LinkBuilder): + builder = ref_object.target_builder + else: + builder = ref_object + elif isinstance(ref_object, ReferenceBuilder): + builder = ref_object.builder + else: + builder = self.manager.build(ref_object) + path = self.__get_path(builder) + # TODO Add to get region for region references. + # Also add {'name': 'region', 'type': (slice, list, tuple), + # 'doc': 'the region reference indexing object', 'default': None}, + # if isinstance(ref_object, RegionBuilder): + # region = ref_object.region + + # get the object id if available + object_id = builder.get('object_id', None) + + # determine the object_id of the source by following the parents of the builder until we find the root + # the root builder should be the same as the source file containing the reference + curr = builder + while curr is not None and curr.name != ROOT_NAME: + curr = curr.parent + if curr: + source_object_id = curr.get('object_id', None) + # We did not find ROOT_NAME as a parent. This should only happen if we have an invalid + # file as a source, e.g., if during testing we use an arbitrary builder. We check this + # anyways to avoid potential errors just in case + else: + source_object_id = None + warn_msg = "Could not determine source_object_id for builder with path: %s" % path + warnings.warn(warn_msg) + + # by checking os.isdir makes sure we have a valid link path to a dir for Zarr. For conversion + # between backends a user should always use export which takes care of creating a clean set of builders. + source = (builder.source + if (builder.source is not None and os.path.isdir(builder.source)) + else self.source) + + # Make the source relative to the current file + # TODO: This check assumes that all links are internal links on export. + # Need to deal with external links on export. + if export_source is not None: + # Make sure the source of the reference is now towards the new file + # and not the original source when exporting. + source = '.' 
        else:
            source = os.path.relpath(os.path.abspath(source), start=self.abspath)
        # Return the ZarrReference object
        ref = ZarrReference(
            source=source,
            path=path,
            object_id=object_id,
            source_object_id=source_object_id)
        return ref

    def __add_link__(self, parent, target_source, target_path, link_name):
        """
        Add a link to the file
        :param parent: The parent Zarr group containing the link
        :type parent: zarr.hierarchy.Group
        :param target_source: Path to the Zarr file containing the linked object
        :type target_source: str
        :param target_path: Path within the Zarr file to the linked object
        :param link_name: Name of the link
        :type link_name: str
        """
        if 'zarr_link' not in parent.attrs:
            parent.attrs['zarr_link'] = []
        zarr_link = list(parent.attrs['zarr_link'])
        zarr_link.append({'source': target_source, 'path': target_path, 'name': link_name})
        parent.attrs['zarr_link'] = zarr_link

    @docval({'name': 'parent', 'type': Group, 'doc': 'the parent Zarr object'},
            {'name': 'builder', 'type': LinkBuilder, 'doc': 'the LinkBuilder to write'})
    def write_link(self, **kwargs):
        parent, builder = getargs('parent', 'builder', kwargs)
        if self.get_written(builder):
            self.logger.debug("Skipping LinkBuilder '%s' already written to parent group '%s'"
                              % (builder.name, parent.name))
            return
        self.logger.debug("Writing LinkBuilder '%s' to parent group '%s'" % (builder.name, parent.name))
        name = builder.name
        target_builder = builder.builder
        # Get the reference
        zarr_ref = self.__get_ref(target_builder)
        # EXPORT WITH LINKS: Fix link source
        # if the target and source are both the same, then we need to ALWAYS use ourselves as a source
        # When exporting from one source to another, the LinkBuilders.source are not updated, i.e., the
        # builder.source and target_builder.source are not being updated and point to the old file, but
        # for internal links (a.k.a., SoftLinks) they will be the same and our target will be part of
        # our new file, so we can safely replace the source
        if builder.source == target_builder.source:
            zarr_ref.source = "."  # Link should be relative to self
        # EXPORT WITH LINKS: Make sure the target has been written. If it has not, and the target points
        #                    to a non-Zarr source, then we need to copy the data instead of writing a
        #                    link to the data
        # When exporting from a different backend, we may encounter external links to
        # other datasets, groups (or links) in another file. Since they are from another
        # backend, we must ensure that those targets are copied as well, so we check here
        # if our target_builder has been written and write it if it hasn't
        # TODO: Review the logic for when we need to copy data and when to link it. We may need the export_source?
+ """ + skip_link = False + if not self.get_written(target_builder): + if not self.is_zarr_file(target_builder.source): + # We need to copy the target in place of the link so we need to + # change the name of target_builder to match the link instead + temp = copy(target_builder.name) + target_builder._Builder__name = name + # Skip writing the link since we copied the data into place + skip_link = True + if isinstance(target_builder, DatasetBuilder): + self.write_dataset(parent=parent, builder=target_builder) + elif isinstance(target_builder, GroupBuilder): + self.write_group(parent=parent, builder=target_builder) + elif isinstance(target_builder, LinkBuilder): + self.write_link(parent=parent, builder=target_builder) + target_builder._Builder__name = temp + # REGULAR LINK I/O: + # Write the actual link as we should in most cases. Skip it only if we copied the + # data from an external source in place instead + if not skip_link: + self.__add_link__(parent, zarr_ref.source, zarr_ref.path, name) + """ + self.__add_link__(parent, zarr_ref.source, zarr_ref.path, name) + self._written_builders.set_written(builder) # record that the builder has been written + + @classmethod + def __setup_chunked_dataset__(cls, parent, name, data, options=None): + """ + Setup a dataset for writing to one-chunk-at-a-time based on the given DataChunkIterator. This + is a helper function for write_dataset() + :param parent: The parent object to which the dataset should be added + :type parent: Zarr Group or File + :param name: The name of the dataset + :type name: str + :param data: The data to be written. + :type data: AbstractDataChunkIterator + :param options: Dict with options for creating a dataset. available options are 'dtype' and 'io_settings' + :type options: dict + """ + io_settings = {} + if options is not None: + if 'io_settings' in options: + io_settings = options.get('io_settings') + # Define the chunking options if the user has not set them explicitly. We need chunking for the iterative write. + if 'chunks' not in io_settings: + recommended_chunks = data.recommended_chunk_shape() + io_settings['chunks'] = True if recommended_chunks is None else recommended_chunks + # Define the shape of the data if not provided by the user + if 'shape' not in io_settings: + io_settings['shape'] = data.recommended_data_shape() + if 'dtype' not in io_settings: + if (options is not None) and ('dtype' in options): + io_settings['dtype'] = options['dtype'] + else: + io_settings['dtype'] = data.dtype + if isinstance(io_settings['dtype'], str): + # map to real dtype if we were given a string + io_settings['dtype'] = cls.__dtypes.get(io_settings['dtype']) + try: + dset = parent.create_dataset(name, **io_settings) + dset.attrs['zarr_dtype'] = np.dtype(io_settings['dtype']).str + except Exception as exc: + raise Exception("Could not create dataset %s in %s" % (name, parent.name)) from exc + return dset + + @docval({'name': 'parent', 'type': Group, 'doc': 'the parent Zarr object'}, # noqa: C901 + {'name': 'builder', 'type': DatasetBuilder, 'doc': 'the DatasetBuilder to write'}, + {'name': 'link_data', 'type': bool, + 'doc': 'If not specified otherwise link (True) or copy (False) Zarr Datasets', 'default': True}, + {'name': 'exhaust_dci', 'type': bool, + 'doc': 'exhaust DataChunkIterators one at a time. 
If False, add ' + + 'them to the internal queue self.__dci_queue and exhaust them concurrently at the end', + 'default': True}, + {'name': 'force_data', 'type': None, + 'doc': 'Used internally to force the data being used when we have to load the data', 'default': None}, + {'name': 'export_source', 'type': str, + 'doc': 'The source of the builders when exporting', 'default': None}, + returns='the Zarr array that was created', rtype=Array) + def write_dataset(self, **kwargs): # noqa: C901 + parent, builder, link_data, exhaust_dci, export_source = getargs( + 'parent', 'builder', 'link_data', 'exhaust_dci', 'export_source', kwargs + ) + + force_data = getargs('force_data', kwargs) + + if exhaust_dci and self.__dci_queue is None: + self.__dci_queue = ZarrIODataChunkIteratorQueue() + + if self.get_written(builder): + return None + name = builder.name + data = builder.data if force_data is None else force_data + options = dict() + if isinstance(data, ZarrDataIO): + options['io_settings'] = data.io_settings + link_data = data.link_data + data = data.data + else: + options['io_settings'] = {} + + attributes = builder.attributes + options['dtype'] = builder.dtype + + linked = False + + # Write a regular Zarr array + dset = None + if isinstance(data, Array): + # copy the dataset + if link_data: + self.__add_link__(parent, data.store.path, data.name, name) + linked = True + dset = None + else: + zarr.copy(data, parent, name=name) + dset = parent[name] + # When converting data between backends we may see an HDMFDataset, e.g., a H55ReferenceDataset, with references + elif isinstance(data, HDMFDataset): + # If we have a dataset of containers we need to make the references to the containers + if len(data) > 0 and isinstance(data[0], Container): + ref_data = [self.__get_ref(data[i], export_source=export_source) for i in range(len(data))] + shape = (len(data), ) + type_str = 'object' + dset = parent.require_dataset(name, + shape=shape, + dtype=object, + object_codec=self.__codec_cls(), + **options['io_settings']) + dset.attrs['zarr_dtype'] = type_str + dset[:] = ref_data + self._written_builders.set_written(builder) # record that the builder has been written + # If we have a regular dataset, then load the data and write the builder after load + else: + # TODO This code path is also exercised when data is a + # hdmf.backends.hdf5.h5_utils.BuilderH5ReferenceDataset (aka. 
ReferenceResolver) + # check that this is indeed the right thing to do here + + # We can/should not update the data in the builder itself so we load the data here and instead + # force write_dataset when we call it recursively to use the data we loaded, rather than the + # dataset that is set on the builder + dset = self.write_dataset(parent=parent, + builder=builder, + link_data=link_data, + force_data=data[:], + export_source=export_source) + self._written_builders.set_written(builder) # record that the builder has been written + # Write a compound dataset + elif isinstance(options['dtype'], list): + refs = list() + type_str = list() + for i, dts in enumerate(options['dtype']): + if self.__is_ref(dts['dtype']): + refs.append(i) + ref_tmp = self.__get_ref(data[0][i], export_source=export_source) + if isinstance(ref_tmp, ZarrReference): + dts_str = 'object' + else: + dts_str = 'region' + type_str.append({'name': dts['name'], 'dtype': dts_str}) + else: + i = list([dts, ]) + t = self.__resolve_dtype_helper__(i) + type_str.append(self.__serial_dtype__(t)[0]) + + if len(refs) > 0: + dset = parent.require_dataset(name, + shape=(len(data), ), + dtype=object, + object_codec=self.__codec_cls(), + **options['io_settings']) + self._written_builders.set_written(builder) # record that the builder has been written + dset.attrs['zarr_dtype'] = type_str + for j, item in enumerate(data): + new_item = list(item) + for i in refs: + new_item[i] = self.__get_ref(item[i], export_source=export_source) + dset[j] = new_item + else: + # write a compound datatype + dset = self.__list_fill__(parent, name, data, options) + # Write a dataset of references + elif self.__is_ref(options['dtype']): + # TODO Region references are not yet support, but here how the code should look + # if isinstance(data, RegionBuilder): + # shape = (1,) + # type_str = 'region' + # refs = self.__get_ref(data.builder, data.region) + if isinstance(data, ReferenceBuilder): + shape = (1,) + type_str = 'object' + refs = self.__get_ref(data.builder, export_source=export_source) + # TODO: Region References are not yet supported + # elif options['dtype'] == 'region': + # shape = (len(data), ) + # type_str = 'region' + # refs = [self.__get_ref(item.builder, item.region) for item in data] + else: + shape = (len(data), ) + type_str = 'object' + refs = [self.__get_ref(item, export_source=export_source) for item in data] + + dset = parent.require_dataset(name, + shape=shape, + dtype=object, + object_codec=self.__codec_cls(), + **options['io_settings']) + self._written_builders.set_written(builder) # record that the builder has been written + dset.attrs['zarr_dtype'] = type_str + if hasattr(refs, '__len__'): + dset[:] = refs + else: + dset[0] = refs + # write a 'regular' dataset without DatasetIO info + else: + if isinstance(data, (str, bytes)): + dset = self.__scalar_fill__(parent, name, data, options) + # Iterative write of a data chunk iterator + elif isinstance(data, AbstractDataChunkIterator): + dset = self.__setup_chunked_dataset__(parent, name, data, options) + self.__dci_queue.append(dataset=dset, data=data) + elif hasattr(data, '__len__'): + dset = self.__list_fill__(parent, name, data, options) + else: + dset = self.__scalar_fill__(parent, name, data, options) + if not linked: + self.write_attributes(dset, attributes) + # record that the builder has been written + self._written_builders.set_written(builder) + # Exhaust the DataChunkIterator if the dataset was given this way. 
Note this is a no-op + # if the self.__dci_queue is empty + if exhaust_dci: + self.__dci_queue.exhaust_queue() + return dset + + __dtypes = { + "float": np.float32, + "float32": np.float32, + "double": np.float64, + "float64": np.float64, + "long": np.int64, + "int64": np.int64, + "uint64": np.uint64, + "int": np.int32, + "int32": np.int32, + "int16": np.int16, + "int8": np.int8, + "bool": np.bool_, + "bool_": np.bool_, + "text": str, + "utf": str, + "utf8": str, + "utf-8": str, + "ascii": bytes, + "str": str, + "isodatetime": str, + "string_": bytes, + "uint32": np.uint32, + "uint16": np.uint16, + "uint8": np.uint8, + "ref": ZarrReference, + "reference": ZarrReference, + "object": ZarrReference, + "region": ZarrReference, + } + + @classmethod + def __serial_dtype__(cls, dtype): + if isinstance(dtype, type): + return dtype.__name__ + elif isinstance(dtype, np.dtype): + if dtype.names is None: + return dtype.type.__name__ + else: + ret = list() + for n in dtype.names: + item = dict() + item['name'] = n + item['dtype'] = cls.__serial_dtype__(dtype[n]) + ret.append(item) + return ret + # TODO Does not work when Reference in compound datatype + elif dtype == ZarrReference: + return 'object' + + @classmethod + def __resolve_dtype__(cls, dtype, data): + dtype = cls.__resolve_dtype_helper__(dtype) + if dtype is None: + dtype = cls.get_type(data) + return dtype + + @classmethod + def __resolve_dtype_helper__(cls, dtype): + if dtype is None: + return None + elif isinstance(dtype, (type, np.dtype)): + return dtype + elif isinstance(dtype, str): + return cls.__dtypes.get(dtype) + elif isinstance(dtype, dict): + return cls.__dtypes.get(dtype['reftype']) + else: + return np.dtype([(x['name'], cls.__resolve_dtype_helper__(x['dtype'])) for x in dtype]) + + @classmethod + def get_type(cls, data): + if isinstance(data, str): + return str + elif not hasattr(data, '__len__'): + return type(data) + else: + if len(data) == 0: + raise ValueError('cannot determine type for empty data') + return cls.get_type(data[0]) + + __reserve_attribute = ('zarr_dtype', 'zarr_link') + + def __list_fill__(self, parent, name, data, options=None): # noqa: C901 + dtype = None + io_settings = dict() + if options is not None: + dtype = options.get('dtype') + io_settings = options.get('io_settings') + if io_settings is None: + io_settings = dict() + # Determine the dtype + if not isinstance(dtype, type): + try: + dtype = self.__resolve_dtype__(dtype, data) + except Exception as exc: + msg = 'cannot add %s to %s - could not determine type' % (name, parent.name) # noqa: F821 + raise Exception(msg) from exc + + # Set the type_str + type_str = self.__serial_dtype__(dtype) + + # Determine the shape and update the dtype if necessary when dtype==object + if 'shape' in io_settings: # Use the shape set by the user + data_shape = io_settings.pop('shape') + # If we have a numeric numpy array then use its shape + elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.number) or dtype == np.bool_: + data_shape = get_data_shape(data) + # Deal with object dtype + elif isinstance(dtype, np.dtype): + data = data[:] # load the data in case we come from HDF5 or another on-disk data source we don't know + data_shape = (len(data), ) + # if we have a compound data type + if dtype.names: + data_shape = get_data_shape(data) + # If strings are part of our compound type then we need to use Object type instead + # otherwise we try to keep the native compound datatype that numpy is using + for substype in dtype.fields.items(): + if 
np.issubdtype(substype[1][0], np.flexible) or np.issubdtype(substype[1][0], np.object_): + dtype = object + io_settings['object_codec'] = self.__codec_cls() + break + # sometimes bytes and strings can hide as object in numpy array so lets try + # to write those as strings and bytes rather than as objects + elif len(data) > 0 and isinstance(data, np.ndarray): + if isinstance(data.item(0), bytes): + dtype = bytes + data_shape = get_data_shape(data) + elif isinstance(data.item(0), str): + dtype = str + data_shape = get_data_shape(data) + # Set encoding for objects + else: + dtype = object + io_settings['object_codec'] = self.__codec_cls() + # Determine the shape from the data if all other cases have not been hit + else: + data_shape = get_data_shape(data) + + # Create the dataset + dset = parent.require_dataset(name, shape=data_shape, dtype=dtype, **io_settings) + dset.attrs['zarr_dtype'] = type_str + + # Write the data to file + if dtype == object: + for c in np.ndindex(data_shape): + o = data + for i in c: + o = o[i] + # bytes are not JSON serializable + dset[c] = o if not isinstance(o, (bytes, np.bytes_)) else o.decode("utf-8") + return dset + # standard write + else: + try: + dset[:] = data # If data is an h5py.Dataset then this will copy the data + # For compound data types containing strings Zarr sometimes does not like wirting multiple values + # try to write them one-at-a-time instead then + except ValueError: + for i in range(len(data)): + dset[i] = data[i] + return dset + + def __scalar_fill__(self, parent, name, data, options=None): + dtype = None + io_settings = dict() + if options is not None: + dtype = options.get('dtype') + io_settings = options.get('io_settings') + if io_settings is None: + io_settings = dict() + if not isinstance(dtype, type): + try: + dtype = self.__resolve_dtype__(dtype, data) + except Exception as exc: + msg = 'cannot add %s to %s - could not determine type' % (name, parent.name) + raise Exception(msg) from exc + if dtype == object: + io_settings['object_codec'] = self.__codec_cls() + + dset = parent.require_dataset(name, shape=(1, ), dtype=dtype, **io_settings) + dset[:] = data + type_str = 'scalar' + dset.attrs['zarr_dtype'] = type_str + return dset + + @docval(returns='a GroupBuilder representing the NWB Dataset', rtype='GroupBuilder') + def read_builder(self): + f_builder = self.__read_group(self.__file, ROOT_NAME) + return f_builder + + def __set_built(self, zarr_obj, builder): + fpath = zarr_obj.store.path + path = zarr_obj.path + path = os.path.join(fpath, path) + self.__built.setdefault(path, builder) + + @docval({'name': 'zarr_obj', 'type': (Array, Group), + 'doc': 'the Zarr object to the corresponding Container/Data object for'}) + def get_container(self, **kwargs): + """ + Get the container for the corresponding Zarr Group or Dataset + + :raises ValueError: When no builder has been constructed yet for the given h5py object + """ + zarr_obj = getargs('zarr_obj', kwargs) + builder = self.get_builder(zarr_obj) + container = self.manager.construct(builder) + return container # TODO: This method should be moved to HDMFIO + + @docval({'name': 'zarr_obj', 'type': (Array, Group), + 'doc': 'the Zarr object to the corresponding Builder object for'}) + def get_builder(self, **kwargs): # TODO: move this to HDMFIO (define skeleton in there at least) + """ + Get the builder for the corresponding Group or Dataset + + :raises ValueError: When no builder has been constructed + """ + zarr_obj = kwargs['zarr_obj'] + builder = self.__get_built(zarr_obj) + if 
builder is None:
            msg = '%s has not been built' % (zarr_obj.name)
            raise ValueError(msg)
        return builder

    def __get_built(self, zarr_obj):
        """
        Look up a builder for the given zarr object
        :param zarr_obj: The Zarr object to be built
        :type zarr_obj: Zarr Group or Dataset
        :return: Builder in the self.__built cache or None
        """
        fpath = zarr_obj.store.path
        path = zarr_obj.path
        path = os.path.join(fpath, path)
        return self.__built.get(path, None)

    def __read_group(self, zarr_obj, name=None):
        ret = self.__get_built(zarr_obj)
        if ret is not None:
            return ret

        if name is None:
            name = str(os.path.basename(zarr_obj.name))

        # Create the GroupBuilder
        attributes = self.__read_attrs(zarr_obj)
        ret = GroupBuilder(name=name, source=self.source, attributes=attributes)
        ret.location = self.get_zarr_parent_path(zarr_obj)

        # read sub groups
        for sub_name, sub_group in zarr_obj.groups():
            sub_builder = self.__read_group(sub_group, sub_name)
            ret.set_group(sub_builder)

        # read sub datasets
        for sub_name, sub_array in zarr_obj.arrays():
            sub_builder = self.__read_dataset(sub_array, sub_name)
            ret.set_dataset(sub_builder)

        # read the links
        self.__read_links(zarr_obj=zarr_obj, parent=ret)

        self._written_builders.set_written(ret)  # record that the builder has been written
        self.__set_built(zarr_obj, ret)
        return ret

    def __read_links(self, zarr_obj, parent):
        """
        Read the links associated with a zarr group
        :param zarr_obj: The Zarr group we should read links from
        :type zarr_obj: zarr.hierarchy.Group
        :param parent: GroupBuilder with which the links need to be associated
        :type parent: GroupBuilder
        """
        # read links
        if 'zarr_link' in zarr_obj.attrs:
            links = zarr_obj.attrs['zarr_link']
            for link in links:
                link_name = link['name']
                target_name, target_zarr_obj = self.resolve_ref(link)
                # NOTE: __read_group and __read_dataset return the cached builders if the target has already been built
                if isinstance(target_zarr_obj, Group):
                    builder = self.__read_group(target_zarr_obj, target_name)
                else:
                    builder = self.__read_dataset(target_zarr_obj, target_name)
                link_builder = LinkBuilder(builder=builder, name=link_name, source=self.source)
                link_builder.location = os.path.join(parent.location, parent.name)
                self._written_builders.set_written(link_builder)  # record that the builder has been written
                parent.set_link(link_builder)

    def __read_dataset(self, zarr_obj, name):
        ret = self.__get_built(zarr_obj)
        if ret is not None:
            return ret

        if 'zarr_dtype' in zarr_obj.attrs:
            zarr_dtype = zarr_obj.attrs['zarr_dtype']
        elif hasattr(zarr_obj, 'dtype'):  # Fallback for invalid files that are missing zarr_dtype
            zarr_dtype = zarr_obj.dtype
            warnings.warn(
                "Inferred dtype from zarr type. 
Dataset missing zarr_dtype: " + str(name) + " " + str(zarr_obj) + ) + else: + raise ValueError("Dataset missing zarr_dtype: " + str(name) + " " + str(zarr_obj)) + + kwargs = {"attributes": self.__read_attrs(zarr_obj), + "dtype": zarr_dtype, + "maxshape": zarr_obj.shape, + "chunks": not (zarr_obj.shape == zarr_obj.chunks), + "source": self.source} + dtype = kwargs['dtype'] + + # By default, use the zarr.core.Array as data for lazy data load + data = zarr_obj + + # Read scalar dataset + if dtype == 'scalar': + data = zarr_obj[0] + + if isinstance(dtype, list): + # Check compound dataset where one of the subsets contains references + has_reference = False + for i, dts in enumerate(dtype): + if dts['dtype'] in ['object', 'region']: # check items for object reference + has_reference = True + break + retrieved_dtypes = [dtype_dict['dtype'] for dtype_dict in dtype] + if has_reference: + # TODO: BuilderZarrTableDataset does not yet support region reference + data = BuilderZarrTableDataset(zarr_obj, self, retrieved_dtypes) + elif self.__is_ref(dtype): + # Array of references + if dtype == 'object': + data = BuilderZarrReferenceDataset(data, self) + # TODO: Resolution of Region reference not yet supported by BuilderZarrRegionDataset + # elif dtype == 'region': + # data = BuilderZarrRegionDataset(data, self) + + kwargs['data'] = data + if name is None: + name = str(os.path.basename(zarr_obj.name)) + ret = DatasetBuilder(name, **kwargs) # create builder object for dataset + ret.location = self.get_zarr_parent_path(zarr_obj) + self._written_builders.set_written(ret) # record that the builder has been written + self.__set_built(zarr_obj, ret) + return ret + + def __read_attrs(self, zarr_obj): + ret = dict() + for k in zarr_obj.attrs.keys(): + if k not in self.__reserve_attribute: + v = zarr_obj.attrs[k] + if isinstance(v, dict) and 'zarr_dtype' in v: + if v['zarr_dtype'] == 'object': + target_name, target_zarr_obj = self.resolve_ref(v['value']) + if isinstance(target_zarr_obj, zarr.hierarchy.Group): + ret[k] = self.__read_group(target_zarr_obj, target_name) + else: + ret[k] = self.__read_dataset(target_zarr_obj, target_name) + # TODO Need to implement region references for attributes + elif v['zarr_dtype'] == 'region': + raise NotImplementedError("Read of region references from attributes not implemented in ZarrIO") + else: + raise NotImplementedError("Unsupported zarr_dtype for attribute " + str(v)) + else: + ret[k] = v + return ret From 33b4d0b34fc799772218ee95fa1216defe9166c8 Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Thu, 30 Nov 2023 15:43:45 -0800 Subject: [PATCH 03/25] checkpoint --- src/hdmf_zarr/backend2.py | 1420 ------------------------------------- 1 file changed, 1420 deletions(-) delete mode 100644 src/hdmf_zarr/backend2.py diff --git a/src/hdmf_zarr/backend2.py b/src/hdmf_zarr/backend2.py deleted file mode 100644 index 505031ea..00000000 --- a/src/hdmf_zarr/backend2.py +++ /dev/null @@ -1,1420 +0,0 @@ -"""Module with the Zarr-based I/O-backend for HDMF""" -# Python imports -import os -import warnings -import numpy as np -import tempfile -import logging - -# Zarr imports -import zarr -from zarr.hierarchy import Group -from zarr.core import Array -from zarr.storage import (DirectoryStore, - TempStore, - NestedDirectoryStore) -import numcodecs - -# HDMF-ZARR imports -from .utils import (ZarrDataIO, - ZarrReference, - ZarrSpecWriter, - ZarrSpecReader, - ZarrIODataChunkIteratorQueue) -from .zarr_utils import BuilderZarrReferenceDataset, BuilderZarrTableDataset - -# HDMF imports -from 
hdmf.backends.io import HDMFIO -from hdmf.backends.errors import UnsupportedOperation -from hdmf.backends.utils import (NamespaceToBuilderHelper, - WriteStatusTracker) -from hdmf.utils import (docval, - getargs, - popargs, - get_docval, - get_data_shape) -from hdmf.build import (Builder, - GroupBuilder, - DatasetBuilder, - LinkBuilder, - BuildManager, - RegionBuilder, - ReferenceBuilder, - TypeMap) -from hdmf.data_utils import AbstractDataChunkIterator -from hdmf.spec import (RefSpec, - DtypeSpec, - NamespaceCatalog) -from hdmf.query import HDMFDataset -from hdmf.container import Container - -# Module variables -ROOT_NAME = 'root' -""" -Name of the root builder for read/write -""" - -SPEC_LOC_ATTR = '.specloc' -""" -Reserved attribute storing the path to the Group where the schema for the file are cached -""" - -DEFAULT_SPEC_LOC_DIR = 'specifications' -""" -Default name of the group where specifications should be cached -""" - -SUPPORTED_ZARR_STORES = (DirectoryStore, - TempStore, - NestedDirectoryStore) -""" -Tuple listing all Zarr storage backends supported by ZarrIO -""" - - -class ZarrIO(HDMFIO): - - @staticmethod - def can_read(path): - try: - # TODO: how to use storage_options? Maybe easier to just check for ".zarr" suffix - zarr.open(path, mode="r") - self.__open_file_consolidated(source=path, mode='r') - return True - except Exception: - return False - - @docval({'name': 'path', - 'type': (str, *SUPPORTED_ZARR_STORES), - 'doc': 'the path to the Zarr file or a supported Zarr store'}, - {'name': 'manager', 'type': BuildManager, 'doc': 'the BuildManager to use for I/O', 'default': None}, - {'name': 'mode', 'type': str, - 'doc': 'the mode to open the Zarr file with, one of ("w", "r", "r+", "a", "w-")'}, - {'name': 'synchronizer', 'type': (zarr.ProcessSynchronizer, zarr.ThreadSynchronizer, bool), - 'doc': 'Zarr synchronizer to use for parallel I/O. If set to True a ProcessSynchronizer is used.', - 'default': None}, - {'name': 'object_codec_class', 'type': None, - 'doc': 'Set the numcodec object codec class to be used to encode objects.' - 'Use numcodecs.pickles.Pickle by default.', - 'default': None}, - {'name': 'storage_options', 'type': dict, - 'doc': 'Zarr storage options to read remote folders', - 'default': None}) - def __init__(self, **kwargs): - self.logger = logging.getLogger('%s.%s' % (self.__class__.__module__, self.__class__.__qualname__)) - path, manager, mode, synchronizer, object_codec_class, storage_options = popargs( - 'path', 'manager', 'mode', 'synchronizer', 'object_codec_class', 'storage_options', kwargs) - if manager is None: - manager = BuildManager(TypeMap(NamespaceCatalog())) - if isinstance(synchronizer, bool): - if synchronizer: - sync_path = tempfile.mkdtemp() - self.__synchronizer = zarr.ProcessSynchronizer(sync_path) - else: - self.__synchronizer = None - else: - self.__synchronizer = synchronizer - self.__mode = mode - self.__path = path - self.__file = None - self.__storage_options = storage_options - self.__built = dict() - self._written_builders = WriteStatusTracker() # track which builders were written (or read) by this IO object - self.__dci_queue = None # Will be initialized on call to io.write - # Codec class to be used. 
Alternates, e.g., =numcodecs.JSON - self.__codec_cls = numcodecs.pickles.Pickle if object_codec_class is None else object_codec_class - source_path = self.__path - if isinstance(self.__path, SUPPORTED_ZARR_STORES): - source_path = self.__path.path - super().__init__(manager, source=source_path) - - @property - def file(self): - """ - The Zarr zarr.hierarchy.Group (or zarr.core.Array) opened by the backend. - May be None in case open has not been called yet, e.g., if no data has been - read or written yet via this instance. - """ - return self.__file - - @property - def path(self): - """The path to the Zarr file as set by the user""" - return self.__path - - @property - def abspath(self): - """The absolute path to the Zarr file""" - return os.path.abspath(self.source) - - @property - def synchronizer(self): - return self.__synchronizer - - @property - def object_codec_class(self): - return self.__codec_cls - - def open(self): - """Open the Zarr file""" - if self.__file is None: - self.__file = self.__open_file_consolidated(store=self.path, - mode=self.__mode, - synchronizer=self.__synchronizer, - storage_options=self.__storage_options) - - def close(self): - """Close the Zarr file""" - self.__file = None - return - - def is_remote(self): - """Return True if the file is remote, False otherwise""" - from zarr.storage import FSStore - if isinstance(self.file.store, FSStore): - return True - else: - return False - - @classmethod - @docval({'name': 'namespace_catalog', - 'type': (NamespaceCatalog, TypeMap), - 'doc': 'the NamespaceCatalog or TypeMap to load namespaces into'}, - {'name': 'path', - 'type': (str, *SUPPORTED_ZARR_STORES), - 'doc': 'the path to the Zarr file or a supported Zarr store'}, - {'name': 'namespaces', 'type': list, 'doc': 'the namespaces to load', 'default': None}) - def load_namespaces(cls, namespace_catalog, path, namespaces=None): - ''' - Load cached namespaces from a file. - ''' - # TODO: how to use storage_options here? - # f = self.__open_file_consolidated(source=path, mode='r') - f = zarr.open(path, mode='r') - if SPEC_LOC_ATTR not in f.attrs: - msg = "No cached namespaces found in %s" % path - warnings.warn(msg) - else: - spec_group = f[f.attrs[SPEC_LOC_ATTR]] - if namespaces is None: - namespaces = list(spec_group.keys()) - for ns in namespaces: - ns_group = spec_group[ns] - latest_version = list(ns_group.keys())[-1] - ns_group = ns_group[latest_version] - reader = ZarrSpecReader(ns_group) - namespace_catalog.load_namespaces('namespace', reader=reader) - - @docval( - {'name': 'container', 'type': Container, 'doc': 'the Container object to write'}, - {'name': 'cache_spec', 'type': bool, 'doc': 'cache specification to file', 'default': True}, - {'name': 'link_data', 'type': bool, - 'doc': 'If not specified otherwise link (True) or copy (False) Datasets', 'default': True}, - {'name': 'exhaust_dci', 'type': bool, - 'doc': 'exhaust DataChunkIterators one at a time. If False, add ' + - 'them to the internal queue self.__dci_queue and exhaust them concurrently at the end', - 'default': True}, - { - "name": "number_of_jobs", - "type": int, - "doc": ( - "Number of jobs to use in parallel during write " - "(only works with GenericDataChunkIterator-wrapped datasets)." - ), - "default": 1, - }, - { - "name": "max_threads_per_process", - "type": int, - "doc": ( - "Limits the number of threads used by each process. The default is None (no limits)." - ), - "default": None, - }, - { - "name": "multiprocessing_context", - "type": str, - "doc": ( - "Context for multiprocessing. 
It can be None (default), 'fork' or 'spawn'. " - "Note that 'fork' is only available on UNIX systems (not Windows)." - ), - "default": None, - }, - ) - def write(self, **kwargs): - """Overwrite the write method to add support for caching the specification and parallelization.""" - cache_spec, number_of_jobs, max_threads_per_process, multiprocessing_context = popargs( - "cache_spec", "number_of_jobs", "max_threads_per_process", "multiprocessing_context", kwargs - ) - - self.__dci_queue = ZarrIODataChunkIteratorQueue( - number_of_jobs=number_of_jobs, - max_threads_per_process=max_threads_per_process, - multiprocessing_context=multiprocessing_context, - ) - - super(ZarrIO, self).write(**kwargs) - if cache_spec: - self.__cache_spec() - - def __cache_spec(self): - """Interanl function used to cache the spec in the current file""" - ref = self.__file.attrs.get(SPEC_LOC_ATTR) - spec_group = None - if ref is not None: - spec_group = self.__file[ref] - else: - path = DEFAULT_SPEC_LOC_DIR # do something to figure out where the specifications should go - spec_group = self.__file.require_group(path) - self.__file.attrs[SPEC_LOC_ATTR] = path - ns_catalog = self.manager.namespace_catalog - for ns_name in ns_catalog.namespaces: - ns_builder = NamespaceToBuilderHelper.convert_namespace(ns_catalog, ns_name) - namespace = ns_catalog.get_namespace(ns_name) - if namespace.version is None: - group_name = '%s/unversioned' % ns_name - else: - group_name = '%s/%s' % (ns_name, namespace.version) - ns_group = spec_group.require_group(group_name) - writer = ZarrSpecWriter(ns_group) - ns_builder.export('namespace', writer=writer) - - @docval( - *get_docval(HDMFIO.export), - {'name': 'cache_spec', 'type': bool, 'doc': 'whether to cache the specification to file', 'default': True}, - { - "name": "number_of_jobs", - "type": int, - "doc": ( - "Number of jobs to use in parallel during write " - "(only works with GenericDataChunkIterator-wrapped datasets)." - ), - "default": 1, - }, - { - "name": "max_threads_per_process", - "type": int, - "doc": ( - "Limits the number of threads used by each process. The default is None (no limits)." - ), - "default": None, - }, - { - "name": "multiprocessing_context", - "type": str, - "doc": ( - "Context for multiprocessing. It can be None (default), 'fork' or 'spawn'. " - "Note that 'fork' is only available on UNIX systems (not Windows)." - ), - "default": None, - }, - ) - def export(self, **kwargs): - """Export data read from a file from any backend to Zarr. - See :py:meth:`hdmf.backends.io.HDMFIO.export` for more details. - """ - if self.__mode != 'w': - raise UnsupportedOperation("Cannot export to file %s in mode '%s'. Please use mode 'w'." - % (self.source, self.__mode)) - - src_io = getargs('src_io', kwargs) - write_args, cache_spec = popargs('write_args', 'cache_spec', kwargs) - number_of_jobs, max_threads_per_process, multiprocessing_context = popargs( - "number_of_jobs", "max_threads_per_process", "multiprocessing_context", kwargs - ) - - self.__dci_queue = ZarrIODataChunkIteratorQueue( - number_of_jobs=number_of_jobs, - max_threads_per_process=max_threads_per_process, - multiprocessing_context=multiprocessing_context, - ) - - if not isinstance(src_io, ZarrIO) and write_args.get('link_data', True): - raise UnsupportedOperation("Cannot export from non-Zarr backend %s to Zarr with write argument " - "link_data=True." 
% src_io.__class__.__name__) - - write_args['export_source'] = src_io.source # pass export_source=src_io.source to write_builder - ckwargs = kwargs.copy() - ckwargs['write_args'] = write_args - super().export(**ckwargs) - if cache_spec: - self.__cache_spec() - - def get_written(self, builder, check_on_disk=False): - """ - Return True if this builder has been written to (or read from) disk by this IO object, False otherwise. - - :param builder: Builder object to get the written flag for - :type builder: Builder - :param check_on_disk: Check that the builder has been physically written to disk not just flagged as written - by this I/O backend - :type check_on_disk: bool - :return: True if the builder is found in self._written_builders using the builder ID, False otherwise. If - check_on_disk is enabled then the function cals get_builder_exists_on_disk in addtion to verify - that the builder has indeed been written to disk. - """ - written = self._written_builders.get_written(builder) - if written and check_on_disk: - written = written and self.get_builder_exists_on_disk(builder=builder) - return written - - @docval({'name': 'builder', 'type': Builder, 'doc': 'The builder of interest'}) - def get_builder_exists_on_disk(self, **kwargs): - """ - Convenience function to check whether a given builder exists on disk in this Zarr file. - """ - builder = getargs('builder', kwargs) - builder_path = self.get_builder_disk_path(builder=builder, filepath=None) - exists_on_disk = os.path.exists(builder_path) - return exists_on_disk - - @docval({'name': 'builder', 'type': Builder, 'doc': 'The builder of interest'}, - {'name': 'filepath', 'type': str, - 'doc': 'The path to the Zarr file or None for this file', 'default': None}) - def get_builder_disk_path(self, **kwargs): - builder, filepath = getargs('builder', 'filepath', kwargs) - basepath = filepath if filepath is not None else self.source - builder_path = os.path.join(basepath, self.__get_path(builder).lstrip("/")) - return builder_path - - @docval( - {'name': 'builder', 'type': GroupBuilder, 'doc': 'the GroupBuilder object representing the NWBFile'}, - { - 'name': 'link_data', - 'type': bool, - 'doc': 'If not specified otherwise link (True) or copy (False) Zarr Datasets', - 'default': True - }, - { - 'name': 'exhaust_dci', - 'type': bool, - 'doc': ( - 'Exhaust DataChunkIterators one at a time. 
If False, add ' - 'them to the internal queue self.__dci_queue and exhaust them concurrently at the end' - ), - 'default': True, - }, - { - 'name': 'export_source', - 'type': str, - 'doc': 'The source of the builders when exporting', - 'default': None, - }, - ) - def write_builder(self, **kwargs): - """Write a builder to disk.""" - f_builder, link_data, exhaust_dci, export_source = getargs( - 'builder', 'link_data', 'exhaust_dci', 'export_source', kwargs - ) - for name, gbldr in f_builder.groups.items(): - self.write_group( - parent=self.__file, - builder=gbldr, - link_data=link_data, - exhaust_dci=exhaust_dci, - export_source=export_source, - ) - for name, dbldr in f_builder.datasets.items(): - self.write_dataset( - parent=self.__file, - builder=dbldr, - link_data=link_data, - exhaust_dci=exhaust_dci, - export_source=export_source, - ) - self.write_attributes(self.__file, f_builder.attributes) # the same as set_attributes in HDMF - self.__dci_queue.exhaust_queue() # Write any remaining DataChunkIterators that have been queued - self._written_builders.set_written(f_builder) - self.logger.debug("Done writing %s '%s' to path '%s'" % - (f_builder.__class__.__qualname__, f_builder.name, self.source)) - - # Consolidate metadata for the entire file after everything has been written - zarr.consolidate_metadata(self.__file, metadata_key='.zmetadata') - - def consolidate_metadata(self): - """ - When a file is written, the metadata within the file is consolidated automatically. - If there are any metadata changes, the user needs to consolidate the metadata again - with this method in order for the metadata to be read correctly. - - Consolidate all metadata for groups and arrays within the given store into a - single resource and put it under .zmetadata. - """ - zarr.consolidate_metadata(self.__path, metadata_key='.zmetadata') - - def __open_file_consolidated(self, - store, - mode, - synchronizer = None, - storage_options = None): - """ - This method will check to see if the metadata has been consolidated, if so - """ - - if os.path.isfile(self.__path+'/.zmetadata'): - zarr.open_consolidated(store=store, - mode=mode, - synchronizer=synchronizer, - storage_options=storage_options) - else: - msg = "Could not find consolidated metadata." - warnings.warn(msg) - - zarr.open(store=store, - mode=mode, - synchronizer=synchronizer, - storage_options=storage_options) - - @docval({'name': 'parent', 'type': Group, 'doc': 'the parent Zarr object'}, - {'name': 'builder', 'type': GroupBuilder, 'doc': 'the GroupBuilder to write'}, - {'name': 'link_data', 'type': bool, - 'doc': 'If not specified otherwise link (True) or copy (False) Zarr Datasets', 'default': True}, - {'name': 'exhaust_dci', 'type': bool, - 'doc': 'exhaust DataChunkIterators one at a time. 
If False, add ' + - 'them to the internal queue self.__dci_queue and exhaust them concurrently at the end', - 'default': True}, - {'name': 'export_source', 'type': str, - 'doc': 'The source of the builders when exporting', 'default': None}, - returns='the Group that was created', rtype='Group') - def write_group(self, **kwargs): - """Write a GroupBuider to file""" - parent, builder, link_data, exhaust_dci, export_source = getargs( - 'parent', 'builder', 'link_data', 'exhaust_dci', 'export_source', kwargs - ) - - if self.get_written(builder): - group = parent[builder.name] - else: - group = parent.require_group(builder.name) - - subgroups = builder.groups - if subgroups: - for subgroup_name, sub_builder in subgroups.items(): - self.write_group( - parent=group, - builder=sub_builder, - link_data=link_data, - exhaust_dci=exhaust_dci, - ) - - datasets = builder.datasets - if datasets: - for dset_name, sub_builder in datasets.items(): - self.write_dataset( - parent=group, - builder=sub_builder, - link_data=link_data, - exhaust_dci=exhaust_dci, - export_source=export_source, - ) - - # write all links (haven implemented) - links = builder.links - if links: - for link_name, sub_builder in links.items(): - self.write_link(group, sub_builder) - - attributes = builder.attributes - self.write_attributes(group, attributes) - self._written_builders.set_written(builder) # record that the builder has been written - return group - - @docval({'name': 'obj', 'type': (Group, Array), 'doc': 'the Zarr object to add attributes to'}, - {'name': 'attributes', - 'type': dict, - 'doc': 'a dict containing the attributes on the Group or Dataset, indexed by attribute name'}, - {'name': 'export_source', 'type': str, - 'doc': 'The source of the builders when exporting', 'default': None}) - def write_attributes(self, **kwargs): - """Set (i.e., write) the attributes on a given Zarr Group or Array.""" - obj, attributes, export_source = getargs('obj', 'attributes', 'export_source', kwargs) - - for key, value in attributes.items(): - # Case 1: list, set, tuple type attributes - if isinstance(value, (set, list, tuple)) or (isinstance(value, np.ndarray) and np.ndim(value) != 0): - # Convert to tuple for writing (e.g., numpy arrays are not JSON serializable) - if isinstance(value, np.ndarray): - tmp = tuple(value.tolist()) - else: - tmp = tuple(value) - # Attempt write of the attribute - try: - obj.attrs[key] = tmp - # Numpy scalars and bytes are not JSON serializable. 
Try to convert to a serializable type instead - except TypeError as e: - try: - tmp = tuple([i.item() - if (isinstance(i, np.generic) and not isinstance(i, np.bytes_)) - else i.decode("utf-8") - if isinstance(i, (bytes, np.bytes_)) - else i - for i in value]) - obj.attrs[key] = tmp - except: # noqa: E722 - raise TypeError(str(e) + " type=" + str(type(value)) + " data=" + str(value)) from e - # Case 2: References - elif isinstance(value, (Container, Builder, ReferenceBuilder)): - # TODO: Region References are not yet supported - # if isinstance(value, RegionBuilder): - # type_str = 'region' - # refs = self.__get_ref(value.builder) - if isinstance(value, (ReferenceBuilder, Container, Builder)): - type_str = 'object' - if isinstance(value, Builder): - refs = self.__get_ref(value, export_source) - else: - refs = self.__get_ref(value.builder, export_source) - tmp = {'zarr_dtype': type_str, 'value': refs} - obj.attrs[key] = tmp - # Case 3: Scalar attributes - else: - # Attempt to write the attribute - try: - obj.attrs[key] = value - # Numpy scalars and bytes are not JSON serializable. Try to convert to a serializable type instead - except TypeError as e: - try: - val = value.item if isinstance(value, np.ndarray) else value - val = value.item() \ - if (isinstance(value, np.generic) and not isinstance(value, np.bytes_)) \ - else val.decode("utf-8") \ - if isinstance(value, (bytes, np.bytes_)) \ - else val - obj.attrs[key] = val - except: # noqa: E722 - msg = str(e) + "key=" + key + " type=" + str(type(value)) + " data=" + str(value) - raise TypeError(msg) from e - - def __get_path(self, builder): - """Get the path to the builder. - If builder.location is set then it is used as the path, otherwise the function - determines the path by constructing it iteratively from the parents of the - builder. - """ - if builder.location is not None: - path = os.path.normpath(os.path.join(builder.location, builder.name)).replace("\\", "/") - else: - curr = builder - names = list() - while curr is not None and curr.name != ROOT_NAME: - names.append(curr.name) - curr = curr.parent - delim = "/" - path = "%s%s" % (delim, delim.join(reversed(names))) - return path - - @staticmethod - def get_zarr_paths(zarr_object): - """ - For a Zarr object find 1) the path to the main zarr file it is in and 2) the path to the object within the file - :param zarr_object: Object for which we are looking up the path - :type zarr_object: Zarr Group or Array - :return: Tuple of two string with: 1) path of the Zarr file and 2) full path within the zarr file to the object - """ - # In Zarr the path is a combination of the path of the store and the path of the object. So we first need to - # merge those two paths, then remove the path of the file, add the missing leading "/" and then compute the - # directory name to get the path of the parent - fullpath = os.path.normpath(os.path.join(zarr_object.store.path, zarr_object.path)).replace("\\", "/") - # To determine the filepath we now iterate over the path and check if the .zgroup object exists at - # a level, indicating that we are still within the Zarr file. 
The first level we hit where the parent - # directory does not have a .zgroup means we have found the main file - filepath = fullpath - while os.path.exists(os.path.join(os.path.dirname(filepath), ".zgroup")): - filepath = os.path.dirname(filepath) - # From the fullpath and filepath we can now compute the objectpath within the zarr file as the relative - # path from the filepath to the object - objectpath = "/" + os.path.relpath(fullpath, filepath) - # return the result - return filepath, objectpath - - @staticmethod - def get_zarr_parent_path(zarr_object): - """ - Get the location of the parent of a zarr_object within the file - :param zarr_object: Object for which we are looking up the path - :type zarr_object: Zarr Group or Array - :return: String with the path - """ - filepath, objectpath = ZarrIO.get_zarr_paths(zarr_object) - parentpath = os.path.dirname(objectpath) - return parentpath - - @staticmethod - def is_zarr_file(path): - """ - Check if the given path defines a Zarr file - :param path: Full path to main directory - :return: Bool - """ - if os.path.exists(path): - if os.path.isdir(path): - if os.path.exists(os.path.join(path, ".zgroup")): - return True - return False - - def __is_ref(self, dtype): - if isinstance(dtype, DtypeSpec): - return self.__is_ref(dtype.dtype) - elif isinstance(dtype, RefSpec): - return True - elif isinstance(dtype, np.dtype): - return False - else: - return dtype == DatasetBuilder.OBJECT_REF_TYPE or dtype == DatasetBuilder.REGION_REF_TYPE - - def resolve_ref(self, zarr_ref): - """ - Get the full path to the object linked to by the zarr reference - - The function only constructs the links to the targe object, but it does not check if the object exists - - :param zarr_ref: Dict with `source` and `path` keys or a `ZarrReference` object - :return: 1) name of the target object - 2) the target zarr object within the target file - """ - # Extract the path as defined in the zarr_ref object - if zarr_ref.get('source', None) is None: - source_file = str(zarr_ref['path']) - else: - source_file = str(zarr_ref['source']) - # Resolve the path relative to the current file - if not self.is_remote(): - source_file = os.path.abspath(os.path.join(self.source, source_file)) - else: - # get rid of extra "/" and "./" in the path root and source_file - root_path = str(self.path).rstrip("/") - source_path = str(source_file).lstrip(".") - source_file = root_path + source_path - - object_path = zarr_ref.get('path', None) - if object_path: - target_name = os.path.basename(object_path) - else: - target_name = ROOT_NAME - - target_zarr_obj = zarr.open(source_file, mode='r', storage_options=self.__storage_options) - # target_zarr_obj = self.__open_file_consolidated(store=source_file, mode='r', storage_options=self.__storage_options) - if object_path is not None: - try: - target_zarr_obj = target_zarr_obj[object_path] - except Exception: - raise ValueError("Found bad link to object %s in file %s" % (object_path, source_file)) - # Return the create path - return target_name, target_zarr_obj - - def __get_ref(self, ref_object, export_source=None): - """ - Create a ZarrReference object that points to the given container - - :param ref_object: the object to be referenced - :type ref_object: Builder, Container, ReferenceBuilder - :returns: ZarrReference object - """ - if isinstance(ref_object, RegionBuilder): # or region is not None: TODO: Add to support regions - raise NotImplementedError("Region references are currently not supported by ZarrIO") - if isinstance(ref_object, Builder): - 
if isinstance(ref_object, LinkBuilder): - builder = ref_object.target_builder - else: - builder = ref_object - elif isinstance(ref_object, ReferenceBuilder): - builder = ref_object.builder - else: - builder = self.manager.build(ref_object) - path = self.__get_path(builder) - # TODO Add to get region for region references. - # Also add {'name': 'region', 'type': (slice, list, tuple), - # 'doc': 'the region reference indexing object', 'default': None}, - # if isinstance(ref_object, RegionBuilder): - # region = ref_object.region - - # get the object id if available - object_id = builder.get('object_id', None) - - # determine the object_id of the source by following the parents of the builder until we find the root - # the root builder should be the same as the source file containing the reference - curr = builder - while curr is not None and curr.name != ROOT_NAME: - curr = curr.parent - if curr: - source_object_id = curr.get('object_id', None) - # We did not find ROOT_NAME as a parent. This should only happen if we have an invalid - # file as a source, e.g., if during testing we use an arbitrary builder. We check this - # anyways to avoid potential errors just in case - else: - source_object_id = None - warn_msg = "Could not determine source_object_id for builder with path: %s" % path - warnings.warn(warn_msg) - - # by checking os.isdir makes sure we have a valid link path to a dir for Zarr. For conversion - # between backends a user should always use export which takes care of creating a clean set of builders. - source = (builder.source - if (builder.source is not None and os.path.isdir(builder.source)) - else self.source) - - # Make the source relative to the current file - # TODO: This check assumes that all links are internal links on export. - # Need to deal with external links on export. - if export_source is not None: - # Make sure the source of the reference is now towards the new file - # and not the original source when exporting. - source = '.' 
- else: - source = os.path.relpath(os.path.abspath(source), start=self.abspath) - # Return the ZarrReference object - ref = ZarrReference( - source=source, - path=path, - object_id=object_id, - source_object_id=source_object_id) - return ref - - def __add_link__(self, parent, target_source, target_path, link_name): - """ - Add a link to the file - :param parent: The parent Zarr group containing the link - :type parent: zarr.hierarchy.Group - :param target_source: Source path within the Zarr file to the linked object - :type target_source: str - :param target_path: Path to the Zarr file containing the linked object - :param link_name: Name of the link - :type link_name: str - """ - if 'zarr_link' not in parent.attrs: - parent.attrs['zarr_link'] = [] - zarr_link = list(parent.attrs['zarr_link']) - zarr_link.append({'source': target_source, 'path': target_path, 'name': link_name}) - parent.attrs['zarr_link'] = zarr_link - - @docval({'name': 'parent', 'type': Group, 'doc': 'the parent Zarr object'}, - {'name': 'builder', 'type': LinkBuilder, 'doc': 'the LinkBuilder to write'}) - def write_link(self, **kwargs): - parent, builder = getargs('parent', 'builder', kwargs) - if self.get_written(builder): - self.logger.debug("Skipping LinkBuilder '%s' already written to parent group '%s'" - % (builder.name, parent.name)) - return - self.logger.debug("Writing LinkBuilder '%s' to parent group '%s'" % (builder.name, parent.name)) - name = builder.name - target_builder = builder.builder - # Get the reference - zarr_ref = self.__get_ref(target_builder) - # EXPORT WITH LINKS: Fix link source - # if the target and source are both the same, then we need to ALWAYS use ourselves as a source - # When exporting from one source to another, the LinkBuilders.source are not updated, i.e,. the - # builder.source and target_builder.source are not being updated and point to the old file, but - # for internal links (a.k.a, SoftLinks) they will be the same and our target will be part of - # our new file, so we can safely replace the source - if builder.source == target_builder.source: - zarr_ref.source = "." # Link should be relative to self - # EXPORT WITH LINKS: Make sure target is written. If is not then if the target points to a - # non-Zarr source, then we need to copy the data instead of writing a - # link to the data - # When exporting from a different backend, then we may encounter external links to - # other datasets, groups (or links) in another file. Since they are from another - # backend, we must ensure that those targets are copied as well, so we check here - # if our target_builder has been written and write it if it doesn't - # TODO: Review the logic for when we need to copy data and when to link it. We may need the export_source? 
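As an aside, the `zarr_link` bookkeeping used by `__add_link__` above is plain JSON stored in the group's attributes. A minimal sketch of what one entry in the parent's `.zattrs` could look like after a single call; the path and link name here are illustrative, not taken from a real file:

    # hypothetical result of __add_link__(parent, '.', '/group_a/dataset_b', 'my_link')
    parent.attrs['zarr_link'] = [
        {'source': '.', 'path': '/group_a/dataset_b', 'name': 'my_link'}
    ]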
- """ - skip_link = False - if not self.get_written(target_builder): - if not self.is_zarr_file(target_builder.source): - # We need to copy the target in place of the link so we need to - # change the name of target_builder to match the link instead - temp = copy(target_builder.name) - target_builder._Builder__name = name - # Skip writing the link since we copied the data into place - skip_link = True - if isinstance(target_builder, DatasetBuilder): - self.write_dataset(parent=parent, builder=target_builder) - elif isinstance(target_builder, GroupBuilder): - self.write_group(parent=parent, builder=target_builder) - elif isinstance(target_builder, LinkBuilder): - self.write_link(parent=parent, builder=target_builder) - target_builder._Builder__name = temp - # REGULAR LINK I/O: - # Write the actual link as we should in most cases. Skip it only if we copied the - # data from an external source in place instead - if not skip_link: - self.__add_link__(parent, zarr_ref.source, zarr_ref.path, name) - """ - self.__add_link__(parent, zarr_ref.source, zarr_ref.path, name) - self._written_builders.set_written(builder) # record that the builder has been written - - @classmethod - def __setup_chunked_dataset__(cls, parent, name, data, options=None): - """ - Setup a dataset for writing to one-chunk-at-a-time based on the given DataChunkIterator. This - is a helper function for write_dataset() - :param parent: The parent object to which the dataset should be added - :type parent: Zarr Group or File - :param name: The name of the dataset - :type name: str - :param data: The data to be written. - :type data: AbstractDataChunkIterator - :param options: Dict with options for creating a dataset. available options are 'dtype' and 'io_settings' - :type options: dict - """ - io_settings = {} - if options is not None: - if 'io_settings' in options: - io_settings = options.get('io_settings') - # Define the chunking options if the user has not set them explicitly. We need chunking for the iterative write. - if 'chunks' not in io_settings: - recommended_chunks = data.recommended_chunk_shape() - io_settings['chunks'] = True if recommended_chunks is None else recommended_chunks - # Define the shape of the data if not provided by the user - if 'shape' not in io_settings: - io_settings['shape'] = data.recommended_data_shape() - if 'dtype' not in io_settings: - if (options is not None) and ('dtype' in options): - io_settings['dtype'] = options['dtype'] - else: - io_settings['dtype'] = data.dtype - if isinstance(io_settings['dtype'], str): - # map to real dtype if we were given a string - io_settings['dtype'] = cls.__dtypes.get(io_settings['dtype']) - try: - dset = parent.create_dataset(name, **io_settings) - dset.attrs['zarr_dtype'] = np.dtype(io_settings['dtype']).str - except Exception as exc: - raise Exception("Could not create dataset %s in %s" % (name, parent.name)) from exc - return dset - - @docval({'name': 'parent', 'type': Group, 'doc': 'the parent Zarr object'}, # noqa: C901 - {'name': 'builder', 'type': DatasetBuilder, 'doc': 'the DatasetBuilder to write'}, - {'name': 'link_data', 'type': bool, - 'doc': 'If not specified otherwise link (True) or copy (False) Zarr Datasets', 'default': True}, - {'name': 'exhaust_dci', 'type': bool, - 'doc': 'exhaust DataChunkIterators one at a time. 
If False, add ' + - 'them to the internal queue self.__dci_queue and exhaust them concurrently at the end', - 'default': True}, - {'name': 'force_data', 'type': None, - 'doc': 'Used internally to force the data being used when we have to load the data', 'default': None}, - {'name': 'export_source', 'type': str, - 'doc': 'The source of the builders when exporting', 'default': None}, - returns='the Zarr array that was created', rtype=Array) - def write_dataset(self, **kwargs): # noqa: C901 - parent, builder, link_data, exhaust_dci, export_source = getargs( - 'parent', 'builder', 'link_data', 'exhaust_dci', 'export_source', kwargs - ) - - force_data = getargs('force_data', kwargs) - - if exhaust_dci and self.__dci_queue is None: - self.__dci_queue = ZarrIODataChunkIteratorQueue() - - if self.get_written(builder): - return None - name = builder.name - data = builder.data if force_data is None else force_data - options = dict() - if isinstance(data, ZarrDataIO): - options['io_settings'] = data.io_settings - link_data = data.link_data - data = data.data - else: - options['io_settings'] = {} - - attributes = builder.attributes - options['dtype'] = builder.dtype - - linked = False - - # Write a regular Zarr array - dset = None - if isinstance(data, Array): - # copy the dataset - if link_data: - self.__add_link__(parent, data.store.path, data.name, name) - linked = True - dset = None - else: - zarr.copy(data, parent, name=name) - dset = parent[name] - # When converting data between backends we may see an HDMFDataset, e.g., a H55ReferenceDataset, with references - elif isinstance(data, HDMFDataset): - # If we have a dataset of containers we need to make the references to the containers - if len(data) > 0 and isinstance(data[0], Container): - ref_data = [self.__get_ref(data[i], export_source=export_source) for i in range(len(data))] - shape = (len(data), ) - type_str = 'object' - dset = parent.require_dataset(name, - shape=shape, - dtype=object, - object_codec=self.__codec_cls(), - **options['io_settings']) - dset.attrs['zarr_dtype'] = type_str - dset[:] = ref_data - self._written_builders.set_written(builder) # record that the builder has been written - # If we have a regular dataset, then load the data and write the builder after load - else: - # TODO This code path is also exercised when data is a - # hdmf.backends.hdf5.h5_utils.BuilderH5ReferenceDataset (aka. 
ReferenceResolver) - # check that this is indeed the right thing to do here - - # We can/should not update the data in the builder itself so we load the data here and instead - # force write_dataset when we call it recursively to use the data we loaded, rather than the - # dataset that is set on the builder - dset = self.write_dataset(parent=parent, - builder=builder, - link_data=link_data, - force_data=data[:], - export_source=export_source) - self._written_builders.set_written(builder) # record that the builder has been written - # Write a compound dataset - elif isinstance(options['dtype'], list): - refs = list() - type_str = list() - for i, dts in enumerate(options['dtype']): - if self.__is_ref(dts['dtype']): - refs.append(i) - ref_tmp = self.__get_ref(data[0][i], export_source=export_source) - if isinstance(ref_tmp, ZarrReference): - dts_str = 'object' - else: - dts_str = 'region' - type_str.append({'name': dts['name'], 'dtype': dts_str}) - else: - i = list([dts, ]) - t = self.__resolve_dtype_helper__(i) - type_str.append(self.__serial_dtype__(t)[0]) - - if len(refs) > 0: - dset = parent.require_dataset(name, - shape=(len(data), ), - dtype=object, - object_codec=self.__codec_cls(), - **options['io_settings']) - self._written_builders.set_written(builder) # record that the builder has been written - dset.attrs['zarr_dtype'] = type_str - for j, item in enumerate(data): - new_item = list(item) - for i in refs: - new_item[i] = self.__get_ref(item[i], export_source=export_source) - dset[j] = new_item - else: - # write a compound datatype - dset = self.__list_fill__(parent, name, data, options) - # Write a dataset of references - elif self.__is_ref(options['dtype']): - # TODO Region references are not yet support, but here how the code should look - # if isinstance(data, RegionBuilder): - # shape = (1,) - # type_str = 'region' - # refs = self.__get_ref(data.builder, data.region) - if isinstance(data, ReferenceBuilder): - shape = (1,) - type_str = 'object' - refs = self.__get_ref(data.builder, export_source=export_source) - # TODO: Region References are not yet supported - # elif options['dtype'] == 'region': - # shape = (len(data), ) - # type_str = 'region' - # refs = [self.__get_ref(item.builder, item.region) for item in data] - else: - shape = (len(data), ) - type_str = 'object' - refs = [self.__get_ref(item, export_source=export_source) for item in data] - - dset = parent.require_dataset(name, - shape=shape, - dtype=object, - object_codec=self.__codec_cls(), - **options['io_settings']) - self._written_builders.set_written(builder) # record that the builder has been written - dset.attrs['zarr_dtype'] = type_str - if hasattr(refs, '__len__'): - dset[:] = refs - else: - dset[0] = refs - # write a 'regular' dataset without DatasetIO info - else: - if isinstance(data, (str, bytes)): - dset = self.__scalar_fill__(parent, name, data, options) - # Iterative write of a data chunk iterator - elif isinstance(data, AbstractDataChunkIterator): - dset = self.__setup_chunked_dataset__(parent, name, data, options) - self.__dci_queue.append(dataset=dset, data=data) - elif hasattr(data, '__len__'): - dset = self.__list_fill__(parent, name, data, options) - else: - dset = self.__scalar_fill__(parent, name, data, options) - if not linked: - self.write_attributes(dset, attributes) - # record that the builder has been written - self._written_builders.set_written(builder) - # Exhaust the DataChunkIterator if the dataset was given this way. 
Note this is a no-op - # if the self.__dci_queue is empty - if exhaust_dci: - self.__dci_queue.exhaust_queue() - return dset - - __dtypes = { - "float": np.float32, - "float32": np.float32, - "double": np.float64, - "float64": np.float64, - "long": np.int64, - "int64": np.int64, - "uint64": np.uint64, - "int": np.int32, - "int32": np.int32, - "int16": np.int16, - "int8": np.int8, - "bool": np.bool_, - "bool_": np.bool_, - "text": str, - "utf": str, - "utf8": str, - "utf-8": str, - "ascii": bytes, - "str": str, - "isodatetime": str, - "string_": bytes, - "uint32": np.uint32, - "uint16": np.uint16, - "uint8": np.uint8, - "ref": ZarrReference, - "reference": ZarrReference, - "object": ZarrReference, - "region": ZarrReference, - } - - @classmethod - def __serial_dtype__(cls, dtype): - if isinstance(dtype, type): - return dtype.__name__ - elif isinstance(dtype, np.dtype): - if dtype.names is None: - return dtype.type.__name__ - else: - ret = list() - for n in dtype.names: - item = dict() - item['name'] = n - item['dtype'] = cls.__serial_dtype__(dtype[n]) - ret.append(item) - return ret - # TODO Does not work when Reference in compound datatype - elif dtype == ZarrReference: - return 'object' - - @classmethod - def __resolve_dtype__(cls, dtype, data): - dtype = cls.__resolve_dtype_helper__(dtype) - if dtype is None: - dtype = cls.get_type(data) - return dtype - - @classmethod - def __resolve_dtype_helper__(cls, dtype): - if dtype is None: - return None - elif isinstance(dtype, (type, np.dtype)): - return dtype - elif isinstance(dtype, str): - return cls.__dtypes.get(dtype) - elif isinstance(dtype, dict): - return cls.__dtypes.get(dtype['reftype']) - else: - return np.dtype([(x['name'], cls.__resolve_dtype_helper__(x['dtype'])) for x in dtype]) - - @classmethod - def get_type(cls, data): - if isinstance(data, str): - return str - elif not hasattr(data, '__len__'): - return type(data) - else: - if len(data) == 0: - raise ValueError('cannot determine type for empty data') - return cls.get_type(data[0]) - - __reserve_attribute = ('zarr_dtype', 'zarr_link') - - def __list_fill__(self, parent, name, data, options=None): # noqa: C901 - dtype = None - io_settings = dict() - if options is not None: - dtype = options.get('dtype') - io_settings = options.get('io_settings') - if io_settings is None: - io_settings = dict() - # Determine the dtype - if not isinstance(dtype, type): - try: - dtype = self.__resolve_dtype__(dtype, data) - except Exception as exc: - msg = 'cannot add %s to %s - could not determine type' % (name, parent.name) # noqa: F821 - raise Exception(msg) from exc - - # Set the type_str - type_str = self.__serial_dtype__(dtype) - - # Determine the shape and update the dtype if necessary when dtype==object - if 'shape' in io_settings: # Use the shape set by the user - data_shape = io_settings.pop('shape') - # If we have a numeric numpy array then use its shape - elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.number) or dtype == np.bool_: - data_shape = get_data_shape(data) - # Deal with object dtype - elif isinstance(dtype, np.dtype): - data = data[:] # load the data in case we come from HDF5 or another on-disk data source we don't know - data_shape = (len(data), ) - # if we have a compound data type - if dtype.names: - data_shape = get_data_shape(data) - # If strings are part of our compound type then we need to use Object type instead - # otherwise we try to keep the native compound datatype that numpy is using - for substype in dtype.fields.items(): - if 
np.issubdtype(substype[1][0], np.flexible) or np.issubdtype(substype[1][0], np.object_): - dtype = object - io_settings['object_codec'] = self.__codec_cls() - break - # sometimes bytes and strings can hide as object in numpy array so lets try - # to write those as strings and bytes rather than as objects - elif len(data) > 0 and isinstance(data, np.ndarray): - if isinstance(data.item(0), bytes): - dtype = bytes - data_shape = get_data_shape(data) - elif isinstance(data.item(0), str): - dtype = str - data_shape = get_data_shape(data) - # Set encoding for objects - else: - dtype = object - io_settings['object_codec'] = self.__codec_cls() - # Determine the shape from the data if all other cases have not been hit - else: - data_shape = get_data_shape(data) - - # Create the dataset - dset = parent.require_dataset(name, shape=data_shape, dtype=dtype, **io_settings) - dset.attrs['zarr_dtype'] = type_str - - # Write the data to file - if dtype == object: - for c in np.ndindex(data_shape): - o = data - for i in c: - o = o[i] - # bytes are not JSON serializable - dset[c] = o if not isinstance(o, (bytes, np.bytes_)) else o.decode("utf-8") - return dset - # standard write - else: - try: - dset[:] = data # If data is an h5py.Dataset then this will copy the data - # For compound data types containing strings Zarr sometimes does not like wirting multiple values - # try to write them one-at-a-time instead then - except ValueError: - for i in range(len(data)): - dset[i] = data[i] - return dset - - def __scalar_fill__(self, parent, name, data, options=None): - dtype = None - io_settings = dict() - if options is not None: - dtype = options.get('dtype') - io_settings = options.get('io_settings') - if io_settings is None: - io_settings = dict() - if not isinstance(dtype, type): - try: - dtype = self.__resolve_dtype__(dtype, data) - except Exception as exc: - msg = 'cannot add %s to %s - could not determine type' % (name, parent.name) - raise Exception(msg) from exc - if dtype == object: - io_settings['object_codec'] = self.__codec_cls() - - dset = parent.require_dataset(name, shape=(1, ), dtype=dtype, **io_settings) - dset[:] = data - type_str = 'scalar' - dset.attrs['zarr_dtype'] = type_str - return dset - - @docval(returns='a GroupBuilder representing the NWB Dataset', rtype='GroupBuilder') - def read_builder(self): - f_builder = self.__read_group(self.__file, ROOT_NAME) - return f_builder - - def __set_built(self, zarr_obj, builder): - fpath = zarr_obj.store.path - path = zarr_obj.path - path = os.path.join(fpath, path) - self.__built.setdefault(path, builder) - - @docval({'name': 'zarr_obj', 'type': (Array, Group), - 'doc': 'the Zarr object to the corresponding Container/Data object for'}) - def get_container(self, **kwargs): - """ - Get the container for the corresponding Zarr Group or Dataset - - :raises ValueError: When no builder has been constructed yet for the given h5py object - """ - zarr_obj = getargs('zarr_obj', kwargs) - builder = self.get_builder(zarr_obj) - container = self.manager.construct(builder) - return container # TODO: This method should be moved to HDMFIO - - @docval({'name': 'zarr_obj', 'type': (Array, Group), - 'doc': 'the Zarr object to the corresponding Builder object for'}) - def get_builder(self, **kwargs): # TODO: move this to HDMFIO (define skeleton in there at least) - """ - Get the builder for the corresponding Group or Dataset - - :raises ValueError: When no builder has been constructed - """ - zarr_obj = kwargs['zarr_obj'] - builder = self.__get_built(zarr_obj) - if 
builder is None: - msg = '%s has not been built' % (zarr_obj.name) - raise ValueError(msg) - return builder - - def __get_built(self, zarr_obj): - """ - Look up a builder for the given zarr object - :param zarr_obj: The Zarr object to be built - :type zarr_obj: Zarr Group or Dataset - :return: Builder in the self.__built cache or None - """ - fpath = zarr_obj.store.path - path = zarr_obj.path - path = os.path.join(fpath, path) - return self.__built.get(path, None) - - def __read_group(self, zarr_obj, name=None): - ret = self.__get_built(zarr_obj) - if ret is not None: - return ret - - if name is None: - name = str(os.path.basename(zarr_obj.name)) - - # Create the GroupBuilder - attributes = self.__read_attrs(zarr_obj) - ret = GroupBuilder(name=name, source=self.source, attributes=attributes) - ret.location = self.get_zarr_parent_path(zarr_obj) - - # read sub groups - for sub_name, sub_group in zarr_obj.groups(): - sub_builder = self.__read_group(sub_group, sub_name) - ret.set_group(sub_builder) - - # read sub datasets - for sub_name, sub_array in zarr_obj.arrays(): - sub_builder = self.__read_dataset(sub_array, sub_name) - ret.set_dataset(sub_builder) - - # read the links - self.__read_links(zarr_obj=zarr_obj, parent=ret) - - self._written_builders.set_written(ret) # record that the builder has been written - self.__set_built(zarr_obj, ret) - return ret - - def __read_links(self, zarr_obj, parent): - """ - Read the links associated with a zarr group - :param zarr_obj: The Zarr group we should read links from - :type zarr_obj: zarr.hiearchy.Group - :param parent: GroupBuilder with which the links need to be associated - :type parent: GroupBuilder - """ - # read links - if 'zarr_link' in zarr_obj.attrs: - links = zarr_obj.attrs['zarr_link'] - for link in links: - link_name = link['name'] - target_name, target_zarr_obj = self.resolve_ref(link) - # NOTE: __read_group and __read_dataset return the cached builders if the target has already been built - if isinstance(target_zarr_obj, Group): - builder = self.__read_group(target_zarr_obj, target_name) - else: - builder = self.__read_dataset(target_zarr_obj, target_name) - link_builder = LinkBuilder(builder=builder, name=link_name, source=self.source) - link_builder.location = os.path.join(parent.location, parent.name) - self._written_builders.set_written(link_builder) # record that the builder has been written - parent.set_link(link_builder) - - def __read_dataset(self, zarr_obj, name): - ret = self.__get_built(zarr_obj) - if ret is not None: - return ret - - if 'zarr_dtype' in zarr_obj.attrs: - zarr_dtype = zarr_obj.attrs['zarr_dtype'] - elif hasattr(zarr_obj, 'dtype'): # Fallback for invalid files that are mssing zarr_type - zarr_dtype = zarr_obj.dtype - warnings.warn( - "Inferred dtype from zarr type. 
Dataset missing zarr_dtype: " + str(name) + " " + str(zarr_obj) - ) - else: - raise ValueError("Dataset missing zarr_dtype: " + str(name) + " " + str(zarr_obj)) - - kwargs = {"attributes": self.__read_attrs(zarr_obj), - "dtype": zarr_dtype, - "maxshape": zarr_obj.shape, - "chunks": not (zarr_obj.shape == zarr_obj.chunks), - "source": self.source} - dtype = kwargs['dtype'] - - # By default, use the zarr.core.Array as data for lazy data load - data = zarr_obj - - # Read scalar dataset - if dtype == 'scalar': - data = zarr_obj[0] - - if isinstance(dtype, list): - # Check compound dataset where one of the subsets contains references - has_reference = False - for i, dts in enumerate(dtype): - if dts['dtype'] in ['object', 'region']: # check items for object reference - has_reference = True - break - retrieved_dtypes = [dtype_dict['dtype'] for dtype_dict in dtype] - if has_reference: - # TODO: BuilderZarrTableDataset does not yet support region reference - data = BuilderZarrTableDataset(zarr_obj, self, retrieved_dtypes) - elif self.__is_ref(dtype): - # Array of references - if dtype == 'object': - data = BuilderZarrReferenceDataset(data, self) - # TODO: Resolution of Region reference not yet supported by BuilderZarrRegionDataset - # elif dtype == 'region': - # data = BuilderZarrRegionDataset(data, self) - - kwargs['data'] = data - if name is None: - name = str(os.path.basename(zarr_obj.name)) - ret = DatasetBuilder(name, **kwargs) # create builder object for dataset - ret.location = self.get_zarr_parent_path(zarr_obj) - self._written_builders.set_written(ret) # record that the builder has been written - self.__set_built(zarr_obj, ret) - return ret - - def __read_attrs(self, zarr_obj): - ret = dict() - for k in zarr_obj.attrs.keys(): - if k not in self.__reserve_attribute: - v = zarr_obj.attrs[k] - if isinstance(v, dict) and 'zarr_dtype' in v: - if v['zarr_dtype'] == 'object': - target_name, target_zarr_obj = self.resolve_ref(v['value']) - if isinstance(target_zarr_obj, zarr.hierarchy.Group): - ret[k] = self.__read_group(target_zarr_obj, target_name) - else: - ret[k] = self.__read_dataset(target_zarr_obj, target_name) - # TODO Need to implement region references for attributes - elif v['zarr_dtype'] == 'region': - raise NotImplementedError("Read of region references from attributes not implemented in ZarrIO") - else: - raise NotImplementedError("Unsupported zarr_dtype for attribute " + str(v)) - else: - ret[k] = v - return ret From 52938c5079baa1c5d1607633a13cb5edf3cd295d Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Thu, 30 Nov 2023 20:34:13 -0800 Subject: [PATCH 04/25] abstract --- src/hdmf_zarr/backend.py | 55 ++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 31 deletions(-) diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py index b3338f38..1fcf149b 100644 --- a/src/hdmf_zarr/backend.py +++ b/src/hdmf_zarr/backend.py @@ -157,11 +157,8 @@ def object_codec_class(self): def open(self): """Open the Zarr file""" if self.__file is None: - # self.__file = zarr.open(store=self.path, - # mode=self.__mode, - # synchronizer=self.__synchronizer, - # storage_options=self.__storage_options) - # # breakpoint() + # Within zarr, open_consolidated only allows the mode to be 'r' or 'r+'. + # As a result, when in 'w' mode, the file will not use consolidated metadata. 
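As an aside, the mode dispatch introduced by this hunk can be sketched in isolation with the public zarr v2 API; `open_store` is an illustrative helper name, not part of the patch:

    import zarr

    def open_store(path, mode):
        # zarr.open_consolidated only accepts the read modes 'r' and 'r+',
        # so any write mode has to fall back to a plain zarr.open
        if mode in ('r', 'r+'):
            return zarr.open_consolidated(store=path, mode=mode)
        return zarr.open(store=path, mode=mode)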
if self.__mode == 'w': self.__file = zarr.open(store=self.path, mode=self.__mode, @@ -439,7 +436,6 @@ def write_builder(self, **kwargs): (f_builder.__class__.__qualname__, f_builder.name, self.source)) # Consolidate metadata for the entire file after everything has been written - # breakpoint() zarr.consolidate_metadata(store=self.path) def consolidate_metadata(self, store): @@ -453,6 +449,19 @@ def consolidate_metadata(self, store): """ zarr.consolidate_metadata(store, metadata_key='.zmetadata') + def __get_store_path(self, zarr_object): + """ + Method to retrieve the path from the Zarr storage. + ConsolidatedMetadataStore wraps around other Zarr Store objects, requiring a check to + retrieve the path. + """ + if isinstance(zarr_object.store, zarr.storage.ConsolidatedMetadataStore): + fpath = zarr_object.store.store.path + else: + fpath = zarr_object.store.path + + return fpath + def __open_file_consolidated(self, store, mode, @@ -475,7 +484,8 @@ def __open_file_consolidated(self, warnings.warn(msg) return zarr.open(store=self.path, mode=self.__mode, - synchronizer=self.__synchronizer) + synchronizer=self.__synchronizer, + storage_options=self.__storage_options) @docval({'name': 'parent', 'type': Group, 'doc': 'the parent Zarr object'}, {'name': 'builder', 'type': GroupBuilder, 'doc': 'the GroupBuilder to write'}, @@ -615,8 +625,7 @@ def __get_path(self, builder): path = "%s%s" % (delim, delim.join(reversed(names))) return path - @staticmethod - def get_zarr_paths(zarr_object): + def get_zarr_paths(self, zarr_object): """ For a Zarr object find 1) the path to the main zarr file it is in and 2) the path to the object within the file :param zarr_object: Object for which we are looking up the path @@ -626,10 +635,7 @@ def get_zarr_paths(zarr_object): # In Zarr the path is a combination of the path of the store and the path of the object. So we first need to # merge those two paths, then remove the path of the file, add the missing leading "/" and then compute the # directory name to get the path of the parent - if isinstance(zarr_object.store, zarr.storage.ConsolidatedMetadataStore): - fpath = zarr_object.store.store.path - else: - fpath = zarr_object.store.path + fpath = self.__get_store_path(zarr_object) fullpath = os.path.normpath(os.path.join(fpath, zarr_object.path)).replace("\\", "/") # To determine the filepath we now iterate over the path and check if the .zgroup object exists at # a level, indicating that we are still within the Zarr file. 
The first level we hit where the parent
@@ -643,15 +649,14 @@ def get_zarr_paths(zarr_object):
         # return the result
         return filepath, objectpath
 
-    @staticmethod
-    def get_zarr_parent_path(zarr_object):
+    def get_zarr_parent_path(self, zarr_object):
         """
         Get the location of the parent of a zarr_object within the file
         :param zarr_object: Object for which we are looking up the path
         :type zarr_object: Zarr Group or Array
         :return: String with the path
         """
-        filepath, objectpath = ZarrIO.get_zarr_paths(zarr_object)
+        filepath, objectpath = self.get_zarr_paths(zarr_object)
         parentpath = os.path.dirname(objectpath)
         return parentpath
 
@@ -941,10 +946,7 @@ def write_dataset(self, **kwargs):  # noqa: C901
         if isinstance(data, Array):
             # copy the dataset
             if link_data:
-                if isinstance(data.store, zarr.storage.ConsolidatedMetadataStore):
-                    path = data.store.store.path
-                else:
-                    path = data.store.path
+                path = self.__get_store_path(data)
                 self.__add_link__(parent, path, data.name, name)
                 linked = True
                 dset = None
@@ -1262,10 +1264,7 @@ def read_builder(self):
 
     def __set_built(self, zarr_obj, builder):
         # fpath = zarr_obj.store.path
-        if isinstance(zarr_obj.store, zarr.storage.ConsolidatedMetadataStore):
-            fpath = zarr_obj.store.store.path
-        else:
-            fpath = zarr_obj.store.path
+        fpath = self.__get_store_path(zarr_obj)
         path = zarr_obj.path
         path = os.path.join(fpath, path)
         self.__built.setdefault(path, builder)
@@ -1306,18 +1305,12 @@ def __get_built(self, zarr_obj):
         :return: Builder in the self.__built cache or None
         """
-        if isinstance(zarr_obj.store, zarr.storage.ConsolidatedMetadataStore):
-            fpath = zarr_obj.store.store.path
-        else:
-            fpath = zarr_obj.store.path
-
-        # fpath = zarr_obj.store.path
+        fpath = self.__get_store_path(zarr_obj)
         path = zarr_obj.path
         path = os.path.join(fpath, path)
         return self.__built.get(path, None)
 
     def __read_group(self, zarr_obj, name=None):
-        # breakpoint()
         ret = self.__get_built(zarr_obj)
         if ret is not None:
             return ret

From 1391ebdabc19e2e22c43453d9edd4441ac16c792 Mon Sep 17 00:00:00 2001
From: mavaylon1
Date: Thu, 30 Nov 2023 20:52:57 -0800
Subject: [PATCH 05/25] checkpoint

---
 src/hdmf_zarr/backend.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py
index 1fcf149b..97207a87 100644
--- a/src/hdmf_zarr/backend.py
+++ b/src/hdmf_zarr/backend.py
@@ -468,13 +468,16 @@ def __open_file_consolidated(self,
                                  synchronizer = None,
                                  storage_options = None):
         """
-        This method will check to see if the metadata has been consolidated, if so
+        This method will check to see if the metadata has been consolidated.
+        If so, use open_consolidated.
         """
-        try:
-            temp = os.path.isfile(self.path+'/.zmetadata')
-        except TypeError:
-            temp = os.path.isfile(self.path.path+'/.zmetadata')
-        if temp:
+        # self.path can be either a string or one of the `SUPPORTED_ZARR_STORES`.
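As an aside, the str-or-store normalization used at this point can be sketched on its own; `store_path` is an illustrative helper name, and `.path` is the attribute that directory-like zarr stores such as `DirectoryStore` expose:

    def store_path(path_or_store):
        # a plain string is already a filesystem path; a store object
        # (e.g., DirectoryStore) carries its location in its .path attribute
        return path_or_store if isinstance(path_or_store, str) else path_or_store.path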
+ if isinstance(self.path, str): + path = self.path + else: + path = self.path.path + + if os.path.isfile(path+'/.zmetadata'): return zarr.open_consolidated(store=store, mode=mode, synchronizer=synchronizer, From 320546b99dcd915daea484469b71cd7a9735c81b Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Thu, 30 Nov 2023 21:27:59 -0800 Subject: [PATCH 06/25] open update --- src/hdmf_zarr/backend.py | 4 ++-- tests/unit/test_zarrio.py | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py index 97207a87..c91d8a21 100644 --- a/src/hdmf_zarr/backend.py +++ b/src/hdmf_zarr/backend.py @@ -159,7 +159,7 @@ def open(self): if self.__file is None: # Within zarr, open_consolidated only allows the mode to be 'r' or 'r+'. # As a result, when in 'w' mode, the file will not use consolidated metadata. - if self.__mode == 'w': + if self.__mode not in ['r', 'r+']: self.__file = zarr.open(store=self.path, mode=self.__mode, synchronizer=self.__synchronizer, @@ -716,7 +716,7 @@ def resolve_ref(self, zarr_ref): else: target_name = ROOT_NAME - target_zarr_obj = zarr.open(source_file, mode='r', storage_options=self.__storage_options) + target_zarr_obj = self.__open_file_consolidated(source_file, mode='r', storage_options=self.__storage_options) if object_path is not None: try: target_zarr_obj = target_zarr_obj[object_path] diff --git a/tests/unit/test_zarrio.py b/tests/unit/test_zarrio.py index e1526282..d3f46648 100644 --- a/tests/unit/test_zarrio.py +++ b/tests/unit/test_zarrio.py @@ -122,3 +122,20 @@ class TestExportZarrToZarrNestedDirectoryStore(BaseTestExportZarrToZarr): def setUp(self): super().setUp() self.store = [NestedDirectoryStore(p) for p in self.store_path] + + +######################################### +# Consolidate Metadata tests +######################################### +# class TestConsolidateMetadata(TestCase): +# """ +# +# """ +# def test_get_store_path(self): +# pass +# +# def test_warning_consolidate_metadata(self): +# pass +# +# def test_open_with_zmetadata(self): +# pass From ea4e24d2d3391328ac81a04a4581bed6c1da0f4d Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Thu, 30 Nov 2023 21:29:56 -0800 Subject: [PATCH 07/25] flake --- src/hdmf_zarr/backend.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py index c91d8a21..38998b6a 100644 --- a/src/hdmf_zarr/backend.py +++ b/src/hdmf_zarr/backend.py @@ -170,7 +170,6 @@ def open(self): synchronizer=self.__synchronizer, storage_options=self.__storage_options) - def close(self): """Close the Zarr file""" self.__file = None @@ -465,8 +464,8 @@ def __get_store_path(self, zarr_object): def __open_file_consolidated(self, store, mode, - synchronizer = None, - storage_options = None): + synchronizer=None, + storage_options=None): """ This method will check to see if the metadata has been consolidated. If so, use open_consolidated. @@ -479,9 +478,9 @@ def __open_file_consolidated(self, if os.path.isfile(path+'/.zmetadata'): return zarr.open_consolidated(store=store, - mode=mode, - synchronizer=synchronizer, - storage_options=storage_options) + mode=mode, + synchronizer=synchronizer, + storage_options=storage_options) else: msg = "Could not find consolidated metadata." 
warnings.warn(msg) From 9a8127d98605f31778da4d7ea93999c95c139708 Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Thu, 30 Nov 2023 22:28:28 -0800 Subject: [PATCH 08/25] static and some test --- src/hdmf_zarr/backend.py | 29 +++++++------ tests/unit/example.zarr/.zattrs | 12 ++++++ tests/unit/example.zarr/.zgroup | 3 ++ tests/unit/example.zarr/first_name/.zarray | 24 +++++++++++ tests/unit/example.zarr/first_name/.zattrs | 7 ++++ tests/unit/example.zarr/first_name/0 | Bin 0 -> 37 bytes tests/unit/example.zarr/id/.zarray | 20 +++++++++ tests/unit/example.zarr/id/.zattrs | 6 +++ tests/unit/example.zarr/id/0 | Bin 0 -> 32 bytes tests/unit/example.zarr/last_name/.zarray | 24 +++++++++++ tests/unit/example.zarr/last_name/.zattrs | 7 ++++ tests/unit/example.zarr/last_name/0 | Bin 0 -> 40 bytes tests/unit/example.zarr/phone_number/.zarray | 24 +++++++++++ tests/unit/example.zarr/phone_number/.zattrs | 7 ++++ tests/unit/example.zarr/phone_number/0 | Bin 0 -> 68 bytes .../example.zarr/phone_number_index/.zarray | 20 +++++++++ .../example.zarr/phone_number_index/.zattrs | 16 ++++++++ tests/unit/example.zarr/phone_number_index/0 | Bin 0 -> 18 bytes .../unit/example.zarr/specifications/.zgroup | 3 ++ .../specifications/hdmf-common/.zgroup | 3 ++ .../specifications/hdmf-common/1.8.0/.zgroup | 3 ++ .../hdmf-common/1.8.0/base/.zarray | 30 ++++++++++++++ .../hdmf-common/1.8.0/base/.zattrs | 3 ++ .../specifications/hdmf-common/1.8.0/base/0 | 1 + .../hdmf-common/1.8.0/namespace/.zarray | 30 ++++++++++++++ .../hdmf-common/1.8.0/namespace/.zattrs | 3 ++ .../hdmf-common/1.8.0/namespace/0 | 1 + .../hdmf-common/1.8.0/sparse/.zarray | 30 ++++++++++++++ .../hdmf-common/1.8.0/sparse/.zattrs | 3 ++ .../specifications/hdmf-common/1.8.0/sparse/0 | 1 + .../hdmf-common/1.8.0/table/.zarray | 30 ++++++++++++++ .../hdmf-common/1.8.0/table/.zattrs | 3 ++ .../specifications/hdmf-common/1.8.0/table/0 | 1 + .../specifications/hdmf-experimental/.zgroup | 3 ++ .../hdmf-experimental/0.5.0/.zgroup | 3 ++ .../0.5.0/experimental/.zarray | 30 ++++++++++++++ .../0.5.0/experimental/.zattrs | 3 ++ .../hdmf-experimental/0.5.0/experimental/0 | 1 + .../hdmf-experimental/0.5.0/namespace/.zarray | 30 ++++++++++++++ .../hdmf-experimental/0.5.0/namespace/.zattrs | 3 ++ .../hdmf-experimental/0.5.0/namespace/0 | 1 + .../hdmf-experimental/0.5.0/resources/.zarray | 30 ++++++++++++++ .../hdmf-experimental/0.5.0/resources/.zattrs | 3 ++ .../hdmf-experimental/0.5.0/resources/0 | 1 + tests/unit/test_zarrio.py | 38 ++++++++++++------ 45 files changed, 465 insertions(+), 25 deletions(-) create mode 100644 tests/unit/example.zarr/.zattrs create mode 100644 tests/unit/example.zarr/.zgroup create mode 100644 tests/unit/example.zarr/first_name/.zarray create mode 100644 tests/unit/example.zarr/first_name/.zattrs create mode 100644 tests/unit/example.zarr/first_name/0 create mode 100644 tests/unit/example.zarr/id/.zarray create mode 100644 tests/unit/example.zarr/id/.zattrs create mode 100644 tests/unit/example.zarr/id/0 create mode 100644 tests/unit/example.zarr/last_name/.zarray create mode 100644 tests/unit/example.zarr/last_name/.zattrs create mode 100644 tests/unit/example.zarr/last_name/0 create mode 100644 tests/unit/example.zarr/phone_number/.zarray create mode 100644 tests/unit/example.zarr/phone_number/.zattrs create mode 100644 tests/unit/example.zarr/phone_number/0 create mode 100644 tests/unit/example.zarr/phone_number_index/.zarray create mode 100644 tests/unit/example.zarr/phone_number_index/.zattrs create mode 100644 
tests/unit/example.zarr/phone_number_index/0 create mode 100644 tests/unit/example.zarr/specifications/.zgroup create mode 100644 tests/unit/example.zarr/specifications/hdmf-common/.zgroup create mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/.zgroup create mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/base/.zarray create mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/base/.zattrs create mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/base/0 create mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/namespace/.zarray create mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/namespace/.zattrs create mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/namespace/0 create mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/sparse/.zarray create mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/sparse/.zattrs create mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/sparse/0 create mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/table/.zarray create mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/table/.zattrs create mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/table/0 create mode 100644 tests/unit/example.zarr/specifications/hdmf-experimental/.zgroup create mode 100644 tests/unit/example.zarr/specifications/hdmf-experimental/0.5.0/.zgroup create mode 100644 tests/unit/example.zarr/specifications/hdmf-experimental/0.5.0/experimental/.zarray create mode 100644 tests/unit/example.zarr/specifications/hdmf-experimental/0.5.0/experimental/.zattrs create mode 100644 tests/unit/example.zarr/specifications/hdmf-experimental/0.5.0/experimental/0 create mode 100644 tests/unit/example.zarr/specifications/hdmf-experimental/0.5.0/namespace/.zarray create mode 100644 tests/unit/example.zarr/specifications/hdmf-experimental/0.5.0/namespace/.zattrs create mode 100644 tests/unit/example.zarr/specifications/hdmf-experimental/0.5.0/namespace/0 create mode 100644 tests/unit/example.zarr/specifications/hdmf-experimental/0.5.0/resources/.zarray create mode 100644 tests/unit/example.zarr/specifications/hdmf-experimental/0.5.0/resources/.zattrs create mode 100644 tests/unit/example.zarr/specifications/hdmf-experimental/0.5.0/resources/0 diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py index 38998b6a..244b6560 100644 --- a/src/hdmf_zarr/backend.py +++ b/src/hdmf_zarr/backend.py @@ -448,16 +448,17 @@ def consolidate_metadata(self, store): """ zarr.consolidate_metadata(store, metadata_key='.zmetadata') - def __get_store_path(self, zarr_object): + @staticmethod + def __get_store_path(store): """ Method to retrieve the path from the Zarr storage. ConsolidatedMetadataStore wraps around other Zarr Store objects, requiring a check to retrieve the path. 
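         For example, a file opened with consolidated metadata yields a
         ConsolidatedMetadataStore wrapping another store (e.g., a DirectoryStore),
         so the on-disk path must be read from ``store.store.path`` rather than ``store.path``.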
""" - if isinstance(zarr_object.store, zarr.storage.ConsolidatedMetadataStore): - fpath = zarr_object.store.store.path + if isinstance(store, zarr.storage.ConsolidatedMetadataStore): + fpath = store.store.path else: - fpath = zarr_object.store.path + fpath = store.path return fpath @@ -627,7 +628,8 @@ def __get_path(self, builder): path = "%s%s" % (delim, delim.join(reversed(names))) return path - def get_zarr_paths(self, zarr_object): + @staticmethod + def get_zarr_paths(zarr_object): """ For a Zarr object find 1) the path to the main zarr file it is in and 2) the path to the object within the file :param zarr_object: Object for which we are looking up the path @@ -637,7 +639,7 @@ def get_zarr_paths(self, zarr_object): # In Zarr the path is a combination of the path of the store and the path of the object. So we first need to # merge those two paths, then remove the path of the file, add the missing leading "/" and then compute the # directory name to get the path of the parent - fpath = self.__get_store_path(zarr_object) + fpath = ZarrIO._ZarrIO__get_store_path(zarr_object.store) fullpath = os.path.normpath(os.path.join(fpath, zarr_object.path)).replace("\\", "/") # To determine the filepath we now iterate over the path and check if the .zgroup object exists at # a level, indicating that we are still within the Zarr file. The first level we hit where the parent @@ -651,14 +653,15 @@ def get_zarr_paths(self, zarr_object): # return the result return filepath, objectpath - def get_zarr_parent_path(self, zarr_object): + @staticmethod + def get_zarr_parent_path(zarr_object): """ Get the location of the parent of a zarr_object within the file :param zarr_object: Object for which we are looking up the path :type zarr_object: Zarr Group or Array :return: String with the path """ - filepath, objectpath = self.get_zarr_paths(zarr_object) + filepath, objectpath = ZarrIO.get_zarr_paths(zarr_object) parentpath = os.path.dirname(objectpath) return parentpath @@ -948,7 +951,7 @@ def write_dataset(self, **kwargs): # noqa: C901 if isinstance(data, Array): # copy the dataset if link_data: - path = self.__get_store_path(data) + path = self.__get_store_path(data.store) self.__add_link__(parent, path, data.name, name) linked = True dset = None @@ -1266,7 +1269,7 @@ def read_builder(self): def __set_built(self, zarr_obj, builder): # fpath = zarr_obj.store.path - fpath = self.__get_store_path(zarr_obj) + fpath = self.__get_store_path(zarr_obj.store) path = zarr_obj.path path = os.path.join(fpath, path) self.__built.setdefault(path, builder) @@ -1307,7 +1310,7 @@ def __get_built(self, zarr_obj): :return: Builder in the self.__built cache or None """ - fpath = self.__get_store_path(zarr_obj) + fpath = self.__get_store_path(zarr_obj.store) path = zarr_obj.path path = os.path.join(fpath, path) return self.__built.get(path, None) @@ -1323,7 +1326,7 @@ def __read_group(self, zarr_obj, name=None): # Create the GroupBuilder attributes = self.__read_attrs(zarr_obj) ret = GroupBuilder(name=name, source=self.source, attributes=attributes) - ret.location = self.get_zarr_parent_path(zarr_obj) + ret.location = ZarrIO.get_zarr_parent_path(zarr_obj) # read sub groups for sub_name, sub_group in zarr_obj.groups(): @@ -1418,7 +1421,7 @@ def __read_dataset(self, zarr_obj, name): if name is None: name = str(os.path.basename(zarr_obj.name)) ret = DatasetBuilder(name, **kwargs) # create builder object for dataset - ret.location = self.get_zarr_parent_path(zarr_obj) + ret.location = ZarrIO.get_zarr_parent_path(zarr_obj) 
self._written_builders.set_written(ret) # record that the builder has been written self.__set_built(zarr_obj, ret) return ret diff --git a/tests/unit/example.zarr/.zattrs b/tests/unit/example.zarr/.zattrs new file mode 100644 index 00000000..fda81f3a --- /dev/null +++ b/tests/unit/example.zarr/.zattrs @@ -0,0 +1,12 @@ +{ + ".specloc": "specifications", + "colnames": [ + "first_name", + "last_name", + "phone_number" + ], + "data_type": "DynamicTable", + "description": "a table containing data/metadata about users, one user per row", + "namespace": "hdmf-common", + "object_id": "ea83daef-37db-4b95-af84-6f5d840423f6" +} \ No newline at end of file diff --git a/tests/unit/example.zarr/.zgroup b/tests/unit/example.zarr/.zgroup new file mode 100644 index 00000000..3b7daf22 --- /dev/null +++ b/tests/unit/example.zarr/.zgroup @@ -0,0 +1,3 @@ +{ + "zarr_format": 2 +} \ No newline at end of file diff --git a/tests/unit/example.zarr/first_name/.zarray b/tests/unit/example.zarr/first_name/.zarray new file mode 100644 index 00000000..eca134e7 --- /dev/null +++ b/tests/unit/example.zarr/first_name/.zarray @@ -0,0 +1,24 @@ +{ + "chunks": [ + 2 + ], + "compressor": { + "blocksize": 0, + "clevel": 5, + "cname": "lz4", + "id": "blosc", + "shuffle": 1 + }, + "dtype": "|O", + "fill_value": 0, + "filters": [ + { + "id": "vlen-utf8" + } + ], + "order": "C", + "shape": [ + 2 + ], + "zarr_format": 2 +} \ No newline at end of file diff --git a/tests/unit/example.zarr/first_name/.zattrs b/tests/unit/example.zarr/first_name/.zattrs new file mode 100644 index 00000000..064fca30 --- /dev/null +++ b/tests/unit/example.zarr/first_name/.zattrs @@ -0,0 +1,7 @@ +{ + "data_type": "VectorData", + "description": "the first name of the user", + "namespace": "hdmf-common", + "object_id": "cdbf89b5-00bd-44e5-a350-2b4ba549dc3c", + "zarr_dtype": "str" +} \ No newline at end of file diff --git a/tests/unit/example.zarr/first_name/0 b/tests/unit/example.zarr/first_name/0 new file mode 100644 index 0000000000000000000000000000000000000000..b890f257c003da42b9b99cadd51ebe101d6056b4 GIT binary patch literal 37 jcmZQ#G-ecKU|;}YRUl>pVpbq_FG@^KWdSlAa}x6aDFFlU literal 0 HcmV?d00001 diff --git a/tests/unit/example.zarr/id/.zarray b/tests/unit/example.zarr/id/.zarray new file mode 100644 index 00000000..49bcc3cb --- /dev/null +++ b/tests/unit/example.zarr/id/.zarray @@ -0,0 +1,20 @@ +{ + "chunks": [ + 2 + ], + "compressor": { + "blocksize": 0, + "clevel": 5, + "cname": "lz4", + "id": "blosc", + "shuffle": 1 + }, + "dtype": " Date: Thu, 30 Nov 2023 22:56:43 -0800 Subject: [PATCH 09/25] unfinished tests --- tests/unit/example_consolidate.zarr/.zattrs | 12 + tests/unit/example_consolidate.zarr/.zgroup | 3 + .../unit/example_consolidate.zarr/.zmetadata | 421 ++++++++++++++++++ .../first_name/.zarray | 24 + .../first_name/.zattrs | 7 + .../example_consolidate.zarr/first_name/0 | Bin 0 -> 37 bytes .../unit/example_consolidate.zarr/id/.zarray | 20 + .../unit/example_consolidate.zarr/id/.zattrs | 6 + tests/unit/example_consolidate.zarr/id/0 | Bin 0 -> 32 bytes .../last_name/.zarray | 24 + .../last_name/.zattrs | 7 + .../unit/example_consolidate.zarr/last_name/0 | Bin 0 -> 40 bytes .../phone_number/.zarray | 24 + .../phone_number/.zattrs | 7 + .../example_consolidate.zarr/phone_number/0 | Bin 0 -> 68 bytes .../phone_number_index/.zarray | 20 + .../phone_number_index/.zattrs | 16 + .../phone_number_index/0 | Bin 0 -> 18 bytes .../specifications/.zgroup | 3 + .../specifications/hdmf-common/.zgroup | 3 + .../specifications/hdmf-common/1.8.0/.zgroup | 3 + 
.../hdmf-common/1.8.0/base/.zarray | 30 ++ .../hdmf-common/1.8.0/base/.zattrs | 3 + .../specifications/hdmf-common/1.8.0/base/0 | 1 + .../hdmf-common/1.8.0/namespace/.zarray | 30 ++ .../hdmf-common/1.8.0/namespace/.zattrs | 3 + .../hdmf-common/1.8.0/namespace/0 | 1 + .../hdmf-common/1.8.0/sparse/.zarray | 30 ++ .../hdmf-common/1.8.0/sparse/.zattrs | 3 + .../specifications/hdmf-common/1.8.0/sparse/0 | 1 + .../hdmf-common/1.8.0/table/.zarray | 30 ++ .../hdmf-common/1.8.0/table/.zattrs | 3 + .../specifications/hdmf-common/1.8.0/table/0 | 1 + .../specifications/hdmf-experimental/.zgroup | 3 + .../hdmf-experimental/0.5.0/.zgroup | 3 + .../0.5.0/experimental/.zarray | 30 ++ .../0.5.0/experimental/.zattrs | 3 + .../hdmf-experimental/0.5.0/experimental/0 | 1 + .../hdmf-experimental/0.5.0/namespace/.zarray | 30 ++ .../hdmf-experimental/0.5.0/namespace/.zattrs | 3 + .../hdmf-experimental/0.5.0/namespace/0 | 1 + .../hdmf-experimental/0.5.0/resources/.zarray | 30 ++ .../hdmf-experimental/0.5.0/resources/.zattrs | 3 + .../hdmf-experimental/0.5.0/resources/0 | 1 + tests/unit/test_zarrio.py | 10 +- 45 files changed, 848 insertions(+), 6 deletions(-) create mode 100644 tests/unit/example_consolidate.zarr/.zattrs create mode 100644 tests/unit/example_consolidate.zarr/.zgroup create mode 100644 tests/unit/example_consolidate.zarr/.zmetadata create mode 100644 tests/unit/example_consolidate.zarr/first_name/.zarray create mode 100644 tests/unit/example_consolidate.zarr/first_name/.zattrs create mode 100644 tests/unit/example_consolidate.zarr/first_name/0 create mode 100644 tests/unit/example_consolidate.zarr/id/.zarray create mode 100644 tests/unit/example_consolidate.zarr/id/.zattrs create mode 100644 tests/unit/example_consolidate.zarr/id/0 create mode 100644 tests/unit/example_consolidate.zarr/last_name/.zarray create mode 100644 tests/unit/example_consolidate.zarr/last_name/.zattrs create mode 100644 tests/unit/example_consolidate.zarr/last_name/0 create mode 100644 tests/unit/example_consolidate.zarr/phone_number/.zarray create mode 100644 tests/unit/example_consolidate.zarr/phone_number/.zattrs create mode 100644 tests/unit/example_consolidate.zarr/phone_number/0 create mode 100644 tests/unit/example_consolidate.zarr/phone_number_index/.zarray create mode 100644 tests/unit/example_consolidate.zarr/phone_number_index/.zattrs create mode 100644 tests/unit/example_consolidate.zarr/phone_number_index/0 create mode 100644 tests/unit/example_consolidate.zarr/specifications/.zgroup create mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/.zgroup create mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/.zgroup create mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/base/.zarray create mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/base/.zattrs create mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/base/0 create mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/namespace/.zarray create mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/namespace/.zattrs create mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/namespace/0 create mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/sparse/.zarray create mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/sparse/.zattrs create mode 100644 
tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/sparse/0 create mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/table/.zarray create mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/table/.zattrs create mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/table/0 create mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-experimental/.zgroup create mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-experimental/0.5.0/.zgroup create mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-experimental/0.5.0/experimental/.zarray create mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-experimental/0.5.0/experimental/.zattrs create mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-experimental/0.5.0/experimental/0 create mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-experimental/0.5.0/namespace/.zarray create mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-experimental/0.5.0/namespace/.zattrs create mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-experimental/0.5.0/namespace/0 create mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-experimental/0.5.0/resources/.zarray create mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-experimental/0.5.0/resources/.zattrs create mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-experimental/0.5.0/resources/0 diff --git a/tests/unit/example_consolidate.zarr/.zattrs b/tests/unit/example_consolidate.zarr/.zattrs new file mode 100644 index 00000000..fda81f3a --- /dev/null +++ b/tests/unit/example_consolidate.zarr/.zattrs @@ -0,0 +1,12 @@ +{ + ".specloc": "specifications", + "colnames": [ + "first_name", + "last_name", + "phone_number" + ], + "data_type": "DynamicTable", + "description": "a table containing data/metadata about users, one user per row", + "namespace": "hdmf-common", + "object_id": "ea83daef-37db-4b95-af84-6f5d840423f6" +} \ No newline at end of file diff --git a/tests/unit/example_consolidate.zarr/.zgroup b/tests/unit/example_consolidate.zarr/.zgroup new file mode 100644 index 00000000..3b7daf22 --- /dev/null +++ b/tests/unit/example_consolidate.zarr/.zgroup @@ -0,0 +1,3 @@ +{ + "zarr_format": 2 +} \ No newline at end of file diff --git a/tests/unit/example_consolidate.zarr/.zmetadata b/tests/unit/example_consolidate.zarr/.zmetadata new file mode 100644 index 00000000..9f6f8ad0 --- /dev/null +++ b/tests/unit/example_consolidate.zarr/.zmetadata @@ -0,0 +1,421 @@ +{ + "metadata": { + ".zattrs": { + ".specloc": "specifications", + "colnames": [ + "first_name", + "last_name", + "phone_number" + ], + "data_type": "DynamicTable", + "description": "a table containing data/metadata about users, one user per row", + "namespace": "hdmf-common", + "object_id": "ea83daef-37db-4b95-af84-6f5d840423f6" + }, + ".zgroup": { + "zarr_format": 2 + }, + "first_name/.zarray": { + "chunks": [ + 2 + ], + "compressor": { + "blocksize": 0, + "clevel": 5, + "cname": "lz4", + "id": "blosc", + "shuffle": 1 + }, + "dtype": "|O", + "fill_value": 0, + "filters": [ + { + "id": "vlen-utf8" + } + ], + "order": "C", + "shape": [ + 2 + ], + "zarr_format": 2 + }, + "first_name/.zattrs": { + "data_type": "VectorData", + "description": "the first name of the user", + "namespace": "hdmf-common", + "object_id": "cdbf89b5-00bd-44e5-a350-2b4ba549dc3c", + "zarr_dtype": "str" + }, + 
"id/.zarray": { + "chunks": [ + 2 + ], + "compressor": { + "blocksize": 0, + "clevel": 5, + "cname": "lz4", + "id": "blosc", + "shuffle": 1 + }, + "dtype": "pVpbq_FG@^KWdSlAa}x6aDFFlU literal 0 HcmV?d00001 diff --git a/tests/unit/example_consolidate.zarr/id/.zarray b/tests/unit/example_consolidate.zarr/id/.zarray new file mode 100644 index 00000000..49bcc3cb --- /dev/null +++ b/tests/unit/example_consolidate.zarr/id/.zarray @@ -0,0 +1,20 @@ +{ + "chunks": [ + 2 + ], + "compressor": { + "blocksize": 0, + "clevel": 5, + "cname": "lz4", + "id": "blosc", + "shuffle": 1 + }, + "dtype": " Date: Fri, 1 Dec 2023 10:41:25 -0800 Subject: [PATCH 10/25] working basic tests --- src/hdmf_zarr/backend.py | 22 +- tests/unit/example_consolidate.zarr/.zattrs | 12 - .../unit/example_consolidate.zarr/.zmetadata | 421 ------------------ .../first_name/.zattrs | 7 - .../example_consolidate.zarr/first_name/0 | Bin 37 -> 0 bytes .../last_name/.zattrs | 7 - .../unit/example_consolidate.zarr/last_name/0 | Bin 40 -> 0 bytes .../phone_number/.zattrs | 7 - .../example_consolidate.zarr/phone_number/0 | Bin 68 -> 0 bytes .../phone_number_index/.zattrs | 16 - .../phone_number_index/0 | Bin 18 -> 0 bytes .../specifications/hdmf-common/.zgroup | 3 - .../specifications/hdmf-common/1.8.0/.zgroup | 3 - .../hdmf-common/1.8.0/base/.zarray | 30 -- .../hdmf-common/1.8.0/base/.zattrs | 3 - .../specifications/hdmf-common/1.8.0/base/0 | 1 - .../hdmf-common/1.8.0/namespace/.zarray | 30 -- .../hdmf-common/1.8.0/namespace/.zattrs | 3 - .../hdmf-common/1.8.0/namespace/0 | 1 - .../hdmf-common/1.8.0/sparse/.zarray | 30 -- .../hdmf-common/1.8.0/sparse/.zattrs | 3 - .../specifications/hdmf-common/1.8.0/sparse/0 | 1 - .../hdmf-common/1.8.0/table/.zarray | 30 -- .../hdmf-common/1.8.0/table/.zattrs | 3 - .../specifications/hdmf-common/1.8.0/table/0 | 1 - .../specifications/hdmf-experimental/.zgroup | 3 - .../hdmf-experimental/0.5.0/.zgroup | 3 - .../0.5.0/experimental/.zarray | 30 -- .../0.5.0/experimental/.zattrs | 3 - .../hdmf-experimental/0.5.0/experimental/0 | 1 - .../hdmf-experimental/0.5.0/namespace/.zarray | 30 -- .../hdmf-experimental/0.5.0/namespace/.zattrs | 3 - .../hdmf-experimental/0.5.0/namespace/0 | 1 - .../hdmf-experimental/0.5.0/resources/.zarray | 30 -- .../hdmf-experimental/0.5.0/resources/.zattrs | 3 - .../hdmf-experimental/0.5.0/resources/0 | 1 - tests/unit/test_consolidate.zarr/.zattrs | 13 + .../.zgroup | 0 tests/unit/test_consolidate.zarr/.zmetadata | 194 ++++++++ .../bar}/.zarray | 14 +- tests/unit/test_consolidate.zarr/bar/.zattrs | 7 + tests/unit/test_consolidate.zarr/bar/0 | Bin 0 -> 32 bytes .../id => test_consolidate.zarr/foo}/.zarray | 0 tests/unit/test_consolidate.zarr/foo/.zattrs | 7 + tests/unit/test_consolidate.zarr/foo/0 | Bin 0 -> 32 bytes .../id}/.zarray | 8 +- .../id/.zattrs | 2 +- .../id/0 | Bin .../quux}/.zarray | 0 tests/unit/test_consolidate.zarr/quux/.zattrs | 16 + tests/unit/test_consolidate.zarr/quux/0 | Bin 0 -> 18 bytes .../quux_elements}/.zarray | 0 .../quux_elements/.zattrs | 7 + .../test_consolidate.zarr/quux_elements/0 | Bin 0 -> 30 bytes tests/unit/test_consolidate.zarr/qux/.zarray | 20 + tests/unit/test_consolidate.zarr/qux/.zattrs | 7 + tests/unit/test_consolidate.zarr/qux/0 | Bin 0 -> 18 bytes .../specifications/.zgroup | 0 tests/unit/test_zarrio.py | 10 +- 59 files changed, 296 insertions(+), 751 deletions(-) delete mode 100644 tests/unit/example_consolidate.zarr/.zattrs delete mode 100644 tests/unit/example_consolidate.zarr/.zmetadata delete mode 100644 
tests/unit/example_consolidate.zarr/first_name/.zattrs delete mode 100644 tests/unit/example_consolidate.zarr/first_name/0 delete mode 100644 tests/unit/example_consolidate.zarr/last_name/.zattrs delete mode 100644 tests/unit/example_consolidate.zarr/last_name/0 delete mode 100644 tests/unit/example_consolidate.zarr/phone_number/.zattrs delete mode 100644 tests/unit/example_consolidate.zarr/phone_number/0 delete mode 100644 tests/unit/example_consolidate.zarr/phone_number_index/.zattrs delete mode 100644 tests/unit/example_consolidate.zarr/phone_number_index/0 delete mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/.zgroup delete mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/.zgroup delete mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/base/.zarray delete mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/base/.zattrs delete mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/base/0 delete mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/namespace/.zarray delete mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/namespace/.zattrs delete mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/namespace/0 delete mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/sparse/.zarray delete mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/sparse/.zattrs delete mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/sparse/0 delete mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/table/.zarray delete mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/table/.zattrs delete mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-common/1.8.0/table/0 delete mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-experimental/.zgroup delete mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-experimental/0.5.0/.zgroup delete mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-experimental/0.5.0/experimental/.zarray delete mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-experimental/0.5.0/experimental/.zattrs delete mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-experimental/0.5.0/experimental/0 delete mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-experimental/0.5.0/namespace/.zarray delete mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-experimental/0.5.0/namespace/.zattrs delete mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-experimental/0.5.0/namespace/0 delete mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-experimental/0.5.0/resources/.zarray delete mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-experimental/0.5.0/resources/.zattrs delete mode 100644 tests/unit/example_consolidate.zarr/specifications/hdmf-experimental/0.5.0/resources/0 create mode 100644 tests/unit/test_consolidate.zarr/.zattrs rename tests/unit/{example_consolidate.zarr => test_consolidate.zarr}/.zgroup (100%) create mode 100644 tests/unit/test_consolidate.zarr/.zmetadata rename tests/unit/{example_consolidate.zarr/phone_number => test_consolidate.zarr/bar}/.zarray (62%) create mode 100644 tests/unit/test_consolidate.zarr/bar/.zattrs create mode 100644 
tests/unit/test_consolidate.zarr/bar/0 rename tests/unit/{example_consolidate.zarr/id => test_consolidate.zarr/foo}/.zarray (100%) create mode 100644 tests/unit/test_consolidate.zarr/foo/.zattrs create mode 100644 tests/unit/test_consolidate.zarr/foo/0 rename tests/unit/{example_consolidate.zarr/last_name => test_consolidate.zarr/id}/.zarray (74%) rename tests/unit/{example_consolidate.zarr => test_consolidate.zarr}/id/.zattrs (63%) rename tests/unit/{example_consolidate.zarr => test_consolidate.zarr}/id/0 (100%) rename tests/unit/{example_consolidate.zarr/phone_number_index => test_consolidate.zarr/quux}/.zarray (100%) create mode 100644 tests/unit/test_consolidate.zarr/quux/.zattrs create mode 100644 tests/unit/test_consolidate.zarr/quux/0 rename tests/unit/{example_consolidate.zarr/first_name => test_consolidate.zarr/quux_elements}/.zarray (100%) create mode 100644 tests/unit/test_consolidate.zarr/quux_elements/.zattrs create mode 100644 tests/unit/test_consolidate.zarr/quux_elements/0 create mode 100644 tests/unit/test_consolidate.zarr/qux/.zarray create mode 100644 tests/unit/test_consolidate.zarr/qux/.zattrs create mode 100644 tests/unit/test_consolidate.zarr/qux/0 rename tests/unit/{example_consolidate.zarr => test_consolidate.zarr}/specifications/.zgroup (100%) diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py index 244b6560..f6e1534a 100644 --- a/src/hdmf_zarr/backend.py +++ b/src/hdmf_zarr/backend.py @@ -437,16 +437,16 @@ def write_builder(self, **kwargs): # Consolidate metadata for the entire file after everything has been written zarr.consolidate_metadata(store=self.path) - def consolidate_metadata(self, store): - """ - When a file is written, the metadata within the file is consolidated automatically. - If there are any metadata changes, the user needs to consolidate the metadata again - with this method in order for the metadata to be read correctly. - - Consolidate all metadata for groups and arrays within the given store into a - single resource and put it under .zmetadata. - """ - zarr.consolidate_metadata(store, metadata_key='.zmetadata') + # def consolidate_metadata(store): + # """ + # When a file is written, the metadata within the file is consolidated automatically. + # If there are any metadata changes, the user needs to consolidate the metadata again + # with this method in order for the metadata to be read correctly. + # + # Consolidate all metadata for groups and arrays within the given store into a + # single resource and put it under .zmetadata. + # """ + # zarr.consolidate_metadata(store, metadata_key='.zmetadata') @staticmethod def __get_store_path(store): @@ -456,6 +456,7 @@ def __get_store_path(store): retrieve the path. 
""" if isinstance(store, zarr.storage.ConsolidatedMetadataStore): + # breakpoint() fpath = store.store.path else: fpath = store.path @@ -1268,7 +1269,6 @@ def read_builder(self): return f_builder def __set_built(self, zarr_obj, builder): - # fpath = zarr_obj.store.path fpath = self.__get_store_path(zarr_obj.store) path = zarr_obj.path path = os.path.join(fpath, path) diff --git a/tests/unit/example_consolidate.zarr/.zattrs b/tests/unit/example_consolidate.zarr/.zattrs deleted file mode 100644 index fda81f3a..00000000 --- a/tests/unit/example_consolidate.zarr/.zattrs +++ /dev/null @@ -1,12 +0,0 @@ -{ - ".specloc": "specifications", - "colnames": [ - "first_name", - "last_name", - "phone_number" - ], - "data_type": "DynamicTable", - "description": "a table containing data/metadata about users, one user per row", - "namespace": "hdmf-common", - "object_id": "ea83daef-37db-4b95-af84-6f5d840423f6" -} \ No newline at end of file diff --git a/tests/unit/example_consolidate.zarr/.zmetadata b/tests/unit/example_consolidate.zarr/.zmetadata deleted file mode 100644 index 9f6f8ad0..00000000 --- a/tests/unit/example_consolidate.zarr/.zmetadata +++ /dev/null @@ -1,421 +0,0 @@ -{ - "metadata": { - ".zattrs": { - ".specloc": "specifications", - "colnames": [ - "first_name", - "last_name", - "phone_number" - ], - "data_type": "DynamicTable", - "description": "a table containing data/metadata about users, one user per row", - "namespace": "hdmf-common", - "object_id": "ea83daef-37db-4b95-af84-6f5d840423f6" - }, - ".zgroup": { - "zarr_format": 2 - }, - "first_name/.zarray": { - "chunks": [ - 2 - ], - "compressor": { - "blocksize": 0, - "clevel": 5, - "cname": "lz4", - "id": "blosc", - "shuffle": 1 - }, - "dtype": "|O", - "fill_value": 0, - "filters": [ - { - "id": "vlen-utf8" - } - ], - "order": "C", - "shape": [ - 2 - ], - "zarr_format": 2 - }, - "first_name/.zattrs": { - "data_type": "VectorData", - "description": "the first name of the user", - "namespace": "hdmf-common", - "object_id": "cdbf89b5-00bd-44e5-a350-2b4ba549dc3c", - "zarr_dtype": "str" - }, - "id/.zarray": { - "chunks": [ - 2 - ], - "compressor": { - "blocksize": 0, - "clevel": 5, - "cname": "lz4", - "id": "blosc", - "shuffle": 1 - }, - "dtype": "pVpbq_FG@^KWdSlAa}x6aDFFlU diff --git a/tests/unit/example_consolidate.zarr/last_name/.zattrs b/tests/unit/example_consolidate.zarr/last_name/.zattrs deleted file mode 100644 index 7a06f0c0..00000000 --- a/tests/unit/example_consolidate.zarr/last_name/.zattrs +++ /dev/null @@ -1,7 +0,0 @@ -{ - "data_type": "VectorData", - "description": "the last name of the user", - "namespace": "hdmf-common", - "object_id": "ff302d07-62c1-4e84-b50a-75f4d8586e88", - "zarr_dtype": "str" -} \ No newline at end of file diff --git a/tests/unit/example_consolidate.zarr/last_name/0 b/tests/unit/example_consolidate.zarr/last_name/0 deleted file mode 100644 index 35f74acf51170e539cc0aa5eb2e940118bdc74ce..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 40 lcmZQ#G-i}wU|;}Y4IpL$Vm2W5$S){JEdsGZN{ce{(g8g51%&_r diff --git a/tests/unit/example_consolidate.zarr/phone_number/.zattrs b/tests/unit/example_consolidate.zarr/phone_number/.zattrs deleted file mode 100644 index 4446f0e9..00000000 --- a/tests/unit/example_consolidate.zarr/phone_number/.zattrs +++ /dev/null @@ -1,7 +0,0 @@ -{ - "data_type": "VectorData", - "description": "the phone number of the user", - "namespace": "hdmf-common", - "object_id": "a3ef9627-dd8a-4715-a144-afe6b62c400a", - "zarr_dtype": "str" -} \ No newline at end of 
file diff --git a/tests/unit/example_consolidate.zarr/phone_number/0 b/tests/unit/example_consolidate.zarr/phone_number/0 deleted file mode 100644 index ec2fe8ace1775e043b0c123bc7f8ae09e0d681ae..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 68 zcmWm1!3_W)2n0dYudQ4nDu{pkugPPRd2B7KY!OM18g;M1~e*t~~ literal 0 HcmV?d00001 diff --git a/tests/unit/example_consolidate.zarr/id/.zarray b/tests/unit/test_consolidate.zarr/foo/.zarray similarity index 100% rename from tests/unit/example_consolidate.zarr/id/.zarray rename to tests/unit/test_consolidate.zarr/foo/.zarray diff --git a/tests/unit/test_consolidate.zarr/foo/.zattrs b/tests/unit/test_consolidate.zarr/foo/.zattrs new file mode 100644 index 00000000..597550b7 --- /dev/null +++ b/tests/unit/test_consolidate.zarr/foo/.zattrs @@ -0,0 +1,7 @@ +{ + "data_type": "VectorData", + "description": "an int column", + "namespace": "hdmf-common", + "object_id": "503b7599-9ab9-46f5-b184-c5c03c6e40e4", + "zarr_dtype": "int64" +} \ No newline at end of file diff --git a/tests/unit/test_consolidate.zarr/foo/0 b/tests/unit/test_consolidate.zarr/foo/0 new file mode 100644 index 0000000000000000000000000000000000000000..1fcafd113d312ec408f2854c4c995da14fc1ef0c GIT binary patch literal 32 acmZQ#H0BUsU|;}Y1t69NVh~V;Pz(SM`2fEF literal 0 HcmV?d00001 diff --git a/tests/unit/example_consolidate.zarr/last_name/.zarray b/tests/unit/test_consolidate.zarr/id/.zarray similarity index 74% rename from tests/unit/example_consolidate.zarr/last_name/.zarray rename to tests/unit/test_consolidate.zarr/id/.zarray index eca134e7..49bcc3cb 100644 --- a/tests/unit/example_consolidate.zarr/last_name/.zarray +++ b/tests/unit/test_consolidate.zarr/id/.zarray @@ -9,13 +9,9 @@ "id": "blosc", "shuffle": 1 }, - "dtype": "|O", + "dtype": " Date: Fri, 1 Dec 2023 10:44:33 -0800 Subject: [PATCH 11/25] assert --- tests/unit/test_zarrio.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_zarrio.py b/tests/unit/test_zarrio.py index 1c98f903..b477bd4a 100644 --- a/tests/unit/test_zarrio.py +++ b/tests/unit/test_zarrio.py @@ -18,6 +18,10 @@ import zarr from hdmf.testing import TestCase from hdmf_zarr.backend import ZarrIO +import os + + +CUR_DIR = os.path.dirname(os.path.realpath(__file__)) ###################################################### @@ -137,13 +141,15 @@ class TestConsolidateMetadata(TestCase): def test_get_store_path_shallow(self): store = DirectoryStore('tests/unit/example.zarr') path = ZarrIO._ZarrIO__get_store_path(store) - # assert + expected_path = os.path.normpath(os.path.join(CUR_DIR, 'example.zarr')) + self.assertEqual(path, expected_path) def test_get_store_path_deep(self): zarr_obj = zarr.open_consolidated('tests/unit/test_consolidate.zarr', mode='r') store = zarr_obj.store path = ZarrIO._ZarrIO__get_store_path(store) - # assert + expected_path = os.path.normpath(os.path.join(CUR_DIR, 'test_consolidate.zarr')) + self.assertEqual(path, expected_path) # def test_warning_consolidate_metadata(self): # pass From bdd6d7b596acab4b15724e7a1b51233d5e32168d Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Fri, 1 Dec 2023 10:54:43 -0800 Subject: [PATCH 12/25] cleaned --- src/hdmf_zarr/backend.py | 12 ------------ tests/unit/test_zarrio.py | 9 --------- 2 files changed, 21 deletions(-) diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py index f6e1534a..771bf1a2 100644 --- a/src/hdmf_zarr/backend.py +++ b/src/hdmf_zarr/backend.py @@ -437,17 +437,6 @@ def write_builder(self, **kwargs): # Consolidate 
metadata for the entire file after everything has been written
         zarr.consolidate_metadata(store=self.path)

-    # def consolidate_metadata(store):
-    #     """
-    #     When a file is written, the metadata within the file is consolidated automatically.
-    #     If there are any metadata changes, the user needs to consolidate the metadata again
-    #     with this method in order for the metadata to be read correctly.
-    #
-    #     Consolidate all metadata for groups and arrays within the given store into a
-    #     single resource and put it under .zmetadata.
-    #     """
-    #     zarr.consolidate_metadata(store, metadata_key='.zmetadata')

     @staticmethod
     def __get_store_path(store):
         """
@@ -456,7 +445,6 @@ def __get_store_path(store):
         retrieve the path.
         """
         if isinstance(store, zarr.storage.ConsolidatedMetadataStore):
-            # breakpoint()
             fpath = store.store.path
         else:
             fpath = store.path
diff --git a/tests/unit/test_zarrio.py b/tests/unit/test_zarrio.py
index b477bd4a..c2a81bbe 100644
--- a/tests/unit/test_zarrio.py
+++ b/tests/unit/test_zarrio.py
@@ -150,12 +150,3 @@ def test_get_store_path_deep(self):
         path = ZarrIO._ZarrIO__get_store_path(store)
         expected_path = os.path.normpath(os.path.join(CUR_DIR, 'test_consolidate.zarr'))
         self.assertEqual(path, expected_path)
-
-    # def test_warning_consolidate_metadata(self):
-    #     pass
-    #
-    # def test_open_with_zmetadata(self):
-    #     pass
-    #
-    # def test_open_with_zmetadata_references(self):
-    #     pass

From ed065d46955836afa9e7d12873323e879cc3d182 Mon Sep 17 00:00:00 2001
From: Matthew Avaylon
Date: Fri, 1 Dec 2023 11:14:38 -0800
Subject: [PATCH 13/25] Update backend.py

---
 src/hdmf_zarr/backend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py
index 771bf1a2..bd0b13f0 100644
--- a/src/hdmf_zarr/backend.py
+++ b/src/hdmf_zarr/backend.py
@@ -158,7 +158,7 @@ def open(self):
         """Open the Zarr file"""
         if self.__file is None:
             # Within zarr, open_consolidated only allows the mode to be 'r' or 'r+'.
-            # As a result, when in 'w' mode, the file will not use consolidated metadata.
+            # As a result, in any other mode the file will not use consolidated metadata.

From 9a06baa63e5cf031fb81ab678ca8e7c045aa3cf9 Mon Sep 17 00:00:00 2001
From: Matthew Avaylon
Date: Fri, 1 Dec 2023 11:15:25 -0800
Subject: [PATCH 14/25] Update backend.py

---
 src/hdmf_zarr/backend.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py
index bd0b13f0..863fc029 100644
--- a/src/hdmf_zarr/backend.py
+++ b/src/hdmf_zarr/backend.py
@@ -472,8 +472,6 @@ def __open_file_consolidated(self,
                                          synchronizer=synchronizer,
                                          storage_options=storage_options)
         else:
-            msg = "Could not find consolidated metadata."
-            warnings.warn(msg)
             return zarr.open(store=self.path,
                              mode=self.__mode,
                              synchronizer=self.__synchronizer,
                              storage_options=self.__storage_options)

From f6e4976bca34a191bc3498d85c83bc922df9fd46 Mon Sep 17 00:00:00 2001
From: Matthew Avaylon
Date: Fri, 1 Dec 2023 11:18:21 -0800
Subject: [PATCH 15/25] Update CHANGELOG.md

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 70212772..86270582 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,7 @@

 ## 0.5.0 (Upcoming)

 ### Enhancements
+* Added a new default to consolidate metadata in order to more efficiently traverse storage contents.
  @mavaylon1 [#142](https://github.com/hdmf-dev/hdmf-zarr/pull/142)
 * Fix linking for FSSpec and support passing of `storage_options` required for reading data from S3 #138.
   @alejoe91 [#120](https://github.com/hdmf-dev/hdmf-zarr/pull/138)

 ## 0.4.0 (October 3, 2023)

From c20e9c59398621ac0d3085f7d7165ff31784b7b2 Mon Sep 17 00:00:00 2001
From: Matthew Avaylon
Date: Mon, 4 Dec 2023 10:32:45 -0800
Subject: [PATCH 16/25] Update storage.rst

---
 docs/source/storage.rst | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/docs/source/storage.rst b/docs/source/storage.rst
index 45500c67..af8ac0f2 100644
--- a/docs/source/storage.rst
+++ b/docs/source/storage.rst
@@ -394,3 +394,10 @@ data type.
 The specification of the namespace is stored in ``/specifications/<namespace>/<version>/<source-file>``.
 Here ``<source-file>`` refers to the main name of the source-file without file extension
 (e.g., the core namespace defines ``nwb.ecephys.yaml`` as source which would
 be stored in ``/specifications/core/2.0.1/nwb.ecephys``).
+
+Consolidating Metadata
+======================
+
+Zarr allows users to consolidate all metadata for groups and arrays within the given store. In practice, every file
+will consolidate all of its metadata into a single `.zmetadata` file, stored in the root group. This reduces the number of read
+operations when retrieving certain metadata in read mode.

From 51f66c1b9bf2722798bcf507086313f42d29283c Mon Sep 17 00:00:00 2001
From: mavaylon1
Date: Mon, 4 Dec 2023 11:51:35 -0800
Subject: [PATCH 17/25] bool and updated tests

---
 src/hdmf_zarr/backend.py        | 23 +++++++++++++++++---
 tests/unit/base_tests_zarrio.py | 37 +++++++++++++++++++++++++++++++++
 tests/unit/test_zarrio.py       | 14 +++++++------
 3 files changed, 65 insertions(+), 9 deletions(-)

diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py
index 771bf1a2..fa61b0ab 100644
--- a/src/hdmf_zarr/backend.py
+++ b/src/hdmf_zarr/backend.py
@@ -246,6 +246,14 @@ def load_namespaces(cls, namespace_catalog, path, namespaces=None):
             ),
             "default": None,
         },
+        {
+            "name": "consolidate_metadata",
+            "type": bool,
+            "doc": (
+                "Boolean to consolidate metadata into a single .zmetadata file within the root group."
+            ),
+            "default": True,
+        }
     )
     def write(self, **kwargs):
         """Overwrite the write method to add support for caching the specification and parallelization."""
@@ -406,11 +414,19 @@ def get_builder_disk_path(self, **kwargs):
          'doc': 'The source of the builders when exporting',
          'default': None,
          },
+         {
+             "name": "consolidate_metadata",
+             "type": bool,
+             "doc": (
+                 "Boolean to consolidate metadata into a single .zmetadata file within the root group."
+ ), + "default": True, + } ) def write_builder(self, **kwargs): """Write a builder to disk.""" - f_builder, link_data, exhaust_dci, export_source = getargs( - 'builder', 'link_data', 'exhaust_dci', 'export_source', kwargs + f_builder, link_data, exhaust_dci, export_source, consolidate_metadata = getargs( + 'builder', 'link_data', 'exhaust_dci', 'export_source', 'consolidate_metadata', kwargs ) for name, gbldr in f_builder.groups.items(): self.write_group( @@ -435,7 +451,8 @@ def write_builder(self, **kwargs): (f_builder.__class__.__qualname__, f_builder.name, self.source)) # Consolidate metadata for the entire file after everything has been written - zarr.consolidate_metadata(store=self.path) + if consolidate_metadata: + zarr.consolidate_metadata(store=self.path) @staticmethod def __get_store_path(store): diff --git a/tests/unit/base_tests_zarrio.py b/tests/unit/base_tests_zarrio.py index 440850af..91abadf2 100644 --- a/tests/unit/base_tests_zarrio.py +++ b/tests/unit/base_tests_zarrio.py @@ -87,6 +87,43 @@ def tearDown(self): warnings.warn("Could not remove: %s" % path) +class ZarrStoreTestCase(TestCase): + """ + Class that creates a zarr file containing groups, datasets, and references for + general purpose testing. + """ + def setUp(self): + self.manager = get_foo_buildmanager() + self.store = "tests/unit/test_io.zarr" + + def tearDown(self): + shutil.rmtree(self.store) + + def createReferenceBuilder(self): + data_1 = np.arange(100, 200, 10).reshape(2, 5) + data_2 = np.arange(0, 200, 10).reshape(4, 5) + dataset_1 = DatasetBuilder('dataset_1', data_1) + dataset_2 = DatasetBuilder('dataset_2', data_2) + + ref_dataset_1 = ReferenceBuilder(dataset_1) + ref_dataset_2 = ReferenceBuilder(dataset_2) + ref_data = [ref_dataset_1, ref_dataset_2] + dataset_ref = DatasetBuilder('ref_dataset', ref_data, dtype='object') + + builder = GroupBuilder('root', + source=self.store, + datasets={'dataset_1': dataset_1, + 'dataset_2': dataset_2, + 'ref_dataset': dataset_ref}) + return builder + + def create_zarr(self, consolidate_metadata=True): + builder = self.createReferenceBuilder() + writer = ZarrIO(self.store, manager=self.manager, mode='a') + writer.write_builder(builder, consolidate_metadata) + writer.close() + + class BaseTestZarrWriter(BaseZarrWriterTestCase): """ Test writing of builder with ZarrIO diff --git a/tests/unit/test_zarrio.py b/tests/unit/test_zarrio.py index c2a81bbe..2af9bcef 100644 --- a/tests/unit/test_zarrio.py +++ b/tests/unit/test_zarrio.py @@ -10,13 +10,13 @@ need to implement the tests separately for the different backends. """ from tests.unit.base_tests_zarrio import (BaseTestZarrWriter, + ZarrStoreTestCase, BaseTestZarrWriteUnit, BaseTestExportZarrToZarr) from zarr.storage import (DirectoryStore, TempStore, NestedDirectoryStore) import zarr -from hdmf.testing import TestCase from hdmf_zarr.backend import ZarrIO import os @@ -134,19 +134,21 @@ def setUp(self): ######################################### # Consolidate Metadata tests ######################################### -class TestConsolidateMetadata(TestCase): +class TestConsolidateMetadata(ZarrStoreTestCase): """ Tests for consolidated metadata and corresponding helper methods. 
""" def test_get_store_path_shallow(self): - store = DirectoryStore('tests/unit/example.zarr') + self.create_zarr(consolidate_metadata=False) + store = DirectoryStore(self.store) path = ZarrIO._ZarrIO__get_store_path(store) - expected_path = os.path.normpath(os.path.join(CUR_DIR, 'example.zarr')) + expected_path = os.path.normpath(os.path.join(CUR_DIR, 'test_io.zarr')) self.assertEqual(path, expected_path) def test_get_store_path_deep(self): - zarr_obj = zarr.open_consolidated('tests/unit/test_consolidate.zarr', mode='r') + self.create_zarr() + zarr_obj = zarr.open_consolidated(self.store, mode='r') store = zarr_obj.store path = ZarrIO._ZarrIO__get_store_path(store) - expected_path = os.path.normpath(os.path.join(CUR_DIR, 'test_consolidate.zarr')) + expected_path = os.path.normpath(os.path.join(CUR_DIR, 'test_io.zarr')) self.assertEqual(path, expected_path) From a0eeb3944c2167a6b83fa6d5d8b6b89d086cc378 Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Mon, 4 Dec 2023 11:52:37 -0800 Subject: [PATCH 18/25] storage rst --- docs/source/storage.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/storage.rst b/docs/source/storage.rst index af8ac0f2..47690245 100644 --- a/docs/source/storage.rst +++ b/docs/source/storage.rst @@ -398,6 +398,6 @@ be stored in ``/specifications/core/2.0.1/nwb.ecephys``). Consolidating Metadata ====================== -Zarr allows users to consolidate all metadata for groups and arrays within the given store. In practice, every file +Zarr allows users to consolidate all metadata for groups and arrays within the given store. By default, every file will consolidate all metadata within into a single `.zmetadata` file, stored in the root group. This reduces the number of read operations when retrieving certain metadata in read mode. From bd3bb42b813ed25fa650bec55f9b52eb5f3f2484 Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Mon, 4 Dec 2023 12:13:24 -0800 Subject: [PATCH 19/25] tutorial --- docs/gallery/plot_nwb_zarrio.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/gallery/plot_nwb_zarrio.py b/docs/gallery/plot_nwb_zarrio.py index 95eed3db..eacceddc 100644 --- a/docs/gallery/plot_nwb_zarrio.py +++ b/docs/gallery/plot_nwb_zarrio.py @@ -26,6 +26,7 @@ from datetime import datetime from dateutil.tz import tzlocal +import zarr import numpy as np from pynwb import NWBFile @@ -142,3 +143,15 @@ # relative ``path`` here instead is fine. with NWBZarrIO(path=absolute_path, mode="r") as io: infile = io.read() + +############################################################################### +# Consolidating Metadata +# ---------------------- +# When writing to Zarr, the metadata within the file will be consolidated into a single +# file within the root group, `.zmetadata`. Users who do not wish to consolidate the +# metadata can set the boolean parameter `consolidate_metadata` to `False` within `write`. +# Even when the metadata is consolidated, the metadata natively within the file can be altered. +# Any alterations within would require the user to call `zarr.convenience.consolidate_metadata()` +# to sync the file with the changes. 
Please refer to the Zarr documentation for more details:
+# https://zarr.readthedocs.io/en/stable/tutorial.html#storage-alternatives
+zarr.consolidate_metadata(path)

From 8b33f7898280a3b589189500688dd489abda7607 Mon Sep 17 00:00:00 2001
From: mavaylon1
Date: Mon, 4 Dec 2023 12:19:59 -0800
Subject: [PATCH 20/25] remove

---
 tests/unit/example.zarr/.zattrs               |  12 --
 tests/unit/example.zarr/.zgroup               |   3 -
 tests/unit/example.zarr/first_name/.zarray    |  24 ---
 tests/unit/example.zarr/first_name/.zattrs    |   7 -
 tests/unit/example.zarr/first_name/0          | Bin 37 -> 0 bytes
 tests/unit/example.zarr/id/.zarray            |  20 --
 tests/unit/example.zarr/id/.zattrs            |   6 -
 tests/unit/example.zarr/id/0                  | Bin 32 -> 0 bytes
 tests/unit/example.zarr/last_name/.zarray     |  24 ---
 tests/unit/example.zarr/last_name/.zattrs     |   7 -
 tests/unit/example.zarr/last_name/0           | Bin 40 -> 0 bytes
 tests/unit/example.zarr/phone_number/.zarray  |  24 ---
 tests/unit/example.zarr/phone_number/.zattrs  |   7 -
 tests/unit/example.zarr/phone_number/0        | Bin 68 -> 0 bytes
 .../example.zarr/phone_number_index/.zarray   |  20 --
 .../example.zarr/phone_number_index/.zattrs   |  16 --
 tests/unit/example.zarr/phone_number_index/0  | Bin 18 -> 0 bytes
 .../unit/example.zarr/specifications/.zgroup  |   3 -
 .../specifications/hdmf-common/.zgroup        |   3 -
 .../specifications/hdmf-common/1.8.0/.zgroup  |   3 -
 .../hdmf-common/1.8.0/base/.zarray            |  30 ---
 .../hdmf-common/1.8.0/base/.zattrs            |   3 -
 .../specifications/hdmf-common/1.8.0/base/0   |   1 -
 .../hdmf-common/1.8.0/namespace/.zarray       |  30 ---
 .../hdmf-common/1.8.0/namespace/.zattrs       |   3 -
 .../hdmf-common/1.8.0/namespace/0             |   1 -
 .../hdmf-common/1.8.0/sparse/.zarray          |  30 ---
 .../hdmf-common/1.8.0/sparse/.zattrs          |   3 -
 .../specifications/hdmf-common/1.8.0/sparse/0 |   1 -
 .../hdmf-common/1.8.0/table/.zarray           |  30 ---
 .../hdmf-common/1.8.0/table/.zattrs           |   3 -
 .../specifications/hdmf-common/1.8.0/table/0  |   1 -
 .../specifications/hdmf-experimental/.zgroup  |   3 -
 .../hdmf-experimental/0.5.0/.zgroup           |   3 -
 .../0.5.0/experimental/.zarray                |  30 ---
 .../0.5.0/experimental/.zattrs                |   3 -
 .../hdmf-experimental/0.5.0/experimental/0    |   1 -
 .../hdmf-experimental/0.5.0/namespace/.zarray |  30 ---
 .../hdmf-experimental/0.5.0/namespace/.zattrs |   3 -
 .../hdmf-experimental/0.5.0/namespace/0       |   1 -
 .../hdmf-experimental/0.5.0/resources/.zarray |  30 ---
 .../hdmf-experimental/0.5.0/resources/.zattrs |   3 -
 .../hdmf-experimental/0.5.0/resources/0       |   1 -
 tests/unit/test_consolidate.zarr/.zattrs      |  13 --
 tests/unit/test_consolidate.zarr/.zgroup      |   3 -
 tests/unit/test_consolidate.zarr/.zmetadata   | 194 ------------------
 tests/unit/test_consolidate.zarr/bar/.zarray  |  20 --
 tests/unit/test_consolidate.zarr/bar/.zattrs  |   7 -
 tests/unit/test_consolidate.zarr/bar/0        | Bin 32 -> 0 bytes
 tests/unit/test_consolidate.zarr/foo/.zarray  |  20 --
 tests/unit/test_consolidate.zarr/foo/.zattrs  |   7 -
 tests/unit/test_consolidate.zarr/foo/0        | Bin 32 -> 0 bytes
 tests/unit/test_consolidate.zarr/id/.zarray   |  20 --
 tests/unit/test_consolidate.zarr/id/.zattrs   |   6 -
 tests/unit/test_consolidate.zarr/id/0         | Bin 32 -> 0 bytes
 tests/unit/test_consolidate.zarr/quux/.zarray |  20 --
 tests/unit/test_consolidate.zarr/quux/.zattrs |  16 --
 tests/unit/test_consolidate.zarr/quux/0       | Bin 18 -> 0 bytes
 .../quux_elements/.zarray                     |  24 ---
 .../quux_elements/.zattrs                     |   7 -
 .../test_consolidate.zarr/quux_elements/0     | Bin 30 -> 0 bytes
 tests/unit/test_consolidate.zarr/qux/.zarray  |  20 --
 tests/unit/test_consolidate.zarr/qux/.zattrs  |   7 -
 tests/unit/test_consolidate.zarr/qux/0        | Bin 18 -> 0 bytes
 .../specifications/.zgroup                    |   3 -
 65 files changed, 810 deletions(-)
 delete mode 100644 tests/unit/example.zarr/.zattrs
 delete mode 100644 tests/unit/example.zarr/.zgroup
 delete mode 100644 tests/unit/example.zarr/first_name/.zarray
 delete mode 100644 tests/unit/example.zarr/first_name/.zattrs
 delete mode 100644 tests/unit/example.zarr/first_name/0
 delete mode 100644 tests/unit/example.zarr/id/.zarray
 delete mode 100644 tests/unit/example.zarr/id/.zattrs
 delete mode 100644 tests/unit/example.zarr/id/0
 delete mode 100644 tests/unit/example.zarr/last_name/.zarray
 delete mode 100644 tests/unit/example.zarr/last_name/.zattrs
 delete mode 100644 tests/unit/example.zarr/last_name/0
 delete mode 100644 tests/unit/example.zarr/phone_number/.zarray
 delete mode 100644 tests/unit/example.zarr/phone_number/.zattrs
 delete mode 100644 tests/unit/example.zarr/phone_number/0
 delete mode 100644 tests/unit/example.zarr/phone_number_index/.zarray
 delete mode 100644 tests/unit/example.zarr/phone_number_index/.zattrs
 delete mode 100644 tests/unit/example.zarr/phone_number_index/0
 delete mode 100644 tests/unit/example.zarr/specifications/.zgroup
 delete mode 100644 tests/unit/example.zarr/specifications/hdmf-common/.zgroup
 delete mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/.zgroup
 delete mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/base/.zarray
 delete mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/base/.zattrs
 delete mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/base/0
 delete mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/namespace/.zarray
 delete mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/namespace/.zattrs
 delete mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/namespace/0
 delete mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/sparse/.zarray
 delete mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/sparse/.zattrs
 delete mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/sparse/0
 delete mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/table/.zarray
 delete mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/table/.zattrs
 delete mode 100644 tests/unit/example.zarr/specifications/hdmf-common/1.8.0/table/0
 delete mode 100644 tests/unit/example.zarr/specifications/hdmf-experimental/.zgroup
 delete mode 100644 tests/unit/example.zarr/specifications/hdmf-experimental/0.5.0/.zgroup
 delete mode 100644 tests/unit/example.zarr/specifications/hdmf-experimental/0.5.0/experimental/.zarray
 delete mode 100644 tests/unit/example.zarr/specifications/hdmf-experimental/0.5.0/experimental/.zattrs
 delete mode 100644 tests/unit/example.zarr/specifications/hdmf-experimental/0.5.0/experimental/0
 delete mode 100644 tests/unit/example.zarr/specifications/hdmf-experimental/0.5.0/namespace/.zarray
 delete mode 100644 tests/unit/example.zarr/specifications/hdmf-experimental/0.5.0/namespace/.zattrs
 delete mode 100644 tests/unit/example.zarr/specifications/hdmf-experimental/0.5.0/namespace/0
 delete mode 100644 tests/unit/example.zarr/specifications/hdmf-experimental/0.5.0/resources/.zarray
 delete mode 100644 tests/unit/example.zarr/specifications/hdmf-experimental/0.5.0/resources/.zattrs
 delete mode 100644 tests/unit/example.zarr/specifications/hdmf-experimental/0.5.0/resources/0
 delete mode 100644 tests/unit/test_consolidate.zarr/.zattrs
 delete mode 100644 tests/unit/test_consolidate.zarr/.zgroup
 delete mode 100644 tests/unit/test_consolidate.zarr/.zmetadata
 delete mode 100644 tests/unit/test_consolidate.zarr/bar/.zarray
 delete mode 100644 tests/unit/test_consolidate.zarr/bar/.zattrs
 delete mode 100644 tests/unit/test_consolidate.zarr/bar/0
 delete mode 100644 tests/unit/test_consolidate.zarr/foo/.zarray
 delete mode 100644 tests/unit/test_consolidate.zarr/foo/.zattrs
 delete mode 100644 tests/unit/test_consolidate.zarr/foo/0
 delete mode 100644 tests/unit/test_consolidate.zarr/id/.zarray
 delete mode 100644 tests/unit/test_consolidate.zarr/id/.zattrs
 delete mode 100644 tests/unit/test_consolidate.zarr/id/0
 delete mode 100644 tests/unit/test_consolidate.zarr/quux/.zarray
 delete mode 100644 tests/unit/test_consolidate.zarr/quux/.zattrs
 delete mode 100644 tests/unit/test_consolidate.zarr/quux/0
 delete mode 100644 tests/unit/test_consolidate.zarr/quux_elements/.zarray
 delete mode 100644 tests/unit/test_consolidate.zarr/quux_elements/.zattrs
 delete mode 100644 tests/unit/test_consolidate.zarr/quux_elements/0
 delete mode 100644 tests/unit/test_consolidate.zarr/qux/.zarray
 delete mode 100644 tests/unit/test_consolidate.zarr/qux/.zattrs
 delete mode 100644 tests/unit/test_consolidate.zarr/qux/0
 delete mode 100644 tests/unit/test_consolidate.zarr/specifications/.zgroup

diff --git a/tests/unit/example.zarr/.zattrs b/tests/unit/example.zarr/.zattrs
deleted file mode 100644
index fda81f3a..00000000
--- a/tests/unit/example.zarr/.zattrs
+++ /dev/null
@@ -1,12 +0,0 @@
-{
-    ".specloc": "specifications",
-    "colnames": [
-        "first_name",
-        "last_name",
-        "phone_number"
-    ],
-    "data_type": "DynamicTable",
-    "description": "a table containing data/metadata about users, one user per row",
-    "namespace": "hdmf-common",
-    "object_id": "ea83daef-37db-4b95-af84-6f5d840423f6"
-}
\ No newline at end of file
diff --git a/tests/unit/example.zarr/.zgroup b/tests/unit/example.zarr/.zgroup
deleted file mode 100644
index 3b7daf22..00000000
--- a/tests/unit/example.zarr/.zgroup
+++ /dev/null
@@ -1,3 +0,0 @@
-{
-    "zarr_format": 2
-}
\ No newline at end of file
diff --git a/tests/unit/example.zarr/first_name/.zarray b/tests/unit/example.zarr/first_name/.zarray
deleted file mode 100644
index eca134e7..00000000
--- a/tests/unit/example.zarr/first_name/.zarray
+++ /dev/null
@@ -1,24 +0,0 @@
-{
-    "chunks": [
-        2
-    ],
-    "compressor": {
-        "blocksize": 0,
-        "clevel": 5,
-        "cname": "lz4",
-        "id": "blosc",
-        "shuffle": 1
-    },
-    "dtype": "|O",
-    "fill_value": 0,
-    "filters": [
-        {
-            "id": "vlen-utf8"
-        }
-    ],
-    "order": "C",
-    "shape": [
-        2
-    ],
-    "zarr_format": 2
-}
\ No newline at end of file
diff --git a/tests/unit/example.zarr/first_name/.zattrs b/tests/unit/example.zarr/first_name/.zattrs
deleted file mode 100644
index 064fca30..00000000
--- a/tests/unit/example.zarr/first_name/.zattrs
+++ /dev/null
@@ -1,7 +0,0 @@
-{
-    "data_type": "VectorData",
-    "description": "the first name of the user",
-    "namespace": "hdmf-common",
-    "object_id": "cdbf89b5-00bd-44e5-a350-2b4ba549dc3c",
-    "zarr_dtype": "str"
-}
\ No newline at end of file
diff --git a/tests/unit/example.zarr/first_name/0 b/tests/unit/example.zarr/first_name/0
deleted file mode 100644
index b890f257c003da42b9b99cadd51ebe101d6056b4..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 37
jcmZQ#G-ecKU|;}YRUl>pVpbq_FG@^KWdSlAa}x6aDFFlU
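For orientation, the fixture files deleted in this patch are simply Zarr's on-disk layout: a `.zgroup` per group, a `.zattrs` per annotated object, a `.zarray` per array, and numbered chunk files. Below is a minimal sketch, not part of any patch, of how a store with metadata like the `first_name` array above is produced; the store path, attribute, and data values are illustrative only:

    import zarr
    from numcodecs import Blosc, VLenUTF8

    # Opening in 'w' mode creates the root group and its .zgroup file.
    root = zarr.open("example.zarr", mode="w")
    root.attrs["description"] = "demo"  # written to the root .zattrs

    # A variable-length string column: the object_codec becomes the
    # "vlen-utf8" entry under "filters" in the resulting .zarray, and the
    # compressor settings match the blosc/lz4 block shown in the fixture.
    root.create_dataset(
        "first_name",
        data=["Alice", "Bob"],
        dtype=object,
        object_codec=VLenUTF8(),
        chunks=(2,),
        compressor=Blosc(cname="lz4", clevel=5, shuffle=Blosc.SHUFFLE),
    )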
diff --git a/tests/unit/example.zarr/id/.zarray b/tests/unit/example.zarr/id/.zarray
deleted file mode 100644
index 49bcc3cb..00000000
--- a/tests/unit/example.zarr/id/.zarray
+++ /dev/null
@@ -1,20 +0,0 @@
-{
-    "chunks": [
-        2
-    ],
-    "compressor": {
-        "blocksize": 0,
-        "clevel": 5,
-        "cname": "lz4",
-        "id": "blosc",
-        "shuffle": 1
-    },
-    "dtype": "<i8",
-    "fill_value": 0,
-    "filters": null,
-    "order": "C",
-    "shape": [
-        2
-    ],
-    "zarr_format": 2
-}
\ No newline at end of file
[...]
diff --git a/tests/unit/test_consolidate.zarr/foo/.zarray b/tests/unit/test_consolidate.zarr/foo/.zarray
deleted file mode 100644
index 49bcc3cb..00000000
--- a/tests/unit/test_consolidate.zarr/foo/.zarray
+++ /dev/null
@@ -1,20 +0,0 @@
-{
-    "chunks": [
-        2
-    ],
-    "compressor": {
-        "blocksize": 0,
-        "clevel": 5,
-        "cname": "lz4",
-        "id": "blosc",
-        "shuffle": 1
-    },
-    "dtype": "<i8",
-    "fill_value": 0,
-    "filters": null,
-    "order": "C",
-    "shape": [
-        2
-    ],
-    "zarr_format": 2
-}
\ No newline at end of file
[...]
Date: Mon, 4 Dec 2023 14:17:04 -0800
Subject: [PATCH 21/25] review

---
 tests/unit/base_tests_zarrio.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/unit/base_tests_zarrio.py b/tests/unit/base_tests_zarrio.py
index 91abadf2..4895d495 100644
--- a/tests/unit/base_tests_zarrio.py
+++ b/tests/unit/base_tests_zarrio.py
@@ -93,7 +93,6 @@ class ZarrStoreTestCase(TestCase):
     general purpose testing.
     """
     def setUp(self):
-        self.manager = get_foo_buildmanager()
         self.store = "tests/unit/test_io.zarr"
 
     def tearDown(self):
@@ -119,7 +118,7 @@ def createReferenceBuilder(self):
 
     def create_zarr(self, consolidate_metadata=True):
         builder = self.createReferenceBuilder()
-        writer = ZarrIO(self.store, manager=self.manager, mode='a')
+        writer = ZarrIO(self.store, mode='a')
         writer.write_builder(builder, consolidate_metadata)
         writer.close()

From b75b279624beb3bfc0b49adf64ecae8a909cb223 Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Mon, 4 Dec 2023 16:51:09 -0800
Subject: [PATCH 22/25] Update docs/source/storage.rst

---
 docs/source/storage.rst | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/docs/source/storage.rst b/docs/source/storage.rst
index 47690245..f5dbfd59 100644
--- a/docs/source/storage.rst
+++ b/docs/source/storage.rst
@@ -401,3 +401,10 @@ Consolidating Metadata
 Zarr allows users to consolidate all metadata for groups and arrays within the given store. By
 default, every file will consolidate all metadata into a single `.zmetadata` file, stored in the root group.
 This reduces the number of read operations when retrieving certain metadata in read mode.
+
+.. note::
+
+   When updating a file, the consolidated metadata will also need to be updated via
+   `zarr.consolidate_metadata(path)` to ensure that it remains consistent with the
+   file contents.
+
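To make the note added in the patch above concrete, here is a minimal sketch of the update-then-reconsolidate workflow it describes, assuming an existing Zarr store; the path and the attribute being changed are made-up examples, not part of these patches:

    import zarr

    path = "example.zarr"  # hypothetical existing store
    root = zarr.open(store=path, mode="r+")
    root.attrs["description"] = "updated"  # any metadata change stales .zmetadata
    zarr.consolidate_metadata(path)        # rewrite the consolidated .zmetadata

    # Subsequent consolidated reads now see the updated metadata:
    root = zarr.open_consolidated(store=path, mode="r")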
), "default": True, } From 1175383f184981a2be9d91bc2117957a3d6abba1 Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Mon, 4 Dec 2023 16:52:57 -0800 Subject: [PATCH 24/25] Update src/hdmf_zarr/backend.py --- src/hdmf_zarr/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py index 2e73d9b4..c7561ada 100644 --- a/src/hdmf_zarr/backend.py +++ b/src/hdmf_zarr/backend.py @@ -250,7 +250,7 @@ def load_namespaces(cls, namespace_catalog, path, namespaces=None): "name": "consolidate_metadata", "type": bool, "doc": ( - "Boolean to consolidate metadata into a single .zmetadata file in the root group to accelerate read." + "Consolidate metadata into a single .zmetadata file in the root group to accelerate read." ), "default": True, } From f23e3f381eb8940d49fb0de389c8d34a031abd63 Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Mon, 4 Dec 2023 16:53:42 -0800 Subject: [PATCH 25/25] Update src/hdmf_zarr/backend.py --- src/hdmf_zarr/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py index c7561ada..3a16512a 100644 --- a/src/hdmf_zarr/backend.py +++ b/src/hdmf_zarr/backend.py @@ -418,7 +418,7 @@ def get_builder_disk_path(self, **kwargs): "name": "consolidate_metadata", "type": bool, "doc": ( - "Boolean to consolidate metadata into a single a .zmetadata file within root group." + "Consolidate metadata into a single .zmetadata file in the root group to accelerate read." ), "default": True, }
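For completeness, a usage sketch of the `consolidate_metadata` flag that these last three patches document, mirroring the `create_zarr` test helper from patch 21; the `builder` variable is assumed to be a GroupBuilder such as `createReferenceBuilder()` returns, and the store path is the one used by the tests, so this is illustrative rather than part of the patch series:

    from hdmf_zarr import ZarrIO

    writer = ZarrIO("tests/unit/test_io.zarr", mode="a")
    # consolidate_metadata defaults to True, writing .zmetadata in the root
    # group on write; pass False to skip consolidation, e.g., when further
    # writes will follow and consolidate_metadata() will be called manually.
    writer.write_builder(builder, consolidate_metadata=False)
    writer.close()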