From 9c3b4bee013c825e2cdaccbe443009f88302803e Mon Sep 17 00:00:00 2001 From: Matthew Avaylon Date: Mon, 4 Dec 2023 18:19:55 -0800 Subject: [PATCH] Consolidate Metadata (#142) * Consolidate Metadata * checkpoint * checkpoint * abstract * checkpoint * open update * flake * static and some test * unfinished tests * working basic tests * assert * cleaned * Update backend.py * Update backend.py * Update CHANGELOG.md * Update storage.rst * bool and updated tests * storage rst * tutorial * remove * review * Update docs/source/storage.rst * Update src/hdmf_zarr/backend.py * Update src/hdmf_zarr/backend.py * Update src/hdmf_zarr/backend.py --------- Co-authored-by: Oliver Ruebel --- CHANGELOG.md | 1 + docs/gallery/plot_nwb_zarrio.py | 13 +++++ docs/source/storage.rst | 14 +++++ src/hdmf_zarr/backend.py | 97 ++++++++++++++++++++++++++++----- tests/unit/base_tests_zarrio.py | 36 ++++++++++++ tests/unit/test_zarrio.py | 30 ++++++++++ 6 files changed, 178 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 70212772..86270582 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## 0.5.0 (Upcoming) ### Enhancements +* Added a new default to consolidate metadata in order to more efficiently traverse storage contents. @mavaylon1 [#142](https://github.com/hdmf-dev/hdmf-zarr/pull/142) * Fix linking for FSSpec and support passing of `storage_options` required reading data from S3 #138. @alejoe91 [#120](https://github.com/hdmf-dev/hdmf-zarr/pull/138) ## 0.4.0 (October 3, 2023) diff --git a/docs/gallery/plot_nwb_zarrio.py b/docs/gallery/plot_nwb_zarrio.py index 95eed3db..eacceddc 100644 --- a/docs/gallery/plot_nwb_zarrio.py +++ b/docs/gallery/plot_nwb_zarrio.py @@ -26,6 +26,7 @@ from datetime import datetime from dateutil.tz import tzlocal +import zarr import numpy as np from pynwb import NWBFile @@ -142,3 +143,15 @@ # relative ``path`` here instead is fine. 
with NWBZarrIO(path=absolute_path, mode="r") as io: infile = io.read() + +############################################################################### +# Consolidating Metadata +# ---------------------- +# When writing to Zarr, the metadata within the file will be consolidated into a single +# file within the root group, `.zmetadata`. Users who do not wish to consolidate the +# metadata can set the boolean parameter `consolidate_metadata` to `False` within `write`. +# Even when the metadata is consolidated, the metadata natively within the file can be altered. +# Any alterations within would require the user to call `zarr.convenience.consolidate_metadata()` +# to sync the file with the changes. Please refer to the Zarr documentation for more details: +# https://zarr.readthedocs.io/en/stable/tutorial.html#storage-alternatives +zarr.consolidate_metadata(path) diff --git a/docs/source/storage.rst b/docs/source/storage.rst index 45500c67..f5dbfd59 100644 --- a/docs/source/storage.rst +++ b/docs/source/storage.rst @@ -394,3 +394,17 @@ data type. The specification of the namespace is stored in ``/specifications///``. Here ```` refers to the main name of the source-file without file extension (e.g., the core namespace defines ``nwb.ephys.yaml`` as source which would be stored in ``/specifications/core/2.0.1/nwb.ecephys``). + +Consolidating Metadata +====================== + +Zarr allows users to consolidate all metadata for groups and arrays within the given store. By default, every file +will consolidate all metadata within into a single `.zmetadata` file, stored in the root group. This reduces the number of read +operations when retrieving certain metadata in read mode. + +.. note:: + + When updating a file, the consolidated metadata will also need to be updated via + `zarr.consolidate_metadata(path)` to ensure the consolidated metadata is consistent + with the file. 
+ diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py index 40b40bc2..3a16512a 100644 --- a/src/hdmf_zarr/backend.py +++ b/src/hdmf_zarr/backend.py @@ -157,10 +157,18 @@ def object_codec_class(self): def open(self): """Open the Zarr file""" if self.__file is None: - self.__file = zarr.open(store=self.path, - mode=self.__mode, - synchronizer=self.__synchronizer, - storage_options=self.__storage_options) + # Within zarr, open_consolidated only allows the mode to be 'r' or 'r+'. + # As a result, when in other modes, the file will not use consolidated metadata. + if self.__mode not in ['r', 'r+']: + self.__file = zarr.open(store=self.path, + mode=self.__mode, + synchronizer=self.__synchronizer, + storage_options=self.__storage_options) + else: + self.__file = self.__open_file_consolidated(store=self.path, + mode=self.__mode, + synchronizer=self.__synchronizer, + storage_options=self.__storage_options) def close(self): """Close the Zarr file""" @@ -238,6 +246,14 @@ def load_namespaces(cls, namespace_catalog, path, namespaces=None): ), "default": None, }, + { + "name": "consolidate_metadata", + "type": bool, + "doc": ( + "Consolidate metadata into a single .zmetadata file in the root group to accelerate read." + ), + "default": True, + } ) def write(self, **kwargs): """Overwrite the write method to add support for caching the specification and parallelization.""" @@ -398,11 +414,19 @@ def get_builder_disk_path(self, **kwargs): 'doc': 'The source of the builders when exporting', 'default': None, }, + { + "name": "consolidate_metadata", + "type": bool, + "doc": ( + "Consolidate metadata into a single .zmetadata file in the root group to accelerate read." 
+ ), + "default": True, + } ) def write_builder(self, **kwargs): """Write a builder to disk.""" - f_builder, link_data, exhaust_dci, export_source = getargs( - 'builder', 'link_data', 'exhaust_dci', 'export_source', kwargs + f_builder, link_data, exhaust_dci, export_source, consolidate_metadata = getargs( + 'builder', 'link_data', 'exhaust_dci', 'export_source', 'consolidate_metadata', kwargs ) for name, gbldr in f_builder.groups.items(): self.write_group( @@ -426,6 +450,50 @@ def write_builder(self, **kwargs): self.logger.debug("Done writing %s '%s' to path '%s'" % (f_builder.__class__.__qualname__, f_builder.name, self.source)) + # Consolidate metadata for the entire file after everything has been written + if consolidate_metadata: + zarr.consolidate_metadata(store=self.path) + + @staticmethod + def __get_store_path(store): + """ + Method to retrieve the path from the Zarr storage. + ConsolidatedMetadataStore wraps around other Zarr Store objects, requiring a check to + retrieve the path. + """ + if isinstance(store, zarr.storage.ConsolidatedMetadataStore): + fpath = store.store.path + else: + fpath = store.path + + return fpath + + def __open_file_consolidated(self, + store, + mode, + synchronizer=None, + storage_options=None): + """ + This method will check to see if the metadata has been consolidated. + If so, use open_consolidated. + """ + # self.path can be either a string or one of the `SUPPORTED_ZARR_STORES`. 
+ if isinstance(self.path, str): + path = self.path + else: + path = self.path.path + + if os.path.isfile(path+'/.zmetadata'): + return zarr.open_consolidated(store=store, + mode=mode, + synchronizer=synchronizer, + storage_options=storage_options) + else: + return zarr.open(store=self.path, + mode=self.__mode, + synchronizer=self.__synchronizer, + storage_options=self.__storage_options) + @docval({'name': 'parent', 'type': Group, 'doc': 'the parent Zarr object'}, {'name': 'builder', 'type': GroupBuilder, 'doc': 'the GroupBuilder to write'}, {'name': 'link_data', 'type': bool, @@ -575,7 +643,8 @@ def get_zarr_paths(zarr_object): # In Zarr the path is a combination of the path of the store and the path of the object. So we first need to # merge those two paths, then remove the path of the file, add the missing leading "/" and then compute the # directory name to get the path of the parent - fullpath = os.path.normpath(os.path.join(zarr_object.store.path, zarr_object.path)).replace("\\", "/") + fpath = ZarrIO._ZarrIO__get_store_path(zarr_object.store) + fullpath = os.path.normpath(os.path.join(fpath, zarr_object.path)).replace("\\", "/") # To determine the filepath we now iterate over the path and check if the .zgroup object exists at # a level, indicating that we are still within the Zarr file. 
The first level we hit where the parent # directory does not have a .zgroup means we have found the main file @@ -653,7 +722,7 @@ def resolve_ref(self, zarr_ref): else: target_name = ROOT_NAME - target_zarr_obj = zarr.open(source_file, mode='r', storage_options=self.__storage_options) + target_zarr_obj = self.__open_file_consolidated(source_file, mode='r', storage_options=self.__storage_options) if object_path is not None: try: target_zarr_obj = target_zarr_obj[object_path] @@ -886,7 +955,8 @@ def write_dataset(self, **kwargs): # noqa: C901 if isinstance(data, Array): # copy the dataset if link_data: - self.__add_link__(parent, data.store.path, data.name, name) + path = self.__get_store_path(data.store) + self.__add_link__(parent, path, data.name, name) linked = True dset = None else: @@ -1202,7 +1272,7 @@ def read_builder(self): return f_builder def __set_built(self, zarr_obj, builder): - fpath = zarr_obj.store.path + fpath = self.__get_store_path(zarr_obj.store) path = zarr_obj.path path = os.path.join(fpath, path) self.__built.setdefault(path, builder) @@ -1242,7 +1312,8 @@ def __get_built(self, zarr_obj): :type zarr_obj: Zarr Group or Dataset :return: Builder in the self.__built cache or None """ - fpath = zarr_obj.store.path + + fpath = self.__get_store_path(zarr_obj.store) path = zarr_obj.path path = os.path.join(fpath, path) return self.__built.get(path, None) @@ -1258,7 +1329,7 @@ def __read_group(self, zarr_obj, name=None): # Create the GroupBuilder attributes = self.__read_attrs(zarr_obj) ret = GroupBuilder(name=name, source=self.source, attributes=attributes) - ret.location = self.get_zarr_parent_path(zarr_obj) + ret.location = ZarrIO.get_zarr_parent_path(zarr_obj) # read sub groups for sub_name, sub_group in zarr_obj.groups(): @@ -1353,7 +1424,7 @@ def __read_dataset(self, zarr_obj, name): if name is None: name = str(os.path.basename(zarr_obj.name)) ret = DatasetBuilder(name, **kwargs) # create builder object for dataset - ret.location = 
self.get_zarr_parent_path(zarr_obj) + ret.location = ZarrIO.get_zarr_parent_path(zarr_obj) self._written_builders.set_written(ret) # record that the builder has been written self.__set_built(zarr_obj, ret) return ret diff --git a/tests/unit/base_tests_zarrio.py b/tests/unit/base_tests_zarrio.py index 440850af..4895d495 100644 --- a/tests/unit/base_tests_zarrio.py +++ b/tests/unit/base_tests_zarrio.py @@ -87,6 +87,42 @@ def tearDown(self): warnings.warn("Could not remove: %s" % path) +class ZarrStoreTestCase(TestCase): + """ + Class that creates a zarr file containing groups, datasets, and references for + general purpose testing. + """ + def setUp(self): + self.store = "tests/unit/test_io.zarr" + + def tearDown(self): + shutil.rmtree(self.store) + + def createReferenceBuilder(self): + data_1 = np.arange(100, 200, 10).reshape(2, 5) + data_2 = np.arange(0, 200, 10).reshape(4, 5) + dataset_1 = DatasetBuilder('dataset_1', data_1) + dataset_2 = DatasetBuilder('dataset_2', data_2) + + ref_dataset_1 = ReferenceBuilder(dataset_1) + ref_dataset_2 = ReferenceBuilder(dataset_2) + ref_data = [ref_dataset_1, ref_dataset_2] + dataset_ref = DatasetBuilder('ref_dataset', ref_data, dtype='object') + + builder = GroupBuilder('root', + source=self.store, + datasets={'dataset_1': dataset_1, + 'dataset_2': dataset_2, + 'ref_dataset': dataset_ref}) + return builder + + def create_zarr(self, consolidate_metadata=True): + builder = self.createReferenceBuilder() + writer = ZarrIO(self.store, mode='a') + writer.write_builder(builder, consolidate_metadata) + writer.close() + + class BaseTestZarrWriter(BaseZarrWriterTestCase): """ Test writing of builder with ZarrIO diff --git a/tests/unit/test_zarrio.py b/tests/unit/test_zarrio.py index e1526282..2af9bcef 100644 --- a/tests/unit/test_zarrio.py +++ b/tests/unit/test_zarrio.py @@ -10,11 +10,18 @@ need to implement the tests separately for the different backends. 
""" from tests.unit.base_tests_zarrio import (BaseTestZarrWriter, + ZarrStoreTestCase, BaseTestZarrWriteUnit, BaseTestExportZarrToZarr) from zarr.storage import (DirectoryStore, TempStore, NestedDirectoryStore) +import zarr +from hdmf_zarr.backend import ZarrIO +import os + + +CUR_DIR = os.path.dirname(os.path.realpath(__file__)) ###################################################### @@ -122,3 +129,26 @@ class TestExportZarrToZarrNestedDirectoryStore(BaseTestExportZarrToZarr): def setUp(self): super().setUp() self.store = [NestedDirectoryStore(p) for p in self.store_path] + + +######################################### +# Consolidate Metadata tests +######################################### +class TestConsolidateMetadata(ZarrStoreTestCase): + """ + Tests for consolidated metadata and corresponding helper methods. + """ + def test_get_store_path_shallow(self): + self.create_zarr(consolidate_metadata=False) + store = DirectoryStore(self.store) + path = ZarrIO._ZarrIO__get_store_path(store) + expected_path = os.path.normpath(os.path.join(CUR_DIR, 'test_io.zarr')) + self.assertEqual(path, expected_path) + + def test_get_store_path_deep(self): + self.create_zarr() + zarr_obj = zarr.open_consolidated(self.store, mode='r') + store = zarr_obj.store + path = ZarrIO._ZarrIO__get_store_path(store) + expected_path = os.path.normpath(os.path.join(CUR_DIR, 'test_io.zarr')) + self.assertEqual(path, expected_path)