From bd36a4abef43e5ba17768fad237078c31261fc5c Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Sun, 31 Dec 2023 14:13:18 -0800
Subject: [PATCH 01/17] Fix #152 Preserve h5py.Dataset filter settings on export

---
 src/hdmf_zarr/backend.py |  5 +++
 src/hdmf_zarr/utils.py   | 74 ++++++++++++++++++++++++++++++++++++++--
 2 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py
index ccc955a8..691f144b 100644
--- a/src/hdmf_zarr/backend.py
+++ b/src/hdmf_zarr/backend.py
@@ -938,6 +938,11 @@ def write_dataset(self, **kwargs):  # noqa: C901
         name = builder.name
         data = builder.data if force_data is None else force_data
         options = dict()
+        # Check if data is a h5py.Dataset to infer I/O settings if necessary
+        if ZarrDataIO.is_h5py_dataset(data):
+            # Wrap the h5py.Dataset in ZarrDataIO with chunking and compression settings inferred from the input data
+            data = ZarrDataIO.from_h5py_dataset(h5dataset=data)
+        # Separate data values and io_settings for write
         if isinstance(data, ZarrDataIO):
             options['io_settings'] = data.io_settings
             link_data = data.link_data
diff --git a/src/hdmf_zarr/utils.py b/src/hdmf_zarr/utils.py
index aa73d510..43271fe8 100644
--- a/src/hdmf_zarr/utils.py
+++ b/src/hdmf_zarr/utils.py
@@ -461,13 +461,83 @@ def __init__(self, **kwargs):
             self.__iosettings['filters'] = filters

     @property
-    def link_data(self):
+    def link_data(self) -> bool:
+        """Bool indicating should it be linked to or copied. NOTE: Only applies to zarr.Array type data"""
         return self.__link_data

     @property
-    def io_settings(self):
+    def io_settings(self) -> dict:
+        """"""
         return self.__iosettings

+    @staticmethod
+    def from_h5py_dataset(h5dataset, **kwargs):
+        """
+        Create a ZarrDataIO instance that wraps the given h5py dataset and infers
+        the filters that should be used in Zarr from the filters used in h5py.
+
+        :param dataset: h5py.Dataset object that should be wrapped
+        :type dataset: h5py.Dataset
+        :param **kwargs: Other keyword arguments to pass to ZarrDataIO.__init__
+
+        :returns: ZarrDataIO object wrapping the dataset
+        """
+        filters = ZarrDataIO.hdf5_to_zarr_filters(h5dataset)
+        fillvalue = h5dataset.fillvalue if 'fillvalue' not in kwargs else kwargs.pop('fillvalue')
+        chunks = h5dataset.chunks if 'chunks' not in kwargs else kwargs.pop('chunks')
+        re = ZarrDataIO(
+            data=h5dataset,
+            filters=filters,
+            fillvalue=fillvalue,
+            chunks=chunks,
+            **kwargs)
+
+    @staticmethod
+    def hdf5_to_zarr_filters(self, h5dataset) -> list:
+        """From the given h5py.Dataset infer the corresponding filters to use in Zarr"""
+        # Based on https://github.com/fsspec/kerchunk/blob/617d9ce06b9d02375ec0e5584541fcfa9e99014a/kerchunk/hdf.py#L181
+        filters = []
+        # Check for unsupported filters
+        if h5dataset.scaleoffset:
+            # TODO: translate to numcodecs.fixedscaleoffset.FixedScaleOffset()
+            warnings.warn( f" {h5dataset.name} HDF5 scaleoffset filter ignored in Zarr")
+        if h5obj.compression in ("szip", "lzf"):
+            warnings.warn(f"{h5dateset.name} HDF5 szip or lzf compression ignored in Zarr")
+        # Add the shuffle filter if possible
+        if h5obj.shuffle and h5obj.dtype.kind != "O":
+            # cannot use shuffle if we materialised objects
+            filters.append(numcodecs.Shuffle(elementsize=h5dataset.dtype.itemsize))
+        # iterate through all the filters and add them to the list
+        for filter_id, properties in h5dataset._filters.items():
+            filter_id_str = str(filter_id)
+            if filter_id_str == "32001":
+                blosc_compressors = ("blosclz", "lz4", "lz4hc", "snappy", "zlib", "zstd")
+                (_1, _2, bytes_per_num, total_bytes, clevel, shuffle, compressor) = properties
+                pars = dict(
+                    blocksize=total_bytes,
+                    clevel=clevel,
+                    shuffle=shuffle,
+                    cname=blosc_compressors[compressor])
+                filters.append(numcodecs.Blosc(**pars))
+            elif filter_id_str == "32015":
+                filters.append(numcodecs.Zstd(level=properties[0]))
+            elif filter_id_str == "gzip":
+                filters.append(numcodecs.Zlib(level=properties))
+            elif filter_id_str == "32004":
+                warnings.warn(f"{h5dataset.name} HDF5 lz4 compression ignored in Zarr"
+                              )
+            elif filter_id_str == "32008":
+                warnings.warn(f"{h5dataset.name} HDF5 bitshuffle compression ignored in Zarr")
+            elif filter_id_str == "shuffle":  # already handled above
+                pass
+            else:
+                warnings.warn(
+                    f"{h5dataset.name} HDF5 filter id {filter_id} with properties {properties} ignored in Zarr.")
+        return filters
+    @staticmethod
+    def is_h5py_dataset(obj):
+        """Check if the object is an instance of h5py.Dataset without requiring import of h5py"""
+        return (obj.__class__.__module__, obj.__class__.__name__) == ('h5py._hl.dataset', 'Dataset')

 class ZarrReference(dict):
     """

From b3232975f072acd8bf10a0f567931a46d98f6862 Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Sun, 31 Dec 2023 14:18:21 -0800
Subject: [PATCH 02/17] Remove extra arg

---
 src/hdmf_zarr/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/hdmf_zarr/utils.py b/src/hdmf_zarr/utils.py
index 43271fe8..1f1c3f24 100644
--- a/src/hdmf_zarr/utils.py
+++ b/src/hdmf_zarr/utils.py
@@ -493,7 +493,7 @@ def from_h5py_dataset(h5dataset, **kwargs):
             **kwargs)

     @staticmethod
-    def hdf5_to_zarr_filters(self, h5dataset) -> list:
+    def hdf5_to_zarr_filters(h5dataset) -> list:
         """From the given h5py.Dataset infer the corresponding filters to use in Zarr"""
         # Based on https://github.com/fsspec/kerchunk/blob/617d9ce06b9d02375ec0e5584541fcfa9e99014a/kerchunk/hdf.py#L181
         filters = []
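[Note] For reference, the branch mapping implemented in hdf5_to_zarr_filters above translates HDF5 filter settings into numcodecs codecs roughly as sketched below. The codec classes and keyword arguments are real numcodecs APIs; the concrete parameter values are illustrative only:

    import numcodecs

    numcodecs.Shuffle(elementsize=4)            # HDF5 shuffle on a 4-byte dtype
    numcodecs.Zlib(level=2)                     # HDF5 compression='gzip', compression_opts=2
    numcodecs.Zstd(level=5)                     # HDF5 zstd filter (filter id 32015)
    numcodecs.Blosc(cname='zstd', clevel=5,
                    shuffle=1, blocksize=0)     # HDF5 blosc filter (filter id 32001)

Filters with no numcodecs counterpart in this mapping (scaleoffset, szip, lzf, lz4, bitshuffle) are dropped with a warning, as the code above shows.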
From 6deb0fb922956f2530db707f1e1e640afe1f149e Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Sun, 31 Dec 2023 14:24:25 -0800
Subject: [PATCH 03/17] Fix bad variable name

---
 src/hdmf_zarr/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/hdmf_zarr/utils.py b/src/hdmf_zarr/utils.py
index 1f1c3f24..adee5bf0 100644
--- a/src/hdmf_zarr/utils.py
+++ b/src/hdmf_zarr/utils.py
@@ -501,10 +501,10 @@ def hdf5_to_zarr_filters(h5dataset) -> list:
         if h5dataset.scaleoffset:
             # TODO: translate to numcodecs.fixedscaleoffset.FixedScaleOffset()
             warnings.warn( f" {h5dataset.name} HDF5 scaleoffset filter ignored in Zarr")
-        if h5obj.compression in ("szip", "lzf"):
+        if h5dataset.compression in ("szip", "lzf"):
             warnings.warn(f"{h5dateset.name} HDF5 szip or lzf compression ignored in Zarr")
         # Add the shuffle filter if possible
-        if h5obj.shuffle and h5obj.dtype.kind != "O":
+        if h5dataset.shuffle and h5dataset.dtype.kind != "O":
             # cannot use shuffle if we materialised objects
             filters.append(numcodecs.Shuffle(elementsize=h5dataset.dtype.itemsize))

From f208dde19372ff37e96f331bdf8c2791c181b5b2 Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Sun, 31 Dec 2023 15:56:25 -0800
Subject: [PATCH 04/17] Add missing return statement in ZarrDataIO.from_h5py_dataset

---
 src/hdmf_zarr/utils.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/hdmf_zarr/utils.py b/src/hdmf_zarr/utils.py
index adee5bf0..b4d46e1e 100644
--- a/src/hdmf_zarr/utils.py
+++ b/src/hdmf_zarr/utils.py
@@ -473,8 +473,10 @@ def io_settings(self) -> dict:
     @staticmethod
     def from_h5py_dataset(h5dataset, **kwargs):
         """
-        Create a ZarrDataIO instance that wraps the given h5py dataset and infers
-        the filters that should be used in Zarr from the filters used in h5py.
+        Factory method to create a ZarrDataIO instance from a h5py.Dataset.
+        The ZarrDataIO object wraps the h5py.Dataset and the io filter settings
+        are inferred from filters used in h5py such that the options in Zarr match
+        (if possible) the options used in HDF5.

         :param dataset: h5py.Dataset object that should be wrapped
         :type dataset: h5py.Dataset
@@ -491,6 +493,7 @@ def from_h5py_dataset(h5dataset, **kwargs):
             fillvalue=fillvalue,
             chunks=chunks,
             **kwargs)
+        return re

     @staticmethod
     def hdf5_to_zarr_filters(h5dataset) -> list:

From 90d2af75f099e3976f956fa76d44eee149fe4de2 Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Sun, 31 Dec 2023 15:59:23 -0800
Subject: [PATCH 05/17] Fix ruff

---
 src/hdmf_zarr/utils.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/hdmf_zarr/utils.py b/src/hdmf_zarr/utils.py
index b4d46e1e..e52106d6 100644
--- a/src/hdmf_zarr/utils.py
+++ b/src/hdmf_zarr/utils.py
@@ -503,9 +503,9 @@ def hdf5_to_zarr_filters(h5dataset) -> list:
         # Check for unsupported filters
         if h5dataset.scaleoffset:
             # TODO: translate to numcodecs.fixedscaleoffset.FixedScaleOffset()
-            warnings.warn( f" {h5dataset.name} HDF5 scaleoffset filter ignored in Zarr")
+            warn( f" {h5dataset.name} HDF5 scaleoffset filter ignored in Zarr")
         if h5dataset.compression in ("szip", "lzf"):
-            warnings.warn(f"{h5dateset.name} HDF5 szip or lzf compression ignored in Zarr")
+            warn(f"{h5dataset.name} HDF5 szip or lzf compression ignored in Zarr")
         # Add the shuffle filter if possible
         if h5dataset.shuffle and h5dataset.dtype.kind != "O":
             # cannot use shuffle if we materialised objects
@@ -527,15 +527,14 @@ def hdf5_to_zarr_filters(h5dataset) -> list:
             elif filter_id_str == "gzip":
                 filters.append(numcodecs.Zlib(level=properties))
             elif filter_id_str == "32004":
-                warnings.warn(f"{h5dataset.name} HDF5 lz4 compression ignored in Zarr"
+                warn(f"{h5dataset.name} HDF5 lz4 compression ignored in Zarr"
                               )
             elif filter_id_str == "32008":
-                warnings.warn(f"{h5dataset.name} HDF5 bitshuffle compression ignored in Zarr")
+                warn(f"{h5dataset.name} HDF5 bitshuffle compression ignored in Zarr")
             elif filter_id_str == "shuffle":  # already handled above
                 pass
             else:
-                warnings.warn(
-                    f"{h5dataset.name} HDF5 filter id {filter_id} with properties {properties} ignored in Zarr.")
+                warn(f"{h5dataset.name} HDF5 filter id {filter_id} with properties {properties} ignored in Zarr.")
         return filters
     @staticmethod
     def is_h5py_dataset(obj):
From 12082d39c06af4f0729c50a4dcb7bef99c003368 Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Sun, 31 Dec 2023 20:19:37 -0800
Subject: [PATCH 06/17] Add unit test for hdf5 to zarr export with filters

---
 src/hdmf_zarr/backend.py      |  5 ++-
 tests/unit/test_io_convert.py | 85 ++++++++++++++++++++++++++++++++++-
 2 files changed, 87 insertions(+), 3 deletions(-)

diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py
index 691f144b..39b1dc9e 100644
--- a/src/hdmf_zarr/backend.py
+++ b/src/hdmf_zarr/backend.py
@@ -344,8 +344,9 @@ def export(self, **kwargs):
         )

         if not isinstance(src_io, ZarrIO) and write_args.get('link_data', True):
-            raise UnsupportedOperation("Cannot export from non-Zarr backend %s to Zarr with write argument "
-                                       "link_data=True." % src_io.__class__.__name__)
+            raise UnsupportedOperation(f"Cannot export from non-Zarr backend { src_io.__class__.__name__} " +
+                                       "to Zarr with write argument link_data=True. " +
+                                       "Set write_args={'link_data': False}")

         write_args['export_source'] = src_io.source  # pass export_source=src_io.source to write_builder
         ckwargs = kwargs.copy()
diff --git a/tests/unit/test_io_convert.py b/tests/unit/test_io_convert.py
index eddd0400..db1a446b 100644
--- a/tests/unit/test_io_convert.py
+++ b/tests/unit/test_io_convert.py
@@ -35,13 +35,14 @@
 import os
 import shutil
 import numpy as np
+import numcodecs

 from abc import ABCMeta, abstractmethod

 from hdmf_zarr.backend import (ZarrIO, ROOT_NAME)
 from hdmf_zarr.zarr_utils import ContainerZarrReferenceDataset
-from hdmf.backends.hdf5.h5_utils import ContainerH5ReferenceDataset
+from hdmf.backends.hdf5.h5_utils import ContainerH5ReferenceDataset, H5DataIO
 from hdmf.backends.hdf5 import HDF5IO
 from hdmf.common import get_manager as get_hdmfcommon_manager
 from hdmf.testing import TestCase
@@ -822,6 +823,88 @@ def test_export_cpd_dset_refs(self):
                 self.assertIs(read_bucket2.baz_cpd_data.data[i][1], read_bucket2.bazs[baz_name])


+class TestHDF5toZarrWithFilters(TestCase):
+    """
+    Test conversion from HDF5 to Zarr while preserving HDF5 filter settings
+    """
+    def setUp(self):
+        self.hdf_filename = get_temp_filepath()
+        self.zarr_filename = get_temp_filepath()
+        self.out_container = None
+        self.read_container = None
+
+    def tearDown(self):
+        # close the ZarrIO used for reading
+        del self.out_container
+        del self.read_container
+        # clean up any opened files
+        for fn in [self.hdf_filename, self.zarr_filename]:
+            if fn is not None and os.path.exists(fn):
+                if os.path.isdir(fn):
+                    shutil.rmtree(fn)
+                else:
+                    os.remove(fn)
+        self.filenames = []
+
+    def __roundtrip_data(self, data):
+        """Sets the variables self.out_container, self.read_container"""
+        # Create example foofile with the provided data (which may be wrapped in H5DataIO)
+        foo1 = Foo('foo1', data, "I am foo1", 17, 3.14)
+        foobucket = FooBucket('bucket1', [foo1,])
+        foofile = FooFile(buckets=[foobucket])
+        self.out_container = foofile
+
+        # write example HDF5 file with no filter settings
+        with HDF5IO(self.hdf_filename, manager=get_foo_buildmanager(), mode='w') as write_io:
+            write_io.write(foofile, cache_spec=True)
+        # Export the HDF5 file to Zarr
+        with HDF5IO(self.hdf_filename, manager=get_foo_buildmanager(), mode='r') as hdf_read_io:
+            with ZarrIO(self.zarr_filename, mode='w') as export_io:
+                export_io.export(src_io=hdf_read_io, write_args={'link_data': False})
+        # read and compare the containers
+        with ZarrIO(self.zarr_filename, mode='r', manager=get_foo_buildmanager()) as zarr_read_io:
+            self.read_container = zarr_read_io.read()
+
+    def __get_data_array(self, foo_container):
+        """For a container created by __roundtrip_data return the data array"""
+        return foo_container.buckets['bucket1'].foos['foo1'].my_data
+
+    def test_nofilters(self):
+        """basic test that export without any options specified is working as expected"""
+        data = list(range(5))
+        self.__roundtrip_data(data=data)
+        self.assertContainerEqual(self.out_container, self.read_container, ignore_hdmf_attrs=True)
+
+    def test_chunking(self):
+        """Test that chunking is being preserved"""
+        outdata = H5DataIO(data=list(range(100)), chunks=(10,))
+        self.__roundtrip_data(data=outdata)
+        self.assertContainerEqual(self.out_container, self.read_container, ignore_hdmf_attrs=True)
+        read_array = self.__get_data_array(self.read_container)
+        self.assertTupleEqual((10,), read_array.chunks)
+
+    def test_shuffle(self):
+        """Test that shuffle filter is being preserved"""
+        outdata = H5DataIO(data=list(range(100)), chunks=(10,), shuffle=True)
+        self.__roundtrip_data(data=outdata)
+        self.assertContainerEqual(self.out_container, self.read_container, ignore_hdmf_attrs=True)
+        read_array = self.__get_data_array(self.read_container)
+        self.assertEqual(len(read_array.filters), 1)
+        self.assertIsInstance(read_array.filters[0], numcodecs.Shuffle)
+        self.assertTupleEqual((10,), read_array.chunks)
+
+    def test_gzip(self):
+        """Test that gzip filter is being preserved"""
+        outdata = H5DataIO(data=list(range(100)), chunks=(10,), compression='gzip', compression_opts=2 )
+        self.__roundtrip_data(data=outdata)
+        self.assertContainerEqual(self.out_container, self.read_container, ignore_hdmf_attrs=True)
+        read_array = self.__get_data_array(self.read_container)
+        self.assertEqual(len(read_array.filters), 1)
+        self.assertIsInstance(read_array.filters[0], numcodecs.Zlib)
+        self.assertEqual(read_array.filters[0].level, 2)
+        self.assertTupleEqual((10,), read_array.chunks)
+
+
 # TODO: Fails because we need to copy the data from the ExternalLink as it points to a non-Zarr source
 """
 class TestFooExternalLinkHDF5ToZarr(MixinTestCaseConvert, TestCase):

From e3ac387f91c9434e311d50792ed308e42e2aac9e Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Sun, 31 Dec 2023 20:47:07 -0800
Subject: [PATCH 07/17] Update warning check in failing unit test

---
 tests/unit/base_tests_zarrio.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/unit/base_tests_zarrio.py b/tests/unit/base_tests_zarrio.py
index 98123ff1..693f9ff5 100644
--- a/tests/unit/base_tests_zarrio.py
+++ b/tests/unit/base_tests_zarrio.py
@@ -1579,7 +1579,8 @@ def close(self):

         with OtherIO(manager=get_foo_buildmanager()) as read_io:
             with ZarrIO(self.store[1], mode='w') as export_io:
-                msg = "Cannot export from non-Zarr backend OtherIO to Zarr with write argument link_data=True."
+                msg = ("Cannot export from non-Zarr backend OtherIO to Zarr with write argument link_data=True. "
+                       "Set write_args={'link_data': False}")
                 with self.assertRaisesWith(UnsupportedOperation, msg):
                     export_io.export(src_io=read_io, container=foofile)

From 971dfce6d6dee837d1b5139d8efcb56f280c272b Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Sun, 31 Dec 2023 23:28:44 -0800
Subject: [PATCH 08/17] Fix bytes fill value

---
 src/hdmf_zarr/utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/hdmf_zarr/utils.py b/src/hdmf_zarr/utils.py
index e52106d6..c0870c07 100644
--- a/src/hdmf_zarr/utils.py
+++ b/src/hdmf_zarr/utils.py
@@ -486,6 +486,8 @@ def from_h5py_dataset(h5dataset, **kwargs):
         """
         filters = ZarrDataIO.hdf5_to_zarr_filters(h5dataset)
         fillvalue = h5dataset.fillvalue if 'fillvalue' not in kwargs else kwargs.pop('fillvalue')
+        if isinstance(fillvalue, bytes):  # bytes are not JSON serializable so use string instead
+            fillvalue = str(fillvalue)
         chunks = h5dataset.chunks if 'chunks' not in kwargs else kwargs.pop('chunks')
         re = ZarrDataIO(
             data=h5dataset,

From 111c78c790b596304f73a969389222a19a072108 Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Sun, 31 Dec 2023 23:34:46 -0800
Subject: [PATCH 09/17] Fix sphinx docs warning

---
 src/hdmf_zarr/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/hdmf_zarr/utils.py b/src/hdmf_zarr/utils.py
index c0870c07..cf735b60 100644
--- a/src/hdmf_zarr/utils.py
+++ b/src/hdmf_zarr/utils.py
@@ -480,7 +480,7 @@ def from_h5py_dataset(h5dataset, **kwargs):

         :param dataset: h5py.Dataset object that should be wrapped
         :type dataset: h5py.Dataset
-        :param **kwargs: Other keyword arguments to pass to ZarrDataIO.__init__
+        :param kwargs: Other keyword arguments to pass to ZarrDataIO.__init__

         :returns: ZarrDataIO object wrapping the dataset
         """

From 879055b58f66e48467aa28cf61ecff1a1c94bb5b Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Sun, 31 Dec 2023 23:38:24 -0800
Subject: [PATCH 10/17] Fix docstring

---
 src/hdmf_zarr/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/hdmf_zarr/utils.py b/src/hdmf_zarr/utils.py
index cf735b60..2143e754 100644
--- a/src/hdmf_zarr/utils.py
+++ b/src/hdmf_zarr/utils.py
@@ -467,7 +467,7 @@ def link_data(self) -> bool:

     @property
     def io_settings(self) -> dict:
-        """"""
+        """Dict with the io settings to use"""
         return self.__iosettings

     @staticmethod

From 5d74be75217b7917af39b6a33726fe3f1c56383f Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Sun, 31 Dec 2023 23:42:56 -0800
Subject: [PATCH 11/17] Update Changelog

---
 CHANGELOG.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ddd55ba8..358ad12e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # HDMF-ZARR Changelog

+## 0.6.0 (Upcoming)
+
+### Enhancements
+* Enhanced `ZarrIO` and `ZarrDataIO` to infer io settings (e.g., chunking and compression) from HDF5 datasets to preserve storage settings on export if possible @oruebel [#153](https://github.com/hdmf-dev/hdmf-zarr/pull/153)
+
 ## 0.5.0 (December 8, 2023)

 ### Enhancements
From ea2c9917a668cb83636a8a79f4c7e56438be298f Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Sun, 31 Dec 2023 23:50:14 -0800
Subject: [PATCH 12/17] Fix ruff

---
 src/hdmf_zarr/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/hdmf_zarr/utils.py b/src/hdmf_zarr/utils.py
index 2143e754..328f4caa 100644
--- a/src/hdmf_zarr/utils.py
+++ b/src/hdmf_zarr/utils.py
@@ -538,6 +538,7 @@ def hdf5_to_zarr_filters(h5dataset) -> list:
             else:
                 warn(f"{h5dataset.name} HDF5 filter id {filter_id} with properties {properties} ignored in Zarr.")
         return filters
+
     @staticmethod
     def is_h5py_dataset(obj):
         """Check if the object is an instance of h5py.Dataset without requiring import of h5py"""

From 2fddbb9ee6db8add9a9d94242b5e4b149e91d514 Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Thu, 11 Jan 2024 00:42:26 -0800
Subject: [PATCH 13/17] Add hdf5plugin as a dependency for the tests

---
 requirements-dev.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements-dev.txt b/requirements-dev.txt
index cb72d345..2f81d255 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -8,3 +8,4 @@ pytest==7.1.2
 pytest-cov==3.0.0
 python-dateutil==2.8.2
 tox==3.25.1
+hdf5plugin==4.3.0  # hdf5plugin is used to test conversion of plugin filters

From 1ebb04a8b681402fc127b8e1e69b1698159b31de Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Thu, 11 Jan 2024 00:43:41 -0800
Subject: [PATCH 14/17] Add unit tests for mapping HDF5 IO filters to Zarr filters via ZarrDataIO

---
 src/hdmf_zarr/utils.py        |   5 +-
 tests/unit/test_io_convert.py |   2 +-
 tests/unit/test_zarrdataio.py | 226 ++++++++++++++++++++++++++++++++++
 3 files changed, 229 insertions(+), 4 deletions(-)
 create mode 100644 tests/unit/test_zarrdataio.py

diff --git a/src/hdmf_zarr/utils.py b/src/hdmf_zarr/utils.py
index 328f4caa..bde22296 100644
--- a/src/hdmf_zarr/utils.py
+++ b/src/hdmf_zarr/utils.py
@@ -505,7 +505,7 @@ def hdf5_to_zarr_filters(h5dataset) -> list:
         # Check for unsupported filters
         if h5dataset.scaleoffset:
             # TODO: translate to numcodecs.fixedscaleoffset.FixedScaleOffset()
-            warn( f" {h5dataset.name} HDF5 scaleoffset filter ignored in Zarr")
+            warn( f"{h5dataset.name} HDF5 scaleoffset filter ignored in Zarr")
         if h5dataset.compression in ("szip", "lzf"):
             warn(f"{h5dataset.name} HDF5 szip or lzf compression ignored in Zarr")
         # Add the shuffle filter if possible
@@ -529,8 +529,7 @@ def hdf5_to_zarr_filters(h5dataset) -> list:
             elif filter_id_str == "gzip":
                 filters.append(numcodecs.Zlib(level=properties))
             elif filter_id_str == "32004":
-                warn(f"{h5dataset.name} HDF5 lz4 compression ignored in Zarr"
-                              )
+                warn(f"{h5dataset.name} HDF5 lz4 compression ignored in Zarr")
             elif filter_id_str == "32008":
                 warn(f"{h5dataset.name} HDF5 bitshuffle compression ignored in Zarr")
             elif filter_id_str == "shuffle":  # already handled above
diff --git a/tests/unit/test_io_convert.py b/tests/unit/test_io_convert.py
index db1a446b..b7f119a2 100644
--- a/tests/unit/test_io_convert.py
+++ b/tests/unit/test_io_convert.py
@@ -844,7 +844,6 @@ def tearDown(self):
                 shutil.rmtree(fn)
             else:
                 os.remove(fn)
-        self.filenames = []

     def __roundtrip_data(self, data):
         """Sets the variables self.out_container, self.read_container"""
@@ -905,6 +904,7 @@ def test_gzip(self):
         self.assertTupleEqual((10,), read_array.chunks)


+
 # TODO: Fails because we need to copy the data from the ExternalLink as it points to a non-Zarr source
 """
 class TestFooExternalLinkHDF5ToZarr(MixinTestCaseConvert, TestCase):
diff --git a/tests/unit/test_zarrdataio.py b/tests/unit/test_zarrdataio.py
new file mode 100644
index 00000000..8e732201
--- /dev/null
+++ b/tests/unit/test_zarrdataio.py
@@ -0,0 +1,226 @@
+"""
+Module for testing the ZarrDataIO class.
+
+Many of the functions of ZarrDataIO are covered in tests related to ZarrIO and data conversion,
+such as test_io_convert.TestHDF5toZarrWithFilters. However, those tests are in the context of
+more complex operations and are more akin to integration tests. This module focuses on tests for
+specific unit functions of ZarrDataIO.
+"""
+import numcodecs
+import h5py
+import os
+import shutil
+import unittest
+
+import numpy as np
+
+try:
+    import hdf5plugin
+    HDF5PLUGIN = True
+except ImportError:
+    HDF5PLUGIN = False
+from hdmf.testing import TestCase
+from hdmf_zarr.utils import ZarrDataIO
+from tests.unit.utils import get_temp_filepath
+
+class TestZarrDataIO(TestCase):
+    """Test the ZarrDataIO class"""
+    def setUp(self):
+        self.hdf_filename = get_temp_filepath()
+        self.zarr_filename = get_temp_filepath()
+
+    def tearDown(self):
+        # clean up any opened files
+        for fn in [self.hdf_filename, self.zarr_filename]:
+            if fn is not None and os.path.exists(fn):
+                if os.path.isdir(fn):
+                    shutil.rmtree(fn)
+                else:
+                    os.remove(fn)
+
+    def test_hdf5_to_zarr_filters_scaleoffset(self):
+        """Test that we warn when the scaleoffset filter is being used in HDF5 in ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create a test HDF5 dataset with scaleoffset
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset = h5file.create_dataset(name='test_dset', data=[1,2,3,4,5], scaleoffset=10)
+        # test that we warn due to the scaleoffset
+        msg = "/test_dset HDF5 scaleoffset filter ignored in Zarr"
+        with self.assertWarnsWith(UserWarning, msg):
+            filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset)
+        self.assertEqual(len(filters), 0)
+        # Close the HDF5 file
+        h5file.close()
+
+    def test_hdf5_to_zarr_filters_lzf(self):
+        """Test that we warn when the lzf filter is being used in HDF5 in ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create a test HDF5 dataset with scaleoffset
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset = h5file.create_dataset(name='test_dset', data=[1, 2, 3, 4, 5], compression="lzf")
+        # test that we warn due to the scaleoffset
+        msg = "/test_dset HDF5 szip or lzf compression ignored in Zarr"
+        with self.assertWarnsWith(UserWarning, msg):
+            filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset)
+        self.assertEqual(len(filters), 0)
+        # Close the HDF5 file
+        h5file.close()
+
+    @unittest.skipIf(not HDF5PLUGIN, "hdf5_plugin not installed")
+    def test_hdf5_to_zarr_filters_lz4(self):
+        """Test that we warn when the lz4 filter is being used in HDF5 in ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create a test HDF5 dataset with scaleoffset
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset = h5file.create_dataset(name='test_dset',
+                                       data=[1, 2, 3, 4, 5],
+                                       **hdf5plugin.LZ4())
+        # test that we warn due to the scaleoffset
+        msg = "/test_dset HDF5 lz4 compression ignored in Zarr"
+        with self.assertWarnsWith(UserWarning, msg):
+            filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset)
+        self.assertEqual(len(filters), 0)
+        # Close the HDF5 file
+        h5file.close()
+
+    @unittest.skipIf(not HDF5PLUGIN, "hdf5_plugin not installed")
+    def test_hdf5_to_zarr_filters_bitshuffle(self):
+        """Test that we warn when the bitshuffle filter is being used in HDF5 in ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create a test HDF5 dataset with scaleoffset
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset = h5file.create_dataset(name='test_dset',
+                                       data=[1, 2, 3, 4, 5],
+                                       **hdf5plugin.Bitshuffle(nelems=0, lz4=True))
+        # test that we warn due to the scaleoffset
+        msg = "/test_dset HDF5 bitshuffle compression ignored in Zarr"
+        with self.assertWarnsWith(UserWarning, msg):
+            filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset)
+        self.assertEqual(len(filters), 0)
+        # Close the HDF5 file
+        h5file.close()
+
+    @unittest.skipIf(not HDF5PLUGIN, "hdf5_plugin not installed")
+    def test_hdf5_to_zarr_filters_other_unsupported(self):
+        """Test that we warn when other unsupported filter is used in HDF5 in ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create a test HDF5 dataset with scaleoffset
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset_FciDecomp = h5file.create_dataset(
+            name='test_fcidecomp',
+            data=[1, 2, 3, 4, 5],
+            **hdf5plugin.FciDecomp())
+        h5dset_SZ3 = h5file.create_dataset(
+            name='test_sz3',
+            data=[1, 2, 3, 4, 5],
+            **hdf5plugin.SZ3(absolute=0.1))
+        # test that we warn due to the FciDecomp
+        msg = "/test_fcidecomp HDF5 filter id 32018 with properties None ignored in Zarr."
+        with self.assertWarnsWith(UserWarning, msg):
+            filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset_FciDecomp)
+        self.assertEqual(len(filters), 0)
+        # test that we warn due to the SZ3
+        msg = ("/test_sz3 HDF5 filter id 32024 with properties "
+               "(1, 9, 0, 5, 0, 1069128089, 2576980378, 0, 0, 0, 0, 0, 0) ignored in Zarr.")
+        with self.assertWarnsWith(UserWarning, msg):
+            filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset_SZ3)
+        self.assertEqual(len(filters), 0)
+        # Close the HDF5 file
+        h5file.close()
+
+    def test_hdf5_to_zarr_filters_shuffle(self):
+        """Test HDF5 shuffle filter works with ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create a test HDF5 dataset with scaleoffset
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset_int = h5file.create_dataset(name='test_int', data=np.arange(5, dtype='int32'), shuffle=True)
+        h5dset_float = h5file.create_dataset(name='test_float', data=np.arange(5, dtype='float32'), shuffle=True)
+        # test that we apply shuffle filter on int data
+        filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset_int)
+        self.assertEqual(len(filters), 1)
+        self.assertIsInstance(filters[0], numcodecs.Shuffle)
+        # test that we apply shuffle filter on float data
+        filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset_float)
+        self.assertEqual(len(filters), 1)
+        self.assertIsInstance(filters[0], numcodecs.Shuffle)
+        h5file.close()
+
+    @unittest.skipIf(not HDF5PLUGIN, "hdf5_plugin not installed")
+    def test_hdf5_to_zarr_filters_blosclz(self):
+        """Test HDF5 blosclz filter works with ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create a test HDF5 dataset with scaleoffset
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset = h5file.create_dataset(
+            name='test_int',
+            data=np.arange(100, dtype='float32'),
+            **hdf5plugin.Blosc(cname='blosclz', clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE)
+        )
+        # test that we apply shuffle filter on int data
+        filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset)
+        self.assertEqual(len(filters), 1)
+        self.assertIsInstance(filters[0], numcodecs.Blosc)
+        self.assertEqual(filters[0].cname, 'blosclz')
+        self.assertEqual(filters[0].clevel, 9)
+        self.assertEqual(filters[0].shuffle, hdf5plugin.Blosc.SHUFFLE)
+        h5file.close()
+
+    @unittest.skipIf(not HDF5PLUGIN, "hdf5_plugin not installed")
+    def test_hdf5_to_zarr_filters_zstd(self):
+        """Test HDF5 zstd filter works with ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create a test HDF5 dataset with scaleoffset
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset = h5file.create_dataset(
+            name='test_int',
+            data=np.arange(100, dtype='float32'),
+            **hdf5plugin.Zstd(clevel=22)
+        )
+        # test that we apply shuffle filter on int data
+        filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset)
+        self.assertEqual(len(filters), 1)
+        self.assertIsInstance(filters[0], numcodecs.Zstd)
+        self.assertEqual(filters[0].level, 22)
+        # Close the HDF5 file
+        h5file.close()
+
+    def test_hdf5_to_zarr_filters_gzip(self):
+        """Test HDF5 gzip filter works with ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create a test HDF5 dataset with scaleoffset
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset = h5file.create_dataset(
+            name='test_int',
+            data=np.arange(100, dtype='float32'),
+            compression='gzip',
+            compression_opts=2
+        )
+        # test that we apply shuffle filter on int data
+        filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset)
+        self.assertEqual(len(filters), 1)
+        self.assertIsInstance(filters[0], numcodecs.Zlib)
+        self.assertEqual(filters[0].level, 2)
+        # Close the HDF5 file
+        h5file.close()
+
+    def test_is_h5py_dataset(self):
+        """Test ZarrDataIO.is_h5py_dataset"""
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        arr=np.arange(10)
+        h5dset = h5file.create_dataset(name='test', data=arr)
+        self.assertTrue(ZarrDataIO.is_h5py_dataset(h5dset))
+        self.assertFalse(ZarrDataIO.is_h5py_dataset(arr))
+
+    def test_from_h5py_dataset(self):
+        """Test ZarrDataIO.from_h5py_dataset"""
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset = h5file.create_dataset(
+            name='test',
+            data=np.arange(1000).reshape((10,100)),
+            compression='gzip',
+            compression_opts=6,
+            shuffle=True,
+            fillvalue=100,
+            chunks=(5,10))
+        re_zarrdataio = ZarrDataIO.from_h5py_dataset(h5dset)
+        # Test that all settings are being preserved when creating the ZarrDataIO object
+        self.assertIsInstance(re_zarrdataio, ZarrDataIO)
+        self.assertEqual(re_zarrdataio.data, h5dset)
+        self.assertEqual(re_zarrdataio.fillvalue, 100)
+        self.assertEqual(re_zarrdataio.chunks, (5,10))
+        self.assertEqual(len(re_zarrdataio.io_settings['filters']), 2)
+        self.assertIsInstance(re_zarrdataio.io_settings['filters'][0], numcodecs.Shuffle)
+        self.assertIsInstance(re_zarrdataio.io_settings['filters'][1], numcodecs.Zlib)
+        # Close the HDF5 file
+        h5file.close()
\ No newline at end of file

From 4935ab9cdaa6a9dfa353789c5f3fd373f73c3df1 Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Thu, 11 Jan 2024 00:50:54 -0800
Subject: [PATCH 15/17] Remove test case with system-dependent warning message

---
 tests/unit/test_zarrdataio.py | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/tests/unit/test_zarrdataio.py b/tests/unit/test_zarrdataio.py
index 8e732201..0fc2f88c 100644
--- a/tests/unit/test_zarrdataio.py
+++ b/tests/unit/test_zarrdataio.py
@@ -98,28 +98,21 @@ def test_hdf5_to_zarr_filters_bitshuffle(self):
         h5file.close()

     @unittest.skipIf(not HDF5PLUGIN, "hdf5_plugin not installed")
     def test_hdf5_to_zarr_filters_other_unsupported(self):
-        """Test that we warn when other unsupported filter is used in HDF5 in ZarrDataIO.hdf5_to_zarr_filters."""
+        """
+        Test that we warn when an unsupported filter is used in HDF5 with ZarrDataIO.hdf5_to_zarr_filters.
+        This test is to ensure that the catch-all at the end of the loop works.
+        """
         # Create a test HDF5 dataset with scaleoffset
         h5file = h5py.File(self.hdf_filename, mode='a')
         h5dset_FciDecomp = h5file.create_dataset(
             name='test_fcidecomp',
             data=[1, 2, 3, 4, 5],
             **hdf5plugin.FciDecomp())
-        h5dset_SZ3 = h5file.create_dataset(
-            name='test_sz3',
-            data=[1, 2, 3, 4, 5],
-            **hdf5plugin.SZ3(absolute=0.1))
         # test that we warn due to the FciDecomp
         msg = "/test_fcidecomp HDF5 filter id 32018 with properties None ignored in Zarr."
         with self.assertWarnsWith(UserWarning, msg):
             filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset_FciDecomp)
         self.assertEqual(len(filters), 0)
-        # test that we warn due to the SZ3
-        msg = ("/test_sz3 HDF5 filter id 32024 with properties "
-               "(1, 9, 0, 5, 0, 1069128089, 2576980378, 0, 0, 0, 0, 0, 0) ignored in Zarr.")
-        with self.assertWarnsWith(UserWarning, msg):
-            filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset_SZ3)
-        self.assertEqual(len(filters), 0)
         # Close the HDF5 file
         h5file.close()
From b5c93227d1760399ebcd6e5ebb2fa923c22ed165 Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Thu, 11 Jan 2024 01:26:44 -0800
Subject: [PATCH 16/17] Add unit test for conversion of byte fillvalue to string

---
 src/hdmf_zarr/utils.py        |  8 ++++----
 tests/unit/test_zarrdataio.py | 18 ++++++++++++++++++
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/src/hdmf_zarr/utils.py b/src/hdmf_zarr/utils.py
index bde22296..19bf75b9 100644
--- a/src/hdmf_zarr/utils.py
+++ b/src/hdmf_zarr/utils.py
@@ -485,14 +485,14 @@ def from_h5py_dataset(h5dataset, **kwargs):
         :returns: ZarrDataIO object wrapping the dataset
         """
         filters = ZarrDataIO.hdf5_to_zarr_filters(h5dataset)
-        fillvalue = h5dataset.fillvalue if 'fillvalue' not in kwargs else kwargs.pop('fillvalue')
-        if isinstance(fillvalue, bytes):  # bytes are not JSON serializable so use string instead
-            fillvalue = str(fillvalue)
+        fillval = h5dataset.fillvalue if 'fillvalue' not in kwargs else kwargs.pop('fillvalue')
+        if isinstance(fillval, bytes):  # bytes are not JSON serializable so use string instead
+            fillval = fillval.decode("utf-8")
         chunks = h5dataset.chunks if 'chunks' not in kwargs else kwargs.pop('chunks')
         re = ZarrDataIO(
             data=h5dataset,
             filters=filters,
-            fillvalue=fillvalue,
+            fillvalue=fillval,
             chunks=chunks,
             **kwargs)
         return re
diff --git a/tests/unit/test_zarrdataio.py b/tests/unit/test_zarrdataio.py
index 0fc2f88c..bb5a1fe2 100644
--- a/tests/unit/test_zarrdataio.py
+++ b/tests/unit/test_zarrdataio.py
@@ -216,4 +216,22 @@ def test_from_h5py_dataset(self):
         self.assertIsInstance(re_zarrdataio.io_settings['filters'][0], numcodecs.Shuffle)
         self.assertIsInstance(re_zarrdataio.io_settings['filters'][1], numcodecs.Zlib)
         # Close the HDF5 file
+        h5file.close()
+
+    def test_from_h5py_datase_bytes_fillvaluet(self):
+        """
+        Test ZarrDataIO.from_h5py_dataset with a fillvalue that is in bytes, which needs to be handled
+        separately since bytes are not JSON serializable by default
+        """
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        # print(np.arange(10, dtype=np.int8).tobytes())
+        h5dset = h5file.create_dataset(
+            name='test_str',
+            data=[b'hello', b'world', b'go'],
+            fillvalue=b'None')
+        re_zarrdataio = ZarrDataIO.from_h5py_dataset(h5dset)
+        # Test that all settings are being preserved when creating the ZarrDataIO object
+        self.assertIsInstance(re_zarrdataio, ZarrDataIO)
+        self.assertEqual(re_zarrdataio.io_settings['fill_value'], str("None"))
+        # Close the HDF5 file
 h5file.close()
\ No newline at end of file

From 4bdb47abe596a8c1ebeb90b7699def4c462dd01c Mon Sep 17 00:00:00 2001
From: Ryan Ly
Date: Thu, 11 Jan 2024 11:03:25 -0800
Subject: [PATCH 17/17] Update tests/unit/test_zarrdataio.py

---
 tests/unit/test_zarrdataio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/test_zarrdataio.py b/tests/unit/test_zarrdataio.py
index bb5a1fe2..d6070784 100644
--- a/tests/unit/test_zarrdataio.py
+++ b/tests/unit/test_zarrdataio.py
@@ -218,7 +218,7 @@ def test_from_h5py_dataset(self):
         # Close the HDF5 file
         h5file.close()

-    def test_from_h5py_datase_bytes_fillvaluet(self):
+    def test_from_h5py_dataset_bytes_fillvalue(self):
         """
         Test ZarrDataIO.from_h5py_dataset with a fillvalue that is in bytes, which needs to be handled
         separately since bytes are not JSON serializable by default
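[Note] The behavior pinned down by the last two patches can be summarized in a few lines; a sketch in which the file and dataset names are illustrative:

    import h5py
    from hdmf_zarr.utils import ZarrDataIO

    h5file = h5py.File('example.h5', mode='a')
    h5dset = h5file.create_dataset(name='test_str', data=[b'hello', b'world'], fillvalue=b'None')
    wrapped = ZarrDataIO.from_h5py_dataset(h5dset)
    # the bytes fill value is decoded to a JSON-serializable string
    assert wrapped.io_settings['fill_value'] == 'None'
    h5file.close()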