diff --git a/CHANGELOG.md b/CHANGELOG.md
index ddd55ba8..358ad12e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # HDMF-ZARR Changelog
 
+## 0.6.0 (Upcoming)
+
+### Enhancements
+* Enhanced `ZarrIO` and `ZarrDataIO` to infer I/O settings (e.g., chunking and compression) from HDF5 datasets to preserve storage settings on export if possible @oruebel [#153](https://github.com/hdmf-dev/hdmf-zarr/pull/153)
+
 ## 0.5.0 (December 8, 2023)
 
 ### Enhancements
diff --git a/requirements-dev.txt b/requirements-dev.txt
index cb72d345..2f81d255 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -8,3 +8,4 @@ pytest==7.1.2
 pytest-cov==3.0.0
 python-dateutil==2.8.2
 tox==3.25.1
+hdf5plugin==4.3.0  # hdf5plugin is used to test conversion of plugin filters
diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py
index ccc955a8..39b1dc9e 100644
--- a/src/hdmf_zarr/backend.py
+++ b/src/hdmf_zarr/backend.py
@@ -344,8 +344,9 @@ def export(self, **kwargs):
         )
 
         if not isinstance(src_io, ZarrIO) and write_args.get('link_data', True):
-            raise UnsupportedOperation("Cannot export from non-Zarr backend %s to Zarr with write argument "
-                                       "link_data=True." % src_io.__class__.__name__)
+            raise UnsupportedOperation(f"Cannot export from non-Zarr backend {src_io.__class__.__name__} "
+                                       "to Zarr with write argument link_data=True. "
+                                       "Set write_args={'link_data': False}")
 
         write_args['export_source'] = src_io.source  # pass export_source=src_io.source to write_builder
         ckwargs = kwargs.copy()
@@ -938,6 +939,11 @@ def write_dataset(self, **kwargs):  # noqa: C901
         name = builder.name
         data = builder.data if force_data is None else force_data
         options = dict()
+        # Check if data is an h5py.Dataset to infer I/O settings if necessary
+        if ZarrDataIO.is_h5py_dataset(data):
+            # Wrap the h5py.Dataset in ZarrDataIO with chunking and compression settings inferred from the input data
+            data = ZarrDataIO.from_h5py_dataset(h5dataset=data)
+        # Separate data values and io_settings for write
         if isinstance(data, ZarrDataIO):
             options['io_settings'] = data.io_settings
             link_data = data.link_data
diff --git a/src/hdmf_zarr/utils.py b/src/hdmf_zarr/utils.py
index aa73d510..19bf75b9 100644
--- a/src/hdmf_zarr/utils.py
+++ b/src/hdmf_zarr/utils.py
@@ -461,13 +461,87 @@ def __init__(self, **kwargs):
             self.__iosettings['filters'] = filters
 
     @property
-    def link_data(self):
+    def link_data(self) -> bool:
+        """Bool indicating whether the data should be linked to or copied. NOTE: Only applies to zarr.Array type data"""
         return self.__link_data
 
     @property
-    def io_settings(self):
+    def io_settings(self) -> dict:
+        """Dict with the I/O settings to use"""
         return self.__iosettings
 
+    @staticmethod
+    def from_h5py_dataset(h5dataset, **kwargs):
+        """
+        Factory method to create a ZarrDataIO instance from an h5py.Dataset.
+        The ZarrDataIO object wraps the h5py.Dataset, and the I/O filter settings
+        are inferred from the filters used in h5py such that the options in Zarr match
+        (if possible) the options used in HDF5.
+
+        :param h5dataset: h5py.Dataset object that should be wrapped
+        :type h5dataset: h5py.Dataset
+        :param kwargs: Other keyword arguments to pass to ZarrDataIO.__init__
+
+        :returns: ZarrDataIO object wrapping the dataset
+        """
+        filters = ZarrDataIO.hdf5_to_zarr_filters(h5dataset)
+        fillval = h5dataset.fillvalue if 'fillvalue' not in kwargs else kwargs.pop('fillvalue')
+        if isinstance(fillval, bytes):  # bytes are not JSON serializable so use string instead
+            fillval = fillval.decode("utf-8")
+        chunks = h5dataset.chunks if 'chunks' not in kwargs else kwargs.pop('chunks')
+        re = ZarrDataIO(
+            data=h5dataset,
+            filters=filters,
+            fillvalue=fillval,
+            chunks=chunks,
+            **kwargs)
+        return re
+
+    @staticmethod
+    def hdf5_to_zarr_filters(h5dataset) -> list:
+        """From the given h5py.Dataset infer the corresponding filters to use in Zarr"""
+        # Based on https://github.com/fsspec/kerchunk/blob/617d9ce06b9d02375ec0e5584541fcfa9e99014a/kerchunk/hdf.py#L181
+        filters = []
+        # Check for unsupported filters
+        if h5dataset.scaleoffset:
+            # TODO: translate to numcodecs.fixedscaleoffset.FixedScaleOffset()
+            warn(f"{h5dataset.name} HDF5 scaleoffset filter ignored in Zarr")
+        if h5dataset.compression in ("szip", "lzf"):
+            warn(f"{h5dataset.name} HDF5 szip or lzf compression ignored in Zarr")
+        # Add the shuffle filter if possible
+        if h5dataset.shuffle and h5dataset.dtype.kind != "O":
+            # cannot use shuffle if we materialised objects
+            filters.append(numcodecs.Shuffle(elementsize=h5dataset.dtype.itemsize))
+        # Iterate through all the filters and add them to the list
+        for filter_id, properties in h5dataset._filters.items():
+            filter_id_str = str(filter_id)
+            if filter_id_str == "32001":
+                blosc_compressors = ("blosclz", "lz4", "lz4hc", "snappy", "zlib", "zstd")
+                (_1, _2, bytes_per_num, total_bytes, clevel, shuffle, compressor) = properties
+                pars = dict(
+                    blocksize=total_bytes,
+                    clevel=clevel,
+                    shuffle=shuffle,
+                    cname=blosc_compressors[compressor])
+                filters.append(numcodecs.Blosc(**pars))
+            elif filter_id_str == "32015":
+                filters.append(numcodecs.Zstd(level=properties[0]))
+            elif filter_id_str == "gzip":
+                filters.append(numcodecs.Zlib(level=properties))
+            elif filter_id_str == "32004":
+                warn(f"{h5dataset.name} HDF5 lz4 compression ignored in Zarr")
+            elif filter_id_str == "32008":
+                warn(f"{h5dataset.name} HDF5 bitshuffle compression ignored in Zarr")
+            elif filter_id_str == "shuffle":  # already handled above
+                pass
+            else:
+                warn(f"{h5dataset.name} HDF5 filter id {filter_id} with properties {properties} ignored in Zarr.")
+        return filters
+
+    @staticmethod
+    def is_h5py_dataset(obj):
+        """Check if the object is an instance of h5py.Dataset without requiring import of h5py"""
+        return (obj.__class__.__module__, obj.__class__.__name__) == ('h5py._hl.dataset', 'Dataset')
+
 
 class ZarrReference(dict):
     """
diff --git a/tests/unit/base_tests_zarrio.py b/tests/unit/base_tests_zarrio.py
index 98123ff1..693f9ff5 100644
--- a/tests/unit/base_tests_zarrio.py
+++ b/tests/unit/base_tests_zarrio.py
@@ -1579,7 +1579,8 @@ def close(self):
 
         with OtherIO(manager=get_foo_buildmanager()) as read_io:
             with ZarrIO(self.store[1], mode='w') as export_io:
-                msg = "Cannot export from non-Zarr backend OtherIO to Zarr with write argument link_data=True."
+                msg = ("Cannot export from non-Zarr backend OtherIO to Zarr with write argument link_data=True. "
+                       "Set write_args={'link_data': False}")
                 with self.assertRaisesWith(UnsupportedOperation, msg):
                     export_io.export(src_io=read_io, container=foofile)
 
diff --git a/tests/unit/test_io_convert.py b/tests/unit/test_io_convert.py
index eddd0400..b7f119a2 100644
--- a/tests/unit/test_io_convert.py
+++ b/tests/unit/test_io_convert.py
@@ -35,13 +35,14 @@
 import os
 import shutil
 import numpy as np
+import numcodecs
 from abc import ABCMeta, abstractmethod
 
 from hdmf_zarr.backend import (ZarrIO,
                                ROOT_NAME)
 from hdmf_zarr.zarr_utils import ContainerZarrReferenceDataset
-from hdmf.backends.hdf5.h5_utils import ContainerH5ReferenceDataset
+from hdmf.backends.hdf5.h5_utils import ContainerH5ReferenceDataset, H5DataIO
 from hdmf.backends.hdf5 import HDF5IO
 from hdmf.common import get_manager as get_hdmfcommon_manager
 from hdmf.testing import TestCase
@@ -822,6 +823,88 @@ def test_export_cpd_dset_refs(self):
             self.assertIs(read_bucket2.baz_cpd_data.data[i][1], read_bucket2.bazs[baz_name])
 
 
+class TestHDF5toZarrWithFilters(TestCase):
+    """
+    Test conversion from HDF5 to Zarr while preserving HDF5 filter settings
+    """
+    def setUp(self):
+        self.hdf_filename = get_temp_filepath()
+        self.zarr_filename = get_temp_filepath()
+        self.out_container = None
+        self.read_container = None
+
+    def tearDown(self):
+        # close the ZarrIO used for reading
+        del self.out_container
+        del self.read_container
+        # clean up any opened files
+        for fn in [self.hdf_filename, self.zarr_filename]:
+            if fn is not None and os.path.exists(fn):
+                if os.path.isdir(fn):
+                    shutil.rmtree(fn)
+                else:
+                    os.remove(fn)
+
+    def __roundtrip_data(self, data):
+        """Sets the variables self.out_container, self.read_container"""
+        # Create example foofile with the provided data (which may be wrapped in H5DataIO)
+        foo1 = Foo('foo1', data, "I am foo1", 17, 3.14)
+        foobucket = FooBucket('bucket1', [foo1, ])
+        foofile = FooFile(buckets=[foobucket])
+        self.out_container = foofile
+
+        # Write example HDF5 file with the given filter settings
+        with HDF5IO(self.hdf_filename, manager=get_foo_buildmanager(), mode='w') as write_io:
+            write_io.write(foofile, cache_spec=True)
+        # Export the HDF5 file to Zarr
+        with HDF5IO(self.hdf_filename, manager=get_foo_buildmanager(), mode='r') as hdf_read_io:
+            with ZarrIO(self.zarr_filename, mode='w') as export_io:
+                export_io.export(src_io=hdf_read_io, write_args={'link_data': False})
+        # Read and compare the containers
+        with ZarrIO(self.zarr_filename, mode='r', manager=get_foo_buildmanager()) as zarr_read_io:
+            self.read_container = zarr_read_io.read()
+
+    def __get_data_array(self, foo_container):
+        """For a container created by __roundtrip_data return the data array"""
+        return foo_container.buckets['bucket1'].foos['foo1'].my_data
+
+    def test_nofilters(self):
+        """Basic test that export without any options specified works as expected"""
+        data = list(range(5))
+        self.__roundtrip_data(data=data)
+        self.assertContainerEqual(self.out_container, self.read_container, ignore_hdmf_attrs=True)
+
+    def test_chunking(self):
+        """Test that chunking is being preserved"""
+        outdata = H5DataIO(data=list(range(100)), chunks=(10,))
+        self.__roundtrip_data(data=outdata)
+        self.assertContainerEqual(self.out_container, self.read_container, ignore_hdmf_attrs=True)
+        read_array = self.__get_data_array(self.read_container)
+        self.assertTupleEqual((10,), read_array.chunks)
+
+    def test_shuffle(self):
+        """Test that the shuffle filter is being preserved"""
+        outdata = H5DataIO(data=list(range(100)), chunks=(10,), shuffle=True)
+        self.__roundtrip_data(data=outdata)
+        self.assertContainerEqual(self.out_container, self.read_container, ignore_hdmf_attrs=True)
+        read_array = self.__get_data_array(self.read_container)
+        self.assertEqual(len(read_array.filters), 1)
+        self.assertIsInstance(read_array.filters[0], numcodecs.Shuffle)
+        self.assertTupleEqual((10,), read_array.chunks)
+
+    def test_gzip(self):
+        """Test that the gzip filter is being preserved"""
+        outdata = H5DataIO(data=list(range(100)), chunks=(10,), compression='gzip', compression_opts=2)
+        self.__roundtrip_data(data=outdata)
+        self.assertContainerEqual(self.out_container, self.read_container, ignore_hdmf_attrs=True)
+        read_array = self.__get_data_array(self.read_container)
+        self.assertEqual(len(read_array.filters), 1)
+        self.assertIsInstance(read_array.filters[0], numcodecs.Zlib)
+        self.assertEqual(read_array.filters[0].level, 2)
+        self.assertTupleEqual((10,), read_array.chunks)
+
+
 # TODO: Fails because we need to copy the data from the ExternalLink as it points to a non-Zarr source
 """
 class TestFooExternalLinkHDF5ToZarr(MixinTestCaseConvert, TestCase):
diff --git a/tests/unit/test_zarrdataio.py b/tests/unit/test_zarrdataio.py
new file mode 100644
index 00000000..d6070784
--- /dev/null
+++ b/tests/unit/test_zarrdataio.py
@@ -0,0 +1,237 @@
+"""
+Module for testing the ZarrDataIO class.
+
+Many of the functions of ZarrDataIO are covered in tests related to ZarrIO and data conversion,
+such as test_io_convert.TestHDF5toZarrWithFilters. However, those tests are in the context of
+more complex operations and are more akin to integration tests. This module focuses on tests for
+specific unit functions of ZarrDataIO.
+"""
+import numcodecs
+import h5py
+import os
+import shutil
+import unittest
+
+import numpy as np
+
+try:
+    import hdf5plugin
+    HDF5PLUGIN = True
+except ImportError:
+    HDF5PLUGIN = False
+from hdmf.testing import TestCase
+from hdmf_zarr.utils import ZarrDataIO
+from tests.unit.utils import get_temp_filepath
+
+class TestZarrDataIO(TestCase):
+    """Test the ZarrDataIO class"""
+    def setUp(self):
+        self.hdf_filename = get_temp_filepath()
+        self.zarr_filename = get_temp_filepath()
+
+    def tearDown(self):
+        # clean up any opened files
+        for fn in [self.hdf_filename, self.zarr_filename]:
+            if fn is not None and os.path.exists(fn):
+                if os.path.isdir(fn):
+                    shutil.rmtree(fn)
+                else:
+                    os.remove(fn)
+
+    def test_hdf5_to_zarr_filters_scaleoffset(self):
+        """Test that we warn when the scaleoffset filter is being used in HDF5 in ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create a test HDF5 dataset with scaleoffset
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset = h5file.create_dataset(name='test_dset', data=[1, 2, 3, 4, 5], scaleoffset=10)
+        # test that we warn due to the scaleoffset
+        msg = "/test_dset HDF5 scaleoffset filter ignored in Zarr"
+        with self.assertWarnsWith(UserWarning, msg):
+            filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset)
+        self.assertEqual(len(filters), 0)
+        # Close the HDF5 file
+        h5file.close()
+
+    def test_hdf5_to_zarr_filters_lzf(self):
+        """Test that we warn when the lzf filter is being used in HDF5 in ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create a test HDF5 dataset with lzf compression
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset = h5file.create_dataset(name='test_dset', data=[1, 2, 3, 4, 5], compression="lzf")
+        # test that we warn due to the lzf compression
+        msg = "/test_dset HDF5 szip or lzf compression ignored in Zarr"
+        with self.assertWarnsWith(UserWarning, msg):
+            filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset)
+        self.assertEqual(len(filters), 0)
+        # Close the HDF5 file
+        h5file.close()
+
+    @unittest.skipIf(not HDF5PLUGIN, "hdf5plugin not installed")
+    def test_hdf5_to_zarr_filters_lz4(self):
+        """Test that we warn when the lz4 filter is being used in HDF5 in ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create a test HDF5 dataset with lz4 compression
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset = h5file.create_dataset(name='test_dset',
+                                       data=[1, 2, 3, 4, 5],
+                                       **hdf5plugin.LZ4())
+        # test that we warn due to the lz4 compression
+        msg = "/test_dset HDF5 lz4 compression ignored in Zarr"
+        with self.assertWarnsWith(UserWarning, msg):
+            filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset)
+        self.assertEqual(len(filters), 0)
+        # Close the HDF5 file
+        h5file.close()
+
+    @unittest.skipIf(not HDF5PLUGIN, "hdf5plugin not installed")
+    def test_hdf5_to_zarr_filters_bitshuffle(self):
+        """Test that we warn when the bitshuffle filter is being used in HDF5 in ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create a test HDF5 dataset with bitshuffle compression
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset = h5file.create_dataset(name='test_dset',
+                                       data=[1, 2, 3, 4, 5],
+                                       **hdf5plugin.Bitshuffle(nelems=0, lz4=True))
+        # test that we warn due to the bitshuffle compression
+        msg = "/test_dset HDF5 bitshuffle compression ignored in Zarr"
+        with self.assertWarnsWith(UserWarning, msg):
+            filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset)
+        self.assertEqual(len(filters), 0)
+        # Close the HDF5 file
+        h5file.close()
+
+    @unittest.skipIf(not HDF5PLUGIN, "hdf5plugin not installed")
+    def test_hdf5_to_zarr_filters_other_unsupported(self):
+        """
+        Test that we warn when an unsupported filter is used in HDF5 with ZarrDataIO.hdf5_to_zarr_filters.
+        This test is to ensure that the catch-all at the end of the loop works.
+        """
+        # Create a test HDF5 dataset with the FciDecomp filter
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset_FciDecomp = h5file.create_dataset(
+            name='test_fcidecomp',
+            data=[1, 2, 3, 4, 5],
+            **hdf5plugin.FciDecomp())
+        # test that we warn due to the FciDecomp filter
+        msg = "/test_fcidecomp HDF5 filter id 32018 with properties None ignored in Zarr."
+        with self.assertWarnsWith(UserWarning, msg):
+            filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset_FciDecomp)
+        self.assertEqual(len(filters), 0)
+        # Close the HDF5 file
+        h5file.close()
+
+    def test_hdf5_to_zarr_filters_shuffle(self):
+        """Test that the HDF5 shuffle filter works with ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create test HDF5 datasets with the shuffle filter
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset_int = h5file.create_dataset(name='test_int', data=np.arange(5, dtype='int32'), shuffle=True)
+        h5dset_float = h5file.create_dataset(name='test_float', data=np.arange(5, dtype='float32'), shuffle=True)
+        # test that we apply the shuffle filter on int data
+        filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset_int)
+        self.assertEqual(len(filters), 1)
+        self.assertIsInstance(filters[0], numcodecs.Shuffle)
+        # test that we apply the shuffle filter on float data
+        filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset_float)
+        self.assertEqual(len(filters), 1)
+        self.assertIsInstance(filters[0], numcodecs.Shuffle)
+        h5file.close()
+
+    @unittest.skipIf(not HDF5PLUGIN, "hdf5plugin not installed")
+    def test_hdf5_to_zarr_filters_blosclz(self):
+        """Test that the HDF5 blosclz filter works with ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create a test HDF5 dataset with blosclz compression
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset = h5file.create_dataset(
+            name='test_int',
+            data=np.arange(100, dtype='float32'),
+            **hdf5plugin.Blosc(cname='blosclz', clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE)
+        )
+        # test that the Blosc filter is translated with its settings
+        filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset)
+        self.assertEqual(len(filters), 1)
+        self.assertIsInstance(filters[0], numcodecs.Blosc)
+        self.assertEqual(filters[0].cname, 'blosclz')
+        self.assertEqual(filters[0].clevel, 9)
+        self.assertEqual(filters[0].shuffle, hdf5plugin.Blosc.SHUFFLE)
+        h5file.close()
+
+    @unittest.skipIf(not HDF5PLUGIN, "hdf5plugin not installed")
+    def test_hdf5_to_zarr_filters_zstd(self):
+        """Test that the HDF5 zstd filter works with ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create a test HDF5 dataset with zstd compression
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset = h5file.create_dataset(
+            name='test_int',
+            data=np.arange(100, dtype='float32'),
+            **hdf5plugin.Zstd(clevel=22)
+        )
+        # test that the Zstd filter is translated with its compression level
+        filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset)
+        self.assertEqual(len(filters), 1)
+        self.assertIsInstance(filters[0], numcodecs.Zstd)
+        self.assertEqual(filters[0].level, 22)
+        # Close the HDF5 file
+        h5file.close()
+
+    def test_hdf5_to_zarr_filters_gzip(self):
+        """Test that the HDF5 gzip filter works with ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create a test HDF5 dataset with gzip compression
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset = h5file.create_dataset(
+            name='test_int',
+            data=np.arange(100, dtype='float32'),
+            compression='gzip',
+            compression_opts=2
+        )
+        # test that the gzip filter is translated to Zlib with its compression level
+        filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset)
+        self.assertEqual(len(filters), 1)
+        self.assertIsInstance(filters[0], numcodecs.Zlib)
+        self.assertEqual(filters[0].level, 2)
+        # Close the HDF5 file
+        h5file.close()
+
+    def test_is_h5py_dataset(self):
+        """Test ZarrDataIO.is_h5py_dataset"""
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        arr = np.arange(10)
+        h5dset = h5file.create_dataset(name='test', data=arr)
+        self.assertTrue(ZarrDataIO.is_h5py_dataset(h5dset))
+        self.assertFalse(ZarrDataIO.is_h5py_dataset(arr))
+
+    def test_from_h5py_dataset(self):
+        """Test ZarrDataIO.from_h5py_dataset"""
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset = h5file.create_dataset(
+            name='test',
+            data=np.arange(1000).reshape((10, 100)),
+            compression='gzip',
+            compression_opts=6,
+            shuffle=True,
+            fillvalue=100,
+            chunks=(5, 10))
+        re_zarrdataio = ZarrDataIO.from_h5py_dataset(h5dset)
+        # Test that all settings are being preserved when creating the ZarrDataIO object
+        self.assertIsInstance(re_zarrdataio, ZarrDataIO)
+        self.assertEqual(re_zarrdataio.data, h5dset)
+        self.assertEqual(re_zarrdataio.io_settings['fill_value'], 100)
+        self.assertEqual(re_zarrdataio.io_settings['chunks'], (5, 10))
+        self.assertEqual(len(re_zarrdataio.io_settings['filters']), 2)
+        self.assertIsInstance(re_zarrdataio.io_settings['filters'][0], numcodecs.Shuffle)
+        self.assertIsInstance(re_zarrdataio.io_settings['filters'][1], numcodecs.Zlib)
+        # Close the HDF5 file
+        h5file.close()
+
+    def test_from_h5py_dataset_bytes_fillvalue(self):
+        """
+        Test ZarrDataIO.from_h5py_dataset with a fillvalue that is in bytes, which needs to be handled
+        separately since bytes are not JSON serializable by default
+        """
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset = h5file.create_dataset(
+            name='test_str',
+            data=[b'hello', b'world', b'go'],
+            fillvalue=b'None')
+        re_zarrdataio = ZarrDataIO.from_h5py_dataset(h5dset)
+        # Test that the bytes fillvalue is being preserved as a string when creating the ZarrDataIO object
+        self.assertIsInstance(re_zarrdataio, ZarrDataIO)
+        self.assertEqual(re_zarrdataio.io_settings['fill_value'], "None")
+        # Close the HDF5 file
+        h5file.close()
\ No newline at end of file
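
Example: a minimal usage sketch of the export path this patch enables; it is illustrative and not part of the patch. The file names and the `get_manager()` setup are assumptions; any HDF5 file readable with `HDF5IO` works. With `write_args={'link_data': False}`, `ZarrIO.write_dataset` now wraps each `h5py.Dataset` via `ZarrDataIO.from_h5py_dataset`, so chunking and (where supported) compression settings carry over to the Zarr store.

```python
from hdmf.backends.hdf5 import HDF5IO
from hdmf.common import get_manager
from hdmf_zarr import ZarrIO

# Read an existing HDF5 file and export it to Zarr. Copying is forced with
# link_data=False; during the copy, chunking and compression settings are
# inferred from each h5py.Dataset (unsupported HDF5 filters such as szip,
# lzf, and scaleoffset are skipped with a warning).
with HDF5IO("example.h5", manager=get_manager(), mode="r") as read_io:
    with ZarrIO("example.zarr", mode="w") as export_io:
        export_io.export(src_io=read_io, write_args={"link_data": False})
```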