From 12082d39c06af4f0729c50a4dcb7bef99c003368 Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Sun, 31 Dec 2023 20:19:37 -0800 Subject: [PATCH] Add unit test for hdf5 to zarr export with filters --- src/hdmf_zarr/backend.py | 5 ++- tests/unit/test_io_convert.py | 85 ++++++++++++++++++++++++++++++++++- 2 files changed, 87 insertions(+), 3 deletions(-) diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py index 691f144b..39b1dc9e 100644 --- a/src/hdmf_zarr/backend.py +++ b/src/hdmf_zarr/backend.py @@ -344,8 +344,9 @@ def export(self, **kwargs): ) if not isinstance(src_io, ZarrIO) and write_args.get('link_data', True): - raise UnsupportedOperation("Cannot export from non-Zarr backend %s to Zarr with write argument " - "link_data=True." % src_io.__class__.__name__) + raise UnsupportedOperation(f"Cannot export from non-Zarr backend { src_io.__class__.__name__} " + + "to Zarr with write argument link_data=True. " + + "Set write_args={'link_data': False}") write_args['export_source'] = src_io.source # pass export_source=src_io.source to write_builder ckwargs = kwargs.copy() diff --git a/tests/unit/test_io_convert.py b/tests/unit/test_io_convert.py index eddd0400..db1a446b 100644 --- a/tests/unit/test_io_convert.py +++ b/tests/unit/test_io_convert.py @@ -35,13 +35,14 @@ import os import shutil import numpy as np +import numcodecs from abc import ABCMeta, abstractmethod from hdmf_zarr.backend import (ZarrIO, ROOT_NAME) from hdmf_zarr.zarr_utils import ContainerZarrReferenceDataset -from hdmf.backends.hdf5.h5_utils import ContainerH5ReferenceDataset +from hdmf.backends.hdf5.h5_utils import ContainerH5ReferenceDataset, H5DataIO from hdmf.backends.hdf5 import HDF5IO from hdmf.common import get_manager as get_hdmfcommon_manager from hdmf.testing import TestCase @@ -822,6 +823,88 @@ def test_export_cpd_dset_refs(self): self.assertIs(read_bucket2.baz_cpd_data.data[i][1], read_bucket2.bazs[baz_name]) +class TestHDF5toZarrWithFilters(TestCase): + """ + Test conversion 
from HDF5 to Zarr while preserving HDF5 filter settings + """ + def setUp(self): + self.hdf_filename = get_temp_filepath() + self.zarr_filename = get_temp_filepath() + self.out_container = None + self.read_container = None + + def tearDown(self): + # drop the references to the written and read containers + del self.out_container + del self.read_container + # remove the temporary HDF5 and Zarr outputs (file or directory) if they exist + for fn in [self.hdf_filename, self.zarr_filename]: + if fn is not None and os.path.exists(fn): + if os.path.isdir(fn): + shutil.rmtree(fn) + else: + os.remove(fn) + self.filenames = [] + + def __roundtrip_data(self, data): + """Sets the variables self.out_container, self.read_container""" + # Create example foofile with the provided data (which may be wrapped in H5DataIO) + foo1 = Foo('foo1', data, "I am foo1", 17, 3.14) + foobucket = FooBucket('bucket1', [foo1,]) + foofile = FooFile(buckets=[foobucket]) + self.out_container = foofile + + # write the example container to HDF5 (data may be wrapped in H5DataIO) + with HDF5IO(self.hdf_filename, manager=get_foo_buildmanager(), mode='w') as write_io: + write_io.write(foofile, cache_spec=True) + # Export the HDF5 file to Zarr + with HDF5IO(self.hdf_filename, manager=get_foo_buildmanager(), mode='r') as hdf_read_io: + with ZarrIO(self.zarr_filename, mode='w') as export_io: + export_io.export(src_io=hdf_read_io, write_args={'link_data': False}) + # read the exported Zarr file back; the calling test does the comparison + with ZarrIO(self.zarr_filename, mode='r', manager=get_foo_buildmanager()) as zarr_read_io: + self.read_container = zarr_read_io.read() + + def __get_data_array(self, foo_container): + """For a container created by __roundtrip_data return the data array""" + return foo_container.buckets['bucket1'].foos['foo1'].my_data + + def test_nofilters(self): + """Test that export works when no H5DataIO options are specified""" + data = list(range(5)) + self.__roundtrip_data(data=data) + self.assertContainerEqual(self.out_container, self.read_container, ignore_hdmf_attrs=True) + + def test_chunking(self): 
+ """Test that chunking is being preserved""" + outdata = H5DataIO(data=list(range(100)), chunks=(10,)) + self.__roundtrip_data(data=outdata) + self.assertContainerEqual(self.out_container, self.read_container, ignore_hdmf_attrs=True) + read_array = self.__get_data_array(self.read_container) + self.assertTupleEqual((10,), read_array.chunks) + + def test_shuffle(self): + """Test that shuffle filter is being preserved""" + outdata = H5DataIO(data=list(range(100)), chunks=(10,), shuffle=True) + self.__roundtrip_data(data=outdata) + self.assertContainerEqual(self.out_container, self.read_container, ignore_hdmf_attrs=True) + read_array = self.__get_data_array(self.read_container) + self.assertEqual(len(read_array.filters), 1) + self.assertIsInstance(read_array.filters[0], numcodecs.Shuffle) + self.assertTupleEqual((10,), read_array.chunks) + + def test_gzip(self): + """Test that gzip filter is being preserved""" + outdata = H5DataIO(data=list(range(100)), chunks=(10,), compression='gzip', compression_opts=2 ) + self.__roundtrip_data(data=outdata) + self.assertContainerEqual(self.out_container, self.read_container, ignore_hdmf_attrs=True) + read_array = self.__get_data_array(self.read_container) + self.assertEqual(len(read_array.filters), 1) + self.assertIsInstance(read_array.filters[0], numcodecs.Zlib) + self.assertEqual(read_array.filters[0].level, 2) + self.assertTupleEqual((10,), read_array.chunks) + + # TODO: Fails because we need to copy the data from the ExternalLink as it points to a non-Zarr source """ class TestFooExternalLinkHDF5ToZarr(MixinTestCaseConvert, TestCase):