From bd36a4abef43e5ba17768fad237078c31261fc5c Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Sun, 31 Dec 2023 14:13:18 -0800
Subject: [PATCH 01/17] Fix #152 Preserve h5py.Dataset filter settings on export

---
 src/hdmf_zarr/backend.py |  5 +++
 src/hdmf_zarr/utils.py   | 74 ++++++++++++++++++++++++++++++++++++++--
 2 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py
index ccc955a8..691f144b 100644
--- a/src/hdmf_zarr/backend.py
+++ b/src/hdmf_zarr/backend.py
@@ -938,6 +938,11 @@ def write_dataset(self, **kwargs):  # noqa: C901
         name = builder.name
         data = builder.data if force_data is None else force_data
         options = dict()
+        # Check if data is a h5py.Dataset to infer I/O settings if necessary
+        if ZarrDataIO.is_h5py_dataset(data):
+            # Wrap the h5py.Dataset in ZarrDataIO with chunking and compression settings inferred from the input data
+            data = ZarrDataIO.from_h5py_dataset(h5dataset=data)
+        # Separate data values and io_settings for write
         if isinstance(data, ZarrDataIO):
             options['io_settings'] = data.io_settings
             link_data = data.link_data
diff --git a/src/hdmf_zarr/utils.py b/src/hdmf_zarr/utils.py
index aa73d510..43271fe8 100644
--- a/src/hdmf_zarr/utils.py
+++ b/src/hdmf_zarr/utils.py
@@ -461,13 +461,83 @@ def __init__(self, **kwargs):
             self.__iosettings['filters'] = filters

     @property
-    def link_data(self):
+    def link_data(self) -> bool:
+        """Bool indicating should it be linked to or copied. NOTE: Only applies to zarr.Array type data"""
         return self.__link_data

     @property
-    def io_settings(self):
+    def io_settings(self) -> dict:
+        """"""
         return self.__iosettings

+    @staticmethod
+    def from_h5py_dataset(h5dataset, **kwargs):
+        """
+        Create a ZarrDataIO instance that wraps the given h5py dataset and infers
+        the filters that should be used in Zarr from the filters used in h5py.
+
+        :param dataset: h5py.Dataset object that should be wrapped
+        :type dataset: h5py.Dataset
+        :param **kwargs: Other keyword arguments to pass to ZarrDataIO.__init__
+
+        :returns: ZarrDataIO object wrapping the dataset
+        """
+        filters = ZarrDataIO.hdf5_to_zarr_filters(h5dataset)
+        fillvalue = h5dataset.fillvalue if 'fillvalue' not in kwargs else kwargs.pop('fillvalue')
+        chunks = h5dataset.chunks if 'chunks' not in kwargs else kwargs.pop('chunks')
+        re = ZarrDataIO(
+            data=h5dataset,
+            filters=filters,
+            fillvalue=fillvalue,
+            chunks=chunks,
+            **kwargs)
+
+    @staticmethod
+    def hdf5_to_zarr_filters(self, h5dataset) -> list:
+        """From the given h5py.Dataset infer the corresponding filters to use in Zarr"""
+        # Based on https://github.com/fsspec/kerchunk/blob/617d9ce06b9d02375ec0e5584541fcfa9e99014a/kerchunk/hdf.py#L181
+        filters = []
+        # Check for unsupported filters
+        if h5dataset.scaleoffset:
+            # TODO: translate to numcodecs.fixedscaleoffset.FixedScaleOffset()
+            warnings.warn( f" {h5dataset.name} HDF5 scaleoffset filter ignored in Zarr")
+        if h5obj.compression in ("szip", "lzf"):
+            warnings.warn(f"{h5dateset.name} HDF5 szip or lzf compression ignored in Zarr")
+        # Add the shuffle filter if possible
+        if h5obj.shuffle and h5obj.dtype.kind != "O":
+            # cannot use shuffle if we materialised objects
+            filters.append(numcodecs.Shuffle(elementsize=h5dataset.dtype.itemsize))
+        # iterate through all the filters and add them to the list
+        for filter_id, properties in h5dataset._filters.items():
+            filter_id_str = str(filter_id)
+            if filter_id_str == "32001":
+                blosc_compressors = ("blosclz", "lz4", "lz4hc", "snappy", "zlib", "zstd")
+                (_1, _2, bytes_per_num, total_bytes, clevel, shuffle, compressor) = properties
+                pars = dict(
+                    blocksize=total_bytes,
+                    clevel=clevel,
+                    shuffle=shuffle,
+                    cname=blosc_compressors[compressor])
+                filters.append(numcodecs.Blosc(**pars))
+            elif filter_id_str == "32015":
+                filters.append(numcodecs.Zstd(level=properties[0]))
+            elif filter_id_str == "gzip":
+                filters.append(numcodecs.Zlib(level=properties))
+            elif filter_id_str == "32004":
+                warnings.warn(f"{h5dataset.name} HDF5 lz4 compression ignored in Zarr"
+                              )
+            elif filter_id_str == "32008":
+                warnings.warn(f"{h5dataset.name} HDF5 bitshuffle compression ignored in Zarr")
+            elif filter_id_str == "shuffle":  # already handled above
+                pass
+            else:
+                warnings.warn(
+                    f"{h5dataset.name} HDF5 filter id {filter_id} with properties {properties} ignored in Zarr.")
+        return filters
+    @staticmethod
+    def is_h5py_dataset(obj):
+        """Check if the object is an instance of h5py.Dataset without requiring import of h5py"""
+        return (obj.__class__.__module__, obj.__class__.__name__) == ('h5py._hl.dataset', 'Dataset')

 class ZarrReference(dict):
     """

From b3232975f072acd8bf10a0f567931a46d98f6862 Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Sun, 31 Dec 2023 14:18:21 -0800
Subject: [PATCH 02/17] Remove extra arg

---
 src/hdmf_zarr/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/hdmf_zarr/utils.py b/src/hdmf_zarr/utils.py
index 43271fe8..1f1c3f24 100644
--- a/src/hdmf_zarr/utils.py
+++ b/src/hdmf_zarr/utils.py
@@ -493,7 +493,7 @@ def from_h5py_dataset(h5dataset, **kwargs):
             **kwargs)

     @staticmethod
-    def hdf5_to_zarr_filters(self, h5dataset) -> list:
+    def hdf5_to_zarr_filters(h5dataset) -> list:
         """From the given h5py.Dataset infer the corresponding filters to use in Zarr"""
         # Based on https://github.com/fsspec/kerchunk/blob/617d9ce06b9d02375ec0e5584541fcfa9e99014a/kerchunk/hdf.py#L181
         filters = []
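[Note] For reference, the branch mapping implemented in hdf5_to_zarr_filters above translates HDF5 filter settings into numcodecs codecs roughly as sketched below. The codec classes and keyword arguments are real numcodecs APIs; the concrete parameter values are illustrative only:

    import numcodecs

    numcodecs.Shuffle(elementsize=4)            # HDF5 shuffle on a 4-byte dtype
    numcodecs.Zlib(level=2)                     # HDF5 compression='gzip', compression_opts=2
    numcodecs.Zstd(level=5)                     # HDF5 zstd filter (filter id 32015)
    numcodecs.Blosc(cname='zstd', clevel=5,
                    shuffle=1, blocksize=0)     # HDF5 blosc filter (filter id 32001)

Filters with no numcodecs counterpart in this mapping (scaleoffset, szip, lzf, lz4, bitshuffle) are dropped with a warning, as the code above shows.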
From 6deb0fb922956f2530db707f1e1e640afe1f149e Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Sun, 31 Dec 2023 14:24:25 -0800
Subject: [PATCH 03/17] Fix bad variable name

---
 src/hdmf_zarr/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/hdmf_zarr/utils.py b/src/hdmf_zarr/utils.py
index 1f1c3f24..adee5bf0 100644
--- a/src/hdmf_zarr/utils.py
+++ b/src/hdmf_zarr/utils.py
@@ -501,10 +501,10 @@ def hdf5_to_zarr_filters(h5dataset) -> list:
         if h5dataset.scaleoffset:
             # TODO: translate to numcodecs.fixedscaleoffset.FixedScaleOffset()
             warnings.warn( f" {h5dataset.name} HDF5 scaleoffset filter ignored in Zarr")
-        if h5obj.compression in ("szip", "lzf"):
+        if h5dataset.compression in ("szip", "lzf"):
             warnings.warn(f"{h5dateset.name} HDF5 szip or lzf compression ignored in Zarr")
         # Add the shuffle filter if possible
-        if h5obj.shuffle and h5obj.dtype.kind != "O":
+        if h5dataset.shuffle and h5dataset.dtype.kind != "O":
             # cannot use shuffle if we materialised objects
             filters.append(numcodecs.Shuffle(elementsize=h5dataset.dtype.itemsize))

From f208dde19372ff37e96f331bdf8c2791c181b5b2 Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Sun, 31 Dec 2023 15:56:25 -0800
Subject: [PATCH 04/17] Add missing return statement in ZarrDataIO.from_h5py_dataset

---
 src/hdmf_zarr/utils.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/hdmf_zarr/utils.py b/src/hdmf_zarr/utils.py
index adee5bf0..b4d46e1e 100644
--- a/src/hdmf_zarr/utils.py
+++ b/src/hdmf_zarr/utils.py
@@ -473,8 +473,10 @@ def io_settings(self) -> dict:
     @staticmethod
     def from_h5py_dataset(h5dataset, **kwargs):
         """
-        Create a ZarrDataIO instance that wraps the given h5py dataset and infers
-        the filters that should be used in Zarr from the filters used in h5py.
+        Factory method to create a ZarrDataIO instance from a h5py.Dataset.
+        The ZarrDataIO object wraps the h5py.Dataset and the io filter settings
+        are inferred from filters used in h5py such that the options in Zarr match
+        (if possible) the options used in HDF5.

         :param dataset: h5py.Dataset object that should be wrapped
         :type dataset: h5py.Dataset
@@ -491,6 +493,7 @@ def from_h5py_dataset(h5dataset, **kwargs):
             fillvalue=fillvalue,
             chunks=chunks,
             **kwargs)
+        return re

     @staticmethod
     def hdf5_to_zarr_filters(h5dataset) -> list:

From 90d2af75f099e3976f956fa76d44eee149fe4de2 Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Sun, 31 Dec 2023 15:59:23 -0800
Subject: [PATCH 05/17] Fix ruff

---
 src/hdmf_zarr/utils.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/hdmf_zarr/utils.py b/src/hdmf_zarr/utils.py
index b4d46e1e..e52106d6 100644
--- a/src/hdmf_zarr/utils.py
+++ b/src/hdmf_zarr/utils.py
@@ -503,9 +503,9 @@ def hdf5_to_zarr_filters(h5dataset) -> list:
         # Check for unsupported filters
         if h5dataset.scaleoffset:
             # TODO: translate to numcodecs.fixedscaleoffset.FixedScaleOffset()
-            warnings.warn( f" {h5dataset.name} HDF5 scaleoffset filter ignored in Zarr")
+            warn( f" {h5dataset.name} HDF5 scaleoffset filter ignored in Zarr")
         if h5dataset.compression in ("szip", "lzf"):
-            warnings.warn(f"{h5dateset.name} HDF5 szip or lzf compression ignored in Zarr")
+            warn(f"{h5dataset.name} HDF5 szip or lzf compression ignored in Zarr")
         # Add the shuffle filter if possible
         if h5dataset.shuffle and h5dataset.dtype.kind != "O":
             # cannot use shuffle if we materialised objects
@@ -527,15 +527,14 @@ def hdf5_to_zarr_filters(h5dataset) -> list:
             elif filter_id_str == "gzip":
                 filters.append(numcodecs.Zlib(level=properties))
             elif filter_id_str == "32004":
-                warnings.warn(f"{h5dataset.name} HDF5 lz4 compression ignored in Zarr"
+                warn(f"{h5dataset.name} HDF5 lz4 compression ignored in Zarr"
                               )
             elif filter_id_str == "32008":
-                warnings.warn(f"{h5dataset.name} HDF5 bitshuffle compression ignored in Zarr")
+                warn(f"{h5dataset.name} HDF5 bitshuffle compression ignored in Zarr")
             elif filter_id_str == "shuffle":  # already handled above
                 pass
             else:
-                warnings.warn(
-                    f"{h5dataset.name} HDF5 filter id {filter_id} with properties {properties} ignored in Zarr.")
+                warn(f"{h5dataset.name} HDF5 filter id {filter_id} with properties {properties} ignored in Zarr.")
         return filters
     @staticmethod
     def is_h5py_dataset(obj):
From 12082d39c06af4f0729c50a4dcb7bef99c003368 Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Sun, 31 Dec 2023 20:19:37 -0800
Subject: [PATCH 06/17] Add unit test for hdf5 to zarr export with filters

---
 src/hdmf_zarr/backend.py      |  5 ++-
 tests/unit/test_io_convert.py | 85 ++++++++++++++++++++++++++++++++++-
 2 files changed, 87 insertions(+), 3 deletions(-)

diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py
index 691f144b..39b1dc9e 100644
--- a/src/hdmf_zarr/backend.py
+++ b/src/hdmf_zarr/backend.py
@@ -344,8 +344,9 @@ def export(self, **kwargs):
         )

         if not isinstance(src_io, ZarrIO) and write_args.get('link_data', True):
-            raise UnsupportedOperation("Cannot export from non-Zarr backend %s to Zarr with write argument "
-                                       "link_data=True." % src_io.__class__.__name__)
+            raise UnsupportedOperation(f"Cannot export from non-Zarr backend { src_io.__class__.__name__} " +
+                                       "to Zarr with write argument link_data=True. " +
+                                       "Set write_args={'link_data': False}")

         write_args['export_source'] = src_io.source  # pass export_source=src_io.source to write_builder
         ckwargs = kwargs.copy()
diff --git a/tests/unit/test_io_convert.py b/tests/unit/test_io_convert.py
index eddd0400..db1a446b 100644
--- a/tests/unit/test_io_convert.py
+++ b/tests/unit/test_io_convert.py
@@ -35,13 +35,14 @@
 import os
 import shutil
 import numpy as np
+import numcodecs

 from abc import ABCMeta, abstractmethod

 from hdmf_zarr.backend import (ZarrIO, ROOT_NAME)
 from hdmf_zarr.zarr_utils import ContainerZarrReferenceDataset
-from hdmf.backends.hdf5.h5_utils import ContainerH5ReferenceDataset
+from hdmf.backends.hdf5.h5_utils import ContainerH5ReferenceDataset, H5DataIO
 from hdmf.backends.hdf5 import HDF5IO
 from hdmf.common import get_manager as get_hdmfcommon_manager
 from hdmf.testing import TestCase
@@ -822,6 +823,88 @@ def test_export_cpd_dset_refs(self):
                 self.assertIs(read_bucket2.baz_cpd_data.data[i][1], read_bucket2.bazs[baz_name])


+class TestHDF5toZarrWithFilters(TestCase):
+    """
+    Test conversion from HDF5 to Zarr while preserving HDF5 filter settings
+    """
+    def setUp(self):
+        self.hdf_filename = get_temp_filepath()
+        self.zarr_filename = get_temp_filepath()
+        self.out_container = None
+        self.read_container = None
+
+    def tearDown(self):
+        # close the ZarrIO used for reading
+        del self.out_container
+        del self.read_container
+        # clean up any opened files
+        for fn in [self.hdf_filename, self.zarr_filename]:
+            if fn is not None and os.path.exists(fn):
+                if os.path.isdir(fn):
+                    shutil.rmtree(fn)
+                else:
+                    os.remove(fn)
+        self.filenames = []
+
+    def __roundtrip_data(self, data):
+        """Sets the variables self.out_container, self.read_container"""
+        # Create example foofile with the provided data (which may be wrapped in H5DataIO)
+        foo1 = Foo('foo1', data, "I am foo1", 17, 3.14)
+        foobucket = FooBucket('bucket1', [foo1,])
+        foofile = FooFile(buckets=[foobucket])
+        self.out_container = foofile
+
+        # write example HDF5 file with no filter settings
+        with HDF5IO(self.hdf_filename, manager=get_foo_buildmanager(), mode='w') as write_io:
+            write_io.write(foofile, cache_spec=True)
+        # Export the HDF5 file to Zarr
+        with HDF5IO(self.hdf_filename, manager=get_foo_buildmanager(), mode='r') as hdf_read_io:
+            with ZarrIO(self.zarr_filename, mode='w') as export_io:
+                export_io.export(src_io=hdf_read_io, write_args={'link_data': False})
+        # read and compare the containers
+        with ZarrIO(self.zarr_filename, mode='r', manager=get_foo_buildmanager()) as zarr_read_io:
+            self.read_container = zarr_read_io.read()
+
+    def __get_data_array(self, foo_container):
+        """For a container created by __roundtrip_data return the data array"""
+        return foo_container.buckets['bucket1'].foos['foo1'].my_data
+
+    def test_nofilters(self):
+        """basic test that export without any options specified is working as expected"""
+        data = list(range(5))
+        self.__roundtrip_data(data=data)
+        self.assertContainerEqual(self.out_container, self.read_container, ignore_hdmf_attrs=True)
+
+    def test_chunking(self):
+        """Test that chunking is being preserved"""
+        outdata = H5DataIO(data=list(range(100)), chunks=(10,))
+        self.__roundtrip_data(data=outdata)
+        self.assertContainerEqual(self.out_container, self.read_container, ignore_hdmf_attrs=True)
+        read_array = self.__get_data_array(self.read_container)
+        self.assertTupleEqual((10,), read_array.chunks)
+
+    def test_shuffle(self):
+        """Test that shuffle filter is being preserved"""
+        outdata = H5DataIO(data=list(range(100)), chunks=(10,), shuffle=True)
+        self.__roundtrip_data(data=outdata)
+        self.assertContainerEqual(self.out_container, self.read_container, ignore_hdmf_attrs=True)
+        read_array = self.__get_data_array(self.read_container)
+        self.assertEqual(len(read_array.filters), 1)
+        self.assertIsInstance(read_array.filters[0], numcodecs.Shuffle)
+        self.assertTupleEqual((10,), read_array.chunks)
+
+    def test_gzip(self):
+        """Test that gzip filter is being preserved"""
+        outdata = H5DataIO(data=list(range(100)), chunks=(10,), compression='gzip', compression_opts=2 )
+        self.__roundtrip_data(data=outdata)
+        self.assertContainerEqual(self.out_container, self.read_container, ignore_hdmf_attrs=True)
+        read_array = self.__get_data_array(self.read_container)
+        self.assertEqual(len(read_array.filters), 1)
+        self.assertIsInstance(read_array.filters[0], numcodecs.Zlib)
+        self.assertEqual(read_array.filters[0].level, 2)
+        self.assertTupleEqual((10,), read_array.chunks)
+
+
 # TODO: Fails because we need to copy the data from the ExternalLink as it points to a non-Zarr source
 """
 class TestFooExternalLinkHDF5ToZarr(MixinTestCaseConvert, TestCase):

From e3ac387f91c9434e311d50792ed308e42e2aac9e Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Sun, 31 Dec 2023 20:47:07 -0800
Subject: [PATCH 07/17] Update warning check in failing unit test

---
 tests/unit/base_tests_zarrio.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/unit/base_tests_zarrio.py b/tests/unit/base_tests_zarrio.py
index 98123ff1..693f9ff5 100644
--- a/tests/unit/base_tests_zarrio.py
+++ b/tests/unit/base_tests_zarrio.py
@@ -1579,7 +1579,8 @@ def close(self):

         with OtherIO(manager=get_foo_buildmanager()) as read_io:
             with ZarrIO(self.store[1], mode='w') as export_io:
-                msg = "Cannot export from non-Zarr backend OtherIO to Zarr with write argument link_data=True."
+                msg = ("Cannot export from non-Zarr backend OtherIO to Zarr with write argument link_data=True. "
+                       "Set write_args={'link_data': False}")
                 with self.assertRaisesWith(UnsupportedOperation, msg):
                     export_io.export(src_io=read_io, container=foofile)

From 971dfce6d6dee837d1b5139d8efcb56f280c272b Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Sun, 31 Dec 2023 23:28:44 -0800
Subject: [PATCH 08/17] Fix bytes fill value

---
 src/hdmf_zarr/utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/hdmf_zarr/utils.py b/src/hdmf_zarr/utils.py
index e52106d6..c0870c07 100644
--- a/src/hdmf_zarr/utils.py
+++ b/src/hdmf_zarr/utils.py
@@ -486,6 +486,8 @@ def from_h5py_dataset(h5dataset, **kwargs):
         """
         filters = ZarrDataIO.hdf5_to_zarr_filters(h5dataset)
         fillvalue = h5dataset.fillvalue if 'fillvalue' not in kwargs else kwargs.pop('fillvalue')
+        if isinstance(fillvalue, bytes):  # bytes are not JSON serializable so use string instead
+            fillvalue = str(fillvalue)
         chunks = h5dataset.chunks if 'chunks' not in kwargs else kwargs.pop('chunks')
         re = ZarrDataIO(
             data=h5dataset,

From 111c78c790b596304f73a969389222a19a072108 Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Sun, 31 Dec 2023 23:34:46 -0800
Subject: [PATCH 09/17] Fix sphinx docs warning

---
 src/hdmf_zarr/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/hdmf_zarr/utils.py b/src/hdmf_zarr/utils.py
index c0870c07..cf735b60 100644
--- a/src/hdmf_zarr/utils.py
+++ b/src/hdmf_zarr/utils.py
@@ -480,7 +480,7 @@ def from_h5py_dataset(h5dataset, **kwargs):

         :param dataset: h5py.Dataset object that should be wrapped
         :type dataset: h5py.Dataset
-        :param **kwargs: Other keyword arguments to pass to ZarrDataIO.__init__
+        :param kwargs: Other keyword arguments to pass to ZarrDataIO.__init__

         :returns: ZarrDataIO object wrapping the dataset
         """

From 879055b58f66e48467aa28cf61ecff1a1c94bb5b Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Sun, 31 Dec 2023 23:38:24 -0800
Subject: [PATCH 10/17] Fix docstring

---
 src/hdmf_zarr/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/hdmf_zarr/utils.py b/src/hdmf_zarr/utils.py
index cf735b60..2143e754 100644
--- a/src/hdmf_zarr/utils.py
+++ b/src/hdmf_zarr/utils.py
@@ -467,7 +467,7 @@ def link_data(self) -> bool:

     @property
     def io_settings(self) -> dict:
-        """"""
+        """Dict with the io settings to use"""
         return self.__iosettings

     @staticmethod

From 5d74be75217b7917af39b6a33726fe3f1c56383f Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Sun, 31 Dec 2023 23:42:56 -0800
Subject: [PATCH 11/17] Update Changelog

---
 CHANGELOG.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ddd55ba8..358ad12e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # HDMF-ZARR Changelog

+## 0.6.0 (Upcoming)
+
+### Enhancements
+* Enhanced `ZarrIO` and `ZarrDataIO` to infer io settings (e.g., chunking and compression) from HDF5 datasets to preserve storage settings on export if possible @oruebel [#153](https://github.com/hdmf-dev/hdmf-zarr/pull/153)
+
 ## 0.5.0 (December 8, 2023)

 ### Enhancements
From ea2c9917a668cb83636a8a79f4c7e56438be298f Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Sun, 31 Dec 2023 23:50:14 -0800
Subject: [PATCH 12/17] Fix ruff

---
 src/hdmf_zarr/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/hdmf_zarr/utils.py b/src/hdmf_zarr/utils.py
index 2143e754..328f4caa 100644
--- a/src/hdmf_zarr/utils.py
+++ b/src/hdmf_zarr/utils.py
@@ -538,6 +538,7 @@ def hdf5_to_zarr_filters(h5dataset) -> list:
             else:
                 warn(f"{h5dataset.name} HDF5 filter id {filter_id} with properties {properties} ignored in Zarr.")
         return filters
+
     @staticmethod
     def is_h5py_dataset(obj):
         """Check if the object is an instance of h5py.Dataset without requiring import of h5py"""

From 2fddbb9ee6db8add9a9d94242b5e4b149e91d514 Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Thu, 11 Jan 2024 00:42:26 -0800
Subject: [PATCH 13/17] Add hdf5plugin as a dependency for the tests

---
 requirements-dev.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements-dev.txt b/requirements-dev.txt
index cb72d345..2f81d255 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -8,3 +8,4 @@ pytest==7.1.2
 pytest-cov==3.0.0
 python-dateutil==2.8.2
 tox==3.25.1
+hdf5plugin==4.3.0  # hdf5plugin is used to test conversion of plugin filters

From 1ebb04a8b681402fc127b8e1e69b1698159b31de Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Thu, 11 Jan 2024 00:43:41 -0800
Subject: [PATCH 14/17] Add unit tests for mapping HDF5 IO filters to Zarr filters via ZarrDataIO

---
 src/hdmf_zarr/utils.py        |   5 +-
 tests/unit/test_io_convert.py |   2 +-
 tests/unit/test_zarrdataio.py | 226 ++++++++++++++++++++++++++++++++++
 3 files changed, 229 insertions(+), 4 deletions(-)
 create mode 100644 tests/unit/test_zarrdataio.py

diff --git a/src/hdmf_zarr/utils.py b/src/hdmf_zarr/utils.py
index 328f4caa..bde22296 100644
--- a/src/hdmf_zarr/utils.py
+++ b/src/hdmf_zarr/utils.py
@@ -505,7 +505,7 @@ def hdf5_to_zarr_filters(h5dataset) -> list:
         # Check for unsupported filters
         if h5dataset.scaleoffset:
             # TODO: translate to numcodecs.fixedscaleoffset.FixedScaleOffset()
-            warn( f" {h5dataset.name} HDF5 scaleoffset filter ignored in Zarr")
+            warn( f"{h5dataset.name} HDF5 scaleoffset filter ignored in Zarr")
         if h5dataset.compression in ("szip", "lzf"):
             warn(f"{h5dataset.name} HDF5 szip or lzf compression ignored in Zarr")
         # Add the shuffle filter if possible
@@ -529,8 +529,7 @@ def hdf5_to_zarr_filters(h5dataset) -> list:
             elif filter_id_str == "gzip":
                 filters.append(numcodecs.Zlib(level=properties))
             elif filter_id_str == "32004":
-                warn(f"{h5dataset.name} HDF5 lz4 compression ignored in Zarr"
-                              )
+                warn(f"{h5dataset.name} HDF5 lz4 compression ignored in Zarr")
             elif filter_id_str == "32008":
                 warn(f"{h5dataset.name} HDF5 bitshuffle compression ignored in Zarr")
             elif filter_id_str == "shuffle":  # already handled above
diff --git a/tests/unit/test_io_convert.py b/tests/unit/test_io_convert.py
index db1a446b..b7f119a2 100644
--- a/tests/unit/test_io_convert.py
+++ b/tests/unit/test_io_convert.py
@@ -844,7 +844,6 @@ def tearDown(self):
                 shutil.rmtree(fn)
             else:
                 os.remove(fn)
-        self.filenames = []

     def __roundtrip_data(self, data):
         """Sets the variables self.out_container, self.read_container"""
@@ -905,6 +904,7 @@ def test_gzip(self):
         self.assertTupleEqual((10,), read_array.chunks)


+
 # TODO: Fails because we need to copy the data from the ExternalLink as it points to a non-Zarr source
 """
 class TestFooExternalLinkHDF5ToZarr(MixinTestCaseConvert, TestCase):
diff --git a/tests/unit/test_zarrdataio.py b/tests/unit/test_zarrdataio.py
new file mode 100644
index 00000000..8e732201
--- /dev/null
+++ b/tests/unit/test_zarrdataio.py
@@ -0,0 +1,226 @@
+"""
+Module for testing the ZarrDataIO class.
+
+Many of the functions of ZarrDataIO are covered in tests related to ZarrIO and data conversion,
+such as test_io_convert.TestHDF5toZarrWithFilters. However, those tests are in the context of
+more complex operations and are more akin to integration tests. This module focuses on tests for
+specific unit functions of ZarrDataIO.
+"""
+import numcodecs
+import h5py
+import os
+import shutil
+import unittest
+
+import numpy as np
+
+try:
+    import hdf5plugin
+    HDF5PLUGIN = True
+except ImportError:
+    HDF5PLUGIN = False
+from hdmf.testing import TestCase
+from hdmf_zarr.utils import ZarrDataIO
+from tests.unit.utils import get_temp_filepath
+
+class TestZarrDataIO(TestCase):
+    """Test the ZarrDataIO class"""
+    def setUp(self):
+        self.hdf_filename = get_temp_filepath()
+        self.zarr_filename = get_temp_filepath()
+
+    def tearDown(self):
+        # clean up any opened files
+        for fn in [self.hdf_filename, self.zarr_filename]:
+            if fn is not None and os.path.exists(fn):
+                if os.path.isdir(fn):
+                    shutil.rmtree(fn)
+                else:
+                    os.remove(fn)
+
+    def test_hdf5_to_zarr_filters_scaleoffset(self):
+        """Test that we warn when the scaleoffset filter is being used in HDF5 in ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create a test HDF5 dataset with scaleoffset
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset = h5file.create_dataset(name='test_dset', data=[1,2,3,4,5], scaleoffset=10)
+        # test that we warn due to the scaleoffset
+        msg = "/test_dset HDF5 scaleoffset filter ignored in Zarr"
+        with self.assertWarnsWith(UserWarning, msg):
+            filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset)
+        self.assertEqual(len(filters), 0)
+        # Close the HDF5 file
+        h5file.close()
+
+    def test_hdf5_to_zarr_filters_lzf(self):
+        """Test that we warn when the lzf filter is being used in HDF5 in ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create a test HDF5 dataset with scaleoffset
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset = h5file.create_dataset(name='test_dset', data=[1, 2, 3, 4, 5], compression="lzf")
+        # test that we warn due to the scaleoffset
+        msg = "/test_dset HDF5 szip or lzf compression ignored in Zarr"
+        with self.assertWarnsWith(UserWarning, msg):
+            filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset)
+        self.assertEqual(len(filters), 0)
+        # Close the HDF5 file
+        h5file.close()
+
+    @unittest.skipIf(not HDF5PLUGIN, "hdf5_plugin not installed")
+    def test_hdf5_to_zarr_filters_lz4(self):
+        """Test that we warn when the lz4 filter is being used in HDF5 in ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create a test HDF5 dataset with scaleoffset
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset = h5file.create_dataset(name='test_dset',
+                                       data=[1, 2, 3, 4, 5],
+                                       **hdf5plugin.LZ4())
+        # test that we warn due to the scaleoffset
+        msg = "/test_dset HDF5 lz4 compression ignored in Zarr"
+        with self.assertWarnsWith(UserWarning, msg):
+            filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset)
+        self.assertEqual(len(filters), 0)
+        # Close the HDF5 file
+        h5file.close()
+
+    @unittest.skipIf(not HDF5PLUGIN, "hdf5_plugin not installed")
+    def test_hdf5_to_zarr_filters_bitshuffle(self):
+        """Test that we warn when the bitshuffle filter is being used in HDF5 in ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create a test HDF5 dataset with scaleoffset
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset = h5file.create_dataset(name='test_dset',
+                                       data=[1, 2, 3, 4, 5],
+                                       **hdf5plugin.Bitshuffle(nelems=0, lz4=True))
+        # test that we warn due to the scaleoffset
+        msg = "/test_dset HDF5 bitshuffle compression ignored in Zarr"
+        with self.assertWarnsWith(UserWarning, msg):
+            filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset)
+        self.assertEqual(len(filters), 0)
+        # Close the HDF5 file
+        h5file.close()
+
+    @unittest.skipIf(not HDF5PLUGIN, "hdf5_plugin not installed")
+    def test_hdf5_to_zarr_filters_other_unsupported(self):
+        """Test that we warn when other unsupported filter is used in HDF5 in ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create a test HDF5 dataset with scaleoffset
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset_FciDecomp = h5file.create_dataset(
+            name='test_fcidecomp',
+            data=[1, 2, 3, 4, 5],
+            **hdf5plugin.FciDecomp())
+        h5dset_SZ3 = h5file.create_dataset(
+            name='test_sz3',
+            data=[1, 2, 3, 4, 5],
+            **hdf5plugin.SZ3(absolute=0.1))
+        # test that we warn due to the FciDecomp
+        msg = "/test_fcidecomp HDF5 filter id 32018 with properties None ignored in Zarr."
+        with self.assertWarnsWith(UserWarning, msg):
+            filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset_FciDecomp)
+        self.assertEqual(len(filters), 0)
+        # test that we warn due to the SZ3
+        msg = ("/test_sz3 HDF5 filter id 32024 with properties "
+               "(1, 9, 0, 5, 0, 1069128089, 2576980378, 0, 0, 0, 0, 0, 0) ignored in Zarr.")
+        with self.assertWarnsWith(UserWarning, msg):
+            filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset_SZ3)
+        self.assertEqual(len(filters), 0)
+        # Close the HDF5 file
+        h5file.close()
+
+    def test_hdf5_to_zarr_filters_shuffle(self):
+        """Test HDF5 shuffle filter works with ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create a test HDF5 dataset with scaleoffset
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset_int = h5file.create_dataset(name='test_int', data=np.arange(5, dtype='int32'), shuffle=True)
+        h5dset_float = h5file.create_dataset(name='test_float', data=np.arange(5, dtype='float32'), shuffle=True)
+        # test that we apply shuffle filter on int data
+        filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset_int)
+        self.assertEqual(len(filters), 1)
+        self.assertIsInstance(filters[0], numcodecs.Shuffle)
+        # test that we apply shuffle filter on float data
+        filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset_float)
+        self.assertEqual(len(filters), 1)
+        self.assertIsInstance(filters[0], numcodecs.Shuffle)
+        h5file.close()
+
+    @unittest.skipIf(not HDF5PLUGIN, "hdf5_plugin not installed")
+    def test_hdf5_to_zarr_filters_blosclz(self):
+        """Test HDF5 blosclz filter works with ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create a test HDF5 dataset with scaleoffset
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset = h5file.create_dataset(
+            name='test_int',
+            data=np.arange(100, dtype='float32'),
+            **hdf5plugin.Blosc(cname='blosclz', clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE)
+        )
+        # test that we apply shuffle filter on int data
+        filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset)
+        self.assertEqual(len(filters), 1)
+        self.assertIsInstance(filters[0], numcodecs.Blosc)
+        self.assertEqual(filters[0].cname, 'blosclz')
+        self.assertEqual(filters[0].clevel, 9)
+        self.assertEqual(filters[0].shuffle, hdf5plugin.Blosc.SHUFFLE)
+        h5file.close()
+
+    @unittest.skipIf(not HDF5PLUGIN, "hdf5_plugin not installed")
+    def test_hdf5_to_zarr_filters_zstd(self):
+        """Test HDF5 zstd filter works with ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create a test HDF5 dataset with scaleoffset
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset = h5file.create_dataset(
+            name='test_int',
+            data=np.arange(100, dtype='float32'),
+            **hdf5plugin.Zstd(clevel=22)
+        )
+        # test that we apply shuffle filter on int data
+        filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset)
+        self.assertEqual(len(filters), 1)
+        self.assertIsInstance(filters[0], numcodecs.Zstd)
+        self.assertEqual(filters[0].level, 22)
+        # Close the HDF5 file
+        h5file.close()
+
+    def test_hdf5_to_zarr_filters_gzip(self):
+        """Test HDF5 gzip filter works with ZarrDataIO.hdf5_to_zarr_filters."""
+        # Create a test HDF5 dataset with scaleoffset
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset = h5file.create_dataset(
+            name='test_int',
+            data=np.arange(100, dtype='float32'),
+            compression='gzip',
+            compression_opts=2
+        )
+        # test that we apply shuffle filter on int data
+        filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset)
+        self.assertEqual(len(filters), 1)
+        self.assertIsInstance(filters[0], numcodecs.Zlib)
+        self.assertEqual(filters[0].level, 2)
+        # Close the HDF5 file
+        h5file.close()
+
+    def test_is_h5py_dataset(self):
+        """Test ZarrDataIO.is_h5py_dataset"""
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        arr=np.arange(10)
+        h5dset = h5file.create_dataset(name='test', data=arr)
+        self.assertTrue(ZarrDataIO.is_h5py_dataset(h5dset))
+        self.assertFalse(ZarrDataIO.is_h5py_dataset(arr))
+
+    def test_from_h5py_dataset(self):
+        """Test ZarrDataIO.from_h5py_dataset"""
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        h5dset = h5file.create_dataset(
+            name='test',
+            data=np.arange(1000).reshape((10,100)),
+            compression='gzip',
+            compression_opts=6,
+            shuffle=True,
+            fillvalue=100,
+            chunks=(5,10))
+        re_zarrdataio = ZarrDataIO.from_h5py_dataset(h5dset)
+        # Test that all settings are being preserved when creating the ZarrDataIO object
+        self.assertIsInstance(re_zarrdataio, ZarrDataIO)
+        self.assertEqual(re_zarrdataio.data, h5dset)
+        self.assertEqual(re_zarrdataio.fillvalue, 100)
+        self.assertEqual(re_zarrdataio.chunks, (5,10))
+        self.assertEqual(len(re_zarrdataio.io_settings['filters']), 2)
+        self.assertIsInstance(re_zarrdataio.io_settings['filters'][0], numcodecs.Shuffle)
+        self.assertIsInstance(re_zarrdataio.io_settings['filters'][1], numcodecs.Zlib)
+        # Close the HDF5 file
+        h5file.close()
\ No newline at end of file

From 4935ab9cdaa6a9dfa353789c5f3fd373f73c3df1 Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Thu, 11 Jan 2024 00:50:54 -0800
Subject: [PATCH 15/17] Remove test case with system-dependent warning message

---
 tests/unit/test_zarrdataio.py | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/tests/unit/test_zarrdataio.py b/tests/unit/test_zarrdataio.py
index 8e732201..0fc2f88c 100644
--- a/tests/unit/test_zarrdataio.py
+++ b/tests/unit/test_zarrdataio.py
@@ -98,28 +98,21 @@ def test_hdf5_to_zarr_filters_bitshuffle(self):
         h5file.close()

     @unittest.skipIf(not HDF5PLUGIN, "hdf5_plugin not installed")
     def test_hdf5_to_zarr_filters_other_unsupported(self):
-        """Test that we warn when other unsupported filter is used in HDF5 in ZarrDataIO.hdf5_to_zarr_filters."""
+        """
+        Test that we warn when an unsupported filter is used in HDF5 with ZarrDataIO.hdf5_to_zarr_filters.
+        This test is to ensure that the catch-all at the end of the loop works.
+        """
         # Create a test HDF5 dataset with scaleoffset
         h5file = h5py.File(self.hdf_filename, mode='a')
         h5dset_FciDecomp = h5file.create_dataset(
             name='test_fcidecomp',
             data=[1, 2, 3, 4, 5],
             **hdf5plugin.FciDecomp())
-        h5dset_SZ3 = h5file.create_dataset(
-            name='test_sz3',
-            data=[1, 2, 3, 4, 5],
-            **hdf5plugin.SZ3(absolute=0.1))
         # test that we warn due to the FciDecomp
         msg = "/test_fcidecomp HDF5 filter id 32018 with properties None ignored in Zarr."
         with self.assertWarnsWith(UserWarning, msg):
             filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset_FciDecomp)
         self.assertEqual(len(filters), 0)
-        # test that we warn due to the SZ3
-        msg = ("/test_sz3 HDF5 filter id 32024 with properties "
-               "(1, 9, 0, 5, 0, 1069128089, 2576980378, 0, 0, 0, 0, 0, 0) ignored in Zarr.")
-        with self.assertWarnsWith(UserWarning, msg):
-            filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset_SZ3)
-        self.assertEqual(len(filters), 0)
         # Close the HDF5 file
         h5file.close()
From b5c93227d1760399ebcd6e5ebb2fa923c22ed165 Mon Sep 17 00:00:00 2001
From: Oliver Ruebel
Date: Thu, 11 Jan 2024 01:26:44 -0800
Subject: [PATCH 16/17] Add unit test for conversion of byte fillvalue to string

---
 src/hdmf_zarr/utils.py        |  8 ++++----
 tests/unit/test_zarrdataio.py | 18 ++++++++++++++++++
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/src/hdmf_zarr/utils.py b/src/hdmf_zarr/utils.py
index bde22296..19bf75b9 100644
--- a/src/hdmf_zarr/utils.py
+++ b/src/hdmf_zarr/utils.py
@@ -485,14 +485,14 @@ def from_h5py_dataset(h5dataset, **kwargs):
         :returns: ZarrDataIO object wrapping the dataset
         """
         filters = ZarrDataIO.hdf5_to_zarr_filters(h5dataset)
-        fillvalue = h5dataset.fillvalue if 'fillvalue' not in kwargs else kwargs.pop('fillvalue')
-        if isinstance(fillvalue, bytes):  # bytes are not JSON serializable so use string instead
-            fillvalue = str(fillvalue)
+        fillval = h5dataset.fillvalue if 'fillvalue' not in kwargs else kwargs.pop('fillvalue')
+        if isinstance(fillval, bytes):  # bytes are not JSON serializable so use string instead
+            fillval = fillval.decode("utf-8")
         chunks = h5dataset.chunks if 'chunks' not in kwargs else kwargs.pop('chunks')
         re = ZarrDataIO(
             data=h5dataset,
             filters=filters,
-            fillvalue=fillvalue,
+            fillvalue=fillval,
             chunks=chunks,
             **kwargs)
         return re
diff --git a/tests/unit/test_zarrdataio.py b/tests/unit/test_zarrdataio.py
index 0fc2f88c..bb5a1fe2 100644
--- a/tests/unit/test_zarrdataio.py
+++ b/tests/unit/test_zarrdataio.py
@@ -216,4 +216,22 @@ def test_from_h5py_dataset(self):
         self.assertIsInstance(re_zarrdataio.io_settings['filters'][0], numcodecs.Shuffle)
         self.assertIsInstance(re_zarrdataio.io_settings['filters'][1], numcodecs.Zlib)
         # Close the HDF5 file
+        h5file.close()
+
+    def test_from_h5py_datase_bytes_fillvaluet(self):
+        """
+        Test ZarrDataIO.from_h5py_dataset with a fillvalue that is in bytes, which needs to be handled
+        separately since bytes are not JSON serializable by default
+        """
+        h5file = h5py.File(self.hdf_filename, mode='a')
+        # print(np.arange(10, dtype=np.int8).tobytes())
+        h5dset = h5file.create_dataset(
+            name='test_str',
+            data=[b'hello', b'world', b'go'],
+            fillvalue=b'None')
+        re_zarrdataio = ZarrDataIO.from_h5py_dataset(h5dset)
+        # Test that all settings are being preserved when creating the ZarrDataIO object
+        self.assertIsInstance(re_zarrdataio, ZarrDataIO)
+        self.assertEqual(re_zarrdataio.io_settings['fill_value'], str("None"))
+        # Close the HDF5 file
 h5file.close()
\ No newline at end of file

From 4bdb47abe596a8c1ebeb90b7699def4c462dd01c Mon Sep 17 00:00:00 2001
From: Ryan Ly
Date: Thu, 11 Jan 2024 11:03:25 -0800
Subject: [PATCH 17/17] Update tests/unit/test_zarrdataio.py

---
 tests/unit/test_zarrdataio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/test_zarrdataio.py b/tests/unit/test_zarrdataio.py
index bb5a1fe2..d6070784 100644
--- a/tests/unit/test_zarrdataio.py
+++ b/tests/unit/test_zarrdataio.py
@@ -218,7 +218,7 @@ def test_from_h5py_dataset(self):
         # Close the HDF5 file
         h5file.close()

-    def test_from_h5py_datase_bytes_fillvaluet(self):
+    def test_from_h5py_dataset_bytes_fillvalue(self):
         """
         Test ZarrDataIO.from_h5py_dataset with a fillvalue that is in bytes, which needs to be handled
         separately since bytes are not JSON serializable by default
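[Note] The behavior pinned down by the last two patches can be summarized in a few lines; a sketch in which the file and dataset names are illustrative:

    import h5py
    from hdmf_zarr.utils import ZarrDataIO

    h5file = h5py.File('example.h5', mode='a')
    h5dset = h5file.create_dataset(name='test_str', data=[b'hello', b'world'], fillvalue=b'None')
    wrapped = ZarrDataIO.from_h5py_dataset(h5dset)
    # the bytes fill value is decoded to a JSON-serializable string
    assert wrapped.io_settings['fill_value'] == 'None'
    h5file.close()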