From 044f5a579a960b9228f602f2c9380414b216df32 Mon Sep 17 00:00:00 2001
From: Matthew Avaylon
Date: Wed, 4 Sep 2024 10:28:09 -0700
Subject: [PATCH 1/3] Update CHANGELOG.md (#1184)

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 97d89e320..cb51dc538 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,6 @@
 # HDMF Changelog
 
-## HDMF 3.14.4 (August 22, 2024)
+## HDMF 3.14.4 (September 4, 2024)
 
 ### Enhancements
 - Added support to append to a dataset of references for HDMF-Zarr. @mavaylon1 [#1157](https://github.com/hdmf-dev/hdmf/pull/1157)

From 874db31fb171577bfc0c962708786f9774ea6b1b Mon Sep 17 00:00:00 2001
From: Ryan Ly
Date: Fri, 13 Sep 2024 10:58:25 -0700
Subject: [PATCH 2/3] Fix handling of read h5py string datasets (#1189)

---
 CHANGELOG.md                          |   6 ++
 src/hdmf/build/objectmapper.py        |   5 +-
 src/hdmf/utils.py                     |   2 +-
 tests/unit/build_tests/test_io_map.py | 131 +++++++++++++++++++++++++-
 4 files changed, 141 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cb51dc538..c7919c81b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # HDMF Changelog
 
+## HDMF 3.14.5 (September 6, 2024)
+
+### Bug fixes
+- Fixed bug, introduced in 3.14.4, in writing string arrays that were read from an HDF5 file back to an HDF5 file. @rly @stephprince
+  [#1189](https://github.com/hdmf-dev/hdmf/pull/1189)
+
 ## HDMF 3.14.4 (September 4, 2024)
 
 ### Enhancements

diff --git a/src/hdmf/build/objectmapper.py b/src/hdmf/build/objectmapper.py
index 3e8d835f1..83df1b427 100644
--- a/src/hdmf/build/objectmapper.py
+++ b/src/hdmf/build/objectmapper.py
@@ -602,7 +602,10 @@ def __get_data_type(cls, spec):
     def __convert_string(self, value, spec):
         """Convert string types to the specified dtype."""
         def __apply_string_type(value, string_type):
-            if isinstance(value, (list, tuple, np.ndarray, DataIO)):
+            # NOTE: if a user passes an h5py.Dataset that is not wrapped with an hdmf.utils.StrDataset,
+            # then this conversion may not be correct. Users should unpack their string h5py.Datasets
+            # into a numpy array (or wrap them in StrDataset) before passing them to a container object.
+            if hasattr(value, '__iter__') and not isinstance(value, (str, bytes)):
                 return [__apply_string_type(item, string_type) for item in value]
             else:
                 return string_type(value)
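The NOTE above warns that raw h5py string datasets should not be handed directly to a container. A minimal sketch of the two workarounds it names — unpacking into a numpy array, or wrapping in StrDataset — assuming h5py 3+; the file and dataset names are placeholders:

    import h5py
    import numpy as np
    from hdmf.utils import StrDataset

    # In-memory file standing in for a previously written HDF5 file.
    with h5py.File("strings.h5", "w", driver="core", backing_store=False) as f:
        raw = f.create_dataset(
            "data", data=np.array(["aa", "bb"], dtype=h5py.special_dtype(vlen=str))
        )
        as_array = raw.asstr()[:]        # option 1: numpy array of Python strings
        wrapped = StrDataset(raw, None)  # option 2: decode lazily, as HDF5IO does on read

Either form can then be safely passed to a text-typed container field.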
diff --git a/src/hdmf/utils.py b/src/hdmf/utils.py
index 5e0b61539..50db79c40 100644
--- a/src/hdmf/utils.py
+++ b/src/hdmf/utils.py
@@ -1140,7 +1140,7 @@ def update(self, other):
 
 @docval_macro('array_data')
 class StrDataset(h5py.Dataset):
-    """Wrapper to decode strings on reading the dataset"""
+    """Wrapper to decode strings on reading the dataset. Use only for h5py 3+."""
     def __init__(self, dset, encoding, errors='strict'):
         self.dset = dset
         if encoding is None:
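To make the decode-on-read behavior named in the docstring concrete, a short sketch under h5py 3+, assuming the dataset uses the default utf-8 encoding (the file name is a placeholder):

    import h5py
    from hdmf.utils import StrDataset

    with h5py.File("demo.h5", "w", driver="core", backing_store=False) as f:
        dset = f.create_dataset("data", data=["a", "b"])
        assert dset[0] == b"a"                   # h5py 3+ returns bytes
        assert StrDataset(dset, None)[0] == "a"  # the wrapper returns str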
diff --git a/tests/unit/build_tests/test_io_map.py b/tests/unit/build_tests/test_io_map.py
index e095ef318..730530a5a 100644
--- a/tests/unit/build_tests/test_io_map.py
+++ b/tests/unit/build_tests/test_io_map.py
@@ -1,4 +1,4 @@
-from hdmf.utils import docval, getargs
+from hdmf.utils import StrDataset, docval, getargs
 from hdmf import Container, Data
 from hdmf.backends.hdf5 import H5DataIO
 from hdmf.build import (GroupBuilder, DatasetBuilder, ObjectMapper, BuildManager, TypeMap, LinkBuilder,
@@ -7,12 +7,15 @@
 from hdmf.spec import (GroupSpec, AttributeSpec, DatasetSpec, SpecCatalog, SpecNamespace, NamespaceCatalog, RefSpec,
                        LinkSpec)
 from hdmf.testing import TestCase
+import h5py
 from abc import ABCMeta, abstractmethod
 import unittest
 import numpy as np
 
 from tests.unit.helpers.utils import CORE_NAMESPACE, create_test_type_map
 
+H5PY_3 = h5py.__version__.startswith('3')
+
 
 class Bar(Container):
@@ -460,6 +463,132 @@ def test_build_3d_ndarray(self):
         np.testing.assert_array_equal(builder.get('data').data, str_array_3d)
         np.testing.assert_array_equal(builder.get('attr_array'), str_array_3d)
 
+    @unittest.skipIf(not H5PY_3, "Use StrDataset only for h5py 3+")
+    def test_build_1d_h5py_3_dataset(self):
+        bar_spec = GroupSpec(
+            doc='A test group specification with a data type',
+            data_type_def='Bar',
+            datasets=[
+                DatasetSpec(
+                    doc='an example dataset',
+                    dtype='text',
+                    name='data',
+                    shape=(None, ),
+                    attributes=[AttributeSpec(name='attr2', doc='an example integer attribute', dtype='int')],
+                )
+            ],
+            attributes=[AttributeSpec(name='attr_array', doc='an example array attribute', dtype='text',
+                                      shape=(None, ))],
+        )
+        type_map = self.customSetUp(bar_spec)
+        type_map.register_map(Bar, BarMapper)
+        # create in-memory hdf5 file that is discarded after closing
+        with h5py.File("test.h5", "w", driver="core", backing_store=False) as f:
+            str_array_1d = np.array(
+                ['aa', 'bb', 'cc', 'dd'],
+                dtype=h5py.special_dtype(vlen=str)
+            )
+            # wrap the dataset in a StrDataset to mimic how HDF5IO would read this dataset with h5py 3+
+            dataset = StrDataset(f.create_dataset('data', data=str_array_1d), None)
+            bar_inst = Bar('my_bar', dataset, 'value1', 10, attr_array=dataset)
+            builder = type_map.build(bar_inst)
+            np.testing.assert_array_equal(builder.get('data').data, dataset[:])
+            np.testing.assert_array_equal(builder.get('attr_array'), dataset[:])
+
+    @unittest.skipIf(not H5PY_3, "Use StrDataset only for h5py 3+")
+    def test_build_3d_h5py_3_dataset(self):
+        bar_spec = GroupSpec(
+            doc='A test group specification with a data type',
+            data_type_def='Bar',
+            datasets=[
+                DatasetSpec(
+                    doc='an example dataset',
+                    dtype='text',
+                    name='data',
+                    shape=(None, None, None),
+                    attributes=[AttributeSpec(name='attr2', doc='an example integer attribute', dtype='int')],
+                )
+            ],
+            attributes=[AttributeSpec(name='attr_array', doc='an example array attribute', dtype='text',
+                                      shape=(None, None, None))],
+        )
+        type_map = self.customSetUp(bar_spec)
+        type_map.register_map(Bar, BarMapper)
+        # create in-memory hdf5 file that is discarded after closing
+        with h5py.File("test.h5", "w", driver="core", backing_store=False) as f:
+            str_array_3d = np.array(
+                [[['aa', 'bb'], ['cc', 'dd']], [['ee', 'ff'], ['gg', 'hh']]],
+                dtype=h5py.special_dtype(vlen=str)
+            )
+            # wrap the dataset in a StrDataset to mimic how HDF5IO would read this dataset with h5py 3+
+            dataset = StrDataset(f.create_dataset('data', data=str_array_3d), None)
+            bar_inst = Bar('my_bar', dataset, 'value1', 10, attr_array=dataset)
+            builder = type_map.build(bar_inst)
+            np.testing.assert_array_equal(builder.get('data').data, dataset[:])
+            np.testing.assert_array_equal(builder.get('attr_array'), dataset[:])
+
+    @unittest.skipIf(H5PY_3, "Create dataset differently for h5py < 3")
+    def test_build_1d_h5py_2_dataset(self):
+        bar_spec = GroupSpec(
+            doc='A test group specification with a data type',
+            data_type_def='Bar',
+            datasets=[
+                DatasetSpec(
+                    doc='an example dataset',
+                    dtype='text',
+                    name='data',
+                    shape=(None, ),
+                    attributes=[AttributeSpec(name='attr2', doc='an example integer attribute', dtype='int')],
+                )
+            ],
+            attributes=[AttributeSpec(name='attr_array', doc='an example array attribute', dtype='text',
+                                      shape=(None, ))],
+        )
+        type_map = self.customSetUp(bar_spec)
+        type_map.register_map(Bar, BarMapper)
+        # create in-memory hdf5 file that is discarded after closing
+        with h5py.File("test.h5", "w", driver="core", backing_store=False) as f:
+            str_array_1d = np.array(
+                ['aa', 'bb', 'cc', 'dd'],
+                dtype=h5py.special_dtype(vlen=str)
+            )
+            dataset = f.create_dataset('data', data=str_array_1d)
+            bar_inst = Bar('my_bar', dataset, 'value1', 10, attr_array=dataset)
+            builder = type_map.build(bar_inst)
+            np.testing.assert_array_equal(builder.get('data').data, dataset[:])
+            np.testing.assert_array_equal(builder.get('attr_array'), dataset[:])
+
+    @unittest.skipIf(H5PY_3, "Create dataset differently for h5py < 3")
+    def test_build_3d_h5py_2_dataset(self):
+        bar_spec = GroupSpec(
+            doc='A test group specification with a data type',
+            data_type_def='Bar',
+            datasets=[
+                DatasetSpec(
+                    doc='an example dataset',
+                    dtype='text',
+                    name='data',
+                    shape=(None, None, None),
+                    attributes=[AttributeSpec(name='attr2', doc='an example integer attribute', dtype='int')],
+                )
+            ],
+            attributes=[AttributeSpec(name='attr_array', doc='an example array attribute', dtype='text',
+                                      shape=(None, None, None))],
+        )
+        type_map = self.customSetUp(bar_spec)
+        type_map.register_map(Bar, BarMapper)
+        # create in-memory hdf5 file that is discarded after closing
+        with h5py.File("test.h5", "w", driver="core", backing_store=False) as f:
+            str_array_3d = np.array(
+                [[['aa', 'bb'], ['cc', 'dd']], [['ee', 'ff'], ['gg', 'hh']]],
+                dtype=h5py.special_dtype(vlen=str)
+            )
+            dataset = f.create_dataset('data', data=str_array_3d)
+            bar_inst = Bar('my_bar', dataset, 'value1', 10, attr_array=dataset)
+            builder = type_map.build(bar_inst)
+            np.testing.assert_array_equal(builder.get('data').data, dataset[:])
+            np.testing.assert_array_equal(builder.get('attr_array'), dataset[:])
+
     def test_build_dataio(self):
         bar_spec = GroupSpec('A test group specification with a data type',
                              data_type_def='Bar',
From aed1de3ec0c26f9fd492abe9d7c9ea1ddb142f7a Mon Sep 17 00:00:00 2001
From: Paul Adkisson
Date: Tue, 17 Sep 2024 08:07:48 +1000
Subject: [PATCH 3/3] Wrap data in set_data_io with a DataChunkIterator to
 support overriding hdf5 dataset backend configurations (#1172)

Co-authored-by: Oliver Ruebel
Co-authored-by: Ryan Ly
---
 CHANGELOG.md                       |  5 ++-
 src/hdmf/container.py              | 48 ++++++++++++++++++++++++---
 tests/unit/test_io_hdf5_h5tools.py | 52 ++++++++++++++++++++++++++++++
 3 files changed, 99 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c7919c81b..7e56cde40 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,9 @@
 # HDMF Changelog
 
-## HDMF 3.14.5 (September 6, 2024)
+## HDMF 3.14.5 (September 17, 2024)
+
+### Enhancements
+- Added support for overriding backend configurations of `h5py.Dataset` objects in `Container.set_data_io`. @pauladkisson [#1172](https://github.com/hdmf-dev/hdmf/pull/1172)
 
 ### Bug fixes
 - Fixed bug, introduced in 3.14.4, in writing string arrays that were read from an HDF5 file back to an HDF5 file. @rly @stephprince

diff --git a/src/hdmf/container.py b/src/hdmf/container.py
index 88a083599..7c450770a 100644
--- a/src/hdmf/container.py
+++ b/src/hdmf/container.py
@@ -2,7 +2,7 @@
 from abc import abstractmethod
 from collections import OrderedDict
 from copy import deepcopy
-from typing import Type
+from typing import Type, Optional
 from uuid import uuid4
 from warnings import warn
 import os
@@ -11,7 +11,7 @@
 import numpy as np
 import pandas as pd
 
-from .data_utils import DataIO, append_data, extend_data
+from .data_utils import DataIO, append_data, extend_data, AbstractDataChunkIterator
 from .utils import docval, get_docval, getargs, ExtenderMeta, get_data_shape, popargs, LabelledDict
 from .term_set import TermSet, TermSetWrapper
 
@@ -826,7 +826,14 @@ def __smart_str_dict(d, num_indent):
             out += '\n' + indent + right_br
         return out
 
-    def set_data_io(self, dataset_name: str, data_io_class: Type[DataIO], data_io_kwargs: dict = None, **kwargs):
+    def set_data_io(
+        self,
+        dataset_name: str,
+        data_io_class: Type[DataIO],
+        data_io_kwargs: dict = None,
+        data_chunk_iterator_class: Optional[Type[AbstractDataChunkIterator]] = None,
+        data_chunk_iterator_kwargs: dict = None, **kwargs
+    ):
         """
         Apply DataIO object to a dataset field of the Container.
 
@@ -838,9 +845,18 @@ def set_data_io(self, dataset_name: str, data_io_class: Type[DataIO], data_io_kw
             Class to use for DataIO, e.g. H5DataIO or ZarrDataIO
         data_io_kwargs: dict
             keyword arguments passed to the constructor of the DataIO class.
+        data_chunk_iterator_class: Type[AbstractDataChunkIterator]
+            Class to use for DataChunkIterator. If None, no DataChunkIterator is used.
+        data_chunk_iterator_kwargs: dict
+            keyword arguments passed to the constructor of the DataChunkIterator class.
         **kwargs:
             DEPRECATED. Use data_io_kwargs instead.
             kwargs are passed to the constructor of the DataIO class.
+
+        Notes
+        -----
+        If data_chunk_iterator_class is not None, the data is wrapped in the DataChunkIterator before being wrapped
+        in the DataIO. This makes it possible to rewrite the backend configuration of HDF5 datasets.
         """
         if kwargs or (data_io_kwargs is None):
             warn(
@@ -851,8 +867,11 @@ def set_data_io(self, dataset_name: str, data_io_class: Type[DataIO], data_io_kw
             )
             data_io_kwargs = kwargs
         data = self.fields.get(dataset_name)
+        data_chunk_iterator_kwargs = data_chunk_iterator_kwargs or dict()
         if data is None:
             raise ValueError(f"{dataset_name} is None and cannot be wrapped in a DataIO class")
+        if data_chunk_iterator_class is not None:
+            data = data_chunk_iterator_class(data=data, **data_chunk_iterator_kwargs)
         self.fields[dataset_name] = data_io_class(data=data, **data_io_kwargs)
 
 
@@ -896,7 +915,13 @@ def set_dataio(self, **kwargs):
         dataio.data = self.__data
         self.__data = dataio
 
-    def set_data_io(self, data_io_class: Type[DataIO], data_io_kwargs: dict) -> None:
+    def set_data_io(
+        self,
+        data_io_class: Type[DataIO],
+        data_io_kwargs: dict,
+        data_chunk_iterator_class: Optional[Type[AbstractDataChunkIterator]] = None,
+        data_chunk_iterator_kwargs: dict = None,
+    ) -> None:
         """
         Apply DataIO object to the data held by this Data object.
 
@@ -906,8 +931,21 @@ def set_data_io(self, data_io_class: Type[DataIO], data_io_kwargs: dict) -> None
             The DataIO to apply to the data held by this Data.
         data_io_kwargs: dict
             The keyword arguments to pass to the DataIO.
+        data_chunk_iterator_class: Type[AbstractDataChunkIterator]
+            The DataChunkIterator to use for the DataIO. If None, no DataChunkIterator is used.
+        data_chunk_iterator_kwargs: dict
+            The keyword arguments to pass to the DataChunkIterator.
+
+        Notes
+        -----
+        If data_chunk_iterator_class is not None, the data is wrapped in the DataChunkIterator before being wrapped
+        in the DataIO. This makes it possible to rewrite the backend configuration of HDF5 datasets.
         """
-        self.__data = data_io_class(data=self.__data, **data_io_kwargs)
+        data_chunk_iterator_kwargs = data_chunk_iterator_kwargs or dict()
+        data = self.__data
+        if data_chunk_iterator_class is not None:
+            data = data_chunk_iterator_class(data=data, **data_chunk_iterator_kwargs)
+        self.__data = data_io_class(data=data, **data_io_kwargs)
 
     @docval({'name': 'func', 'type': types.FunctionType, 'doc': 'a function to transform *data*'})
     def transform(self, **kwargs):
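A usage sketch of the new parameters, mirroring the tests added below; the container class and field name here are hypothetical:

    import h5py
    from hdmf.backends.hdf5 import H5DataIO
    from hdmf.container import Container
    from hdmf.data_utils import DataChunkIterator

    class Foo(Container):  # hypothetical container with a single data field
        __fields__ = ("data",)

    with h5py.File("demo.h5", "w", driver="core", backing_store=False) as f:
        foo = Foo(name="foo")
        foo.data = f.create_dataset("data", data=[1, 2, 3, 4, 5], chunks=(3,))
        # Stream the existing dataset through a DataChunkIterator so its old
        # backend configuration is dropped, then request new chunking on write.
        foo.set_data_io(
            "data",
            H5DataIO,
            data_io_kwargs=dict(chunks=(2,)),
            data_chunk_iterator_class=DataChunkIterator,
        )
        assert foo.data.io_settings["chunks"] == (2,)

Without the iterator, wrapping an h5py.Dataset in H5DataIO would carry the dataset's existing on-disk configuration along to the next write; the iterator yields only the values.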
diff --git a/tests/unit/test_io_hdf5_h5tools.py b/tests/unit/test_io_hdf5_h5tools.py
index 131e4a6de..58119ce9b 100644
--- a/tests/unit/test_io_hdf5_h5tools.py
+++ b/tests/unit/test_io_hdf5_h5tools.py
@@ -3801,6 +3801,11 @@ def __init__(self, **kwargs):
                 self.data2 = kwargs["data2"]
 
         self.obj = ContainerWithData("name", [1, 2, 3, 4, 5], None)
+        self.file_path = get_temp_filepath()
+
+    def tearDown(self):
+        if os.path.exists(self.file_path):
+            os.remove(self.file_path)
 
     def test_set_data_io(self):
         self.obj.set_data_io("data1", H5DataIO, data_io_kwargs=dict(chunks=True))
@@ -3823,6 +3828,31 @@ def test_set_data_io_old_api(self):
         self.assertIsInstance(self.obj.data1, H5DataIO)
         self.assertTrue(self.obj.data1.io_settings["chunks"])
 
+    def test_set_data_io_h5py_dataset(self):
+        file = File(self.file_path, 'w')
+        data = file.create_dataset('data', data=[1, 2, 3, 4, 5], chunks=(3,))
+
+        class ContainerWithData(Container):
+            __fields__ = ('data',)
+
+            @docval(
+                {"name": "name", "doc": "name", "type": str},
+                {'name': 'data', 'doc': 'field1 doc', 'type': h5py.Dataset},
+            )
+            def __init__(self, **kwargs):
+                super().__init__(name=kwargs["name"])
+                self.data = kwargs["data"]
+
+        container = ContainerWithData("name", data)
+        container.set_data_io(
+            "data",
+            H5DataIO,
+            data_io_kwargs=dict(chunks=(2,)),
+            data_chunk_iterator_class=DataChunkIterator,
+        )
+
+        self.assertIsInstance(container.data, H5DataIO)
+        self.assertEqual(container.data.io_settings["chunks"], (2,))
+        file.close()
 
 class TestDataSetDataIO(TestCase):
 
@@ -3831,8 +3861,30 @@ class MyData(Data):
             pass
 
         self.data = MyData("my_data", [1, 2, 3])
+        self.file_path = get_temp_filepath()
+
+    def tearDown(self):
+        if os.path.exists(self.file_path):
+            os.remove(self.file_path)
 
     def test_set_data_io(self):
         self.data.set_data_io(H5DataIO, dict(chunks=True))
         assert isinstance(self.data.data, H5DataIO)
         assert self.data.data.io_settings["chunks"]
+
+    def test_set_data_io_h5py_dataset(self):
+        file = File(self.file_path, 'w')
+        data = file.create_dataset('data', data=[1, 2, 3, 4, 5], chunks=(3,))
+
+        class MyData(Data):
+            pass
+
+        my_data = MyData("my_data", data)
+        my_data.set_data_io(
+            H5DataIO,
+            data_io_kwargs=dict(chunks=(2,)),
+            data_chunk_iterator_class=DataChunkIterator,
+        )
+
+        self.assertIsInstance(my_data.data, H5DataIO)
+        self.assertEqual(my_data.data.io_settings["chunks"], (2,))
+        file.close()
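The Data counterpart works the same way, minus the dataset_name argument, since a Data object holds a single array; a short sketch with a hypothetical Data subclass:

    from hdmf import Data
    from hdmf.backends.hdf5 import H5DataIO
    from hdmf.data_utils import DataChunkIterator

    class MyData(Data):  # hypothetical Data subclass
        pass

    my_data = MyData("my_data", [1, 2, 3])
    # Wraps the list in a DataChunkIterator, then in H5DataIO with new chunking.
    my_data.set_data_io(
        H5DataIO,
        data_io_kwargs=dict(chunks=(2,)),
        data_chunk_iterator_class=DataChunkIterator,
    )
    assert isinstance(my_data.data, H5DataIO)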