From 044f5a579a960b9228f602f2c9380414b216df32 Mon Sep 17 00:00:00 2001
From: Matthew Avaylon
Date: Wed, 4 Sep 2024 10:28:09 -0700
Subject: [PATCH 1/3] Update CHANGELOG.md (#1184)

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 97d89e320..cb51dc538 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,6 @@
 # HDMF Changelog
 
-## HDMF 3.14.4 (August 22, 2024)
+## HDMF 3.14.4 (September 4, 2024)
 
 ### Enhancements
 - Added support to append to a dataset of references for HDMF-Zarr. @mavaylon1 [#1157](https://github.com/hdmf-dev/hdmf/pull/1157)

From 874db31fb171577bfc0c962708786f9774ea6b1b Mon Sep 17 00:00:00 2001
From: Ryan Ly
Date: Fri, 13 Sep 2024 10:58:25 -0700
Subject: [PATCH 2/3] Fix handling of read h5py string datasets (#1189)

---
 CHANGELOG.md                          |   6 ++
 src/hdmf/build/objectmapper.py        |   5 +-
 src/hdmf/utils.py                     |   2 +-
 tests/unit/build_tests/test_io_map.py | 131 +++++++++++++++++++++++++-
 4 files changed, 141 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cb51dc538..c7919c81b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # HDMF Changelog
 
+## HDMF 3.14.5 (September 6, 2024)
+
+### Bug fixes
+- Fixed bug, introduced in 3.14.4, in writing string arrays that were read from an HDF5 file back to an HDF5 file. @rly @stephprince
+  [#1189](https://github.com/hdmf-dev/hdmf/pull/1189)
+
 ## HDMF 3.14.4 (September 4, 2024)
 
 ### Enhancements

diff --git a/src/hdmf/build/objectmapper.py b/src/hdmf/build/objectmapper.py
index 3e8d835f1..83df1b427 100644
--- a/src/hdmf/build/objectmapper.py
+++ b/src/hdmf/build/objectmapper.py
@@ -602,7 +602,10 @@ def __get_data_type(cls, spec):
     def __convert_string(self, value, spec):
         """Convert string types to the specified dtype."""
         def __apply_string_type(value, string_type):
-            if isinstance(value, (list, tuple, np.ndarray, DataIO)):
+            # NOTE: if a user passes an h5py.Dataset that is not wrapped with an hdmf.utils.StrDataset,
+            # then this conversion may not be correct. Users should unpack their string h5py.Datasets
+            # into a numpy array (or wrap them in StrDataset) before passing them to a container object.
+            if hasattr(value, '__iter__') and not isinstance(value, (str, bytes)):
                 return [__apply_string_type(item, string_type) for item in value]
             else:
                 return string_type(value)
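The NOTE above warns that raw h5py string datasets should not be handed directly to a container. A minimal sketch of the two workarounds it names — unpacking into a numpy array, or wrapping in StrDataset — assuming h5py 3+; the file and dataset names are placeholders:

    import h5py
    import numpy as np
    from hdmf.utils import StrDataset

    # In-memory file standing in for a previously written HDF5 file.
    with h5py.File("strings.h5", "w", driver="core", backing_store=False) as f:
        raw = f.create_dataset(
            "data", data=np.array(["aa", "bb"], dtype=h5py.special_dtype(vlen=str))
        )
        as_array = raw.asstr()[:]        # option 1: numpy array of Python strings
        wrapped = StrDataset(raw, None)  # option 2: decode lazily, as HDF5IO does on read

Either form can then be safely passed to a text-typed container field.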
diff --git a/src/hdmf/utils.py b/src/hdmf/utils.py
index 5e0b61539..50db79c40 100644
--- a/src/hdmf/utils.py
+++ b/src/hdmf/utils.py
@@ -1140,7 +1140,7 @@ def update(self, other):
 
 @docval_macro('array_data')
 class StrDataset(h5py.Dataset):
-    """Wrapper to decode strings on reading the dataset"""
+    """Wrapper to decode strings on reading the dataset. Use only for h5py 3+."""
     def __init__(self, dset, encoding, errors='strict'):
         self.dset = dset
         if encoding is None:
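To make the decode-on-read behavior named in the docstring concrete, a short sketch under h5py 3+, assuming the dataset uses the default utf-8 encoding (the file name is a placeholder):

    import h5py
    from hdmf.utils import StrDataset

    with h5py.File("demo.h5", "w", driver="core", backing_store=False) as f:
        dset = f.create_dataset("data", data=["a", "b"])
        assert dset[0] == b"a"                   # h5py 3+ returns bytes
        assert StrDataset(dset, None)[0] == "a"  # the wrapper returns str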
diff --git a/tests/unit/build_tests/test_io_map.py b/tests/unit/build_tests/test_io_map.py
index e095ef318..730530a5a 100644
--- a/tests/unit/build_tests/test_io_map.py
+++ b/tests/unit/build_tests/test_io_map.py
@@ -1,4 +1,4 @@
-from hdmf.utils import docval, getargs
+from hdmf.utils import StrDataset, docval, getargs
 from hdmf import Container, Data
 from hdmf.backends.hdf5 import H5DataIO
 from hdmf.build import (GroupBuilder, DatasetBuilder, ObjectMapper, BuildManager, TypeMap, LinkBuilder,
@@ -7,12 +7,15 @@
 from hdmf.spec import (GroupSpec, AttributeSpec, DatasetSpec, SpecCatalog, SpecNamespace, NamespaceCatalog, RefSpec,
                        LinkSpec)
 from hdmf.testing import TestCase
+import h5py
 from abc import ABCMeta, abstractmethod
 import unittest
 import numpy as np
 
 from tests.unit.helpers.utils import CORE_NAMESPACE, create_test_type_map
 
+H5PY_3 = h5py.__version__.startswith('3')
+
 
 class Bar(Container):
@@ -460,6 +463,132 @@ def test_build_3d_ndarray(self):
         np.testing.assert_array_equal(builder.get('data').data, str_array_3d)
         np.testing.assert_array_equal(builder.get('attr_array'), str_array_3d)
 
+    @unittest.skipIf(not H5PY_3, "Use StrDataset only for h5py 3+")
+    def test_build_1d_h5py_3_dataset(self):
+        bar_spec = GroupSpec(
+            doc='A test group specification with a data type',
+            data_type_def='Bar',
+            datasets=[
+                DatasetSpec(
+                    doc='an example dataset',
+                    dtype='text',
+                    name='data',
+                    shape=(None, ),
+                    attributes=[AttributeSpec(name='attr2', doc='an example integer attribute', dtype='int')],
+                )
+            ],
+            attributes=[AttributeSpec(name='attr_array', doc='an example array attribute', dtype='text',
+                                      shape=(None, ))],
+        )
+        type_map = self.customSetUp(bar_spec)
+        type_map.register_map(Bar, BarMapper)
+        # create in-memory hdf5 file that is discarded after closing
+        with h5py.File("test.h5", "w", driver="core", backing_store=False) as f:
+            str_array_1d = np.array(
+                ['aa', 'bb', 'cc', 'dd'],
+                dtype=h5py.special_dtype(vlen=str)
+            )
+            # wrap the dataset in a StrDataset to mimic how HDF5IO would read this dataset with h5py 3+
+            dataset = StrDataset(f.create_dataset('data', data=str_array_1d), None)
+            bar_inst = Bar('my_bar', dataset, 'value1', 10, attr_array=dataset)
+            builder = type_map.build(bar_inst)
+            np.testing.assert_array_equal(builder.get('data').data, dataset[:])
+            np.testing.assert_array_equal(builder.get('attr_array'), dataset[:])
+
+    @unittest.skipIf(not H5PY_3, "Use StrDataset only for h5py 3+")
+    def test_build_3d_h5py_3_dataset(self):
+        bar_spec = GroupSpec(
+            doc='A test group specification with a data type',
+            data_type_def='Bar',
+            datasets=[
+                DatasetSpec(
+                    doc='an example dataset',
+                    dtype='text',
+                    name='data',
+                    shape=(None, None, None),
+                    attributes=[AttributeSpec(name='attr2', doc='an example integer attribute', dtype='int')],
+                )
+            ],
+            attributes=[AttributeSpec(name='attr_array', doc='an example array attribute', dtype='text',
+                                      shape=(None, None, None))],
+        )
+        type_map = self.customSetUp(bar_spec)
+        type_map.register_map(Bar, BarMapper)
+        # create in-memory hdf5 file that is discarded after closing
+        with h5py.File("test.h5", "w", driver="core", backing_store=False) as f:
+            str_array_3d = np.array(
+                [[['aa', 'bb'], ['cc', 'dd']], [['ee', 'ff'], ['gg', 'hh']]],
+                dtype=h5py.special_dtype(vlen=str)
+            )
+            # wrap the dataset in a StrDataset to mimic how HDF5IO would read this dataset with h5py 3+
+            dataset = StrDataset(f.create_dataset('data', data=str_array_3d), None)
+            bar_inst = Bar('my_bar', dataset, 'value1', 10, attr_array=dataset)
+            builder = type_map.build(bar_inst)
+            np.testing.assert_array_equal(builder.get('data').data, dataset[:])
+            np.testing.assert_array_equal(builder.get('attr_array'), dataset[:])
+
+    @unittest.skipIf(H5PY_3, "Create dataset differently for h5py < 3")
+    def test_build_1d_h5py_2_dataset(self):
+        bar_spec = GroupSpec(
+            doc='A test group specification with a data type',
+            data_type_def='Bar',
+            datasets=[
+                DatasetSpec(
+                    doc='an example dataset',
+                    dtype='text',
+                    name='data',
+                    shape=(None, ),
+                    attributes=[AttributeSpec(name='attr2', doc='an example integer attribute', dtype='int')],
+                )
+            ],
+            attributes=[AttributeSpec(name='attr_array', doc='an example array attribute', dtype='text',
+                                      shape=(None, ))],
+        )
+        type_map = self.customSetUp(bar_spec)
+        type_map.register_map(Bar, BarMapper)
+        # create in-memory hdf5 file that is discarded after closing
+        with h5py.File("test.h5", "w", driver="core", backing_store=False) as f:
+            str_array_1d = np.array(
+                ['aa', 'bb', 'cc', 'dd'],
+                dtype=h5py.special_dtype(vlen=str)
+            )
+            dataset = f.create_dataset('data', data=str_array_1d)
+            bar_inst = Bar('my_bar', dataset, 'value1', 10, attr_array=dataset)
+            builder = type_map.build(bar_inst)
+            np.testing.assert_array_equal(builder.get('data').data, dataset[:])
+            np.testing.assert_array_equal(builder.get('attr_array'), dataset[:])
+
+    @unittest.skipIf(H5PY_3, "Create dataset differently for h5py < 3")
+    def test_build_3d_h5py_2_dataset(self):
+        bar_spec = GroupSpec(
+            doc='A test group specification with a data type',
+            data_type_def='Bar',
+            datasets=[
+                DatasetSpec(
+                    doc='an example dataset',
+                    dtype='text',
+                    name='data',
+                    shape=(None, None, None),
+                    attributes=[AttributeSpec(name='attr2', doc='an example integer attribute', dtype='int')],
+                )
+            ],
+            attributes=[AttributeSpec(name='attr_array', doc='an example array attribute', dtype='text',
+                                      shape=(None, None, None))],
+        )
+        type_map = self.customSetUp(bar_spec)
+        type_map.register_map(Bar, BarMapper)
+        # create in-memory hdf5 file that is discarded after closing
+        with h5py.File("test.h5", "w", driver="core", backing_store=False) as f:
+            str_array_3d = np.array(
+                [[['aa', 'bb'], ['cc', 'dd']], [['ee', 'ff'], ['gg', 'hh']]],
+                dtype=h5py.special_dtype(vlen=str)
+            )
+            dataset = f.create_dataset('data', data=str_array_3d)
+            bar_inst = Bar('my_bar', dataset, 'value1', 10, attr_array=dataset)
+            builder = type_map.build(bar_inst)
+            np.testing.assert_array_equal(builder.get('data').data, dataset[:])
+            np.testing.assert_array_equal(builder.get('attr_array'), dataset[:])
+
     def test_build_dataio(self):
         bar_spec = GroupSpec('A test group specification with a data type',
                              data_type_def='Bar',
From aed1de3ec0c26f9fd492abe9d7c9ea1ddb142f7a Mon Sep 17 00:00:00 2001
From: Paul Adkisson
Date: Tue, 17 Sep 2024 08:07:48 +1000
Subject: [PATCH 3/3] Wrap data in set_data_io with a DataChunkIterator to
 support overriding hdf5 dataset backend configurations (#1172)

Co-authored-by: Oliver Ruebel
Co-authored-by: Ryan Ly
---
 CHANGELOG.md                       |  5 ++-
 src/hdmf/container.py              | 48 ++++++++++++++++++++++++---
 tests/unit/test_io_hdf5_h5tools.py | 52 ++++++++++++++++++++++++++++++
 3 files changed, 99 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c7919c81b..7e56cde40 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,9 @@
 # HDMF Changelog
 
-## HDMF 3.14.5 (September 6, 2024)
+## HDMF 3.14.5 (September 17, 2024)
+
+### Enhancements
+- Added support for overriding backend configurations of `h5py.Dataset` objects in `Container.set_data_io`. @pauladkisson [#1172](https://github.com/hdmf-dev/hdmf/pull/1172)
 
 ### Bug fixes
 - Fixed bug, introduced in 3.14.4, in writing string arrays that were read from an HDF5 file back to an HDF5 file. @rly @stephprince

diff --git a/src/hdmf/container.py b/src/hdmf/container.py
index 88a083599..7c450770a 100644
--- a/src/hdmf/container.py
+++ b/src/hdmf/container.py
@@ -2,7 +2,7 @@
 from abc import abstractmethod
 from collections import OrderedDict
 from copy import deepcopy
-from typing import Type
+from typing import Type, Optional
 from uuid import uuid4
 from warnings import warn
 import os
@@ -11,7 +11,7 @@
 import numpy as np
 import pandas as pd
 
-from .data_utils import DataIO, append_data, extend_data
+from .data_utils import DataIO, append_data, extend_data, AbstractDataChunkIterator
 from .utils import docval, get_docval, getargs, ExtenderMeta, get_data_shape, popargs, LabelledDict
 from .term_set import TermSet, TermSetWrapper
 
@@ -826,7 +826,14 @@ def __smart_str_dict(d, num_indent):
             out += '\n' + indent + right_br
         return out
 
-    def set_data_io(self, dataset_name: str, data_io_class: Type[DataIO], data_io_kwargs: dict = None, **kwargs):
+    def set_data_io(
+        self,
+        dataset_name: str,
+        data_io_class: Type[DataIO],
+        data_io_kwargs: dict = None,
+        data_chunk_iterator_class: Optional[Type[AbstractDataChunkIterator]] = None,
+        data_chunk_iterator_kwargs: dict = None, **kwargs
+    ):
         """
         Apply DataIO object to a dataset field of the Container.
 
@@ -838,9 +845,18 @@ def set_data_io(self, dataset_name: str, data_io_class: Type[DataIO], data_io_kw
             Class to use for DataIO, e.g. H5DataIO or ZarrDataIO
         data_io_kwargs: dict
             keyword arguments passed to the constructor of the DataIO class.
+        data_chunk_iterator_class: Type[AbstractDataChunkIterator]
+            Class to use for DataChunkIterator. If None, no DataChunkIterator is used.
+        data_chunk_iterator_kwargs: dict
+            keyword arguments passed to the constructor of the DataChunkIterator class.
         **kwargs:
             DEPRECATED. Use data_io_kwargs instead.
             kwargs are passed to the constructor of the DataIO class.
+
+        Notes
+        -----
+        If data_chunk_iterator_class is not None, the data is wrapped in the DataChunkIterator before being wrapped
+        in the DataIO. This makes it possible to rewrite the backend configuration of HDF5 datasets.
         """
         if kwargs or (data_io_kwargs is None):
             warn(
@@ -851,8 +867,11 @@ def set_data_io(self, dataset_name: str, data_io_class: Type[DataIO], data_io_kw
             )
             data_io_kwargs = kwargs
         data = self.fields.get(dataset_name)
+        data_chunk_iterator_kwargs = data_chunk_iterator_kwargs or dict()
         if data is None:
             raise ValueError(f"{dataset_name} is None and cannot be wrapped in a DataIO class")
+        if data_chunk_iterator_class is not None:
+            data = data_chunk_iterator_class(data=data, **data_chunk_iterator_kwargs)
         self.fields[dataset_name] = data_io_class(data=data, **data_io_kwargs)
 
 
@@ -896,7 +915,13 @@ def set_dataio(self, **kwargs):
         dataio.data = self.__data
         self.__data = dataio
 
-    def set_data_io(self, data_io_class: Type[DataIO], data_io_kwargs: dict) -> None:
+    def set_data_io(
+        self,
+        data_io_class: Type[DataIO],
+        data_io_kwargs: dict,
+        data_chunk_iterator_class: Optional[Type[AbstractDataChunkIterator]] = None,
+        data_chunk_iterator_kwargs: dict = None,
+    ) -> None:
         """
         Apply DataIO object to the data held by this Data object.
 
@@ -906,8 +931,21 @@ def set_data_io(self, data_io_class: Type[DataIO], data_io_kwargs: dict) -> None
             The DataIO to apply to the data held by this Data.
         data_io_kwargs: dict
             The keyword arguments to pass to the DataIO.
+        data_chunk_iterator_class: Type[AbstractDataChunkIterator]
+            The DataChunkIterator to use for the DataIO. If None, no DataChunkIterator is used.
+        data_chunk_iterator_kwargs: dict
+            The keyword arguments to pass to the DataChunkIterator.
+
+        Notes
+        -----
+        If data_chunk_iterator_class is not None, the data is wrapped in the DataChunkIterator before being wrapped
+        in the DataIO. This makes it possible to rewrite the backend configuration of HDF5 datasets.
         """
-        self.__data = data_io_class(data=self.__data, **data_io_kwargs)
+        data_chunk_iterator_kwargs = data_chunk_iterator_kwargs or dict()
+        data = self.__data
+        if data_chunk_iterator_class is not None:
+            data = data_chunk_iterator_class(data=data, **data_chunk_iterator_kwargs)
+        self.__data = data_io_class(data=data, **data_io_kwargs)
 
     @docval({'name': 'func', 'type': types.FunctionType, 'doc': 'a function to transform *data*'})
     def transform(self, **kwargs):
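A usage sketch of the new parameters, mirroring the tests added below; the container class and field name here are hypothetical:

    import h5py
    from hdmf.backends.hdf5 import H5DataIO
    from hdmf.container import Container
    from hdmf.data_utils import DataChunkIterator

    class Foo(Container):  # hypothetical container with a single data field
        __fields__ = ("data",)

    with h5py.File("demo.h5", "w", driver="core", backing_store=False) as f:
        foo = Foo(name="foo")
        foo.data = f.create_dataset("data", data=[1, 2, 3, 4, 5], chunks=(3,))
        # Stream the existing dataset through a DataChunkIterator so its old
        # backend configuration is dropped, then request new chunking on write.
        foo.set_data_io(
            "data",
            H5DataIO,
            data_io_kwargs=dict(chunks=(2,)),
            data_chunk_iterator_class=DataChunkIterator,
        )
        assert foo.data.io_settings["chunks"] == (2,)

Without the iterator, wrapping an h5py.Dataset in H5DataIO would carry the dataset's existing on-disk configuration along to the next write; the iterator yields only the values.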
diff --git a/tests/unit/test_io_hdf5_h5tools.py b/tests/unit/test_io_hdf5_h5tools.py
index 131e4a6de..58119ce9b 100644
--- a/tests/unit/test_io_hdf5_h5tools.py
+++ b/tests/unit/test_io_hdf5_h5tools.py
@@ -3801,6 +3801,11 @@ def __init__(self, **kwargs):
                 self.data2 = kwargs["data2"]
 
         self.obj = ContainerWithData("name", [1, 2, 3, 4, 5], None)
+        self.file_path = get_temp_filepath()
+
+    def tearDown(self):
+        if os.path.exists(self.file_path):
+            os.remove(self.file_path)
 
     def test_set_data_io(self):
         self.obj.set_data_io("data1", H5DataIO, data_io_kwargs=dict(chunks=True))
@@ -3823,6 +3828,31 @@ def test_set_data_io_old_api(self):
         self.assertIsInstance(self.obj.data1, H5DataIO)
         self.assertTrue(self.obj.data1.io_settings["chunks"])
 
+    def test_set_data_io_h5py_dataset(self):
+        file = File(self.file_path, 'w')
+        data = file.create_dataset('data', data=[1, 2, 3, 4, 5], chunks=(3,))
+
+        class ContainerWithData(Container):
+            __fields__ = ('data',)
+
+            @docval(
+                {"name": "name", "doc": "name", "type": str},
+                {'name': 'data', 'doc': 'field1 doc', 'type': h5py.Dataset},
+            )
+            def __init__(self, **kwargs):
+                super().__init__(name=kwargs["name"])
+                self.data = kwargs["data"]
+
+        container = ContainerWithData("name", data)
+        container.set_data_io(
+            "data",
+            H5DataIO,
+            data_io_kwargs=dict(chunks=(2,)),
+            data_chunk_iterator_class=DataChunkIterator,
+        )
+
+        self.assertIsInstance(container.data, H5DataIO)
+        self.assertEqual(container.data.io_settings["chunks"], (2,))
+        file.close()
 
 class TestDataSetDataIO(TestCase):
 
@@ -3831,8 +3861,30 @@ class MyData(Data):
             pass
 
         self.data = MyData("my_data", [1, 2, 3])
+        self.file_path = get_temp_filepath()
+
+    def tearDown(self):
+        if os.path.exists(self.file_path):
+            os.remove(self.file_path)
 
     def test_set_data_io(self):
         self.data.set_data_io(H5DataIO, dict(chunks=True))
         assert isinstance(self.data.data, H5DataIO)
         assert self.data.data.io_settings["chunks"]
+
+    def test_set_data_io_h5py_dataset(self):
+        file = File(self.file_path, 'w')
+        data = file.create_dataset('data', data=[1, 2, 3, 4, 5], chunks=(3,))
+
+        class MyData(Data):
+            pass
+
+        my_data = MyData("my_data", data)
+        my_data.set_data_io(
+            H5DataIO,
+            data_io_kwargs=dict(chunks=(2,)),
+            data_chunk_iterator_class=DataChunkIterator,
+        )
+
+        self.assertIsInstance(my_data.data, H5DataIO)
+        self.assertEqual(my_data.data.io_settings["chunks"], (2,))
+        file.close()
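The Data counterpart works the same way, minus the dataset_name argument, since a Data object holds a single array; a short sketch with a hypothetical Data subclass:

    from hdmf import Data
    from hdmf.backends.hdf5 import H5DataIO
    from hdmf.data_utils import DataChunkIterator

    class MyData(Data):  # hypothetical Data subclass
        pass

    my_data = MyData("my_data", [1, 2, 3])
    # Wraps the list in a DataChunkIterator, then in H5DataIO with new chunking.
    my_data.set_data_io(
        H5DataIO,
        data_io_kwargs=dict(chunks=(2,)),
        data_chunk_iterator_class=DataChunkIterator,
    )
    assert isinstance(my_data.data, H5DataIO)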