From aed1de3ec0c26f9fd492abe9d7c9ea1ddb142f7a Mon Sep 17 00:00:00 2001
From: Paul Adkisson
Date: Tue, 17 Sep 2024 08:07:48 +1000
Subject: [PATCH] Wrap data in set_data_io with a DataChunkIterator to support
 overriding hdf5 dataset backend configurations (#1172)

Co-authored-by: Oliver Ruebel
Co-authored-by: Ryan Ly
---
 CHANGELOG.md                       |  5 ++-
 src/hdmf/container.py              | 48 ++++++++++++++++++++++++---
 tests/unit/test_io_hdf5_h5tools.py | 52 ++++++++++++++++++++++++++++++
 3 files changed, 99 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c7919c81b..7e56cde40 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,9 @@
 # HDMF Changelog
 
-## HDMF 3.14.5 (September 6, 2024)
+## HDMF 3.14.5 (September 17, 2024)
+
+### Enhancements
+- Added support for overriding backend configurations of `h5py.Dataset` objects in `Container.set_data_io`. @pauladkisson [#1172](https://github.com/hdmf-dev/hdmf/pull/1172)
 
 ### Bug fixes
 - Fixed bug in writing of string arrays to an HDF5 file that were read from an HDF5 file that was introduced in 3.14.4. @rly @stephprince
diff --git a/src/hdmf/container.py b/src/hdmf/container.py
index 88a083599..7c450770a 100644
--- a/src/hdmf/container.py
+++ b/src/hdmf/container.py
@@ -2,7 +2,7 @@
 from abc import abstractmethod
 from collections import OrderedDict
 from copy import deepcopy
-from typing import Type
+from typing import Type, Optional
 from uuid import uuid4
 from warnings import warn
 import os
@@ -11,7 +11,7 @@
 import numpy as np
 import pandas as pd
 
-from .data_utils import DataIO, append_data, extend_data
+from .data_utils import DataIO, append_data, extend_data, AbstractDataChunkIterator
 from .utils import docval, get_docval, getargs, ExtenderMeta, get_data_shape, popargs, LabelledDict
 from .term_set import TermSet, TermSetWrapper
 
@@ -826,7 +826,14 @@ def __smart_str_dict(d, num_indent):
         out += '\n' + indent + right_br
         return out
 
-    def set_data_io(self, dataset_name: str, data_io_class: Type[DataIO], data_io_kwargs: dict = None, **kwargs):
+    def set_data_io(
+        self,
+        dataset_name: str,
+        data_io_class: Type[DataIO],
+        data_io_kwargs: dict = None,
+        data_chunk_iterator_class: Optional[Type[AbstractDataChunkIterator]] = None,
+        data_chunk_iterator_kwargs: dict = None, **kwargs
+    ):
         """
         Apply DataIO object to a dataset field of the Container.
 
@@ -838,9 +845,18 @@ def set_data_io(self, dataset_name: str, data_io_class: Type[DataIO], data_io_kw
             Class to use for DataIO, e.g. H5DataIO or ZarrDataIO
         data_io_kwargs: dict
             keyword arguments passed to the constructor of the DataIO class.
+        data_chunk_iterator_class: Type[AbstractDataChunkIterator]
+            Class to use for DataChunkIterator. If None, no DataChunkIterator is used.
+        data_chunk_iterator_kwargs: dict
+            keyword arguments passed to the constructor of the DataChunkIterator class.
         **kwargs:
             DEPRECATED. Use data_io_kwargs instead.
             kwargs are passed to the constructor of the DataIO class.
+
+        Notes
+        -----
+        If data_chunk_iterator_class is not None, the data is wrapped in the DataChunkIterator before being wrapped in
+        the DataIO. This allows for rewriting the backend configuration of hdf5 datasets.
""" if kwargs or (data_io_kwargs is None): warn( @@ -851,8 +867,11 @@ def set_data_io(self, dataset_name: str, data_io_class: Type[DataIO], data_io_kw ) data_io_kwargs = kwargs data = self.fields.get(dataset_name) + data_chunk_iterator_kwargs = data_chunk_iterator_kwargs or dict() if data is None: raise ValueError(f"{dataset_name} is None and cannot be wrapped in a DataIO class") + if data_chunk_iterator_class is not None: + data = data_chunk_iterator_class(data=data, **data_chunk_iterator_kwargs) self.fields[dataset_name] = data_io_class(data=data, **data_io_kwargs) @@ -896,7 +915,13 @@ def set_dataio(self, **kwargs): dataio.data = self.__data self.__data = dataio - def set_data_io(self, data_io_class: Type[DataIO], data_io_kwargs: dict) -> None: + def set_data_io( + self, + data_io_class: Type[DataIO], + data_io_kwargs: dict, + data_chunk_iterator_class: Optional[Type[AbstractDataChunkIterator]] = None, + data_chunk_iterator_kwargs: dict = None, + ) -> None: """ Apply DataIO object to the data held by this Data object. @@ -906,8 +931,21 @@ def set_data_io(self, data_io_class: Type[DataIO], data_io_kwargs: dict) -> None The DataIO to apply to the data held by this Data. data_io_kwargs: dict The keyword arguments to pass to the DataIO. + data_chunk_iterator_class: Type[AbstractDataChunkIterator] + The DataChunkIterator to use for the DataIO. If None, no DataChunkIterator is used. + data_chunk_iterator_kwargs: dict + The keyword arguments to pass to the DataChunkIterator. + + Notes + ----- + If data_chunk_iterator_class is not None, the data is wrapped in the DataChunkIterator before being wrapped in + the DataIO. This allows for rewriting the backend configuration of hdf5 datasets. """ - self.__data = data_io_class(data=self.__data, **data_io_kwargs) + data_chunk_iterator_kwargs = data_chunk_iterator_kwargs or dict() + data = self.__data + if data_chunk_iterator_class is not None: + data = data_chunk_iterator_class(data=data, **data_chunk_iterator_kwargs) + self.__data = data_io_class(data=data, **data_io_kwargs) @docval({'name': 'func', 'type': types.FunctionType, 'doc': 'a function to transform *data*'}) def transform(self, **kwargs): diff --git a/tests/unit/test_io_hdf5_h5tools.py b/tests/unit/test_io_hdf5_h5tools.py index 131e4a6de..58119ce9b 100644 --- a/tests/unit/test_io_hdf5_h5tools.py +++ b/tests/unit/test_io_hdf5_h5tools.py @@ -3801,6 +3801,11 @@ def __init__(self, **kwargs): self.data2 = kwargs["data2"] self.obj = ContainerWithData("name", [1, 2, 3, 4, 5], None) + self.file_path = get_temp_filepath() + + def tearDown(self): + if os.path.exists(self.file_path): + os.remove(self.file_path) def test_set_data_io(self): self.obj.set_data_io("data1", H5DataIO, data_io_kwargs=dict(chunks=True)) @@ -3823,6 +3828,31 @@ def test_set_data_io_old_api(self): self.assertIsInstance(self.obj.data1, H5DataIO) self.assertTrue(self.obj.data1.io_settings["chunks"]) + def test_set_data_io_h5py_dataset(self): + file = File(self.file_path, 'w') + data = file.create_dataset('data', data=[1, 2, 3, 4, 5], chunks=(3,)) + class ContainerWithData(Container): + __fields__ = ('data',) + + @docval( + {"name": "name", "doc": "name", "type": str}, + {'name': 'data', 'doc': 'field1 doc', 'type': h5py.Dataset}, + ) + def __init__(self, **kwargs): + super().__init__(name=kwargs["name"]) + self.data = kwargs["data"] + + container = ContainerWithData("name", data) + container.set_data_io( + "data", + H5DataIO, + data_io_kwargs=dict(chunks=(2,)), + data_chunk_iterator_class=DataChunkIterator, + ) + + 
+        self.assertIsInstance(container.data, H5DataIO)
+        self.assertEqual(container.data.io_settings["chunks"], (2,))
+        file.close()
 
 class TestDataSetDataIO(TestCase):
     def setUp(self):
@@ -3831,8 +3861,30 @@ class MyData(Data):
             pass
 
         self.data = MyData("my_data", [1, 2, 3])
+        self.file_path = get_temp_filepath()
+
+    def tearDown(self):
+        if os.path.exists(self.file_path):
+            os.remove(self.file_path)
 
     def test_set_data_io(self):
         self.data.set_data_io(H5DataIO, dict(chunks=True))
         assert isinstance(self.data.data, H5DataIO)
         assert self.data.data.io_settings["chunks"]
+
+    def test_set_data_io_h5py_dataset(self):
+        file = File(self.file_path, 'w')
+        data = file.create_dataset('data', data=[1, 2, 3, 4, 5], chunks=(3,))
+        class MyData(Data):
+            pass
+
+        my_data = MyData("my_data", data)
+        my_data.set_data_io(
+            H5DataIO,
+            data_io_kwargs=dict(chunks=(2,)),
+            data_chunk_iterator_class=DataChunkIterator,
+        )
+
+        self.assertIsInstance(my_data.data, H5DataIO)
+        self.assertEqual(my_data.data.io_settings["chunks"], (2,))
+        file.close()
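
Usage sketch (illustrative, not part of the diff above): a minimal end-to-end
example of the new parameters, mirroring the tests in this patch. The file
name, container class, and chunk/buffer sizes are hypothetical placeholders;
only set_data_io, H5DataIO, and DataChunkIterator come from HDMF itself.

    import h5py
    from hdmf.backends.hdf5 import H5DataIO
    from hdmf.container import Container
    from hdmf.data_utils import DataChunkIterator

    class TimestampsContainer(Container):
        """Hypothetical container with a single dataset field."""
        __fields__ = ('timestamps',)

    # An existing HDF5 dataset whose backend configuration we want to override.
    f = h5py.File('example.h5', 'w')  # placeholder path
    dset = f.create_dataset('timestamps', data=[0.0, 0.1, 0.2, 0.3], chunks=(4,))

    container = TimestampsContainer(name='example')
    container.timestamps = dset

    # Wrapping the h5py.Dataset in a DataChunkIterator streams the values out
    # of the old dataset, so the DataIO can apply a new chunk shape (and other
    # backend settings) when the container is next written.
    container.set_data_io(
        'timestamps',
        H5DataIO,
        data_io_kwargs=dict(chunks=(2,), compression='gzip'),
        data_chunk_iterator_class=DataChunkIterator,
        data_chunk_iterator_kwargs=dict(buffer_size=2),
    )

    assert isinstance(container.timestamps, H5DataIO)
    f.close()

Per the Notes sections added in this patch, it is the DataChunkIterator wrapper
that makes rewriting the HDF5 backend configuration possible: without it, the
already-written h5py.Dataset is handed to the DataIO as-is and keeps its
original on-disk layout.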