Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

VectorData Expand by Default via write_dataset #1093

Merged
merged 15 commits into from
May 7, 2024
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
- Added `TypeConfigurator` to automatically wrap fields with `TermSetWrapper` according to a configuration file. @mavaylon1 [#1016](https://github.com/hdmf-dev/hdmf/pull/1016)
- Updated `TermSetWrapper` to support validating a single field within a compound array. @mavaylon1 [#1061](https://github.com/hdmf-dev/hdmf/pull/1061)
- Updated testing to not install in editable mode and not run `coverage` by default. @rly [#1107](https://github.com/hdmf-dev/hdmf/pull/1107)
- Updated the default behavior for writing HDF5 datasets to be expandandable datasets with chunking enabled by default. This does not override user set chunking parameters. @mavaylon1 [#1093](https://github.com/hdmf-dev/hdmf/pull/1093)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

expandandable -> expandable


## HDMF 3.13.0 (March 20, 2024)

Expand Down
34 changes: 27 additions & 7 deletions src/hdmf/backends/hdf5/h5tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,10 @@ def copy_file(self, **kwargs):
'default': True},
{'name': 'herd', 'type': 'hdmf.common.resources.HERD',
'doc': 'A HERD object to populate with references.',
'default': None})
'default': None},
{'name': 'expandable', 'type': bool, 'default': True,
'doc': ('Bool to set whether datasets are expandable by setting the max shape for all dimensions',
'of a dataset to None and enabling auto-chunking by default.')})
def write(self, **kwargs):
"""Write the container to an HDF5 file."""
if self.__mode == 'r':
Expand Down Expand Up @@ -804,10 +807,16 @@ def close_linked_files(self):
'doc': 'exhaust DataChunkIterators one at a time. If False, exhaust them concurrently',
'default': True},
{'name': 'export_source', 'type': str,
'doc': 'The source of the builders when exporting', 'default': None})
'doc': 'The source of the builders when exporting', 'default': None},
{'name': 'expandable', 'type': bool, 'default': True,
'doc': ('Bool to set whether datasets are expandable by setting the max shape for all dimensions',
'of a dataset to None and enabling auto-chunking by default.')})
def write_builder(self, **kwargs):
f_builder = popargs('builder', kwargs)
link_data, exhaust_dci, export_source = getargs('link_data', 'exhaust_dci', 'export_source', kwargs)
link_data, exhaust_dci, export_source = getargs('link_data',
'exhaust_dci',
'export_source',
kwargs)
self.logger.debug("Writing GroupBuilder '%s' to path '%s' with kwargs=%s"
% (f_builder.name, self.source, kwargs))
for name, gbldr in f_builder.groups.items():
Expand Down Expand Up @@ -978,6 +987,9 @@ def _filler():
'default': True},
{'name': 'export_source', 'type': str,
'doc': 'The source of the builders when exporting', 'default': None},
{'name': 'expandable', 'type': bool, 'default': True,
'doc': ('Bool to set whether datasets are expandable by setting the max shape for all dimensions',
'of a dataset to None and enabling auto-chunking by default.')},
returns='the Group that was created', rtype=Group)
def write_group(self, **kwargs):
parent, builder = popargs('parent', 'builder', kwargs)
Expand Down Expand Up @@ -1078,14 +1090,17 @@ def write_link(self, **kwargs):
'default': True},
{'name': 'export_source', 'type': str,
'doc': 'The source of the builders when exporting', 'default': None},
{'name': 'expandable', 'type': bool, 'default': True,
'doc': ('Bool to set whether datasets are expandable by setting the max shape for all dimensions',
'of a dataset to None and enabling auto-chunking by default.')},
returns='the Dataset that was created', rtype=Dataset)
def write_dataset(self, **kwargs): # noqa: C901
""" Write a dataset to HDF5

The function uses other dataset-dependent write functions, e.g,
``__scalar_fill__``, ``__list_fill__``, and ``__setup_chunked_dset__`` to write the data.
"""
parent, builder = popargs('parent', 'builder', kwargs)
parent, builder, expandable = popargs('parent', 'builder', 'expandable', kwargs)
link_data, exhaust_dci, export_source = getargs('link_data', 'exhaust_dci', 'export_source', kwargs)
self.logger.debug("Writing DatasetBuilder '%s' to parent group '%s'" % (builder.name, parent.name))
if self.get_written(builder):
Expand All @@ -1102,6 +1117,7 @@ def write_dataset(self, **kwargs): # noqa: C901
data = data.data
else:
options['io_settings'] = {}

attributes = builder.attributes
options['dtype'] = builder.dtype
dset = None
Expand Down Expand Up @@ -1206,7 +1222,7 @@ def _filler():
return
# If the compound data type contains only regular data (i.e., no references) then we can write it as usual
else:
dset = self.__list_fill__(parent, name, data, options)
dset = self.__list_fill__(parent, name, data, expandable, options)
# Write a dataset containing references, i.e., a region or object reference.
# NOTE: we can ignore options['io_settings'] for scalar data
elif self.__is_ref(options['dtype']):
Expand Down Expand Up @@ -1301,7 +1317,7 @@ def _filler():
self.__dci_queue.append(dataset=dset, data=data)
# Write a regular in memory array (e.g., numpy array, list etc.)
elif hasattr(data, '__len__'):
dset = self.__list_fill__(parent, name, data, options)
dset = self.__list_fill__(parent, name, data, expandable, options)
rly marked this conversation as resolved.
Show resolved Hide resolved
# Write a regular scalar dataset
else:
dset = self.__scalar_fill__(parent, name, data, options)
Expand Down Expand Up @@ -1429,7 +1445,7 @@ def __chunked_iter_fill__(cls, parent, name, data, options=None):
return dset

@classmethod
def __list_fill__(cls, parent, name, data, options=None):
def __list_fill__(cls, parent, name, data, expandable, options=None):
# define the io settings and data type if necessary
io_settings = {}
dtype = None
Expand All @@ -1451,6 +1467,10 @@ def __list_fill__(cls, parent, name, data, options=None):
data_shape = (len(data),)
else:
data_shape = get_data_shape(data)
if expandable:
# Don't override existing settings
if 'maxshape' not in io_settings:
io_settings['maxshape'] = tuple([None]*len(data_shape))

# Create the dataset
try:
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/test_io_hdf5.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,5 +225,5 @@ def test_dataset_shape(self):
io.write_builder(self.builder)
builder = io.read_builder()
dset = builder['test_bucket']['foo_holder']['foo1']['my_data'].data
self.assertEqual(get_data_shape(dset), (10,))
self.assertEqual(get_data_shape(dset), (None,))
io.close()
35 changes: 30 additions & 5 deletions tests/unit/test_io_hdf5_h5tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from hdmf.testing import TestCase, remove_test_file
from hdmf.common.resources import HERD
from hdmf.term_set import TermSet, TermSetWrapper
from hdmf.utils import get_data_shape


from tests.unit.helpers.utils import (Foo, FooBucket, FooFile, get_foo_buildmanager,
Expand Down Expand Up @@ -163,6 +164,7 @@ def test_write_dataset_list(self):
self.io.write_dataset(self.f, DatasetBuilder('test_dataset', a.tolist(), attributes={}))
dset = self.f['test_dataset']
self.assertTrue(np.all(dset[:] == a))
self.assertEqual(get_data_shape(dset), (None, None, None))

def test_write_dataset_list_compress_gzip(self):
a = H5DataIO(np.arange(30).reshape(5, 2, 3),
Expand Down Expand Up @@ -266,9 +268,9 @@ def test_write_dataset_list_fillvalue(self):
self.assertEqual(dset.fillvalue, -1)

##########################################
# write_dataset tests: tables
# write_dataset tests: cmpd_dt
##########################################
def test_write_table(self):
def test_write_cmpd_dt(self):
cmpd_dt = np.dtype([('a', np.int32), ('b', np.float64)])
data = np.zeros(10, dtype=cmpd_dt)
data['a'][1] = 101
Expand All @@ -279,8 +281,9 @@ def test_write_table(self):
dset = self.f['test_dataset']
self.assertEqual(dset['a'].tolist(), data['a'].tolist())
self.assertEqual(dset['b'].tolist(), data['b'].tolist())
self.assertEqual(get_data_shape(dset), (None,))

def test_write_table_nested(self):
def test_write_cmpd_dt_nested(self):
b_cmpd_dt = np.dtype([('c', np.int32), ('d', np.float64)])
cmpd_dt = np.dtype([('a', np.int32), ('b', b_cmpd_dt)])
data = np.zeros(10, dtype=cmpd_dt)
Expand Down Expand Up @@ -739,12 +742,12 @@ def test_copy_h5py_dataset_h5dataio_input(self):
self.f['test_copy'][:].tolist())

def test_list_fill_empty(self):
dset = self.io.__list_fill__(self.f, 'empty_dataset', [], options={'dtype': int, 'io_settings': {}})
dset = self.io.__list_fill__(self.f, 'empty_dataset', [], True, options={'dtype': int, 'io_settings': {}})
rly marked this conversation as resolved.
Show resolved Hide resolved
self.assertTupleEqual(dset.shape, (0,))

def test_list_fill_empty_no_dtype(self):
with self.assertRaisesRegex(Exception, r"cannot add \S+ to [/\S]+ - could not determine type"):
self.io.__list_fill__(self.f, 'empty_dataset', [])
self.io.__list_fill__(self.f, 'empty_dataset', [], True)

def test_read_str(self):
a = ['a', 'bb', 'ccc', 'dddd', 'e']
Expand All @@ -763,6 +766,28 @@ def test_read_str(self):
'<HDF5 dataset "test_dataset": shape (5,), type "|O">')


class TestExpand(TestCase):
def setUp(self):
self.manager = get_foo_buildmanager()
self.path = get_temp_filepath()

def test_expand_false(self):
# Setup all the data we need
foo1 = Foo('foo1', [1, 2, 3, 4, 5], "I am foo1", 17, 3.14)
foobucket = FooBucket('bucket1', [foo1])
foofile = FooFile(buckets=[foobucket])

with HDF5IO(self.path, manager=self.manager, mode='w') as io:
io.write(foofile, expandable=False)

io = HDF5IO(self.path, manager=self.manager, mode='r')
read_foofile = io.read()
self.assertListEqual(foofile.buckets['bucket1'].foos['foo1'].my_data,
read_foofile.buckets['bucket1'].foos['foo1'].my_data[:].tolist())
self.assertEqual(get_data_shape(read_foofile.buckets['bucket1'].foos['foo1'].my_data),
(5,))


class TestRoundTrip(TestCase):

def setUp(self):
Expand Down
Loading