From acc3d78cc5a828ddd384cca814ef60167ae92682 Mon Sep 17 00:00:00 2001 From: Steph Prince <40640337+stephprince@users.noreply.github.com> Date: Wed, 21 Aug 2024 22:14:24 -0700 Subject: [PATCH] Write scalar datasets with compound data type (#1176) * add support for scalar compound datasets * add scalar compound dset io and validation tests * update CHANGELOG.md * Update tests/unit/test_io_hdf5_h5tools.py Co-authored-by: Ryan Ly * update container repr conditionals --------- Co-authored-by: Ryan Ly --- CHANGELOG.md | 3 +++ src/hdmf/backends/hdf5/h5tools.py | 4 ++++ src/hdmf/container.py | 6 +----- src/hdmf/validate/validator.py | 13 ++++++++---- tests/unit/test_io_hdf5_h5tools.py | 21 ++++++++++++++++++++ tests/unit/validator_tests/test_validate.py | 22 +++++++++++++++++++++ 6 files changed, 60 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 549eccc7a..f3c15392b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,9 @@ - Improved "already exists" error message when adding a container to a `MultiContainerInterface`. @rly [#1165](https://github.com/hdmf-dev/hdmf/pull/1165) - Added support to write multidimensional string arrays. 
@stephprince [#1173](https://github.com/hdmf-dev/hdmf/pull/1173) +### Bug fixes +- Fixed issue where scalar datasets with a compound data type were being written as non-scalar datasets. @stephprince [#1176](https://github.com/hdmf-dev/hdmf/pull/1176) + ## HDMF 3.14.3 (July 29, 2024) ### Enhancements diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py index ffdc4eab6..4db6463dc 100644 --- a/src/hdmf/backends/hdf5/h5tools.py +++ b/src/hdmf/backends/hdf5/h5tools.py @@ -698,6 +698,8 @@ def __read_dataset(self, h5obj, name=None): d = ReferenceBuilder(target_builder) kwargs['data'] = d kwargs['dtype'] = d.dtype + elif h5obj.dtype.kind == 'V': # scalar compound data type + kwargs['data'] = np.array(scalar, dtype=h5obj.dtype) else: kwargs["data"] = scalar else: @@ -1227,6 +1229,8 @@ def _filler(): return # If the compound data type contains only regular data (i.e., no references) then we can write it as usual + elif len(np.shape(data)) == 0: + dset = self.__scalar_fill__(parent, name, data, options) else: dset = self.__list_fill__(parent, name, data, options) # Write a dataset containing references, i.e., a region or object reference. 
diff --git a/src/hdmf/container.py b/src/hdmf/container.py index 3772cd634..88a083599 100644 --- a/src/hdmf/container.py +++ b/src/hdmf/container.py @@ -629,12 +629,8 @@ def __repr__(self): template += "\nFields:\n" for k in sorted(self.fields): # sorted to enable tests v = self.fields[k] - # if isinstance(v, DataIO) or not hasattr(v, '__len__') or len(v) > 0: if hasattr(v, '__len__'): - if isinstance(v, (np.ndarray, list, tuple)): - if len(v) > 0: - template += " {}: {}\n".format(k, self.__smart_str(v, 1)) - elif v: + if isinstance(v, (np.ndarray, list, tuple)) or v: template += " {}: {}\n".format(k, self.__smart_str(v, 1)) else: template += " {}: {}\n".format(k, v) diff --git a/src/hdmf/validate/validator.py b/src/hdmf/validate/validator.py index e39011d9f..2668da1ec 100644 --- a/src/hdmf/validate/validator.py +++ b/src/hdmf/validate/validator.py @@ -134,7 +134,7 @@ def get_type(data, builder_dtype=None): elif isinstance(data, ReferenceResolver): return data.dtype, None # Numpy nd-array data - elif isinstance(data, np.ndarray): + elif isinstance(data, np.ndarray) and len(data.dtype) <= 1: if data.size > 0: return get_type(data[0], builder_dtype) else: @@ -147,11 +147,14 @@ def get_type(data, builder_dtype=None): # Case for h5py.Dataset and other I/O specific array types else: # Compound dtype - if builder_dtype and isinstance(builder_dtype, list): + if builder_dtype and len(builder_dtype) > 1: dtypes = [] string_formats = [] for i in range(len(builder_dtype)): - dtype, string_format = get_type(data[0][i]) + if len(np.shape(data)) == 0: + dtype, string_format = get_type(data[()][i]) + else: + dtype, string_format = get_type(data[0][i]) dtypes.append(dtype) string_formats.append(string_format) return dtypes, string_formats @@ -438,7 +441,9 @@ def validate(self, **kwargs): except EmptyArrayError: # do not validate dtype of empty array. 
HDMF does not yet set dtype when writing a list/tuple pass - if isinstance(builder.dtype, list): + if builder.dtype is not None and len(builder.dtype) > 1 and len(np.shape(builder.data)) == 0: + shape = () # scalar compound dataset + elif isinstance(builder.dtype, list): shape = (len(builder.data), ) # only 1D datasets with compound types are supported else: shape = get_data_shape(data) diff --git a/tests/unit/test_io_hdf5_h5tools.py b/tests/unit/test_io_hdf5_h5tools.py index b004a6c54..73aa89788 100644 --- a/tests/unit/test_io_hdf5_h5tools.py +++ b/tests/unit/test_io_hdf5_h5tools.py @@ -144,6 +144,16 @@ def test_write_dataset_string(self): read_a = read_a.decode('utf-8') self.assertEqual(read_a, a) + def test_write_dataset_scalar_compound(self): + cmpd_dtype = np.dtype([('x', np.int32), ('y', np.float64)]) + a = np.array((1, 0.1), dtype=cmpd_dtype) + self.io.write_dataset(self.f, DatasetBuilder('test_dataset', a, + dtype=[DtypeSpec('x', doc='x', dtype='int32'), + DtypeSpec('y', doc='y', dtype='float64')])) + dset = self.f['test_dataset'] + self.assertTupleEqual(dset.shape, ()) + self.assertEqual(dset[()].tolist(), a.tolist()) + ########################################## # write_dataset tests: TermSetWrapper ########################################## @@ -787,6 +797,17 @@ def test_read_str(self): self.assertEqual(str(bldr['test_dataset'].data), '') + def test_read_scalar_compound(self): + cmpd_dtype = np.dtype([('x', np.int32), ('y', np.float64)]) + a = np.array((1, 0.1), dtype=cmpd_dtype) + self.io.write_dataset(self.f, DatasetBuilder('test_dataset', a, + dtype=[DtypeSpec('x', doc='x', dtype='int32'), + DtypeSpec('y', doc='y', dtype='float64')])) + self.io.close() + with HDF5IO(self.path, 'r') as io: + bldr = io.read_builder() + np.testing.assert_array_equal(bldr['test_dataset'].data[()], a) + class TestRoundTrip(TestCase): diff --git a/tests/unit/validator_tests/test_validate.py b/tests/unit/validator_tests/test_validate.py index 95ff5d98e..dd79cfce5 100644 --- 
a/tests/unit/validator_tests/test_validate.py +++ b/tests/unit/validator_tests/test_validate.py @@ -501,6 +501,28 @@ def test_np_bool_for_bool(self): results = self.vmap.validate(bar_builder) self.assertEqual(len(results), 0) + def test_scalar_compound_dtype(self): + """Test that validator allows scalar compound dtype data where a compound dtype is specified.""" + spec_catalog = SpecCatalog() + dtype = [DtypeSpec('x', doc='x', dtype='int'), DtypeSpec('y', doc='y', dtype='float')] + spec = GroupSpec('A test group specification with a data type', + data_type_def='Bar', + datasets=[DatasetSpec('an example dataset', dtype, name='data',)], + attributes=[AttributeSpec('attr1', 'an example attribute', 'text',)]) + spec_catalog.register_spec(spec, 'test2.yaml') + self.namespace = SpecNamespace( + 'a test namespace', CORE_NAMESPACE, [{'source': 'test2.yaml'}], version='0.1.0', catalog=spec_catalog) + self.vmap = ValidatorMap(self.namespace) + + value = np.array((1, 2.2), dtype=[('x', 'int'), ('y', 'float')]) + bar_builder = GroupBuilder('my_bar', + attributes={'data_type': 'Bar', 'attr1': 'test'}, + datasets=[DatasetBuilder(name='data', + data=value, + dtype=[DtypeSpec('x', doc='x', dtype='int'), + DtypeSpec('y', doc='y', dtype='float'),],),]) + results = self.vmap.validate(bar_builder) + self.assertEqual(len(results), 0) class Test1DArrayValidation(TestCase):