diff --git a/CHANGELOG.md b/CHANGELOG.md index a443ed0bf..f9bd98450 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ - Update `get_key` to return all the keys if there are multiple within a `HERD` instance. @mavaylon1 [#999](https://github.com/hdmf-dev/hdmf/pull/999) - Improve HTML rendering of tables. @bendichter [#998](https://github.com/hdmf-dev/hdmf/pull/998) - Improved issue and PR templates. @rly [#1004](https://github.com/hdmf-dev/hdmf/pull/1004) +- Added check during validation for if a variable length dataset is empty. @bendichter, @oruebel [#789](https://github.com/hdmf-dev/hdmf/pull/789) ### Bug fixes - Fixed issue with custom class generation when a spec has a `name`. @rly [#1006](https://github.com/hdmf-dev/hdmf/pull/1006) diff --git a/src/hdmf/validate/validator.py b/src/hdmf/validate/validator.py index 304ad084d..35e647e4a 100644 --- a/src/hdmf/validate/validator.py +++ b/src/hdmf/validate/validator.py @@ -118,26 +118,36 @@ class EmptyArrayError(Exception): def get_type(data, builder_dtype=None): """Return a tuple of (the string representation of the type, the format of the string data) for the given data.""" + # String data if isinstance(data, str): return 'utf', get_string_format(data) + # Bytes data elif isinstance(data, bytes): return 'ascii', get_string_format(data) + # RegionBuilder data elif isinstance(data, RegionBuilder): return 'region', None + # ReferenceBuilder data elif isinstance(data, ReferenceBuilder): return 'object', None + # ReferenceResolver data elif isinstance(data, ReferenceResolver): return data.dtype, None + # Numpy nd-array data elif isinstance(data, np.ndarray): - if data.size == 0: + if data.size > 0: + return get_type(data[0], builder_dtype) + else: raise EmptyArrayError() - return get_type(data[0], builder_dtype) + # Numpy bool data elif isinstance(data, np.bool_): return 'bool', None if not hasattr(data, '__len__'): return type(data).__name__, None + # Case for h5py.Dataset and other I/O specific array types else: - if builder_dtype and isinstance(builder_dtype, list): # compound dtype + # Compound dtype + if builder_dtype and isinstance(builder_dtype, list): dtypes = [] string_formats = [] for i in range(len(builder_dtype)): @@ -145,13 +155,28 @@ def get_type(data, builder_dtype=None): dtypes.append(dtype) string_formats.append(string_format) return dtypes, string_formats + # Object has 'dtype' attribute, e.g., an h5py.Dataset if hasattr(data, 'dtype'): if data.dtype.metadata is not None and data.dtype.metadata.get('vlen') is not None: - return get_type(data[0]) - return data.dtype, None - if len(data) == 0: + # Try to determine dtype from the first array element + if len(data) > 0: + return get_type(data[0], builder_dtype) + # Empty array + else: + # Empty string array + if data.dtype.metadata["vlen"] == str: + return "utf", None + # Undetermined variable length data type. + else: # pragma: no cover + raise EmptyArrayError() # pragma: no cover + # Standard data type (i.e., not compound or vlen) + else: + return data.dtype, None + # If all else has failed, try to determine the datatype from the first element of the array + if len(data) > 0: + return get_type(data[0], builder_dtype) + else: raise EmptyArrayError() - return get_type(data[0], builder_dtype) def check_shape(expected, received): diff --git a/tests/unit/validator_tests/test_validate.py b/tests/unit/validator_tests/test_validate.py index d4c132cd7..7002ebd6f 100644 --- a/tests/unit/validator_tests/test_validate.py +++ b/tests/unit/validator_tests/test_validate.py @@ -1118,3 +1118,102 @@ def test_round_trip_validation_of_compound_dtype_with_reference(self): attributes={'data_type': 'Foo'} ) self.runBuilderRoundTrip(foo) + + +class TestEmptyDataRoundTrip(ValidatorTestBase): + """ + Test the special case of empty string datasets and attributes during validation + """ + def setUp(self): + self.filename = 'test_ref_dataset.h5' + super().setUp() + + def tearDown(self): + remove_test_file(self.filename) + super().tearDown() + + def getSpecs(self): + ret = GroupSpec('A test group specification with a data type', + data_type_def='Bar', + datasets=[DatasetSpec(name='data', + doc='an example dataset', + dtype='text', + attributes=[AttributeSpec( + name='attr2', + doc='an example integer attribute', + dtype='int', + shape=(None,))]), + DatasetSpec(name='dataInt', + doc='an example int dataset', + dtype='int', + attributes=[]) + ], + attributes=[AttributeSpec(name='attr1', + doc='an example string attribute', + dtype='text', + shape=(None,))]) + return (ret,) + + def runBuilderRoundTrip(self, builder): + """Executes a round-trip test for a builder + + 1. First writes the builder to file, + 2. next reads a new builder from disk + 3. and finally runs the builder through the validator. + The test is successful if there are no validation errors.""" + ns_catalog = NamespaceCatalog() + ns_catalog.add_namespace(self.namespace.name, self.namespace) + typemap = TypeMap(ns_catalog) + self.manager = BuildManager(typemap) + + with HDF5IO(self.filename, manager=self.manager, mode='w') as write_io: + write_io.write_builder(builder) + + with HDF5IO(self.filename, manager=self.manager, mode='r') as read_io: + read_builder = read_io.read_builder() + errors = self.vmap.validate(read_builder) + self.assertEqual(len(errors), 0, errors) + + def test_empty_string_attribute(self): + """Verify that we can determine dtype for empty string attribute during validation""" + builder = GroupBuilder('my_bar', + attributes={'data_type': 'Bar', 'attr1': []}, # <-- Empty string attribute + datasets=[DatasetBuilder(name='data', data=['text1', 'text2'], + attributes={'attr2': [10, ]}), + DatasetBuilder(name='dataInt', data=[5, ]) + ]) + self.runBuilderRoundTrip(builder) + + def test_empty_string_dataset(self): + """Verify that we can determine dtype for empty string dataset during validation""" + builder = GroupBuilder('my_bar', + attributes={'data_type': 'Bar', 'attr1': ['text1', 'text2']}, + datasets=[DatasetBuilder(name='data', # <-- Empty string dataset + data=[], + dtype='text', + attributes={'attr2': [10, ]}), + DatasetBuilder(name='dataInt', data=[5, ]) + ]) + self.runBuilderRoundTrip(builder) + + def test_empty_int_attribute(self): + """Verify that we can determine dtype for empty integer attribute during validation""" + builder = GroupBuilder('my_bar', + attributes={'data_type': 'Bar', 'attr1': ['text1', 'text2']}, + datasets=[DatasetBuilder(name='data', data=['text1', 'text2'], + attributes={'attr2': []} # <-- Empty integer attribute + ), + DatasetBuilder(name='dataInt', data=[5, ]) + ]) + self.runBuilderRoundTrip(builder) + + def test_empty_int_dataset(self): + """Verify that a dataset builder containing an array of references passes + validation after a round trip""" + builder = GroupBuilder('my_bar', + attributes={'data_type': 'Bar', 'attr1': ['text1', 'text2']}, + datasets=[DatasetBuilder(name='data', data=['text1', 'text2'], + attributes={'attr2': [10, ]}), + DatasetBuilder(name='dataInt', data=[], dtype='int') # <-- Empty int dataset + ]) + self.runBuilderRoundTrip(builder)