Skip to content

Commit

Permalink
in validation, add check for if a varlen datasets is empty (#789)
Browse files Browse the repository at this point in the history
Co-authored-by: Oliver Ruebel <[email protected]>
Co-authored-by: Oliver Ruebel <[email protected]>
Co-authored-by: Ryan Ly <[email protected]>
  • Loading branch information
4 people authored Jan 13, 2024
1 parent 216aa34 commit 8fa2105
Show file tree
Hide file tree
Showing 3 changed files with 132 additions and 7 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
- Update `get_key` to return all the keys if there are multiple within a `HERD` instance. @mavaylon1 [#999](https://github.com/hdmf-dev/hdmf/pull/999)
- Improve HTML rendering of tables. @bendichter [#998](https://github.com/hdmf-dev/hdmf/pull/998)
- Improved issue and PR templates. @rly [#1004](https://github.com/hdmf-dev/hdmf/pull/1004)
- Added check during validation for if a variable length dataset is empty. @bendichter, @oruebel [#789](https://github.com/hdmf-dev/hdmf/pull/789)

### Bug fixes
- Fixed issue with custom class generation when a spec has a `name`. @rly [#1006](https://github.com/hdmf-dev/hdmf/pull/1006)
Expand Down
39 changes: 32 additions & 7 deletions src/hdmf/validate/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,40 +118,65 @@ class EmptyArrayError(Exception):

def get_type(data, builder_dtype=None):
"""Return a tuple of (the string representation of the type, the format of the string data) for the given data."""
# String data
if isinstance(data, str):
return 'utf', get_string_format(data)
# Bytes data
elif isinstance(data, bytes):
return 'ascii', get_string_format(data)
# RegionBuilder data
elif isinstance(data, RegionBuilder):
return 'region', None
# ReferenceBuilder data
elif isinstance(data, ReferenceBuilder):
return 'object', None
# ReferenceResolver data
elif isinstance(data, ReferenceResolver):
return data.dtype, None
# Numpy nd-array data
elif isinstance(data, np.ndarray):
if data.size == 0:
if data.size > 0:
return get_type(data[0], builder_dtype)
else:
raise EmptyArrayError()
return get_type(data[0], builder_dtype)
# Numpy bool data
elif isinstance(data, np.bool_):
return 'bool', None
if not hasattr(data, '__len__'):
return type(data).__name__, None
# Case for h5py.Dataset and other I/O specific array types
else:
if builder_dtype and isinstance(builder_dtype, list): # compound dtype
# Compound dtype
if builder_dtype and isinstance(builder_dtype, list):
dtypes = []
string_formats = []
for i in range(len(builder_dtype)):
dtype, string_format = get_type(data[0][i])
dtypes.append(dtype)
string_formats.append(string_format)
return dtypes, string_formats
# Object has 'dtype' attribute, e.g., an h5py.Dataset
if hasattr(data, 'dtype'):
if data.dtype.metadata is not None and data.dtype.metadata.get('vlen') is not None:
return get_type(data[0])
return data.dtype, None
if len(data) == 0:
# Try to determine dtype from the first array element
if len(data) > 0:
return get_type(data[0], builder_dtype)
# Empty array
else:
# Empty string array
if data.dtype.metadata["vlen"] == str:
return "utf", None
# Undetermined variable length data type.
else: # pragma: no cover
raise EmptyArrayError() # pragma: no cover
# Standard data type (i.e., not compound or vlen)
else:
return data.dtype, None
# If all else has failed, try to determine the datatype from the first element of the array
if len(data) > 0:
return get_type(data[0], builder_dtype)
else:
raise EmptyArrayError()
return get_type(data[0], builder_dtype)


def check_shape(expected, received):
Expand Down
99 changes: 99 additions & 0 deletions tests/unit/validator_tests/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -1118,3 +1118,102 @@ def test_round_trip_validation_of_compound_dtype_with_reference(self):
attributes={'data_type': 'Foo'}
)
self.runBuilderRoundTrip(foo)


class TestEmptyDataRoundTrip(ValidatorTestBase):
"""
Test the special case of empty string datasets and attributes during validation
"""
def setUp(self):
self.filename = 'test_ref_dataset.h5'
super().setUp()

def tearDown(self):
remove_test_file(self.filename)
super().tearDown()

def getSpecs(self):
ret = GroupSpec('A test group specification with a data type',
data_type_def='Bar',
datasets=[DatasetSpec(name='data',
doc='an example dataset',
dtype='text',
attributes=[AttributeSpec(
name='attr2',
doc='an example integer attribute',
dtype='int',
shape=(None,))]),
DatasetSpec(name='dataInt',
doc='an example int dataset',
dtype='int',
attributes=[])
],
attributes=[AttributeSpec(name='attr1',
doc='an example string attribute',
dtype='text',
shape=(None,))])
return (ret,)

def runBuilderRoundTrip(self, builder):
"""Executes a round-trip test for a builder
1. First writes the builder to file,
2. next reads a new builder from disk
3. and finally runs the builder through the validator.
The test is successful if there are no validation errors."""
ns_catalog = NamespaceCatalog()
ns_catalog.add_namespace(self.namespace.name, self.namespace)
typemap = TypeMap(ns_catalog)
self.manager = BuildManager(typemap)

with HDF5IO(self.filename, manager=self.manager, mode='w') as write_io:
write_io.write_builder(builder)

with HDF5IO(self.filename, manager=self.manager, mode='r') as read_io:
read_builder = read_io.read_builder()
errors = self.vmap.validate(read_builder)
self.assertEqual(len(errors), 0, errors)

def test_empty_string_attribute(self):
"""Verify that we can determine dtype for empty string attribute during validation"""
builder = GroupBuilder('my_bar',
attributes={'data_type': 'Bar', 'attr1': []}, # <-- Empty string attribute
datasets=[DatasetBuilder(name='data', data=['text1', 'text2'],
attributes={'attr2': [10, ]}),
DatasetBuilder(name='dataInt', data=[5, ])
])
self.runBuilderRoundTrip(builder)

def test_empty_string_dataset(self):
"""Verify that we can determine dtype for empty string dataset during validation"""
builder = GroupBuilder('my_bar',
attributes={'data_type': 'Bar', 'attr1': ['text1', 'text2']},
datasets=[DatasetBuilder(name='data', # <-- Empty string dataset
data=[],
dtype='text',
attributes={'attr2': [10, ]}),
DatasetBuilder(name='dataInt', data=[5, ])
])
self.runBuilderRoundTrip(builder)

def test_empty_int_attribute(self):
"""Verify that we can determine dtype for empty integer attribute during validation"""
builder = GroupBuilder('my_bar',
attributes={'data_type': 'Bar', 'attr1': ['text1', 'text2']},
datasets=[DatasetBuilder(name='data', data=['text1', 'text2'],
attributes={'attr2': []} # <-- Empty integer attribute
),
DatasetBuilder(name='dataInt', data=[5, ])
])
self.runBuilderRoundTrip(builder)

def test_empty_int_dataset(self):
"""Verify that a dataset builder containing an array of references passes
validation after a round trip"""
builder = GroupBuilder('my_bar',
attributes={'data_type': 'Bar', 'attr1': ['text1', 'text2']},
datasets=[DatasetBuilder(name='data', data=['text1', 'text2'],
attributes={'attr2': [10, ]}),
DatasetBuilder(name='dataInt', data=[], dtype='int') # <-- Empty int dataset
])
self.runBuilderRoundTrip(builder)

0 comments on commit 8fa2105

Please sign in to comment.