Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

in validation, add check for if a varlen datasets is empty #789

Merged
merged 15 commits into from
Jan 13, 2024
Merged
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
- Update `get_key` to return all the keys if there are multiple within a `HERD` instance. @mavaylon1 [#999](https://github.com/hdmf-dev/hdmf/pull/999)
- Improve HTML rendering of tables. @bendichter [#998](https://github.com/hdmf-dev/hdmf/pull/998)
- Improved issue and PR templates. @rly [#1004](https://github.com/hdmf-dev/hdmf/pull/1004)
- Added check during validation for if a variable length dataset is empty. @bendichter, @oruebel [#789](https://github.com/hdmf-dev/hdmf/pull/789)

### Bug fixes
- Fixed issue with custom class generation when a spec has a `name`. @rly [#1006](https://github.com/hdmf-dev/hdmf/pull/1006)
Expand Down
39 changes: 32 additions & 7 deletions src/hdmf/validate/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,40 +118,65 @@ class EmptyArrayError(Exception):

def get_type(data, builder_dtype=None):
"""Return a tuple of (the string representation of the type, the format of the string data) for the given data."""
# String data
if isinstance(data, str):
return 'utf', get_string_format(data)
# Bytes data
elif isinstance(data, bytes):
return 'ascii', get_string_format(data)
# RegionBuilder data
elif isinstance(data, RegionBuilder):
return 'region', None
# ReferenceBuilder data
elif isinstance(data, ReferenceBuilder):
return 'object', None
# ReferenceResolver data
elif isinstance(data, ReferenceResolver):
return data.dtype, None
# Numpy nd-array data
elif isinstance(data, np.ndarray):
if data.size == 0:
if data.size > 0:
return get_type(data[0], builder_dtype)
else:
raise EmptyArrayError()
return get_type(data[0], builder_dtype)
# Numpy bool data
elif isinstance(data, np.bool_):
return 'bool', None
if not hasattr(data, '__len__'):
return type(data).__name__, None
# Case for h5py.Dataset and other I/O specific array types
else:
if builder_dtype and isinstance(builder_dtype, list): # compound dtype
# Compound dtype
if builder_dtype and isinstance(builder_dtype, list):
dtypes = []
string_formats = []
for i in range(len(builder_dtype)):
dtype, string_format = get_type(data[0][i])
dtypes.append(dtype)
string_formats.append(string_format)
return dtypes, string_formats
# Object has 'dtype' attribute, e.g., an h5py.Dataset
if hasattr(data, 'dtype'):
if data.dtype.metadata is not None and data.dtype.metadata.get('vlen') is not None:
return get_type(data[0])
return data.dtype, None
if len(data) == 0:
# Try to determine dtype from the first array element
if len(data) > 0:
return get_type(data[0], builder_dtype)
# Empty array
else:
# Empty string array
if data.dtype.metadata["vlen"] == str:
return "utf", None
# Undetermined variable length data type.
else: # pragma: no cover
raise EmptyArrayError() # pragma: no cover
# Standard data type (i.e., not compound or vlen)
else:
return data.dtype, None
# If all else has failed, try to determine the datatype from the first element of the array
if len(data) > 0:
return get_type(data[0], builder_dtype)
else:
raise EmptyArrayError()
return get_type(data[0], builder_dtype)


def check_shape(expected, received):
Expand Down
99 changes: 99 additions & 0 deletions tests/unit/validator_tests/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -1118,3 +1118,102 @@ def test_round_trip_validation_of_compound_dtype_with_reference(self):
attributes={'data_type': 'Foo'}
)
self.runBuilderRoundTrip(foo)


class TestEmptyDataRoundTrip(ValidatorTestBase):
"""
Test the special case of empty string datasets and attributes during validation
"""
def setUp(self):
self.filename = 'test_ref_dataset.h5'
super().setUp()

def tearDown(self):
remove_test_file(self.filename)
super().tearDown()

def getSpecs(self):
ret = GroupSpec('A test group specification with a data type',
data_type_def='Bar',
datasets=[DatasetSpec(name='data',
doc='an example dataset',
dtype='text',
attributes=[AttributeSpec(
name='attr2',
doc='an example integer attribute',
dtype='int',
shape=(None,))]),
DatasetSpec(name='dataInt',
doc='an example int dataset',
dtype='int',
attributes=[])
],
attributes=[AttributeSpec(name='attr1',
doc='an example string attribute',
dtype='text',
shape=(None,))])
return (ret,)

def runBuilderRoundTrip(self, builder):
"""Executes a round-trip test for a builder

1. First writes the builder to file,
2. next reads a new builder from disk
3. and finally runs the builder through the validator.
The test is successful if there are no validation errors."""
ns_catalog = NamespaceCatalog()
ns_catalog.add_namespace(self.namespace.name, self.namespace)
typemap = TypeMap(ns_catalog)
self.manager = BuildManager(typemap)

with HDF5IO(self.filename, manager=self.manager, mode='w') as write_io:
write_io.write_builder(builder)

with HDF5IO(self.filename, manager=self.manager, mode='r') as read_io:
read_builder = read_io.read_builder()
errors = self.vmap.validate(read_builder)
self.assertEqual(len(errors), 0, errors)

def test_empty_string_attribute(self):
"""Verify that we can determine dtype for empty string attribute during validation"""
builder = GroupBuilder('my_bar',
attributes={'data_type': 'Bar', 'attr1': []}, # <-- Empty string attribute
datasets=[DatasetBuilder(name='data', data=['text1', 'text2'],
attributes={'attr2': [10, ]}),
DatasetBuilder(name='dataInt', data=[5, ])
])
self.runBuilderRoundTrip(builder)

def test_empty_string_dataset(self):
"""Verify that we can determine dtype for empty string dataset during validation"""
builder = GroupBuilder('my_bar',
attributes={'data_type': 'Bar', 'attr1': ['text1', 'text2']},
datasets=[DatasetBuilder(name='data', # <-- Empty string dataset
data=[],
dtype='text',
attributes={'attr2': [10, ]}),
DatasetBuilder(name='dataInt', data=[5, ])
])
self.runBuilderRoundTrip(builder)

def test_empty_int_attribute(self):
"""Verify that we can determine dtype for empty integer attribute during validation"""
builder = GroupBuilder('my_bar',
attributes={'data_type': 'Bar', 'attr1': ['text1', 'text2']},
datasets=[DatasetBuilder(name='data', data=['text1', 'text2'],
attributes={'attr2': []} # <-- Empty integer attribute
),
DatasetBuilder(name='dataInt', data=[5, ])
])
self.runBuilderRoundTrip(builder)

def test_empty_int_dataset(self):
"""Verify that a dataset builder containing an array of references passes
validation after a round trip"""
builder = GroupBuilder('my_bar',
attributes={'data_type': 'Bar', 'attr1': ['text1', 'text2']},
datasets=[DatasetBuilder(name='data', data=['text1', 'text2'],
attributes={'attr2': [10, ]}),
DatasetBuilder(name='dataInt', data=[], dtype='int') # <-- Empty int dataset
])
self.runBuilderRoundTrip(builder)
Loading