From beb22b46a4f0d7a67df014d55f9cce48d9f621d4 Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Fri, 12 Jan 2024 13:12:54 -0800 Subject: [PATCH] Fix validation of datetime-formatted strings (#1026) --- CHANGELOG.md | 4 +- src/hdmf/validate/validator.py | 70 ++++++++---- tests/unit/validator_tests/test_validate.py | 117 ++++++++++++++++++-- 3 files changed, 155 insertions(+), 36 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3fdaede3f..a443ed0bf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,8 +4,6 @@ ### Enhancements - Add Data.set_data_io(), which allows for setting a `DataIO` to a data object after-the-fact. @bendichter and @CodyCBakerPhD [#1013](https://github.com/hdmf-dev/hdmf/pull/1013) - -### Enhancements - Added `add_ref_termset`, updated helper methods for `HERD`, revised `add_ref` to support validations prior to populating the tables and added `add_ref_container`. @mavaylon1 [#968](https://github.com/hdmf-dev/hdmf/pull/968) - Use `stacklevel` in most warnings. @rly [#1027](https://github.com/hdmf-dev/hdmf/pull/1027) @@ -19,8 +17,8 @@ ### Bug fixes - Fixed issue with custom class generation when a spec has a `name`. @rly [#1006](https://github.com/hdmf-dev/hdmf/pull/1006) - Fixed issue with usage of deprecated `ruamel.yaml.safe_load` in `src/hdmf/testing/validate_spec.py`. @rly [#1008](https://github.com/hdmf-dev/hdmf/pull/1008) - - Fixed issue where `ElementIdentifiers` data could be set to non-integer values. @rly [#1009](https://github.com/hdmf-dev/hdmf/pull/1009) +- Fixed issue where string datasets/attributes with isodatetime-formatted values failed validation against a text spec. @rly [#1026](https://github.com/hdmf-dev/hdmf/pull/1026) ## HDMF 3.11.0 (October 30, 2023) diff --git a/src/hdmf/validate/validator.py b/src/hdmf/validate/validator.py index 86d0aee4b..304ad084d 100644 --- a/src/hdmf/validate/validator.py +++ b/src/hdmf/validate/validator.py @@ -42,7 +42,7 @@ __allowable['numeric'] = set(chain.from_iterable(__allowable[k] for k in __allowable if 'int' in k or 'float' in k)) -def check_type(expected, received): +def check_type(expected, received, string_format=None): ''' *expected* should come from the spec *received* should come from the data @@ -52,6 +52,12 @@ def check_type(expected, received): raise ValueError('compound type shorter than expected') for i, exp in enumerate(DtypeHelper.simplify_cpd_type(expected)): rec = received[i] + if exp == "isodatetime": # short circuit for isodatetime + sub_string_format = string_format[i] + return ( + rec in __allowable[exp] or + rec in ("utf", "ascii") and sub_string_format == "isodatetime" + ) if rec not in __allowable[exp]: return False return True @@ -71,6 +77,11 @@ def check_type(expected, received): received = received.name elif isinstance(received, type): received = received.__name__ + if expected == "isodatetime": # short circuit for isodatetime + return ( + received in __allowable[expected] or + (received in ("utf", "ascii") and string_format == "isodatetime") + ) if isinstance(expected, RefSpec): expected = expected.reftype elif isinstance(expected, type): @@ -89,48 +100,58 @@ def get_iso8601_regex(): _iso_re = get_iso8601_regex() -def _check_isodatetime(s, default=None): +def get_string_format(data): + """Return the string format of the given data. Possible outputs are "isodatetime" and None. + """ + assert isinstance(data, (str, bytes)) try: - if _iso_re.match(pystr(s)) is not None: + if _iso_re.match(pystr(data)) is not None: return 'isodatetime' except Exception: pass - return default + return None class EmptyArrayError(Exception): pass -def get_type(data): +def get_type(data, builder_dtype=None): + """Return a tuple of (the string representation of the type, the format of the string data) for the given data.""" if isinstance(data, str): - return _check_isodatetime(data, 'utf') + return 'utf', get_string_format(data) elif isinstance(data, bytes): - return _check_isodatetime(data, 'ascii') + return 'ascii', get_string_format(data) elif isinstance(data, RegionBuilder): - return 'region' + return 'region', None elif isinstance(data, ReferenceBuilder): - return 'object' + return 'object', None elif isinstance(data, ReferenceResolver): - return data.dtype + return data.dtype, None elif isinstance(data, np.ndarray): if data.size == 0: raise EmptyArrayError() - return get_type(data[0]) + return get_type(data[0], builder_dtype) elif isinstance(data, np.bool_): - return 'bool' + return 'bool', None if not hasattr(data, '__len__'): - return type(data).__name__ + return type(data).__name__, None else: + if builder_dtype and isinstance(builder_dtype, list): # compound dtype + dtypes = [] + string_formats = [] + for i in range(len(builder_dtype)): + dtype, string_format = get_type(data[0][i]) + dtypes.append(dtype) + string_formats.append(string_format) + return dtypes, string_formats if hasattr(data, 'dtype'): - if isinstance(data.dtype, list): - return [get_type(data[0][i]) for i in range(len(data.dtype))] if data.dtype.metadata is not None and data.dtype.metadata.get('vlen') is not None: return get_type(data[0]) - return data.dtype + return data.dtype, None if len(data) == 0: raise EmptyArrayError() - return get_type(data[0]) + return get_type(data[0], builder_dtype) def check_shape(expected, received): @@ -310,7 +331,7 @@ def validate(self, **kwargs): if not isinstance(value, BaseBuilder): expected = '%s reference' % spec.dtype.reftype try: - value_type = get_type(value) + value_type, _ = get_type(value) ret.append(DtypeError(self.get_spec_loc(spec), expected, value_type)) except EmptyArrayError: # do not validate dtype of empty array. HDMF does not yet set dtype when writing a list/tuple @@ -323,8 +344,8 @@ def validate(self, **kwargs): ret.append(IncorrectDataType(self.get_spec_loc(spec), spec.dtype.target_type, data_type)) else: try: - dtype = get_type(value) - if not check_type(spec.dtype, dtype): + dtype, string_format = get_type(value) + if not check_type(spec.dtype, dtype, string_format): ret.append(DtypeError(self.get_spec_loc(spec), spec.dtype, dtype)) except EmptyArrayError: # do not validate dtype of empty array. HDMF does not yet set dtype when writing a list/tuple @@ -385,14 +406,17 @@ def validate(self, **kwargs): data = builder.data if self.spec.dtype is not None: try: - dtype = get_type(data) - if not check_type(self.spec.dtype, dtype): + dtype, string_format = get_type(data, builder.dtype) + if not check_type(self.spec.dtype, dtype, string_format): ret.append(DtypeError(self.get_spec_loc(self.spec), self.spec.dtype, dtype, location=self.get_builder_loc(builder))) except EmptyArrayError: # do not validate dtype of empty array. HDMF does not yet set dtype when writing a list/tuple pass - shape = get_data_shape(data) + if isinstance(builder.dtype, list): + shape = (len(builder.data), ) # only 1D datasets with compound types are supported + else: + shape = get_data_shape(data) if not check_shape(self.spec.shape, shape): if shape is None: ret.append(ExpectedArrayError(self.get_spec_loc(self.spec), self.spec.shape, str(data), diff --git a/tests/unit/validator_tests/test_validate.py b/tests/unit/validator_tests/test_validate.py index 22d5a28bc..d4c132cd7 100644 --- a/tests/unit/validator_tests/test_validate.py +++ b/tests/unit/validator_tests/test_validate.py @@ -116,7 +116,18 @@ def getSpecs(self): ), DatasetSpec('an example time dataset', 'isodatetime', name='datetime'), DatasetSpec('an example time dataset', 'isodatetime', name='date', quantity='?'), - DatasetSpec('an array of times', 'isodatetime', name='time_array', dims=('num_times',), shape=(None,)) + DatasetSpec('an array of times', 'isodatetime', name='time_array', dims=('num_times',), shape=(None,)), + DatasetSpec( + doc='an array with compound dtype that includes an isodatetime', + dtype=[ + DtypeSpec('x', doc='x', dtype='int'), + DtypeSpec('y', doc='y', dtype='isodatetime'), + ], + name='cpd_array', + dims=('num_times',), + shape=(None,), + quantity="?", + ), ], attributes=[AttributeSpec('attr1', 'an example string attribute', 'text')]) return ret, @@ -129,7 +140,15 @@ def test_valid_isodatetime(self): DatasetBuilder('data', 100, attributes={'attr2': 10}), DatasetBuilder('datetime', datetime(2017, 5, 1, 12, 0, 0)), DatasetBuilder('date', date(2017, 5, 1)), - DatasetBuilder('time_array', [datetime(2017, 5, 1, 12, 0, 0, tzinfo=tzlocal())]) + DatasetBuilder('time_array', [datetime(2017, 5, 1, 12, 0, 0, tzinfo=tzlocal())]), + DatasetBuilder( + name='cpd_array', + data=[(1, datetime(2017, 5, 1, 12, 0, 0, tzinfo=tzlocal()))], + dtype=[ + DtypeSpec('x', doc='x', dtype='int'), + DtypeSpec('y', doc='y', dtype='isodatetime'), + ], + ), ] ) validator = self.vmap.get_validator('Bar') @@ -143,7 +162,7 @@ def test_invalid_isodatetime(self): datasets=[ DatasetBuilder('data', 100, attributes={'attr2': 10}), DatasetBuilder('datetime', 100), - DatasetBuilder('time_array', [datetime(2017, 5, 1, 12, 0, 0, tzinfo=tzlocal())]) + DatasetBuilder('time_array', [datetime(2017, 5, 1, 12, 0, 0, tzinfo=tzlocal())]), ] ) validator = self.vmap.get_validator('Bar') @@ -152,18 +171,44 @@ def test_invalid_isodatetime(self): self.assertValidationError(result[0], DtypeError, name='Bar/datetime') def test_invalid_isodatetime_array(self): - builder = GroupBuilder('my_bar', - attributes={'data_type': 'Bar', 'attr1': 'a string attribute'}, - datasets=[DatasetBuilder('data', 100, attributes={'attr2': 10}), - DatasetBuilder('datetime', - datetime(2017, 5, 1, 12, 0, 0, tzinfo=tzlocal())), - DatasetBuilder('time_array', - datetime(2017, 5, 1, 12, 0, 0, tzinfo=tzlocal()))]) + builder = GroupBuilder( + 'my_bar', + attributes={'data_type': 'Bar', 'attr1': 'a string attribute'}, + datasets=[ + DatasetBuilder('data', 100, attributes={'attr2': 10}), + DatasetBuilder('datetime', datetime(2017, 5, 1, 12, 0, 0, tzinfo=tzlocal())), + DatasetBuilder('time_array', datetime(2017, 5, 1, 12, 0, 0, tzinfo=tzlocal())), + ], + ) validator = self.vmap.get_validator('Bar') result = validator.validate(builder) self.assertEqual(len(result), 1) self.assertValidationError(result[0], ExpectedArrayError, name='Bar/time_array') + def test_invalid_cpd_isodatetime_array(self): + builder = GroupBuilder( + 'my_bar', + attributes={'data_type': 'Bar', 'attr1': 'a string attribute'}, + datasets=[ + DatasetBuilder('data', 100, attributes={'attr2': 10}), + DatasetBuilder('datetime', datetime(2017, 5, 1, 12, 0, 0)), + DatasetBuilder('date', date(2017, 5, 1)), + DatasetBuilder('time_array', [datetime(2017, 5, 1, 12, 0, 0, tzinfo=tzlocal())]), + DatasetBuilder( + name='cpd_array', + data=[(1, "wrong")], + dtype=[ + DtypeSpec('x', doc='x', dtype='int'), + DtypeSpec('y', doc='y', dtype='isodatetime'), + ], + ), + ], + ) + validator = self.vmap.get_validator('Bar') + result = validator.validate(builder) + self.assertEqual(len(result), 1) + self.assertValidationError(result[0], DtypeError, name='Bar/cpd_array') + class TestNestedTypes(ValidatorTestBase): @@ -508,6 +553,58 @@ def test_empty_nparray(self): # TODO test shape validation more completely +class TestStringDatetime(TestCase): + + def test_str_coincidental_isodatetime(self): + """Test validation of a text spec allows a string that coincidentally matches the isodatetime format.""" + spec_catalog = SpecCatalog() + spec = GroupSpec( + doc='A test group specification with a data type', + data_type_def='Bar', + datasets=[ + DatasetSpec(doc='an example scalar dataset', dtype="text", name='data1'), + DatasetSpec(doc='an example 1D dataset', dtype="text", name='data2', shape=(None, )), + DatasetSpec( + doc='an example 1D compound dtype dataset', + dtype=[ + DtypeSpec('x', doc='x', dtype='int'), + DtypeSpec('y', doc='y', dtype='text'), + ], + name='data3', + shape=(None, ), + ), + ], + attributes=[ + AttributeSpec(name='attr1', doc='an example scalar attribute', dtype="text"), + AttributeSpec(name='attr2', doc='an example 1D attribute', dtype="text", shape=(None, )), + ] + ) + spec_catalog.register_spec(spec, 'test.yaml') + namespace = SpecNamespace( + 'a test namespace', CORE_NAMESPACE, [{'source': 'test.yaml'}], version='0.1.0', catalog=spec_catalog + ) + vmap = ValidatorMap(namespace) + + bar_builder = GroupBuilder( + name='my_bar', + attributes={'data_type': 'Bar', 'attr1': "2023-01-01", 'attr2': ["2023-01-01"]}, + datasets=[ + DatasetBuilder(name='data1', data="2023-01-01"), + DatasetBuilder(name='data2', data=["2023-01-01"]), + DatasetBuilder( + name='data3', + data=[(1, "2023-01-01")], + dtype=[ + DtypeSpec('x', doc='x', dtype='int'), + DtypeSpec('y', doc='y', dtype='text'), + ], + ), + ], + ) + results = vmap.validate(bar_builder) + self.assertEqual(len(results), 0) + + class TestLinkable(TestCase): def set_up_spec(self):