From 3633396fd7f094957e4da53de3e852ddd66718ad Mon Sep 17 00:00:00 2001 From: IanCa Date: Wed, 29 Nov 2023 18:19:34 -0600 Subject: [PATCH 1/2] Fix bugs/improve unit and def validation code --- hed/models/hed_tag.py | 18 ++++---- hed/schema/hed_schema.py | 11 ----- hed/validator/def_validator.py | 61 +++++++++++++++------------- hed/validator/hed_validator.py | 25 +++++++++--- hed/validator/tag_util/char_util.py | 16 +++++--- hed/validator/tag_util/class_util.py | 33 +++++++++------ spec_tests/test_errors.py | 7 +--- tests/models/test_hed_tag.py | 10 ++--- 8 files changed, 102 insertions(+), 79 deletions(-) diff --git a/hed/models/hed_tag.py b/hed/models/hed_tag.py index 7f4b5321..12577b36 100644 --- a/hed/models/hed_tag.py +++ b/hed/models/hed_tag.py @@ -316,9 +316,12 @@ def _calculate_to_canonical_forms(self, hed_schema): return tag_issues - def get_stripped_unit_value(self): + def get_stripped_unit_value(self, extension_text): """ Return the extension divided into value and units, if the units are valid. + Parameters: + extension_text (str): The text to split, incase it's a portion of a tag. + Returns: stripped_unit_value (str): The extension portion with the units removed. unit (str or None): None if no valid unit found. @@ -328,7 +331,7 @@ def get_stripped_unit_value(self): """ tag_unit_classes = self.unit_classes - stripped_value, unit, _ = self._get_tag_units_portion(tag_unit_classes) + stripped_value, unit, _ = HedTag._get_tag_units_portion(extension_text, tag_unit_classes) if stripped_value: return stripped_value, unit @@ -354,7 +357,7 @@ def value_as_default_unit(self): unit_entry = self.default_unit unit = unit_entry.name else: - stripped_value, unit, unit_entry = self._get_tag_units_portion(tag_unit_classes) + stripped_value, unit, unit_entry = HedTag._get_tag_units_portion(self.extension, tag_unit_classes) if stripped_value: if unit_entry.get_conversion_factor(unit) is not None: @@ -544,7 +547,8 @@ def _get_schema_namespace(org_tag): return org_tag[:first_colon + 1] return "" - def _get_tag_units_portion(self, tag_unit_classes): + @staticmethod + def _get_tag_units_portion(extension_text, tag_unit_classes): """ Check that this string has valid units and remove them. Parameters: @@ -555,19 +559,19 @@ def _get_tag_units_portion(self, tag_unit_classes): This is filled in if there are no units as well. unit (UnitEntry or None): The matching unit entry if one is found """ - value, _, units = self.extension.rpartition(" ") + value, _, units = extension_text.rpartition(" ") if not units: return None, None, None for unit_class_entry in tag_unit_classes.values(): all_valid_unit_permutations = unit_class_entry.derivative_units - possible_match = self._find_modifier_unit_entry(units, all_valid_unit_permutations) + possible_match = HedTag._find_modifier_unit_entry(units, all_valid_unit_permutations) if possible_match and not possible_match.has_attribute(HedKey.UnitPrefix): return value, units, possible_match # Repeat the above, but as a prefix - possible_match = self._find_modifier_unit_entry(value, all_valid_unit_permutations) + possible_match = HedTag._find_modifier_unit_entry(value, all_valid_unit_permutations) if possible_match and possible_match.has_attribute(HedKey.UnitPrefix): return units, value, possible_match diff --git a/hed/schema/hed_schema.py b/hed/schema/hed_schema.py index 3b1bbdea..a58cc9c4 100644 --- a/hed/schema/hed_schema.py +++ b/hed/schema/hed_schema.py @@ -753,17 +753,6 @@ def _get_attributes_for_section(self, key_class): # Semi private function used to create a schema in memory(usually from a source file) # =============================================== def _add_tag_to_dict(self, long_tag_name, new_entry, key_class): - # Add the InLibrary attribute to any library schemas as they are loaded - # These are later removed when they are saved out, if saving unmerged - # if self.library and (not self.with_standard or (not self.merged and self.with_standard)): - # # only add it if not already present - This is a rare case - # Todo ian: I think this should be moved up one level for parity with the other loading changes - # .library will be updated to potentially be a list - # Cannot save schema if .library is a list - # - # if not new_entry.has_attribute(HedKey.InLibrary): - # new_entry._set_attribute_value(HedKey.InLibrary, self.library) - section = self._sections[key_class] return section._add_to_dict(long_tag_name, new_entry) diff --git a/hed/validator/def_validator.py b/hed/validator/def_validator.py index e108ef48..8ca30022 100644 --- a/hed/validator/def_validator.py +++ b/hed/validator/def_validator.py @@ -35,6 +35,7 @@ def validate_def_tags(self, hed_string_obj, hed_validator=None): if self._label_tag_name not in hed_string_lower: return [] + # This is needed primarily to validate the contents of a def-expand matches the default. def_issues = [] # We need to check for labels to expand in ALL groups for def_tag, def_expand_group, def_group in hed_string_obj.find_def_tags(recursive=True): @@ -42,30 +43,6 @@ def validate_def_tags(self, hed_string_obj, hed_validator=None): return def_issues - @staticmethod - def _validate_def_units(def_tag, placeholder_tag, hed_validator, is_def_expand_tag): - """Validate units and value classes on def/def-expand tags - - Parameters: - def_tag(HedTag): The source tag - placeholder_tag(HedTag): The placeholder tag this def fills in - hed_validator(HedValidator): Used to validate the units/values - is_def_expand_tag(bool): If the given def_tag is a def-expand tag or not. - - Returns: - issues(list): Issues found from validating placeholders. - """ - def_issues = [] - error_code = ValidationErrors.DEF_INVALID - if is_def_expand_tag: - error_code = ValidationErrors.DEF_EXPAND_INVALID - - def_issues += hed_validator.validate_units(placeholder_tag, - report_as=def_tag, - error_code=error_code) - - return def_issues - @staticmethod def _report_missing_or_invalid_value(def_tag, def_entry, is_def_expand_tag): """Returns the correct error for this type of def tag @@ -121,15 +98,43 @@ def _validate_def_contents(self, def_tag, def_expand_group, hed_validator): def_issues += ErrorHandler.format_error(ValidationErrors.HED_DEF_EXPAND_INVALID, tag=def_tag, actual_def=def_contents, found_def=def_expand_group) - if def_entry.takes_value and hed_validator: - placeholder_tag = def_contents.get_first_group().find_placeholder_tag() - def_issues += self._validate_def_units(def_tag, placeholder_tag, hed_validator, - is_def_expand_tag) else: def_issues += self._report_missing_or_invalid_value(def_tag, def_entry, is_def_expand_tag) return def_issues + def validate_def_value_units(self, def_tag, hed_validator): + """Equivalent to HedValidator.validate_units for the special case of a Def or Def-expand tag""" + tag_label, _, placeholder = def_tag.extension.partition('/') + is_def_expand_tag = def_tag.short_base_tag == DefTagNames.DEF_EXPAND_ORG_KEY + + def_entry = self.defs.get(tag_label.lower()) + # These errors will be caught as can't match definition + if def_entry is None: + return [] + + error_code = ValidationErrors.DEF_INVALID + if is_def_expand_tag: + error_code = ValidationErrors.DEF_EXPAND_INVALID + + def_issues = [] + + # Validate the def name vs the name class + def_issues += hed_validator.validate_units(def_tag, + tag_label, + error_code=error_code) + + def_contents = def_entry.get_definition(def_tag, placeholder_value=placeholder, return_copy_of_tag=True) + if def_contents and def_entry.takes_value and hed_validator: + placeholder_tag = def_contents.get_first_group().find_placeholder_tag() + def_issues += hed_validator.validate_units(placeholder_tag, + placeholder, + report_as=def_tag, + error_code=error_code, + index_offset=len(tag_label) + 1) + + return def_issues + def validate_onset_offset(self, hed_string_obj): """ Validate onset/offset diff --git a/hed/validator/hed_validator.py b/hed/validator/hed_validator.py index 55f1ebfe..2e509bb1 100644 --- a/hed/validator/hed_validator.py +++ b/hed/validator/hed_validator.py @@ -140,30 +140,40 @@ def check_tag_formatting(self, original_tag): return validation_issues - def validate_units(self, original_tag, report_as=None, error_code=None): + def validate_units(self, original_tag, validate_text=None, report_as=None, error_code=None, + index_offset=0): """Validate units and value classes Parameters: original_tag(HedTag): The source tag + validate_text (str): the text we want to validate, if not the full extension. report_as(HedTag): Report the error tag as coming from a different one. Mostly for definitions that expand. error_code(str): The code to override the error as. Again mostly for def/def-expand tags. + index_offset(int): Offset into the extension validate_text starts at Returns: issues(list): Issues found from units """ + if validate_text is None: + validate_text = original_tag.extension issues = [] if original_tag.is_unit_class_tag(): issues += self._unit_validator.check_tag_unit_class_units_are_valid(original_tag, + validate_text, report_as=report_as, - error_code=error_code) + error_code=error_code, + index_offset=index_offset) elif original_tag.is_value_class_tag(): issues += self._unit_validator.check_tag_value_class_valid(original_tag, + validate_text, report_as=report_as, - error_code=error_code) - # todo: potentially make this one have a report_as + error_code=error_code, + index_offset=index_offset) elif original_tag.extension: - issues += self._char_validator.check_for_invalid_extension_chars(original_tag) + issues += self._char_validator.check_for_invalid_extension_chars(original_tag, + validate_text, + index_offset=index_offset) return issues @@ -198,6 +208,9 @@ def _validate_individual_tags_in_hed_string(self, hed_string_obj, allow_placehol run_individual_tag_validators(hed_tag, allow_placeholders=allow_placeholders, is_definition=is_definition) - validation_issues += self.validate_units(hed_tag) + if hed_tag.short_base_tag == DefTagNames.DEF_ORG_KEY or hed_tag.short_base_tag == DefTagNames.DEF_EXPAND_ORG_KEY: + validation_issues += self._def_validator.validate_def_value_units(hed_tag, self) + else: + validation_issues += self.validate_units(hed_tag) return validation_issues diff --git a/hed/validator/tag_util/char_util.py b/hed/validator/tag_util/char_util.py index 219a718c..873b8b10 100644 --- a/hed/validator/tag_util/char_util.py +++ b/hed/validator/tag_util/char_util.py @@ -54,11 +54,15 @@ def check_tag_invalid_chars(self, original_tag, allow_placeholders): validation_issues += self._check_invalid_chars(original_tag.org_base_tag, allowed_chars, original_tag) return validation_issues - def check_for_invalid_extension_chars(self, original_tag): + def check_for_invalid_extension_chars(self, original_tag, validate_text, error_code=None, + index_offset=0): """Report invalid characters in extension/value. Parameters: original_tag (HedTag): The original tag that is used to report the error. + validate_text (str): the text we want to validate, if not the full extension. + error_code(str): The code to override the error as. Again mostly for def/def-expand tags. + index_offset(int): Offset into the extension validate_text starts at Returns: list: Validation issues. Each issue is a dictionary. @@ -66,11 +70,12 @@ def check_for_invalid_extension_chars(self, original_tag): allowed_chars = self.TAG_ALLOWED_CHARS allowed_chars += self.DEFAULT_ALLOWED_PLACEHOLDER_CHARS allowed_chars += " " - return self._check_invalid_chars(original_tag.extension, allowed_chars, original_tag, - starting_index=len(original_tag.org_base_tag) + 1) + return self._check_invalid_chars(validate_text, allowed_chars, original_tag, + starting_index=len(original_tag.org_base_tag) + 1 + index_offset, + error_code=error_code) @staticmethod - def _check_invalid_chars(check_string, allowed_chars, source_tag, starting_index=0): + def _check_invalid_chars(check_string, allowed_chars, source_tag, starting_index=0, error_code=None): validation_issues = [] for i, character in enumerate(check_string): if character.isalnum(): @@ -82,7 +87,8 @@ def _check_invalid_chars(check_string, allowed_chars, source_tag, starting_index continue validation_issues += ErrorHandler.format_error(ValidationErrors.INVALID_TAG_CHARACTER, tag=source_tag, index_in_tag=starting_index + i, - index_in_tag_end=starting_index + i + 1) + index_in_tag_end=starting_index + i + 1, + actual_error=error_code) return validation_issues @staticmethod diff --git a/hed/validator/tag_util/class_util.py b/hed/validator/tag_util/class_util.py index 1aba8583..966f6009 100644 --- a/hed/validator/tag_util/class_util.py +++ b/hed/validator/tag_util/class_util.py @@ -5,8 +5,6 @@ from hed.errors.error_reporter import ErrorHandler from hed.errors.error_types import ValidationErrors -from hed.schema.hed_schema_constants import HedKey -import functools class UnitValueValidator: @@ -18,6 +16,7 @@ class UnitValueValidator: DIGIT_OR_POUND_EXPRESSION = r'^(-?[\d.]+(?:e-?\d+)?|#)$' VALUE_CLASS_ALLOWED_CACHE=20 + def __init__(self, value_validators=None): """ Validates the unit and value classes on a given tag. @@ -39,11 +38,13 @@ def _get_default_value_class_validators(self): return validator_dict - def check_tag_unit_class_units_are_valid(self, original_tag, report_as=None, error_code=None): + def check_tag_unit_class_units_are_valid(self, original_tag, validate_text, report_as=None, error_code=None, + index_offset=0): """ Report incorrect unit class or units. Parameters: original_tag (HedTag): The original tag that is used to report the error. + validate_text (str): The text to validate report_as (HedTag): Report errors as coming from this tag, rather than original_tag. error_code (str): Override error codes Returns: @@ -51,16 +52,17 @@ def check_tag_unit_class_units_are_valid(self, original_tag, report_as=None, err """ validation_issues = [] if original_tag.is_unit_class_tag(): - stripped_value, unit = original_tag.get_stripped_unit_value() + stripped_value, unit = original_tag.get_stripped_unit_value(validate_text) if not unit: # Todo: in theory this should separately validate the number and the units, for units # that are prefixes like $. Right now those are marked as unit invalid AND value_invalid. - bad_units = " " in original_tag.extension + bad_units = " " in validate_text if bad_units: stripped_value = stripped_value.split(" ")[0] - validation_issues += self._check_value_class(original_tag, stripped_value, report_as, error_code) + validation_issues += self._check_value_class(original_tag, stripped_value, report_as, error_code, + index_offset) validation_issues += self._check_units(original_tag, bad_units, report_as) # We don't want to give this overall error twice @@ -71,17 +73,21 @@ def check_tag_unit_class_units_are_valid(self, original_tag, report_as=None, err return validation_issues - def check_tag_value_class_valid(self, original_tag, report_as=None, error_code=None): + def check_tag_value_class_valid(self, original_tag, validate_text, report_as=None, error_code=None, + index_offset=0): """ Report an invalid value portion. Parameters: original_tag (HedTag): The original tag that is used to report the error. + validate_text (str): The text to validate report_as (HedTag): Report errors as coming from this tag, rather than original_tag. error_code (str): Override error codes + index_offset(int): Offset into the extension validate_text starts at + Returns: list: Validation issues. """ - return self._check_value_class(original_tag, original_tag.extension, report_as, error_code) + return self._check_value_class(original_tag, validate_text, report_as, error_code, index_offset) # char_sets = { # "letters": set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"), @@ -118,7 +124,7 @@ def _get_problem_indexes(self, original_tag, stripped_value): # indexes = [index for index, char in enumerate(stripped_value) if char not in allowed_characters] # pass - def _check_value_class(self, original_tag, stripped_value, report_as, error_code=None): + def _check_value_class(self, original_tag, stripped_value, report_as, error_code=None, index_offset=0): """Returns any issues found if this is a value tag""" # todo: This function needs to check for allowed characters, not just {} validation_issues = [] @@ -126,9 +132,12 @@ def _check_value_class(self, original_tag, stripped_value, report_as, error_code report_as = report_as if report_as else original_tag problem_indexes = self._get_problem_indexes(original_tag, stripped_value) for char, index in problem_indexes: - error_code = ValidationErrors.CURLY_BRACE_UNSUPPORTED_HERE \ - if char in "{}" else ValidationErrors.INVALID_TAG_CHARACTER - validation_issues += ErrorHandler.format_error(error_code, + tag_code = ValidationErrors.CURLY_BRACE_UNSUPPORTED_HERE if ( + char in "{}") else ValidationErrors.INVALID_TAG_CHARACTER + + index_adj = len(report_as.org_base_tag) - len(original_tag.org_base_tag) + index += index_adj + index_offset + validation_issues += ErrorHandler.format_error(tag_code, tag=report_as, index_in_tag=index, index_in_tag_end=index + 1) if not self._validate_value_class_portion(original_tag, stripped_value): diff --git a/spec_tests/test_errors.py b/spec_tests/test_errors.py index 3e87fdbd..f4c68fed 100644 --- a/spec_tests/test_errors.py +++ b/spec_tests/test_errors.py @@ -36,9 +36,6 @@ def run_single_test(self, test_file): test_info = json.load(fp) for info in test_info: error_code = info['error_code'] - verify_code = True - # To be deprecated once we add this to all tests - self._verify_code = verify_code if error_code in skip_tests: print(f"Skipping {error_code} test because: {skip_tests[error_code]}") continue @@ -47,7 +44,7 @@ def run_single_test(self, test_file): print(f"Skipping {name} test because: {skip_tests[name]}") continue - # if name != "attribute-invalid-in-library": + # if name != "sidecar-braces-invalid-spot": # continue description = info['description'] schema = info['schema'] @@ -79,7 +76,7 @@ def report_result(self, expected_result, issues, error_code, description, name, print(f"Passed '{test_type}' (which should fail) '{name}': {test}") print(get_printable_issue_string(issues)) self.fail_count.append(name) - elif self._verify_code: + else: if any(issue['code'] == error_code for issue in issues): return print(f"{error_code}: {description}") diff --git a/tests/models/test_hed_tag.py b/tests/models/test_hed_tag.py index d21c46ce..f6be18f6 100644 --- a/tests/models/test_hed_tag.py +++ b/tests/models/test_hed_tag.py @@ -143,11 +143,11 @@ def test_strip_off_units_from_value(self): # stripped_dollars_string_no_space = dollars_string_no_space._get_tag_units_portion(currency_units) # stripped_dollars_string = dollars_string._get_tag_units_portion(currency_units) # stripped_dollars_string_invalid = dollars_string_invalid._get_tag_units_portion(currency_units) - stripped_volume_string, _, _ = volume_string._get_tag_units_portion(volume_units) - stripped_volume_string_no_space, _, _ = volume_string_no_space._get_tag_units_portion(volume_units) - stripped_prefixed_volume_string, _, _ = prefixed_volume_string._get_tag_units_portion(volume_units) - stripped_invalid_volume_string, _, _ = invalid_volume_string._get_tag_units_portion(volume_units) - stripped_invalid_distance_string, _, _ = invalid_distance_string._get_tag_units_portion(distance_units) + stripped_volume_string, _, _ = HedTag._get_tag_units_portion(volume_string.extension, volume_units) + stripped_volume_string_no_space, _, _ = HedTag._get_tag_units_portion(volume_string_no_space.extension, volume_units) + stripped_prefixed_volume_string, _, _ = HedTag._get_tag_units_portion(prefixed_volume_string.extension, volume_units) + stripped_invalid_volume_string, _, _ = HedTag._get_tag_units_portion(invalid_volume_string.extension, volume_units) + stripped_invalid_distance_string, _, _ = HedTag._get_tag_units_portion(invalid_distance_string.extension, distance_units) # self.assertEqual(stripped_dollars_string_no_space, None) # self.assertEqual(stripped_dollars_string, '25.99') # self.assertEqual(stripped_dollars_string_invalid, None) From 3e2770c7f49493d26d0254df46785ce124463514 Mon Sep 17 00:00:00 2001 From: IanCa Date: Wed, 29 Nov 2023 18:40:41 -0600 Subject: [PATCH 2/2] Fix bug and spelling --- hed/models/hed_tag.py | 2 +- hed/validator/def_validator.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/hed/models/hed_tag.py b/hed/models/hed_tag.py index 12577b36..bdbfa852 100644 --- a/hed/models/hed_tag.py +++ b/hed/models/hed_tag.py @@ -320,7 +320,7 @@ def get_stripped_unit_value(self, extension_text): """ Return the extension divided into value and units, if the units are valid. Parameters: - extension_text (str): The text to split, incase it's a portion of a tag. + extension_text (str): The text to split, in case it's a portion of a tag. Returns: stripped_unit_value (str): The extension portion with the units removed. diff --git a/hed/validator/def_validator.py b/hed/validator/def_validator.py index 8ca30022..13fcfa5f 100644 --- a/hed/validator/def_validator.py +++ b/hed/validator/def_validator.py @@ -127,6 +127,9 @@ def validate_def_value_units(self, def_tag, hed_validator): def_contents = def_entry.get_definition(def_tag, placeholder_value=placeholder, return_copy_of_tag=True) if def_contents and def_entry.takes_value and hed_validator: placeholder_tag = def_contents.get_first_group().find_placeholder_tag() + # Handle the case where they're adding a unit as part of a placeholder. eg Speed/# mph + if placeholder_tag: + placeholder = placeholder_tag.extension def_issues += hed_validator.validate_units(placeholder_tag, placeholder, report_as=def_tag,