From 8604bef245f5ff12c4a9acea522be82feb316396 Mon Sep 17 00:00:00 2001 From: IanCa Date: Fri, 29 Mar 2024 15:37:42 -0500 Subject: [PATCH] Improve schema character validation to match the new spec/utf8 support reorganize some schema validation/loading code --- hed/errors/error_types.py | 1 + hed/errors/schema_error_messages.py | 7 + hed/schema/hed_schema.py | 69 ----- hed/schema/hed_schema_constants.py | 50 +++- hed/schema/hed_schema_io.py | 2 +- hed/schema/schema_compliance.py | 113 ++++--- hed/schema/schema_header_util.py | 97 ++++++ hed/schema/schema_io/base2schema.py | 60 +++- hed/schema/schema_io/wiki2schema.py | 3 +- hed/schema/schema_io/xml2schema.py | 11 +- hed/schema/schema_validation_util.py | 280 +++++++----------- .../schema_validation_util_deprecated.py | 80 +++++ hed/validator/tag_util/class_util.py | 68 ++--- spec_tests/test_errors.py | 9 +- tests/schema/test_hed_schema.py | 22 -- tests/schema/test_schema_validation_util.py | 28 +- .../test_schema_validation_util_deprecated.py | 69 +++++ 17 files changed, 584 insertions(+), 385 deletions(-) create mode 100644 hed/schema/schema_header_util.py create mode 100644 hed/schema/schema_validation_util_deprecated.py create mode 100644 tests/schema/test_schema_validation_util_deprecated.py diff --git a/hed/errors/error_types.py b/hed/errors/error_types.py index 1fa221bf..c7b279ce 100644 --- a/hed/errors/error_types.py +++ b/hed/errors/error_types.py @@ -123,6 +123,7 @@ class SchemaWarnings: SCHEMA_CHARACTER_INVALID = "SCHEMA_CHARACTER_INVALID" SCHEMA_INVALID_CAPITALIZATION = 'invalidCaps' SCHEMA_NON_PLACEHOLDER_HAS_CLASS = 'SCHEMA_NON_PLACEHOLDER_HAS_CLASS' + SCHEMA_PROLOGUE_CHARACTER_INVALID = "SCHEMA_PROLOGUE_CHARACTER_INVALID" class SchemaAttributeErrors: diff --git a/hed/errors/schema_error_messages.py b/hed/errors/schema_error_messages.py index f2a7e4f4..6a794059 100644 --- a/hed/errors/schema_error_messages.py +++ b/hed/errors/schema_error_messages.py @@ -23,6 +23,13 @@ def schema_error_unknown_attribute(attribute_name, source_tag): f"or was used outside of it's defined class." +@hed_error(SchemaWarnings.SCHEMA_PROLOGUE_CHARACTER_INVALID, default_severity=ErrorSeverity.WARNING, + actual_code=SchemaWarnings.SCHEMA_CHARACTER_INVALID) +def schema_error_invalid_character_prologue(char_index, source_string, section_name): + invalid_char = source_string[char_index] + return f"'{section_name}' has invalid character '{invalid_char}' at position {char_index} of string: {source_string}" + + @hed_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_DESC, default_severity=ErrorSeverity.WARNING, actual_code=SchemaWarnings.SCHEMA_CHARACTER_INVALID) def schema_warning_invalid_chars_desc(desc_string, tag_name, problem_char, char_index): diff --git a/hed/schema/hed_schema.py b/hed/schema/hed_schema.py index 19732d21..34164204 100644 --- a/hed/schema/hed_schema.py +++ b/hed/schema/hed_schema.py @@ -635,75 +635,6 @@ def _initialize_attributes(self, key_class): # =============================================== # Getters used to write out schema primarily. # =============================================== - def get_desc_iter(self): - """ Return an iterator over all the descriptions. - - Yields: - tuple: - - str: The tag node name. - - str: The description associated with the node. - - """ - for section in self._sections.values(): - for tag_entry in section.values(): - if tag_entry.description: - yield tag_entry.name, tag_entry.description - - def get_tag_description(self, tag_name, key_class=HedSectionKey.Tags): - """ Return the description associated with the tag. - - Parameters: - tag_name (str): A hed tag name(or unit/unit modifier etc) with proper capitalization. - key_class (str): A string indicating type of description (e.g. All tags, Units, Unit modifier). - The default is HedSectionKey.Tags. - - Returns: - str: A description of the specified tag. - - """ - tag_entry = self._get_tag_entry(tag_name, key_class) - if tag_entry: - return tag_entry.description - - def get_all_schema_tags(self, return_last_term=False): - """ Get a list of all hed terms from the schema. - - Returns: - list: A list of all terms(short tags) from the schema. - - Notes: - Compatible with Hed2 or Hed3. - - """ - final_list = [] - for lower_tag, tag_entry in self.tags.items(): - if return_last_term: - final_list.append(tag_entry.name.split('/')[-1]) - else: - final_list.append(tag_entry.name) - - return final_list - - def get_unknown_attributes(self): - """ Retrieve the current list of unknown attributes. - - Returns: - dict: The keys are attribute names and the values are lists of tags with this attribute. - - Notes: - - This includes attributes found in the wrong section for example unitClass attribute found on a Tag. - - The return tag list is in long form. - - """ - unknown_attributes = {} - for section in self._sections.values(): - for entry in section.values(): - if entry._unknown_attributes: - for attribute_name in entry._unknown_attributes: - unknown_attributes.setdefault(attribute_name, []).append(entry.name) - - return unknown_attributes - def get_tag_attribute_names(self): """ Return a dict of all allowed tag attributes. diff --git a/hed/schema/hed_schema_constants.py b/hed/schema/hed_schema_constants.py index ad22e374..8067fa9e 100644 --- a/hed/schema/hed_schema_constants.py +++ b/hed/schema/hed_schema_constants.py @@ -89,9 +89,51 @@ class HedKey: } character_types = { - "letters": set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"), - "blank": set(" "), + "ascii": set([chr(x) for x in range(0, 127)]), + "nonascii": "nonascii", # Special case for all other printable unicode characters + "printable": set([chr(x) for x in range(32, 127)]), + "lowercase": set("abcdefghijklmnopqrstuvwxyz"), + "uppercase": set("ABCDEFGHIJKLMNOPQRSTUVWXYZ"), "digits": set("0123456789"), - "alphanumeric": set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"), - "nonascii": "nonascii" # Special case for all other printable unicode characters + "tab": set("\t"), + "newline": set("\n"), + "blank": set(" "), + "exclamation": set("!"), + "double-quote": set('"'), + "number-sign": set("#"), + "dollar": set("$"), + "percent-sign": set("%"), + "ampersand": set("&"), + "single-quote": set("'"), + "left-paren": set("("), + "right-paren": set(")"), + "asterisk": set("*"), + "plus": set("+"), + "comma": set(","), + "hyphen": set("-"), + "period": set("."), + "slash": set("/"), + "colon": set(":"), + "semicolon": set(";"), + "less-than": set("<"), + "equals": set("="), + "greater-than": set(">"), + "question-mark": set("?"), + "at-sign": set("@"), + "backslash": set("\\"), + "caret": set("^"), + "underscore": set("_"), + "vertical-bar": set("|"), + "tilde": set("~"), } + +banned_delimiters = set(",[]{}") + +# Compound types +character_types["letters"] = character_types["lowercase"] | character_types["uppercase"] +character_types["alphanumeric"] = character_types["letters"] | character_types["digits"] +character_types["text"] = character_types["printable"].copy() +character_types["text"].add("nonascii") +character_types["text"] -= banned_delimiters +character_types["name"] = character_types["alphanumeric"] | character_types["hyphen"] | character_types["period"] | character_types["underscore"] +character_types["name"].add("nonascii") diff --git a/hed/schema/hed_schema_io.py b/hed/schema/hed_schema_io.py index fe26aa11..7137bf02 100644 --- a/hed/schema/hed_schema_io.py +++ b/hed/schema/hed_schema_io.py @@ -11,7 +11,7 @@ from hed.errors.exceptions import HedFileError, HedExceptions from hed.schema.schema_io import schema_util from hed.schema.hed_schema_group import HedSchemaGroup -from hed.schema.schema_validation_util import validate_version_string +from hed.schema.schema_header_util import validate_version_string from collections import defaultdict # from hed.schema.schema_io.owl_constants import ext_to_format from urllib.error import URLError diff --git a/hed/schema/schema_compliance.py b/hed/schema/schema_compliance.py index 4835d994..4549b1f4 100644 --- a/hed/schema/schema_compliance.py +++ b/hed/schema/schema_compliance.py @@ -1,10 +1,12 @@ """ Utilities for HED schema checking. """ from hed.errors.error_types import ErrorContext, SchemaErrors, ErrorSeverity, SchemaAttributeErrors, SchemaWarnings -from hed.errors.error_reporter import ErrorHandler -from hed.schema.hed_schema import HedSchema, HedKey +from hed.errors.error_reporter import ErrorHandler, sort_issues +from hed.schema.hed_schema import HedSchema, HedKey, HedSectionKey from hed.schema import schema_attribute_validators -from hed.schema.schema_validation_util import validate_schema_term, validate_schema_description, schema_version_greater_equal +from hed.schema.schema_validation_util import validate_schema_tag_new, validate_schema_term_new, \ + schema_version_greater_equal, get_allowed_characters_by_name, get_problem_indexes, validate_schema_description_new +from hed.schema.schema_validation_util_deprecated import validate_schema_tag, validate_schema_description, verify_no_brackets def check_compliance(hed_schema, check_for_warnings=True, name=None, error_handler=None): @@ -26,19 +28,20 @@ def check_compliance(hed_schema, check_for_warnings=True, name=None, error_handl raise ValueError("To check compliance of a HedGroupSchema, call self.check_compliance on the schema itself.") error_handler = error_handler if error_handler else ErrorHandler(check_for_warnings) - validator = SchemaValidator(hed_schema, check_for_warnings, error_handler) + validator = SchemaValidator(hed_schema, error_handler) issues_list = [] if not name: name = hed_schema.filename error_handler.push_error_context(ErrorContext.FILE_NAME, name) - issues_list += validator.check_unknown_attributes() + issues_list += validator.check_prologue_epilogue() + issues_list += validator.check_invalid_chars() issues_list += validator.check_attributes() issues_list += validator.check_duplicate_names() - issues_list += validator.check_invalid_chars() - error_handler.pop_error_context() + + issues_list = sort_issues(issues_list) return issues_list @@ -61,34 +64,45 @@ class SchemaValidator: HedKey.InLibrary: [schema_attribute_validators.in_library_check] } # Known attribute validators - def __init__(self, hed_schema, check_for_warnings=True, error_handler=None): + def __init__(self, hed_schema, error_handler): self.hed_schema = hed_schema - self._check_for_warnings = check_for_warnings self.error_handler = error_handler - - def check_unknown_attributes(self): - """Returns issues for any unknown attributes in any section""" - unknown_attributes = self.hed_schema.get_unknown_attributes() - issues_list = [] - if unknown_attributes: - for attribute_name, source_tags in unknown_attributes.items(): - for tag in source_tags: - issues_list += self.error_handler.format_error_with_context(SchemaAttributeErrors.SCHEMA_ATTRIBUTE_INVALID, - attribute_name, - source_tag=tag) - return issues_list + self._new_character_validation = schema_version_greater_equal(self.hed_schema, "8.3.0") + + def check_prologue_epilogue(self): + issues = [] + if self._new_character_validation: + character_set = get_allowed_characters_by_name(["text", "newline"]) + indexes = get_problem_indexes(self.hed_schema.prologue, character_set) + for _, index in indexes: + issues += ErrorHandler.format_error(SchemaWarnings.SCHEMA_PROLOGUE_CHARACTER_INVALID, char_index=index, + source_string=self.hed_schema.prologue, + section_name="Prologue") + indexes = get_problem_indexes(self.hed_schema.epilogue, character_set) + for _, index in indexes: + issues += ErrorHandler.format_error(SchemaWarnings.SCHEMA_PROLOGUE_CHARACTER_INVALID, char_index=index, + source_string=self.hed_schema.epilogue, + section_name="Epilogue") + self.error_handler.add_context_and_filter(issues) + return issues def check_attributes(self): """Returns issues from validating known attributes in all sections""" issues_list = [] - for section_key in self.hed_schema._sections: - self.error_handler.push_error_context(ErrorContext.SCHEMA_SECTION, section_key) + for section_key in HedSectionKey: + self.error_handler.push_error_context(ErrorContext.SCHEMA_SECTION, str(section_key)) for tag_entry in self.hed_schema[section_key].values(): self.error_handler.push_error_context(ErrorContext.SCHEMA_TAG, tag_entry.name) + if tag_entry._unknown_attributes: + for attribute_name in tag_entry._unknown_attributes: + issues_list += self.error_handler.format_error_with_context( + SchemaAttributeErrors.SCHEMA_ATTRIBUTE_INVALID, + attribute_name, + source_tag=tag_entry.name) for attribute_name in tag_entry.attributes: # Always check deprecated validators = self.attribute_validators.get(attribute_name, []) \ - + [schema_attribute_validators.attribute_is_deprecated] + + [schema_attribute_validators.attribute_is_deprecated] for validator in validators: self.error_handler.push_error_context(ErrorContext.SCHEMA_ATTRIBUTE, attribute_name) new_issues = validator(self.hed_schema, tag_entry, attribute_name) @@ -104,37 +118,50 @@ def check_attributes(self): def check_duplicate_names(self): """Return issues for any duplicate names in all sections.""" issues_list = [] - for section_key in self.hed_schema._sections: + for section_key in HedSectionKey: for name, duplicate_entries in self.hed_schema[section_key].duplicate_names.items(): values = set(entry.has_attribute(HedKey.InLibrary) for entry in duplicate_entries) error_code = SchemaErrors.SCHEMA_DUPLICATE_NODE if len(values) == 2: error_code = SchemaErrors.SCHEMA_DUPLICATE_FROM_LIBRARY issues_list += self.error_handler.format_error_with_context(error_code, name, - duplicate_tag_list=[entry.name for entry in - duplicate_entries], + duplicate_tag_list=[entry.name for entry in duplicate_entries], section=section_key) return issues_list def check_invalid_chars(self): """Returns issues for bad chars in terms or descriptions.""" issues_list = [] - if self._check_for_warnings: - hed_terms = self.hed_schema.get_all_schema_tags(True) - for hed_term in hed_terms: - issues_list += validate_schema_term(hed_term) - - for tag_name, desc in self.hed_schema.get_desc_iter(): - issues_list += validate_schema_description(tag_name, desc) - - if schema_version_greater_equal(self.hed_schema, "8.3.0"): - for unit_name, unit in self.hed_schema.units.items(): - # Don't check for spaces on deprecated units, to avoid degree Celsius issue - if unit.has_attribute(HedKey.DeprecatedFrom): + section_validators = { + HedSectionKey.Tags: validate_schema_tag, + } + default_validator = verify_no_brackets + description_validator = validate_schema_description + + # If above 8.3.0 use the character class validation instead + if self._new_character_validation: + section_validators = { + HedSectionKey.Tags: validate_schema_tag_new + } + default_validator = validate_schema_term_new + description_validator = validate_schema_description_new + + for section_key in HedSectionKey: + self.error_handler.push_error_context(ErrorContext.SCHEMA_SECTION, str(section_key)) + for entry in self.hed_schema[section_key].values(): + if entry.has_attribute(HedKey.DeprecatedFrom): # Don't validate deprecated terms and descriptions continue - for i, char in enumerate(unit_name): - if char == " ": - issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_TAG, - unit_name, char_index=i, problem_char=char) + self.error_handler.push_error_context(ErrorContext.SCHEMA_TAG, str(entry)) + # Everything but tags just does the generic term check + validator = section_validators.get(section_key, default_validator) + new_issues = [] + if validator: + new_issues += validator(entry) + new_issues += description_validator(entry) + self.error_handler.add_context_and_filter(new_issues) + issues_list += new_issues + self.error_handler.pop_error_context() # Term + self.error_handler.pop_error_context() # section + return issues_list diff --git a/hed/schema/schema_header_util.py b/hed/schema/schema_header_util.py new file mode 100644 index 00000000..8902faa2 --- /dev/null +++ b/hed/schema/schema_header_util.py @@ -0,0 +1,97 @@ + +from semantic_version import Version + +from hed.schema import hed_schema_constants as constants +from hed.errors.exceptions import HedExceptions, HedFileError +from hed.schema.hed_schema_constants import valid_header_attributes + + +def validate_library_name(library_name): + """ Check the validity of the library name. + + Parameters: + library_name (str): Name of the library. + + Returns: + bool or str: If not False, string indicates the issue. + + """ + for i, character in enumerate(library_name): + if not character.isalpha(): + return f"Non alpha character '{character}' at position {i} in '{library_name}'" + if character.isupper(): + return f"Non lowercase character '{character}' at position {i} in '{library_name}'" + + +def validate_version_string(version_string): + """ Check validity of the version. + + Parameters: + version_string (str): A version string. + + Returns: + bool or str: If not False, string indicates the issue. + + """ + try: + Version(version_string) + except ValueError as e: + return str(e) + return False + + +header_attribute_validators = { + constants.VERSION_ATTRIBUTE: (validate_version_string, HedExceptions.SCHEMA_VERSION_INVALID), + constants.LIBRARY_ATTRIBUTE: (validate_library_name, HedExceptions.BAD_HED_LIBRARY_NAME) +} + + +def validate_present_attributes(attrib_dict, name): + """ Validate combinations of attributes + + Parameters: + attrib_dict (dict): Dictionary of attributes to be evaluated. + name (str): File name to use in reporting errors. + + Returns: + list: List of issues. Each issue is a dictionary. + + :raises HedFileError: + - withStandard is found in th header, but a library attribute is not specified + """ + if constants.WITH_STANDARD_ATTRIBUTE in attrib_dict and constants.LIBRARY_ATTRIBUTE not in attrib_dict: + raise HedFileError(HedExceptions.BAD_WITH_STANDARD, + "withStandard header attribute found, but no library attribute is present", + name) + + +def validate_attributes(attrib_dict, name): + """ Validate attributes in the dictionary. + + Parameters: + attrib_dict (dict): Dictionary of attributes to be evaluated. + name (str): name to use in reporting errors. + + Returns: + list: List of issues. Each issue is a dictionary. + + :raises HedFileError: + - Invalid library name + - Version not present + - Invalid combinations of attributes in header + """ + validate_present_attributes(attrib_dict, name) + + for attribute_name, attribute_value in attrib_dict.items(): + if attribute_name in header_attribute_validators: + validator, error_code = header_attribute_validators[attribute_name] + had_error = validator(attribute_value) + if had_error: + raise HedFileError(error_code, had_error, name) + if attribute_name not in valid_header_attributes: + raise HedFileError(HedExceptions.SCHEMA_UNKNOWN_HEADER_ATTRIBUTE, + f"Unknown attribute {attribute_name} found in header line", filename=name) + + if constants.VERSION_ATTRIBUTE not in attrib_dict: + raise HedFileError(HedExceptions.SCHEMA_VERSION_INVALID, + "No version attribute found in header", filename=name) diff --git a/hed/schema/schema_io/base2schema.py b/hed/schema/schema_io/base2schema.py index 75847446..bf6a5e04 100644 --- a/hed/schema/schema_io/base2schema.py +++ b/hed/schema/schema_io/base2schema.py @@ -1,9 +1,10 @@ import copy + from hed.errors.exceptions import HedFileError, HedExceptions -from hed.schema import HedSchema +from hed.schema import HedSchema, hed_schema_constants as constants from hed.schema.hed_schema_constants import HedKey from abc import abstractmethod, ABC -from hed.schema import schema_validation_util +from hed.schema import schema_header_util from hed.schema import hed_schema_constants @@ -44,7 +45,7 @@ def __init__(self, filename, schema_as_string=None, schema=None, file_format=Non # self._schema.filename = filename hed_attributes = self._get_header_attributes(self.input_data) - schema_validation_util.validate_attributes(hed_attributes, name=self.name) + schema_header_util.validate_attributes(hed_attributes, name=self.name) withStandard = hed_attributes.get(hed_schema_constants.WITH_STANDARD_ATTRIBUTE, "") self.library = hed_attributes.get(hed_schema_constants.LIBRARY_ATTRIBUTE, "") @@ -149,3 +150,56 @@ def _add_to_dict_base(self, entry, key_class): entry._set_attribute_value(HedKey.InLibrary, self.library) return self._schema._add_tag_to_dict(entry.name, entry, key_class) + + @staticmethod + def find_rooted_entry(tag_entry, schema, loading_merged): + """ This semi-validates rooted tags, raising an exception on major errors + + Parameters: + tag_entry(HedTagEntry): the possibly rooted tag + schema(HedSchema): The schema being loaded + loading_merged(bool): If this schema was already merged before loading + + Returns: + rooted_tag(HedTagEntry or None): The base tag entry from the standard schema + Returns None if this tag isn't rooted + + :raises HedFileError: + - A rooted attribute is found in a non-paired schema + - A rooted attribute is not a string + - A rooted attribute was found on a non-root node in an unmerged schema. + - A rooted attribute is found on a root node in a merged schema. + - A rooted attribute indicates a tag that doesn't exist in the base schema. + """ + rooted_tag = tag_entry.has_attribute(constants.HedKey.Rooted, return_value=True) + if rooted_tag is not None: + if not schema.with_standard: + raise HedFileError(HedExceptions.ROOTED_TAG_INVALID, + f"Rooted tag attribute found on '{tag_entry.short_tag_name}' in a standard schema.", + schema.name) + + if not isinstance(rooted_tag, str): + raise HedFileError(HedExceptions.ROOTED_TAG_INVALID, + f'Rooted tag \'{tag_entry.short_tag_name}\' is not a string."', + schema.name) + + if tag_entry.parent_name and not loading_merged: + raise HedFileError(HedExceptions.ROOTED_TAG_INVALID, + f'Found rooted tag \'{tag_entry.short_tag_name}\' as a non root node.', + schema.name) + + if not tag_entry.parent_name and loading_merged: + raise HedFileError(HedExceptions.ROOTED_TAG_INVALID, + f'Found rooted tag \'{tag_entry.short_tag_name}\' as a root node in a merged schema.', + schema.name) + + rooted_entry = schema.tags.get(rooted_tag) + if not rooted_entry or rooted_entry.has_attribute(constants.HedKey.InLibrary): + raise HedFileError(HedExceptions.ROOTED_TAG_DOES_NOT_EXIST, + f"Rooted tag '{tag_entry.short_tag_name}' not found in paired standard schema", + schema.name) + + if loading_merged: + return None + + return rooted_entry diff --git a/hed/schema/schema_io/wiki2schema.py b/hed/schema/schema_io/wiki2schema.py index 4e34ae1c..838572f3 100644 --- a/hed/schema/schema_io/wiki2schema.py +++ b/hed/schema/schema_io/wiki2schema.py @@ -6,7 +6,6 @@ from hed.schema.hed_schema_constants import HedSectionKey, HedKey from hed.errors.exceptions import HedFileError, HedExceptions from hed.errors import ErrorContext, error_reporter -from hed.schema import schema_validation_util from hed.schema.schema_io import wiki_constants from .base2schema import SchemaLoader from .wiki_constants import HedWikiSection, SectionStarts, SectionNames @@ -172,7 +171,7 @@ def _read_schema(self, lines): continue try: - rooted_entry = schema_validation_util.find_rooted_entry(tag_entry, self._schema, self._loading_merged) + rooted_entry = self.find_rooted_entry(tag_entry, self._schema, self._loading_merged) if rooted_entry: parent_tags = rooted_entry.long_tag_name.split("/") level_adj = len(parent_tags) diff --git a/hed/schema/schema_io/xml2schema.py b/hed/schema/schema_io/xml2schema.py index b92a4a49..c6d2a4c5 100644 --- a/hed/schema/schema_io/xml2schema.py +++ b/hed/schema/schema_io/xml2schema.py @@ -5,11 +5,8 @@ from defusedxml import ElementTree import xml - -import hed.schema.hed_schema_constants from hed.errors.exceptions import HedFileError, HedExceptions -from hed.schema.hed_schema_constants import HedSectionKey, HedKey -from hed.schema import schema_validation_util +from hed.schema.hed_schema_constants import HedSectionKey, HedKey, NS_ATTRIB, NO_LOC_ATTRIB from hed.schema.schema_io import xml_constants from .base2schema import SchemaLoader from functools import partial @@ -101,7 +98,7 @@ def _add_tags_recursive(self, new_tags, parent_tags): tag_entry = self._parse_node(tag_element, HedSectionKey.Tags, full_tag) - rooted_entry = schema_validation_util.find_rooted_entry(tag_entry, self._schema, self._loading_merged) + rooted_entry = self.find_rooted_entry(tag_entry, self._schema, self._loading_merged) if rooted_entry: loading_from_chain = rooted_entry.name + "/" + tag_entry.short_tag_name loading_from_chain_short = tag_entry.short_tag_name @@ -146,8 +143,8 @@ def _reformat_xsd_attrib(self, attrib_dict): for attrib_name in attrib_dict: if attrib_name == xml_constants.NO_NAMESPACE_XSD_KEY: xsd_value = attrib_dict[attrib_name] - final_attrib[hed.schema.hed_schema_constants.NS_ATTRIB] = xml_constants.XSI_SOURCE - final_attrib[hed.schema.hed_schema_constants.NO_LOC_ATTRIB] = xsd_value + final_attrib[NS_ATTRIB] = xml_constants.XSI_SOURCE + final_attrib[NO_LOC_ATTRIB] = xsd_value else: final_attrib[attrib_name] = attrib_dict[attrib_name] diff --git a/hed/schema/schema_validation_util.py b/hed/schema/schema_validation_util.py index 753fbb10..fb7a6fee 100644 --- a/hed/schema/schema_validation_util.py +++ b/hed/schema/schema_validation_util.py @@ -3,209 +3,75 @@ from hed.errors import ErrorHandler, SchemaWarnings from hed.schema import hed_schema_constants as constants -from hed.errors.exceptions import HedExceptions, HedFileError -from hed.schema.hed_schema_constants import valid_header_attributes +from hed.schema.hed_schema_constants import character_types from hed.schema import HedSchema, HedSchemaGroup -ALLOWED_TAG_CHARS = "-" -ALLOWED_DESC_CHARS = "-_:;,./()+ ^" - - -def validate_library_name(library_name): - """ Check the validity of the library name. +def validate_schema_tag_new(hed_entry): + """ Check tag entry for capitalization and illegal characters. Parameters: - library_name (str): Name of the library. + hed_entry (HedTagEntry): A single tag entry Returns: - bool or str: If not False, string indicates the issue. - - """ - for i, character in enumerate(library_name): - if not character.isalpha(): - return f"Non alpha character '{character}' at position {i} in '{library_name}'" - if character.isupper(): - return f"Non lowercase character '{character}' at position {i} in '{library_name}'" - - -def validate_version_string(version_string): - """ Check validity of the version. - - Parameters: - version_string (str): A version string. - - Returns: - bool or str: If not False, string indicates the issue. - - """ - try: - Version(version_string) - except ValueError as e: - return str(e) - return False - - -header_attribute_validators = { - constants.VERSION_ATTRIBUTE: (validate_version_string, HedExceptions.SCHEMA_VERSION_INVALID), - constants.LIBRARY_ATTRIBUTE: (validate_library_name, HedExceptions.BAD_HED_LIBRARY_NAME) -} - - -def validate_present_attributes(attrib_dict, name): - """ Validate combinations of attributes - - Parameters: - attrib_dict (dict): Dictionary of attributes to be evaluated. - name (str): File name to use in reporting errors. - - Returns: - list: List of issues. Each issue is a dictionary. - - :raises HedFileError: - - withStandard is found in th header, but a library attribute is not specified - """ - if constants.WITH_STANDARD_ATTRIBUTE in attrib_dict and constants.LIBRARY_ATTRIBUTE not in attrib_dict: - raise HedFileError(HedExceptions.BAD_WITH_STANDARD, - "withStandard header attribute found, but no library attribute is present", - name) - - -def validate_attributes(attrib_dict, name): - """ Validate attributes in the dictionary. - - Parameters: - attrib_dict (dict): Dictionary of attributes to be evaluated. - name (str): name to use in reporting errors. - - Returns: - list: List of issues. Each issue is a dictionary. - - :raises HedFileError: - - Invalid library name - - Version not present - - Invalid combinations of attributes in header - """ - validate_present_attributes(attrib_dict, name) - - for attribute_name, attribute_value in attrib_dict.items(): - if attribute_name in header_attribute_validators: - validator, error_code = header_attribute_validators[attribute_name] - had_error = validator(attribute_value) - if had_error: - raise HedFileError(error_code, had_error, name) - if attribute_name not in valid_header_attributes: - raise HedFileError(HedExceptions.SCHEMA_UNKNOWN_HEADER_ATTRIBUTE, - f"Unknown attribute {attribute_name} found in header line", filename=name) - - if constants.VERSION_ATTRIBUTE not in attrib_dict: - raise HedFileError(HedExceptions.SCHEMA_VERSION_INVALID, - "No version attribute found in header", filename=name) - - -# Might move this to a baseclass version if one is ever made for wiki2schema/xml2schema -def find_rooted_entry(tag_entry, schema, loading_merged): - """ This semi-validates rooted tags, raising an exception on major errors - - Parameters: - tag_entry(HedTagEntry): the possibly rooted tag - schema(HedSchema): The schema being loaded - loading_merged(bool): If this schema was already merged before loading - - Returns: - rooted_tag(HedTagEntry or None): The base tag entry from the standard schema - Returns None if this tag isn't rooted - - :raises HedFileError: - - A rooted attribute is found in a non-paired schema - - A rooted attribute is not a string - - A rooted attribute was found on a non-root node in an unmerged schema. - - A rooted attribute is found on a root node in a merged schema. - - A rooted attribute indicates a tag that doesn't exist in the base schema. + list: A list of all formatting issues found in the term. Each issue is a dictionary. """ - rooted_tag = tag_entry.has_attribute(constants.HedKey.Rooted, return_value=True) - if rooted_tag is not None: - if not schema.with_standard: - raise HedFileError(HedExceptions.ROOTED_TAG_INVALID, - f"Rooted tag attribute found on '{tag_entry.short_tag_name}' in a standard schema.", - schema.name) - - if not isinstance(rooted_tag, str): - raise HedFileError(HedExceptions.ROOTED_TAG_INVALID, - f'Rooted tag \'{tag_entry.short_tag_name}\' is not a string."', - schema.name) - - if tag_entry.parent_name and not loading_merged: - raise HedFileError(HedExceptions.ROOTED_TAG_INVALID, - f'Found rooted tag \'{tag_entry.short_tag_name}\' as a non root node.', - schema.name) - - if not tag_entry.parent_name and loading_merged: - raise HedFileError(HedExceptions.ROOTED_TAG_INVALID, - f'Found rooted tag \'{tag_entry.short_tag_name}\' as a root node in a merged schema.', - schema.name) - - rooted_entry = schema.tags.get(rooted_tag) - if not rooted_entry or rooted_entry.has_attribute(constants.HedKey.InLibrary): - raise HedFileError(HedExceptions.ROOTED_TAG_DOES_NOT_EXIST, - f"Rooted tag '{tag_entry.short_tag_name}' not found in paired standard schema", - schema.name) - - if loading_merged: - return None + issues_list = [] + hed_term = hed_entry.short_tag_name + # Any # terms will have already been validated as the previous entry. + if hed_term == "#": + return issues_list - return rooted_entry + if hed_term and hed_term[0] and not (hed_term[0].isdigit() or hed_term[0].isupper()): + issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CAPITALIZATION, + hed_term, char_index=0, problem_char=hed_term[0]) + issues_list += validate_schema_term_new(hed_entry, hed_term) + return issues_list -def validate_schema_term(hed_term): - """ Check short tag for capitalization and illegal characters. +def validate_schema_term_new(hed_entry, hed_term=None): + """ Check the term for invalid character issues Parameters: - hed_term (str): A single hed term. + hed_entry (HedSchemaEntry): A single schema entry + hed_term (str or None): Use instead of hed_entry.name if present. Returns: list: A list of all formatting issues found in the term. Each issue is a dictionary. - """ + if not hed_term: + hed_term = hed_entry.name issues_list = [] - # Any # terms will have already been validated as the previous entry. - if hed_term == "#": - return issues_list - - for i, char in enumerate(hed_term): - if i == 0 and not (char.isdigit() or char.isupper()): - issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CAPITALIZATION, - hed_term, char_index=i, problem_char=char) - continue - if char in ALLOWED_TAG_CHARS or char.isalnum(): - continue - issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_TAG, - hed_term, char_index=i, problem_char=char) + # todo: potentially optimize this someday, as most values are the same + character_set = get_allowed_characters_by_name(["name"] + hed_entry.attributes.get("allowedCharacter", "").split(",")) + indexes = get_problem_indexes(hed_term, character_set) + for char, index in indexes: + issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_TAG, hed_term, char_index=index, problem_char=char) return issues_list -def validate_schema_description(tag_name, hed_description): - """ Check the description of a single schema term. +def validate_schema_description_new(hed_entry): + """ Check the description of the entry for invalid character issues Parameters: - tag_name (str): A single hed tag - not validated here, just used for error messages. - hed_description (str): The description string to validate. + hed_entry (HedSchemaEntry): A single schema entry Returns: - list: A list of all formatting issues found in the description. - + list: A list of all invalid characters found in description. Each issue is a dictionary. """ + if not hed_entry.description: + return [] issues_list = [] - # Blank description is fine - if not hed_description: - return issues_list - for i, char in enumerate(hed_description): - if char.isalnum(): - continue - if char in ALLOWED_DESC_CHARS: - continue + character_set = get_allowed_characters_by_name(["text", "comma"]) + indexes = get_problem_indexes(hed_entry.description, character_set) + # Kludge, just get short name here if we have it for error reporting + name = hed_entry.name + if hasattr(hed_entry, "short_tag_name"): + name = hed_entry.short_tag_name + for char, index in indexes: + issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_DESC, - hed_description, tag_name, char_index=i, problem_char=char) + hed_entry.description, name, problem_char=char, char_index=index) return issues_list @@ -258,3 +124,67 @@ def schema_version_for_library(hed_schema, library_name): if library_name == "" and hed_schema.with_standard: return hed_schema.with_standard return None + + +def get_allowed_characters(value_classes): + """Returns the allowed characters in a given container of value classes + + Parameters: + value_classes(list of HedSchemaEntry): A list of schema entries that should have the allowedCharacter attribute + + Returns: + character_set(set): The set of all characters from the given classes + """ + # This could be pre-computed + character_set_names = [] + + for value_class in value_classes: + allowed_types = value_class.attributes.get(constants.HedKey.AllowedCharacter, "").split(",") + character_set_names.extend(allowed_types) + + character_set = get_allowed_characters_by_name(character_set_names) + # for now, just always allow these special cases(it's validated extensively elsewhere) + character_set.update("#/") + return character_set + + +def get_allowed_characters_by_name(character_set_names): + """Returns the allowed characters from a list of character set names + + Note: "nonascii" is a special case "character" that can be included as well + + Parameters: + character_set_names(list of str): A list of character sets to allow. See hed_schema_constants.character_types + + Returns: + character_set(set): The set of all characters from the names + """ + character_set = set() + for name in character_set_names: + if name in character_types and name != "nonascii": + character_set.update(character_types[name]) + else: + character_set.add(name) + return character_set + + +def get_problem_indexes(validation_string, character_set, index_adj=0): + """Finds indexes with values not in character set + + Parameters: + validation_string(str): The string to check characters in + character_set(set): the list of valid characters(or the value "nonascii" as a set entry) + index_adj(int): the value to adjust the reported indices by, if this isn't the start of a string. + + Returns: + index_list(tuple of (str, int)): The list of problematic characters and indices + """ + if not character_set: + return [] + + indexes = [(char, index + index_adj) for index, char in enumerate(validation_string) if char not in character_set] + if "nonascii" in character_set: + indexes = [(char, index) for char, index in indexes if not ord(char) > 127] + + return indexes + diff --git a/hed/schema/schema_validation_util_deprecated.py b/hed/schema/schema_validation_util_deprecated.py new file mode 100644 index 00000000..0a0a9ccf --- /dev/null +++ b/hed/schema/schema_validation_util_deprecated.py @@ -0,0 +1,80 @@ +"""Legacy validation for terms and descriptions prior to 8.3.0.""" +from hed.errors import ErrorHandler, SchemaWarnings + + +ALLOWED_TAG_CHARS = "-" +ALLOWED_DESC_CHARS = "-_:;,./()+ ^" + + +def validate_schema_tag(hed_entry): + """ Check short tag for capitalization and illegal characters. + + Parameters: + hed_entry (HedTagEntry): A single hed term. + + Returns: + list: A list of all formatting issues found in the term. Each issue is a dictionary. + + """ + issues_list = [] + hed_term = hed_entry.short_tag_name + # Any # terms will have already been validated as the previous entry. + if hed_term == "#": + return issues_list + + for i, char in enumerate(hed_term): + if i == 0 and not (char.isdigit() or char.isupper()): + issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CAPITALIZATION, + hed_term, char_index=i, problem_char=char) + continue + if char in ALLOWED_TAG_CHARS or char.isalnum(): + continue + issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_TAG, + hed_term, char_index=i, problem_char=char) + return issues_list + + +def validate_schema_description(hed_entry): + """ Check the description of a single schema entry. + + Parameters: + hed_entry (HedSchemaEntry): A single schema entry + + Returns: + list: A list of all formatting issues found in the description. + + """ + issues_list = [] + # Blank description is fine + if not hed_entry.description: + return issues_list + for i, char in enumerate(hed_entry.description): + if char.isalnum(): + continue + if char in ALLOWED_DESC_CHARS: + continue + issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_DESC, + hed_entry.description, hed_entry.name, char_index=i, problem_char=char) + return issues_list + + +def verify_no_brackets(hed_entry): + """ Extremely basic check to block curly braces + + Parameters: + hed_entry (HedSchemaEntry): A single schema entry + + Returns: + list: A list of issues for invalid characters found in the name + """ + hed_term = hed_entry.name + issues_list = [] + indexes = _get_disallowed_character_indexes(hed_term) + for char, index in indexes: + issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_TAG, hed_term, char_index=index, problem_char=char) + return issues_list + + +def _get_disallowed_character_indexes(validation_string, index_adj=0, disallowed_chars="{}"): + indexes = [(char, index + index_adj) for index, char in enumerate(validation_string) if char in disallowed_chars] + return indexes diff --git a/hed/validator/tag_util/class_util.py b/hed/validator/tag_util/class_util.py index 9a7569f6..c870f0eb 100644 --- a/hed/validator/tag_util/class_util.py +++ b/hed/validator/tag_util/class_util.py @@ -1,12 +1,11 @@ """ Utilities to support HED validation. """ import datetime import re -import functools - +from hed.schema.schema_validation_util import get_allowed_characters, get_problem_indexes +from hed.schema.schema_validation_util_deprecated import _get_disallowed_character_indexes from hed.errors.error_reporter import ErrorHandler from hed.errors.error_types import ValidationErrors -from hed.schema.hed_schema_constants import HedKey, character_types class UnitValueValidator: @@ -18,8 +17,6 @@ class UnitValueValidator: DIGIT_OR_POUND_EXPRESSION = r'^(-?[\d.]+(?:e-?\d+)?|#)$' - VALUE_CLASS_ALLOWED_CACHE = 20 - def __init__(self, modern_allowed_char_rules=False, value_validators=None): """ Validates the unit and value classes on a given tag. @@ -64,23 +61,22 @@ def check_tag_unit_class_units_are_valid(self, original_tag, validate_text, repo validation_issues = [] if original_tag.is_unit_class_tag(): stripped_value, unit = original_tag.get_stripped_unit_value(validate_text) - if not unit: - # Todo: in theory this should separately validate the number and the units, for units - # that are prefixes like $. Right now those are marked as unit invalid AND value_invalid. - bad_units = " " in validate_text + # that are prefixes like $. Right now those are marked as unit invalid AND value_invalid. + bad_units = " " in stripped_value - if bad_units: - stripped_value = stripped_value.split(" ")[0] + if bad_units: + stripped_value = stripped_value.split(" ")[0] - validation_issues += self._check_value_class(original_tag, stripped_value, report_as, error_code, - index_offset) + validation_issues += self._check_value_class(original_tag, stripped_value, report_as, error_code, + index_offset) + if not unit: validation_issues += self._check_units(original_tag, bad_units, report_as) - # We don't want to give this overall error twice - if error_code and not any(error_code == issue['code'] for issue in validation_issues): - new_issue = validation_issues[0].copy() - new_issue['code'] = error_code - validation_issues += [new_issue] + # We don't want to give this overall error twice + if error_code and validation_issues and not any(error_code == issue['code'] for issue in validation_issues): + new_issue = validation_issues[0].copy() + new_issue['code'] = error_code + validation_issues += [new_issue] return validation_issues @@ -100,22 +96,8 @@ def check_tag_value_class_valid(self, original_tag, validate_text, report_as=Non """ return self._check_value_class(original_tag, validate_text, report_as, error_code, index_offset) - @functools.lru_cache(maxsize=VALUE_CLASS_ALLOWED_CACHE) - def _get_allowed_characters(self, value_classes): - # This could be pre-computed - character_set = set() - for value_class in value_classes: - allowed_types = value_class.attributes.get(HedKey.AllowedCharacter, "") - for single_type in allowed_types.split(","): - if single_type in character_types and single_type != "nonascii": - character_set.update(character_types[single_type]) - else: - character_set.add(single_type) - # for now, just always allow these special cases(it's validated extensively elsewhere) - character_set.update("#/") - return character_set - - def _get_problem_indexes(self, original_tag, stripped_value): + @staticmethod + def _get_tag_problem_indexes(original_tag, stripped_value, validate_characters): """ Return list of problem indices for error messages. Parameters: @@ -131,18 +113,11 @@ def _get_problem_indexes(self, original_tag, stripped_value): if start_index == -1: return indexes - if self._validate_characters: - allowed_characters = self._get_allowed_characters(original_tag.value_classes.values()) - - if allowed_characters: - # Only test the strippedvalue - otherwise numericClass + unitClass won't validate reasonably. - indexes = [(char, index + start_index) for index, char in enumerate(stripped_value) if char not in allowed_characters] - if "nonascii" in allowed_characters: - # Filter out ascii characters - indexes = [(char, index) for char, index in indexes if not (ord(char) > 127 and char.isprintable())] + if validate_characters: + allowed_characters = get_allowed_characters(original_tag.value_classes.values()) + return get_problem_indexes(stripped_value, allowed_characters, index_adj=start_index) else: - indexes = [(char, index + start_index) for index, char in enumerate(stripped_value) if char in "{}"] - return indexes + return _get_disallowed_character_indexes(stripped_value, start_index) def _check_value_class(self, original_tag, stripped_value, report_as, error_code=None, index_offset=0): """ Return any issues found if this is a value tag, @@ -159,11 +134,10 @@ def _check_value_class(self, original_tag, stripped_value, report_as, error_code """ - # todo: This function needs to check for allowed characters, not just {} validation_issues = [] if original_tag.is_takes_value_tag(): report_as = report_as if report_as else original_tag - problem_indexes = self._get_problem_indexes(original_tag, stripped_value) + problem_indexes = self._get_tag_problem_indexes(original_tag, stripped_value, self._validate_characters) for char, index in problem_indexes: tag_code = ValidationErrors.CURLY_BRACE_UNSUPPORTED_HERE if ( char in "{}") else ValidationErrors.INVALID_TAG_CHARACTER diff --git a/spec_tests/test_errors.py b/spec_tests/test_errors.py index c2a48a58..9ee913b7 100644 --- a/spec_tests/test_errors.py +++ b/spec_tests/test_errors.py @@ -53,8 +53,13 @@ def run_single_test(self, test_file): check_for_warnings = info.get("warning", False) error_handler = ErrorHandler(check_for_warnings) if schema: - schema = load_schema_version(schema) - definitions = info['definitions'] + try: + schema = load_schema_version(schema) + except HedFileError as e: + print(f"Failed to load schema version {schema} for test, failing test {name}") + self.fail_count.append(name) + continue + definitions = info.get('definitions', None) def_dict = DefinitionDict(definitions, schema) self.assertFalse(def_dict.issues) else: diff --git a/tests/schema/test_hed_schema.py b/tests/schema/test_hed_schema.py index d62dcb1f..21fcd098 100644 --- a/tests/schema/test_hed_schema.py +++ b/tests/schema/test_hed_schema.py @@ -83,28 +83,6 @@ def test_tag_attribute(self): self.assertEqual(tag.has_attribute(attribute), expected_value, 'Test string: %s. Attribute: %s.' % (test_string, attribute)) - def test_get_all_tags(self): - terms = self.hed_schema_3g.get_all_schema_tags(True) - self.assertTrue(isinstance(terms, list)) - self.assertTrue(len(terms) > 0) - - def test_get_desc_dict(self): - desc_dict = self.hed_schema_3g.get_desc_iter() - self.assertEqual(len(list(desc_dict)), 1117) - - def test_get_tag_description(self): - # Test known tag - desc = self.hed_schema_3g.get_tag_description("Event/Sensory-event") - self.assertEqual(desc, "Something perceivable by the participant. An event meant to be an experimental" - " stimulus should include the tag Task-property/Task-event-role/Experimental-stimulus.") - # Test known unit modifier - desc = self.hed_schema_3g.get_tag_description("deca", HedSectionKey.UnitModifiers) - self.assertEqual(desc, "SI unit multiple representing 10^1") - - # test unknown tag. - desc = self.hed_schema_3g.get_tag_description("This/Is/Not/A/Real/Tag") - self.assertEqual(desc, None) - def test_get_all_tag_attributes(self): test_string = HedString("Jerk-rate/#", self.hed_schema_3g) tag_props = self.hed_schema_3g.get_all_tag_attributes(test_string) diff --git a/tests/schema/test_schema_validation_util.py b/tests/schema/test_schema_validation_util.py index e9bccbcb..d2f12633 100644 --- a/tests/schema/test_schema_validation_util.py +++ b/tests/schema/test_schema_validation_util.py @@ -3,6 +3,7 @@ import hed.schema.schema_validation_util as util from hed.errors import ErrorHandler, SchemaWarnings from hed import load_schema_version, load_schema, HedSchemaGroup +from hed.schema.hed_schema_entry import HedSchemaEntry, HedTagEntry class Test(unittest.TestCase): @@ -12,12 +13,16 @@ def setUpClass(cls): def validate_term_base(self, input_text, expected_issues): for text, issues in zip(input_text, expected_issues): - test_issues = util.validate_schema_term(text) + entry = HedTagEntry(name=text, section=None) + entry.short_tag_name = text + test_issues = util.validate_schema_tag_new(entry) self.assertCountEqual(issues, test_issues) def validate_desc_base(self, input_descriptions, expected_issues): for description, issues in zip(input_descriptions, expected_issues): - test_issues = util.validate_schema_description("dummy", description) + entry = HedSchemaEntry(name="dummy", section=None) + entry.description = description + test_issues = util.validate_schema_description_new(entry) self.assertCountEqual(issues, test_issues) def test_validate_schema_term(self): @@ -36,7 +41,9 @@ def test_validate_schema_term(self): ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_TAG, test_terms[3], char_index=11, problem_char="#"), ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CAPITALIZATION, test_terms[4], char_index=0, - problem_char="@"), + problem_char="@") + + ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_TAG, test_terms[4], char_index=0, + problem_char="@"), ] self.validate_term_base(test_terms, expected_issues) @@ -45,20 +52,20 @@ def test_validate_schema_description(self): "This is a tag description with no invalid characters.", "This is (also) a tag description with no invalid characters. -_:;./()+ ^", "This description has no invalid characters, as commas are allowed", - "This description has multiple invalid characters at the end @$%*" + "This description has multiple invalid characters at the end {}[]" ] expected_issues = [ [], [], [], ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_DESC, test_descs[3], "dummy", - char_index=60, problem_char="@") + char_index=60, problem_char="{") + ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_DESC, test_descs[3], "dummy", - char_index=61, problem_char="$") + char_index=61, problem_char="}") + ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_DESC, test_descs[3], "dummy", - char_index=62, problem_char="%") + char_index=62, problem_char="[") + ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_DESC, test_descs[3], "dummy", - char_index=63, problem_char="*") + char_index=63, problem_char="]") ] self.validate_desc_base(test_descs, expected_issues) @@ -70,7 +77,8 @@ def test_schema_version_greater_equal(self): schema2 = load_schema_version("v:8.2.0") self.assertFalse(util.schema_version_greater_equal(schema2, "8.3.0")) - schema_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/schema_tests/schema_utf8.mediawiki') + schema_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), + '../data/schema_tests/schema_utf8.mediawiki') schema3 = load_schema(schema_path, schema_namespace="tl:") self.assertTrue(util.schema_version_greater_equal(schema3, "8.3.0")) @@ -95,4 +103,4 @@ def test_schema_version_for_library(self): self.assertEqual(util.schema_version_for_library(schema3, "score"), "1.1.0") self.assertEqual(util.schema_version_for_library(schema3, "testlib"), "2.0.0") - self.assertEqual(util.schema_version_for_library(schema3, "badlib"), None) \ No newline at end of file + self.assertEqual(util.schema_version_for_library(schema3, "badlib"), None) diff --git a/tests/schema/test_schema_validation_util_deprecated.py b/tests/schema/test_schema_validation_util_deprecated.py new file mode 100644 index 00000000..5da596b3 --- /dev/null +++ b/tests/schema/test_schema_validation_util_deprecated.py @@ -0,0 +1,69 @@ +import os +import unittest +import hed.schema.schema_validation_util_deprecated as util +from hed.schema.hed_schema_entry import HedSchemaEntry, HedTagEntry +from hed.errors import ErrorHandler, SchemaWarnings +from hed import load_schema_version + + +class Test(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.hed_schema = load_schema_version("8.1.0") + + def validate_term_base(self, input_text, expected_issues): + for text, issues in zip(input_text, expected_issues): + entry = HedTagEntry(name=text, section=None) + entry.short_tag_name = text + test_issues = util.validate_schema_tag(entry) + self.assertCountEqual(issues, test_issues) + + def validate_desc_base(self, input_descriptions, expected_issues): + for description, issues in zip(input_descriptions, expected_issues): + entry = HedSchemaEntry(name="dummy", section=None) + entry.description = description + test_issues = util.validate_schema_description(entry) + self.assertCountEqual(issues, test_issues) + + def test_validate_schema_term(self): + test_terms = [ + "invalidcaps", + "Validcaps", + "3numberisvalid", + "Invalidchar#", + "@invalidcharatstart", + ] + expected_issues = [ + ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CAPITALIZATION, test_terms[0], char_index=0, + problem_char="i"), + [], + [], + ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_TAG, test_terms[3], char_index=11, + problem_char="#"), + ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CAPITALIZATION, test_terms[4], char_index=0, + problem_char="@"), + ] + self.validate_term_base(test_terms, expected_issues) + + def test_validate_schema_description(self): + test_descs = [ + "This is a tag description with no invalid characters.", + "This is (also) a tag description with no invalid characters. -_:;./()+ ^", + "This description has no invalid characters, as commas are allowed", + "This description has multiple invalid characters at the end @$%*" + ] + expected_issues = [ + [], + [], + [], + ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_DESC, test_descs[3], "dummy", + char_index=60, problem_char="@") + + ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_DESC, test_descs[3], "dummy", + char_index=61, problem_char="$") + + ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_DESC, test_descs[3], "dummy", + char_index=62, problem_char="%") + + ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_DESC, test_descs[3], "dummy", + char_index=63, problem_char="*") + + ] + self.validate_desc_base(test_descs, expected_issues) \ No newline at end of file