From 8604bef245f5ff12c4a9acea522be82feb316396 Mon Sep 17 00:00:00 2001 From: IanCa Date: Fri, 29 Mar 2024 15:37:42 -0500 Subject: [PATCH 1/2] Improve schema character validation to match the new spec/utf8 support reorganize some schema validation/loading code --- hed/errors/error_types.py | 1 + hed/errors/schema_error_messages.py | 7 + hed/schema/hed_schema.py | 69 ----- hed/schema/hed_schema_constants.py | 50 +++- hed/schema/hed_schema_io.py | 2 +- hed/schema/schema_compliance.py | 113 ++++--- hed/schema/schema_header_util.py | 97 ++++++ hed/schema/schema_io/base2schema.py | 60 +++- hed/schema/schema_io/wiki2schema.py | 3 +- hed/schema/schema_io/xml2schema.py | 11 +- hed/schema/schema_validation_util.py | 280 +++++++----------- .../schema_validation_util_deprecated.py | 80 +++++ hed/validator/tag_util/class_util.py | 68 ++--- spec_tests/test_errors.py | 9 +- tests/schema/test_hed_schema.py | 22 -- tests/schema/test_schema_validation_util.py | 28 +- .../test_schema_validation_util_deprecated.py | 69 +++++ 17 files changed, 584 insertions(+), 385 deletions(-) create mode 100644 hed/schema/schema_header_util.py create mode 100644 hed/schema/schema_validation_util_deprecated.py create mode 100644 tests/schema/test_schema_validation_util_deprecated.py diff --git a/hed/errors/error_types.py b/hed/errors/error_types.py index 1fa221bf..c7b279ce 100644 --- a/hed/errors/error_types.py +++ b/hed/errors/error_types.py @@ -123,6 +123,7 @@ class SchemaWarnings: SCHEMA_CHARACTER_INVALID = "SCHEMA_CHARACTER_INVALID" SCHEMA_INVALID_CAPITALIZATION = 'invalidCaps' SCHEMA_NON_PLACEHOLDER_HAS_CLASS = 'SCHEMA_NON_PLACEHOLDER_HAS_CLASS' + SCHEMA_PROLOGUE_CHARACTER_INVALID = "SCHEMA_PROLOGUE_CHARACTER_INVALID" class SchemaAttributeErrors: diff --git a/hed/errors/schema_error_messages.py b/hed/errors/schema_error_messages.py index f2a7e4f4..6a794059 100644 --- a/hed/errors/schema_error_messages.py +++ b/hed/errors/schema_error_messages.py @@ -23,6 +23,13 @@ def schema_error_unknown_attribute(attribute_name, source_tag): f"or was used outside of it's defined class." +@hed_error(SchemaWarnings.SCHEMA_PROLOGUE_CHARACTER_INVALID, default_severity=ErrorSeverity.WARNING, + actual_code=SchemaWarnings.SCHEMA_CHARACTER_INVALID) +def schema_error_invalid_character_prologue(char_index, source_string, section_name): + invalid_char = source_string[char_index] + return f"'{section_name}' has invalid character '{invalid_char}' at position {char_index} of string: {source_string}" + + @hed_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_DESC, default_severity=ErrorSeverity.WARNING, actual_code=SchemaWarnings.SCHEMA_CHARACTER_INVALID) def schema_warning_invalid_chars_desc(desc_string, tag_name, problem_char, char_index): diff --git a/hed/schema/hed_schema.py b/hed/schema/hed_schema.py index 19732d21..34164204 100644 --- a/hed/schema/hed_schema.py +++ b/hed/schema/hed_schema.py @@ -635,75 +635,6 @@ def _initialize_attributes(self, key_class): # =============================================== # Getters used to write out schema primarily. # =============================================== - def get_desc_iter(self): - """ Return an iterator over all the descriptions. - - Yields: - tuple: - - str: The tag node name. - - str: The description associated with the node. - - """ - for section in self._sections.values(): - for tag_entry in section.values(): - if tag_entry.description: - yield tag_entry.name, tag_entry.description - - def get_tag_description(self, tag_name, key_class=HedSectionKey.Tags): - """ Return the description associated with the tag. - - Parameters: - tag_name (str): A hed tag name(or unit/unit modifier etc) with proper capitalization. - key_class (str): A string indicating type of description (e.g. All tags, Units, Unit modifier). - The default is HedSectionKey.Tags. - - Returns: - str: A description of the specified tag. - - """ - tag_entry = self._get_tag_entry(tag_name, key_class) - if tag_entry: - return tag_entry.description - - def get_all_schema_tags(self, return_last_term=False): - """ Get a list of all hed terms from the schema. - - Returns: - list: A list of all terms(short tags) from the schema. - - Notes: - Compatible with Hed2 or Hed3. - - """ - final_list = [] - for lower_tag, tag_entry in self.tags.items(): - if return_last_term: - final_list.append(tag_entry.name.split('/')[-1]) - else: - final_list.append(tag_entry.name) - - return final_list - - def get_unknown_attributes(self): - """ Retrieve the current list of unknown attributes. - - Returns: - dict: The keys are attribute names and the values are lists of tags with this attribute. - - Notes: - - This includes attributes found in the wrong section for example unitClass attribute found on a Tag. - - The return tag list is in long form. - - """ - unknown_attributes = {} - for section in self._sections.values(): - for entry in section.values(): - if entry._unknown_attributes: - for attribute_name in entry._unknown_attributes: - unknown_attributes.setdefault(attribute_name, []).append(entry.name) - - return unknown_attributes - def get_tag_attribute_names(self): """ Return a dict of all allowed tag attributes. diff --git a/hed/schema/hed_schema_constants.py b/hed/schema/hed_schema_constants.py index ad22e374..8067fa9e 100644 --- a/hed/schema/hed_schema_constants.py +++ b/hed/schema/hed_schema_constants.py @@ -89,9 +89,51 @@ class HedKey: } character_types = { - "letters": set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"), - "blank": set(" "), + "ascii": set([chr(x) for x in range(0, 127)]), + "nonascii": "nonascii", # Special case for all other printable unicode characters + "printable": set([chr(x) for x in range(32, 127)]), + "lowercase": set("abcdefghijklmnopqrstuvwxyz"), + "uppercase": set("ABCDEFGHIJKLMNOPQRSTUVWXYZ"), "digits": set("0123456789"), - "alphanumeric": set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"), - "nonascii": "nonascii" # Special case for all other printable unicode characters + "tab": set("\t"), + "newline": set("\n"), + "blank": set(" "), + "exclamation": set("!"), + "double-quote": set('"'), + "number-sign": set("#"), + "dollar": set("$"), + "percent-sign": set("%"), + "ampersand": set("&"), + "single-quote": set("'"), + "left-paren": set("("), + "right-paren": set(")"), + "asterisk": set("*"), + "plus": set("+"), + "comma": set(","), + "hyphen": set("-"), + "period": set("."), + "slash": set("/"), + "colon": set(":"), + "semicolon": set(";"), + "less-than": set("<"), + "equals": set("="), + "greater-than": set(">"), + "question-mark": set("?"), + "at-sign": set("@"), + "backslash": set("\\"), + "caret": set("^"), + "underscore": set("_"), + "vertical-bar": set("|"), + "tilde": set("~"), } + +banned_delimiters = set(",[]{}") + +# Compound types +character_types["letters"] = character_types["lowercase"] | character_types["uppercase"] +character_types["alphanumeric"] = character_types["letters"] | character_types["digits"] +character_types["text"] = character_types["printable"].copy() +character_types["text"].add("nonascii") +character_types["text"] -= banned_delimiters +character_types["name"] = character_types["alphanumeric"] | character_types["hyphen"] | character_types["period"] | character_types["underscore"] +character_types["name"].add("nonascii") diff --git a/hed/schema/hed_schema_io.py b/hed/schema/hed_schema_io.py index fe26aa11..7137bf02 100644 --- a/hed/schema/hed_schema_io.py +++ b/hed/schema/hed_schema_io.py @@ -11,7 +11,7 @@ from hed.errors.exceptions import HedFileError, HedExceptions from hed.schema.schema_io import schema_util from hed.schema.hed_schema_group import HedSchemaGroup -from hed.schema.schema_validation_util import validate_version_string +from hed.schema.schema_header_util import validate_version_string from collections import defaultdict # from hed.schema.schema_io.owl_constants import ext_to_format from urllib.error import URLError diff --git a/hed/schema/schema_compliance.py b/hed/schema/schema_compliance.py index 4835d994..4549b1f4 100644 --- a/hed/schema/schema_compliance.py +++ b/hed/schema/schema_compliance.py @@ -1,10 +1,12 @@ """ Utilities for HED schema checking. """ from hed.errors.error_types import ErrorContext, SchemaErrors, ErrorSeverity, SchemaAttributeErrors, SchemaWarnings -from hed.errors.error_reporter import ErrorHandler -from hed.schema.hed_schema import HedSchema, HedKey +from hed.errors.error_reporter import ErrorHandler, sort_issues +from hed.schema.hed_schema import HedSchema, HedKey, HedSectionKey from hed.schema import schema_attribute_validators -from hed.schema.schema_validation_util import validate_schema_term, validate_schema_description, schema_version_greater_equal +from hed.schema.schema_validation_util import validate_schema_tag_new, validate_schema_term_new, \ + schema_version_greater_equal, get_allowed_characters_by_name, get_problem_indexes, validate_schema_description_new +from hed.schema.schema_validation_util_deprecated import validate_schema_tag, validate_schema_description, verify_no_brackets def check_compliance(hed_schema, check_for_warnings=True, name=None, error_handler=None): @@ -26,19 +28,20 @@ def check_compliance(hed_schema, check_for_warnings=True, name=None, error_handl raise ValueError("To check compliance of a HedGroupSchema, call self.check_compliance on the schema itself.") error_handler = error_handler if error_handler else ErrorHandler(check_for_warnings) - validator = SchemaValidator(hed_schema, check_for_warnings, error_handler) + validator = SchemaValidator(hed_schema, error_handler) issues_list = [] if not name: name = hed_schema.filename error_handler.push_error_context(ErrorContext.FILE_NAME, name) - issues_list += validator.check_unknown_attributes() + issues_list += validator.check_prologue_epilogue() + issues_list += validator.check_invalid_chars() issues_list += validator.check_attributes() issues_list += validator.check_duplicate_names() - issues_list += validator.check_invalid_chars() - error_handler.pop_error_context() + + issues_list = sort_issues(issues_list) return issues_list @@ -61,34 +64,45 @@ class SchemaValidator: HedKey.InLibrary: [schema_attribute_validators.in_library_check] } # Known attribute validators - def __init__(self, hed_schema, check_for_warnings=True, error_handler=None): + def __init__(self, hed_schema, error_handler): self.hed_schema = hed_schema - self._check_for_warnings = check_for_warnings self.error_handler = error_handler - - def check_unknown_attributes(self): - """Returns issues for any unknown attributes in any section""" - unknown_attributes = self.hed_schema.get_unknown_attributes() - issues_list = [] - if unknown_attributes: - for attribute_name, source_tags in unknown_attributes.items(): - for tag in source_tags: - issues_list += self.error_handler.format_error_with_context(SchemaAttributeErrors.SCHEMA_ATTRIBUTE_INVALID, - attribute_name, - source_tag=tag) - return issues_list + self._new_character_validation = schema_version_greater_equal(self.hed_schema, "8.3.0") + + def check_prologue_epilogue(self): + issues = [] + if self._new_character_validation: + character_set = get_allowed_characters_by_name(["text", "newline"]) + indexes = get_problem_indexes(self.hed_schema.prologue, character_set) + for _, index in indexes: + issues += ErrorHandler.format_error(SchemaWarnings.SCHEMA_PROLOGUE_CHARACTER_INVALID, char_index=index, + source_string=self.hed_schema.prologue, + section_name="Prologue") + indexes = get_problem_indexes(self.hed_schema.epilogue, character_set) + for _, index in indexes: + issues += ErrorHandler.format_error(SchemaWarnings.SCHEMA_PROLOGUE_CHARACTER_INVALID, char_index=index, + source_string=self.hed_schema.epilogue, + section_name="Epilogue") + self.error_handler.add_context_and_filter(issues) + return issues def check_attributes(self): """Returns issues from validating known attributes in all sections""" issues_list = [] - for section_key in self.hed_schema._sections: - self.error_handler.push_error_context(ErrorContext.SCHEMA_SECTION, section_key) + for section_key in HedSectionKey: + self.error_handler.push_error_context(ErrorContext.SCHEMA_SECTION, str(section_key)) for tag_entry in self.hed_schema[section_key].values(): self.error_handler.push_error_context(ErrorContext.SCHEMA_TAG, tag_entry.name) + if tag_entry._unknown_attributes: + for attribute_name in tag_entry._unknown_attributes: + issues_list += self.error_handler.format_error_with_context( + SchemaAttributeErrors.SCHEMA_ATTRIBUTE_INVALID, + attribute_name, + source_tag=tag_entry.name) for attribute_name in tag_entry.attributes: # Always check deprecated validators = self.attribute_validators.get(attribute_name, []) \ - + [schema_attribute_validators.attribute_is_deprecated] + + [schema_attribute_validators.attribute_is_deprecated] for validator in validators: self.error_handler.push_error_context(ErrorContext.SCHEMA_ATTRIBUTE, attribute_name) new_issues = validator(self.hed_schema, tag_entry, attribute_name) @@ -104,37 +118,50 @@ def check_attributes(self): def check_duplicate_names(self): """Return issues for any duplicate names in all sections.""" issues_list = [] - for section_key in self.hed_schema._sections: + for section_key in HedSectionKey: for name, duplicate_entries in self.hed_schema[section_key].duplicate_names.items(): values = set(entry.has_attribute(HedKey.InLibrary) for entry in duplicate_entries) error_code = SchemaErrors.SCHEMA_DUPLICATE_NODE if len(values) == 2: error_code = SchemaErrors.SCHEMA_DUPLICATE_FROM_LIBRARY issues_list += self.error_handler.format_error_with_context(error_code, name, - duplicate_tag_list=[entry.name for entry in - duplicate_entries], + duplicate_tag_list=[entry.name for entry in duplicate_entries], section=section_key) return issues_list def check_invalid_chars(self): """Returns issues for bad chars in terms or descriptions.""" issues_list = [] - if self._check_for_warnings: - hed_terms = self.hed_schema.get_all_schema_tags(True) - for hed_term in hed_terms: - issues_list += validate_schema_term(hed_term) - - for tag_name, desc in self.hed_schema.get_desc_iter(): - issues_list += validate_schema_description(tag_name, desc) - - if schema_version_greater_equal(self.hed_schema, "8.3.0"): - for unit_name, unit in self.hed_schema.units.items(): - # Don't check for spaces on deprecated units, to avoid degree Celsius issue - if unit.has_attribute(HedKey.DeprecatedFrom): + section_validators = { + HedSectionKey.Tags: validate_schema_tag, + } + default_validator = verify_no_brackets + description_validator = validate_schema_description + + # If above 8.3.0 use the character class validation instead + if self._new_character_validation: + section_validators = { + HedSectionKey.Tags: validate_schema_tag_new + } + default_validator = validate_schema_term_new + description_validator = validate_schema_description_new + + for section_key in HedSectionKey: + self.error_handler.push_error_context(ErrorContext.SCHEMA_SECTION, str(section_key)) + for entry in self.hed_schema[section_key].values(): + if entry.has_attribute(HedKey.DeprecatedFrom): # Don't validate deprecated terms and descriptions continue - for i, char in enumerate(unit_name): - if char == " ": - issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_TAG, - unit_name, char_index=i, problem_char=char) + self.error_handler.push_error_context(ErrorContext.SCHEMA_TAG, str(entry)) + # Everything but tags just does the generic term check + validator = section_validators.get(section_key, default_validator) + new_issues = [] + if validator: + new_issues += validator(entry) + new_issues += description_validator(entry) + self.error_handler.add_context_and_filter(new_issues) + issues_list += new_issues + self.error_handler.pop_error_context() # Term + self.error_handler.pop_error_context() # section + return issues_list diff --git a/hed/schema/schema_header_util.py b/hed/schema/schema_header_util.py new file mode 100644 index 00000000..8902faa2 --- /dev/null +++ b/hed/schema/schema_header_util.py @@ -0,0 +1,97 @@ + +from semantic_version import Version + +from hed.schema import hed_schema_constants as constants +from hed.errors.exceptions import HedExceptions, HedFileError +from hed.schema.hed_schema_constants import valid_header_attributes + + +def validate_library_name(library_name): + """ Check the validity of the library name. + + Parameters: + library_name (str): Name of the library. + + Returns: + bool or str: If not False, string indicates the issue. + + """ + for i, character in enumerate(library_name): + if not character.isalpha(): + return f"Non alpha character '{character}' at position {i} in '{library_name}'" + if character.isupper(): + return f"Non lowercase character '{character}' at position {i} in '{library_name}'" + + +def validate_version_string(version_string): + """ Check validity of the version. + + Parameters: + version_string (str): A version string. + + Returns: + bool or str: If not False, string indicates the issue. + + """ + try: + Version(version_string) + except ValueError as e: + return str(e) + return False + + +header_attribute_validators = { + constants.VERSION_ATTRIBUTE: (validate_version_string, HedExceptions.SCHEMA_VERSION_INVALID), + constants.LIBRARY_ATTRIBUTE: (validate_library_name, HedExceptions.BAD_HED_LIBRARY_NAME) +} + + +def validate_present_attributes(attrib_dict, name): + """ Validate combinations of attributes + + Parameters: + attrib_dict (dict): Dictionary of attributes to be evaluated. + name (str): File name to use in reporting errors. + + Returns: + list: List of issues. Each issue is a dictionary. + + :raises HedFileError: + - withStandard is found in th header, but a library attribute is not specified + """ + if constants.WITH_STANDARD_ATTRIBUTE in attrib_dict and constants.LIBRARY_ATTRIBUTE not in attrib_dict: + raise HedFileError(HedExceptions.BAD_WITH_STANDARD, + "withStandard header attribute found, but no library attribute is present", + name) + + +def validate_attributes(attrib_dict, name): + """ Validate attributes in the dictionary. + + Parameters: + attrib_dict (dict): Dictionary of attributes to be evaluated. + name (str): name to use in reporting errors. + + Returns: + list: List of issues. Each issue is a dictionary. + + :raises HedFileError: + - Invalid library name + - Version not present + - Invalid combinations of attributes in header + """ + validate_present_attributes(attrib_dict, name) + + for attribute_name, attribute_value in attrib_dict.items(): + if attribute_name in header_attribute_validators: + validator, error_code = header_attribute_validators[attribute_name] + had_error = validator(attribute_value) + if had_error: + raise HedFileError(error_code, had_error, name) + if attribute_name not in valid_header_attributes: + raise HedFileError(HedExceptions.SCHEMA_UNKNOWN_HEADER_ATTRIBUTE, + f"Unknown attribute {attribute_name} found in header line", filename=name) + + if constants.VERSION_ATTRIBUTE not in attrib_dict: + raise HedFileError(HedExceptions.SCHEMA_VERSION_INVALID, + "No version attribute found in header", filename=name) diff --git a/hed/schema/schema_io/base2schema.py b/hed/schema/schema_io/base2schema.py index 75847446..bf6a5e04 100644 --- a/hed/schema/schema_io/base2schema.py +++ b/hed/schema/schema_io/base2schema.py @@ -1,9 +1,10 @@ import copy + from hed.errors.exceptions import HedFileError, HedExceptions -from hed.schema import HedSchema +from hed.schema import HedSchema, hed_schema_constants as constants from hed.schema.hed_schema_constants import HedKey from abc import abstractmethod, ABC -from hed.schema import schema_validation_util +from hed.schema import schema_header_util from hed.schema import hed_schema_constants @@ -44,7 +45,7 @@ def __init__(self, filename, schema_as_string=None, schema=None, file_format=Non # self._schema.filename = filename hed_attributes = self._get_header_attributes(self.input_data) - schema_validation_util.validate_attributes(hed_attributes, name=self.name) + schema_header_util.validate_attributes(hed_attributes, name=self.name) withStandard = hed_attributes.get(hed_schema_constants.WITH_STANDARD_ATTRIBUTE, "") self.library = hed_attributes.get(hed_schema_constants.LIBRARY_ATTRIBUTE, "") @@ -149,3 +150,56 @@ def _add_to_dict_base(self, entry, key_class): entry._set_attribute_value(HedKey.InLibrary, self.library) return self._schema._add_tag_to_dict(entry.name, entry, key_class) + + @staticmethod + def find_rooted_entry(tag_entry, schema, loading_merged): + """ This semi-validates rooted tags, raising an exception on major errors + + Parameters: + tag_entry(HedTagEntry): the possibly rooted tag + schema(HedSchema): The schema being loaded + loading_merged(bool): If this schema was already merged before loading + + Returns: + rooted_tag(HedTagEntry or None): The base tag entry from the standard schema + Returns None if this tag isn't rooted + + :raises HedFileError: + - A rooted attribute is found in a non-paired schema + - A rooted attribute is not a string + - A rooted attribute was found on a non-root node in an unmerged schema. + - A rooted attribute is found on a root node in a merged schema. + - A rooted attribute indicates a tag that doesn't exist in the base schema. + """ + rooted_tag = tag_entry.has_attribute(constants.HedKey.Rooted, return_value=True) + if rooted_tag is not None: + if not schema.with_standard: + raise HedFileError(HedExceptions.ROOTED_TAG_INVALID, + f"Rooted tag attribute found on '{tag_entry.short_tag_name}' in a standard schema.", + schema.name) + + if not isinstance(rooted_tag, str): + raise HedFileError(HedExceptions.ROOTED_TAG_INVALID, + f'Rooted tag \'{tag_entry.short_tag_name}\' is not a string."', + schema.name) + + if tag_entry.parent_name and not loading_merged: + raise HedFileError(HedExceptions.ROOTED_TAG_INVALID, + f'Found rooted tag \'{tag_entry.short_tag_name}\' as a non root node.', + schema.name) + + if not tag_entry.parent_name and loading_merged: + raise HedFileError(HedExceptions.ROOTED_TAG_INVALID, + f'Found rooted tag \'{tag_entry.short_tag_name}\' as a root node in a merged schema.', + schema.name) + + rooted_entry = schema.tags.get(rooted_tag) + if not rooted_entry or rooted_entry.has_attribute(constants.HedKey.InLibrary): + raise HedFileError(HedExceptions.ROOTED_TAG_DOES_NOT_EXIST, + f"Rooted tag '{tag_entry.short_tag_name}' not found in paired standard schema", + schema.name) + + if loading_merged: + return None + + return rooted_entry diff --git a/hed/schema/schema_io/wiki2schema.py b/hed/schema/schema_io/wiki2schema.py index 4e34ae1c..838572f3 100644 --- a/hed/schema/schema_io/wiki2schema.py +++ b/hed/schema/schema_io/wiki2schema.py @@ -6,7 +6,6 @@ from hed.schema.hed_schema_constants import HedSectionKey, HedKey from hed.errors.exceptions import HedFileError, HedExceptions from hed.errors import ErrorContext, error_reporter -from hed.schema import schema_validation_util from hed.schema.schema_io import wiki_constants from .base2schema import SchemaLoader from .wiki_constants import HedWikiSection, SectionStarts, SectionNames @@ -172,7 +171,7 @@ def _read_schema(self, lines): continue try: - rooted_entry = schema_validation_util.find_rooted_entry(tag_entry, self._schema, self._loading_merged) + rooted_entry = self.find_rooted_entry(tag_entry, self._schema, self._loading_merged) if rooted_entry: parent_tags = rooted_entry.long_tag_name.split("/") level_adj = len(parent_tags) diff --git a/hed/schema/schema_io/xml2schema.py b/hed/schema/schema_io/xml2schema.py index b92a4a49..c6d2a4c5 100644 --- a/hed/schema/schema_io/xml2schema.py +++ b/hed/schema/schema_io/xml2schema.py @@ -5,11 +5,8 @@ from defusedxml import ElementTree import xml - -import hed.schema.hed_schema_constants from hed.errors.exceptions import HedFileError, HedExceptions -from hed.schema.hed_schema_constants import HedSectionKey, HedKey -from hed.schema import schema_validation_util +from hed.schema.hed_schema_constants import HedSectionKey, HedKey, NS_ATTRIB, NO_LOC_ATTRIB from hed.schema.schema_io import xml_constants from .base2schema import SchemaLoader from functools import partial @@ -101,7 +98,7 @@ def _add_tags_recursive(self, new_tags, parent_tags): tag_entry = self._parse_node(tag_element, HedSectionKey.Tags, full_tag) - rooted_entry = schema_validation_util.find_rooted_entry(tag_entry, self._schema, self._loading_merged) + rooted_entry = self.find_rooted_entry(tag_entry, self._schema, self._loading_merged) if rooted_entry: loading_from_chain = rooted_entry.name + "/" + tag_entry.short_tag_name loading_from_chain_short = tag_entry.short_tag_name @@ -146,8 +143,8 @@ def _reformat_xsd_attrib(self, attrib_dict): for attrib_name in attrib_dict: if attrib_name == xml_constants.NO_NAMESPACE_XSD_KEY: xsd_value = attrib_dict[attrib_name] - final_attrib[hed.schema.hed_schema_constants.NS_ATTRIB] = xml_constants.XSI_SOURCE - final_attrib[hed.schema.hed_schema_constants.NO_LOC_ATTRIB] = xsd_value + final_attrib[NS_ATTRIB] = xml_constants.XSI_SOURCE + final_attrib[NO_LOC_ATTRIB] = xsd_value else: final_attrib[attrib_name] = attrib_dict[attrib_name] diff --git a/hed/schema/schema_validation_util.py b/hed/schema/schema_validation_util.py index 753fbb10..fb7a6fee 100644 --- a/hed/schema/schema_validation_util.py +++ b/hed/schema/schema_validation_util.py @@ -3,209 +3,75 @@ from hed.errors import ErrorHandler, SchemaWarnings from hed.schema import hed_schema_constants as constants -from hed.errors.exceptions import HedExceptions, HedFileError -from hed.schema.hed_schema_constants import valid_header_attributes +from hed.schema.hed_schema_constants import character_types from hed.schema import HedSchema, HedSchemaGroup -ALLOWED_TAG_CHARS = "-" -ALLOWED_DESC_CHARS = "-_:;,./()+ ^" - - -def validate_library_name(library_name): - """ Check the validity of the library name. +def validate_schema_tag_new(hed_entry): + """ Check tag entry for capitalization and illegal characters. Parameters: - library_name (str): Name of the library. + hed_entry (HedTagEntry): A single tag entry Returns: - bool or str: If not False, string indicates the issue. - - """ - for i, character in enumerate(library_name): - if not character.isalpha(): - return f"Non alpha character '{character}' at position {i} in '{library_name}'" - if character.isupper(): - return f"Non lowercase character '{character}' at position {i} in '{library_name}'" - - -def validate_version_string(version_string): - """ Check validity of the version. - - Parameters: - version_string (str): A version string. - - Returns: - bool or str: If not False, string indicates the issue. - - """ - try: - Version(version_string) - except ValueError as e: - return str(e) - return False - - -header_attribute_validators = { - constants.VERSION_ATTRIBUTE: (validate_version_string, HedExceptions.SCHEMA_VERSION_INVALID), - constants.LIBRARY_ATTRIBUTE: (validate_library_name, HedExceptions.BAD_HED_LIBRARY_NAME) -} - - -def validate_present_attributes(attrib_dict, name): - """ Validate combinations of attributes - - Parameters: - attrib_dict (dict): Dictionary of attributes to be evaluated. - name (str): File name to use in reporting errors. - - Returns: - list: List of issues. Each issue is a dictionary. - - :raises HedFileError: - - withStandard is found in th header, but a library attribute is not specified - """ - if constants.WITH_STANDARD_ATTRIBUTE in attrib_dict and constants.LIBRARY_ATTRIBUTE not in attrib_dict: - raise HedFileError(HedExceptions.BAD_WITH_STANDARD, - "withStandard header attribute found, but no library attribute is present", - name) - - -def validate_attributes(attrib_dict, name): - """ Validate attributes in the dictionary. - - Parameters: - attrib_dict (dict): Dictionary of attributes to be evaluated. - name (str): name to use in reporting errors. - - Returns: - list: List of issues. Each issue is a dictionary. - - :raises HedFileError: - - Invalid library name - - Version not present - - Invalid combinations of attributes in header - """ - validate_present_attributes(attrib_dict, name) - - for attribute_name, attribute_value in attrib_dict.items(): - if attribute_name in header_attribute_validators: - validator, error_code = header_attribute_validators[attribute_name] - had_error = validator(attribute_value) - if had_error: - raise HedFileError(error_code, had_error, name) - if attribute_name not in valid_header_attributes: - raise HedFileError(HedExceptions.SCHEMA_UNKNOWN_HEADER_ATTRIBUTE, - f"Unknown attribute {attribute_name} found in header line", filename=name) - - if constants.VERSION_ATTRIBUTE not in attrib_dict: - raise HedFileError(HedExceptions.SCHEMA_VERSION_INVALID, - "No version attribute found in header", filename=name) - - -# Might move this to a baseclass version if one is ever made for wiki2schema/xml2schema -def find_rooted_entry(tag_entry, schema, loading_merged): - """ This semi-validates rooted tags, raising an exception on major errors - - Parameters: - tag_entry(HedTagEntry): the possibly rooted tag - schema(HedSchema): The schema being loaded - loading_merged(bool): If this schema was already merged before loading - - Returns: - rooted_tag(HedTagEntry or None): The base tag entry from the standard schema - Returns None if this tag isn't rooted - - :raises HedFileError: - - A rooted attribute is found in a non-paired schema - - A rooted attribute is not a string - - A rooted attribute was found on a non-root node in an unmerged schema. - - A rooted attribute is found on a root node in a merged schema. - - A rooted attribute indicates a tag that doesn't exist in the base schema. + list: A list of all formatting issues found in the term. Each issue is a dictionary. """ - rooted_tag = tag_entry.has_attribute(constants.HedKey.Rooted, return_value=True) - if rooted_tag is not None: - if not schema.with_standard: - raise HedFileError(HedExceptions.ROOTED_TAG_INVALID, - f"Rooted tag attribute found on '{tag_entry.short_tag_name}' in a standard schema.", - schema.name) - - if not isinstance(rooted_tag, str): - raise HedFileError(HedExceptions.ROOTED_TAG_INVALID, - f'Rooted tag \'{tag_entry.short_tag_name}\' is not a string."', - schema.name) - - if tag_entry.parent_name and not loading_merged: - raise HedFileError(HedExceptions.ROOTED_TAG_INVALID, - f'Found rooted tag \'{tag_entry.short_tag_name}\' as a non root node.', - schema.name) - - if not tag_entry.parent_name and loading_merged: - raise HedFileError(HedExceptions.ROOTED_TAG_INVALID, - f'Found rooted tag \'{tag_entry.short_tag_name}\' as a root node in a merged schema.', - schema.name) - - rooted_entry = schema.tags.get(rooted_tag) - if not rooted_entry or rooted_entry.has_attribute(constants.HedKey.InLibrary): - raise HedFileError(HedExceptions.ROOTED_TAG_DOES_NOT_EXIST, - f"Rooted tag '{tag_entry.short_tag_name}' not found in paired standard schema", - schema.name) - - if loading_merged: - return None + issues_list = [] + hed_term = hed_entry.short_tag_name + # Any # terms will have already been validated as the previous entry. + if hed_term == "#": + return issues_list - return rooted_entry + if hed_term and hed_term[0] and not (hed_term[0].isdigit() or hed_term[0].isupper()): + issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CAPITALIZATION, + hed_term, char_index=0, problem_char=hed_term[0]) + issues_list += validate_schema_term_new(hed_entry, hed_term) + return issues_list -def validate_schema_term(hed_term): - """ Check short tag for capitalization and illegal characters. +def validate_schema_term_new(hed_entry, hed_term=None): + """ Check the term for invalid character issues Parameters: - hed_term (str): A single hed term. + hed_entry (HedSchemaEntry): A single schema entry + hed_term (str or None): Use instead of hed_entry.name if present. Returns: list: A list of all formatting issues found in the term. Each issue is a dictionary. - """ + if not hed_term: + hed_term = hed_entry.name issues_list = [] - # Any # terms will have already been validated as the previous entry. - if hed_term == "#": - return issues_list - - for i, char in enumerate(hed_term): - if i == 0 and not (char.isdigit() or char.isupper()): - issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CAPITALIZATION, - hed_term, char_index=i, problem_char=char) - continue - if char in ALLOWED_TAG_CHARS or char.isalnum(): - continue - issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_TAG, - hed_term, char_index=i, problem_char=char) + # todo: potentially optimize this someday, as most values are the same + character_set = get_allowed_characters_by_name(["name"] + hed_entry.attributes.get("allowedCharacter", "").split(",")) + indexes = get_problem_indexes(hed_term, character_set) + for char, index in indexes: + issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_TAG, hed_term, char_index=index, problem_char=char) return issues_list -def validate_schema_description(tag_name, hed_description): - """ Check the description of a single schema term. +def validate_schema_description_new(hed_entry): + """ Check the description of the entry for invalid character issues Parameters: - tag_name (str): A single hed tag - not validated here, just used for error messages. - hed_description (str): The description string to validate. + hed_entry (HedSchemaEntry): A single schema entry Returns: - list: A list of all formatting issues found in the description. - + list: A list of all invalid characters found in description. Each issue is a dictionary. """ + if not hed_entry.description: + return [] issues_list = [] - # Blank description is fine - if not hed_description: - return issues_list - for i, char in enumerate(hed_description): - if char.isalnum(): - continue - if char in ALLOWED_DESC_CHARS: - continue + character_set = get_allowed_characters_by_name(["text", "comma"]) + indexes = get_problem_indexes(hed_entry.description, character_set) + # Kludge, just get short name here if we have it for error reporting + name = hed_entry.name + if hasattr(hed_entry, "short_tag_name"): + name = hed_entry.short_tag_name + for char, index in indexes: + issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_DESC, - hed_description, tag_name, char_index=i, problem_char=char) + hed_entry.description, name, problem_char=char, char_index=index) return issues_list @@ -258,3 +124,67 @@ def schema_version_for_library(hed_schema, library_name): if library_name == "" and hed_schema.with_standard: return hed_schema.with_standard return None + + +def get_allowed_characters(value_classes): + """Returns the allowed characters in a given container of value classes + + Parameters: + value_classes(list of HedSchemaEntry): A list of schema entries that should have the allowedCharacter attribute + + Returns: + character_set(set): The set of all characters from the given classes + """ + # This could be pre-computed + character_set_names = [] + + for value_class in value_classes: + allowed_types = value_class.attributes.get(constants.HedKey.AllowedCharacter, "").split(",") + character_set_names.extend(allowed_types) + + character_set = get_allowed_characters_by_name(character_set_names) + # for now, just always allow these special cases(it's validated extensively elsewhere) + character_set.update("#/") + return character_set + + +def get_allowed_characters_by_name(character_set_names): + """Returns the allowed characters from a list of character set names + + Note: "nonascii" is a special case "character" that can be included as well + + Parameters: + character_set_names(list of str): A list of character sets to allow. See hed_schema_constants.character_types + + Returns: + character_set(set): The set of all characters from the names + """ + character_set = set() + for name in character_set_names: + if name in character_types and name != "nonascii": + character_set.update(character_types[name]) + else: + character_set.add(name) + return character_set + + +def get_problem_indexes(validation_string, character_set, index_adj=0): + """Finds indexes with values not in character set + + Parameters: + validation_string(str): The string to check characters in + character_set(set): the list of valid characters(or the value "nonascii" as a set entry) + index_adj(int): the value to adjust the reported indices by, if this isn't the start of a string. + + Returns: + index_list(tuple of (str, int)): The list of problematic characters and indices + """ + if not character_set: + return [] + + indexes = [(char, index + index_adj) for index, char in enumerate(validation_string) if char not in character_set] + if "nonascii" in character_set: + indexes = [(char, index) for char, index in indexes if not ord(char) > 127] + + return indexes + diff --git a/hed/schema/schema_validation_util_deprecated.py b/hed/schema/schema_validation_util_deprecated.py new file mode 100644 index 00000000..0a0a9ccf --- /dev/null +++ b/hed/schema/schema_validation_util_deprecated.py @@ -0,0 +1,80 @@ +"""Legacy validation for terms and descriptions prior to 8.3.0.""" +from hed.errors import ErrorHandler, SchemaWarnings + + +ALLOWED_TAG_CHARS = "-" +ALLOWED_DESC_CHARS = "-_:;,./()+ ^" + + +def validate_schema_tag(hed_entry): + """ Check short tag for capitalization and illegal characters. + + Parameters: + hed_entry (HedTagEntry): A single hed term. + + Returns: + list: A list of all formatting issues found in the term. Each issue is a dictionary. + + """ + issues_list = [] + hed_term = hed_entry.short_tag_name + # Any # terms will have already been validated as the previous entry. + if hed_term == "#": + return issues_list + + for i, char in enumerate(hed_term): + if i == 0 and not (char.isdigit() or char.isupper()): + issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CAPITALIZATION, + hed_term, char_index=i, problem_char=char) + continue + if char in ALLOWED_TAG_CHARS or char.isalnum(): + continue + issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_TAG, + hed_term, char_index=i, problem_char=char) + return issues_list + + +def validate_schema_description(hed_entry): + """ Check the description of a single schema entry. + + Parameters: + hed_entry (HedSchemaEntry): A single schema entry + + Returns: + list: A list of all formatting issues found in the description. + + """ + issues_list = [] + # Blank description is fine + if not hed_entry.description: + return issues_list + for i, char in enumerate(hed_entry.description): + if char.isalnum(): + continue + if char in ALLOWED_DESC_CHARS: + continue + issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_DESC, + hed_entry.description, hed_entry.name, char_index=i, problem_char=char) + return issues_list + + +def verify_no_brackets(hed_entry): + """ Extremely basic check to block curly braces + + Parameters: + hed_entry (HedSchemaEntry): A single schema entry + + Returns: + list: A list of issues for invalid characters found in the name + """ + hed_term = hed_entry.name + issues_list = [] + indexes = _get_disallowed_character_indexes(hed_term) + for char, index in indexes: + issues_list += ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_TAG, hed_term, char_index=index, problem_char=char) + return issues_list + + +def _get_disallowed_character_indexes(validation_string, index_adj=0, disallowed_chars="{}"): + indexes = [(char, index + index_adj) for index, char in enumerate(validation_string) if char in disallowed_chars] + return indexes diff --git a/hed/validator/tag_util/class_util.py b/hed/validator/tag_util/class_util.py index 9a7569f6..c870f0eb 100644 --- a/hed/validator/tag_util/class_util.py +++ b/hed/validator/tag_util/class_util.py @@ -1,12 +1,11 @@ """ Utilities to support HED validation. """ import datetime import re -import functools - +from hed.schema.schema_validation_util import get_allowed_characters, get_problem_indexes +from hed.schema.schema_validation_util_deprecated import _get_disallowed_character_indexes from hed.errors.error_reporter import ErrorHandler from hed.errors.error_types import ValidationErrors -from hed.schema.hed_schema_constants import HedKey, character_types class UnitValueValidator: @@ -18,8 +17,6 @@ class UnitValueValidator: DIGIT_OR_POUND_EXPRESSION = r'^(-?[\d.]+(?:e-?\d+)?|#)$' - VALUE_CLASS_ALLOWED_CACHE = 20 - def __init__(self, modern_allowed_char_rules=False, value_validators=None): """ Validates the unit and value classes on a given tag. @@ -64,23 +61,22 @@ def check_tag_unit_class_units_are_valid(self, original_tag, validate_text, repo validation_issues = [] if original_tag.is_unit_class_tag(): stripped_value, unit = original_tag.get_stripped_unit_value(validate_text) - if not unit: - # Todo: in theory this should separately validate the number and the units, for units - # that are prefixes like $. Right now those are marked as unit invalid AND value_invalid. - bad_units = " " in validate_text + # that are prefixes like $. Right now those are marked as unit invalid AND value_invalid. + bad_units = " " in stripped_value - if bad_units: - stripped_value = stripped_value.split(" ")[0] + if bad_units: + stripped_value = stripped_value.split(" ")[0] - validation_issues += self._check_value_class(original_tag, stripped_value, report_as, error_code, - index_offset) + validation_issues += self._check_value_class(original_tag, stripped_value, report_as, error_code, + index_offset) + if not unit: validation_issues += self._check_units(original_tag, bad_units, report_as) - # We don't want to give this overall error twice - if error_code and not any(error_code == issue['code'] for issue in validation_issues): - new_issue = validation_issues[0].copy() - new_issue['code'] = error_code - validation_issues += [new_issue] + # We don't want to give this overall error twice + if error_code and validation_issues and not any(error_code == issue['code'] for issue in validation_issues): + new_issue = validation_issues[0].copy() + new_issue['code'] = error_code + validation_issues += [new_issue] return validation_issues @@ -100,22 +96,8 @@ def check_tag_value_class_valid(self, original_tag, validate_text, report_as=Non """ return self._check_value_class(original_tag, validate_text, report_as, error_code, index_offset) - @functools.lru_cache(maxsize=VALUE_CLASS_ALLOWED_CACHE) - def _get_allowed_characters(self, value_classes): - # This could be pre-computed - character_set = set() - for value_class in value_classes: - allowed_types = value_class.attributes.get(HedKey.AllowedCharacter, "") - for single_type in allowed_types.split(","): - if single_type in character_types and single_type != "nonascii": - character_set.update(character_types[single_type]) - else: - character_set.add(single_type) - # for now, just always allow these special cases(it's validated extensively elsewhere) - character_set.update("#/") - return character_set - - def _get_problem_indexes(self, original_tag, stripped_value): + @staticmethod + def _get_tag_problem_indexes(original_tag, stripped_value, validate_characters): """ Return list of problem indices for error messages. Parameters: @@ -131,18 +113,11 @@ def _get_problem_indexes(self, original_tag, stripped_value): if start_index == -1: return indexes - if self._validate_characters: - allowed_characters = self._get_allowed_characters(original_tag.value_classes.values()) - - if allowed_characters: - # Only test the strippedvalue - otherwise numericClass + unitClass won't validate reasonably. - indexes = [(char, index + start_index) for index, char in enumerate(stripped_value) if char not in allowed_characters] - if "nonascii" in allowed_characters: - # Filter out ascii characters - indexes = [(char, index) for char, index in indexes if not (ord(char) > 127 and char.isprintable())] + if validate_characters: + allowed_characters = get_allowed_characters(original_tag.value_classes.values()) + return get_problem_indexes(stripped_value, allowed_characters, index_adj=start_index) else: - indexes = [(char, index + start_index) for index, char in enumerate(stripped_value) if char in "{}"] - return indexes + return _get_disallowed_character_indexes(stripped_value, start_index) def _check_value_class(self, original_tag, stripped_value, report_as, error_code=None, index_offset=0): """ Return any issues found if this is a value tag, @@ -159,11 +134,10 @@ def _check_value_class(self, original_tag, stripped_value, report_as, error_code """ - # todo: This function needs to check for allowed characters, not just {} validation_issues = [] if original_tag.is_takes_value_tag(): report_as = report_as if report_as else original_tag - problem_indexes = self._get_problem_indexes(original_tag, stripped_value) + problem_indexes = self._get_tag_problem_indexes(original_tag, stripped_value, self._validate_characters) for char, index in problem_indexes: tag_code = ValidationErrors.CURLY_BRACE_UNSUPPORTED_HERE if ( char in "{}") else ValidationErrors.INVALID_TAG_CHARACTER diff --git a/spec_tests/test_errors.py b/spec_tests/test_errors.py index c2a48a58..9ee913b7 100644 --- a/spec_tests/test_errors.py +++ b/spec_tests/test_errors.py @@ -53,8 +53,13 @@ def run_single_test(self, test_file): check_for_warnings = info.get("warning", False) error_handler = ErrorHandler(check_for_warnings) if schema: - schema = load_schema_version(schema) - definitions = info['definitions'] + try: + schema = load_schema_version(schema) + except HedFileError as e: + print(f"Failed to load schema version {schema} for test, failing test {name}") + self.fail_count.append(name) + continue + definitions = info.get('definitions', None) def_dict = DefinitionDict(definitions, schema) self.assertFalse(def_dict.issues) else: diff --git a/tests/schema/test_hed_schema.py b/tests/schema/test_hed_schema.py index d62dcb1f..21fcd098 100644 --- a/tests/schema/test_hed_schema.py +++ b/tests/schema/test_hed_schema.py @@ -83,28 +83,6 @@ def test_tag_attribute(self): self.assertEqual(tag.has_attribute(attribute), expected_value, 'Test string: %s. Attribute: %s.' % (test_string, attribute)) - def test_get_all_tags(self): - terms = self.hed_schema_3g.get_all_schema_tags(True) - self.assertTrue(isinstance(terms, list)) - self.assertTrue(len(terms) > 0) - - def test_get_desc_dict(self): - desc_dict = self.hed_schema_3g.get_desc_iter() - self.assertEqual(len(list(desc_dict)), 1117) - - def test_get_tag_description(self): - # Test known tag - desc = self.hed_schema_3g.get_tag_description("Event/Sensory-event") - self.assertEqual(desc, "Something perceivable by the participant. An event meant to be an experimental" - " stimulus should include the tag Task-property/Task-event-role/Experimental-stimulus.") - # Test known unit modifier - desc = self.hed_schema_3g.get_tag_description("deca", HedSectionKey.UnitModifiers) - self.assertEqual(desc, "SI unit multiple representing 10^1") - - # test unknown tag. - desc = self.hed_schema_3g.get_tag_description("This/Is/Not/A/Real/Tag") - self.assertEqual(desc, None) - def test_get_all_tag_attributes(self): test_string = HedString("Jerk-rate/#", self.hed_schema_3g) tag_props = self.hed_schema_3g.get_all_tag_attributes(test_string) diff --git a/tests/schema/test_schema_validation_util.py b/tests/schema/test_schema_validation_util.py index e9bccbcb..d2f12633 100644 --- a/tests/schema/test_schema_validation_util.py +++ b/tests/schema/test_schema_validation_util.py @@ -3,6 +3,7 @@ import hed.schema.schema_validation_util as util from hed.errors import ErrorHandler, SchemaWarnings from hed import load_schema_version, load_schema, HedSchemaGroup +from hed.schema.hed_schema_entry import HedSchemaEntry, HedTagEntry class Test(unittest.TestCase): @@ -12,12 +13,16 @@ def setUpClass(cls): def validate_term_base(self, input_text, expected_issues): for text, issues in zip(input_text, expected_issues): - test_issues = util.validate_schema_term(text) + entry = HedTagEntry(name=text, section=None) + entry.short_tag_name = text + test_issues = util.validate_schema_tag_new(entry) self.assertCountEqual(issues, test_issues) def validate_desc_base(self, input_descriptions, expected_issues): for description, issues in zip(input_descriptions, expected_issues): - test_issues = util.validate_schema_description("dummy", description) + entry = HedSchemaEntry(name="dummy", section=None) + entry.description = description + test_issues = util.validate_schema_description_new(entry) self.assertCountEqual(issues, test_issues) def test_validate_schema_term(self): @@ -36,7 +41,9 @@ def test_validate_schema_term(self): ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_TAG, test_terms[3], char_index=11, problem_char="#"), ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CAPITALIZATION, test_terms[4], char_index=0, - problem_char="@"), + problem_char="@") + + ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_TAG, test_terms[4], char_index=0, + problem_char="@"), ] self.validate_term_base(test_terms, expected_issues) @@ -45,20 +52,20 @@ def test_validate_schema_description(self): "This is a tag description with no invalid characters.", "This is (also) a tag description with no invalid characters. -_:;./()+ ^", "This description has no invalid characters, as commas are allowed", - "This description has multiple invalid characters at the end @$%*" + "This description has multiple invalid characters at the end {}[]" ] expected_issues = [ [], [], [], ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_DESC, test_descs[3], "dummy", - char_index=60, problem_char="@") + char_index=60, problem_char="{") + ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_DESC, test_descs[3], "dummy", - char_index=61, problem_char="$") + char_index=61, problem_char="}") + ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_DESC, test_descs[3], "dummy", - char_index=62, problem_char="%") + char_index=62, problem_char="[") + ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_DESC, test_descs[3], "dummy", - char_index=63, problem_char="*") + char_index=63, problem_char="]") ] self.validate_desc_base(test_descs, expected_issues) @@ -70,7 +77,8 @@ def test_schema_version_greater_equal(self): schema2 = load_schema_version("v:8.2.0") self.assertFalse(util.schema_version_greater_equal(schema2, "8.3.0")) - schema_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/schema_tests/schema_utf8.mediawiki') + schema_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), + '../data/schema_tests/schema_utf8.mediawiki') schema3 = load_schema(schema_path, schema_namespace="tl:") self.assertTrue(util.schema_version_greater_equal(schema3, "8.3.0")) @@ -95,4 +103,4 @@ def test_schema_version_for_library(self): self.assertEqual(util.schema_version_for_library(schema3, "score"), "1.1.0") self.assertEqual(util.schema_version_for_library(schema3, "testlib"), "2.0.0") - self.assertEqual(util.schema_version_for_library(schema3, "badlib"), None) \ No newline at end of file + self.assertEqual(util.schema_version_for_library(schema3, "badlib"), None) diff --git a/tests/schema/test_schema_validation_util_deprecated.py b/tests/schema/test_schema_validation_util_deprecated.py new file mode 100644 index 00000000..5da596b3 --- /dev/null +++ b/tests/schema/test_schema_validation_util_deprecated.py @@ -0,0 +1,69 @@ +import os +import unittest +import hed.schema.schema_validation_util_deprecated as util +from hed.schema.hed_schema_entry import HedSchemaEntry, HedTagEntry +from hed.errors import ErrorHandler, SchemaWarnings +from hed import load_schema_version + + +class Test(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.hed_schema = load_schema_version("8.1.0") + + def validate_term_base(self, input_text, expected_issues): + for text, issues in zip(input_text, expected_issues): + entry = HedTagEntry(name=text, section=None) + entry.short_tag_name = text + test_issues = util.validate_schema_tag(entry) + self.assertCountEqual(issues, test_issues) + + def validate_desc_base(self, input_descriptions, expected_issues): + for description, issues in zip(input_descriptions, expected_issues): + entry = HedSchemaEntry(name="dummy", section=None) + entry.description = description + test_issues = util.validate_schema_description(entry) + self.assertCountEqual(issues, test_issues) + + def test_validate_schema_term(self): + test_terms = [ + "invalidcaps", + "Validcaps", + "3numberisvalid", + "Invalidchar#", + "@invalidcharatstart", + ] + expected_issues = [ + ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CAPITALIZATION, test_terms[0], char_index=0, + problem_char="i"), + [], + [], + ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_TAG, test_terms[3], char_index=11, + problem_char="#"), + ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CAPITALIZATION, test_terms[4], char_index=0, + problem_char="@"), + ] + self.validate_term_base(test_terms, expected_issues) + + def test_validate_schema_description(self): + test_descs = [ + "This is a tag description with no invalid characters.", + "This is (also) a tag description with no invalid characters. -_:;./()+ ^", + "This description has no invalid characters, as commas are allowed", + "This description has multiple invalid characters at the end @$%*" + ] + expected_issues = [ + [], + [], + [], + ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_DESC, test_descs[3], "dummy", + char_index=60, problem_char="@") + + ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_DESC, test_descs[3], "dummy", + char_index=61, problem_char="$") + + ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_DESC, test_descs[3], "dummy", + char_index=62, problem_char="%") + + ErrorHandler.format_error(SchemaWarnings.SCHEMA_INVALID_CHARACTERS_IN_DESC, test_descs[3], "dummy", + char_index=63, problem_char="*") + + ] + self.validate_desc_base(test_descs, expected_issues) \ No newline at end of file From f198b6bdd9e324007153b1839160977e5f6a08f5 Mon Sep 17 00:00:00 2001 From: IanCa Date: Fri, 29 Mar 2024 19:56:33 -0500 Subject: [PATCH 2/2] Switch to .casefold in most places Rewrite extract_tags --- hed/models/def_expand_gather.py | 18 +-- hed/models/definition_dict.py | 12 +- hed/models/df_util.py | 14 +- hed/models/hed_group.py | 16 ++- hed/models/hed_string.py | 4 +- hed/models/hed_tag.py | 12 +- hed/models/query_handler.py | 2 +- hed/models/string_util.py | 4 +- hed/schema/hed_schema.py | 6 +- hed/schema/hed_schema_entry.py | 4 +- hed/schema/hed_schema_section.py | 16 +-- hed/tools/analysis/annotation_util.py | 135 ++++-------------- hed/tools/analysis/event_manager.py | 4 +- hed/tools/analysis/hed_tag_counts.py | 4 +- hed/tools/analysis/hed_type.py | 13 +- hed/tools/analysis/hed_type_counts.py | 2 +- hed/tools/analysis/hed_type_defs.py | 18 +-- hed/tools/analysis/hed_type_factors.py | 6 +- hed/tools/analysis/hed_type_manager.py | 12 +- .../operations/factor_hed_type_op.py | 2 +- .../operations/summarize_hed_tags_op.py | 2 +- .../operations/summarize_hed_type_op.py | 2 +- hed/validator/def_validator.py | 6 +- hed/validator/onset_validator.py | 10 +- hed/validator/sidecar_validator.py | 4 +- hed/validator/tag_util/group_util.py | 4 +- tests/data/schema_tests/schema_utf8.mediawiki | 1 + .../schema_tests/schema_utf8_dupe.mediawiki | 29 ++++ tests/schema/test_hed_schema_io.py | 11 ++ tests/tools/analysis/test_annotation_util.py | 54 +------ 30 files changed, 174 insertions(+), 253 deletions(-) create mode 100644 tests/data/schema_tests/schema_utf8_dupe.mediawiki diff --git a/hed/models/def_expand_gather.py b/hed/models/def_expand_gather.py index b8271512..e4950ddc 100644 --- a/hed/models/def_expand_gather.py +++ b/hed/models/def_expand_gather.py @@ -155,20 +155,20 @@ def _handle_known_definition(self, def_tag, def_expand_group, def_group): if def_group_contents: if def_group_contents != def_expand_group: - self.errors.setdefault(def_tag_name.lower(), []).append(def_expand_group.get_first_group()) + self.errors.setdefault(def_tag_name.casefold(), []).append(def_expand_group.get_first_group()) return True has_extension = "/" in def_tag.extension if not has_extension: group_tag = def_expand_group.get_first_group() - self.def_dict.defs[def_tag_name.lower()] = DefinitionEntry(name=def_tag_name, contents=group_tag, + self.def_dict.defs[def_tag_name.casefold()] = DefinitionEntry(name=def_tag_name, contents=group_tag, takes_value=False, source_context=[]) return True # this is needed for the cases where we have a definition with errors, but it's not a known definition. - if def_tag_name.lower() in self.errors: - self.errors.setdefault(f"{def_tag_name.lower()}", []).append(def_expand_group.get_first_group()) + if def_tag_name.casefold() in self.errors: + self.errors.setdefault(f"{def_tag_name.casefold()}", []).append(def_expand_group.get_first_group()) return True return False @@ -181,20 +181,20 @@ def _handle_ambiguous_definition(self, def_tag, def_expand_group): def_expand_group (HedGroup): The group containing the def-expand tag. """ def_tag_name = def_tag.extension.split('/')[0] - these_defs = self.ambiguous_defs.setdefault(def_tag_name.lower(), AmbiguousDef()) + these_defs = self.ambiguous_defs.setdefault(def_tag_name.casefold(), AmbiguousDef()) these_defs.add_def(def_tag, def_expand_group) try: if these_defs.validate(): new_contents = these_defs.get_group() - self.def_dict.defs[def_tag_name.lower()] = DefinitionEntry(name=def_tag_name, contents=new_contents, + self.def_dict.defs[def_tag_name.casefold()] = DefinitionEntry(name=def_tag_name, contents=new_contents, takes_value=True, source_context=[]) - del self.ambiguous_defs[def_tag_name.lower()] + del self.ambiguous_defs[def_tag_name.casefold()] except ValueError: for ambiguous_def in these_defs.placeholder_defs: - self.errors.setdefault(def_tag_name.lower(), []).append(ambiguous_def) - del self.ambiguous_defs[def_tag_name.lower()] + self.errors.setdefault(def_tag_name.casefold(), []).append(ambiguous_def) + del self.ambiguous_defs[def_tag_name.casefold()] return diff --git a/hed/models/definition_dict.py b/hed/models/definition_dict.py index 86b4147f..f033c8d6 100644 --- a/hed/models/definition_dict.py +++ b/hed/models/definition_dict.py @@ -84,7 +84,7 @@ def get(self, def_name): Returns: DefinitionEntry: Definition entry for the requested definition. """ - return self.defs.get(def_name.lower()) + return self.defs.get(def_name.casefold()) def __iter__(self): return iter(self.defs) @@ -144,14 +144,14 @@ def check_for_definitions(self, hed_string_obj, error_handler=None): def_issues += new_def_issues continue - self.defs[def_tag_name.lower()] = DefinitionEntry(name=def_tag_name, contents=group_tag, + self.defs[def_tag_name.casefold()] = DefinitionEntry(name=def_tag_name, contents=group_tag, takes_value=def_takes_value, source_context=context) return def_issues def _strip_value_placeholder(self, def_tag_name): - def_takes_value = def_tag_name.lower().endswith("/#") + def_takes_value = def_tag_name.endswith("/#") if def_takes_value: def_tag_name = def_tag_name[:-len("/#")] return def_tag_name, def_takes_value @@ -162,7 +162,7 @@ def _validate_name_and_context(self, def_tag_name, error_handler): else: context = [] new_def_issues = [] - if def_tag_name.lower() in self.defs: + if def_tag_name.casefold() in self.defs: new_def_issues += ErrorHandler.format_error_with_context(error_handler, DefinitionErrors.DUPLICATE_DEFINITION, def_name=def_tag_name) @@ -263,7 +263,7 @@ def get_definition_entry(self, def_tag): """ tag_label, _, placeholder = def_tag.extension.partition('/') - label_tag_lower = tag_label.lower() + label_tag_lower = tag_label.casefold() def_entry = self.defs.get(label_tag_lower) return def_entry @@ -281,7 +281,7 @@ def _get_definition_contents(self, def_tag): """ tag_label, _, placeholder = def_tag.extension.partition('/') - label_tag_lower = tag_label.lower() + label_tag_lower = tag_label.casefold() def_entry = self.defs.get(label_tag_lower) if def_entry is None: # Could raise an error here? diff --git a/hed/models/df_util.py b/hed/models/df_util.py index f3686a94..daef2fb2 100644 --- a/hed/models/df_util.py +++ b/hed/models/df_util.py @@ -123,22 +123,20 @@ def sort_dataframe_by_onsets(df): return df -def replace_ref(text, newvalue, column_ref): +def replace_ref(text, oldvalue, newvalue="n/a"): """ Replace column ref in x with y. If it's n/a, delete extra commas/parentheses. Parameters: text (str): The input string containing the ref enclosed in curly braces. + oldvalue (str): The full tag or ref to replace newvalue (str): The replacement value for the ref. - column_ref (str): The ref to be replaced, without curly braces. Returns: str: The modified string with the ref replaced or removed. """ - # Note: This function could easily be updated to handle non-curly brace values, but it seemed faster this way - # If it's not n/a, we can just replace directly. if newvalue != "n/a": - return text.replace(f"{{{column_ref}}}", newvalue) + return text.replace(oldvalue, newvalue) def _remover(match): p1 = match.group("p1").count("(") @@ -162,7 +160,7 @@ def _remover(match): # c1/c2 contain the comma(and possibly spaces) separating this ref from other tags # p1/p2 contain the parentheses directly surrounding the tag # All four groups can have spaces. - pattern = r'(?P[\s,]*)(?P[(\s]*)\{' + column_ref + r'\}(?P[\s)]*)(?P[\s,]*)' + pattern = r'(?P[\s,]*)(?P[(\s]*)' + oldvalue + r'(?P[\s)]*)(?P[\s,]*)' return re.sub(pattern, _remover, text) @@ -192,7 +190,7 @@ def _handle_curly_braces_refs(df, refs, column_names): # column_name_brackets = f"{{{replacing_name}}}" # df[column_name] = pd.Series(x.replace(column_name_brackets, y) for x, y # in zip(df[column_name], saved_columns[replacing_name])) - new_df[column_name] = pd.Series(replace_ref(x, y, replacing_name) for x, y + new_df[column_name] = pd.Series(replace_ref(x, f"{{{replacing_name}}}", y) for x, y in zip(new_df[column_name], saved_columns[replacing_name])) new_df = new_df[remaining_columns] @@ -220,7 +218,7 @@ def split_delay_tags(series, hed_schema, onsets): return split_df = pd.DataFrame({"onset": onsets, "HED": series, "original_index": series.index}) delay_strings = [(i, HedString(hed_string, hed_schema)) for (i, hed_string) in series.items() if - "delay/" in hed_string.lower()] + "delay/" in hed_string.casefold()] delay_groups = [] for i, delay_string in delay_strings: duration_tags = delay_string.find_top_level_tags({DefTagNames.DELAY_KEY}) diff --git a/hed/models/hed_group.py b/hed/models/hed_group.py index 842f6369..f3890f44 100644 --- a/hed/models/hed_group.py +++ b/hed/models/hed_group.py @@ -353,6 +353,10 @@ def lower(self): """ Convenience function, equivalent to str(self).lower(). """ return str(self).lower() + def casefold(self): + """ Convenience function, equivalent to str(self).casefold(). """ + return str(self).casefold() + def get_as_indented(self, tag_attribute="short_tag"): """Return the string as a multiline indented format. @@ -442,9 +446,9 @@ def find_tags(self, search_tags, recursive=False, include_groups=2): tags = self.get_all_tags() else: tags = self.tags() - search_tags = {tag.lower() for tag in search_tags} + search_tags = {tag.casefold() for tag in search_tags} for tag in tags: - if tag.short_base_tag.lower() in search_tags: + if tag.short_base_tag.casefold() in search_tags: found_tags.append((tag, tag._parent)) if include_groups == 0 or include_groups == 1: @@ -454,7 +458,7 @@ def find_tags(self, search_tags, recursive=False, include_groups=2): def find_wildcard_tags(self, search_tags, recursive=False, include_groups=2): """ Find the tags and their containing groups. - This searches tag.short_tag.lower(), with an implicit wildcard on the end. + This searches tag.short_tag.casefold(), with an implicit wildcard on the end. e.g. "Eve" will find Event, but not Sensory-event. @@ -475,11 +479,11 @@ def find_wildcard_tags(self, search_tags, recursive=False, include_groups=2): else: tags = self.tags() - search_tags = {search_tag.lower() for search_tag in search_tags} + search_tags = {search_tag.casefold() for search_tag in search_tags} for tag in tags: for search_tag in search_tags: - if tag.short_tag.lower().startswith(search_tag): + if tag.short_tag.casefold().startswith(search_tag): found_tags.append((tag, tag._parent)) # We can't find the same tag twice break @@ -575,7 +579,7 @@ def find_tags_with_term(self, term, recursive=False, include_groups=2): else: tags = self.tags() - search_for = term.lower() + search_for = term.casefold() for tag in tags: if search_for in tag.tag_terms: found_tags.append((tag, tag._parent)) diff --git a/hed/models/hed_string.py b/hed/models/hed_string.py index 9af387c3..32a443f0 100644 --- a/hed/models/hed_string.py +++ b/hed/models/hed_string.py @@ -353,11 +353,11 @@ def find_top_level_tags(self, anchor_tags, include_groups=2): Returns: list: The returned result depends on include_groups. """ - anchor_tags = {tag.lower() for tag in anchor_tags} + anchor_tags = {tag.casefold() for tag in anchor_tags} top_level_tags = [] for group in self.groups(): for tag in group.tags(): - if tag.short_base_tag.lower() in anchor_tags: + if tag.short_base_tag.casefold() in anchor_tags: top_level_tags.append((tag, group)) # Only capture a max of 1 per group. These are implicitly unique. break diff --git a/hed/models/hed_tag.py b/hed/models/hed_tag.py index 5e2281ae..647f6463 100644 --- a/hed/models/hed_tag.py +++ b/hed/models/hed_tag.py @@ -309,6 +309,10 @@ def lower(self): """ Convenience function, equivalent to str(self).lower(). """ return str(self).lower() + def casefold(self): + """ Convenience function, equivalent to str(self).casefold(). """ + return str(self).casefold() + def _calculate_to_canonical_forms(self, hed_schema): """ Update internal state based on schema. @@ -617,16 +621,16 @@ def replace_placeholder(self, placeholder_value): def __hash__(self): if self._schema_entry: return hash( - self._namespace + self._schema_entry.short_tag_name.lower() + self._extension_value.lower()) + self._namespace + self._schema_entry.short_tag_name.casefold() + self._extension_value.casefold()) else: - return hash(self.lower()) + return hash(self.casefold()) def __eq__(self, other): if self is other: return True if isinstance(other, str): - return self.lower() == other.lower() + return self.casefold() == other.casefold() if not isinstance(other, HedTag): return False @@ -634,7 +638,7 @@ def __eq__(self, other): if self.short_tag == other.short_tag: return True - if self.org_tag.lower() == other.org_tag.lower(): + if self.org_tag.casefold() == other.org_tag.casefold(): return True return False diff --git a/hed/models/query_handler.py b/hed/models/query_handler.py index 8aaf04a3..0cc404b9 100644 --- a/hed/models/query_handler.py +++ b/hed/models/query_handler.py @@ -44,7 +44,7 @@ def __init__(self, expression_string): """ self.tokens = [] self.at_token = -1 - self.tree = self._parse(expression_string.lower()) + self.tree = self._parse(expression_string.casefold()) self._org_string = expression_string def search(self, hed_string_obj): diff --git a/hed/models/string_util.py b/hed/models/string_util.py index 2804ac12..ea28a86d 100644 --- a/hed/models/string_util.py +++ b/hed/models/string_util.py @@ -38,7 +38,7 @@ def split_base_tags(hed_string, base_tags, remove_group=False): - The second HedString object contains the tags from hed_string that match the base_tags. """ - base_tags = [tag.lower() for tag in base_tags] + base_tags = [tag.casefold() for tag in base_tags] include_groups = 0 if remove_group: include_groups = 2 @@ -70,7 +70,7 @@ def split_def_tags(hed_string, def_names, remove_group=False): include_groups = 0 if remove_group: include_groups = 2 - wildcard_tags = [f"def/{def_name}".lower() for def_name in def_names] + wildcard_tags = [f"def/{def_name}".casefold() for def_name in def_names] found_things = hed_string.find_wildcard_tags(wildcard_tags, recursive=True, include_groups=include_groups) if remove_group: found_things = [tag if isinstance(group, HedString) else group for tag, group in found_things] diff --git a/hed/schema/hed_schema.py b/hed/schema/hed_schema.py index 34164204..85767fa8 100644 --- a/hed/schema/hed_schema.py +++ b/hed/schema/hed_schema.py @@ -520,7 +520,7 @@ def _find_tag_entry(self, tag, schema_namespace=""): clean_tag = str(tag) namespace = schema_namespace clean_tag = clean_tag[len(namespace):] - working_tag = clean_tag.lower() + working_tag = clean_tag.casefold() # Most tags are in the schema directly, so test that first found_entry = self._get_tag_entry(working_tag) @@ -699,10 +699,10 @@ def _get_modifiers_for_unit(self, unit): This is a lower level one that doesn't rely on the Unit entries being fully setup. """ - # todo: could refactor this so this unit.lower() part is in HedSchemaUnitSection.get + # todo: could refactor this so this unit.casefold() part is in HedSchemaUnitSection.get unit_entry = self.get_tag_entry(unit, HedSectionKey.Units) if unit_entry is None: - unit_entry = self.get_tag_entry(unit.lower(), HedSectionKey.Units) + unit_entry = self.get_tag_entry(unit.casefold(), HedSectionKey.Units) # Unit symbols must match exactly if unit_entry is None or unit_entry.has_attribute(HedKey.UnitSymbol): return [] diff --git a/hed/schema/hed_schema_entry.py b/hed/schema/hed_schema_entry.py index 2f42cca5..7b0e19b6 100644 --- a/hed/schema/hed_schema_entry.py +++ b/hed/schema/hed_schema_entry.py @@ -197,7 +197,7 @@ def get_derivative_unit_entry(self, units): if possible_match and possible_match.has_attribute(HedKey.UnitSymbol): return possible_match - possible_match = self.derivative_units.get(units.lower()) + possible_match = self.derivative_units.get(units.casefold()) # Unit symbols must match including case, a match of a unit symbol now is something like M becoming m. if possible_match and possible_match.has_attribute(HedKey.UnitSymbol): possible_match = None @@ -416,7 +416,7 @@ def finalize_entry(self, schema): if self._parent_tag: self._parent_tag.children[self.short_tag_name] = self self.takes_value_child_entry = schema._get_tag_entry(self.name + "/#") - self.tag_terms = tuple(self.long_tag_name.lower().split("/")) + self.tag_terms = tuple(self.long_tag_name.casefold().split("/")) self._finalize_inherited_attributes() self._finalize_takes_value_tag(schema) diff --git a/hed/schema/hed_schema_section.py b/hed/schema/hed_schema_section.py index 99d7b168..8d45bcba 100644 --- a/hed/schema/hed_schema_section.py +++ b/hed/schema/hed_schema_section.py @@ -64,7 +64,7 @@ def _add_to_dict(self, name, new_entry): """ Add a name to the dictionary for this section. """ name_key = name if not self.case_sensitive: - name_key = name.lower() + name_key = name.casefold() return_entry = self._check_if_duplicate(name_key, new_entry) @@ -115,7 +115,7 @@ def keys(self): def __getitem__(self, key): if not self.case_sensitive: - key = key.lower() + key = key.casefold() return self.all_names[key] def get(self, key): @@ -126,7 +126,7 @@ def get(self, key): """ if not self.case_sensitive: - key = key.lower() + key = key.casefold() return self.all_names.get(key) def __eq__(self, other): @@ -153,7 +153,7 @@ class HedSchemaUnitSection(HedSchemaSection): def _check_if_duplicate(self, name_key, new_entry): """We need to mark duplicate units(units with unitSymbol are case sensitive, while others are not.""" if not new_entry.has_attribute(HedKey.UnitSymbol): - name_key = name_key.lower() + name_key = name_key.casefold() return super()._check_if_duplicate(name_key, new_entry) @@ -220,24 +220,24 @@ def _check_if_duplicate(self, name, new_entry): else: self.all_names[name] = new_entry for tag_key in tag_forms: - name_key = tag_key.lower() + name_key = tag_key.casefold() self.long_form_tags[name_key] = new_entry return new_entry def get(self, key): if not self.case_sensitive: - key = key.lower() + key = key.casefold() return self.long_form_tags.get(key) def __getitem__(self, key): if not self.case_sensitive: - key = key.lower() + key = key.casefold() return self.long_form_tags[key] def __contains__(self, key): if not self.case_sensitive: - key = key.lower() + key = key.casefold() return key in self.long_form_tags @staticmethod diff --git a/hed/tools/analysis/annotation_util.py b/hed/tools/analysis/annotation_util.py index aafb2a8d..078a2968 100644 --- a/hed/tools/analysis/annotation_util.py +++ b/hed/tools/analysis/annotation_util.py @@ -3,6 +3,7 @@ import re from pandas import DataFrame from hed.errors.exceptions import HedFileError +from hed.models.df_util import replace_ref def check_df_columns(df, required_cols=('column_name', 'column_value', 'description', 'HED')): @@ -70,20 +71,12 @@ def extract_tags(hed_string, search_tag): - list: A list of the tags that were extracted, for example descriptions. """ - extracted = [] - remainder = "" - back_piece = hed_string - while back_piece: - ind = back_piece.find(search_tag) - if ind == -1: - remainder = _update_remainder(remainder, back_piece) - break - first_pos = _find_last_pos(back_piece[:ind]) - remainder = _update_remainder(remainder, trim_back(back_piece[:first_pos])) - next_piece = back_piece[first_pos:] - last_pos = _find_first_pos(next_piece) - extracted.append(trim_back(next_piece[:last_pos])) - back_piece = trim_front(next_piece[last_pos:]) + possible_descriptions = hed_string.replace(")", "").replace("(", "").split(",") + extracted = [tag.strip() for tag in possible_descriptions if search_tag in tag] + remainder = hed_string + for tag in extracted: + remainder = replace_ref(remainder, tag) + return remainder, extracted @@ -178,80 +171,6 @@ def merge_hed_dict(sidecar_dict, hed_dict): sidecar_dict[key]['Levels'] = value_dict['Levels'] -def trim_back(tag_string): - """ Return a trimmed copy of tag_string. - - Parameters: - tag_string (str): A tag string to be trimmed. - - Returns: - str: A copy of tag_string that has been trimmed. - - Notes: - - The trailing blanks and commas are removed from the copy. - - - """ - - last_pos = 0 - for ind, char in enumerate(reversed(tag_string)): - if char not in [',', ' ']: - last_pos = ind - break - return_str = tag_string[:(len(tag_string)-last_pos)] - return return_str - - -def trim_front(tag_string): - """ Return a copy of tag_string with leading blanks and commas removed. - - Parameters: - tag_string (str): A tag string to be trimmed. - - Returns: - str: A copy of tag_string that has been trimmed. - """ - first_pos = len(tag_string) - for ind, char in enumerate(tag_string): - if char not in [',', ' ']: - first_pos = ind - break - return_str = tag_string[first_pos:] - return return_str - - -def _find_first_pos(tag_string): - """ Return the position of the first comma or closing parenthesis in tag_string. - - Parameters: - tag_string (str): String to be analyzed. - - Returns: - int: Position of first comma or closing parenthesis or length of tag_string if none. - - """ - for ind, char in enumerate(tag_string): - if char in [',', ')']: - return ind - return len(tag_string) - - -def _find_last_pos(tag_string): - """ Find the position of the last comma, blank, or opening parenthesis in tag_string. - - Parameters: - tag_string (str): String to be analyzed. - - Returns: - int: Position of last comma or opening parenthesis or 0 if none. - - """ - for index, char in enumerate(reversed(tag_string)): - if char in [',', ' ', '(']: - return len(tag_string) - index - return 0 - - def _flatten_cat_col(col_key, col_dict): """ Flatten a sidecar entry corresponding to a categorical column. @@ -386,7 +305,7 @@ def _tag_list_to_str(extracted, removed_tag=None): return " ".join(extracted) str_list = [] for ind, item in enumerate(extracted): - ind = item.lower().find(removed_tag.lower()) + ind = item.casefold().find(removed_tag.casefold()) if ind >= 0: str_list.append(item[ind+len(removed_tag):]) else: @@ -419,22 +338,22 @@ def _update_cat_dict(cat_dict, value_entry, hed_entry, description_entry, descri cat_dict['HED'] = hed_part -def _update_remainder(remainder, update_piece): - """ Update remainder with update piece. - - Parameters: - remainder (str): A tag string without trailing comma. - update_piece (str): A tag string to be appended. - - Returns: - str: A concatenation of remainder and update_piece, paying attention to separating commas. - - """ - if not update_piece: - return remainder - elif not remainder: - return update_piece - elif remainder.endswith('(') or update_piece.startswith(')'): - return remainder + update_piece - else: - return remainder + ", " + update_piece +# def _update_remainder(remainder, update_piece): +# """ Update remainder with update piece. +# +# Parameters: +# remainder (str): A tag string without trailing comma. +# update_piece (str): A tag string to be appended. +# +# Returns: +# str: A concatenation of remainder and update_piece, paying attention to separating commas. +# +# """ +# if not update_piece: +# return remainder +# elif not remainder: +# return update_piece +# elif remainder.endswith('(') or update_piece.startswith(')'): +# return remainder + update_piece +# else: +# return remainder + ", " + update_piece diff --git a/hed/tools/analysis/event_manager.py b/hed/tools/analysis/event_manager.py index 645ff450..9bdc5183 100644 --- a/hed/tools/analysis/event_manager.py +++ b/hed/tools/analysis/event_manager.py @@ -101,8 +101,8 @@ def _extract_temporal_events(self, hed, event_index, onset_dict): to_remove = [] for def_tag, group in group_tuples: anchor_tag = group.find_def_tags(recursive=False, include_groups=0)[0] - anchor = anchor_tag.extension.lower() - if anchor in onset_dict or def_tag.short_base_tag == DefTagNames.OFFSET_KEY: + anchor = anchor_tag.extension.casefold() + if anchor in onset_dict or def_tag == DefTagNames.OFFSET_KEY: temporal_event = onset_dict.pop(anchor) temporal_event.set_end(event_index, self.onsets[event_index]) if def_tag == DefTagNames.ONSET_KEY: diff --git a/hed/tools/analysis/hed_tag_counts.py b/hed/tools/analysis/hed_tag_counts.py index e4b303e4..a552d6ff 100644 --- a/hed/tools/analysis/hed_tag_counts.py +++ b/hed/tools/analysis/hed_tag_counts.py @@ -99,7 +99,7 @@ def update_event_counts(self, hed_string_obj, file_name): tag_list = hed_string_obj.get_all_tags() tag_dict = {} for tag in tag_list: - str_tag = tag.short_base_tag.lower() + str_tag = tag.short_base_tag.casefold() if str_tag not in tag_dict: tag_dict[str_tag] = HedTagCount(tag, file_name) else: @@ -173,7 +173,7 @@ def create_template(tags): template_dict = {} for key, key_list in tags.items(): for element in key_list: - template_dict[element.lower()] = [] + template_dict[element.casefold()] = [] return template_dict @staticmethod diff --git a/hed/tools/analysis/hed_type.py b/hed/tools/analysis/hed_type.py index 10059cef..60cb1a3d 100644 --- a/hed/tools/analysis/hed_type.py +++ b/hed/tools/analysis/hed_type.py @@ -1,6 +1,7 @@ """ Manager a type variable and its associated context. """ import pandas as pd from hed.models import HedGroup, HedTag +from hed.models.model_constants import DefTagNames from hed.tools.analysis.hed_type_defs import HedTypeDefs from hed.tools.analysis.hed_type_factors import HedTypeFactors @@ -21,7 +22,7 @@ def __init__(self, event_manager, name, type_tag="condition-variable"): """ self.name = name - self.type_tag = type_tag.lower() + self.type_tag = type_tag.casefold() self.event_manager = event_manager self.type_defs = HedTypeDefs(event_manager.def_dict, type_tag=type_tag) self._type_map = {} # Dictionary of type tags versus dictionary with keys being definition names. @@ -41,7 +42,7 @@ def get_type_value_factors(self, type_value): HedTypeFactors or None """ - return self._type_map.get(type_value.lower(), None) + return self._type_map.get(type_value.casefold(), None) def get_type_value_level_info(self, type_value): """ Return type variable corresponding to type_value. @@ -121,7 +122,7 @@ def _extract_definition_variables(self, item, index): else: tags = item.get_all_tags() for tag in tags: - if tag.short_base_tag.lower() != "def": + if tag.short_base_tag != DefTagNames.DEF_KEY: continue hed_vars = self.type_defs.get_type_values(tag) if not hed_vars: @@ -140,7 +141,7 @@ def _update_definition_variables(self, tag, hed_vars, index): This modifies the HedTypeFactors map. """ - level = tag.extension.lower() + level = tag.extension.casefold() for var_name in hed_vars: hed_var = self._type_map.get(var_name, None) if hed_var is None: @@ -173,7 +174,7 @@ def get_type_list(type_tag, item): list: List of the items with this type_tag """ - if isinstance(item, HedTag) and item.short_base_tag.lower() == type_tag: + if isinstance(item, HedTag) and item.short_base_tag.casefold() == type_tag: tag_list = [item] elif isinstance(item, HedGroup) and item.children: tag_list = item.find_tags_with_term(type_tag, recursive=True, include_groups=0) @@ -190,7 +191,7 @@ def _update_variables(self, tag_list, index): """ for tag in tag_list: - tag_value = tag.extension.lower() + tag_value = tag.extension.casefold() if not tag_value: tag_value = self.type_tag hed_var = self._type_map.get(tag_value, None) diff --git a/hed/tools/analysis/hed_type_counts.py b/hed/tools/analysis/hed_type_counts.py index 31d8bd9c..49b458cf 100644 --- a/hed/tools/analysis/hed_type_counts.py +++ b/hed/tools/analysis/hed_type_counts.py @@ -16,7 +16,7 @@ class HedTypeCount: def __init__(self, type_value, type_tag, file_name=None): self.type_value = type_value - self.type_tag = type_tag.lower() + self.type_tag = type_tag.casefold() self.direct_references = 0 self.total_events = 0 self.events = 0 diff --git a/hed/tools/analysis/hed_type_defs.py b/hed/tools/analysis/hed_type_defs.py index a152123d..2a308415 100644 --- a/hed/tools/analysis/hed_type_defs.py +++ b/hed/tools/analysis/hed_type_defs.py @@ -27,7 +27,7 @@ def __init__(self, definitions, type_tag='condition-variable'): """ - self.type_tag = type_tag.lower() + self.type_tag = type_tag.casefold() if isinstance(definitions, DefinitionDict): self.definitions = definitions.defs elif isinstance(definitions, dict): @@ -50,7 +50,7 @@ def get_type_values(self, item): def_names = self.extract_def_names(item, no_value=True) type_values = [] for def_name in def_names: - values = self.def_map.get(def_name.lower(), {}) + values = self.def_map.get(def_name.casefold(), {}) if "type_values" in values: type_values = type_values + values["type_values"] return type_values @@ -81,7 +81,7 @@ def _extract_def_map(self): for entry in self.definitions.values(): type_def, type_values, description, other_tags = self._extract_entry_values(entry) if type_def: - def_map[type_def.lower()] = \ + def_map[type_def.casefold()] = \ {'def_name': type_def, 'type_values': type_values, 'description': description, 'tags': other_tags} return def_map @@ -115,12 +115,12 @@ def _extract_entry_values(self, entry): description = '' other_tags = [] for hed_tag in tag_list: - if hed_tag.short_base_tag.lower() == 'description': + if hed_tag.short_base_tag == 'Description': description = hed_tag.extension - elif hed_tag.short_base_tag.lower() != self.type_tag: + elif hed_tag.short_base_tag.casefold() != self.type_tag: other_tags.append(hed_tag.short_base_tag) else: - type_values.append(hed_tag.extension.lower()) + type_values.append(hed_tag.extension.casefold()) type_def = entry.name return type_def, type_values, description, other_tags @@ -137,9 +137,9 @@ def extract_def_names(item, no_value=True): """ if isinstance(item, HedTag) and 'def' in item.tag_terms: - names = [item.extension.lower()] + names = [item.extension.casefold()] else: - names = [tag.extension.lower() for tag in item.get_all_tags() if 'def' in tag.tag_terms] + names = [tag.extension.casefold() for tag in item.get_all_tags() if 'def' in tag.tag_terms] if no_value: for index, name in enumerate(names): name, name_value = HedTypeDefs.split_name(name) @@ -167,6 +167,6 @@ def split_name(name, lowercase=True): if len(parts) > 1: def_value = parts[1] if lowercase: - return def_name.lower(), def_value.lower() + return def_name.casefold(), def_value.casefold() else: return def_name, def_value diff --git a/hed/tools/analysis/hed_type_factors.py b/hed/tools/analysis/hed_type_factors.py index d9d38564..17f1de9c 100644 --- a/hed/tools/analysis/hed_type_factors.py +++ b/hed/tools/analysis/hed_type_factors.py @@ -21,7 +21,7 @@ def __init__(self, type_tag, type_value, number_elements): self.type_value = type_value self.number_elements = number_elements - self.type_tag = type_tag.lower() + self.type_tag = type_tag.casefold() self.levels = {} self.direct_indices = {} @@ -80,9 +80,9 @@ def _one_hot_to_categorical(self, factors, levels): df.at[index, self.type_value] = self.type_value continue for level in levels: - level_str = f"{self.type_value}.{level.lower()}" + level_str = f"{self.type_value}.{level.casefold()}" if level_str in row.index and row[level_str]: - df.at[index, self.type_value] = level.lower() + df.at[index, self.type_value] = level.casefold() break return df diff --git a/hed/tools/analysis/hed_type_manager.py b/hed/tools/analysis/hed_type_manager.py index 2cb01111..402d45d1 100644 --- a/hed/tools/analysis/hed_type_manager.py +++ b/hed/tools/analysis/hed_type_manager.py @@ -39,9 +39,9 @@ def add_type(self, type_name): type_name (str): Type tag name of the type to be added. """ - if type_name.lower() in self._type_map: + if type_name.casefold() in self._type_map: return - self._type_map[type_name.lower()] = \ + self._type_map[type_name.casefold()] = \ HedType(self.event_manager, 'run-01', type_tag=type_name) def get_factor_vectors(self, type_tag, type_values=None, factor_encoding="one-hot"): @@ -56,7 +56,7 @@ def get_factor_vectors(self, type_tag, type_values=None, factor_encoding="one-ho DataFrame or None: DataFrame containing the factor vectors as the columns. """ - this_var = self.get_type(type_tag.lower()) + this_var = self.get_type(type_tag.casefold()) if this_var is None: return None variables = this_var.get_type_value_names() @@ -80,7 +80,7 @@ def get_type(self, type_tag): HedType or None: the values associated with this type tag. """ - return self._type_map.get(type_tag.lower(), None) + return self._type_map.get(type_tag.casefold(), None) def get_type_tag_factor(self, type_tag, type_value): """ Return the HedTypeFactors a specified value and extension. @@ -90,9 +90,9 @@ def get_type_tag_factor(self, type_tag, type_value): type_value (str or None): Value of this tag to return the factors for. """ - this_map = self._type_map.get(type_tag.lower(), None) + this_map = self._type_map.get(type_tag.casefold(), None) if this_map: - return this_map._type_map.get(type_value.lower(), None) + return this_map._type_map.get(type_value.casefold(), None) return None def get_type_def_names(self, type_var): diff --git a/hed/tools/remodeling/operations/factor_hed_type_op.py b/hed/tools/remodeling/operations/factor_hed_type_op.py index 424720cc..ab407cb3 100644 --- a/hed/tools/remodeling/operations/factor_hed_type_op.py +++ b/hed/tools/remodeling/operations/factor_hed_type_op.py @@ -75,7 +75,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): df_list = [input_data.dataframe] var_manager = HedTypeManager( EventManager(input_data, dispatcher.hed_schema)) - var_manager.add_type(self.type_tag.lower()) + var_manager.add_type(self.type_tag.casefold()) df_factors = var_manager.get_factor_vectors( self.type_tag, self.type_values, factor_encoding="one-hot") diff --git a/hed/tools/remodeling/operations/summarize_hed_tags_op.py b/hed/tools/remodeling/operations/summarize_hed_tags_op.py index 1abcfe3c..f899baf5 100644 --- a/hed/tools/remodeling/operations/summarize_hed_tags_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_tags_op.py @@ -467,6 +467,6 @@ def _get_details(key_list, template, verbose=False): """ key_details = [] for item in key_list: - for tag_cnt in template[item.lower()]: + for tag_cnt in template[item.casefold()]: key_details.append(tag_cnt.get_info(verbose=verbose)) return key_details diff --git a/hed/tools/remodeling/operations/summarize_hed_type_op.py b/hed/tools/remodeling/operations/summarize_hed_type_op.py index 85c705f2..de1c73f3 100644 --- a/hed/tools/remodeling/operations/summarize_hed_type_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_type_op.py @@ -67,7 +67,7 @@ def __init__(self, parameters): super().__init__(parameters) self.summary_name = parameters['summary_name'] self.summary_filename = parameters['summary_filename'] - self.type_tag = parameters['type_tag'].lower() + self.type_tag = parameters['type_tag'].casefold() self.append_timecode = parameters.get('append_timecode', False) def do_op(self, dispatcher, df, name, sidecar=None): diff --git a/hed/validator/def_validator.py b/hed/validator/def_validator.py index 953a5f92..667e3858 100644 --- a/hed/validator/def_validator.py +++ b/hed/validator/def_validator.py @@ -78,7 +78,7 @@ def _validate_def_contents(self, def_tag, def_expand_group, hed_validator): is_def_expand_tag = def_expand_group != def_tag tag_label, _, placeholder = def_tag.extension.partition('/') - label_tag_lower = tag_label.lower() + label_tag_lower = tag_label.casefold() def_entry = self.defs.get(label_tag_lower) if def_entry is None: error_code = ValidationErrors.HED_DEF_UNMATCHED @@ -103,7 +103,7 @@ def validate_def_value_units(self, def_tag, hed_validator): tag_label, _, placeholder = def_tag.extension.partition('/') is_def_expand_tag = def_tag.short_base_tag == DefTagNames.DEF_EXPAND_KEY - def_entry = self.defs.get(tag_label.lower()) + def_entry = self.defs.get(tag_label.casefold()) # These errors will be caught as can't match definition if def_entry is None: return [] @@ -196,7 +196,7 @@ def _find_onset_tags(self, hed_string_obj): def _handle_onset_or_offset(self, def_tag): def_name, _, placeholder = def_tag.extension.partition('/') - def_entry = self.defs.get(def_name.lower()) + def_entry = self.defs.get(def_name.casefold()) if def_entry is None: return ErrorHandler.format_error(TemporalErrors.ONSET_DEF_UNMATCHED, tag=def_tag) if bool(def_entry.takes_value) != bool(placeholder): diff --git a/hed/validator/onset_validator.py b/hed/validator/onset_validator.py index 105090c6..dfd2b7cd 100644 --- a/hed/validator/onset_validator.py +++ b/hed/validator/onset_validator.py @@ -30,12 +30,12 @@ def validate_temporal_relations(self, hed_string_obj): def_tag = def_tags[0] def_name = def_tag.extension - if def_name.lower() in used_def_names: + if def_name.casefold() in used_def_names: onset_issues += ErrorHandler.format_error(TemporalErrors.ONSET_SAME_DEFS_ONE_ROW, tag=temporal_tag, def_name=def_name) continue - used_def_names.add(def_tag.extension.lower()) + used_def_names.add(def_tag.extension.casefold()) # At this point we have either an onset or offset tag and it's name onset_issues += self._handle_onset_or_offset(def_tag, temporal_tag) @@ -47,16 +47,16 @@ def _handle_onset_or_offset(self, def_tag, onset_offset_tag): full_def_name = def_tag.extension if is_onset: # onset can never fail as it implies an offset - self._onsets[full_def_name.lower()] = full_def_name + self._onsets[full_def_name.casefold()] = full_def_name else: is_offset = onset_offset_tag.short_base_tag == DefTagNames.OFFSET_KEY - if full_def_name.lower() not in self._onsets: + if full_def_name.casefold() not in self._onsets: if is_offset: return ErrorHandler.format_error(TemporalErrors.OFFSET_BEFORE_ONSET, tag=def_tag) else: return ErrorHandler.format_error(TemporalErrors.INSET_BEFORE_ONSET, tag=def_tag) elif is_offset: - del self._onsets[full_def_name.lower()] + del self._onsets[full_def_name.casefold()] return [] diff --git a/hed/validator/sidecar_validator.py b/hed/validator/sidecar_validator.py index 6f3b5b1e..462423b5 100644 --- a/hed/validator/sidecar_validator.py +++ b/hed/validator/sidecar_validator.py @@ -97,7 +97,7 @@ def validate(self, sidecar, extra_def_dicts=None, name=None, error_handler=None) ref_dict = dict(zip(refs, combination)) modified_string = hed_string for ref in refs: - modified_string = replace_ref(modified_string, ref_dict[ref], ref) + modified_string = replace_ref(modified_string, f"{{{ref}}}", ref_dict[ref]) hed_string_obj = HedString(modified_string, hed_schema=self._schema, def_dict=sidecar_def_dict) error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj) @@ -296,7 +296,7 @@ def _validate_pound_sign_count(self, hed_string, column_type): hed_string_copy.remove_definitions() hed_string_copy.shrink_defs() - if hed_string_copy.lower().count("#") != expected_count: + if str(hed_string_copy).count("#") != expected_count: return ErrorHandler.format_error(error_type, pound_sign_count=str(hed_string_copy).count("#")) return [] diff --git a/hed/validator/tag_util/group_util.py b/hed/validator/tag_util/group_util.py index 6e6c92ce..cc32970e 100644 --- a/hed/validator/tag_util/group_util.py +++ b/hed/validator/tag_util/group_util.py @@ -136,7 +136,7 @@ def check_for_required_tags(self, tags): validation_issues = [] required_prefixes = self._hed_schema.get_tags_with_attribute(HedKey.Required) for required_prefix in required_prefixes: - if not any(tag.long_tag.lower().startswith(required_prefix.lower()) for tag in tags): + if not any(tag.long_tag.casefold().startswith(required_prefix.casefold()) for tag in tags): validation_issues += ErrorHandler.format_error(ValidationErrors.REQUIRED_TAG_MISSING, tag_namespace=required_prefix) return validation_issues @@ -156,7 +156,7 @@ def check_multiple_unique_tags_exist(self, tags): validation_issues = [] unique_prefixes = self._hed_schema.get_tags_with_attribute(HedKey.Unique) for unique_prefix in unique_prefixes: - unique_tag_prefix_bool_mask = [x.long_tag.lower().startswith(unique_prefix.lower()) for x in tags] + unique_tag_prefix_bool_mask = [x.long_tag.casefold().startswith(unique_prefix.casefold()) for x in tags] if sum(unique_tag_prefix_bool_mask) > 1: validation_issues += ErrorHandler.format_error(ValidationErrors.TAG_NOT_UNIQUE, tag_namespace=unique_prefix) diff --git a/tests/data/schema_tests/schema_utf8.mediawiki b/tests/data/schema_tests/schema_utf8.mediawiki index 4eb37065..2cc5f437 100644 --- a/tests/data/schema_tests/schema_utf8.mediawiki +++ b/tests/data/schema_tests/schema_utf8.mediawiki @@ -6,6 +6,7 @@ HED version="8.3.0" unmerged="True" '''Tag1''' * Café +* ßword [ This is a special character that differs with .casefold vs .lower] '''Ascii''' * # {takesValue, valueClass=textClass} diff --git a/tests/data/schema_tests/schema_utf8_dupe.mediawiki b/tests/data/schema_tests/schema_utf8_dupe.mediawiki new file mode 100644 index 00000000..63d89ca3 --- /dev/null +++ b/tests/data/schema_tests/schema_utf8_dupe.mediawiki @@ -0,0 +1,29 @@ +HED version="8.3.0" unmerged="True" + +'''Prologue''' + +!# start schema + +'''Tag1''' +* Wßord [ This is a special character that differs with .casefold vs .lower] +* Wssord [This is the same word as above] + +!# end schema + +'''Unit classes''' [Unit classes and the units for the nodes.] + + + +'''Unit modifiers''' [Unit multiples and submultiples.] + + + +'''Value classes''' [Specification of the rules for the values provided by users.] + +'''Schema attributes''' [Allowed attribute modifiers of other sections of the schema.] + +'''Properties''' [Properties of the schema attributes themselves. These are used for schema handling and verification.] + +'''Epilogue''' + +!# end hed diff --git a/tests/schema/test_hed_schema_io.py b/tests/schema/test_hed_schema_io.py index ade99fab..bfd79371 100644 --- a/tests/schema/test_hed_schema_io.py +++ b/tests/schema/test_hed_schema_io.py @@ -115,6 +115,17 @@ def test_load_schema_version_merged(self): with self.assertRaises(HedFileError): schemas3.save_as_mediawiki("filename") + def test_verify_utf8_dupe(self): + base_dir = os.path.join(os.path.dirname(__file__), "../data/schema_tests") + schema_path = os.path.join(base_dir, "schema_utf8_dupe.mediawiki") + schema = load_schema(schema_path) + issues = schema.check_compliance() + self.assertEqual(len(issues), 1) + + # Note it finds both of these as a duplicate + self.assertTrue(schema.get_tag_entry("Wßord")) + self.assertTrue(schema.get_tag_entry("Wssord")) + def test_load_and_verify_tags(self): # Load 'testlib' by itself testlib = load_schema_version('testlib_2.0.0') diff --git a/tests/tools/analysis/test_annotation_util.py b/tests/tools/analysis/test_annotation_util.py index abcfcdbb..1455d209 100644 --- a/tests/tools/analysis/test_annotation_util.py +++ b/tests/tools/analysis/test_annotation_util.py @@ -7,9 +7,9 @@ from hed.errors import HedFileError from hed.models.sidecar import Sidecar from hed.tools.analysis.annotation_util import check_df_columns, df_to_hed, extract_tags, hed_to_df, merge_hed_dict -from hed.tools.analysis.annotation_util import _find_last_pos, _find_first_pos, \ - _flatten_cat_col, _flatten_val_col, _get_value_entry, trim_back, trim_front, _tag_list_to_str, _update_cat_dict, \ - generate_sidecar_entry +from hed.tools.analysis.annotation_util import _flatten_cat_col, _flatten_val_col, _get_value_entry, _tag_list_to_str, \ + _update_cat_dict, generate_sidecar_entry +# from hed.tools.analysis.annotation_util import _find_last_pos, _find_first_pos, trim_back, trim_front from hed.tools.analysis.tabular_summary import TabularSummary from hed.tools.util.io_util import get_file_list @@ -117,7 +117,7 @@ def extract_tag_multiple_matches(self): self.assertEqual(extracted6[1], "Description/Another description.", "extract_tags return right item when parens") - def extract_tag_with_parens(self): + def test_extract_tag_with_parens(self): str7 = "Bear, ((Informational-property/Description/Pluck this leaf., Junk), Description/Another description.)" remainder7, extracted7 = extract_tags(str7, 'Description/') self.assertEqual(remainder7, "Bear, ((Junk))", "extract_tags should return the right string when parens") @@ -291,52 +291,6 @@ def test_merge_hed_dict_full(self): merge_hed_dict(example_sidecar, spreadsheet_sidecar) self.assertEqual(6, len(example_sidecar), 'merge_hed_dict merges with the correct length') - def test_trim_back(self): - str1 = 'Blech, Cat, (' - trim1 = trim_back(str1) - self.assertEqual(trim1, str1, 'trim_back should trim the correct amount') - str2 = "" - trim2 = trim_back(str2) - self.assertFalse(trim2, 'trim_back should trim an empty string to empty') - str3 = '(Blech, Cat), ' - trim3 = trim_back(str3) - self.assertEqual('(Blech, Cat)', trim3, 'trim_back should trim extra blanks and comma') - - def test_trim_front(self): - str1 = ', (Blech, Cat)' - trim1 = trim_front(str1) - self.assertEqual(trim1, "(Blech, Cat)", 'trim_front should trim the correct amount') - str2 = "" - trim2 = trim_front(str2) - self.assertFalse(trim2, 'trim_front should trim an empty string to empty') - str3 = '(Blech, Cat)' - trim3 = trim_front(str3) - self.assertEqual(str3, trim3, 'trim_front should trim not trim if no extras') - - def test_find_last_pos(self): - test1 = "Apple/1.0, (" - pos1 = _find_last_pos(test1) - self.assertEqual(pos1, len(test1)) - test2 = "Informational-property/" - pos2 = _find_last_pos(test2) - self.assertEqual(pos2, 0, "_find_last_pos should return the start if at the beginning") - test3 = "(Blech), (Property/Informational-property" - pos3 = _find_last_pos(test3) - self.assertEqual(pos3, 10, "_find_last_pos should return the start if at the beginning") - - def test_find_first_pos(self): - test1 = "My blech." - pos1 = _find_first_pos(test1) - self.assertEqual(pos1, len(test1), - "_find_first_position should return position at character after end of string") - - test2 = "My blech.))" - pos2 = _find_first_pos(test2) - self.assertEqual(pos2, 9, "_find_first_position should return position at closing parentheses") - test3 = "My blech., Description/My apple." - pos3 = _find_first_pos(test3) - self.assertEqual(pos3, 9, "_find_first_position should return position at closing parentheses") - def test_flatten_cat_col(self): col1 = self.sidecar2c["a"] col2 = self.sidecar2c["b"]