Turn on allowed character support for schema versions >= 8.3.0

Partial UTF-8 support (allowedCharacter = nonascii)
hed-standard · Feb 22, 2024 · c8c9614 · c8c9614
1 parent 2b04113
commit c8c9614
Show file tree

Hide file tree

Showing 10 changed files with 262 additions and 40 deletions.
diff --git a/hed/errors/schema_error_messages.py b/hed/errors/schema_error_messages.py
@@ -94,9 +94,10 @@ def schema_error_SCHEMA_CONVERSION_FACTOR_NOT_POSITIVE(tag, conversion_factor):
 @hed_error(SchemaAttributeErrors.SCHEMA_ALLOWED_CHARACTERS_INVALID,
            actual_code=SchemaAttributeErrors.SCHEMA_ATTRIBUTE_VALUE_INVALID)
 def schema_error_SCHEMA_ALLOWED_CHARACTERS_INVALID(tag, invalid_character):
+    from hed.schema.hed_schema_constants import character_types
     return (f"Tag '{tag}' has an invalid allowedCharacter: '{invalid_character}'.  "
             f"Allowed characters are: a single character, "
-            f"or one of the following - letters, blank, digits, alphanumeric.")
+            f"or one of the following - {', '.join(character_types.keys())}.")
 
 
 @hed_error(SchemaAttributeErrors.SCHEMA_IN_LIBRARY_INVALID,

diff --git a/hed/schema/hed_schema_constants.py b/hed/schema/hed_schema_constants.py
@@ -87,3 +87,11 @@ class HedKey:
     NO_LOC_ATTRIB,
     UNMERGED_ATTRIBUTE
 }
+
+character_types = {
+    "letters": set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"),
+    "blank": set(" "),
+    "digits": set("0123456789"),
+    "alphanumeric": set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"),
+    "nonascii": "nonascii"  # Special case for all other printable unicode characters
+}
diff --git a/hed/schema/schema_attribute_validators.py b/hed/schema/schema_attribute_validators.py
@@ -14,7 +14,7 @@
 from hed.errors.error_types import SchemaWarnings, ValidationErrors, SchemaAttributeErrors
 from hed.errors.error_reporter import ErrorHandler
 from hed.schema.hed_cache import get_hed_versions
-from hed.schema.hed_schema_constants import HedKey
+from hed.schema.hed_schema_constants import HedKey, character_types
 
 
 def tag_is_placeholder_check(hed_schema, tag_entry, attribute_name):
@@ -141,6 +141,10 @@ def tag_is_deprecated_check(hed_schema, tag_entry, attribute_name):
     deprecated_version = tag_entry.attributes.get(attribute_name, "")
     library_name = tag_entry.has_attribute(HedKey.InLibrary, return_value=True)
     all_versions = get_hed_versions(library_name=library_name)
+    if not library_name:
+        library_name = ""
+    if library_name == hed_schema.library and hed_schema.version_number not in all_versions:
+        all_versions.append(hed_schema.version_number)
     if deprecated_version and deprecated_version not in all_versions:
         issues += ErrorHandler.format_error(SchemaAttributeErrors.SCHEMA_DEPRECATED_INVALID,
                                             tag_entry.name,
@@ -182,7 +186,7 @@ def allowed_characters_check(hed_schema, tag_entry, attribute_name):
 
     """
     issues = []
-    allowed_strings = {'letters', 'blank', 'digits', 'alphanumeric'}
+    allowed_strings = character_types
 
     char_string = tag_entry.attributes.get(attribute_name, "")
     characters = char_string.split(",")

diff --git a/hed/schema/schema_compliance.py b/hed/schema/schema_compliance.py
@@ -125,6 +125,7 @@ def check_invalid_chars(self):
             for tag_name, desc in self.hed_schema.get_desc_iter():
                 issues_list += validate_schema_description(tag_name, desc)
 
+        # todo: Do we want to add this?
         # todo Activate this session once we have clearer rules on spaces in unit names
         # for unit in self.hed_schema.units:
         #     for i, char in enumerate(unit):

diff --git a/hed/validator/hed_validator.py b/hed/validator/hed_validator.py
@@ -31,7 +31,7 @@ def __init__(self, hed_schema, def_dicts=None, definitions_allowed=False):
         self._def_validator = DefValidator(def_dicts, hed_schema)
         self._definitions_allowed = definitions_allowed
 
-        self._unit_validator = UnitValueValidator()
+        self._unit_validator = UnitValueValidator(hed_schema)
         self._char_validator = CharValidator()
         self._string_validator = StringValidator()
         self._tag_validator = TagValidator()

diff --git a/hed/validator/tag_util/char_util.py b/hed/validator/tag_util/char_util.py
@@ -33,7 +33,7 @@ def check_invalid_character_issues(self, hed_string, allow_placeholders):
         if allow_placeholders:
             invalid_dict = self.INVALID_STRING_CHARS_PLACEHOLDERS
         for index, character in enumerate(hed_string):
-            if character in invalid_dict or ord(character) > 127:
+            if character in invalid_dict or not character.isprintable():
                 validation_issues += self._report_invalid_character_error(hed_string, index)
 
         return validation_issues

diff --git a/hed/validator/tag_util/class_util.py b/hed/validator/tag_util/class_util.py
@@ -1,11 +1,14 @@
 """ Utilities to support HED validation. """
 import datetime
 import re
+import functools
+from semantic_version import Version
 
 
 from hed.errors.error_reporter import ErrorHandler
 from hed.errors.error_types import ValidationErrors
-
+from hed.schema.hed_schema_constants import HedKey, character_types
+from hed.schema import HedSchema
 
 class UnitValueValidator:
     """ Validates units. """
@@ -18,13 +21,20 @@ class UnitValueValidator:
 
     VALUE_CLASS_ALLOWED_CACHE = 20
 
-    def __init__(self, value_validators=None):
+    def __init__(self, hed_schema, value_validators=None):
         """ Validates the unit and value classes on a given tag.
 
         Parameters:
             value_validators(dict or None): Override or add value class validators
 
         """
+        self._validate_characters = False
+        # todo: Extend character validation for schema groups eventually
+        if isinstance(hed_schema, HedSchema):
+            validation_version = hed_schema.with_standard
+            if not validation_version:
+                validation_version = hed_schema.version_number
+            self._validate_characters = Version(validation_version) >= Version("8.3.0")
         self._value_validators = self._get_default_value_class_validators()
         if value_validators and isinstance(value_validators, dict):
             self._value_validators.update(value_validators)
@@ -97,25 +107,20 @@ def check_tag_value_class_valid(self, original_tag, validate_text, report_as=Non
         """
         return self._check_value_class(original_tag, validate_text, report_as, error_code, index_offset)
 
-    # char_sets = {
-    #     "letters": set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"),
-    #     "blank": set(" "),
-    #     "digits": set("0123456789"),
-    #     "alphanumeric": set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
-    # }
-    #
-    # @functools.lru_cache(maxsize=VALUE_CLASS_ALLOWED_CACHE)
-    # def _get_allowed_characters(self, value_classes):
-    #     # This could be pre-computed
-    #     character_set = set()
-    #     for value_class in value_classes:
-    #         allowed_types = value_class.attributes.get(HedKey.AllowedCharacter, "")
-    #         for single_type in allowed_types.split(","):
-    #             if single_type in self.char_sets:
-    #                 character_set.update(self.char_sets[single_type])
-    #             else:
-    #                 character_set.add(single_type)
-    #     return character_set
+    @functools.lru_cache(maxsize=VALUE_CLASS_ALLOWED_CACHE)
+    def _get_allowed_characters(self, value_classes):
+        # This could be pre-computed
+        character_set = set()
+        for value_class in value_classes:
+            allowed_types = value_class.attributes.get(HedKey.AllowedCharacter, "")
+            for single_type in allowed_types.split(","):
+                if single_type in character_types and single_type != "nonascii":
+                    character_set.update(character_types[single_type])
+                else:
+                    character_set.add(single_type)
+        # For now, always allow the special characters '#' and '/' (they are validated extensively elsewhere).
+        character_set.update("#/")
+        return character_set
 
     def _get_problem_indexes(self, original_tag, stripped_value):
         """ Return list of problem indices for error messages.
@@ -127,19 +132,24 @@ def _get_problem_indexes(self, original_tag, stripped_value):
         Returns:
             list: List of int locations in which error occurred.
         """
+        indexes = []
         # Extra +1 for the slash
         start_index = original_tag.extension.find(stripped_value) + len(original_tag.org_base_tag) + 1
         if start_index == -1:
-            return []
+            return indexes
+
+        if self._validate_characters:
+            allowed_characters = self._get_allowed_characters(original_tag.value_classes.values())
 
-        problem_indexes = [(char, index + start_index) for index, char in enumerate(stripped_value) if char in "{}"]
-        return problem_indexes
-        # Partial implementation of allowedCharacter
-        # allowed_characters = self._get_allowed_characters(original_tag.value_classes.values())
-        # if allowed_characters:
-        #     # Only test the strippedvalue - otherwise numericClass + unitClass won't validate reasonably.
-        #     indexes = [index for index, char in enumerate(stripped_value) if char not in allowed_characters]
-        #     pass
+            if allowed_characters:
+                # Only test the strippedvalue - otherwise numericClass + unitClass won't validate reasonably.
+                indexes = [(char, index + start_index) for index, char in enumerate(stripped_value) if char not in allowed_characters]
+                if "nonascii" in allowed_characters:
+                    # Printable non-ASCII characters are allowed here, so drop them from the problem list
+                    indexes = [(char, index) for char, index in indexes if not (ord(char) > 127 and char.isprintable())]
+        else:
+            indexes = [(char, index + start_index) for index, char in enumerate(stripped_value) if char in "{}"]
+        return indexes
 
     def _check_value_class(self, original_tag, stripped_value, report_as, error_code=None, index_offset=0):
         """ Return any issues found if this is a value tag,
@@ -219,12 +229,14 @@ def validate_value_class_type(self, unit_or_value_portion, valid_types):
             type_valid (bool): True if this is one of the valid_types validators.
 
         """
+        has_valid_func = False
         for unit_class_type in valid_types:
             valid_func = self._value_validators.get(unit_class_type)
             if valid_func:
+                has_valid_func = True
                 if valid_func(unit_or_value_portion):
                     return True
-        return False
+        return not has_valid_func
 
 
 def is_date_time(date_time_string):