Merge pull request #867 from IanCa/develop

Turn on allowed character support in > 8.3
hed-standard · Feb 22, 2024 · 34c418a · 34c418a
2 parents 2b04113 + b722e5f
commit 34c418a
Show file tree

Hide file tree

Showing 12 changed files with 313 additions and 44 deletions.
diff --git a/hed/errors/schema_error_messages.py b/hed/errors/schema_error_messages.py
@@ -94,9 +94,10 @@ def schema_error_SCHEMA_CONVERSION_FACTOR_NOT_POSITIVE(tag, conversion_factor):
 @hed_error(SchemaAttributeErrors.SCHEMA_ALLOWED_CHARACTERS_INVALID,
            actual_code=SchemaAttributeErrors.SCHEMA_ATTRIBUTE_VALUE_INVALID)
 def schema_error_SCHEMA_ALLOWED_CHARACTERS_INVALID(tag, invalid_character):
+    from hed.schema.hed_schema_constants import character_types
     return (f"Tag '{tag}' has an invalid allowedCharacter: '{invalid_character}'.  "
             f"Allowed characters are: a single character, "
-            f"or one of the following - letters, blank, digits, alphanumeric.")
+            f"or one of the following - {', '.join(character_types.keys())}.")
 
 
 @hed_error(SchemaAttributeErrors.SCHEMA_IN_LIBRARY_INVALID,

diff --git a/hed/schema/hed_schema_constants.py b/hed/schema/hed_schema_constants.py
@@ -87,3 +87,11 @@ class HedKey:
     NO_LOC_ATTRIB,
     UNMERGED_ATTRIBUTE
 }
+
+character_types = {
+    "letters": set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"),
+    "blank": set(" "),
+    "digits": set("0123456789"),
+    "alphanumeric": set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"),
+    "nonascii": "nonascii"  # Special case for all other printable unicode characters
+}
diff --git a/hed/schema/schema_attribute_validators.py b/hed/schema/schema_attribute_validators.py
@@ -14,7 +14,7 @@
 from hed.errors.error_types import SchemaWarnings, ValidationErrors, SchemaAttributeErrors
 from hed.errors.error_reporter import ErrorHandler
 from hed.schema.hed_cache import get_hed_versions
-from hed.schema.hed_schema_constants import HedKey
+from hed.schema.hed_schema_constants import HedKey, character_types
 
 
 def tag_is_placeholder_check(hed_schema, tag_entry, attribute_name):
@@ -141,6 +141,10 @@ def tag_is_deprecated_check(hed_schema, tag_entry, attribute_name):
     deprecated_version = tag_entry.attributes.get(attribute_name, "")
     library_name = tag_entry.has_attribute(HedKey.InLibrary, return_value=True)
     all_versions = get_hed_versions(library_name=library_name)
+    if not library_name:
+        library_name = ""
+    if library_name == hed_schema.library and hed_schema.version_number not in all_versions:
+        all_versions.append(hed_schema.version_number)
     if deprecated_version and deprecated_version not in all_versions:
         issues += ErrorHandler.format_error(SchemaAttributeErrors.SCHEMA_DEPRECATED_INVALID,
                                             tag_entry.name,
@@ -182,7 +186,7 @@ def allowed_characters_check(hed_schema, tag_entry, attribute_name):
 
     """
     issues = []
-    allowed_strings = {'letters', 'blank', 'digits', 'alphanumeric'}
+    allowed_strings = character_types
 
     char_string = tag_entry.attributes.get(attribute_name, "")
     characters = char_string.split(",")

diff --git a/hed/schema/schema_compliance.py b/hed/schema/schema_compliance.py
@@ -125,6 +125,7 @@ def check_invalid_chars(self):
             for tag_name, desc in self.hed_schema.get_desc_iter():
                 issues_list += validate_schema_description(tag_name, desc)
 
+        # todo: Do we want to add this?
         # todo Activate this session once we have clearer rules on spaces in unit names
         # for unit in self.hed_schema.units:
         #     for i, char in enumerate(unit):

diff --git a/hed/tools/visualization/tag_word_cloud.py b/hed/tools/visualization/tag_word_cloud.py
@@ -3,9 +3,10 @@
 import numpy as np
 from PIL import Image
 from hed.tools.visualization.word_cloud_util import default_color_func, WordCloud, generate_contour_svg
+import matplotlib.font_manager as fm
 
 
-def create_wordcloud(word_dict, mask_path=None, background_color=None, width=400, height=300, **kwargs):
+def create_wordcloud(word_dict, mask_path=None, background_color=None, width=400, height=300, font_path=None, **kwargs):
     """ Takes a word dict and returns a generated word cloud object.
 
     Parameters:
@@ -14,6 +15,8 @@ def create_wordcloud(word_dict, mask_path=None, background_color=None, width=400
         background_color (str or None): If None, transparent background.
         width (int): width in pixels.
         height (int): height in pixels.
+        font_path (str or None): A font file path or a font name.  Treated as a file path if it ends with .ttf or .otf.
+                         Any other value is looked up as a font name; a default font is used if no close match is found.
         kwargs (kwargs): Any other parameters WordCloud accepts, overrides default values where relevant.
 
     Returns:
@@ -41,9 +44,11 @@ def create_wordcloud(word_dict, mask_path=None, background_color=None, width=400
     kwargs.setdefault('color_func', default_color_func)
     kwargs.setdefault('relative_scaling', 1)
     kwargs.setdefault('max_font_size', height / 20)
-    kwargs.setdefault('min_font_size', 8),
+    kwargs.setdefault('min_font_size', 8)
+    if font_path and not font_path.endswith(".ttf") and not font_path.endswith(".otf"):
+        font_path = fm.findfont(font_path)
 
-    wc = WordCloud(background_color=background_color, mask=mask_image,
+    wc = WordCloud(font_path=font_path, background_color=background_color, mask=mask_image,
                    width=width, height=height, mode="RGBA", **kwargs)
 
     wc.generate_from_frequencies(word_dict)

diff --git a/hed/validator/hed_validator.py b/hed/validator/hed_validator.py
@@ -5,11 +5,13 @@
 
 """
 import re
+from semantic_version import Version
 from hed.errors.error_types import ValidationErrors, DefinitionErrors
 from hed.errors.error_reporter import ErrorHandler, check_for_any_errors
 
 from hed.validator.def_validator import DefValidator
 from hed.validator.tag_util import UnitValueValidator, CharValidator, StringValidator, TagValidator, GroupValidator
+from hed.schema import HedSchema
 
 
 class HedValidator:
@@ -31,8 +33,16 @@ def __init__(self, hed_schema, def_dicts=None, definitions_allowed=False):
         self._def_validator = DefValidator(def_dicts, hed_schema)
         self._definitions_allowed = definitions_allowed
 
-        self._unit_validator = UnitValueValidator()
-        self._char_validator = CharValidator()
+        self._validate_characters = False
+        # todo: This could still do validation on schema groups.
+        if isinstance(hed_schema, HedSchema):
+            validation_version = hed_schema.with_standard
+            if not validation_version:
+                validation_version = hed_schema.version_number
+            self._validate_characters = Version(validation_version) >= Version("8.3.0")
+
+        self._unit_validator = UnitValueValidator(modern_allowed_char_rules=self._validate_characters)
+        self._char_validator = CharValidator(modern_allowed_char_rules=self._validate_characters)
         self._string_validator = StringValidator()
         self._tag_validator = TagValidator()
         self._group_validator = GroupValidator(hed_schema)

diff --git a/hed/validator/tag_util/char_util.py b/hed/validator/tag_util/char_util.py
@@ -14,6 +14,14 @@ class CharValidator:
     INVALID_STRING_CHARS = '[]{}~'
     INVALID_STRING_CHARS_PLACEHOLDERS = '[]~'
 
+    def __init__(self, modern_allowed_char_rules=False):
+        """Does basic character validation for hed strings/tags
+
+        Parameters:
+            modern_allowed_char_rules(bool): If True, use 8.3 style rules for unicode characters.
+        """
+        self._validate_characters = modern_allowed_char_rules
+
     def check_invalid_character_issues(self, hed_string, allow_placeholders):
         """ Report invalid characters.
 
@@ -33,8 +41,12 @@ def check_invalid_character_issues(self, hed_string, allow_placeholders):
         if allow_placeholders:
             invalid_dict = self.INVALID_STRING_CHARS_PLACEHOLDERS
         for index, character in enumerate(hed_string):
-            if character in invalid_dict or ord(character) > 127:
-                validation_issues += self._report_invalid_character_error(hed_string, index)
+            if self._validate_characters:
+                if character in invalid_dict or not character.isprintable():
+                    validation_issues += self._report_invalid_character_error(hed_string, index)
+            else:
+                if character in invalid_dict or ord(character) > 127:
+                    validation_issues += self._report_invalid_character_error(hed_string, index)
 
         return validation_issues
 

diff --git a/hed/validator/tag_util/class_util.py b/hed/validator/tag_util/class_util.py
@@ -1,10 +1,12 @@
 """ Utilities to support HED validation. """
 import datetime
 import re
+import functools
 
 
 from hed.errors.error_reporter import ErrorHandler
 from hed.errors.error_types import ValidationErrors
+from hed.schema.hed_schema_constants import HedKey, character_types
 
 
 class UnitValueValidator:
@@ -18,13 +20,14 @@ class UnitValueValidator:
 
     VALUE_CLASS_ALLOWED_CACHE = 20
 
-    def __init__(self, value_validators=None):
+    def __init__(self, modern_allowed_char_rules=False, value_validators=None):
         """ Validates the unit and value classes on a given tag.
 
         Parameters:
             value_validators(dict or None): Override or add value class validators
 
         """
+        self._validate_characters = modern_allowed_char_rules
         self._value_validators = self._get_default_value_class_validators()
         if value_validators and isinstance(value_validators, dict):
             self._value_validators.update(value_validators)
@@ -97,25 +100,20 @@ def check_tag_value_class_valid(self, original_tag, validate_text, report_as=Non
         """
         return self._check_value_class(original_tag, validate_text, report_as, error_code, index_offset)
 
-    # char_sets = {
-    #     "letters": set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"),
-    #     "blank": set(" "),
-    #     "digits": set("0123456789"),
-    #     "alphanumeric": set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
-    # }
-    #
-    # @functools.lru_cache(maxsize=VALUE_CLASS_ALLOWED_CACHE)
-    # def _get_allowed_characters(self, value_classes):
-    #     # This could be pre-computed
-    #     character_set = set()
-    #     for value_class in value_classes:
-    #         allowed_types = value_class.attributes.get(HedKey.AllowedCharacter, "")
-    #         for single_type in allowed_types.split(","):
-    #             if single_type in self.char_sets:
-    #                 character_set.update(self.char_sets[single_type])
-    #             else:
-    #                 character_set.add(single_type)
-    #     return character_set
+    @functools.lru_cache(maxsize=VALUE_CLASS_ALLOWED_CACHE)
+    def _get_allowed_characters(self, value_classes):
+        # This could be pre-computed
+        character_set = set()
+        for value_class in value_classes:
+            allowed_types = value_class.attributes.get(HedKey.AllowedCharacter, "")
+            for single_type in allowed_types.split(","):
+                if single_type in character_types and single_type != "nonascii":
+                    character_set.update(character_types[single_type])
+                else:
+                    character_set.add(single_type)
+        # For now, always allow these special characters (they are validated extensively elsewhere).
+        character_set.update("#/")
+        return character_set
 
     def _get_problem_indexes(self, original_tag, stripped_value):
         """ Return list of problem indices for error messages.
@@ -127,19 +125,24 @@ def _get_problem_indexes(self, original_tag, stripped_value):
         Returns:
             list: List of int locations in which error occurred.
         """
+        indexes = []
         # Extra +1 for the slash
         start_index = original_tag.extension.find(stripped_value) + len(original_tag.org_base_tag) + 1
         if start_index == -1:
-            return []
+            return indexes
 
-        problem_indexes = [(char, index + start_index) for index, char in enumerate(stripped_value) if char in "{}"]
-        return problem_indexes
-        # Partial implementation of allowedCharacter
-        # allowed_characters = self._get_allowed_characters(original_tag.value_classes.values())
-        # if allowed_characters:
-        #     # Only test the strippedvalue - otherwise numericClass + unitClass won't validate reasonably.
-        #     indexes = [index for index, char in enumerate(stripped_value) if char not in allowed_characters]
-        #     pass
+        if self._validate_characters:
+            allowed_characters = self._get_allowed_characters(original_tag.value_classes.values())
+
+            if allowed_characters:
+                # Only test the strippedvalue - otherwise numericClass + unitClass won't validate reasonably.
+                indexes = [(char, index + start_index) for index, char in enumerate(stripped_value) if char not in allowed_characters]
+                if "nonascii" in allowed_characters:
+                    # Printable non-ASCII characters are allowed here, so drop them from the problem list
+                    indexes = [(char, index) for char, index in indexes if not (ord(char) > 127 and char.isprintable())]
+        else:
+            indexes = [(char, index + start_index) for index, char in enumerate(stripped_value) if char in "{}"]
+        return indexes
 
     def _check_value_class(self, original_tag, stripped_value, report_as, error_code=None, index_offset=0):
         """ Return any issues found if this is a value tag,
@@ -219,12 +222,14 @@ def validate_value_class_type(self, unit_or_value_portion, valid_types):
             type_valid (bool): True if this is one of the valid_types validators.
 
         """
+        has_valid_func = False
         for unit_class_type in valid_types:
             valid_func = self._value_validators.get(unit_class_type)
             if valid_func:
+                has_valid_func = True
                 if valid_func(unit_or_value_portion):
                     return True
-        return False
+        return not has_valid_func
 
 
 def is_date_time(date_time_string):