Skip to content

Commit

Permalink
Merge pull request #867 from IanCa/develop
Browse files Browse the repository at this point in the history
Turn on allowed character support in > 8.3
  • Loading branch information
VisLab authored Feb 22, 2024
2 parents 2b04113 + b722e5f commit 34c418a
Show file tree
Hide file tree
Showing 12 changed files with 313 additions and 44 deletions.
3 changes: 2 additions & 1 deletion hed/errors/schema_error_messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,10 @@ def schema_error_SCHEMA_CONVERSION_FACTOR_NOT_POSITIVE(tag, conversion_factor):
@hed_error(SchemaAttributeErrors.SCHEMA_ALLOWED_CHARACTERS_INVALID,
actual_code=SchemaAttributeErrors.SCHEMA_ATTRIBUTE_VALUE_INVALID)
def schema_error_SCHEMA_ALLOWED_CHARACTERS_INVALID(tag, invalid_character):
from hed.schema.hed_schema_constants import character_types
return (f"Tag '{tag}' has an invalid allowedCharacter: '{invalid_character}'. "
f"Allowed characters are: a single character, "
f"or one of the following - letters, blank, digits, alphanumeric.")
f"or one of the following - {', '.join(character_types.keys())}.")


@hed_error(SchemaAttributeErrors.SCHEMA_IN_LIBRARY_INVALID,
Expand Down
8 changes: 8 additions & 0 deletions hed/schema/hed_schema_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,11 @@ class HedKey:
NO_LOC_ATTRIB,
UNMERGED_ATTRIBUTE
}

# Named character classes that may appear in a schema's allowedCharacter
# attribute. Every entry maps the class name to the set of characters it
# permits, except "nonascii", which maps to a marker string instead of a set.
character_types = {
    "letters": set(
        "abcdefghijklmnopqrstuvwxyz"
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    ),
    "blank": {" "},
    "digits": set("0123456789"),
    "alphanumeric": set(
        "abcdefghijklmnopqrstuvwxyz"
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "0123456789"
    ),
    # Special case for all other printable unicode characters: there is no
    # enumerable set, so the name itself is stored as a marker value.
    "nonascii": "nonascii",
}
8 changes: 6 additions & 2 deletions hed/schema/schema_attribute_validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from hed.errors.error_types import SchemaWarnings, ValidationErrors, SchemaAttributeErrors
from hed.errors.error_reporter import ErrorHandler
from hed.schema.hed_cache import get_hed_versions
from hed.schema.hed_schema_constants import HedKey
from hed.schema.hed_schema_constants import HedKey, character_types


def tag_is_placeholder_check(hed_schema, tag_entry, attribute_name):
Expand Down Expand Up @@ -141,6 +141,10 @@ def tag_is_deprecated_check(hed_schema, tag_entry, attribute_name):
deprecated_version = tag_entry.attributes.get(attribute_name, "")
library_name = tag_entry.has_attribute(HedKey.InLibrary, return_value=True)
all_versions = get_hed_versions(library_name=library_name)
if not library_name:
library_name = ""
if library_name == hed_schema.library and hed_schema.version_number not in all_versions:
all_versions.append(hed_schema.version_number)
if deprecated_version and deprecated_version not in all_versions:
issues += ErrorHandler.format_error(SchemaAttributeErrors.SCHEMA_DEPRECATED_INVALID,
tag_entry.name,
Expand Down Expand Up @@ -182,7 +186,7 @@ def allowed_characters_check(hed_schema, tag_entry, attribute_name):
"""
issues = []
allowed_strings = {'letters', 'blank', 'digits', 'alphanumeric'}
allowed_strings = character_types

char_string = tag_entry.attributes.get(attribute_name, "")
characters = char_string.split(",")
Expand Down
1 change: 1 addition & 0 deletions hed/schema/schema_compliance.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ def check_invalid_chars(self):
for tag_name, desc in self.hed_schema.get_desc_iter():
issues_list += validate_schema_description(tag_name, desc)

# todo: Do we want to add this?
# todo Activate this section once we have clearer rules on spaces in unit names
# for unit in self.hed_schema.units:
# for i, char in enumerate(unit):
Expand Down
11 changes: 8 additions & 3 deletions hed/tools/visualization/tag_word_cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
import numpy as np
from PIL import Image
from hed.tools.visualization.word_cloud_util import default_color_func, WordCloud, generate_contour_svg
import matplotlib.font_manager as fm


def create_wordcloud(word_dict, mask_path=None, background_color=None, width=400, height=300, **kwargs):
def create_wordcloud(word_dict, mask_path=None, background_color=None, width=400, height=300, font_path=None, **kwargs):
""" Takes a word dict and returns a generated word cloud object.
Parameters:
Expand All @@ -14,6 +15,8 @@ def create_wordcloud(word_dict, mask_path=None, background_color=None, width=400
background_color (str or None): If None, transparent background.
width (int): width in pixels.
height (int): height in pixels.
font_path (str): a filename or font name to use. Assumed to be a full file path if it ends with .ttf or .otf.
Font names will use a default if a close enough match isn't found.
kwargs (kwargs): Any other parameters WordCloud accepts, overrides default values where relevant.
Returns:
Expand Down Expand Up @@ -41,9 +44,11 @@ def create_wordcloud(word_dict, mask_path=None, background_color=None, width=400
kwargs.setdefault('color_func', default_color_func)
kwargs.setdefault('relative_scaling', 1)
kwargs.setdefault('max_font_size', height / 20)
kwargs.setdefault('min_font_size', 8),
kwargs.setdefault('min_font_size', 8)
if font_path and not font_path.endswith(".ttf") and not font_path.endswith(".otf"):
font_path = fm.findfont(font_path)

wc = WordCloud(background_color=background_color, mask=mask_image,
wc = WordCloud(font_path=font_path, background_color=background_color, mask=mask_image,
width=width, height=height, mode="RGBA", **kwargs)

wc.generate_from_frequencies(word_dict)
Expand Down
14 changes: 12 additions & 2 deletions hed/validator/hed_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@
"""
import re
from semantic_version import Version
from hed.errors.error_types import ValidationErrors, DefinitionErrors
from hed.errors.error_reporter import ErrorHandler, check_for_any_errors

from hed.validator.def_validator import DefValidator
from hed.validator.tag_util import UnitValueValidator, CharValidator, StringValidator, TagValidator, GroupValidator
from hed.schema import HedSchema


class HedValidator:
Expand All @@ -31,8 +33,16 @@ def __init__(self, hed_schema, def_dicts=None, definitions_allowed=False):
self._def_validator = DefValidator(def_dicts, hed_schema)
self._definitions_allowed = definitions_allowed

self._unit_validator = UnitValueValidator()
self._char_validator = CharValidator()
self._validate_characters = False
# todo: This could still do validation on schema groups.
if isinstance(hed_schema, HedSchema):
validation_version = hed_schema.with_standard
if not validation_version:
validation_version = hed_schema.version_number
self._validate_characters = Version(validation_version) >= Version("8.3.0")

self._unit_validator = UnitValueValidator(modern_allowed_char_rules=self._validate_characters)
self._char_validator = CharValidator(modern_allowed_char_rules=self._validate_characters)
self._string_validator = StringValidator()
self._tag_validator = TagValidator()
self._group_validator = GroupValidator(hed_schema)
Expand Down
16 changes: 14 additions & 2 deletions hed/validator/tag_util/char_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@ class CharValidator:
INVALID_STRING_CHARS = '[]{}~'
INVALID_STRING_CHARS_PLACEHOLDERS = '[]~'

def __init__(self, modern_allowed_char_rules=False):
"""Does basic character validation for hed strings/tags
Parameters:
modern_allowed_char_rules(bool): If True, use 8.3 style rules for unicode characters.
"""
self._validate_characters = modern_allowed_char_rules

def check_invalid_character_issues(self, hed_string, allow_placeholders):
""" Report invalid characters.
Expand All @@ -33,8 +41,12 @@ def check_invalid_character_issues(self, hed_string, allow_placeholders):
if allow_placeholders:
invalid_dict = self.INVALID_STRING_CHARS_PLACEHOLDERS
for index, character in enumerate(hed_string):
if character in invalid_dict or ord(character) > 127:
validation_issues += self._report_invalid_character_error(hed_string, index)
if self._validate_characters:
if character in invalid_dict or not character.isprintable():
validation_issues += self._report_invalid_character_error(hed_string, index)
else:
if character in invalid_dict or ord(character) > 127:
validation_issues += self._report_invalid_character_error(hed_string, index)

return validation_issues

Expand Down
65 changes: 35 additions & 30 deletions hed/validator/tag_util/class_util.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
""" Utilities to support HED validation. """
import datetime
import re
import functools


from hed.errors.error_reporter import ErrorHandler
from hed.errors.error_types import ValidationErrors
from hed.schema.hed_schema_constants import HedKey, character_types


class UnitValueValidator:
Expand All @@ -18,13 +20,14 @@ class UnitValueValidator:

VALUE_CLASS_ALLOWED_CACHE = 20

def __init__(self, value_validators=None):
def __init__(self, modern_allowed_char_rules=False, value_validators=None):
""" Validates the unit and value classes on a given tag.
Parameters:
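modern_allowed_char_rules(bool): If True, check values against the allowedCharacter attribute of their
value classes (8.3 style rules). If False, only curly braces are reported as problem characters.
value_validators(dict or None): Override or add value class validators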
"""
self._validate_characters = modern_allowed_char_rules
self._value_validators = self._get_default_value_class_validators()
if value_validators and isinstance(value_validators, dict):
self._value_validators.update(value_validators)
Expand Down Expand Up @@ -97,25 +100,20 @@ def check_tag_value_class_valid(self, original_tag, validate_text, report_as=Non
"""
return self._check_value_class(original_tag, validate_text, report_as, error_code, index_offset)

# char_sets = {
# "letters": set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"),
# "blank": set(" "),
# "digits": set("0123456789"),
# "alphanumeric": set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
# }
#
# @functools.lru_cache(maxsize=VALUE_CLASS_ALLOWED_CACHE)
# def _get_allowed_characters(self, value_classes):
# # This could be pre-computed
# character_set = set()
# for value_class in value_classes:
# allowed_types = value_class.attributes.get(HedKey.AllowedCharacter, "")
# for single_type in allowed_types.split(","):
# if single_type in self.char_sets:
# character_set.update(self.char_sets[single_type])
# else:
# character_set.add(single_type)
# return character_set
@functools.lru_cache(maxsize=VALUE_CLASS_ALLOWED_CACHE)
def _get_allowed_characters(self, value_classes):
# This could be pre-computed
character_set = set()
for value_class in value_classes:
allowed_types = value_class.attributes.get(HedKey.AllowedCharacter, "")
for single_type in allowed_types.split(","):
if single_type in character_types and single_type != "nonascii":
character_set.update(character_types[single_type])
else:
character_set.add(single_type)
# For now, always allow these special cases (they are validated extensively elsewhere).
character_set.update("#/")
return character_set

def _get_problem_indexes(self, original_tag, stripped_value):
""" Return list of problem indices for error messages.
Expand All @@ -127,19 +125,24 @@ def _get_problem_indexes(self, original_tag, stripped_value):
Returns:
list: List of int locations in which error occurred.
"""
indexes = []
# Extra +1 for the slash
start_index = original_tag.extension.find(stripped_value) + len(original_tag.org_base_tag) + 1
if start_index == -1:
return []
return indexes

problem_indexes = [(char, index + start_index) for index, char in enumerate(stripped_value) if char in "{}"]
return problem_indexes
# Partial implementation of allowedCharacter
# allowed_characters = self._get_allowed_characters(original_tag.value_classes.values())
# if allowed_characters:
# # Only test the strippedvalue - otherwise numericClass + unitClass won't validate reasonably.
# indexes = [index for index, char in enumerate(stripped_value) if char not in allowed_characters]
# pass
if self._validate_characters:
allowed_characters = self._get_allowed_characters(original_tag.value_classes.values())

if allowed_characters:
# Only test the strippedvalue - otherwise numericClass + unitClass won't validate reasonably.
indexes = [(char, index + start_index) for index, char in enumerate(stripped_value) if char not in allowed_characters]
if "nonascii" in allowed_characters:
# "nonascii" permits printable non-ASCII characters, so drop those from the problem list
indexes = [(char, index) for char, index in indexes if not (ord(char) > 127 and char.isprintable())]
else:
indexes = [(char, index + start_index) for index, char in enumerate(stripped_value) if char in "{}"]
return indexes

def _check_value_class(self, original_tag, stripped_value, report_as, error_code=None, index_offset=0):
""" Return any issues found if this is a value tag,
Expand Down Expand Up @@ -219,12 +222,14 @@ def validate_value_class_type(self, unit_or_value_portion, valid_types):
type_valid (bool): True if this is one of the valid_types validators.
"""
has_valid_func = False
for unit_class_type in valid_types:
valid_func = self._value_validators.get(unit_class_type)
if valid_func:
has_valid_func = True
if valid_func(unit_or_value_portion):
return True
return False
return not has_valid_func


def is_date_time(date_time_string):
Expand Down
Loading

0 comments on commit 34c418a

Please sign in to comment.