Skip to content

Commit

Permalink
Turn on allowed character support in > 8.3
Browse files Browse the repository at this point in the history
Partial UTF8 support(allowedcharacter = nonascii)
  • Loading branch information
IanCa committed Feb 22, 2024
1 parent 2b04113 commit c8c9614
Show file tree
Hide file tree
Showing 10 changed files with 262 additions and 40 deletions.
3 changes: 2 additions & 1 deletion hed/errors/schema_error_messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,10 @@ def schema_error_SCHEMA_CONVERSION_FACTOR_NOT_POSITIVE(tag, conversion_factor):
@hed_error(SchemaAttributeErrors.SCHEMA_ALLOWED_CHARACTERS_INVALID,
actual_code=SchemaAttributeErrors.SCHEMA_ATTRIBUTE_VALUE_INVALID)
def schema_error_SCHEMA_ALLOWED_CHARACTERS_INVALID(tag, invalid_character):
from hed.schema.hed_schema_constants import character_types
return (f"Tag '{tag}' has an invalid allowedCharacter: '{invalid_character}'. "
f"Allowed characters are: a single character, "
f"or one of the following - letters, blank, digits, alphanumeric.")
f"or one of the following - {', '.join(character_types.keys())}.")


@hed_error(SchemaAttributeErrors.SCHEMA_IN_LIBRARY_INVALID,
Expand Down
8 changes: 8 additions & 0 deletions hed/schema/hed_schema_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,11 @@ class HedKey:
NO_LOC_ATTRIB,
UNMERGED_ATTRIBUTE
}

# Named character classes that may appear in an allowedCharacter value.
# Entries are added in a fixed order because the keys are joined, in order,
# into the "invalid allowedCharacter" error message.
character_types = {}
character_types["letters"] = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
character_types["blank"] = set(" ")
character_types["digits"] = set("0123456789")
character_types["alphanumeric"] = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
# Special case: not a set of characters but a marker standing for all other
# printable unicode characters.
character_types["nonascii"] = "nonascii"
8 changes: 6 additions & 2 deletions hed/schema/schema_attribute_validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from hed.errors.error_types import SchemaWarnings, ValidationErrors, SchemaAttributeErrors
from hed.errors.error_reporter import ErrorHandler
from hed.schema.hed_cache import get_hed_versions
from hed.schema.hed_schema_constants import HedKey
from hed.schema.hed_schema_constants import HedKey, character_types


def tag_is_placeholder_check(hed_schema, tag_entry, attribute_name):
Expand Down Expand Up @@ -141,6 +141,10 @@ def tag_is_deprecated_check(hed_schema, tag_entry, attribute_name):
deprecated_version = tag_entry.attributes.get(attribute_name, "")
library_name = tag_entry.has_attribute(HedKey.InLibrary, return_value=True)
all_versions = get_hed_versions(library_name=library_name)
if not library_name:
library_name = ""
if library_name == hed_schema.library and hed_schema.version_number not in all_versions:
all_versions.append(hed_schema.version_number)
if deprecated_version and deprecated_version not in all_versions:
issues += ErrorHandler.format_error(SchemaAttributeErrors.SCHEMA_DEPRECATED_INVALID,
tag_entry.name,
Expand Down Expand Up @@ -182,7 +186,7 @@ def allowed_characters_check(hed_schema, tag_entry, attribute_name):
"""
issues = []
allowed_strings = {'letters', 'blank', 'digits', 'alphanumeric'}
allowed_strings = character_types

char_string = tag_entry.attributes.get(attribute_name, "")
characters = char_string.split(",")
Expand Down
1 change: 1 addition & 0 deletions hed/schema/schema_compliance.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ def check_invalid_chars(self):
for tag_name, desc in self.hed_schema.get_desc_iter():
issues_list += validate_schema_description(tag_name, desc)

# todo: Do we want to add this?
# todo: Activate this section once we have clearer rules on spaces in unit names
# for unit in self.hed_schema.units:
# for i, char in enumerate(unit):
Expand Down
2 changes: 1 addition & 1 deletion hed/validator/hed_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def __init__(self, hed_schema, def_dicts=None, definitions_allowed=False):
self._def_validator = DefValidator(def_dicts, hed_schema)
self._definitions_allowed = definitions_allowed

self._unit_validator = UnitValueValidator()
self._unit_validator = UnitValueValidator(hed_schema)
self._char_validator = CharValidator()
self._string_validator = StringValidator()
self._tag_validator = TagValidator()
Expand Down
2 changes: 1 addition & 1 deletion hed/validator/tag_util/char_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def check_invalid_character_issues(self, hed_string, allow_placeholders):
if allow_placeholders:
invalid_dict = self.INVALID_STRING_CHARS_PLACEHOLDERS
for index, character in enumerate(hed_string):
if character in invalid_dict or ord(character) > 127:
if character in invalid_dict or not character.isprintable():
validation_issues += self._report_invalid_character_error(hed_string, index)

return validation_issues
Expand Down
74 changes: 43 additions & 31 deletions hed/validator/tag_util/class_util.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
""" Utilities to support HED validation. """
import datetime
import re
import functools
from semantic_version import Version


from hed.errors.error_reporter import ErrorHandler
from hed.errors.error_types import ValidationErrors

from hed.schema.hed_schema_constants import HedKey, character_types
from hed.schema import HedSchema

class UnitValueValidator:
""" Validates units. """
Expand All @@ -18,13 +21,20 @@ class UnitValueValidator:

VALUE_CLASS_ALLOWED_CACHE = 20

def __init__(self, value_validators=None):
def __init__(self, hed_schema, value_validators=None):
""" Validates the unit and value classes on a given tag.
Parameters:
value_validators(dict or None): Override or add value class validators
"""
self._validate_characters = False
# todo: Extend character validation for schema groups eventually
if isinstance(hed_schema, HedSchema):
validation_version = hed_schema.with_standard
if not validation_version:
validation_version = hed_schema.version_number
self._validate_characters = Version(validation_version) >= Version("8.3.0")
self._value_validators = self._get_default_value_class_validators()
if value_validators and isinstance(value_validators, dict):
self._value_validators.update(value_validators)
Expand Down Expand Up @@ -97,25 +107,20 @@ def check_tag_value_class_valid(self, original_tag, validate_text, report_as=Non
"""
return self._check_value_class(original_tag, validate_text, report_as, error_code, index_offset)

# char_sets = {
# "letters": set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"),
# "blank": set(" "),
# "digits": set("0123456789"),
# "alphanumeric": set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
# }
#
# @functools.lru_cache(maxsize=VALUE_CLASS_ALLOWED_CACHE)
# def _get_allowed_characters(self, value_classes):
# # This could be pre-computed
# character_set = set()
# for value_class in value_classes:
# allowed_types = value_class.attributes.get(HedKey.AllowedCharacter, "")
# for single_type in allowed_types.split(","):
# if single_type in self.char_sets:
# character_set.update(self.char_sets[single_type])
# else:
# character_set.add(single_type)
# return character_set
@staticmethod
@functools.lru_cache(maxsize=VALUE_CLASS_ALLOWED_CACHE)
def _build_allowed_characters(allowed_specs):
    """ Build the allowed character set from allowedCharacter attribute strings.

    Parameters:
        allowed_specs(tuple of str): One comma separated allowedCharacter value per value class.

    Returns:
        frozenset: The individual characters allowed, plus the marker string "nonascii" if it was listed.
    """
    character_set = set()
    for allowed_types in allowed_specs:
        for single_type in allowed_types.split(","):
            if not single_type:
                # "".split(",") yields [""]; an empty entry names no character.
                continue
            if single_type in character_types and single_type != "nonascii":
                character_set.update(character_types[single_type])
            else:
                # A literal character, or the "nonascii" marker which is kept as-is
                # so callers can test for it by name.
                character_set.add(single_type)
    # for now, just always allow these special cases (they are validated extensively elsewhere)
    character_set.update("#/")
    # Immutable, because the same object is handed out on every cache hit.
    return frozenset(character_set)

def _get_allowed_characters(self, value_classes):
    """ Return the characters allowed by the given value classes.

    Parameters:
        value_classes(iterable): Value class entries; each must have an attributes dict.

    Returns:
        frozenset: The individual characters allowed, plus the marker string "nonascii" if it was listed.

    Notes:
        - The result is cached on the allowedCharacter strings rather than on the
          value_classes object itself. A dict view is hashed by identity, so a
          fresh view on every call would never produce a cache hit.
    """
    allowed_specs = tuple(value_class.attributes.get(HedKey.AllowedCharacter, "")
                          for value_class in value_classes)
    return self._build_allowed_characters(allowed_specs)

def _get_problem_indexes(self, original_tag, stripped_value):
""" Return list of problem indices for error messages.
Expand All @@ -127,19 +132,24 @@ def _get_problem_indexes(self, original_tag, stripped_value):
Returns:
list: List of int locations in which error occurred.
"""
indexes = []
# Extra +1 for the slash
start_index = original_tag.extension.find(stripped_value) + len(original_tag.org_base_tag) + 1
if start_index == -1:
return []
return indexes

if self._validate_characters:
allowed_characters = self._get_allowed_characters(original_tag.value_classes.values())

problem_indexes = [(char, index + start_index) for index, char in enumerate(stripped_value) if char in "{}"]
return problem_indexes
# Partial implementation of allowedCharacter
# allowed_characters = self._get_allowed_characters(original_tag.value_classes.values())
# if allowed_characters:
# # Only test the strippedvalue - otherwise numericClass + unitClass won't validate reasonably.
# indexes = [index for index, char in enumerate(stripped_value) if char not in allowed_characters]
# pass
if allowed_characters:
# Only test the strippedvalue - otherwise numericClass + unitClass won't validate reasonably.
indexes = [(char, index + start_index) for index, char in enumerate(stripped_value) if char not in allowed_characters]
if "nonascii" in allowed_characters:
# Printable non-ascii characters are allowed by "nonascii", so drop them from the problem list
indexes = [(char, index) for char, index in indexes if not (ord(char) > 127 and char.isprintable())]
else:
indexes = [(char, index + start_index) for index, char in enumerate(stripped_value) if char in "{}"]
return indexes

def _check_value_class(self, original_tag, stripped_value, report_as, error_code=None, index_offset=0):
""" Return any issues found if this is a value tag,
Expand Down Expand Up @@ -219,12 +229,14 @@ def validate_value_class_type(self, unit_or_value_portion, valid_types):
type_valid (bool): True if this is one of the valid_types validators.
"""
has_valid_func = False
for unit_class_type in valid_types:
valid_func = self._value_validators.get(unit_class_type)
if valid_func:
has_valid_func = True
if valid_func(unit_or_value_portion):
return True
return False
return not has_valid_func


def is_date_time(date_time_string):
Expand Down
Loading

0 comments on commit c8c9614

Please sign in to comment.