Skip to content

Commit

Permalink
Update sidecar validation to check fully combined hed strings
Browse files Browse the repository at this point in the history
  • Loading branch information
IanCa committed Mar 6, 2024
1 parent 50a6df1 commit a7fd57e
Show file tree
Hide file tree
Showing 8 changed files with 283 additions and 214 deletions.
68 changes: 3 additions & 65 deletions hed/models/base_input.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""
Superclass representing a basic columnar file.
"""
import re
import os

import openpyxl
Expand All @@ -11,6 +10,8 @@
from hed.errors.exceptions import HedFileError, HedExceptions
import pandas as pd

from hed.models.df_util import _handle_curly_braces_refs


class BaseInput:
""" Superclass representing a basic columnar file. """
Expand Down Expand Up @@ -417,7 +418,7 @@ def assemble(self, mapper=None, skip_curly_braces=False):
transformers, _ = mapper.get_transformers()
refs = self.get_column_refs()
column_names = list(transformers)
return self._handle_curly_braces_refs(all_columns, refs, column_names)
return _handle_curly_braces_refs(all_columns, refs, column_names)

def _handle_transforms(self, mapper):
transformers, need_categorical = mapper.get_transformers()
Expand All @@ -435,69 +436,6 @@ def _handle_transforms(self, mapper):

return all_columns

@staticmethod
def _replace_ref(text, newvalue, column_ref):
""" Replace column ref in x with y. If it's n/a, delete extra commas/parentheses.
Note: This function could easily be updated to handle non-curly brace values, but it's faster this way.
Parameters:
text (str): The input string containing the ref enclosed in curly braces.
newvalue (str): The replacement value for the ref.
column_ref (str): The ref to be replaced, without curly braces.
Returns:
str: The modified string with the ref replaced or removed.
"""
# If it's not n/a, we can just replace directly.
if newvalue != "n/a":
return text.replace(f"{{{column_ref}}}", newvalue)

def _remover(match):
p1 = match.group("p1").count("(")
p2 = match.group("p2").count(")")
if p1 > p2: # We have more starting parens than ending. Make sure we don't remove comma before
output = match.group("c1") + "(" * (p1 - p2)
elif p2 > p1: # We have more ending parens. Make sure we don't remove comma after
output = ")" * (p2 - p1) + match.group("c2")
else:
c1 = match.group("c1")
c2 = match.group("c2")
if c1:
c1 = ""
elif c2:
c2 = ""
output = c1 + c2

return output

# this finds all surrounding commas and parentheses to a reference.
# c1/c2 contain the comma(and possibly spaces) separating this ref from other tags
# p1/p2 contain the parentheses directly surrounding the tag
# All four groups can have spaces.
pattern = r'(?P<c1>[\s,]*)(?P<p1>[(\s]*)\{' + column_ref + r'\}(?P<p2>[\s)]*)(?P<c2>[\s,]*)'
return re.sub(pattern, _remover, text)

@staticmethod
def _handle_curly_braces_refs(df, refs, column_names):
    """ Plug in curly braces with other columns.

    Note: the non-ref columns of df are overwritten in place while the refs are filled in.

    Parameters:
        df (pd.DataFrame): The dataframe whose columns contain the refs and their values.
        refs (list): Column refs to replace (without {}).
        column_names (list): The columns of interest; refs not in this list are ignored.

    Returns:
        pd.DataFrame: The non-ref columns of df with the refs replaced.
    """
    # Filter out columns and refs that don't exist.
    refs = [ref for ref in refs if ref in column_names]
    remaining_columns = [column for column in column_names if column not in refs]

    # Replace references in the columns we are saving out.
    saved_columns = df[refs]
    for column_name in remaining_columns:
        for replacing_name in refs:
            # Assign a plain list so values are placed by position. Wrapping them in a
            # fresh pd.Series would align on a new 0..n-1 index and yield NaN for any
            # dataframe whose index is not the default range.
            # (If the data has no n/a values, a plain str.replace per row is MUCH faster.)
            df[column_name] = [BaseInput._replace_ref(x, y, replacing_name)
                               for x, y in zip(df[column_name], saved_columns[replacing_name])]

    return df[remaining_columns]

@staticmethod
def combine_dataframe(dataframe):
""" Combine all columns in the given dataframe into a single HED string series,
Expand Down
77 changes: 77 additions & 0 deletions hed/models/df_util.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
""" Utilities for assembly and conversion of HED strings to different forms. """
import re
from functools import partial
import pandas as pd
from hed.models.hed_string import HedString
Expand Down Expand Up @@ -144,3 +145,79 @@ def sort_dataframe_by_onsets(df):

return df_copy
return df


def replace_ref(text, newvalue, column_ref):
    """ Replace a curly-brace column ref in text with newvalue.

    If newvalue is "n/a", the ref is removed instead, along with the extra
    commas/parentheses that surrounded it.

    Parameters:
        text (str): The input string containing the ref enclosed in curly braces.
        newvalue (str): The replacement value for the ref.
        column_ref (str): The ref to be replaced, without curly braces.

    Returns:
        str: The modified string with the ref replaced or removed.
    """
    # Note: This function could easily be updated to handle non-curly brace values, but it seemed faster this way

    # If it's not n/a, we can just replace directly.
    if newvalue != "n/a":
        return text.replace(f"{{{column_ref}}}", newvalue)

    def _remover(match):
        opening = match.group("p1").count("(")
        closing = match.group("p2").count(")")
        if opening > closing:
            # More starting parens than ending: keep the comma before and the unmatched "(".
            return match.group("c1") + "(" * (opening - closing)
        if closing > opening:
            # More ending parens than starting: keep the unmatched ")" and the comma after.
            return ")" * (closing - opening) + match.group("c2")
        # Balanced parens: drop one separator. If there is a leading one it is the
        # one removed (the trailing one survives); otherwise nothing is left.
        return match.group("c2") if match.group("c1") else ""

    # This finds all surrounding commas and parentheses to a reference.
    # c1/c2 contain the comma (and possibly spaces) separating this ref from other tags.
    # p1/p2 contain the parentheses directly surrounding the tag.
    # All four groups can have spaces.
    # The ref is escaped so that regex metacharacters in a column name are matched literally.
    pattern = r'(?P<c1>[\s,]*)(?P<p1>[(\s]*)\{' + re.escape(column_ref) + r'\}(?P<p2>[\s)]*)(?P<c2>[\s,]*)'
    return re.sub(pattern, _remover, text)


def _handle_curly_braces_refs(df, refs, column_names):
    """ Fills in the refs in the dataframe

    You probably shouldn't call this function directly, but rather use base input.

    Parameters:
        df(pd.DataFrame): The dataframe to modify
        refs(list or pd.Series): a list of column refs to replace(without {})
        column_names(list): the columns we are interested in(should include all ref columns)

    Returns:
        modified_df(pd.DataFrame): The modified dataframe with refs replaced
    """
    # Filter out columns and refs that don't exist.
    refs = [ref for ref in refs if ref in column_names]
    remaining_columns = [column for column in column_names if column not in refs]

    # Work on a copy so the caller's dataframe is left untouched.
    new_df = df.copy()
    # Replace references in the columns we are saving out.
    saved_columns = new_df[refs]
    for column_name in remaining_columns:
        for replacing_name in refs:
            # Assign a plain list so values are placed by position. Wrapping them in a
            # fresh pd.Series would align on a new 0..n-1 index and yield NaN for any
            # dataframe whose index is not the default range.
            # (If the data has no n/a values, a plain str.replace per row is MUCH faster.)
            new_df[column_name] = [replace_ref(x, y, replacing_name)
                                   for x, y in zip(new_df[column_name], saved_columns[replacing_name])]

    return new_df[remaining_columns]
40 changes: 34 additions & 6 deletions hed/validator/sidecar_validator.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import copy
import re
import itertools

from hed.errors import ErrorHandler, ErrorContext, SidecarErrors, DefinitionErrors, ColumnErrors
from hed.models import ColumnType
from hed import HedString
from hed.models.column_metadata import ColumnMetadata
from hed.errors.error_reporter import sort_issues
from hed.models.model_constants import DefTagNames
from hed.errors.error_reporter import check_for_any_errors
from hed.models.df_util import replace_ref


# todo: Add/improve validation for definitions being in known columns(right now it just assumes they aren't)
Expand Down Expand Up @@ -53,11 +56,14 @@ def validate(self, sidecar, extra_def_dicts=None, name=None, error_handler=None)
issues += sidecar._extract_definition_issues
issues += sidecar_def_dict.issues

# todo: Break this function up
all_ref_columns = sidecar.get_column_refs()
definition_checks = {}
for column_data in sidecar:
column_name = column_data.column_name
column_data = column_data._get_unvalidated_data()
hed_strings = column_data.get_hed_strings()
is_ref_column = column_name in all_ref_columns
error_handler.push_error_context(ErrorContext.SIDECAR_COLUMN_NAME, column_name)
for key_name, hed_string in hed_strings.items():
new_issues = []
Expand All @@ -68,24 +74,46 @@ def validate(self, sidecar, extra_def_dicts=None, name=None, error_handler=None)

error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj)
new_issues += hed_validator.run_basic_checks(hed_string_obj, allow_placeholders=True)
new_issues += hed_validator.run_full_string_checks(hed_string_obj)

def_check_list = definition_checks.setdefault(column_name, [])
def_check_list.append(hed_string_obj.find_tags({DefTagNames.DEFINITION_KEY}, recursive=True,
include_groups=0))

# Might refine this later - for now just skip checking placeholder counts in definition columns.
if not def_check_list[-1]:
new_issues += self._validate_pound_sign_count(hed_string_obj, column_type=column_data.column_type)

if len(hed_strings) > 1:
error_handler.pop_error_context()
error_handler.add_context_and_filter(new_issues)
issues += new_issues
error_handler.pop_error_context()
error_handler.pop_error_context()
error_handler.pop_error_context() # Hed String

# Only do full string checks on full columns, not partial ref columns.
if not is_ref_column:
refs = re.findall("\{([a-z_\-0-9]+)\}", hed_string, re.IGNORECASE)
refs_strings = {data.column_name: data.get_hed_strings() for data in sidecar}
if "HED" not in refs_strings:
refs_strings["HED"] = ["n/a"]
for combination in itertools.product(*[refs_strings[key] for key in refs]):
new_issues = []
ref_dict = dict(zip(refs, combination))
modified_string = hed_string
for ref in refs:
modified_string = replace_ref(modified_string, ref_dict[ref], ref)
hed_string_obj = HedString(modified_string, hed_schema=self._schema, def_dict=sidecar_def_dict)

error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj)
new_issues += hed_validator.run_full_string_checks(hed_string_obj)
error_handler.add_context_and_filter(new_issues)
issues += new_issues
error_handler.pop_error_context() # Hed string
if len(hed_strings) > 1:
error_handler.pop_error_context() # Category key

error_handler.pop_error_context() # Column Name
issues += self._check_definitions_bad_spot(definition_checks, error_handler)
issues = sort_issues(issues)

error_handler.pop_error_context() # Filename

return issues

def validate_structure(self, sidecar, error_handler):
Expand Down
4 changes: 2 additions & 2 deletions tests/data/sidecar_tests/basic_refs_test.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,15 @@
"stop": "A blue square is displayed to indicate stopping"
},
"HED": {
"go": "Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See",
"go": "Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/Hear",
"stop": "Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure"
}
},
"response_time": {
"LongName": "Response time after stimulus",
"Description": "Time from stimulus presentation until subject presses button",
"Units": "ms",
"HED": "({stim_file}, Event), Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See, Time-value/# s, {trial_type}"
"HED": "({stim_file}, Event), Visual-attribute/Color/CSS-color/Yellow-color/Gold,Action/Perceive/See, Time-value/# s, {trial_type}"
},
"stim_file": {
"LongName": "Stimulus file name",
Expand Down
17 changes: 17 additions & 0 deletions tests/data/sidecar_tests/multiple_category_refs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"cat1": {
"HED": {
"go": "Azure,Action/Perceive/Hear",
"stop": "Azure"
}
},
"cat2": {
"HED": {
"go2": "White-color/Azure,Action/Perceive/Hear",
"stop2": "n/a"
}
},
"combo": {
"HED": "{cat1},{cat2}, Event, Time-interval/# s"
}
}
Loading

0 comments on commit a7fd57e

Please sign in to comment.