Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update sidecar validation to check fully combined hed strings #880

Merged
merged 2 commits into from
Mar 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 3 additions & 65 deletions hed/models/base_input.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""
Superclass representing a basic columnar file.
"""
import re
import os

import openpyxl
Expand All @@ -11,6 +10,8 @@
from hed.errors.exceptions import HedFileError, HedExceptions
import pandas as pd

from hed.models.df_util import _handle_curly_braces_refs


class BaseInput:
""" Superclass representing a basic columnar file. """
Expand Down Expand Up @@ -417,7 +418,7 @@ def assemble(self, mapper=None, skip_curly_braces=False):
transformers, _ = mapper.get_transformers()
refs = self.get_column_refs()
column_names = list(transformers)
return self._handle_curly_braces_refs(all_columns, refs, column_names)
return _handle_curly_braces_refs(all_columns, refs, column_names)

def _handle_transforms(self, mapper):
transformers, need_categorical = mapper.get_transformers()
Expand All @@ -435,69 +436,6 @@ def _handle_transforms(self, mapper):

return all_columns

@staticmethod
def _replace_ref(text, newvalue, column_ref):
    """ Replace the curly-brace column ref in text with newvalue.

    If newvalue is "n/a" the ref is removed instead, along with the comma and
    parentheses around it that would otherwise be left dangling.

    Note: This function could easily be updated to handle non-curly brace values, but it's faster this way.

    Parameters:
        text (str): The input string containing the ref enclosed in curly braces.
        newvalue (str): The replacement value for the ref.
        column_ref (str): The ref to be replaced, without curly braces.

    Returns:
        str: The modified string with the ref replaced or removed.
    """
    # If it's not n/a, we can just replace directly.
    if newvalue != "n/a":
        return text.replace(f"{{{column_ref}}}", newvalue)

    def _remover(match):
        """ Return the text that replaces one matched ref plus its surrounding punctuation. """
        p1 = match.group("p1").count("(")
        p2 = match.group("p2").count(")")
        if p1 > p2:
            # More opening than closing parens: the ref was first in its group.
            # Keep the comma before the group and the unmatched "(".
            return match.group("c1") + "(" * (p1 - p2)
        if p2 > p1:
            # More closing than opening parens: the ref was last in its group.
            # Keep the unmatched ")" and the comma after the group.
            return ")" * (p2 - p1) + match.group("c2")

        # Balanced (or no) parens: the ref and its parens vanish completely,
        # so exactly one of the two neighbouring separators has to go as well.
        c1 = match.group("c1")
        c2 = match.group("c2")
        if c1:
            c1 = ""
        elif c2:
            c2 = ""
        return c1 + c2

    # This finds all surrounding commas and parentheses to a reference.
    # c1/c2 contain the comma(and possibly spaces) separating this ref from other tags
    # p1/p2 contain the parentheses directly surrounding the tag
    # All four groups can have spaces.
    # The ref is escaped so column names containing regex metacharacters match literally.
    pattern = (r'(?P<c1>[\s,]*)(?P<p1>[(\s]*)\{' + re.escape(column_ref)
               + r'\}(?P<p2>[\s)]*)(?P<c2>[\s,]*)')
    return re.sub(pattern, _remover, text)

@staticmethod
def _handle_curly_braces_refs(df, refs, column_names):
    """ Plug in curly braces with other columns.

    Parameters:
        df (pd.DataFrame): The dataframe holding both the ref columns and the columns that use them.
        refs (list or pd.Series): Column refs to substitute (without {}).
        column_names (list): The columns of interest (should include all ref columns).

    Returns:
        pd.DataFrame: A new dataframe containing only the non-ref columns, with refs filled in.
    """
    # Filter out columns and refs that don't exist.
    refs = [ref for ref in refs if ref in column_names]
    remaining_columns = [column for column in column_names if column not in refs]

    # Work on a copy so the caller's dataframe is not modified as a side effect.
    new_df = df.copy()
    saved_columns = new_df[refs]
    for column_name in remaining_columns:
        for replacing_name in refs:
            # A plain list is assigned positionally. Wrapping the values in a new pd.Series
            # would give them a default RangeIndex, and pandas would then align on labels,
            # producing NaN for any dataframe whose index is not 0..n-1.
            # (If the data has no n/a values, a direct str.replace on "{name}" is much faster.)
            new_df[column_name] = [BaseInput._replace_ref(x, y, replacing_name)
                                   for x, y in zip(new_df[column_name], saved_columns[replacing_name])]

    return new_df[remaining_columns]

@staticmethod
def combine_dataframe(dataframe):
""" Combine all columns in the given dataframe into a single HED string series,
Expand Down
77 changes: 77 additions & 0 deletions hed/models/df_util.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
""" Utilities for assembly and conversion of HED strings to different forms. """
import re
from functools import partial
import pandas as pd
from hed.models.hed_string import HedString
Expand Down Expand Up @@ -144,3 +145,79 @@ def sort_dataframe_by_onsets(df):

return df_copy
return df


def replace_ref(text, newvalue, column_ref):
    """ Replace the curly-brace column ref in text with newvalue.

    If newvalue is "n/a" the ref is removed instead, along with the comma and
    parentheses around it that would otherwise be left dangling.

    Parameters:
        text (str): The input string containing the ref enclosed in curly braces.
        newvalue (str): The replacement value for the ref.
        column_ref (str): The ref to be replaced, without curly braces.

    Returns:
        str: The modified string with the ref replaced or removed.
    """
    # Note: This function could easily be updated to handle non-curly brace values, but it seemed faster this way

    # If it's not n/a, we can just replace directly.
    if newvalue != "n/a":
        return text.replace(f"{{{column_ref}}}", newvalue)

    def _remover(match):
        """ Return the text that replaces one matched ref plus its surrounding punctuation. """
        p1 = match.group("p1").count("(")
        p2 = match.group("p2").count(")")
        if p1 > p2:
            # More opening than closing parens: the ref was first in its group.
            # Keep the comma before the group and the unmatched "(".
            return match.group("c1") + "(" * (p1 - p2)
        if p2 > p1:
            # More closing than opening parens: the ref was last in its group.
            # Keep the unmatched ")" and the comma after the group.
            return ")" * (p2 - p1) + match.group("c2")

        # Balanced (or no) parens: the ref and its parens vanish completely,
        # so exactly one of the two neighbouring separators has to go as well.
        c1 = match.group("c1")
        c2 = match.group("c2")
        if c1:
            c1 = ""
        elif c2:
            c2 = ""
        return c1 + c2

    # This finds all surrounding commas and parentheses to a reference.
    # c1/c2 contain the comma(and possibly spaces) separating this ref from other tags
    # p1/p2 contain the parentheses directly surrounding the tag
    # All four groups can have spaces.
    # The ref is escaped so column names containing regex metacharacters match literally.
    pattern = (r'(?P<c1>[\s,]*)(?P<p1>[(\s]*)\{' + re.escape(column_ref)
               + r'\}(?P<p2>[\s)]*)(?P<c2>[\s,]*)')
    return re.sub(pattern, _remover, text)


def _handle_curly_braces_refs(df, refs, column_names):
    """ Fill in the curly-brace column refs in the dataframe.

    You probably shouldn't call this function directly, but rather use base input.

    Parameters:
        df(pd.DataFrame): The dataframe to read from (it is copied, not modified)
        refs(list or pd.Series): a list of column refs to replace(without {})
        column_names(list): the columns we are interested in(should include all ref columns)

    Returns:
        modified_df(pd.DataFrame): A new dataframe holding only the non-ref columns, with refs replaced
    """
    # Filter out columns and refs that don't exist.
    refs = [ref for ref in refs if ref in column_names]
    remaining_columns = [column for column in column_names if column not in refs]

    new_df = df.copy()
    # Replace references in the columns we are saving out.
    saved_columns = new_df[refs]
    for column_name in remaining_columns:
        for replacing_name in refs:
            # A plain list is assigned positionally. Wrapping the values in a new pd.Series
            # would give them a default RangeIndex, and pandas would then align on labels,
            # producing NaN for any dataframe whose index is not 0..n-1.
            # (If the data has no n/a values, a direct str.replace on "{name}" is much faster.)
            new_df[column_name] = [replace_ref(x, y, replacing_name)
                                   for x, y in zip(new_df[column_name], saved_columns[replacing_name])]

    return new_df[remaining_columns]
40 changes: 34 additions & 6 deletions hed/validator/sidecar_validator.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import copy
import re
import itertools

from hed.errors import ErrorHandler, ErrorContext, SidecarErrors, DefinitionErrors, ColumnErrors
from hed.models import ColumnType
from hed import HedString
from hed.models.column_metadata import ColumnMetadata
from hed.errors.error_reporter import sort_issues
from hed.models.model_constants import DefTagNames
from hed.errors.error_reporter import check_for_any_errors
from hed.models.df_util import replace_ref


# todo: Add/improve validation for definitions being in known columns(right now it just assumes they aren't)
Expand Down Expand Up @@ -53,11 +56,14 @@ def validate(self, sidecar, extra_def_dicts=None, name=None, error_handler=None)
issues += sidecar._extract_definition_issues
issues += sidecar_def_dict.issues

# todo: Break this function up
all_ref_columns = sidecar.get_column_refs()
definition_checks = {}
for column_data in sidecar:
column_name = column_data.column_name
column_data = column_data._get_unvalidated_data()
hed_strings = column_data.get_hed_strings()
is_ref_column = column_name in all_ref_columns
error_handler.push_error_context(ErrorContext.SIDECAR_COLUMN_NAME, column_name)
for key_name, hed_string in hed_strings.items():
new_issues = []
Expand All @@ -68,24 +74,46 @@ def validate(self, sidecar, extra_def_dicts=None, name=None, error_handler=None)

error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj)
new_issues += hed_validator.run_basic_checks(hed_string_obj, allow_placeholders=True)
new_issues += hed_validator.run_full_string_checks(hed_string_obj)

def_check_list = definition_checks.setdefault(column_name, [])
def_check_list.append(hed_string_obj.find_tags({DefTagNames.DEFINITION_KEY}, recursive=True,
include_groups=0))

# Might refine this later - for now just skip checking placeholder counts in definition columns.
if not def_check_list[-1]:
new_issues += self._validate_pound_sign_count(hed_string_obj, column_type=column_data.column_type)

if len(hed_strings) > 1:
error_handler.pop_error_context()
error_handler.add_context_and_filter(new_issues)
issues += new_issues
error_handler.pop_error_context()
error_handler.pop_error_context()
error_handler.pop_error_context() # Hed String

# Only do full string checks on full columns, not partial ref columns.
if not is_ref_column:
refs = re.findall("\{([a-z_\-0-9]+)\}", hed_string, re.IGNORECASE)
refs_strings = {data.column_name: data.get_hed_strings() for data in sidecar}
if "HED" not in refs_strings:
refs_strings["HED"] = ["n/a"]
for combination in itertools.product(*[refs_strings[key] for key in refs]):
new_issues = []
ref_dict = dict(zip(refs, combination))
modified_string = hed_string
for ref in refs:
modified_string = replace_ref(modified_string, ref_dict[ref], ref)
hed_string_obj = HedString(modified_string, hed_schema=self._schema, def_dict=sidecar_def_dict)

error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj)
new_issues += hed_validator.run_full_string_checks(hed_string_obj)
error_handler.add_context_and_filter(new_issues)
issues += new_issues
error_handler.pop_error_context() # Hed string
if len(hed_strings) > 1:
error_handler.pop_error_context() # Category key

error_handler.pop_error_context() # Column Name
issues += self._check_definitions_bad_spot(definition_checks, error_handler)
issues = sort_issues(issues)

error_handler.pop_error_context() # Filename

return issues

def validate_structure(self, sidecar, error_handler):
Expand Down
4 changes: 2 additions & 2 deletions tests/data/sidecar_tests/basic_refs_test.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,15 @@
"stop": "A blue square is displayed to indicate stopping"
},
"HED": {
"go": "Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See",
"go": "Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/Hear",
"stop": "Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure"
}
},
"response_time": {
"LongName": "Response time after stimulus",
"Description": "Time from stimulus presentation until subject presses button",
"Units": "ms",
"HED": "({stim_file}, Event), Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See, Time-value/# s, {trial_type}"
"HED": "({stim_file}, Event), Visual-attribute/Color/CSS-color/Yellow-color/Gold,Action/Perceive/See, Time-value/# s, {trial_type}"
},
"stim_file": {
"LongName": "Stimulus file name",
Expand Down
17 changes: 17 additions & 0 deletions tests/data/sidecar_tests/multiple_category_refs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"cat1": {
"HED": {
"go": "Azure,Action/Perceive/Hear",
"stop": "Azure"
}
},
"cat2": {
"HED": {
"go2": "White-color/Azure,Action/Perceive/Hear",
"stop2": "n/a"
}
},
"combo": {
"HED": "{cat1},{cat2}, Event, Time-interval/# s"
}
}
Loading
Loading