Merge pull request #880 from IanCa/develop

Update sidecar validation to check fully combined hed strings
hed-standard · Mar 7, 2024 · f000099 · f000099
2 parents 50a6df1 + 55490f2
commit f000099
Show file tree

Hide file tree

Showing 9 changed files with 284 additions and 215 deletions.
diff --git a/hed/models/base_input.py b/hed/models/base_input.py
@@ -1,7 +1,6 @@
 """
 Superclass representing a basic columnar file.
 """
-import re
 import os
 
 import openpyxl
@@ -11,6 +10,8 @@
 from hed.errors.exceptions import HedFileError, HedExceptions
 import pandas as pd
 
+from hed.models.df_util import _handle_curly_braces_refs
+
 
 class BaseInput:
     """ Superclass representing a basic columnar file. """
@@ -417,7 +418,7 @@ def assemble(self, mapper=None, skip_curly_braces=False):
         transformers, _ = mapper.get_transformers()
         refs = self.get_column_refs()
         column_names = list(transformers)
-        return self._handle_curly_braces_refs(all_columns, refs, column_names)
+        return _handle_curly_braces_refs(all_columns, refs, column_names)
 
     def _handle_transforms(self, mapper):
         transformers, need_categorical = mapper.get_transformers()
@@ -435,69 +436,6 @@ def _handle_transforms(self, mapper):
 
         return all_columns
 
-    @staticmethod
-    def _replace_ref(text, newvalue, column_ref):
-        """ Replace column ref in x with y.  If it's n/a, delete extra commas/parentheses.
-
-        Note: This function could easily be updated to handle non-curly brace values, but it's faster this way.
-        Parameters:
-            text (str): The input string containing the ref enclosed in curly braces.
-            newvalue (str): The replacement value for the ref.
-            column_ref (str): The ref to be replaced, without curly braces.
-
-        Returns:
-            str: The modified string with the ref replaced or removed.
-        """
-        # If it's not n/a, we can just replace directly.
-        if newvalue != "n/a":
-            return text.replace(f"{{{column_ref}}}", newvalue)
-
-        def _remover(match):
-            p1 = match.group("p1").count("(")
-            p2 = match.group("p2").count(")")
-            if p1 > p2:  # We have more starting parens than ending.  Make sure we don't remove comma before
-                output = match.group("c1") + "(" * (p1 - p2)
-            elif p2 > p1:  # We have more ending parens.  Make sure we don't remove comma after
-                output = ")" * (p2 - p1) + match.group("c2")
-            else:
-                c1 = match.group("c1")
-                c2 = match.group("c2")
-                if c1:
-                    c1 = ""
-                elif c2:
-                    c2 = ""
-                output = c1 + c2
-
-            return output
-
-        # this finds all surrounding commas and parentheses to a reference.
-        # c1/c2 contain the comma(and possibly spaces) separating this ref from other tags
-        # p1/p2 contain the parentheses directly surrounding the tag
-        # All four groups can have spaces.
-        pattern = r'(?P<c1>[\s,]*)(?P<p1>[(\s]*)\{' + column_ref + r'\}(?P<p2>[\s)]*)(?P<c2>[\s,]*)'
-        return re.sub(pattern, _remover, text)
-
-    @staticmethod
-    def _handle_curly_braces_refs(df, refs, column_names):
-        """ Plug in curly braces with other columns. """
-        # Filter out columns and refs that don't exist.
-        refs = [ref for ref in refs if ref in column_names]
-        remaining_columns = [column for column in column_names if column not in refs]
-
-        # Replace references in the columns we are saving out.
-        saved_columns = df[refs]
-        for column_name in remaining_columns:
-            for replacing_name in refs:
-                # If the data has no n/a values, this version is MUCH faster.
-                # column_name_brackets = f"{{{replacing_name}}}"
-                # df[column_name] = pd.Series(x.replace(column_name_brackets, y) for x, y
-                #                             in zip(df[column_name], saved_columns[replacing_name]))
-                df[column_name] = pd.Series(BaseInput._replace_ref(x, y, replacing_name) for x, y
-                                            in zip(df[column_name], saved_columns[replacing_name]))
-        df = df[remaining_columns]
-
-        return df
-
     @staticmethod
     def combine_dataframe(dataframe):
         """ Combine all columns in the given dataframe into a single HED string series,

diff --git a/hed/models/df_util.py b/hed/models/df_util.py
@@ -1,4 +1,5 @@
 """ Utilities for assembly and conversion of HED strings to different forms. """
+import re
 from functools import partial
 import pandas as pd
 from hed.models.hed_string import HedString
@@ -144,3 +145,79 @@ def sort_dataframe_by_onsets(df):
 
         return df_copy
     return df
+
+
+def replace_ref(text, newvalue, column_ref):
+    """ Replace a curly-brace column ref in a string with a value.
+
+    Every occurrence of "{column_ref}" in text is replaced with newvalue.  If newvalue is "n/a",
+    the ref is removed instead, along with the extra commas/parentheses it would leave behind.
+
+    Parameters:
+        text (str): The input string containing the ref enclosed in curly braces.
+        newvalue (str): The replacement value for the ref.
+        column_ref (str): The ref to be replaced, without curly braces.
+
+    Returns:
+        str: The modified string with the ref replaced or removed.
+    """
+    # Note: This function could easily be updated to handle non-curly brace values, but it seemed faster this way
+
+    # If it's not n/a, we can just replace directly.
+    if newvalue != "n/a":
+        return text.replace(f"{{{column_ref}}}", newvalue)
+
+    def _remover(match):
+        """ Return what should be left behind once the matched ref is deleted. """
+        p1 = match.group("p1").count("(")
+        p2 = match.group("p2").count(")")
+        if p1 > p2:  # We have more starting parens than ending.  Make sure we don't remove comma before
+            return match.group("c1") + "(" * (p1 - p2)
+        if p2 > p1:  # We have more ending parens.  Make sure we don't remove comma after
+            return ")" * (p2 - p1) + match.group("c2")
+
+        # Balanced parens: the ref and its own parentheses vanish entirely.
+        # Only one separator may survive: keep the trailing one if there was a leading one, else keep nothing.
+        return match.group("c2") if match.group("c1") else ""
+
+    # this finds all surrounding commas and parentheses to a reference.
+    # c1/c2 contain the comma(and possibly spaces) separating this ref from other tags
+    # p1/p2 contain the parentheses directly surrounding the tag
+    # All four groups can have spaces.
+    # The ref is escaped so that names containing regex metacharacters only match themselves.
+    pattern = r'(?P<c1>[\s,]*)(?P<p1>[(\s]*)\{' + re.escape(column_ref) + r'\}(?P<p2>[\s)]*)(?P<c2>[\s,]*)'
+    return re.sub(pattern, _remover, text)
+
+
+def _handle_curly_braces_refs(df, refs, column_names):
+    """ Fill in the curly-brace column refs in the dataframe.
+
+        You probably shouldn't call this function directly, but rather use BaseInput.assemble.
+
+    Parameters:
+        df(pd.DataFrame): The source dataframe.  It is copied, not modified in place.
+        refs(list or pd.Series): a list of column refs to replace(without {})
+        column_names(list): the columns we are interested in(should include all ref columns)
+
+    Returns:
+        modified_df(pd.DataFrame): A new dataframe holding only the non-ref columns, with refs replaced
+    """
+    # Filter out columns and refs that don't exist.
+    refs = [ref for ref in refs if ref in column_names]
+    remaining_columns = [column for column in column_names if column not in refs]
+
+    new_df = df.copy()
+    # Replace references in the columns we are saving out.
+    saved_columns = new_df[refs]
+    for column_name in remaining_columns:
+        for replacing_name in refs:
+            # If the data has no n/a values, a plain str.replace of "{replacing_name}" is MUCH faster
+            # than replace_ref, but it cannot clean up the commas/parentheses left by n/a values.
+            #
+            # Assign a list rather than a pd.Series: a new Series gets a fresh 0..n-1 index and column
+            # assignment aligns on index, which would scramble rows when df has a non-default index.
+            new_df[column_name] = [replace_ref(x, y, replacing_name) for x, y
+                                   in zip(new_df[column_name], saved_columns[replacing_name])]
+    new_df = new_df[remaining_columns]
+
+    return new_df
diff --git a/hed/validator/sidecar_validator.py b/hed/validator/sidecar_validator.py
@@ -1,12 +1,15 @@
 import copy
 import re
+import itertools
+
 from hed.errors import ErrorHandler, ErrorContext, SidecarErrors, DefinitionErrors, ColumnErrors
 from hed.models import ColumnType
 from hed import HedString
 from hed.models.column_metadata import ColumnMetadata
 from hed.errors.error_reporter import sort_issues
 from hed.models.model_constants import DefTagNames
 from hed.errors.error_reporter import check_for_any_errors
+from hed.models.df_util import replace_ref
 
 
 # todo: Add/improve validation for definitions being in known columns(right now it just assumes they aren't)
@@ -53,11 +56,14 @@ def validate(self, sidecar, extra_def_dicts=None, name=None, error_handler=None)
         issues += sidecar._extract_definition_issues
         issues += sidecar_def_dict.issues
 
+        # todo: Break this function up
+        all_ref_columns = sidecar.get_column_refs()
         definition_checks = {}
         for column_data in sidecar:
             column_name = column_data.column_name
             column_data = column_data._get_unvalidated_data()
             hed_strings = column_data.get_hed_strings()
+            is_ref_column = column_name in all_ref_columns
             error_handler.push_error_context(ErrorContext.SIDECAR_COLUMN_NAME, column_name)
             for key_name, hed_string in hed_strings.items():
                 new_issues = []
@@ -68,24 +74,46 @@ def validate(self, sidecar, extra_def_dicts=None, name=None, error_handler=None)
 
                 error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj)
                 new_issues += hed_validator.run_basic_checks(hed_string_obj, allow_placeholders=True)
-                new_issues += hed_validator.run_full_string_checks(hed_string_obj)
-
                 def_check_list = definition_checks.setdefault(column_name, [])
                 def_check_list.append(hed_string_obj.find_tags({DefTagNames.DEFINITION_KEY}, recursive=True,
                                                                include_groups=0))
+
                 # Might refine this later - for now just skip checking placeholder counts in definition columns.
                 if not def_check_list[-1]:
                     new_issues += self._validate_pound_sign_count(hed_string_obj, column_type=column_data.column_type)
 
-                if len(hed_strings) > 1:
-                    error_handler.pop_error_context()
                 error_handler.add_context_and_filter(new_issues)
                 issues += new_issues
-            error_handler.pop_error_context()
-        error_handler.pop_error_context()
+                error_handler.pop_error_context()  # Hed String
+
+                # Only do full string checks on full columns, not partial ref columns.
+                if not is_ref_column:
+                    refs = re.findall("\{([a-z_\-0-9]+)\}", hed_string, re.IGNORECASE)
+                    refs_strings = {data.column_name: data.get_hed_strings() for data in sidecar}
+                    if "HED" not in refs_strings:
+                        refs_strings["HED"] = ["n/a"]
+                    for combination in itertools.product(*[refs_strings[key] for key in refs]):
+                        new_issues = []
+                        ref_dict = dict(zip(refs, combination))
+                        modified_string = hed_string
+                        for ref in refs:
+                            modified_string = replace_ref(modified_string, ref_dict[ref], ref)
+                        hed_string_obj = HedString(modified_string, hed_schema=self._schema, def_dict=sidecar_def_dict)
+
+                        error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj)
+                        new_issues += hed_validator.run_full_string_checks(hed_string_obj)
+                        error_handler.add_context_and_filter(new_issues)
+                        issues += new_issues
+                        error_handler.pop_error_context()  # Hed string
+                if len(hed_strings) > 1:
+                    error_handler.pop_error_context()  # Category key
+
+            error_handler.pop_error_context()  # Column Name
         issues += self._check_definitions_bad_spot(definition_checks, error_handler)
         issues = sort_issues(issues)
 
+        error_handler.pop_error_context()  # Filename
+
         return issues
 
     def validate_structure(self, sidecar, error_handler):

diff --git a/tests/data/sidecar_tests/basic_refs_test.json b/tests/data/sidecar_tests/basic_refs_test.json
@@ -7,15 +7,15 @@
           "stop": "A blue square is displayed to indicate stopping"
        },
        "HED": {
-          "go": "Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See",
+          "go": "Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/Hear",
           "stop": "Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure"
        }
    },
    "response_time": {
        "LongName": "Response time after stimulus",
        "Description": "Time from stimulus presentation until subject presses button",
        "Units": "ms",
-       "HED": "({stim_file}, Event), Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See, Time-value/# s, {trial_type}"
+       "HED": "({stim_file}, Event), Visual-attribute/Color/CSS-color/Yellow-color/Gold,Action/Perceive/See, Time-value/# s, {trial_type}"
    },
    "stim_file": {
        "LongName": "Stimulus file name",

diff --git a/tests/data/sidecar_tests/multiple_category_refs.json b/tests/data/sidecar_tests/multiple_category_refs.json
@@ -0,0 +1,17 @@
+{
+   "cat1": {
+       "HED": {
+          "go": "Azure,Action/Perceive/Hear",
+          "stop": "Azure"
+       }
+   },
+  "cat2": {
+       "HED": {
+          "go2": "White-color/Azure,Action/Perceive/Hear",
+          "stop2": "n/a"
+       }
+   },
+  "combo": {
+    "HED": "{cat1},{cat2}, Event, Time-interval/# s"
+  }
+}