From a7fd57e8b778801a567cae3146a19f7739cf36a7 Mon Sep 17 00:00:00 2001
From: IanCa <ianrcallanan@gmail.com>
Date: Wed, 6 Mar 2024 17:46:43 -0600
Subject: [PATCH 1/2] Update sidecar validation to check fully combined hed
 strings

---
 hed/models/base_input.py                      |  68 +--------
 hed/models/df_util.py                         |  77 ++++++++++
 hed/validator/sidecar_validator.py            |  40 ++++-
 tests/data/sidecar_tests/basic_refs_test.json |   4 +-
 .../sidecar_tests/multiple_category_refs.json |  17 +++
 tests/models/test_base_input.py               | 141 ------------------
 tests/models/test_df_util.py                  | 139 +++++++++++++++++
 tests/validator/test_sidecar_validator.py     |  11 ++
 8 files changed, 283 insertions(+), 214 deletions(-)
 create mode 100644 tests/data/sidecar_tests/multiple_category_refs.json
diff --git a/hed/models/base_input.py b/hed/models/base_input.py
index 41a762cd..024f8e27 100644
--- a/hed/models/base_input.py
+++ b/hed/models/base_input.py
@@ -1,7 +1,6 @@
 """
 Superclass representing a basic columnar file.
 """
-import re
 import os
 
 import openpyxl
@@ -11,6 +10,8 @@
 from hed.errors.exceptions import HedFileError, HedExceptions
 import pandas as pd
 
+from hed.models.df_util import _handle_curly_braces_refs
+
 
 class BaseInput:
     """ Superclass representing a basic columnar file. """
@@ -417,7 +418,7 @@ def assemble(self, mapper=None, skip_curly_braces=False):
         transformers, _ = mapper.get_transformers()
         refs = self.get_column_refs()
         column_names = list(transformers)
-        return self._handle_curly_braces_refs(all_columns, refs, column_names)
+        return _handle_curly_braces_refs(all_columns, refs, column_names)
 
     def _handle_transforms(self, mapper):
         transformers, need_categorical = mapper.get_transformers()
@@ -435,69 +436,6 @@ def _handle_transforms(self, mapper):
 
         return all_columns
 
-    @staticmethod
-    def _replace_ref(text, newvalue, column_ref):
-        """ Replace column ref in x with y.  If it's n/a, delete extra commas/parentheses.
-
-        Note: This function could easily be updated to handle non-curly brace values, but it's faster this way.
-        Parameters:
-            text (str): The input string containing the ref enclosed in curly braces.
-            newvalue (str): The replacement value for the ref.
-            column_ref (str): The ref to be replaced, without curly braces.
-
-        Returns:
-            str: The modified string with the ref replaced or removed.
-        """
-        # If it's not n/a, we can just replace directly.
-        if newvalue != "n/a":
-            return text.replace(f"{{{column_ref}}}", newvalue)
-
-        def _remover(match):
-            p1 = match.group("p1").count("(")
-            p2 = match.group("p2").count(")")
-            if p1 > p2:  # We have more starting parens than ending.  Make sure we don't remove comma before
-                output = match.group("c1") + "(" * (p1 - p2)
-            elif p2 > p1:  # We have more ending parens.  Make sure we don't remove comma after
-                output = ")" * (p2 - p1) + match.group("c2")
-            else:
-                c1 = match.group("c1")
-                c2 = match.group("c2")
-                if c1:
-                    c1 = ""
-                elif c2:
-                    c2 = ""
-                output = c1 + c2
-
-            return output
-
-        # this finds all surrounding commas and parentheses to a reference.
-        # c1/c2 contain the comma(and possibly spaces) separating this ref from other tags
-        # p1/p2 contain the parentheses directly surrounding the tag
-        # All four groups can have spaces.
-        pattern = r'(?P<c1>[\s,]*)(?P<p1>[(\s]*)\{' + column_ref + r'\}(?P<p2>[\s)]*)(?P<c2>[\s,]*)'
-        return re.sub(pattern, _remover, text)
-
-    @staticmethod
-    def _handle_curly_braces_refs(df, refs, column_names):
-        """ Plug in curly braces with other columns. """
-        # Filter out columns and refs that don't exist.
-        refs = [ref for ref in refs if ref in column_names]
-        remaining_columns = [column for column in column_names if column not in refs]
-
-        # Replace references in the columns we are saving out.
-        saved_columns = df[refs]
-        for column_name in remaining_columns:
-            for replacing_name in refs:
-                # If the data has no n/a values, this version is MUCH faster.
-                # column_name_brackets = f"{{{replacing_name}}}"
-                # df[column_name] = pd.Series(x.replace(column_name_brackets, y) for x, y
-                #                             in zip(df[column_name], saved_columns[replacing_name]))
-                df[column_name] = pd.Series(BaseInput._replace_ref(x, y, replacing_name) for x, y
-                                            in zip(df[column_name], saved_columns[replacing_name]))
-        df = df[remaining_columns]
-
-        return df
-
     @staticmethod
     def combine_dataframe(dataframe):
         """ Combine all columns in the given dataframe into a single HED string series,
diff --git a/hed/models/df_util.py b/hed/models/df_util.py
index 7811b6fe..0364c539 100644
--- a/hed/models/df_util.py
+++ b/hed/models/df_util.py
@@ -1,4 +1,5 @@
 """ Utilities for assembly and conversion of HED strings to different forms. """
+import re
 from functools import partial
 import pandas as pd
 from hed.models.hed_string import HedString
@@ -144,3 +145,79 @@ def sort_dataframe_by_onsets(df):
 
         return df_copy
     return df
+
+
+def replace_ref(text, newvalue, column_ref):
+    """ Replace every {column_ref} in text with newvalue.
+
+    If newvalue is n/a the ref is deleted along with the commas/parentheses that only served it.
+
+    Parameters:
+        text (str): The input string containing the ref enclosed in curly braces.
+        newvalue (str): The replacement value for the ref, or "n/a" to remove the ref.
+        column_ref (str): The ref to be replaced, without curly braces.
+
+    Returns:
+        str: The modified string with the ref replaced or removed.
+    """
+    # Note: This function could easily be updated to handle non-curly brace values, but it seemed faster this way
+    # If it's not n/a, we can just replace directly.
+    if newvalue != "n/a":
+        return text.replace(f"{{{column_ref}}}", newvalue)
+
+    def _remover(match):
+        """ Return what must remain of the matched separators/parentheses once the ref is gone. """
+        opening = match.group("p1").count("(")
+        closing = match.group("p2").count(")")
+        if opening > closing:  # Unmatched "(" opens a larger group: keep the comma before it.
+            return match.group("c1") + "(" * (opening - closing)
+        if closing > opening:  # Unmatched ")" closes a larger group: keep the comma after it.
+            return ")" * (closing - opening) + match.group("c2")
+        # Balanced parentheses wrapped only the ref, so they vanish with it.  With a separator on
+        # both sides the leading one is dropped and the trailing one keeps the neighbors apart;
+        # with a separator on one side only, that lone separator is dropped as well.
+        if match.group("c1"):
+            return match.group("c2")
+        return ""
+
+    # this finds all surrounding commas and parentheses to a reference.
+    # c1/c2 contain the comma(and possibly spaces) separating this ref from other tags
+    # p1/p2 contain the parentheses directly surrounding the tag
+    # All four groups can have spaces.
+    # The ref is escaped so a column name is always matched literally, never as regex syntax.
+    escaped_ref = re.escape(column_ref)
+    pattern = r'(?P<c1>[\s,]*)(?P<p1>[(\s]*)\{' + escaped_ref + r'\}(?P<p2>[\s)]*)(?P<c2>[\s,]*)'
+    return re.sub(pattern, _remover, text)
+
+
+def _handle_curly_braces_refs(df, refs, column_names):
+    """ Return a copy of df with curly brace refs filled in and the referenced columns dropped.
+
+        You probably shouldn't call this function directly, but rather use base input.
+
+    Parameters:
+        df(pd.DataFrame): The source dataframe.  It is copied, not modified.
+        refs(list or pd.Series): a list of column refs to replace(without {})
+        column_names(list): the columns we are interested in(should include all ref columns)
+
+    Returns:
+        modified_df(pd.DataFrame): A new dataframe holding only the non-ref columns, with refs replaced
+    """
+    # Filter out columns and refs that don't exist.
+    refs = [ref for ref in refs if ref in column_names]
+    remaining_columns = [column for column in column_names if column not in refs]
+
+    new_df = df.copy()
+    # Pull out the ref columns; their row values are what gets plugged into the other columns.
+    saved_columns = new_df[refs]
+    for column_name in remaining_columns:
+        for replacing_name in refs:
+            # replace_ref is used (rather than a plain str.replace) so n/a values remove the ref
+            # together with its separators.  The result is built on the frame's own index: a
+            # fresh default index would be re-aligned on assignment and break non-default indexes.
+            replaced = [replace_ref(x, y, replacing_name) for x, y
+                        in zip(new_df[column_name], saved_columns[replacing_name])]
+            new_df[column_name] = pd.Series(replaced, index=new_df.index)
+    new_df = new_df[remaining_columns]
+
+    return new_df
diff --git a/hed/validator/sidecar_validator.py b/hed/validator/sidecar_validator.py
index 782f031c..6f3b5b1e 100644
--- a/hed/validator/sidecar_validator.py
+++ b/hed/validator/sidecar_validator.py
@@ -1,5 +1,7 @@
 import copy
 import re
+import itertools
+
 from hed.errors import ErrorHandler, ErrorContext, SidecarErrors, DefinitionErrors, ColumnErrors
 from hed.models import ColumnType
 from hed import HedString
@@ -7,6 +9,7 @@
 from hed.errors.error_reporter import sort_issues
 from hed.models.model_constants import DefTagNames
 from hed.errors.error_reporter import check_for_any_errors
+from hed.models.df_util import replace_ref
 
 
 # todo: Add/improve validation for definitions being in known columns(right now it just assumes they aren't)
@@ -53,11 +56,14 @@ def validate(self, sidecar, extra_def_dicts=None, name=None, error_handler=None)
         issues += sidecar._extract_definition_issues
         issues += sidecar_def_dict.issues
 
+        # todo: Break this function up
+        all_ref_columns = sidecar.get_column_refs()
         definition_checks = {}
         for column_data in sidecar:
             column_name = column_data.column_name
             column_data = column_data._get_unvalidated_data()
             hed_strings = column_data.get_hed_strings()
+            is_ref_column = column_name in all_ref_columns
             error_handler.push_error_context(ErrorContext.SIDECAR_COLUMN_NAME, column_name)
             for key_name, hed_string in hed_strings.items():
                 new_issues = []
@@ -68,24 +74,46 @@ def validate(self, sidecar, extra_def_dicts=None, name=None, error_handler=None)
 
                 error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj)
                 new_issues += hed_validator.run_basic_checks(hed_string_obj, allow_placeholders=True)
-                new_issues += hed_validator.run_full_string_checks(hed_string_obj)
-
                 def_check_list = definition_checks.setdefault(column_name, [])
                 def_check_list.append(hed_string_obj.find_tags({DefTagNames.DEFINITION_KEY}, recursive=True,
                                                                include_groups=0))
+
                 # Might refine this later - for now just skip checking placeholder counts in definition columns.
                 if not def_check_list[-1]:
                     new_issues += self._validate_pound_sign_count(hed_string_obj, column_type=column_data.column_type)
 
-                if len(hed_strings) > 1:
-                    error_handler.pop_error_context()
                 error_handler.add_context_and_filter(new_issues)
                 issues += new_issues
-            error_handler.pop_error_context()
-        error_handler.pop_error_context()
+                error_handler.pop_error_context()  # Hed String
+
+                # Only do full string checks on full columns, not partial ref columns.
+                if not is_ref_column:
+                    refs = re.findall("\{([a-z_\-0-9]+)\}", hed_string, re.IGNORECASE)
+                    refs_strings = {data.column_name: data.get_hed_strings() for data in sidecar}
+                    if "HED" not in refs_strings:
+                        refs_strings["HED"] = ["n/a"]
+                    for combination in itertools.product(*[refs_strings[key] for key in refs]):
+                        new_issues = []
+                        ref_dict = dict(zip(refs, combination))
+                        modified_string = hed_string
+                        for ref in refs:
+                            modified_string = replace_ref(modified_string, ref_dict[ref], ref)
+                        hed_string_obj = HedString(modified_string, hed_schema=self._schema, def_dict=sidecar_def_dict)
+
+                        error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj)
+                        new_issues += hed_validator.run_full_string_checks(hed_string_obj)
+                        error_handler.add_context_and_filter(new_issues)
+                        issues += new_issues
+                        error_handler.pop_error_context()  # Hed string
+                if len(hed_strings) > 1:
+                    error_handler.pop_error_context()  # Category key
+
+            error_handler.pop_error_context()  # Column Name
         issues += self._check_definitions_bad_spot(definition_checks, error_handler)
         issues = sort_issues(issues)
 
+        error_handler.pop_error_context()  # Filename
+
         return issues
 
     def validate_structure(self, sidecar, error_handler):
diff --git a/tests/data/sidecar_tests/basic_refs_test.json b/tests/data/sidecar_tests/basic_refs_test.json
index cd3011ac..a0270cb1 100644
--- a/tests/data/sidecar_tests/basic_refs_test.json
+++ b/tests/data/sidecar_tests/basic_refs_test.json
@@ -7,7 +7,7 @@
           "stop": "A blue square is displayed to indicate stopping"
        },
        "HED": {
-          "go": "Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See",
+          "go": "Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/Hear",
           "stop": "Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure"
        }
    },
@@ -15,7 +15,7 @@
        "LongName": "Response time after stimulus",
        "Description": "Time from stimulus presentation until subject presses button",
        "Units": "ms",
-       "HED": "({stim_file}, Event), Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See, Time-value/# s, {trial_type}"
+       "HED": "({stim_file}, Event), Visual-attribute/Color/CSS-color/Yellow-color/Gold,Action/Perceive/See, Time-value/# s, {trial_type}"
    },
    "stim_file": {
        "LongName": "Stimulus file name",
diff --git a/tests/data/sidecar_tests/multiple_category_refs.json b/tests/data/sidecar_tests/multiple_category_refs.json
new file mode 100644
index 00000000..4e43062b
--- /dev/null
+++ b/tests/data/sidecar_tests/multiple_category_refs.json
@@ -0,0 +1,17 @@
+{
+   "cat1": {
+       "HED": {
+          "go": "Azure,Action/Perceive/Hear",
+          "stop": "Azure"
+       }
+   },
+  "cat2": {
+       "HED": {
+          "go2": "White-color/Azure,Action/Perceive/Hear",
+          "stop2": "n/a"
+       }
+   },
+  "combo": {
+    "HED": "{cat1},{cat2}, Event, Time-interval/# s"
+  }
+}
\ No newline at end of file
diff --git a/tests/models/test_base_input.py b/tests/models/test_base_input.py
index 5ada973d..b6d738e2 100644
--- a/tests/models/test_base_input.py
+++ b/tests/models/test_base_input.py
@@ -129,147 +129,6 @@ def test_sort(self):
         self.assertFalse(df.equals(df2))
 
 
-
-class TestInsertColumns(unittest.TestCase):
-
-    def test_insert_columns_simple(self):
-        df = pd.DataFrame({
-            "column1": ["{column2}, Event, Action"],
-            "column2": ["Item"]
-        })
-        expected_df = pd.DataFrame({
-            "column1": ["Item, Event, Action"]
-        })
-        result = BaseInput._handle_curly_braces_refs(df, refs=["column2"], column_names=df.columns)
-        pd.testing.assert_frame_equal(result, expected_df)
-
-    def test_insert_columns_multiple_rows(self):
-        df = pd.DataFrame({
-            "column1": ["{column2}, Event, Action", "Event, Action"],
-            "column2": ["Item", "Subject"]
-        })
-        expected_df = pd.DataFrame({
-            "column1": ["Item, Event, Action", "Event, Action"]
-        })
-        result = BaseInput._handle_curly_braces_refs(df, refs=["column2"], column_names=df.columns)
-        pd.testing.assert_frame_equal(result, expected_df)
-
-    def test_insert_columns_multiple_columns(self):
-        df = pd.DataFrame({
-            "column1": ["{column2}, Event, {column3}, Action"],
-            "column2": ["Item"],
-            "column3": ["Subject"]
-        })
-        expected_df = pd.DataFrame({
-            "column1": ["Item, Event, Subject, Action"]
-        })
-        result = BaseInput._handle_curly_braces_refs(df, refs=["column2", "column3"], column_names=df.columns)
-        pd.testing.assert_frame_equal(result, expected_df)
-
-    def test_insert_columns_four_columns(self):
-        df = pd.DataFrame({
-            "column1": ["{column2}, Event, {column3}, Action"],
-            "column2": ["Item"],
-            "column3": ["Subject"],
-            "column4": ["Data"]
-        })
-        expected_df = pd.DataFrame({
-            "column1": ["Item, Event, Subject, Action"],
-            "column4": ["Data"]
-        })
-        result = BaseInput._handle_curly_braces_refs(df, refs=["column2", "column3"], column_names=df.columns)
-        pd.testing.assert_frame_equal(result, expected_df)
-
-    def test_insert_columns_with_nested_parentheses(self):
-        df = pd.DataFrame({
-            "column1": ["({column2}, ({column3}, {column4})), Event, Action"],
-            "column2": ["Item"],
-            "column3": ["Subject"],
-            "column4": ["Data"]
-        })
-        expected_df = pd.DataFrame({
-            "column1": ["(Item, (Subject, Data)), Event, Action"]
-        })
-        result = BaseInput._handle_curly_braces_refs(df, refs=["column2", "column3", "column4"], column_names=df.columns)
-        pd.testing.assert_frame_equal(result, expected_df)
-
-    def test_insert_columns_with_nested_parentheses_na_values(self):
-        df = pd.DataFrame({
-            "column1": ["({column2}, ({column3}, {column4})), Event, Action"],
-            "column2": ["Data"],
-            "column3": ["n/a"],
-            "column4": ["n/a"]
-        })
-        expected_df = pd.DataFrame({
-            "column1": ["(Data), Event, Action"]
-        })
-        result = BaseInput._handle_curly_braces_refs(df, refs=["column2", "column3", "column4"], column_names=df.columns)
-        pd.testing.assert_frame_equal(result, expected_df)
-
-    def test_insert_columns_with_nested_parentheses_na_values2(self):
-        df = pd.DataFrame({
-            "column1": ["({column2}, ({column3}, {column4})), Event, Action"],
-            "column2": ["n/a"],
-            "column3": ["n/a"],
-            "column4": ["Data"]
-        })
-        expected_df = pd.DataFrame({
-            "column1": ["((Data)), Event, Action"]
-        })
-        result = BaseInput._handle_curly_braces_refs(df, refs=["column2", "column3", "column4"], column_names=df.columns)
-        pd.testing.assert_frame_equal(result, expected_df)
-
-    def test_insert_columns_with_nested_parentheses_mixed_na_values(self):
-        df = pd.DataFrame({
-            "column1": ["({column2}, ({column3}, {column4})), Event, Action"],
-            "column2": ["n/a"],
-            "column3": ["Subject"],
-            "column4": ["n/a"]
-        })
-        expected_df = pd.DataFrame({
-            "column1": ["((Subject)), Event, Action"]
-        })
-        result = BaseInput._handle_curly_braces_refs(df, refs=["column2", "column3", "column4"], column_names=df.columns)
-        pd.testing.assert_frame_equal(result, expected_df)
-
-    def test_insert_columns_with_nested_parentheses_all_na_values(self):
-        df = pd.DataFrame({
-            "column1": ["({column2}, ({column3}, {column4})), Event, Action"],
-            "column2": ["n/a"],
-            "column3": ["n/a"],
-            "column4": ["n/a"]
-        })
-        expected_df = pd.DataFrame({
-            "column1": ["Event, Action"]
-        })
-        result = BaseInput._handle_curly_braces_refs(df, refs=["column2", "column3", "column4"], column_names=df.columns)
-        pd.testing.assert_frame_equal(result, expected_df)
-
-    def test_insert_columns_with_parentheses(self):
-        df = pd.DataFrame({
-            "column1": ["({column2}), Event, Action"],
-            "column2": ["Item"]
-        })
-        expected_df = pd.DataFrame({
-            "column1": ["(Item), Event, Action"]
-        })
-        result = BaseInput._handle_curly_braces_refs(df, refs=["column2"], column_names=df.columns)
-        pd.testing.assert_frame_equal(result, expected_df)
-
-    def test_insert_columns_with_parentheses_na_values(self):
-        df = pd.DataFrame({
-            "column1": ["({column2}), Event, Action"],
-            "column2": ["n/a"],
-            "column3": ["n/a"]
-        })
-        expected_df = pd.DataFrame({
-            "column1": ["Event, Action"],
-            "column3": ["n/a"]
-        })
-        result = BaseInput._handle_curly_braces_refs(df, refs=["column2"], column_names=df.columns)
-        pd.testing.assert_frame_equal(result, expected_df)
-
-
 class TestCombineDataframe(unittest.TestCase):
     def test_combine_dataframe_with_strings(self):
         data = {
diff --git a/tests/models/test_df_util.py b/tests/models/test_df_util.py
index 280038ff..1cff6943 100644
--- a/tests/models/test_df_util.py
+++ b/tests/models/test_df_util.py
@@ -5,6 +5,7 @@
 from hed import load_schema_version
 from hed.models.df_util import shrink_defs, expand_defs, convert_to_form, process_def_expands
 from hed import DefinitionDict
+from hed.models.df_util import _handle_curly_braces_refs
 
 
 class TestShrinkDefs(unittest.TestCase):
@@ -286,3 +287,141 @@ def test_def_expand_detection(self):
         self.assertEqual(len(ambiguous), 0)
         self.assertEqual(len(errors), 0)
 
+class TestInsertColumns(unittest.TestCase):
+
+    def test_insert_columns_simple(self):
+        """ A single ref is replaced by the value of the column it names. """
+        source = pd.DataFrame({
+            "column1": ["{column2}, Event, Action"],
+            "column2": ["Item"],
+        })
+        # column2 is consumed as a ref, so only column1 is returned.
+        wanted = pd.DataFrame({"column1": ["Item, Event, Action"]})
+        filled = _handle_curly_braces_refs(source, refs=["column2"], column_names=source.columns)
+        pd.testing.assert_frame_equal(filled, wanted)
+
+    def test_insert_columns_multiple_rows(self):
+        """ Refs are filled row by row; a row without a ref keeps its text. """
+        source = pd.DataFrame({
+            "column1": ["{column2}, Event, Action", "Event, Action"],
+            "column2": ["Item", "Subject"],
+        })
+        # The second row never mentions {column2}, so "Subject" is not inserted anywhere.
+        wanted = pd.DataFrame({"column1": ["Item, Event, Action", "Event, Action"]})
+        filled = _handle_curly_braces_refs(source, refs=["column2"], column_names=source.columns)
+        pd.testing.assert_frame_equal(filled, wanted)
+
+    def test_insert_columns_multiple_columns(self):
+        """ Two different refs inside one string are both filled in. """
+        source = pd.DataFrame({
+            "column1": ["{column2}, Event, {column3}, Action"],
+            "column2": ["Item"],
+            "column3": ["Subject"],
+        })
+        # Both ref columns are dropped from the result.
+        wanted = pd.DataFrame({"column1": ["Item, Event, Subject, Action"]})
+        filled = _handle_curly_braces_refs(source, refs=["column2", "column3"], column_names=source.columns)
+        pd.testing.assert_frame_equal(filled, wanted)
+
+    def test_insert_columns_four_columns(self):
+        """ A column that is not a ref and holds no refs passes through unchanged. """
+        source = pd.DataFrame({
+            "column1": ["{column2}, Event, {column3}, Action"],
+            "column2": ["Item"],
+            "column3": ["Subject"],
+            "column4": ["Data"],
+        })
+        # column4 is not listed in refs, so it is kept next to the filled column1.
+        wanted = pd.DataFrame({"column1": ["Item, Event, Subject, Action"],
+                               "column4": ["Data"]})
+        filled = _handle_curly_braces_refs(source, refs=["column2", "column3"], column_names=source.columns)
+        pd.testing.assert_frame_equal(filled, wanted)
+
+    def test_insert_columns_with_nested_parentheses(self):
+        """ Refs inside nested groups are filled and every parenthesis is kept. """
+        source = pd.DataFrame({
+            "column1": ["({column2}, ({column3}, {column4})), Event, Action"],
+            "column2": ["Item"],
+            "column3": ["Subject"],
+            "column4": ["Data"],
+        })
+        # No n/a values here, so the group structure is unchanged.
+        wanted = pd.DataFrame({"column1": ["(Item, (Subject, Data)), Event, Action"]})
+        filled = _handle_curly_braces_refs(source, refs=["column2", "column3", "column4"], column_names=source.columns)
+        pd.testing.assert_frame_equal(filled, wanted)
+
+    def test_insert_columns_with_nested_parentheses_na_values(self):
+        """ An inner group made up only of n/a refs is removed with its comma. """
+        source = pd.DataFrame({
+            "column1": ["({column2}, ({column3}, {column4})), Event, Action"],
+            "column2": ["Data"],
+            "column3": ["n/a"],
+            "column4": ["n/a"],
+        })
+        # Only the outer group around the surviving value remains.
+        wanted = pd.DataFrame({"column1": ["(Data), Event, Action"]})
+        filled = _handle_curly_braces_refs(source, refs=["column2", "column3", "column4"], column_names=source.columns)
+        pd.testing.assert_frame_equal(filled, wanted)
+
+    def test_insert_columns_with_nested_parentheses_na_values2(self):
+        """ Leading n/a refs are removed while the last ref keeps both groups. """
+        source = pd.DataFrame({
+            "column1": ["({column2}, ({column3}, {column4})), Event, Action"],
+            "column2": ["n/a"],
+            "column3": ["n/a"],
+            "column4": ["Data"],
+        })
+        # Both levels of parentheses stay around the one remaining value.
+        wanted = pd.DataFrame({"column1": ["((Data)), Event, Action"]})
+        filled = _handle_curly_braces_refs(source, refs=["column2", "column3", "column4"], column_names=source.columns)
+        pd.testing.assert_frame_equal(filled, wanted)
+
+    def test_insert_columns_with_nested_parentheses_mixed_na_values(self):
+        """ n/a refs on both sides of a real value are removed around it. """
+        source = pd.DataFrame({
+            "column1": ["({column2}, ({column3}, {column4})), Event, Action"],
+            "column2": ["n/a"],
+            "column3": ["Subject"],
+            "column4": ["n/a"],
+        })
+        # Both levels of parentheses stay around the one remaining value.
+        wanted = pd.DataFrame({"column1": ["((Subject)), Event, Action"]})
+        filled = _handle_curly_braces_refs(source, refs=["column2", "column3", "column4"], column_names=source.columns)
+        pd.testing.assert_frame_equal(filled, wanted)
+
+    def test_insert_columns_with_nested_parentheses_all_na_values(self):
+        """ When every ref is n/a the whole nested group disappears. """
+        source = pd.DataFrame({
+            "column1": ["({column2}, ({column3}, {column4})), Event, Action"],
+            "column2": ["n/a"],
+            "column3": ["n/a"],
+            "column4": ["n/a"],
+        })
+        # Neither parentheses nor a leading comma are left behind.
+        wanted = pd.DataFrame({"column1": ["Event, Action"]})
+        filled = _handle_curly_braces_refs(source, refs=["column2", "column3", "column4"], column_names=source.columns)
+        pd.testing.assert_frame_equal(filled, wanted)
+
+    def test_insert_columns_with_parentheses(self):
+        """ A ref that is alone inside a group is filled in place. """
+        source = pd.DataFrame({
+            "column1": ["({column2}), Event, Action"],
+            "column2": ["Item"],
+        })
+        # The group's parentheses are kept around the inserted value.
+        wanted = pd.DataFrame({"column1": ["(Item), Event, Action"]})
+        filled = _handle_curly_braces_refs(source, refs=["column2"], column_names=source.columns)
+        pd.testing.assert_frame_equal(filled, wanted)
+
+    def test_insert_columns_with_parentheses_na_values(self):
+        """ An n/a ref alone inside a group removes the group; other columns are untouched. """
+        source = pd.DataFrame({
+            "column1": ["({column2}), Event, Action"],
+            "column2": ["n/a"],
+            "column3": ["n/a"],
+        })
+        # column3 is not a ref, so its own n/a value is passed through as-is.
+        wanted = pd.DataFrame({"column1": ["Event, Action"],
+                               "column3": ["n/a"]})
+        filled = _handle_curly_braces_refs(source, refs=["column2"], column_names=source.columns)
+        pd.testing.assert_frame_equal(filled, wanted)
diff --git a/tests/validator/test_sidecar_validator.py b/tests/validator/test_sidecar_validator.py
index f74fb03b..a8a4bca3 100644
--- a/tests/validator/test_sidecar_validator.py
+++ b/tests/validator/test_sidecar_validator.py
@@ -22,6 +22,7 @@ def setUpClass(cls):
         cls._refs_json_filename = os.path.join(base_data_dir, "sidecar_tests/basic_refs_test.json")
         cls._bad_refs_json_filename = os.path.join(base_data_dir, "sidecar_tests/bad_refs_test2.json")
         cls._malformed_refs_json_filename = os.path.join(base_data_dir, "sidecar_tests/malformed_refs_test.json")
+        cls._multiple_category_refs = os.path.join(base_data_dir, "sidecar_tests/multiple_category_refs.json")
 
     def test_basic_refs(self):
         sidecar = Sidecar(self._refs_json_filename)
@@ -31,6 +32,16 @@ def test_basic_refs(self):
         refs = sidecar.get_column_refs()
         self.assertEqual(len(refs), 2)
 
+    def test_multicategory_refs(self):
+        """ Validate a sidecar whose "combo" column references two categorical columns. """
+        sidecar = Sidecar(self._multiple_category_refs)
+
+        # 3 issues are expected for repeated tags from stacking lines
+        found_issues = sidecar.validate(self.hed_schema)
+        self.assertEqual(len(found_issues), 3)
+        self.assertEqual(len(sidecar.get_column_refs()), 2)
+
+
     def test_bad_refs(self):
         sidecar = Sidecar(self._bad_refs_json_filename)
         issues = sidecar.validate(self.hed_schema)

From 55490f2695165d612ba51abb9b05061c5b127611 Mon Sep 17 00:00:00 2001
From: IanCa <ianrcallanan@gmail.com>
Date: Wed, 6 Mar 2024 18:56:30 -0600
Subject: [PATCH 2/2] Fix unrelated test

---
 tests/tools/bids/test_bids_tabular_dictionary.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/tools/bids/test_bids_tabular_dictionary.py b/tests/tools/bids/test_bids_tabular_dictionary.py
index 0c604ea7..b2fa7066 100644
--- a/tests/tools/bids/test_bids_tabular_dictionary.py
+++ b/tests/tools/bids/test_bids_tabular_dictionary.py
@@ -51,7 +51,7 @@ def test_count_diffs_diff(self):
         self.assertEqual(len(diff_list2), 1, "count_diffs has differences when other self keys are missing")
 
     def test_set_tsv_info(self):
-        dict1 = BidsTabularDictionary("Tsv Name1", self.file_list[:-1], entities=('sub', 'run'))
+        dict1 = BidsTabularDictionary("Tsv Name1", sorted(self.file_list)[:-1], entities=('sub', 'run'))
         info1 = dict1.get_info('sub-002_run-1')
         self.assertIsInstance(info1, dict)
         info2 = dict1.get_info('sub-002_run-1')