Add support for Duration tag, and validation support for Delay
Event manager now uses filtered series, combining multiple rows
Minor bug fixes/functions added
IanCa committed Mar 5, 2024
1 parent 0b23b2a commit a03e803
Showing 16 changed files with 176 additions and 51 deletions.
12 changes: 12 additions & 0 deletions hed/errors/error_messages.py
@@ -388,6 +388,13 @@ def onset_too_many_groups(tag, tag_list):
f"Found {len(tag_list_strings)}: {tag_list_strings}"


@hed_tag_error(OnsetErrors.DURATION_WRONG_NUMBER_GROUPS, actual_code=ValidationErrors.ONSET_OFFSET_INSET_ERROR)
def onset_DURATION_WRONG_NUMBER_GROUPS(tag, tag_list):
tag_list_strings = [str(a_tag) for a_tag in tag_list]
return f"A duration and/or delay tag '{tag}'should have exactly one child group." \
f"Found {len(tag_list_strings)}: {tag_list_strings}"


@hed_tag_error(OnsetErrors.ONSET_TAG_OUTSIDE_OF_GROUP, actual_code=ValidationErrors.ONSET_OFFSET_INSET_ERROR)
def onset_wrong_type_tag(tag, def_tag):
return f"Onset def tag '{def_tag}' has an improper sibling tag '{tag}'. All onset context tags must be " \
@@ -401,6 +408,11 @@ def onset_wrong_placeholder(tag, has_placeholder):
return f"Onset/offset def tag {tag} should not have a placeholder, but has one."


@hed_tag_error(OnsetErrors.DURATION_HAS_OTHER_TAGS, actual_code=ValidationErrors.ONSET_OFFSET_INSET_ERROR)
def onset_DURATION_HAS_OTHER_TAGS(tag):
return f"Tag '{tag}' should not be grouped with Duration or Delay. Context tags should be in a sub-group."


@hed_error(ColumnErrors.INVALID_COLUMN_REF, actual_code=SidecarErrors.SIDECAR_BRACES_INVALID)
def invalid_column_ref(bad_ref):
return f"The column '{bad_ref}' is unknown or does not have HED annotations.'"
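For reference, a standalone sketch of how the new DURATION_WRONG_NUMBER_GROUPS message reads once formatted (the tag and group strings below are made up, not real validator output):

```python
# Standalone sketch: reproduces the f-string above with made-up inputs.
tag = "Duration/2.0 s"
tag_list = ["(Red, Blue)", "(Green)"]  # two child groups where exactly one is expected
tag_list_strings = [str(a_tag) for a_tag in tag_list]
message = (f"A duration and/or delay tag '{tag}' should have exactly one child group. "
           f"Found {len(tag_list_strings)}: {tag_list_strings}")
print(message)
# A duration and/or delay tag 'Duration/2.0 s' should have exactly one child group.
# Found 2: ['(Red, Blue)', '(Green)']
```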
3 changes: 3 additions & 0 deletions hed/errors/error_types.py
@@ -172,6 +172,9 @@ class OnsetErrors:
ONSET_SAME_DEFS_ONE_ROW = "ONSET_SAME_DEFS_ONE_ROW"
HED_ONSET_WITH_NO_COLUMN = 'HED_ONSET_WITH_NO_COLUMN'

DURATION_HAS_OTHER_TAGS = "DURATION_HAS_OTHER_TAGS"
DURATION_WRONG_NUMBER_GROUPS = "DURATION_WRONG_NUMBER_GROUPS"


class ColumnErrors:
INVALID_COLUMN_REF = "INVALID_COLUMN_REF"
2 changes: 1 addition & 1 deletion hed/models/base_input.py
@@ -139,7 +139,7 @@ def _indexed_dict_from_onsets(onsets):
# This would need to store the index list -> So it can optionally apply to other columns on request.
@staticmethod
def _filter_by_index_list(original_series, indexed_dict):
new_series = pd.Series(["n/a"] * len(original_series), dtype=str)
new_series = pd.Series([""] * len(original_series), dtype=str)

for onset, indices in indexed_dict.items():
if indices:
11 changes: 7 additions & 4 deletions hed/models/df_util.py
@@ -4,7 +4,7 @@
from hed.models.hed_string import HedString


def get_assembled(tabular_file, hed_schema, extra_def_dicts=None, defs_expanded=True):
def get_assembled(tabular_file, hed_schema, extra_def_dicts=None, defs_expanded=True, return_filtered=False):
""" Create an array of assembled HedString objects (or list of these) of the same length as tabular file input.
Parameters:
@@ -13,17 +13,20 @@ def get_assembled(tabular_file, hed_schema, extra_def_dicts=None, defs_expanded=
extra_def_dicts: list of DefinitionDict, optional
Any extra DefinitionDict objects to use when parsing the HED tags.
defs_expanded (bool): (Default True) Expands definitions if True, otherwise shrinks them.
return_filtered (bool): If True, combines rows that share an onset; the remaining rows for that onset are returned as empty strings.
Returns:
tuple:
hed_strings(list of HedStrings): A list of HedStrings or a list of lists of HedStrings
hed_strings(list of HedStrings): A list of HedStrings
def_dict(DefinitionDict): The definitions from this Sidecar.
"""

def_dict = tabular_file.get_def_dict(hed_schema, extra_def_dicts=extra_def_dicts)
series_a = tabular_file.series_a if not return_filtered else tabular_file.series_filtered
if defs_expanded:
return [HedString(x, hed_schema, def_dict).expand_defs() for x in tabular_file.series_a], def_dict
return [HedString(x, hed_schema, def_dict).expand_defs() for x in series_a], def_dict
else:
return [HedString(x, hed_schema, def_dict).shrink_defs() for x in tabular_file.series_a], def_dict
return [HedString(x, hed_schema, def_dict).shrink_defs() for x in series_a], def_dict


def convert_to_form(df, hed_schema, tag_form, columns=None):
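A minimal usage sketch of the new return_filtered flag (the file paths, sidecar name, and schema version are placeholders, not part of this commit):

```python
# Sketch only: paths, sidecar name, and schema version are placeholders.
from hed import TabularInput, load_schema_version
from hed.models.df_util import get_assembled

schema = load_schema_version("8.2.0")
events = TabularInput("sub-01_task-example_events.tsv", sidecar="task-example_events.json")

# Default behavior: one assembled HedString per row of the events file.
per_row, def_dict = get_assembled(events, schema)

# Filtered behavior: rows sharing an onset are combined into the first such row,
# and the remaining rows for that onset come back as empty strings.
combined, def_dict = get_assembled(events, schema, return_filtered=True)
```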
25 changes: 24 additions & 1 deletion hed/models/hed_string.py
@@ -351,7 +351,7 @@ def find_top_level_tags(self, anchor_tags, include_groups=2):
If 1: return only groups.
If 2 or any other value: return both.
Returns:
list or tuple: The returned result depends on include_groups.
list: The returned result depends on include_groups.
"""
top_level_tags = []
for group in self.groups():
@@ -365,6 +365,29 @@ def find_top_level_tags(self, anchor_tags, include_groups=2):
return [tag[include_groups] for tag in top_level_tags]
return top_level_tags

def find_top_level_tags_grouped(self, anchor_tags):
""" Find top level groups with an anchor tag.
This is an alternate version designed to be easier to use with the Delay and Duration tags.
Parameters:
anchor_tags (container): A list/set/etc. of short_base_tags to find groups by.
Returns:
list of tuples:
tags (list of HedTag): The anchor tags found at the top level of the group.
group (HedGroup): The top-level group containing those tags.
"""
top_level_tags = []
for group in self.groups():
tags = []
for tag in group.tags():
if tag.short_base_tag.lower() in anchor_tags:
tags.append(tag)
if tags:
top_level_tags.append((tags, group))

return top_level_tags

def remove_refs(self):
""" Remove any refs(tags contained entirely inside curly braces) from the string.
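A rough sketch of how find_top_level_tags_grouped might be called (the HED string is illustrative and the schema version is a placeholder):

```python
# Sketch: the HED string and schema version are illustrative only.
from hed import HedString, load_schema_version
from hed.models.model_constants import DefTagNames

schema = load_schema_version("8.2.0")
hed = HedString("(Delay/3.0 s, Duration/2.0 s, (Sensory-event, Red))", schema)

for tags, group in hed.find_top_level_tags_grouped(anchor_tags=DefTagNames.DURATION_KEYS):
    # tags  -> the Delay and Duration anchor tags found at the top level of this group
    # group -> the whole group, including the (Sensory-event, Red) sub-group
    print([str(tag) for tag in tags], "|", str(group))
```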
13 changes: 7 additions & 6 deletions hed/models/model_constants.py
@@ -1,10 +1,4 @@
""" Defined constants for definitions, def labels, and expanded labels. """
COLUMN_TO_HED_TAGS = "column_to_hed_tags"
ROW_HED_STRING = "HED"
COLUMN_ISSUES = "column_issues"
ROW_ISSUES = "row_issues"


class DefTagNames:
""" Source names for definitions, def labels, and expanded labels. """

@@ -19,9 +13,16 @@ class DefTagNames:
ONSET_ORG_KEY = "Onset"
OFFSET_ORG_KEY = "Offset"
INSET_ORG_KEY = "Inset"
DURATION_ORG_KEY = "Duration"
DELAY_ORG_KEY = "Delay"

ONSET_KEY = ONSET_ORG_KEY.lower()
OFFSET_KEY = OFFSET_ORG_KEY.lower()
INSET_KEY = INSET_ORG_KEY.lower()
DURATION_KEY = DURATION_ORG_KEY.lower()
DELAY_KEY = DELAY_ORG_KEY.lower()

TEMPORAL_KEYS = {ONSET_KEY, OFFSET_KEY, INSET_KEY}
DURATION_KEYS = {DURATION_KEY, DELAY_KEY}

ALL_TIME_KEYS = TEMPORAL_KEYS.union(DURATION_KEYS)
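A quick sketch of what the new key sets contain (run against this version of the constants):

```python
# Sketch: membership checks against the new key sets defined above.
from hed.models.model_constants import DefTagNames

print(DefTagNames.DURATION_KEYS)                 # lowercase 'duration' and 'delay'
print("delay" in DefTagNames.ALL_TIME_KEYS)      # True - time keys now cover Duration/Delay
print("onset" in DefTagNames.DURATION_KEYS)      # False - onset stays in TEMPORAL_KEYS
```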
7 changes: 4 additions & 3 deletions hed/models/query_service.py
@@ -57,7 +57,8 @@ def search_strings(hed_strings, queries, query_names):
df_factors = pd.DataFrame(0, index=range(len(hed_strings)), columns=query_names)
for parse_ind, parser in enumerate(queries):
for index, next_item in enumerate(hed_strings):
match = parser.search(next_item)
if match:
df_factors.at[index, query_names[parse_ind]] = 1
if next_item:
match = parser.search(next_item)
if match:
df_factors.at[index, query_names[parse_ind]] = 1
return df_factors
32 changes: 25 additions & 7 deletions hed/tools/analysis/event_manager.py
@@ -1,5 +1,6 @@
""" Manager of events of temporal extent. """
import pandas as pd
import bisect

from hed.errors import HedFileError
from hed.models import HedString
@@ -52,15 +53,31 @@ def _create_event_list(self, input_data):
Notes:
"""
hed_strings, def_dict = get_assembled(input_data, self.hed_schema, extra_def_dicts=None, defs_expanded=False)
hed_strings, def_dict = get_assembled(input_data, self.hed_schema, extra_def_dicts=None, defs_expanded=False,
return_filtered=True)
onset_dict = {} # Temporary dictionary keeping track of temporal events that haven't ended yet.
for event_index, hed in enumerate(hed_strings):
self._extract_temporal_events(hed, event_index, onset_dict)
self._extract_duration_events(hed, event_index)
# Now handle the events that extend to end of list
for item in onset_dict.values():
item.set_end(len(self.onsets), None)
self.hed_strings = hed_strings

def _extract_duration_events(self, hed, event_index):
groups = hed.find_top_level_tags(anchor_tags={DefTagNames.DURATION_KEY})
to_remove = []
for duration_tag, group in groups:
start_time = self.onsets[event_index]
new_event = TemporalEvent(group, event_index, start_time)
end_time = new_event.end_time
# Todo: This may need updating. end_index == len(self.onsets) in the edge case where the event extends past the last onset.
end_index = bisect.bisect_left(self.onsets, end_time)
new_event.set_end(end_index, end_time)
self.event_list[event_index].append(new_event)
to_remove.append(group)
hed.remove(to_remove)

def _extract_temporal_events(self, hed, event_index, onset_dict):
""" Extract the temporal events and remove them from the other HED strings.
@@ -77,18 +94,19 @@ def _extract_temporal_events(self, hed, event_index, onset_dict):
return
group_tuples = hed.find_top_level_tags(anchor_tags={DefTagNames.ONSET_KEY, DefTagNames.OFFSET_KEY},
include_groups=2)

to_remove = []
for tup in group_tuples:
anchor_tag = tup[1].find_def_tags(recursive=False, include_groups=0)[0]
for def_tag, group in group_tuples:
anchor_tag = group.find_def_tags(recursive=False, include_groups=0)[0]
anchor = anchor_tag.extension.lower()
if anchor in onset_dict or tup[0].short_base_tag.lower() == DefTagNames.OFFSET_KEY:
if anchor in onset_dict or def_tag.short_base_tag.lower() == DefTagNames.OFFSET_KEY:
temporal_event = onset_dict.pop(anchor)
temporal_event.set_end(event_index, self.onsets[event_index])
if tup[0] == DefTagNames.ONSET_KEY:
new_event = TemporalEvent(tup[1], event_index, self.onsets[event_index])
if def_tag == DefTagNames.ONSET_KEY:
new_event = TemporalEvent(group, event_index, self.onsets[event_index])
self.event_list[event_index].append(new_event)
onset_dict[anchor] = new_event
to_remove.append(tup[1])
to_remove.append(group)
hed.remove(to_remove)

def unfold_context(self, remove_types=[]):
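The implicit end of a Duration event is found by searching the sorted onset list with bisect; a small standalone sketch of that lookup (the onsets and duration value are made up):

```python
# Standalone sketch of the end-index lookup in _extract_duration_events (made-up values).
import bisect

onsets = [0.0, 0.5, 1.5, 3.0, 6.0]   # sorted onset times from the events file
start_time = onsets[1]               # event at index 1 starts at 0.5
duration = 2.0                       # value carried by the Duration tag
end_time = start_time + duration     # 2.5

# Index of the first onset at or after end_time: 2.5 falls before 3.0, so end_index == 3.
# If end_time is past the last onset, bisect_left returns len(onsets) - the edge case
# mentioned in the TODO above.
end_index = bisect.bisect_left(onsets, end_time)
print(end_index)  # 3
```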
2 changes: 1 addition & 1 deletion hed/tools/analysis/temporal_event.py
@@ -40,7 +40,7 @@ def _split_group(self, contents):
to_remove.append(item)
elif item.short_base_tag.lower() == "duration":
to_remove.append(item)
self.end_time = self.start_time + float(item.extension.lower()) # Will need to be fixed for units
self.end_time = self.start_time + item.value_as_default_unit()
elif item.short_base_tag.lower() == "def":
self.anchor = item.short_tag
contents.remove(to_remove)
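A hedged sketch of the unit handling that value_as_default_unit adds (the schema version, tag value, and expected result are illustrative assumptions):

```python
# Sketch: the value and expected output below are illustrative assumptions.
from hed import HedString, load_schema_version

schema = load_schema_version("8.2.0")
hed = HedString("(Duration/2000 ms, (Red))", schema)
duration_tag = next(tag for tag in hed.get_all_tags() if tag.short_base_tag == "Duration")

# The old code parsed the raw extension with float(); value_as_default_unit instead
# converts the tagged value to the unit class default, so 2000 ms should come back
# as 2.0 (seconds) rather than 2000.0.
print(duration_tag.value_as_default_unit())
```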
37 changes: 31 additions & 6 deletions hed/validator/onset_validator.py
@@ -16,11 +16,11 @@ def validate_temporal_relations(self, hed_string_obj):
hed_string_obj (HedString): The hed string to check.
Returns:
list: A list of issues found in validating onsets (i.e., out of order onsets, unknown def names).
list: A list of issues found in validating onsets (i.e., out of order onsets, repeated def names).
"""
onset_issues = []
used_def_names = set()
for temporal_tag, temporal_group in self._find_temporal_tags(hed_string_obj):
for temporal_tag, temporal_group in hed_string_obj.find_top_level_tags(anchor_tags=DefTagNames.TEMPORAL_KEYS):
if not temporal_tag:
return []

@@ -42,8 +42,33 @@

return onset_issues

def _find_temporal_tags(self, hed_string_obj):
return hed_string_obj.find_top_level_tags(anchor_tags=DefTagNames.TEMPORAL_KEYS)
def validate_duration_tags(self, hed_string_obj):
""" Validate Duration/Delay tag groups
Parameters:
hed_string_obj (HedString): The hed string to check.
Returns:
list: A list of issues found in validating durations (i.e., extra tags or groups present, or a group missing)
"""
duration_issues = []
for tags, group in hed_string_obj.find_top_level_tags_grouped(anchor_tags=DefTagNames.DURATION_KEYS):
# This implicitly validates the duration/delay tag, as they're the only two allowed in the same group
# It should be impossible to have > 2 tags, but it's a good stopgap.
if len(tags) != len(group.tags()) or len(group.tags()) > 2:
for tag in group.tags():
if tag not in tags:
duration_issues += ErrorHandler.format_error(OnsetErrors.DURATION_HAS_OTHER_TAGS, tag=tag)
continue
if len(group.groups()) != 1:
duration_issues += ErrorHandler.format_error(OnsetErrors.DURATION_WRONG_NUMBER_GROUPS,
tags[0],
hed_string_obj.groups())
continue

# Does anything else need verification here?
# That duration is positive?
return duration_issues

def _handle_onset_or_offset(self, def_tag, onset_offset_tag):
is_onset = onset_offset_tag.short_base_tag == DefTagNames.ONSET_ORG_KEY
@@ -73,9 +98,9 @@ def check_for_banned_tags(hed_string):
Returns:
list: The validation issues associated with the characters. Each issue is dictionary.
"""
banned_tag_list = DefTagNames.TEMPORAL_KEYS
banned_tag_list = DefTagNames.ALL_TIME_KEYS
issues = []
for tag in hed_string.get_all_tags():
if tag in banned_tag_list:
if tag.short_base_tag.lower() in banned_tag_list:
issues += ErrorHandler.format_error(OnsetErrors.HED_ONSET_WITH_NO_COLUMN, tag)
return issues
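A rough sketch of the two failure cases validate_duration_tags flags, mirrored as standalone checks on illustrative strings (real validation goes through OnsetValidator and the error handler):

```python
# Sketch: mirrors the checks in validate_duration_tags on illustrative strings.
from hed import HedString, load_schema_version
from hed.models.model_constants import DefTagNames

schema = load_schema_version("8.2.0")
examples = {
    "ok":            "(Duration/2.0 s, (Red, Blue))",
    "extra tag":     "(Duration/2.0 s, Green, (Red, Blue))",  # -> DURATION_HAS_OTHER_TAGS
    "missing group": "(Duration/2.0 s)",                      # -> DURATION_WRONG_NUMBER_GROUPS
}

for label, text in examples.items():
    hed = HedString(text, schema)
    for tags, group in hed.find_top_level_tags_grouped(anchor_tags=DefTagNames.DURATION_KEYS):
        extra_tags = len(tags) != len(group.tags())      # non-anchor tags at the top level
        wrong_groups = len(group.groups()) != 1          # anything but exactly one child group
        print(f"{label}: extra tags={extra_tags}, wrong group count={wrong_groups}")
```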
1 change: 1 addition & 0 deletions hed/validator/spreadsheet_validator.py
@@ -113,6 +113,7 @@ def _run_checks(self, hed_df, onset_filtered, error_handler, row_adj):
new_column_issues = self._hed_validator.run_full_string_checks(row_string)
if self._onset_validator is not None:
new_column_issues += self._onset_validator.validate_temporal_relations(row_string)
new_column_issues += self._onset_validator.validate_duration_tags(row_string)
else:
new_column_issues += OnsetValidator.check_for_banned_tags(row_string)
error_handler.add_context_and_filter(new_column_issues)
13 changes: 8 additions & 5 deletions hed/validator/tag_util/group_util.py
@@ -91,8 +91,8 @@ def check_tag_level_issue(original_tag_list, is_top_level, is_group):
actual_code = None
if top_level_tag.short_base_tag == DefTagNames.DEFINITION_ORG_KEY:
actual_code = ValidationErrors.DEFINITION_INVALID
elif top_level_tag.short_base_tag in {DefTagNames.ONSET_ORG_KEY, DefTagNames.OFFSET_ORG_KEY}:
actual_code = ValidationErrors.ONSET_OFFSET_INSET_ERROR
elif top_level_tag.short_base_tag.lower() in DefTagNames.ALL_TIME_KEYS:
actual_code = ValidationErrors.ONSET_OFFSET_INSET_ERROR # May split this out if we switch error

if actual_code:
validation_issues += ErrorHandler.format_error(ValidationErrors.HED_TOP_LEVEL_TAG,
@@ -102,9 +102,12 @@ def check_tag_level_issue(original_tag_list, is_top_level, is_group):
tag=top_level_tag)

if is_top_level and len(top_level_tags) > 1:
validation_issues += ErrorHandler.format_error(ValidationErrors.HED_MULTIPLE_TOP_TAGS,
tag=top_level_tags[0],
multiple_tags=top_level_tags[1:])
short_tags = [tag.short_base_tag for tag in top_level_tags]
# Special exception for Duration/Delay pairing
if len(top_level_tags) != 2 or DefTagNames.DURATION_ORG_KEY not in short_tags or DefTagNames.DELAY_ORG_KEY not in short_tags:
validation_issues += ErrorHandler.format_error(ValidationErrors.HED_MULTIPLE_TOP_TAGS,
tag=top_level_tags[0],
multiple_tags=top_level_tags[1:])

return validation_issues

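A short standalone illustration of the new pairing exception: Duration and Delay may now share the top level of a group, while any other combination of anchor tags still triggers HED_MULTIPLE_TOP_TAGS (plain short tag names only, outside the real error pipeline):

```python
# Sketch: mirrors the pairing exception above using plain short tag names.
DURATION_ORG_KEY = "Duration"
DELAY_ORG_KEY = "Delay"

def multiple_top_tags_flagged(short_tags):
    """More than one top-level anchor tag is an error, unless the pair is exactly Duration + Delay."""
    if len(short_tags) <= 1:
        return False
    return not (len(short_tags) == 2
                and DURATION_ORG_KEY in short_tags
                and DELAY_ORG_KEY in short_tags)

print(multiple_top_tags_flagged(["Duration", "Delay"]))  # False - allowed pairing
print(multiple_top_tags_flagged(["Onset", "Duration"]))  # True  - still an error
print(multiple_top_tags_flagged(["Duration"]))           # False - a single anchor is fine
```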
6 changes: 3 additions & 3 deletions tests/models/test_base_input.py
@@ -364,13 +364,13 @@ def test_empty_and_single_item_series(self):

def test_two_item_series_with_same_onset(self):
input_series = pd.Series(["apple", "orange"])
expected_series = pd.Series(["apple,orange", "n/a"])
expected_series = pd.Series(["apple,orange", ""])
self.assertTrue(BaseInput._filter_by_index_list(input_series, {0: [0, 1]}).equals(expected_series))

def test_multiple_item_series(self):
input_series = pd.Series(["apple", "orange", "banana", "mango"])
indexed_dict = {0: [0, 1], 1: [2], 2: [3]}
expected_series = pd.Series(["apple,orange", "n/a", "banana", "mango"])
expected_series = pd.Series(["apple,orange", "", "banana", "mango"])
self.assertTrue(BaseInput._filter_by_index_list(input_series, indexed_dict).equals(expected_series))

def test_complex_scenarios(self):
@@ -383,6 +383,6 @@ def test_complex_scenarios(self):
# Test with more complex indexed_dict
original2 = ["apple", "orange", "banana", "mango", "grape"]
indexed_dict2 = {0: [0, 1], 1: [2], 2: [3, 4]}
expected_series2 = pd.Series(["apple,orange", "n/a", "banana", "mango,grape", "n/a"])
expected_series2 = pd.Series(["apple,orange", "", "banana", "mango,grape", ""])
self.assertTrue(BaseInput._filter_by_index_list(original2, indexed_dict2).equals(expected_series2))
