Add support for Duration tag, and validation support for Delay
Event manager now uses filtered series, combining multiple rows
Minor bug fixes/functions added
IanCa committed Mar 5, 2024
1 parent 0b23b2a commit a03e803
Showing 16 changed files with 176 additions and 51 deletions.
12 changes: 12 additions & 0 deletions hed/errors/error_messages.py
@@ -388,6 +388,13 @@ def onset_too_many_groups(tag, tag_list):
f"Found {len(tag_list_strings)}: {tag_list_strings}"


@hed_tag_error(OnsetErrors.DURATION_WRONG_NUMBER_GROUPS, actual_code=ValidationErrors.ONSET_OFFSET_INSET_ERROR)
def onset_DURATION_WRONG_NUMBER_GROUPS(tag, tag_list):
tag_list_strings = [str(a_tag) for a_tag in tag_list]
return f"A duration and/or delay tag '{tag}'should have exactly one child group." \
f"Found {len(tag_list_strings)}: {tag_list_strings}"


@hed_tag_error(OnsetErrors.ONSET_TAG_OUTSIDE_OF_GROUP, actual_code=ValidationErrors.ONSET_OFFSET_INSET_ERROR)
def onset_wrong_type_tag(tag, def_tag):
return f"Onset def tag '{def_tag}' has an improper sibling tag '{tag}'. All onset context tags must be " \
@@ -401,6 +408,11 @@ def onset_wrong_placeholder(tag, has_placeholder):
return f"Onset/offset def tag {tag} should not have a placeholder, but has one."


@hed_tag_error(OnsetErrors.DURATION_HAS_OTHER_TAGS, actual_code=ValidationErrors.ONSET_OFFSET_INSET_ERROR)
def onset_DURATION_HAS_OTHER_TAGS(tag):
return f"Tag '{tag}' should not be grouped with Duration or Delay. Context tags should be in a sub-group."


@hed_error(ColumnErrors.INVALID_COLUMN_REF, actual_code=SidecarErrors.SIDECAR_BRACES_INVALID)
def invalid_column_ref(bad_ref):
return f"The column '{bad_ref}' is unknown or does not have HED annotations.'"
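For reference, a standalone sketch of how the new DURATION_WRONG_NUMBER_GROUPS message reads once formatted (the tag and group strings below are made up, not real validator output):

```python
# Standalone sketch: reproduces the f-string above with made-up inputs.
tag = "Duration/2.0 s"
tag_list = ["(Red, Blue)", "(Green)"]  # two child groups where exactly one is expected
tag_list_strings = [str(a_tag) for a_tag in tag_list]
message = (f"A duration and/or delay tag '{tag}' should have exactly one child group. "
           f"Found {len(tag_list_strings)}: {tag_list_strings}")
print(message)
# A duration and/or delay tag 'Duration/2.0 s' should have exactly one child group.
# Found 2: ['(Red, Blue)', '(Green)']
```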
3 changes: 3 additions & 0 deletions hed/errors/error_types.py
@@ -172,6 +172,9 @@ class OnsetErrors:
ONSET_SAME_DEFS_ONE_ROW = "ONSET_SAME_DEFS_ONE_ROW"
HED_ONSET_WITH_NO_COLUMN = 'HED_ONSET_WITH_NO_COLUMN'

DURATION_HAS_OTHER_TAGS = "DURATION_HAS_OTHER_TAGS"
DURATION_WRONG_NUMBER_GROUPS = "DURATION_WRONG_NUMBER_GROUPS"


class ColumnErrors:
INVALID_COLUMN_REF = "INVALID_COLUMN_REF"
2 changes: 1 addition & 1 deletion hed/models/base_input.py
@@ -139,7 +139,7 @@ def _indexed_dict_from_onsets(onsets):
# This would need to store the index list -> So it can optionally apply to other columns on request.
@staticmethod
def _filter_by_index_list(original_series, indexed_dict):
new_series = pd.Series(["n/a"] * len(original_series), dtype=str)
new_series = pd.Series([""] * len(original_series), dtype=str)

for onset, indices in indexed_dict.items():
if indices:
11 changes: 7 additions & 4 deletions hed/models/df_util.py
@@ -4,7 +4,7 @@
from hed.models.hed_string import HedString


def get_assembled(tabular_file, hed_schema, extra_def_dicts=None, defs_expanded=True):
def get_assembled(tabular_file, hed_schema, extra_def_dicts=None, defs_expanded=True, return_filtered=False):
""" Create an array of assembled HedString objects (or list of these) of the same length as tabular file input.
Parameters:
@@ -13,17 +13,20 @@ def get_assembled(tabular_file, hed_schema, extra_def_dicts=None, defs_expanded=
extra_def_dicts: list of DefinitionDict, optional
Any extra DefinitionDict objects to use when parsing the HED tags.
defs_expanded (bool): (Default True) Expands definitions if True, otherwise shrinks them.
return_filtered (bool): If True, combines rows that share an onset; the remaining rows for that onset are returned as empty strings.
Returns:
tuple:
hed_strings(list of HedStrings): A list of HedStrings or a list of lists of HedStrings
hed_strings(list of HedStrings): A list of HedStrings
def_dict(DefinitionDict): The definitions from this Sidecar.
"""

def_dict = tabular_file.get_def_dict(hed_schema, extra_def_dicts=extra_def_dicts)
series_a = tabular_file.series_a if not return_filtered else tabular_file.series_filtered
if defs_expanded:
return [HedString(x, hed_schema, def_dict).expand_defs() for x in tabular_file.series_a], def_dict
return [HedString(x, hed_schema, def_dict).expand_defs() for x in series_a], def_dict
else:
return [HedString(x, hed_schema, def_dict).shrink_defs() for x in tabular_file.series_a], def_dict
return [HedString(x, hed_schema, def_dict).shrink_defs() for x in series_a], def_dict


def convert_to_form(df, hed_schema, tag_form, columns=None):
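A minimal usage sketch of the new return_filtered flag (the file paths, sidecar name, and schema version are placeholders, not part of this commit):

```python
# Sketch only: paths, sidecar name, and schema version are placeholders.
from hed import TabularInput, load_schema_version
from hed.models.df_util import get_assembled

schema = load_schema_version("8.2.0")
events = TabularInput("sub-01_task-example_events.tsv", sidecar="task-example_events.json")

# Default behavior: one assembled HedString per row of the events file.
per_row, def_dict = get_assembled(events, schema)

# Filtered behavior: rows sharing an onset are combined into the first such row,
# and the remaining rows for that onset come back as empty strings.
combined, def_dict = get_assembled(events, schema, return_filtered=True)
```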
25 changes: 24 additions & 1 deletion hed/models/hed_string.py
@@ -351,7 +351,7 @@ def find_top_level_tags(self, anchor_tags, include_groups=2):
If 1: return only groups.
If 2 or any other value: return both.
Returns:
list or tuple: The returned result depends on include_groups.
list: The returned result depends on include_groups.
"""
top_level_tags = []
for group in self.groups():
@@ -365,6 +365,29 @@ def find_top_level_tags(self, anchor_tags, include_groups=2):
return [tag[include_groups] for tag in top_level_tags]
return top_level_tags

def find_top_level_tags_grouped(self, anchor_tags):
""" Find top level groups with an anchor tag.
This is an alternate version designed to be easier to use with the Delay and Duration tags.
Parameters:
anchor_tags (container): A list/set/etc. of short_base_tags to find groups by.
Returns:
list of tuples:
tags (list of HedTag): The anchor tags found at the top level of the group.
group (HedGroup): The top-level group containing those tags.
"""
top_level_tags = []
for group in self.groups():
tags = []
for tag in group.tags():
if tag.short_base_tag.lower() in anchor_tags:
tags.append(tag)
if tags:
top_level_tags.append((tags, group))

return top_level_tags

def remove_refs(self):
""" Remove any refs(tags contained entirely inside curly braces) from the string.
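A rough sketch of how find_top_level_tags_grouped might be called (the HED string is illustrative and the schema version is a placeholder):

```python
# Sketch: the HED string and schema version are illustrative only.
from hed import HedString, load_schema_version
from hed.models.model_constants import DefTagNames

schema = load_schema_version("8.2.0")
hed = HedString("(Delay/3.0 s, Duration/2.0 s, (Sensory-event, Red))", schema)

for tags, group in hed.find_top_level_tags_grouped(anchor_tags=DefTagNames.DURATION_KEYS):
    # tags  -> the Delay and Duration anchor tags found at the top level of this group
    # group -> the whole group, including the (Sensory-event, Red) sub-group
    print([str(tag) for tag in tags], "|", str(group))
```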
13 changes: 7 additions & 6 deletions hed/models/model_constants.py
@@ -1,10 +1,4 @@
""" Defined constants for definitions, def labels, and expanded labels. """
COLUMN_TO_HED_TAGS = "column_to_hed_tags"
ROW_HED_STRING = "HED"
COLUMN_ISSUES = "column_issues"
ROW_ISSUES = "row_issues"


class DefTagNames:
""" Source names for definitions, def labels, and expanded labels. """

@@ -19,9 +13,16 @@ class DefTagNames:
ONSET_ORG_KEY = "Onset"
OFFSET_ORG_KEY = "Offset"
INSET_ORG_KEY = "Inset"
DURATION_ORG_KEY = "Duration"
DELAY_ORG_KEY = "Delay"

ONSET_KEY = ONSET_ORG_KEY.lower()
OFFSET_KEY = OFFSET_ORG_KEY.lower()
INSET_KEY = INSET_ORG_KEY.lower()
DURATION_KEY = DURATION_ORG_KEY.lower()
DELAY_KEY = DELAY_ORG_KEY.lower()

TEMPORAL_KEYS = {ONSET_KEY, OFFSET_KEY, INSET_KEY}
DURATION_KEYS = {DURATION_KEY, DELAY_KEY}

ALL_TIME_KEYS = TEMPORAL_KEYS.union(DURATION_KEYS)
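A quick sketch of what the new key sets contain (run against this version of the constants):

```python
# Sketch: membership checks against the new key sets defined above.
from hed.models.model_constants import DefTagNames

print(DefTagNames.DURATION_KEYS)                 # lowercase 'duration' and 'delay'
print("delay" in DefTagNames.ALL_TIME_KEYS)      # True - time keys now cover Duration/Delay
print("onset" in DefTagNames.DURATION_KEYS)      # False - onset stays in TEMPORAL_KEYS
```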
7 changes: 4 additions & 3 deletions hed/models/query_service.py
@@ -57,7 +57,8 @@ def search_strings(hed_strings, queries, query_names):
df_factors = pd.DataFrame(0, index=range(len(hed_strings)), columns=query_names)
for parse_ind, parser in enumerate(queries):
for index, next_item in enumerate(hed_strings):
match = parser.search(next_item)
if match:
df_factors.at[index, query_names[parse_ind]] = 1
if next_item:
match = parser.search(next_item)
if match:
df_factors.at[index, query_names[parse_ind]] = 1
return df_factors
32 changes: 25 additions & 7 deletions hed/tools/analysis/event_manager.py
@@ -1,5 +1,6 @@
""" Manager of events of temporal extent. """
import pandas as pd
import bisect

from hed.errors import HedFileError
from hed.models import HedString
@@ -52,15 +53,31 @@ def _create_event_list(self, input_data):
Notes:
"""
hed_strings, def_dict = get_assembled(input_data, self.hed_schema, extra_def_dicts=None, defs_expanded=False)
hed_strings, def_dict = get_assembled(input_data, self.hed_schema, extra_def_dicts=None, defs_expanded=False,
return_filtered=True)
onset_dict = {} # Temporary dictionary keeping track of temporal events that haven't ended yet.
for event_index, hed in enumerate(hed_strings):
self._extract_temporal_events(hed, event_index, onset_dict)
self._extract_duration_events(hed, event_index)
# Now handle the events that extend to end of list
for item in onset_dict.values():
item.set_end(len(self.onsets), None)
self.hed_strings = hed_strings

def _extract_duration_events(self, hed, event_index):
groups = hed.find_top_level_tags(anchor_tags={DefTagNames.DURATION_KEY})
to_remove = []
for duration_tag, group in groups:
start_time = self.onsets[event_index]
new_event = TemporalEvent(group, event_index, start_time)
end_time = new_event.end_time
# Todo: This may need updating. end_index == len(self.onsets) in the edge case where the event extends past the last onset.
end_index = bisect.bisect_left(self.onsets, end_time)
new_event.set_end(end_index, end_time)
self.event_list[event_index].append(new_event)
to_remove.append(group)
hed.remove(to_remove)

def _extract_temporal_events(self, hed, event_index, onset_dict):
""" Extract the temporal events and remove them from the other HED strings.
@@ -77,18 +94,19 @@ def _extract_temporal_events(self, hed, event_index, onset_dict):
return
group_tuples = hed.find_top_level_tags(anchor_tags={DefTagNames.ONSET_KEY, DefTagNames.OFFSET_KEY},
include_groups=2)

to_remove = []
for tup in group_tuples:
anchor_tag = tup[1].find_def_tags(recursive=False, include_groups=0)[0]
for def_tag, group in group_tuples:
anchor_tag = group.find_def_tags(recursive=False, include_groups=0)[0]
anchor = anchor_tag.extension.lower()
if anchor in onset_dict or tup[0].short_base_tag.lower() == DefTagNames.OFFSET_KEY:
if anchor in onset_dict or def_tag.short_base_tag.lower() == DefTagNames.OFFSET_KEY:
temporal_event = onset_dict.pop(anchor)
temporal_event.set_end(event_index, self.onsets[event_index])
if tup[0] == DefTagNames.ONSET_KEY:
new_event = TemporalEvent(tup[1], event_index, self.onsets[event_index])
if def_tag == DefTagNames.ONSET_KEY:
new_event = TemporalEvent(group, event_index, self.onsets[event_index])
self.event_list[event_index].append(new_event)
onset_dict[anchor] = new_event
to_remove.append(tup[1])
to_remove.append(group)
hed.remove(to_remove)

def unfold_context(self, remove_types=[]):
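The implicit end of a Duration event is found by searching the sorted onset list with bisect; a small standalone sketch of that lookup (the onsets and duration value are made up):

```python
# Standalone sketch of the end-index lookup in _extract_duration_events (made-up values).
import bisect

onsets = [0.0, 0.5, 1.5, 3.0, 6.0]   # sorted onset times from the events file
start_time = onsets[1]               # event at index 1 starts at 0.5
duration = 2.0                       # value carried by the Duration tag
end_time = start_time + duration     # 2.5

# Index of the first onset at or after end_time: 2.5 falls before 3.0, so end_index == 3.
# If end_time is past the last onset, bisect_left returns len(onsets) - the edge case
# mentioned in the TODO above.
end_index = bisect.bisect_left(onsets, end_time)
print(end_index)  # 3
```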
2 changes: 1 addition & 1 deletion hed/tools/analysis/temporal_event.py
@@ -40,7 +40,7 @@ def _split_group(self, contents):
to_remove.append(item)
elif item.short_base_tag.lower() == "duration":
to_remove.append(item)
self.end_time = self.start_time + float(item.extension.lower()) # Will need to be fixed for units
self.end_time = self.start_time + item.value_as_default_unit()
elif item.short_base_tag.lower() == "def":
self.anchor = item.short_tag
contents.remove(to_remove)
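A hedged sketch of the unit handling that value_as_default_unit adds (the schema version, tag value, and expected result are illustrative assumptions):

```python
# Sketch: the value and expected output below are illustrative assumptions.
from hed import HedString, load_schema_version

schema = load_schema_version("8.2.0")
hed = HedString("(Duration/2000 ms, (Red))", schema)
duration_tag = next(tag for tag in hed.get_all_tags() if tag.short_base_tag == "Duration")

# The old code parsed the raw extension with float(); value_as_default_unit instead
# converts the tagged value to the unit class default, so 2000 ms should come back
# as 2.0 (seconds) rather than 2000.0.
print(duration_tag.value_as_default_unit())
```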
37 changes: 31 additions & 6 deletions hed/validator/onset_validator.py
@@ -16,11 +16,11 @@ def validate_temporal_relations(self, hed_string_obj):
hed_string_obj (HedString): The hed string to check.
Returns:
list: A list of issues found in validating onsets (i.e., out of order onsets, unknown def names).
list: A list of issues found in validating onsets (i.e., out of order onsets, repeated def names).
"""
onset_issues = []
used_def_names = set()
for temporal_tag, temporal_group in self._find_temporal_tags(hed_string_obj):
for temporal_tag, temporal_group in hed_string_obj.find_top_level_tags(anchor_tags=DefTagNames.TEMPORAL_KEYS):
if not temporal_tag:
return []

@@ -42,8 +42,33 @@

return onset_issues

def _find_temporal_tags(self, hed_string_obj):
return hed_string_obj.find_top_level_tags(anchor_tags=DefTagNames.TEMPORAL_KEYS)
def validate_duration_tags(self, hed_string_obj):
""" Validate Duration/Delay tag groups
Parameters:
hed_string_obj (HedString): The hed string to check.
Returns:
list: A list of issues found in validating durations (i.e., extra tags or groups present, or a group missing)
"""
duration_issues = []
for tags, group in hed_string_obj.find_top_level_tags_grouped(anchor_tags=DefTagNames.DURATION_KEYS):
# This implicitly validates the duration/delay tag, as they're the only two allowed in the same group
# It should be impossible to have > 2 tags, but it's a good stopgap.
if len(tags) != len(group.tags()) or len(group.tags()) > 2:
for tag in group.tags():
if tag not in tags:
duration_issues += ErrorHandler.format_error(OnsetErrors.DURATION_HAS_OTHER_TAGS, tag=tag)
continue
if len(group.groups()) != 1:
duration_issues += ErrorHandler.format_error(OnsetErrors.DURATION_WRONG_NUMBER_GROUPS,
tags[0],
hed_string_obj.groups())
continue

# Does anything else need verification here?
# That duration is positive?
return duration_issues

def _handle_onset_or_offset(self, def_tag, onset_offset_tag):
is_onset = onset_offset_tag.short_base_tag == DefTagNames.ONSET_ORG_KEY
@@ -73,9 +98,9 @@ def check_for_banned_tags(hed_string):
Returns:
list: The validation issues associated with the characters. Each issue is dictionary.
"""
banned_tag_list = DefTagNames.TEMPORAL_KEYS
banned_tag_list = DefTagNames.ALL_TIME_KEYS
issues = []
for tag in hed_string.get_all_tags():
if tag in banned_tag_list:
if tag.short_base_tag.lower() in banned_tag_list:
issues += ErrorHandler.format_error(OnsetErrors.HED_ONSET_WITH_NO_COLUMN, tag)
return issues
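A rough sketch of the two failure cases validate_duration_tags flags, mirrored as standalone checks on illustrative strings (real validation goes through OnsetValidator and the error handler):

```python
# Sketch: mirrors the checks in validate_duration_tags on illustrative strings.
from hed import HedString, load_schema_version
from hed.models.model_constants import DefTagNames

schema = load_schema_version("8.2.0")
examples = {
    "ok":            "(Duration/2.0 s, (Red, Blue))",
    "extra tag":     "(Duration/2.0 s, Green, (Red, Blue))",  # -> DURATION_HAS_OTHER_TAGS
    "missing group": "(Duration/2.0 s)",                      # -> DURATION_WRONG_NUMBER_GROUPS
}

for label, text in examples.items():
    hed = HedString(text, schema)
    for tags, group in hed.find_top_level_tags_grouped(anchor_tags=DefTagNames.DURATION_KEYS):
        extra_tags = len(tags) != len(group.tags())      # non-anchor tags at the top level
        wrong_groups = len(group.groups()) != 1          # anything but exactly one child group
        print(f"{label}: extra tags={extra_tags}, wrong group count={wrong_groups}")
```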
1 change: 1 addition & 0 deletions hed/validator/spreadsheet_validator.py
@@ -113,6 +113,7 @@ def _run_checks(self, hed_df, onset_filtered, error_handler, row_adj):
new_column_issues = self._hed_validator.run_full_string_checks(row_string)
if self._onset_validator is not None:
new_column_issues += self._onset_validator.validate_temporal_relations(row_string)
new_column_issues += self._onset_validator.validate_duration_tags(row_string)
else:
new_column_issues += OnsetValidator.check_for_banned_tags(row_string)
error_handler.add_context_and_filter(new_column_issues)
13 changes: 8 additions & 5 deletions hed/validator/tag_util/group_util.py
@@ -91,8 +91,8 @@ def check_tag_level_issue(original_tag_list, is_top_level, is_group):
actual_code = None
if top_level_tag.short_base_tag == DefTagNames.DEFINITION_ORG_KEY:
actual_code = ValidationErrors.DEFINITION_INVALID
elif top_level_tag.short_base_tag in {DefTagNames.ONSET_ORG_KEY, DefTagNames.OFFSET_ORG_KEY}:
actual_code = ValidationErrors.ONSET_OFFSET_INSET_ERROR
elif top_level_tag.short_base_tag.lower() in DefTagNames.ALL_TIME_KEYS:
actual_code = ValidationErrors.ONSET_OFFSET_INSET_ERROR # May split this out if we switch error

if actual_code:
validation_issues += ErrorHandler.format_error(ValidationErrors.HED_TOP_LEVEL_TAG,
@@ -102,9 +102,12 @@ def check_tag_level_issue(original_tag_list, is_top_level, is_group):
tag=top_level_tag)

if is_top_level and len(top_level_tags) > 1:
validation_issues += ErrorHandler.format_error(ValidationErrors.HED_MULTIPLE_TOP_TAGS,
tag=top_level_tags[0],
multiple_tags=top_level_tags[1:])
short_tags = [tag.short_base_tag for tag in top_level_tags]
# Special exception for Duration/Delay pairing
if len(top_level_tags) != 2 or DefTagNames.DURATION_ORG_KEY not in short_tags or DefTagNames.DELAY_ORG_KEY not in short_tags:
validation_issues += ErrorHandler.format_error(ValidationErrors.HED_MULTIPLE_TOP_TAGS,
tag=top_level_tags[0],
multiple_tags=top_level_tags[1:])

return validation_issues

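A short standalone illustration of the new pairing exception: Duration and Delay may now share the top level of a group, while any other combination of anchor tags still triggers HED_MULTIPLE_TOP_TAGS (plain short tag names only, outside the real error pipeline):

```python
# Sketch: mirrors the pairing exception above using plain short tag names.
DURATION_ORG_KEY = "Duration"
DELAY_ORG_KEY = "Delay"

def multiple_top_tags_flagged(short_tags):
    """More than one top-level anchor tag is an error, unless the pair is exactly Duration + Delay."""
    if len(short_tags) <= 1:
        return False
    return not (len(short_tags) == 2
                and DURATION_ORG_KEY in short_tags
                and DELAY_ORG_KEY in short_tags)

print(multiple_top_tags_flagged(["Duration", "Delay"]))  # False - allowed pairing
print(multiple_top_tags_flagged(["Onset", "Duration"]))  # True  - still an error
print(multiple_top_tags_flagged(["Duration"]))           # False - a single anchor is fine
```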
6 changes: 3 additions & 3 deletions tests/models/test_base_input.py
@@ -364,13 +364,13 @@ def test_empty_and_single_item_series(self):

def test_two_item_series_with_same_onset(self):
input_series = pd.Series(["apple", "orange"])
expected_series = pd.Series(["apple,orange", "n/a"])
expected_series = pd.Series(["apple,orange", ""])
self.assertTrue(BaseInput._filter_by_index_list(input_series, {0: [0, 1]}).equals(expected_series))

def test_multiple_item_series(self):
input_series = pd.Series(["apple", "orange", "banana", "mango"])
indexed_dict = {0: [0, 1], 1: [2], 2: [3]}
expected_series = pd.Series(["apple,orange", "n/a", "banana", "mango"])
expected_series = pd.Series(["apple,orange", "", "banana", "mango"])
self.assertTrue(BaseInput._filter_by_index_list(input_series, indexed_dict).equals(expected_series))

def test_complex_scenarios(self):
@@ -383,6 +383,6 @@ def test_complex_scenarios(self):
# Test with more complex indexed_dict
original2 = ["apple", "orange", "banana", "mango", "grape"]
indexed_dict2 = {0: [0, 1], 1: [2], 2: [3, 4]}
expected_series2 = pd.Series(["apple,orange", "n/a", "banana", "mango,grape", "n/a"])
expected_series2 = pd.Series(["apple,orange", "", "banana", "mango,grape", ""])
self.assertTrue(BaseInput._filter_by_index_list(original2, indexed_dict2).equals(expected_series2))
