From 3df2dab210ee4126ffbc88acaf578b5d9e12bdf9 Mon Sep 17 00:00:00 2001 From: IanCa Date: Wed, 7 Feb 2024 15:43:53 -0600 Subject: [PATCH] Allow validation of files with out of order onsets. Rename expression parser and split into files. Move some functions from analysis until to query_service.py. Minor changes related to the above --- hed/errors/error_messages.py | 6 + hed/errors/error_types.py | 2 +- hed/models/__init__.py | 2 +- hed/models/base_input.py | 8 + hed/models/df_util.py | 41 +- hed/models/expression_parser.py | 485 ------------------ hed/models/query_expressions.py | 222 ++++++++ hed/models/query_handler.py | 175 +++++++ hed/models/query_service.py | 61 +++ hed/models/query_util.py | 93 ++++ hed/tools/__init__.py | 3 - hed/tools/analysis/analysis_util.py | 230 --------- hed/tools/analysis/event_manager.py | 2 +- .../operations/factor_hed_tags_op.py | 23 +- hed/validator/spreadsheet_validator.py | 12 +- tests/models/test_base_input.py | 57 +- ...ession_parser.py => test_query_handler.py} | 14 +- .../test_analysis_util_assemble_hed.py | 123 ----- .../analysis/test_analysis_util_convert.py | 109 ---- tests/tools/analysis/test_hed_tag_counts.py | 9 +- .../operations/test_summarize_hed_tags_op.py | 2 +- 21 files changed, 687 insertions(+), 992 deletions(-) delete mode 100644 hed/models/expression_parser.py create mode 100644 hed/models/query_expressions.py create mode 100644 hed/models/query_handler.py create mode 100644 hed/models/query_service.py create mode 100644 hed/models/query_util.py delete mode 100644 hed/tools/analysis/analysis_util.py rename tests/models/{test_expression_parser.py => test_query_handler.py} (98%) delete mode 100644 tests/tools/analysis/test_analysis_util_assemble_hed.py delete mode 100644 tests/tools/analysis/test_analysis_util_convert.py diff --git a/hed/errors/error_messages.py b/hed/errors/error_messages.py index 657aefbb..7c78993e 100644 --- a/hed/errors/error_messages.py +++ b/hed/errors/error_messages.py @@ -60,6 +60,12 @@ 
def val_error_CURLY_BRACE_UNSUPPORTED_HERE(tag, problem_tag): return (f"Curly braces are only permitted in sidecars, fully wrapping text in place of a tag. " f"Invalid character '{problem_tag}' in tag '{tag}'") + +@hed_error(ValidationErrors.ONSETS_OUT_OF_ORDER, default_severity=ErrorSeverity.WARNING) +def val_error_ONSETS_OUT_OF_ORDER(): + return "Onsets need to be temporally increasing for most downstream tools to work." + + @hed_error(ValidationErrors.COMMA_MISSING) def val_error_comma_missing(tag): return f"Comma missing after - '{tag}'" diff --git a/hed/errors/error_types.py b/hed/errors/error_types.py index 5dc32737..a90322c7 100644 --- a/hed/errors/error_types.py +++ b/hed/errors/error_types.py @@ -91,7 +91,7 @@ class ValidationErrors: INVALID_TAG_CHARACTER = 'invalidTagCharacter' CURLY_BRACE_UNSUPPORTED_HERE = "CURLY_BRACE_UNSUPPORTED_HERE" - + ONSETS_OUT_OF_ORDER = "ONSETS_OUT_OF_ORDER" class SidecarErrors: diff --git a/hed/models/__init__.py b/hed/models/__init__.py index f2f1a600..ed38bb1e 100644 --- a/hed/models/__init__.py +++ b/hed/models/__init__.py @@ -5,7 +5,7 @@ from .column_metadata import ColumnMetadata, ColumnType from .definition_dict import DefinitionDict from .definition_entry import DefinitionEntry -from .expression_parser import QueryParser +from .query_handler import QueryHandler from .hed_group import HedGroup from .spreadsheet_input import SpreadsheetInput from .hed_string import HedString diff --git a/hed/models/base_input.py b/hed/models/base_input.py index cc8ff916..d548d50b 100644 --- a/hed/models/base_input.py +++ b/hed/models/base_input.py @@ -157,6 +157,14 @@ def onsets(self): if "onset" in self.columns: return self._dataframe["onset"] + @property + def needs_sorting(self): + """Returns True if this both has an onset column, and it needs sorting.""" + onsets = self.onsets + if onsets is not None: + onsets = onsets.astype(float) + return not onsets.is_monotonic_increasing + @property def name(self): """ Name of the data. 
""" diff --git a/hed/models/df_util.py b/hed/models/df_util.py index 71bd4c76..7d16f97c 100644 --- a/hed/models/df_util.py +++ b/hed/models/df_util.py @@ -7,8 +7,7 @@ from hed.models.definition_dict import DefinitionDict -def get_assembled(tabular_file, sidecar, hed_schema, extra_def_dicts=None, join_columns=True, - shrink_defs=False, expand_defs=True): +def get_assembled(tabular_file, sidecar, hed_schema, extra_def_dicts=None, shrink_defs=False, expand_defs=True): """ Create an array of assembled HedString objects (or list of these) of the same length as tabular file with. Args: @@ -20,8 +19,6 @@ def get_assembled(tabular_file, sidecar, hed_schema, extra_def_dicts=None, join_ If str, will attempt to load as a version if it doesn't have a valid extension. extra_def_dicts: list of DefinitionDict, optional Any extra DefinitionDict objects to use when parsing the HED tags. - join_columns: bool - If True, join all HED columns into one. shrink_defs: bool Shrink any def-expand tags found expand_defs: bool @@ -41,19 +38,12 @@ def get_assembled(tabular_file, sidecar, hed_schema, extra_def_dicts=None, join_ if sidecar: def_dict = sidecar.get_def_dict(hed_schema=hed_schema, extra_def_dicts=extra_def_dicts) - if join_columns: - if expand_defs: - return [HedString(x, hed_schema, def_dict).expand_defs() for x in tabular_file.series_a], def_dict - elif shrink_defs: - return [HedString(x, hed_schema, def_dict).shrink_defs() for x in tabular_file.series_a], def_dict - else: - return [HedString(x, hed_schema, def_dict) for x in tabular_file.series_a], def_dict + if expand_defs: + return [HedString(x, hed_schema, def_dict).expand_defs() for x in tabular_file.series_a], def_dict + elif shrink_defs: + return [HedString(x, hed_schema, def_dict).shrink_defs() for x in tabular_file.series_a], def_dict else: - return [[HedString(x, hed_schema, def_dict).expand_defs() if expand_defs - else HedString(x, hed_schema, def_dict).shrink_defs() if shrink_defs - else HedString(x, hed_schema, 
def_dict) - for x in text_file_row] for text_file_row in tabular_file.dataframe_a.itertuples(index=False)], \ - def_dict + return [HedString(x, hed_schema, def_dict) for x in tabular_file.series_a], def_dict def convert_to_form(df, hed_schema, tag_form, columns=None): @@ -151,3 +141,22 @@ def process_def_expands(hed_strings, hed_schema, known_defs=None, ambiguous_defs from hed.models.def_expand_gather import DefExpandGatherer def_gatherer = DefExpandGatherer(hed_schema, known_defs, ambiguous_defs) return def_gatherer.process_def_expands(hed_strings) + + +def sort_dataframe_by_onsets(df): + """ Gather def-expand tags in the strings/compare with known definitions to find any differences + + Parameters: + df(pd.Dataframe): Dataframe to sort + Returns: + The sorted dataframe, or the original dataframe if it didn't have an onset column. + """ + if "onset" in df.columns: + # Create a copy and sort by onsets as floats(if needed), but continue to keep the string version. + df_copy = df.copy() + df_copy['_temp_onset_sort'] = df_copy['onset'].astype(float) + df_copy.sort_values(by='_temp_onset_sort', inplace=True) + df_copy.drop(columns=['_temp_onset_sort'], inplace=True) + + return df_copy + return df diff --git a/hed/models/expression_parser.py b/hed/models/expression_parser.py deleted file mode 100644 index 76309819..00000000 --- a/hed/models/expression_parser.py +++ /dev/null @@ -1,485 +0,0 @@ -""" Holder for and manipulation of search results. """ -import re - - -class SearchResult: - """ Holder for and manipulation of search results. 
""" - def __init__(self, group, tag): - self.group = group - # todo: rename tag: children - if not isinstance(tag, list): - new_tags = [tag] - else: - new_tags = tag.copy() - self.tags = new_tags - - def __eq__(self, other): - if isinstance(other, SearchResult): - return self.group == other.group - return other == self.group - - def merge_result(self, other): - # Returns a new - new_tags = self.tags.copy() - for tag in other.tags: - if any(tag is this_tag for this_tag in self.tags): - continue - new_tags.append(tag) - new_tags.sort(key=lambda x: str(x)) - - if self.group != other.group: - raise ValueError("Internal error") - return SearchResult(self.group, new_tags) - - def has_same_tags(self, other): - if self.group != other.group: - return False - - if len(self.tags) != len(other.tags): - return False - - return all(tag is tag2 for tag, tag2 in zip(self.tags, other.tags)) - - def __str__(self): - return str(self.group) + " Tags: " + "---".join([str(tag) for tag in self.tags]) - - def get_tags_only(self): - from hed import HedTag - return [tag for tag in self.tags if isinstance(tag, HedTag)] - - def get_groups_only(self): - from hed import HedTag - return [tag for tag in self.tags if not isinstance(tag, HedTag)] - - -class Token: - And = 0 - Tag = 1 - DescendantGroup = 4 - DescendantGroupEnd = 5 - Or = 6 - LogicalGroup = 7 - LogicalGroupEnd = 8 - LogicalNegation = 9 - Wildcard = 10 - ExactMatch = 11 - ExactMatchEnd = 12 - ExactMatchOptional = 14 - NotInLine = 13 # Not currently a token. In development and may become one. 
- - def __init__(self, text): - tokens = { - ",": Token.And, - "and": Token.And, - "or": Token.Or, - "[": Token.DescendantGroup, - "]": Token.DescendantGroupEnd, - "(": Token.LogicalGroup, - ")": Token.LogicalGroupEnd, - "~": Token.LogicalNegation, - "?": Token.Wildcard, # Any tag or group - "??": Token.Wildcard, # Any tag - "???": Token.Wildcard, # Any Group - "{": Token.ExactMatch, # Nothing else - "}": Token.ExactMatchEnd, # Nothing else - ":": Token.ExactMatchOptional, - "@": Token.NotInLine - } - self.kind = tokens.get(text, Token.Tag) - self.text = text - - def __str__(self): - return self.text - - def __eq__(self, other): - if self.kind == other: - return True - return False - - -class Expression: - def __init__(self, token, left=None, right=None): - self.left = left - self.right = right - self.token = token - self._match_mode = "/" in token.text - self._must_not_be_in_line = False - if token.text.startswith("@"): - self._must_not_be_in_line = True - token.text = token.text[1:] - if token.text.startswith('"') and token.text.endswith('"') and len(token.text) > 2: - self._match_mode = 1 - token.text = token.text[1:-1] - if "*" in token.text: - self._match_mode = 2 - token.text = token.text.replace("*", "") - - def _get_parent_groups(self, search_results): - found_parent_groups = [] - if search_results: - for group in search_results: - if not group.group.is_group: - continue - if group.group._parent: - found_parent_groups.append(SearchResult(group.group._parent, group.group)) - - return found_parent_groups - - def __str__(self): - output_str = "" - if self.left: - output_str += str(self.left) - output_str += " " + str(self.token) - if self.right: - output_str += str(self.right) - return output_str - - def handle_expr(self, hed_group, exact=False): - if self._match_mode == 2: - groups_found = hed_group.find_wildcard_tags([self.token.text], recursive=True, include_groups=2) - elif self._match_mode: - groups_found = hed_group.find_exact_tags([self.token.text], 
recursive=True, include_groups=2) - else: - groups_found = hed_group.find_tags_with_term(self.token.text, recursive=True, include_groups=2) - - if self._must_not_be_in_line: - # If we found this, and it cannot be in the line. - if groups_found: - groups_found = [] - else: - groups_found = [([], group) for group in hed_group.get_all_groups()] - - # If we're checking for all groups, also need to add parents. - if exact: - all_found_groups = [SearchResult(group, tag) for tag, group in groups_found] - else: - all_found_groups = [] - for tag, group in groups_found: - while group: - all_found_groups.append(SearchResult(group, tag)) - # This behavior makes it eat higher level groups at higher levels - tag = group - group = group._parent - return all_found_groups - - -class ExpressionAnd(Expression): - def handle_expr(self, hed_group, exact=False): - groups1 = self.left.handle_expr(hed_group, exact=exact) - if not groups1: - return groups1 - groups2 = self.right.handle_expr(hed_group, exact=exact) - - return self.merge_groups(groups1, groups2) - - @staticmethod - def merge_groups(groups1, groups2): - return_list = [] - for group in groups1: - for other_group in groups2: - if group.group is other_group.group: - # At this point any shared tags between the two groups invalidates it. 
- if any(tag is tag2 and tag is not None for tag in group.tags for tag2 in other_group.tags): - continue - merged_result = group.merge_result(other_group) - - dont_add = False - # This is trash and slow - for finalized_value in return_list: - if merged_result.has_same_tags(finalized_value): - dont_add = True - break - if dont_add: - continue - return_list.append(merged_result) - - return return_list - - def __str__(self): - output_str = "(" - if self.left: - output_str += str(self.left) - output_str += " " + str(self.token) - if self.right: - output_str += str(self.right) - output_str += ")" - return output_str - - -class ExpressionWildcardNew(Expression): - def handle_expr(self, hed_group, exact=False): - groups_found = [] - if self.token.text == "?": - # Any tag or group - groups_searching = hed_group.get_all_groups() - for group in groups_searching: - for child in group.children: - groups_found.append((child, group)) - elif self.token.text == "??": - groups_searching = hed_group.get_all_groups() - for group in groups_searching: - for child in group.tags(): - groups_found.append((child, group)) - elif self.token.text == "???": - # Any group - groups_searching = hed_group.get_all_groups() - for group in groups_searching: - for child in group.groups(): - groups_found.append((child, group)) - - # Wildcards are only found in containing groups. I believe this is correct. - # todo: Is this code still needed for this kind of wildcard? We already are registering every group, just not - # every group at every level. 
- all_found_groups = [SearchResult(group, tag) for tag, group in groups_found] - return all_found_groups - - -class ExpressionOr(Expression): - def handle_expr(self, hed_group, exact=False): - groups1 = self.left.handle_expr(hed_group, exact=exact) - # Don't early out as we need to gather all groups in case tags appear more than once etc - groups2 = self.right.handle_expr(hed_group, exact=exact) - # todo: optimize this eventually - # Filter out duplicates - duplicates = [] - for group in groups1: - for other_group in groups2: - if group.has_same_tags(other_group): - duplicates.append(group) - - groups1 = [group for group in groups1 if not any(other_group is group for other_group in duplicates)] - - return groups1 + groups2 - - def __str__(self): - output_str = "(" - if self.left: - output_str += str(self.left) - output_str += " " + str(self.token) - if self.right: - output_str += str(self.right) - output_str += ")" - return output_str - - -class ExpressionNegation(Expression): - def handle_expr(self, hed_group, exact=False): - found_groups = self.right.handle_expr(hed_group, exact=exact) - - # Todo: this may need more thought with respects to wildcards and negation - # negated_groups = [group for group in hed_group.get_all_groups() if group not in groups] - # This simpler version works on python >= 3.9 - # negated_groups = [SearchResult(group, []) for group in hed_group.get_all_groups() if group not in groups] - # Python 3.7/8 compatible version. 
- negated_groups = [SearchResult(group, []) for group in hed_group.get_all_groups() - if not any(group is found_group.group for found_group in found_groups)] - - return negated_groups - - -class ExpressionDescendantGroup(Expression): - def handle_expr(self, hed_group, exact=False): - found_groups = self.right.handle_expr(hed_group) - found_parent_groups = self._get_parent_groups(found_groups) - return found_parent_groups - - -class ExpressionExactMatch(Expression): - def __init__(self, token, left=None, right=None): - super().__init__(token, left, right) - self.optional = "any" - - def _filter_exact_matches(self, search_results): - filtered_list = [] - for group in search_results: - if len(group.group.children) == len(group.tags): - filtered_list.append(group) - - return filtered_list - - def handle_expr(self, hed_group, exact=False): - found_groups = self.right.handle_expr(hed_group, exact=True) - if self.optional == "any": - return self._get_parent_groups(found_groups) - - filtered_list = self._filter_exact_matches(found_groups) - if filtered_list: - return self._get_parent_groups(filtered_list) - - # Basically if we don't have an exact match above, do the more complex matching including optional - if self.left: - optional_groups = self.left.handle_expr(hed_group, exact=True) - found_groups = ExpressionAnd.merge_groups(found_groups, optional_groups) - - filtered_list = self._filter_exact_matches(found_groups) - if filtered_list: - return self._get_parent_groups(filtered_list) - - return [] - - -class QueryParser: - """Parse a search expression into a form than can be used to search a hed string.""" - - def __init__(self, expression_string): - """Compiles a QueryParser for a particular expression, so it can be used to search hed strings. 
- - Basic Input Examples: - - 'Event' - Finds any strings with Event, or a descendent tag of Event such as Sensory-event - - 'Event and Action' - Find any strings with Event and Action, including descendant tags - - 'Event or Action' - Same as above, but it has either - - '"Event"' - Finds the Event tag, but not any descendent tags - - `Def/DefName/*` - Find Def/DefName instances with placeholders, regardless of the value of the placeholder - - 'Eve*' - Find any short tags that begin with Eve*, such as Event, but not Sensory-event - - '[Event and Action]' - Find a group that contains both Event and Action(at any level) - - '{Event and Action}' - Find a group with Event And Action at the same level. - - '{Event and Action:}' - Find a group with Event And Action at the same level, and nothing else - - '{Event and Action:Agent}' - Find a group with Event And Action at the same level, and optionally an Agent tag. - - Practical Complex Example: - - {(Onset or Offset), (Def or {Def-expand}): ???} - A group with an onset tag, - a def tag or def-expand group, and an optional wildcard group - - Parameters: - expression_string(str): The query string - """ - self.tokens = [] - self.at_token = -1 - self.tree = self._parse(expression_string.lower()) - self._org_string = expression_string - - def __str__(self): - return str(self.tree) - - def _get_next_token(self): - self.at_token += 1 - if self.at_token >= len(self.tokens): - raise ValueError("Parse error in get next token") - return self.tokens[self.at_token] - - def _next_token_is(self, kinds): - if self.at_token + 1 >= len(self.tokens): - return None - if self.tokens[self.at_token + 1].kind in kinds: - return self._get_next_token() - return None - - def current_token(self): - if self.at_token + 1 >= len(self.tokens): - return None - return self.tokens[self.at_token].text - - def _handle_and_op(self): - expr = self._handle_negation() - next_token = self._next_token_is([Token.And]) - while next_token: - right = 
self._handle_negation() - if next_token.kind == Token.And: - expr = ExpressionAnd(next_token, expr, right) - next_token = self._next_token_is([Token.And]) - return expr - - def _handle_or_op(self): - expr = self._handle_and_op() # Note: calling _handle_and_op here - next_token = self._next_token_is([Token.Or]) - while next_token: - right = self._handle_and_op() # Note: calling _handle_and_op here - if next_token.kind == Token.Or: - expr = ExpressionOr(next_token, expr, right) - next_token = self._next_token_is([Token.Or]) - return expr - - def _handle_negation(self): - next_token = self._next_token_is([Token.LogicalNegation]) - if next_token == Token.LogicalNegation: - interior = self._handle_grouping_op() - if "?" in str(interior): - raise ValueError("Cannot negate wildcards, or expressions that contain wildcards." - "Use {required_expression : optional_expression}.") - expr = ExpressionNegation(next_token, right=interior) - return expr - else: - return self._handle_grouping_op() - - def _handle_grouping_op(self): - next_token = self._next_token_is( - [Token.LogicalGroup, Token.DescendantGroup, Token.ExactMatch]) - if next_token == Token.LogicalGroup: - expr = self._handle_or_op() - next_token = self._next_token_is([Token.LogicalGroupEnd]) - if next_token != Token.LogicalGroupEnd: - raise ValueError("Parse error: Missing closing paren") - elif next_token == Token.DescendantGroup: - interior = self._handle_or_op() - expr = ExpressionDescendantGroup(next_token, right=interior) - next_token = self._next_token_is([Token.DescendantGroupEnd]) - if next_token != Token.DescendantGroupEnd: - raise ValueError("Parse error: Missing closing square bracket") - elif next_token == Token.ExactMatch: - interior = self._handle_or_op() - expr = ExpressionExactMatch(next_token, right=interior) - next_token = self._next_token_is([Token.ExactMatchEnd, Token.ExactMatchOptional]) - if next_token == Token.ExactMatchOptional: - # We have an optional portion - this needs to now be an exact 
match - expr.optional = "none" - next_token = self._next_token_is([Token.ExactMatchEnd]) - if next_token != Token.ExactMatchEnd: - optional_portion = self._handle_or_op() - expr.left = optional_portion - next_token = self._next_token_is([Token.ExactMatchEnd]) - if "~" in str(expr): - raise ValueError("Cannot use negation in exact matching groups," - " as it's not clear what is being matched.\n" - "{thing and ~(expression)} is allowed.") - - if next_token is None: - raise ValueError("Parse error: Missing closing curly bracket") - else: - next_token = self._get_next_token() - if next_token and next_token.kind == Token.Wildcard: - expr = ExpressionWildcardNew(next_token) - elif next_token: - expr = Expression(next_token) - else: - expr = None - - return expr - - def _parse(self, expression_string): - self.tokens = self._tokenize(expression_string) - - expr = self._handle_or_op() - - if self.at_token + 1 != len(self.tokens): - raise ValueError("Parse error in search string") - - return expr - - def _tokenize(self, expression_string): - grouping_re = r"\[\[|\[|\]\]|\]|}|{|:" - paren_re = r"\)|\(|~" - word_re = r"\?+|\band\b|\bor\b|,|[\"_\-a-zA-Z0-9/.^#\*@]+" - re_string = fr"({grouping_re}|{paren_re}|{word_re})" - token_re = re.compile(re_string) - - tokens = token_re.findall(expression_string) - tokens = [Token(token) for token in tokens] - - return tokens - - def search(self, hed_string_obj): - current_node = self.tree - - result = current_node.handle_expr(hed_string_obj) - return result diff --git a/hed/models/query_expressions.py b/hed/models/query_expressions.py new file mode 100644 index 00000000..163cee4b --- /dev/null +++ b/hed/models/query_expressions.py @@ -0,0 +1,222 @@ +from hed.models.query_util import SearchResult + + +class Expression: + def __init__(self, token, left=None, right=None): + self.left = left + self.right = right + self.token = token + self._match_mode = "/" in token.text + self._must_not_be_in_line = False + if token.text.startswith("@"): + 
self._must_not_be_in_line = True + token.text = token.text[1:] + if token.text.startswith('"') and token.text.endswith('"') and len(token.text) > 2: + self._match_mode = 1 + token.text = token.text[1:-1] + if "*" in token.text: + self._match_mode = 2 + token.text = token.text.replace("*", "") + + def _get_parent_groups(self, search_results): + found_parent_groups = [] + if search_results: + for group in search_results: + if not group.group.is_group: + continue + if group.group._parent: + found_parent_groups.append(SearchResult(group.group._parent, group.group)) + + return found_parent_groups + + def __str__(self): + output_str = "" + if self.left: + output_str += str(self.left) + output_str += " " + str(self.token) + if self.right: + output_str += str(self.right) + return output_str + + def handle_expr(self, hed_group, exact=False): + if self._match_mode == 2: + groups_found = hed_group.find_wildcard_tags([self.token.text], recursive=True, include_groups=2) + elif self._match_mode: + groups_found = hed_group.find_exact_tags([self.token.text], recursive=True, include_groups=2) + else: + groups_found = hed_group.find_tags_with_term(self.token.text, recursive=True, include_groups=2) + + if self._must_not_be_in_line: + # If we found this, and it cannot be in the line. + if groups_found: + groups_found = [] + else: + groups_found = [([], group) for group in hed_group.get_all_groups()] + + # If we're checking for all groups, also need to add parents. 
+ if exact: + all_found_groups = [SearchResult(group, tag) for tag, group in groups_found] + else: + all_found_groups = [] + for tag, group in groups_found: + while group: + all_found_groups.append(SearchResult(group, tag)) + # This behavior makes it eat higher level groups at higher levels + tag = group + group = group._parent + return all_found_groups + + +class ExpressionAnd(Expression): + def handle_expr(self, hed_group, exact=False): + groups1 = self.left.handle_expr(hed_group, exact=exact) + if not groups1: + return groups1 + groups2 = self.right.handle_expr(hed_group, exact=exact) + + return self.merge_groups(groups1, groups2) + + @staticmethod + def merge_groups(groups1, groups2): + return_list = [] + for group in groups1: + for other_group in groups2: + if group.group is other_group.group: + # At this point any shared tags between the two groups invalidates it. + if any(tag is tag2 and tag is not None for tag in group.tags for tag2 in other_group.tags): + continue + merged_result = group.merge_result(other_group) + + dont_add = False + # This is trash and slow + for finalized_value in return_list: + if merged_result.has_same_tags(finalized_value): + dont_add = True + break + if dont_add: + continue + return_list.append(merged_result) + + return return_list + + def __str__(self): + output_str = "(" + if self.left: + output_str += str(self.left) + output_str += " " + str(self.token) + if self.right: + output_str += str(self.right) + output_str += ")" + return output_str + + +class ExpressionWildcardNew(Expression): + def handle_expr(self, hed_group, exact=False): + groups_found = [] + if self.token.text == "?": + # Any tag or group + groups_searching = hed_group.get_all_groups() + for group in groups_searching: + for child in group.children: + groups_found.append((child, group)) + elif self.token.text == "??": + groups_searching = hed_group.get_all_groups() + for group in groups_searching: + for child in group.tags(): + groups_found.append((child, group)) + 
elif self.token.text == "???": + # Any group + groups_searching = hed_group.get_all_groups() + for group in groups_searching: + for child in group.groups(): + groups_found.append((child, group)) + + # Wildcards are only found in containing groups. I believe this is correct. + # todo: Is this code still needed for this kind of wildcard? We already are registering every group, just not + # every group at every level. + all_found_groups = [SearchResult(group, tag) for tag, group in groups_found] + return all_found_groups + + +class ExpressionOr(Expression): + def handle_expr(self, hed_group, exact=False): + groups1 = self.left.handle_expr(hed_group, exact=exact) + # Don't early out as we need to gather all groups in case tags appear more than once etc + groups2 = self.right.handle_expr(hed_group, exact=exact) + # todo: optimize this eventually + # Filter out duplicates + duplicates = [] + for group in groups1: + for other_group in groups2: + if group.has_same_tags(other_group): + duplicates.append(group) + + groups1 = [group for group in groups1 if not any(other_group is group for other_group in duplicates)] + + return groups1 + groups2 + + def __str__(self): + output_str = "(" + if self.left: + output_str += str(self.left) + output_str += " " + str(self.token) + if self.right: + output_str += str(self.right) + output_str += ")" + return output_str + + +class ExpressionNegation(Expression): + def handle_expr(self, hed_group, exact=False): + found_groups = self.right.handle_expr(hed_group, exact=exact) + + # Todo: this may need more thought with respects to wildcards and negation + # negated_groups = [group for group in hed_group.get_all_groups() if group not in groups] + # This simpler version works on python >= 3.9 + # negated_groups = [SearchResult(group, []) for group in hed_group.get_all_groups() if group not in groups] + # Python 3.7/8 compatible version. 
+ negated_groups = [SearchResult(group, []) for group in hed_group.get_all_groups() + if not any(group is found_group.group for found_group in found_groups)] + + return negated_groups + + +class ExpressionDescendantGroup(Expression): + def handle_expr(self, hed_group, exact=False): + found_groups = self.right.handle_expr(hed_group) + found_parent_groups = self._get_parent_groups(found_groups) + return found_parent_groups + + +class ExpressionExactMatch(Expression): + def __init__(self, token, left=None, right=None): + super().__init__(token, left, right) + self.optional = "any" + + def _filter_exact_matches(self, search_results): + filtered_list = [] + for group in search_results: + if len(group.group.children) == len(group.tags): + filtered_list.append(group) + + return filtered_list + + def handle_expr(self, hed_group, exact=False): + found_groups = self.right.handle_expr(hed_group, exact=True) + if self.optional == "any": + return self._get_parent_groups(found_groups) + + filtered_list = self._filter_exact_matches(found_groups) + if filtered_list: + return self._get_parent_groups(filtered_list) + + # Basically if we don't have an exact match above, do the more complex matching including optional + if self.left: + optional_groups = self.left.handle_expr(hed_group, exact=True) + found_groups = ExpressionAnd.merge_groups(found_groups, optional_groups) + + filtered_list = self._filter_exact_matches(found_groups) + if filtered_list: + return self._get_parent_groups(filtered_list) + + return [] diff --git a/hed/models/query_handler.py b/hed/models/query_handler.py new file mode 100644 index 00000000..c0a38bad --- /dev/null +++ b/hed/models/query_handler.py @@ -0,0 +1,175 @@ +""" Holder for and manipulation of search results. 
""" +import re + +from hed.models.query_expressions import Expression, ExpressionAnd, ExpressionWildcardNew, ExpressionOr, \ + ExpressionNegation, ExpressionDescendantGroup, ExpressionExactMatch +from hed.models.query_util import Token + + +class QueryHandler: + """Parse a search expression into a form than can be used to search a hed string.""" + + def __init__(self, expression_string): + """Compiles a QueryHandler for a particular expression, so it can be used to search hed strings. + + Basic Input Examples: + + 'Event' - Finds any strings with Event, or a descendent tag of Event such as Sensory-event + + 'Event and Action' - Find any strings with Event and Action, including descendant tags + + 'Event or Action' - Same as above, but it has either + + '"Event"' - Finds the Event tag, but not any descendent tags + + `Def/DefName/*` - Find Def/DefName instances with placeholders, regardless of the value of the placeholder + + 'Eve*' - Find any short tags that begin with Eve*, such as Event, but not Sensory-event + + '[Event and Action]' - Find a group that contains both Event and Action(at any level) + + '{Event and Action}' - Find a group with Event And Action at the same level. + + '{Event and Action:}' - Find a group with Event And Action at the same level, and nothing else + + '{Event and Action:Agent}' - Find a group with Event And Action at the same level, and optionally an Agent tag. 
+ + Practical Complex Example: + + {(Onset or Offset), (Def or {Def-expand}): ???} - A group with an onset tag, + a def tag or def-expand group, and an optional wildcard group + + Parameters: + expression_string(str): The query string + """ + self.tokens = [] + self.at_token = -1 + self.tree = self._parse(expression_string.lower()) + self._org_string = expression_string + + def __str__(self): + return str(self.tree) + + def _get_next_token(self): + self.at_token += 1 + if self.at_token >= len(self.tokens): + raise ValueError("Parse error in get next token") + return self.tokens[self.at_token] + + def _next_token_is(self, kinds): + if self.at_token + 1 >= len(self.tokens): + return None + if self.tokens[self.at_token + 1].kind in kinds: + return self._get_next_token() + return None + + def current_token(self): + if self.at_token + 1 >= len(self.tokens): + return None + return self.tokens[self.at_token].text + + def _handle_and_op(self): + expr = self._handle_negation() + next_token = self._next_token_is([Token.And]) + while next_token: + right = self._handle_negation() + if next_token.kind == Token.And: + expr = ExpressionAnd(next_token, expr, right) + next_token = self._next_token_is([Token.And]) + return expr + + def _handle_or_op(self): + expr = self._handle_and_op() # Note: calling _handle_and_op here + next_token = self._next_token_is([Token.Or]) + while next_token: + right = self._handle_and_op() # Note: calling _handle_and_op here + if next_token.kind == Token.Or: + expr = ExpressionOr(next_token, expr, right) + next_token = self._next_token_is([Token.Or]) + return expr + + def _handle_negation(self): + next_token = self._next_token_is([Token.LogicalNegation]) + if next_token == Token.LogicalNegation: + interior = self._handle_grouping_op() + if "?" in str(interior): + raise ValueError("Cannot negate wildcards, or expressions that contain wildcards." 
+ "Use {required_expression : optional_expression}.") + expr = ExpressionNegation(next_token, right=interior) + return expr + else: + return self._handle_grouping_op() + + def _handle_grouping_op(self): + next_token = self._next_token_is( + [Token.LogicalGroup, Token.DescendantGroup, Token.ExactMatch]) + if next_token == Token.LogicalGroup: + expr = self._handle_or_op() + next_token = self._next_token_is([Token.LogicalGroupEnd]) + if next_token != Token.LogicalGroupEnd: + raise ValueError("Parse error: Missing closing paren") + elif next_token == Token.DescendantGroup: + interior = self._handle_or_op() + expr = ExpressionDescendantGroup(next_token, right=interior) + next_token = self._next_token_is([Token.DescendantGroupEnd]) + if next_token != Token.DescendantGroupEnd: + raise ValueError("Parse error: Missing closing square bracket") + elif next_token == Token.ExactMatch: + interior = self._handle_or_op() + expr = ExpressionExactMatch(next_token, right=interior) + next_token = self._next_token_is([Token.ExactMatchEnd, Token.ExactMatchOptional]) + if next_token == Token.ExactMatchOptional: + # We have an optional portion - this needs to now be an exact match + expr.optional = "none" + next_token = self._next_token_is([Token.ExactMatchEnd]) + if next_token != Token.ExactMatchEnd: + optional_portion = self._handle_or_op() + expr.left = optional_portion + next_token = self._next_token_is([Token.ExactMatchEnd]) + if "~" in str(expr): + raise ValueError("Cannot use negation in exact matching groups," + " as it's not clear what is being matched.\n" + "{thing and ~(expression)} is allowed.") + + if next_token is None: + raise ValueError("Parse error: Missing closing curly bracket") + else: + next_token = self._get_next_token() + if next_token and next_token.kind == Token.Wildcard: + expr = ExpressionWildcardNew(next_token) + elif next_token: + expr = Expression(next_token) + else: + expr = None + + return expr + + def _parse(self, expression_string): + self.tokens = 
self._tokenize(expression_string) + + expr = self._handle_or_op() + + if self.at_token + 1 != len(self.tokens): + raise ValueError("Parse error in search string") + + return expr + + def _tokenize(self, expression_string): + grouping_re = r"\[\[|\[|\]\]|\]|}|{|:" + paren_re = r"\)|\(|~" + word_re = r"\?+|\band\b|\bor\b|,|[\"_\-a-zA-Z0-9/.^#\*@]+" + re_string = fr"({grouping_re}|{paren_re}|{word_re})" + token_re = re.compile(re_string) + + tokens = token_re.findall(expression_string) + tokens = [Token(token) for token in tokens] + + return tokens + + def search(self, hed_string_obj): + current_node = self.tree + + result = current_node.handle_expr(hed_string_obj) + return result + + diff --git a/hed/models/query_service.py b/hed/models/query_service.py new file mode 100644 index 00000000..c197c683 --- /dev/null +++ b/hed/models/query_service.py @@ -0,0 +1,61 @@ +import pandas as pd + +from hed.models import QueryHandler + + +def get_query_handlers(queries, query_names=None): + """ Returns a list of query handlers and names + + Parameters: + queries (list): A list of query strings or QueryHandler objects + query_names (list): A list of column names for results of queries. If missing --- query_1, query_2, etc. + + Returns: + DataFrame - containing the search strings + + :raises ValueError: + - If query names are invalid or duplicated. 
def get_query_handlers(queries, query_names=None):
    """ Return a list of query handlers and query names.

    Parameters:
        queries (list): A list of query strings or QueryHandler objects.
        query_names (list or None): A list of column names for results of queries.
            If None or empty, names query_0, query_1, ... are generated.

    Returns:
        list: QueryHandler objects corresponding to the queries.
        list: The query names (generated when not supplied).

    :raises ValueError:
        - If query names are invalid or duplicated, or a query is empty or cannot be parsed.

    """
    expression_parsers = []
    if not query_names:
        query_names = [f"query_{index}" for index in range(len(queries))]
    elif len(queries) != len(query_names):
        raise ValueError("QueryNamesLengthBad",
                         f"The query_names length {len(query_names)} must be empty or equal "
                         f"to the queries length {len(queries)}.")
    elif len(set(query_names)) != len(query_names):
        raise ValueError("DuplicateQueryNames", f"The query names {str(query_names)} list has duplicates")
    for index, query in enumerate(queries):
        if not query:
            raise ValueError("BadQuery", f"Query [{index}]: {query} cannot be empty")
        elif isinstance(query, str):
            try:
                next_query = QueryHandler(query)
            except Exception:
                raise ValueError("BadQuery", f"Query [{index}]: {query} cannot be parsed")
        else:
            # Already-compiled QueryHandler objects pass through unchanged, as documented.
            next_query = query
        expression_parsers.append(next_query)
    return expression_parsers, query_names


def search_strings(hed_strings, queries, query_names):
    """ Return a DataFrame of 0/1 factors based on results of queries.

    Parameters:
        hed_strings (list): A list of HedString objects (empty entries or None entries are 0's).
        queries (list): A list of compiled QueryHandler objects.
        query_names (list): A list of column names for results of queries.

    Returns:
        DataFrame: The factor vectors; 1 where the corresponding query matched, else 0.
    """
    df_factors = pd.DataFrame(0, index=range(len(hed_strings)), columns=query_names)
    for parse_ind, parser in enumerate(queries):
        for index, next_item in enumerate(hed_strings):
            match = parser.search(next_item)
            if match:
                df_factors.at[index, query_names[parse_ind]] = 1
    return df_factors
""" + def __init__(self, group, tag): + self.group = group + # todo: rename tag: children + if not isinstance(tag, list): + new_tags = [tag] + else: + new_tags = tag.copy() + self.tags = new_tags + + def __eq__(self, other): + if isinstance(other, SearchResult): + return self.group == other.group + return other == self.group + + def merge_result(self, other): + # Returns a new + new_tags = self.tags.copy() + for tag in other.tags: + if any(tag is this_tag for this_tag in self.tags): + continue + new_tags.append(tag) + new_tags.sort(key=lambda x: str(x)) + + if self.group != other.group: + raise ValueError("Internal error") + return SearchResult(self.group, new_tags) + + def has_same_tags(self, other): + if self.group != other.group: + return False + + if len(self.tags) != len(other.tags): + return False + + return all(tag is tag2 for tag, tag2 in zip(self.tags, other.tags)) + + def __str__(self): + return str(self.group) + " Tags: " + "---".join([str(tag) for tag in self.tags]) + + def get_tags_only(self): + from hed import HedTag + return [tag for tag in self.tags if isinstance(tag, HedTag)] + + def get_groups_only(self): + from hed import HedTag + return [tag for tag in self.tags if not isinstance(tag, HedTag)] + + +class Token: + And = 0 + Tag = 1 + DescendantGroup = 4 + DescendantGroupEnd = 5 + Or = 6 + LogicalGroup = 7 + LogicalGroupEnd = 8 + LogicalNegation = 9 + Wildcard = 10 + ExactMatch = 11 + ExactMatchEnd = 12 + ExactMatchOptional = 14 + NotInLine = 13 # Not currently a token. In development and may become one. 
+ + def __init__(self, text): + tokens = { + ",": Token.And, + "and": Token.And, + "or": Token.Or, + "[": Token.DescendantGroup, + "]": Token.DescendantGroupEnd, + "(": Token.LogicalGroup, + ")": Token.LogicalGroupEnd, + "~": Token.LogicalNegation, + "?": Token.Wildcard, # Any tag or group + "??": Token.Wildcard, # Any tag + "???": Token.Wildcard, # Any Group + "{": Token.ExactMatch, # Nothing else + "}": Token.ExactMatchEnd, # Nothing else + ":": Token.ExactMatchOptional, + "@": Token.NotInLine + } + self.kind = tokens.get(text, Token.Tag) + self.text = text + + def __str__(self): + return self.text + + def __eq__(self, other): + if self.kind == other: + return True + return False diff --git a/hed/tools/__init__.py b/hed/tools/__init__.py index 435af03e..350a2497 100644 --- a/hed/tools/__init__.py +++ b/hed/tools/__init__.py @@ -46,9 +46,6 @@ from .analysis import annotation_util from .analysis.annotation_util import \ check_df_columns, extract_tags, generate_sidecar_entry, hed_to_df, df_to_hed, merge_hed_dict -from .analysis import analysis_util -from .analysis.analysis_util import assemble_hed -# from .analysis.analysis_util import search_tabular, get_assembled_strings from .remodeling.cli import run_remodel from .remodeling.cli import run_remodel_backup diff --git a/hed/tools/analysis/analysis_util.py b/hed/tools/analysis/analysis_util.py deleted file mode 100644 index ebca8acc..00000000 --- a/hed/tools/analysis/analysis_util.py +++ /dev/null @@ -1,230 +0,0 @@ -""" Utilities for assembly, analysis, and searching. """ - -import pandas as pd -from hed.models.tabular_input import TabularInput -from hed.tools.util.data_util import separate_values -from hed.models.hed_tag import HedTag -from hed.models.hed_group import HedGroup -from hed.models import df_util -from hed.models import QueryParser - - -def assemble_hed(data_input, sidecar, schema, columns_included=None, expand_defs=False): - """ Return assembled HED annotations in a dataframe. 
- - Parameters: - data_input (TabularInput): The tabular input file whose HED annotations are to be assembled. - sidecar (Sidecar): Sidecar with definitions. - schema (HedSchema): Hed schema. - columns_included (list or None): A list of additional column names to include. - If None, only the list of assembled tags is included. - expand_defs (bool): If True, definitions are expanded when the events are assembled. - - Returns: - DataFrame or None: A DataFrame with the assembled events. - dict: A dictionary with definition names as keys and definition content strings as values. - """ - - eligible_columns, missing_columns = separate_values(list(data_input.dataframe.columns), columns_included) - hed_string_list = data_input.series_a - definitions = sidecar.get_def_dict(hed_schema=schema) - if expand_defs: - df_util.expand_defs(hed_string_list, schema, definitions) - # Keep in mind hed_string_list is now a Series. The rest of the function should probably - # also be modified - - # hed_obj_list, defs = get_assembled(data_input, sidecar, schema, extra_def_dicts=None, join_columns=True, - # shrink_defs=False, expand_defs=True) - # hed_string_list = [str(hed) for hed in hed_obj_list] - if not eligible_columns: - df = pd.DataFrame({"HED_assembled": hed_string_list}) - else: - df = data_input.dataframe[eligible_columns].copy(deep=True) - df['HED_assembled'] = hed_string_list - return df, definitions - - -def get_expression_parsers(queries, query_names=None): - """ Returns a list of expression parsers and query_names. - - Parameters: - queries (list): A list of query strings or QueryParser objects - query_names (list): A list of column names for results of queries. If missing --- query_1, query_2, etc. - - Returns: - DataFrame - containing the search strings - - :raises ValueError: - - If query names are invalid or duplicated. 
- - """ - expression_parsers = [] - if not query_names: - query_names = [f"query_{index}" for index in range(len(queries))] - elif len(queries) != len(query_names): - raise ValueError("QueryNamesLengthBad", - f"The query_names length {len(query_names)} must be empty or equal" + - f"to the queries length {len(queries)}.") - elif len(set(query_names)) != len(query_names): - raise ValueError("DuplicateQueryNames", f"The query names {str(query_names)} list has duplicates") - for index, query in enumerate(queries): - if not query: - raise ValueError("BadQuery", f"Query [{index}]: {query} cannot be empty") - elif isinstance(query, str): - try: - next_query = QueryParser(query) - except Exception: - raise ValueError("BadQuery", f"Query [{index}]: {query} cannot be parsed") - else: - next_query = query - expression_parsers.append(next_query) - return expression_parsers, query_names - - -def search_strings(hed_strings, queries, query_names=None): - """ Returns a DataFrame of factors based on results of queries. - - Parameters: - hed_strings (list): A list of HedString objects (empty entries or None entries are 0's) - queries (list): A list of query strings or QueryParser objects - query_names (list): A list of column names for results of queries. If missing --- query_1, query_2, etc. - - Returns: - DataFrame - containing the factor vectors with results of the queries - - :raises ValueError: - - If query names are invalid or duplicated. - - """ - - expression_parsers, query_names = get_expression_parsers(queries, query_names=query_names) - df_factors = pd.DataFrame(0, index=range(len(hed_strings)), columns=query_names) - for parse_ind, parser in enumerate(expression_parsers): - for index, next_item in enumerate(hed_strings): - match = parser.search(next_item) - if match: - df_factors.at[index, query_names[parse_ind]] = 1 - return df_factors - -# def get_assembled_strings(table, hed_schema=None, expand_defs=False): -# """ Return HED string objects for a tabular file. 
-# -# Parameters: -# table (TabularInput): The input file to be searched. -# hed_schema (HedSchema or HedschemaGroup): If provided the HedStrings are converted to canonical form. -# expand_defs (bool): If True, definitions are expanded when the events are assembled. -# -# Returns: -# list: A list of HedString objects. -# -# """ -# hed_list = list(table.iter_dataframe(hed_ops=[hed_schema], return_string_only=True, -# expand_defs=expand_defs, remove_definitions=True)) -# return hed_list -# - -# def search_tabular(data_input, sidecar, hed_schema, query, extra_def_dicts=None, columns_included=None): -# """ Return a dataframe with results of query. -# -# Parameters: -# data_input (TabularInput): The tabular input file (e.g., events) to be searched. -# hed_schema (HedSchema or HedSchemaGroup): The schema(s) under which to make the query. -# query (str or list): The str query or list of string queries to make. -# columns_included (list or None): List of names of columns to include -# -# Returns: -# DataFrame or None: A DataFrame with the results of the query or None if no events satisfied the query. 
-# -# """ -# -# eligible_columns, missing_columns = separate_values(list(data_input.dataframe.columns), columns_included) -# hed_list, definitions = df_util.get_assembled(data_input, sidecar, hed_schema, extra_def_dicts=None, -# join_columns=True, -# shrink_defs=False, expand_defs=True) -# expression = QueryParser(query) -# hed_tags = [] -# row_numbers = [] -# for index, next_item in enumerate(hed_list): -# match = expression.search(next_item) -# if not match: -# continue -# hed_tags.append(next_item) -# row_numbers.append(index) -# -# if not row_numbers: -# df = None -# elif not eligible_columns: -# df = pd.DataFrame({'row_number': row_numbers, 'HED_assembled': hed_tags}) -# else: -# df = data_input.dataframe.iloc[row_numbers][eligible_columns].reset_index() -# df.rename(columns={'index': 'row_number'}) -# return df - - -# def remove_defs(hed_strings): -# """ This removes any def or Def-expand from a list of HedStrings. -# -# Parameters: -# hed_strings (list): A list of HedStrings -# -# Returns: -# list: A list of the removed Defs. -# -# """ -# def_groups = [[] for i in range(len(hed_strings))] -# for index, hed in enumerate(hed_strings): -# def_groups[index] = extract_defs(hed) -# return def_groups -# -# -# def extract_defs(hed_string_obj): -# """ This removes any def or Def-expand from a list of HedStrings. -# -# Parameters: -# hed_string_obj (HedString): A HedString -# -# Returns: -# list: A list of the removed Defs. -# -# Notes: -# - the hed_string_obj passed in no longer has definitions. 
-# -# """ -# to_remove = [] -# to_append = [] -# tuples = hed_string_obj.find_def_tags(recursive=True, include_groups=3) -# for tup in tuples: -# if len(tup[2].children) == 1: -# to_append.append(tup[0]) -# else: -# to_append.append(tup[2]) -# to_remove.append(tup[2]) -# hed_string_obj.remove(to_remove) -# return to_append - - -def hed_to_str(contents, remove_parentheses=False): - - if contents is None: - return '' - if isinstance(contents, str): - return contents - if isinstance(contents, HedTag): - return str(contents) - if isinstance(contents, list): - converted = [hed_to_str(element, remove_parentheses) for element in contents if element] - return ",".join(converted) - if not isinstance(contents, HedGroup): - raise TypeError("ContentsWrongClass", "OnsetGroup excepts contents that can be converted to string.") - if not remove_parentheses or len(contents.children) != 1: - return str(contents) - return _handle_remove(contents) - - -def _handle_remove(contents): - if contents.is_group or isinstance(contents.children[0], HedTag): - return str(contents.children[0]) - child = contents.children[0] - if child.is_group and len(child.children) == 1: - return str(child.children[0]) - return str(child) diff --git a/hed/tools/analysis/event_manager.py b/hed/tools/analysis/event_manager.py index c77dbd47..885be64b 100644 --- a/hed/tools/analysis/event_manager.py +++ b/hed/tools/analysis/event_manager.py @@ -48,7 +48,7 @@ def _create_event_list(self, input_data): """ hed_strings, def_dict = get_assembled(input_data, input_data._sidecar, self.hed_schema, - extra_def_dicts=None, join_columns=True, + extra_def_dicts=None, shrink_defs=True, expand_defs=False) onset_dict = {} # Temporary dictionary keeping track of temporal events that haven't ended yet. 
for event_index, hed in enumerate(hed_strings): diff --git a/hed/tools/remodeling/operations/factor_hed_tags_op.py b/hed/tools/remodeling/operations/factor_hed_tags_op.py index f99b961d..53a06635 100644 --- a/hed/tools/remodeling/operations/factor_hed_tags_op.py +++ b/hed/tools/remodeling/operations/factor_hed_tags_op.py @@ -6,8 +6,8 @@ from hed.tools.remodeling.operations.base_op import BaseOp from hed.models.tabular_input import TabularInput from hed.models.sidecar import Sidecar -from hed.models.df_util import get_assembled -from hed.tools.analysis.analysis_util import get_expression_parsers, search_strings +from hed.models.query_handler import QueryHandler +from hed.models.query_service import search_strings, get_query_handlers from hed.tools.analysis.event_manager import EventManager from hed.tools.analysis.hed_tag_manager import HedTagManager @@ -83,8 +83,8 @@ def __init__(self, parameters): self.remove_types = parameters.get('remove_types', []) self.expand_context = parameters.get('expand_context', True) self.replace_defs = parameters.get('replace_defs', True) - self.expression_parsers, self.query_names = get_expression_parsers(self.queries, - parameters.get('query_names', None)) + self.query_handlers, self.query_names = get_query_handlers(self.queries, + parameters.get('query_names', None)) def do_op(self, dispatcher, df, name, sidecar=None): """ Factor the column using HED tag queries. 
@@ -115,7 +115,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): tag_man = HedTagManager(EventManager(input_data, dispatcher.hed_schema), remove_types=self.remove_types) hed_objs = tag_man.get_hed_objs(include_context=self.expand_context, replace_defs=self.replace_defs) - df_factors = search_strings(hed_objs, self.expression_parsers, query_names=self.query_names) + df_factors = search_strings(hed_objs, self.query_handlers, query_names=self.query_names) if len(df_factors.columns) > 0: df_list.append(df_factors) df_new = pd.concat(df_list, axis=1) @@ -124,8 +124,15 @@ def do_op(self, dispatcher, df, name, sidecar=None): @staticmethod def validate_input_data(parameters): - queries = parameters.get("queries", None) - names = parameters.get("query_names", None) + queries = parameters.get("queries", []) + names = parameters.get("query_names", []) if names and queries and (len(names) != len(parameters["queries"])): return ["factor_hed_tags_op: query_names must be same length as queries."] - return [] + + issues = [] + for query in queries: + try: + QueryHandler(query) + except ValueError as ex: + issues.append(f"factor_hed_tags_op: Invalid query '{query}") + return issues diff --git a/hed/validator/spreadsheet_validator.py b/hed/validator/spreadsheet_validator.py index aad30283..28d0a3c3 100644 --- a/hed/validator/spreadsheet_validator.py +++ b/hed/validator/spreadsheet_validator.py @@ -1,3 +1,5 @@ +import copy + import pandas as pd from hed import BaseInput from hed.errors import ErrorHandler, ValidationErrors, ErrorContext @@ -7,6 +9,8 @@ from hed.errors.error_reporter import sort_issues, check_for_any_errors from hed.validator.onset_validator import OnsetValidator from hed.validator.hed_validator import HedValidator +from hed.models.df_util import sort_dataframe_by_onsets + PANDAS_COLUMN_PREFIX_TO_IGNORE = "Unnamed: " @@ -50,6 +54,12 @@ def validate(self, data, def_dicts=None, name=None, error_handler=None): if data.has_column_names: row_adj += 1 issues += 
self._validate_column_structure(data, error_handler, row_adj) + + if data.needs_sorting: + data_new = copy.deepcopy(data) + data_new._dataframe = sort_dataframe_by_onsets(data.dataframe) + issues += error_handler.format_error_with_context(ValidationErrors.ONSETS_OUT_OF_ORDER) + data = data_new onset_filtered = data.series_filtered df = data.dataframe_a @@ -69,7 +79,7 @@ def validate(self, data, def_dicts=None, name=None, error_handler=None): def _run_checks(self, hed_df, onset_filtered, error_handler, row_adj): issues = [] columns = list(hed_df.columns) - for row_number, text_file_row in enumerate(hed_df.itertuples(index=False)): + for row_number, text_file_row in hed_df.iterrows(): error_handler.push_error_context(ErrorContext.ROW, row_number + row_adj) row_strings = [] new_column_issues = [] diff --git a/tests/models/test_base_input.py b/tests/models/test_base_input.py index b74e97ab..0f1b5255 100644 --- a/tests/models/test_base_input.py +++ b/tests/models/test_base_input.py @@ -2,17 +2,20 @@ import unittest import os import shutil -from hed import Sidecar +from hed import Sidecar, load_schema_version from hed import BaseInput, TabularInput from hed.models.column_mapper import ColumnMapper from hed.models import DefinitionDict from hed import schema from hed import HedFileError +from hed.errors import ErrorContext, ValidationErrors + import pandas as pd import numpy as np + class Test(unittest.TestCase): @classmethod def setUpClass(cls): @@ -74,6 +77,58 @@ def test_invalid_input_type_dict(self): with self.assertRaises(HedFileError): BaseInput({'key': 'value'}) +class TestSortingByOnset(unittest.TestCase): + @staticmethod + def generate_test_dataframe(): + data = { + 'onset': [0.5, 1.0, 1.5, 2.0, 2.5], + 'HED': [ + 'Age/1', + 'Age/2', + 'Age/3', + 'NotATag', + 'Age/5' + ] + } + + df = pd.DataFrame(data) + + return df + + def test_needs_sort(self): + df = self.generate_test_dataframe() + opened_file = TabularInput(df) + self.assertFalse(opened_file.needs_sorting) + 
+ issues = opened_file.validate(load_schema_version("8.2.0")) + self.assertEqual(issues[1][ErrorContext.ROW], 5) + df.at[3, "onset"] = 1.5 + opened_file = TabularInput(df) + self.assertFalse(opened_file.needs_sorting) + + df.at[3, "onset"] = 1.0 + opened_file = TabularInput(df) + self.assertTrue(opened_file.needs_sorting) + issues = opened_file.validate(load_schema_version("8.2.0")) + # Should still report the same issue row despite needing sorting for validation + self.assertEqual(issues[1]['code'], ValidationErrors.ONSETS_OUT_OF_ORDER) + self.assertEqual(issues[2][ErrorContext.ROW], 5) + + def test_sort(self): + from hed.models.df_util import sort_dataframe_by_onsets + df = self.generate_test_dataframe() + df2 = sort_dataframe_by_onsets(df) + self.assertTrue(df.equals(df2)) + + df.at[3, "onset"] = 1.5 + df2 = sort_dataframe_by_onsets(df) + self.assertTrue(df.equals(df2)) + + df.at[3, "onset"] = 1.0 + df2 = sort_dataframe_by_onsets(df) + self.assertFalse(df.equals(df2)) + + class TestInsertColumns(unittest.TestCase): diff --git a/tests/models/test_expression_parser.py b/tests/models/test_query_handler.py similarity index 98% rename from tests/models/test_expression_parser.py rename to tests/models/test_query_handler.py index 5bdb71b7..0e33d631 100644 --- a/tests/models/test_expression_parser.py +++ b/tests/models/test_query_handler.py @@ -1,6 +1,6 @@ import unittest from hed.models.hed_string import HedString -from hed.models.expression_parser import QueryParser +from hed.models.query_handler import QueryHandler import os from hed import schema from hed import HedTag @@ -25,7 +25,7 @@ def setUpClass(cls): cls.hed_schema = schema.load_schema(hed_xml_file) def base_test(self, parse_expr, search_strings): - expression = QueryParser(parse_expr) + expression = QueryHandler(parse_expr) # print(f"Search Pattern: {expression._org_string} - {str(expression.tree)}") for string, expected_result in search_strings.items(): @@ -47,7 +47,7 @@ def 
test_broken_search_strings(self): ] for string in test_search_strings: with self.assertRaises(ValueError) as context: - QueryParser(string) + QueryHandler(string) self.assertTrue(context.exception.args[0]) def test_finding_tags(self): @@ -317,7 +317,7 @@ def test_exact_group_negation4(self): def test_exact_group_negation5(self): test_string = "{ ~a and b:}" with self.assertRaises(ValueError) as context: - QueryParser(test_string) + QueryHandler(test_string) self.assertTrue(context.exception.args[0]) def test_mixed_group_complex_split(self): @@ -653,13 +653,13 @@ def test_and_or(self): self.base_test("(a or b) and c", test_strings) def test_logical_negation(self): - expression = QueryParser("~a") + expression = QueryHandler("~a") hed_string = HedString("A", self.hed_schema) self.assertEqual(bool(expression.search(hed_string)), False) hed_string = HedString("B", self.hed_schema) self.assertEqual(bool(expression.search(hed_string)), True) - expression = QueryParser("~a and b") + expression = QueryHandler("~a and b") hed_string = HedString("A", self.hed_schema) self.assertEqual(bool(expression.search(hed_string)), False) hed_string = HedString("B", self.hed_schema) @@ -667,7 +667,7 @@ def test_logical_negation(self): hed_string = HedString("A, B", self.hed_schema) self.assertEqual(bool(expression.search(hed_string)), False) - expression = QueryParser("~( (a or b) and c)") + expression = QueryHandler("~( (a or b) and c)") hed_string = HedString("A", self.hed_schema) self.assertEqual(bool(expression.search(hed_string)), True) hed_string = HedString("B", self.hed_schema) diff --git a/tests/tools/analysis/test_analysis_util_assemble_hed.py b/tests/tools/analysis/test_analysis_util_assemble_hed.py deleted file mode 100644 index a7d2810c..00000000 --- a/tests/tools/analysis/test_analysis_util_assemble_hed.py +++ /dev/null @@ -1,123 +0,0 @@ -import os -import unittest -from pandas import DataFrame -from hed import schema as hedschema -from hed.models import Sidecar, 
TabularInput, DefinitionDict -from hed.models import df_util -from hed.tools.analysis.analysis_util import assemble_hed, search_strings - - -# noinspection PyBroadException -class Test(unittest.TestCase): - - @classmethod - def setUpClass(cls): - bids_root_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), - '../../data/bids_tests/eeg_ds003645s_hed')) - schema_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), - '../../data/schema_tests/HED8.2.0.xml')) - cls.bids_root_path = bids_root_path - json_path = os.path.realpath(os.path.join(bids_root_path, 'task-FacePerception_events.json')) - events_path = os.path.realpath(os.path.join(bids_root_path, - 'sub-002/eeg/sub-002_task-FacePerception_run-1_events.tsv')) - - schema = hedschema.load_schema(schema_path) - cls.schema = schema - sidecar1 = Sidecar(json_path, name='face_sub1_json') - cls.sidecar1 = sidecar1 - cls.input_data = TabularInput(events_path, sidecar=sidecar1, name="face_sub1_events") - cls.input_data_no_sidecar = TabularInput(events_path, name="face_sub1_events_no_sidecar") - - def test_assemble_hed_included_no_expand(self): - df1, dict1 = assemble_hed(self.input_data, self.sidecar1, self.schema, expand_defs=False, - columns_included=["onset", "duration", "event_type"]) - self.assertIsInstance(df1, DataFrame, "hed_assemble should return a dataframe when columns are included") - columns1 = list(df1.columns) - self.assertEqual(len(columns1), 4, - "assemble_hed should return the correct number of columns when columns are included ") - first_str1 = df1.iloc[0]['HED_assembled'] - self.assertNotEqual(first_str1.find('Def/'), -1, "assemble_hed with no def expand has Def tags") - self.assertEqual(first_str1.find('Def-expand'), -1, - "assemble_hed with no def expand does not have Def-expand tags") - self.assertIsInstance(dict1.defs, dict, "hed_assemble returns a dictionary of definitions") - self.assertEqual(len(dict1.defs), 17, "hed_assemble definition 
dictionary has the right number of elements.") - - def test_assemble_hed_included_expand(self): - df2, dict2 = assemble_hed(self.input_data, self.sidecar1, self.schema, expand_defs=True, - columns_included=["onset", "duration", "event_type"]) - first_str2 = df2.iloc[0]['HED_assembled'] - self.assertEqual(first_str2.find('Def/'), -1, "assemble_hed with def expand has no Def tag") - self.assertNotEqual(first_str2.find('Def-expand/'), -1, "assemble_hed with def expand has Def-expand tags") - - def test_assemble_hed_included_no_expand_bad_column(self): - df3, dict3 = assemble_hed(self.input_data, self.sidecar1, self.schema, expand_defs=True, - columns_included=["onset", "baloney", "duration", "event_type"]) - columns3 = list(df3.columns) - self.assertEqual(len(columns3), 4, - "assemble_hed should return the correct number of columns when bad columns are included ") - - def test_assemble_hed_included_expand_bad_column(self): - df3, dict3 = assemble_hed(self.input_data, self.sidecar1, self.schema, expand_defs=True, - columns_included=["onset", "baloney", "duration", "event_type"]) - columns3 = list(df3.columns) - self.assertEqual(len(columns3), 4, - "assemble_hed should return the correct number of columns when bad columns are included ") - - def test_assemble_hed_no_included_no_expand(self): - df1, dict1 = assemble_hed(self.input_data, self.sidecar1, self.schema, - columns_included=None, expand_defs=False) - self.assertIsInstance(df1, DataFrame, "hed_assemble returns a dataframe when no columns are included") - columns1 = list(df1.columns) - self.assertEqual(len(columns1), 1, - "assemble_hed returns only assembled strings when no columns include. 
") - first_str1 = df1.iloc[0]['HED_assembled'] - self.assertNotEqual(first_str1.find('Def/'), -1, "assemble_hed with no def expand has Def tags") - self.assertEqual(first_str1.find('Def-expand'), -1, - "assemble_hed with no def expand does not have Def-expand tags") - self.assertIsInstance(dict1, DefinitionDict, "hed_assemble returns a dictionary of definitions") - self.assertEqual(len(dict1.defs), 17, "hed_assemble definition dictionary has the right number of elements.") - - def test_assemble_hed_no_included_expand(self): - df2, dict2 = assemble_hed(self.input_data, self.sidecar1, self.schema, - columns_included=None, expand_defs=True) - first_str2 = df2.iloc[0]['HED_assembled'] - self.assertEqual(first_str2.find('Def/'), -1, "assemble_hed with def expand has no Def tag") - self.assertNotEqual(first_str2.find('Def-expand/'), -1, "assemble_hed with def expand has Def-expand tags") - - def test_assemble_hed_bad_column_no_expand(self): - df3, dict3 = assemble_hed(self.input_data, self.sidecar1, self.schema, - columns_included=["onset", "baloney", "duration", "event_type"], expand_defs=False) - columns3 = list(df3.columns) - self.assertEqual(len(columns3), 4, - "assemble_hed returns the correct number of columns when bad columns are included ") - first_str2 = df3.iloc[0]['HED_assembled'] - self.assertNotEqual(first_str2.find('Def/'), -1, "assemble_hed with def expand has no Def tag") - self.assertEqual(first_str2.find('Def-expand/'), -1, "assemble_hed with def expand has Def-expand tags") - - def test_search_strings(self): - hed_strings, dict1 = df_util.get_assembled(self.input_data, self.sidecar1, self.schema, extra_def_dicts=None, - join_columns=True, shrink_defs=False, expand_defs=True) - queries1 = ["sensory-event"] - query_names1 = ["sensory"] - df1 = search_strings(hed_strings, queries1, query_names1) - self.assertIsInstance(df1, DataFrame, "search_tabular returns a dataframe when the query is satisfied.") - self.assertEqual(len(df1.columns), 1, "search_tabular 
has the right number of columns when query okay") - self.assertEqual(len(df1.index), 200, "search_tabular has right number of rows when query okay") - queries2 = ['data-feature', "sensory-event"] - query_names2 = ['data', 'sensory'] - df2 = search_strings(hed_strings, queries2, query_names2) - self.assertEqual(len(df2.columns), 2, "search_tabular has the right number of columns when query okay") - self.assertEqual(len(df2.index), 200, "search_tabular has right number of rows when query okay") - totals = df2.sum(axis=0) - self.assertFalse(totals.loc['data']) - self.assertEqual(totals.loc['sensory'], 155) - queries3 = ['image', "sensory-event", "face"] - query_names3 = ['image', 'sensory', "faced"] - df3 = search_strings(hed_strings, queries3, query_names3) - self.assertIsInstance(df3, DataFrame, "search_tabular returns a DataFrame when extra columns") - self.assertEqual(len(df3.columns), 3, "search_tabular returns right number of columns when extra columns") - self.assertEqual(len(df3.index), 200, "search_tabular has right number of rows when query okay") - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/tools/analysis/test_analysis_util_convert.py b/tests/tools/analysis/test_analysis_util_convert.py deleted file mode 100644 index 5c472421..00000000 --- a/tests/tools/analysis/test_analysis_util_convert.py +++ /dev/null @@ -1,109 +0,0 @@ -import os -import unittest -from hed import schema as hedschema -from hed.models import HedTag, HedString -from hed.tools.analysis.analysis_util import hed_to_str - - -# noinspection PyBroadException -class Test(unittest.TestCase): - - @classmethod - def setUpClass(cls): - schema_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), - '../../data/schema_tests/HED8.1.0.xml')) - cls.hed_schema = hedschema.load_schema(schema_path) - - def test_convert_list(self): - pass - - def test_convert_hed_tag(self): - tag1 = HedTag('Label/Cond1', self.hed_schema) - str1 = hed_to_str(tag1) - 
self.assertIsInstance(str1, str) - self.assertEqual(str1, 'Label/Cond1') - tag2 = HedTag('Label/Cond1', hed_schema=self.hed_schema) - str2 = hed_to_str(tag2) - self.assertIsInstance(str2, str) - self.assertEqual(str2, 'Label/Cond1') - tag3 = HedTag('Label/Cond1', hed_schema=self.hed_schema) - str3 = hed_to_str(tag3) - self.assertIsInstance(str3, str) - self.assertEqual(str3, 'Label/Cond1') - - def test_hed_to_str_other(self): - str1 = hed_to_str(None) - self.assertFalse(str1) - str2 = 'test/node1' - str3 = hed_to_str(str2) - self.assertIsInstance(str2, str) - self.assertEqual(str2, str3) - dict1 = {'first': 'Red'} - with self.assertRaises(TypeError) as context: - hed_to_str(dict1) - self.assertEqual(context.exception.args[0], "ContentsWrongClass") - - def test_hed_to_str_obj(self): - str_obj1 = HedString('Label/Cond1', self.hed_schema) - str1 = hed_to_str(str_obj1) - self.assertIsInstance(str1, str) - self.assertEqual(str1, 'Label/Cond1') - str_obj2 = HedString('Label/Cond1', hed_schema=self.hed_schema) - str2 = hed_to_str(str_obj2) - self.assertIsInstance(str2, str) - self.assertEqual(str2, 'Label/Cond1') - str_obj3 = HedString('Label/Cond1', hed_schema=self.hed_schema) - str3 = hed_to_str(str_obj3) - self.assertIsInstance(str3, str) - self.assertEqual(str3, 'Label/Cond1') - str_obj4 = HedString('(Label/Cond1, Offset), Red', hed_schema=self.hed_schema) - str4 = hed_to_str(str_obj4) - self.assertIsInstance(str4, str) - self.assertEqual(str4, '(Label/Cond1,Offset),Red') - str_obj5 = HedString('(Label/Cond1, Offset), Red, (Offset)', hed_schema=self.hed_schema) - tuples = str_obj5.find_tags(["offset"], recursive=True, include_groups=2) - str_obj5.remove([tuples[0][0], tuples[1][0]]) - str5 = str(str_obj5) - self.assertEqual(str5, '(Label/Cond1),Red') - for tup in tuples: - if len(tup[1].children) == 1: - str_obj5.replace(tup[1], tup[1].children[0]) - str5a = str(str_obj5) - self.assertEqual(str5a, 'Label/Cond1,Red') - - def test_hed_to_str_group(self): - test1 = 
'(Label/Cond1, Offset)' - str_obj1 = HedString(test1, hed_schema=self.hed_schema) - grp1 = str_obj1.children[0] - str1 = hed_to_str(grp1) - self.assertIsInstance(str1, str) - self.assertEqual(str1, '(Label/Cond1,Offset)') - - def test_hed_to_str_list(self): - list1 = [] - str1 = hed_to_str(list1) - self.assertIsInstance(str1, str) - self.assertFalse(str1) - list2 = [HedString('Label/Cond1', hed_schema=self.hed_schema), - HedString("Red,Blue", hed_schema=self.hed_schema)] - str2 = hed_to_str(list2) - self.assertIsInstance(str2, str) - self.assertEqual(str2, 'Label/Cond1,Red,Blue') - - def test_hed_to_str_remove_parentheses(self): - str_obj1 = HedString('((Label/Cond1))', hed_schema=self.hed_schema) - str1 = hed_to_str(str_obj1, remove_parentheses=True) - self.assertIsInstance(str1, str) - self.assertEqual(str1, '(Label/Cond1)') - str_obj2 = HedString('(Red, (Label/Cond1))', hed_schema=self.hed_schema) - str2 = hed_to_str(str_obj2, remove_parentheses=True) - self.assertIsInstance(str2, str) - self.assertEqual(str2, '(Red,(Label/Cond1))') - str_obj3 = HedString('(Label/Cond1)', hed_schema=self.hed_schema) - str3 = hed_to_str(str_obj3, remove_parentheses=True) - self.assertIsInstance(str3, str) - self.assertEqual(str3, 'Label/Cond1') - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/tools/analysis/test_hed_tag_counts.py b/tests/tools/analysis/test_hed_tag_counts.py index 52f91fee..6eac9480 100644 --- a/tests/tools/analysis/test_hed_tag_counts.py +++ b/tests/tools/analysis/test_hed_tag_counts.py @@ -3,8 +3,8 @@ from hed import schema as hedschema from hed.models import Sidecar, TabularInput, HedString from hed.models.df_util import get_assembled -from hed.tools import assemble_hed from hed.tools.analysis.hed_tag_counts import HedTagCounts +import pandas as pd # noinspection PyBroadException @@ -27,9 +27,8 @@ def setUpClass(cls): input_data = TabularInput(events_path, sidecar=sidecar1, name="face_sub1_events") cls.input_data = input_data cls.sidecar1 = 
sidecar1 - input_df, def_dict = assemble_hed(input_data, sidecar1, schema, expand_defs=False) - cls.input_df = input_df - cls.def_dict = def_dict + cls.input_df = pd.DataFrame(input_data.series_a, columns=["HED_assembled"]) + cls.def_dict = input_data.get_def_dict(schema) cls.tag_template = { "Sensory events": ["Sensory-event", "Sensory-presentation", "Sensory-attribute", "Experimental-stimulus", "Task-stimulus-role", @@ -76,7 +75,7 @@ def test_hed_tag_count(self): def test_organize_tags(self): counts = HedTagCounts('Base_name') hed_strings, definitions = get_assembled(self.input_data, self.sidecar1, self.hed_schema, - extra_def_dicts=None, join_columns=True, + extra_def_dicts=None, shrink_defs=False, expand_defs=True) # type_defs = input_data.get_definitions().gathered_defs for hed in hed_strings: diff --git a/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py b/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py index 196a9575..f66dbdf9 100644 --- a/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py +++ b/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py @@ -168,7 +168,7 @@ def test_quick4(self): counts = HedTagCounts('myName', 2) summary_dict = {} hed_strings, definitions = get_assembled(input_data, sidecar, my_schema, - extra_def_dicts=None, join_columns=True, + extra_def_dicts=None, shrink_defs=False, expand_defs=True) for hed in hed_strings: counts.update_event_counts(hed, 'myName')