diff --git a/hed/models/df_util.py b/hed/models/df_util.py index 71cdeab3..23f4de7d 100644 --- a/hed/models/df_util.py +++ b/hed/models/df_util.py @@ -1,49 +1,32 @@ """ Utilities for assembly and conversion of HED strings to different forms. """ from functools import partial import pandas as pd -from hed.models.sidecar import Sidecar from hed.models.tabular_input import TabularInput from hed.models.hed_string import HedString from hed.models.definition_dict import DefinitionDict -def get_assembled(tabular_file, sidecar, hed_schema, extra_def_dicts=None, shrink_defs=False, expand_defs=True): - """ Create an array of assembled HedString objects (or list of these) of the same length as tabular file with. +def get_assembled(tabular_file, hed_schema, extra_def_dicts=None, defs_expanded=True): + """ Create an array of assembled HedString objects (or list of these) of the same length as tabular file input. - Args: - tabular_file: str or TabularInput - The path to the tabular file, or a TabularInput object representing it. - sidecar: str or Sidecar - The path to the sidecar file, or a Sidecar object representing it. + Parameters: + tabular_file (TabularInput): Represents the tabular input file. hed_schema: HedSchema If str, will attempt to load as a version if it doesn't have a valid extension. extra_def_dicts: list of DefinitionDict, optional Any extra DefinitionDict objects to use when parsing the HED tags. - shrink_defs: bool - Shrink any def-expand tags found - expand_defs: bool - Expand any def tags found + defs_expanded (bool): (Default True) Expands definitions if True, otherwise shrinks them. Returns: tuple: hed_strings(list of HedStrings):A list of HedStrings or a list of lists of HedStrings def_dict(DefinitionDict): The definitions from this Sidecar """ - if isinstance(sidecar, str): - sidecar = Sidecar(sidecar) - - if isinstance(tabular_file, str): - tabular_file = TabularInput(tabular_file, sidecar) - - def_dict = None - if sidecar: - def_dict = sidecar.get_def_dict(hed_schema=hed_schema, extra_def_dicts=extra_def_dicts) - if expand_defs: + def_dict = tabular_file.get_def_dict(hed_schema, extra_def_dicts=extra_def_dicts) + if defs_expanded: return [HedString(x, hed_schema, def_dict).expand_defs() for x in tabular_file.series_a], def_dict - elif shrink_defs: - return [HedString(x, hed_schema, def_dict).shrink_defs() for x in tabular_file.series_a], def_dict else: - return [HedString(x, hed_schema, def_dict) for x in tabular_file.series_a], def_dict + return [HedString(x, hed_schema, def_dict).shrink_defs() for x in tabular_file.series_a], def_dict def convert_to_form(df, hed_schema, tag_form, columns=None): diff --git a/hed/models/query_service.py b/hed/models/query_service.py index c197c683..b5042de7 100644 --- a/hed/models/query_service.py +++ b/hed/models/query_service.py @@ -4,38 +4,39 @@ def get_query_handlers(queries, query_names=None): - """ Returns a list of query handlers and names + """ Returns a list of query handlers, query names, and issues if any. Parameters: - queries (list): A list of query strings or QueryHandler objects + queries (list): A list of query strings. query_names (list): A list of column names for results of queries. If missing --- query_1, query_2, etc. Returns: - DataFrame - containing the search strings - - :raises ValueError: - - If query names are invalid or duplicated. + list - QueryHandlers for successfully parsed queries. + list - str names to assign to results of the queries. + list - issues if any of the queries could not be parsed or other errors occurred. """ - expression_parsers = [] + if not queries: + return None, None, [f"EmptyQueries: The queries list must not be empty"] + elif isinstance(queries, str): + queries = [queries] + expression_parsers = [None for i in range(len(queries))] + issues = [] if not query_names: query_names = [f"query_{index}" for index in range(len(queries))] - elif len(queries) != len(query_names): - raise ValueError("QueryNamesLengthBad", - f"The query_names length {len(query_names)} must be empty or equal" + - f"to the queries length {len(queries)}.") + + if len(queries) != len(query_names): + issues.append(f"QueryNamesLengthBad: The query_names length {len(query_names)} must be empty or equal" + + f"to the queries length {len(queries)}.") elif len(set(query_names)) != len(query_names): - raise ValueError("DuplicateQueryNames", f"The query names {str(query_names)} list has duplicates") + issues.append(f"DuplicateQueryNames: The query names {str(query_names)} list has duplicates") + for index, query in enumerate(queries): - if isinstance(query, str): - try: - next_query = QueryHandler(query) - except Exception: - raise ValueError("BadQuery", f"Query [{index}]: {query} cannot be parsed") - else: - raise ValueError("BadQuery", f"Query [{index}]: {query} has a bad type") - expression_parsers.append(next_query) - return expression_parsers, query_names + try: + expression_parsers[index] = QueryHandler(query) + except Exception as ex: + issues.append(f"[BadQuery {index}]: {query} cannot be parsed") + return expression_parsers, query_names, issues def search_strings(hed_strings, queries, query_names): diff --git a/hed/models/spreadsheet_input.py b/hed/models/spreadsheet_input.py index 726696fc..bb2fb5e5 100644 --- a/hed/models/spreadsheet_input.py +++ b/hed/models/spreadsheet_input.py @@ -16,8 +16,8 @@ def __init__(self, file=None, file_type=None, worksheet_name=None, tag_columns=N file_type (str or None): ".xlsx" for Excel, ".tsv" or ".txt" for tsv. data. worksheet_name (str or None): The name of the Excel workbook worksheet that contains the HED tags. Not applicable to tsv files. If omitted for Excel, the first worksheet is assumed. - tag_columns (list): A list of ints containing the columns that contain the HED tags. - The default value is [1] indicating only the second column has tags. + tag_columns (list): A list of ints or strs containing the columns that contain the HED tags. + If ints then column numbers with [1] indicating only the second column has tags. has_column_names (bool): True if file has column names. Validation will skip over the first row. first line of the file if the spreadsheet as column names. column_prefix_dictionary (dict or None): Dictionary with keys that are column numbers/names and diff --git a/hed/tools/analysis/event_manager.py b/hed/tools/analysis/event_manager.py index 885be64b..4e3c152e 100644 --- a/hed/tools/analysis/event_manager.py +++ b/hed/tools/analysis/event_manager.py @@ -47,9 +47,7 @@ def _create_event_list(self, input_data): Notes: """ - hed_strings, def_dict = get_assembled(input_data, input_data._sidecar, self.hed_schema, - extra_def_dicts=None, - shrink_defs=True, expand_defs=False) + hed_strings, def_dict = get_assembled(input_data, self.hed_schema, extra_def_dicts=None, defs_expanded=False) onset_dict = {} # Temporary dictionary keeping track of temporal events that haven't ended yet. for event_index, hed in enumerate(hed_strings): self._extract_temporal_events(hed, event_index, onset_dict) diff --git a/hed/tools/remodeling/operations/factor_column_op.py b/hed/tools/remodeling/operations/factor_column_op.py index 3a8292d0..4d956528 100644 --- a/hed/tools/remodeling/operations/factor_column_op.py +++ b/hed/tools/remodeling/operations/factor_column_op.py @@ -95,8 +95,8 @@ def validate_input_data(parameters): names = parameters.get("factor_names", None) values = parameters.get("factor_values", None) if names and not values: - return ["factor_names_op: factor_names cannot be given without factor_values"] + return ["factor_names cannot be given without factor_values"] elif names and values and len(names) != len(values): - return ["factor_names_op: factor_names must be same length as factor_values"] + return ["factor_names must be same length as factor_values"] else: return [] diff --git a/hed/tools/remodeling/operations/factor_hed_tags_op.py b/hed/tools/remodeling/operations/factor_hed_tags_op.py index 53a06635..68d8ac35 100644 --- a/hed/tools/remodeling/operations/factor_hed_tags_op.py +++ b/hed/tools/remodeling/operations/factor_hed_tags_op.py @@ -83,8 +83,10 @@ def __init__(self, parameters): self.remove_types = parameters.get('remove_types', []) self.expand_context = parameters.get('expand_context', True) self.replace_defs = parameters.get('replace_defs', True) - self.query_handlers, self.query_names = get_query_handlers(self.queries, - parameters.get('query_names', None)) + self.query_handlers, self.query_names, issues = \ + get_query_handlers(self.queries, parameters.get('query_names', None)) + if issues: + raise ValueError("FactorHedTagInvalidQueries", "\n".join(issues)) def do_op(self, dispatcher, df, name, sidecar=None): """ Factor the column using HED tag queries. @@ -124,15 +126,5 @@ def do_op(self, dispatcher, df, name, sidecar=None): @staticmethod def validate_input_data(parameters): - queries = parameters.get("queries", []) - names = parameters.get("query_names", []) - if names and queries and (len(names) != len(parameters["queries"])): - return ["factor_hed_tags_op: query_names must be same length as queries."] - - issues = [] - for query in queries: - try: - QueryHandler(query) - except ValueError as ex: - issues.append(f"factor_hed_tags_op: Invalid query '{query}") + queries, names, issues = get_query_handlers(parameters.get("queries", []), parameters.get("query_names", None)) return issues diff --git a/hed/tools/remodeling/operations/merge_consecutive_op.py b/hed/tools/remodeling/operations/merge_consecutive_op.py index f214112c..e8626679 100644 --- a/hed/tools/remodeling/operations/merge_consecutive_op.py +++ b/hed/tools/remodeling/operations/merge_consecutive_op.py @@ -170,5 +170,5 @@ def validate_input_data(parameters): match_columns = parameters.get("match_columns", None) name = parameters.get("column_name", None) if match_columns and name in match_columns: - return [f"merge_consecutive_op: column_name `{name}` cannot not be a match_column."] + return [f"column_name `{name}` cannot not be a match_column."] return [] diff --git a/hed/tools/remodeling/operations/remap_columns_op.py b/hed/tools/remodeling/operations/remap_columns_op.py index 2eb4e13e..cb752e0c 100644 --- a/hed/tools/remodeling/operations/remap_columns_op.py +++ b/hed/tools/remodeling/operations/remap_columns_op.py @@ -146,8 +146,8 @@ def validate_input_data(parameters): required_len = len(parameters['source_columns']) + len(parameters['destination_columns']) for x in map_list: if len(x) != required_len: - return [f"remap_columns_op: all map_list arrays must be of length {str(required_len)}."] + return [f"all map_list arrays must be of length {str(required_len)}."] missing = set(parameters.get('integer_sources', [])) - set(parameters['source_columns']) if missing: - return [f"remap_columns_op: the integer_sources {str(missing)} are missing from source_columns."] + return [f"the integer_sources {str(missing)} are missing from source_columns."] return [] diff --git a/hed/tools/remodeling/remodeler_validator.py b/hed/tools/remodeling/remodeler_validator.py index 04f84c7f..ce74072d 100644 --- a/hed/tools/remodeling/remodeler_validator.py +++ b/hed/tools/remodeling/remodeler_validator.py @@ -117,7 +117,7 @@ def validate(self, operations): for index, operation in enumerate(operation_by_parameters): error_strings = valid_operations[operation[0]].validate_input_data(operation[1]) for error_string in error_strings: - list_of_error_strings.append("Operation %s: %s" %(index+1, error_string)) + list_of_error_strings.append("Operation %s (%s): %s" %(index+1, operation[0], error_string)) return list_of_error_strings diff --git a/tests/tools/analysis/test_hed_tag_counts.py b/tests/tools/analysis/test_hed_tag_counts.py index 6eac9480..d90d710e 100644 --- a/tests/tools/analysis/test_hed_tag_counts.py +++ b/tests/tools/analysis/test_hed_tag_counts.py @@ -74,9 +74,8 @@ def test_hed_tag_count(self): def test_organize_tags(self): counts = HedTagCounts('Base_name') - hed_strings, definitions = get_assembled(self.input_data, self.sidecar1, self.hed_schema, - extra_def_dicts=None, - shrink_defs=False, expand_defs=True) + hed_strings, definitions = get_assembled(self.input_data, self.hed_schema, extra_def_dicts=None, + defs_expanded=True) # type_defs = input_data.get_definitions().gathered_defs for hed in hed_strings: counts.update_event_counts(hed, 'run-1') diff --git a/tests/tools/remodeling/operations/test_factor_hed_tags_op.py b/tests/tools/remodeling/operations/test_factor_hed_tags_op.py index 6c5ab7b9..90380f77 100644 --- a/tests/tools/remodeling/operations/test_factor_hed_tags_op.py +++ b/tests/tools/remodeling/operations/test_factor_hed_tags_op.py @@ -62,14 +62,14 @@ def test_invalid_query_names(self): params["query_names"] = ["apple", "apple"] with self.assertRaises(ValueError) as context: FactorHedTagsOp(params) - self.assertEqual(context.exception.args[0], 'DuplicateQueryNames') + self.assertEqual(context.exception.args[0], 'FactorHedTagInvalidQueries') # Query names have wrong length params = json.loads(self.json_params) params["query_names"] = ["apple", "banana", "pear"] with self.assertRaises(ValueError) as context: FactorHedTagsOp(params) - self.assertEqual(context.exception.args[0], 'QueryNamesLengthBad') + self.assertEqual(context.exception.args[0], 'FactorHedTagInvalidQueries') # Query name already a column name params = json.loads(self.json_params) diff --git a/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py b/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py index f66dbdf9..09c49298 100644 --- a/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py +++ b/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py @@ -162,14 +162,12 @@ def test_quick4(self): '../../../data/remodel_tests/')) data_path = os.path.realpath(os.path.join(path, 'sub-002_task-FacePerception_run-1_events.tsv')) json_path = os.path.realpath(os.path.join(path, 'task-FacePerception_events.json')) - my_schema = load_schema_version('8.1.0') + schema = load_schema_version('8.1.0') sidecar = Sidecar(json_path,) input_data = TabularInput(data_path, sidecar=sidecar) counts = HedTagCounts('myName', 2) summary_dict = {} - hed_strings, definitions = get_assembled(input_data, sidecar, my_schema, - extra_def_dicts=None, - shrink_defs=False, expand_defs=True) + hed_strings, definitions = get_assembled(input_data, schema, extra_def_dicts=None, defs_expanded=True) for hed in hed_strings: counts.update_event_counts(hed, 'myName') summary_dict['myName'] = counts diff --git a/tests/tools/remodeling/test_validator.py b/tests/tools/remodeling/test_validator.py index 13206b31..9b02257d 100644 --- a/tests/tools/remodeling/test_validator.py +++ b/tests/tools/remodeling/test_validator.py @@ -124,29 +124,30 @@ def test_validate_parameter_data(self): factor_column_validate = [deepcopy(self.remodel_file)[1]] factor_column_validate[0]["parameters"]["factor_names"] = ["stopped"] error_strings = self.validator.validate(factor_column_validate) - self.assertEqual(error_strings[0], "Operation 1: factor_names_op: factor_names must be same length as factor_values") + self.assertEqual(error_strings[0], "Operation 1 (factor_column): factor_names must be same length as factor_values") factor_hed_tags_validate = [deepcopy(self.remodel_file)[2]] factor_hed_tags_validate[0]["parameters"]["query_names"] = ["correct"] error_strings = self.validator.validate(factor_hed_tags_validate) - self.assertEqual(error_strings[0], "Operation 1: factor_hed_tags_op: query_names must be same length as queries.") + self.assertEqual(error_strings[0], "Operation 1 (factor_hed_tags): QueryNamesLengthBad: The query_names length 1 must be empty or equalto the queries length 2.") merge_consecutive_validate = [deepcopy(self.remodel_file)[4]] merge_consecutive_validate[0]["parameters"]["match_columns"].append("trial_type") error_strings = self.validator.validate(merge_consecutive_validate) - self.assertEqual(error_strings[0], "Operation 1: merge_consecutive_op: column_name `trial_type` cannot not be a match_column.") + self.assertEqual(error_strings[0], "Operation 1 (merge_consecutive): column_name `trial_type` cannot not be a match_column.") remap_columns_validate_same_length = [deepcopy(self.remodel_file)[5]] remap_columns_validate_same_length[0]["parameters"]["map_list"][0] = [""] error_strings = self.validator.validate(remap_columns_validate_same_length) - self.assertEqual(error_strings[0], "Operation 1: remap_columns_op: all map_list arrays must be of length 3.") + self.assertEqual(error_strings[0], "Operation 1 (remap_columns): all map_list arrays must be of length 3.") remap_columns_validate_right_length = [deepcopy(self.remodel_file[5])] - remap_columns_validate_right_length[0]["parameters"]["map_list"] = [["string1", "string2"], ["string3", "string4"]] + remap_columns_validate_right_length[0]["parameters"]["map_list"] = \ + [["string1", "string2"], ["string3", "string4"]] error_strings = self.validator.validate(remap_columns_validate_right_length) - self.assertEqual(error_strings[0], "Operation 1: remap_columns_op: all map_list arrays must be of length 3.") + self.assertEqual(error_strings[0], "Operation 1 (remap_columns): all map_list arrays must be of length 3.") remap_columns_integer_sources = [deepcopy(self.remodel_file[5])] remap_columns_integer_sources[0]["parameters"]["integer_sources"] = ["unknown_column"] error_strings = self.validator.validate(remap_columns_integer_sources) - self.assertEqual(error_strings[0], "Operation 1: remap_columns_op: the integer_sources {'unknown_column'} are missing from source_columns.") + self.assertEqual(error_strings[0], "Operation 1 (remap_columns): the integer_sources {'unknown_column'} are missing from source_columns.")