
Commit

Minor changes to the documentation and implementation
VisLab committed Feb 12, 2024
1 parent 456a3c5 commit 8aff712
Showing 13 changed files with 58 additions and 86 deletions.
33 changes: 8 additions & 25 deletions hed/models/df_util.py
@@ -1,49 +1,32 @@
""" Utilities for assembly and conversion of HED strings to different forms. """
from functools import partial
import pandas as pd
from hed.models.sidecar import Sidecar
from hed.models.tabular_input import TabularInput
from hed.models.hed_string import HedString
from hed.models.definition_dict import DefinitionDict


def get_assembled(tabular_file, sidecar, hed_schema, extra_def_dicts=None, shrink_defs=False, expand_defs=True):
""" Create an array of assembled HedString objects (or list of these) of the same length as tabular file with.
def get_assembled(tabular_file, hed_schema, extra_def_dicts=None, defs_expanded=True):
""" Create an array of assembled HedString objects (or list of these) of the same length as tabular file input.
Args:
tabular_file: str or TabularInput
The path to the tabular file, or a TabularInput object representing it.
sidecar: str or Sidecar
The path to the sidecar file, or a Sidecar object representing it.
Parameters:
tabular_file (TabularInput): Represents the tabular input file.
hed_schema: HedSchema
If str, will attempt to load as a version if it doesn't have a valid extension.
extra_def_dicts: list of DefinitionDict, optional
Any extra DefinitionDict objects to use when parsing the HED tags.
shrink_defs: bool
Shrink any def-expand tags found
expand_defs: bool
Expand any def tags found
defs_expanded (bool): (Default True) Expands definitions if True, otherwise shrinks them.
Returns:
tuple:
hed_strings(list of HedStrings):A list of HedStrings or a list of lists of HedStrings
def_dict(DefinitionDict): The definitions from this Sidecar
"""
if isinstance(sidecar, str):
sidecar = Sidecar(sidecar)

if isinstance(tabular_file, str):
tabular_file = TabularInput(tabular_file, sidecar)

def_dict = None
if sidecar:
def_dict = sidecar.get_def_dict(hed_schema=hed_schema, extra_def_dicts=extra_def_dicts)

if expand_defs:
def_dict = tabular_file.get_def_dict(hed_schema, extra_def_dicts=extra_def_dicts)
if defs_expanded:
return [HedString(x, hed_schema, def_dict).expand_defs() for x in tabular_file.series_a], def_dict
elif shrink_defs:
return [HedString(x, hed_schema, def_dict).shrink_defs() for x in tabular_file.series_a], def_dict
else:
return [HedString(x, hed_schema, def_dict) for x in tabular_file.series_a], def_dict
return [HedString(x, hed_schema, def_dict).shrink_defs() for x in tabular_file.series_a], def_dict


def convert_to_form(df, hed_schema, tag_form, columns=None):
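With this change `get_assembled` no longer takes a sidecar or the separate `shrink_defs`/`expand_defs` flags; definitions are gathered from the `TabularInput` itself, and a single `defs_expanded` flag chooses between expanding and shrinking. A minimal usage sketch follows, assuming a local BIDS-style events file and JSON sidecar; the file names and the `load_schema_version` import path are illustrative, mirroring the test usage later in this commit.

```python
from hed.schema import load_schema_version
from hed.models.sidecar import Sidecar
from hed.models.tabular_input import TabularInput
from hed.models.df_util import get_assembled

# Hypothetical local files standing in for any events.tsv / events.json pair.
sidecar = Sidecar("task-FacePerception_events.json")
input_data = TabularInput("sub-002_task-FacePerception_run-1_events.tsv", sidecar=sidecar)
schema = load_schema_version("8.1.0")

# The sidecar is no longer passed in; definitions now come from the TabularInput itself.
hed_strings, def_dict = get_assembled(input_data, schema, extra_def_dicts=None, defs_expanded=True)
print(len(hed_strings), type(def_dict).__name__)
```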
43 changes: 22 additions & 21 deletions hed/models/query_service.py
@@ -4,38 +4,39 @@


def get_query_handlers(queries, query_names=None):
""" Returns a list of query handlers and names
""" Returns a list of query handlers, query names, and issues if any.
Parameters:
queries (list): A list of query strings or QueryHandler objects
queries (list): A list of query strings.
query_names (list): A list of column names for results of queries. If missing --- query_1, query_2, etc.
Returns:
DataFrame - containing the search strings
:raises ValueError:
- If query names are invalid or duplicated.
list - QueryHandlers for successfully parsed queries.
list - str names to assign to results of the queries.
list - issues if any of the queries could not be parsed or other errors occurred.
"""
expression_parsers = []
if not queries:
return None, None, [f"EmptyQueries: The queries list must not be empty"]
elif isinstance(queries, str):
queries = [queries]
expression_parsers = [None for i in range(len(queries))]
issues = []
if not query_names:
query_names = [f"query_{index}" for index in range(len(queries))]
elif len(queries) != len(query_names):
raise ValueError("QueryNamesLengthBad",
f"The query_names length {len(query_names)} must be empty or equal" +
f"to the queries length {len(queries)}.")

if len(queries) != len(query_names):
issues.append(f"QueryNamesLengthBad: The query_names length {len(query_names)} must be empty or equal" +
f"to the queries length {len(queries)}.")
elif len(set(query_names)) != len(query_names):
raise ValueError("DuplicateQueryNames", f"The query names {str(query_names)} list has duplicates")
issues.append(f"DuplicateQueryNames: The query names {str(query_names)} list has duplicates")

for index, query in enumerate(queries):
if isinstance(query, str):
try:
next_query = QueryHandler(query)
except Exception:
raise ValueError("BadQuery", f"Query [{index}]: {query} cannot be parsed")
else:
raise ValueError("BadQuery", f"Query [{index}]: {query} has a bad type")
expression_parsers.append(next_query)
return expression_parsers, query_names
try:
expression_parsers[index] = QueryHandler(query)
except Exception as ex:
issues.append(f"[BadQuery {index}]: {query} cannot be parsed")
return expression_parsers, query_names, issues


def search_strings(hed_strings, queries, query_names):
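`get_query_handlers` now reports problems through a returned list of issue strings instead of raising `ValueError`, so callers can collect every query error at once. A minimal sketch, assuming the query strings below parse as valid HED search expressions (they are illustrative only):

```python
from hed.models.query_service import get_query_handlers

queries = ["Sensory-event", "Agent-action"]          # hypothetical query strings
handlers, names, issues = get_query_handlers(queries, query_names=["sensory", "action"])
if issues:
    # Unparsable queries, mismatched name lengths, and duplicate names all land here now.
    print("\n".join(issues))
else:
    print(names)  # ['sensory', 'action']
```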
4 changes: 2 additions & 2 deletions hed/models/spreadsheet_input.py
@@ -16,8 +16,8 @@ def __init__(self, file=None, file_type=None, worksheet_name=None, tag_columns=N
file_type (str or None): ".xlsx" for Excel, ".tsv" or ".txt" for tsv. data.
worksheet_name (str or None): The name of the Excel workbook worksheet that contains the HED tags.
Not applicable to tsv files. If omitted for Excel, the first worksheet is assumed.
tag_columns (list): A list of ints containing the columns that contain the HED tags.
The default value is [1] indicating only the second column has tags.
tag_columns (list): A list of ints or strs containing the columns that contain the HED tags.
If ints then column numbers with [1] indicating only the second column has tags.
has_column_names (bool): True if file has column names. Validation will skip over the first row.
first line of the file if the spreadsheet as column names.
column_prefix_dictionary (dict or None): Dictionary with keys that are column numbers/names and
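The widened `tag_columns` documentation means a tag column can now be named rather than numbered. A minimal sketch, assuming a local `data.tsv` whose tags live in a column named `HED` (both the file name and the column name are hypothetical):

```python
from hed.models.spreadsheet_input import SpreadsheetInput

# Equivalent ways of pointing at the tag column: by index or by column name.
by_index = SpreadsheetInput("data.tsv", file_type=".tsv", tag_columns=[1])
by_name = SpreadsheetInput("data.tsv", file_type=".tsv", tag_columns=["HED"])
```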
4 changes: 1 addition & 3 deletions hed/tools/analysis/event_manager.py
@@ -47,9 +47,7 @@ def _create_event_list(self, input_data):
Notes:
"""
hed_strings, def_dict = get_assembled(input_data, input_data._sidecar, self.hed_schema,
extra_def_dicts=None,
shrink_defs=True, expand_defs=False)
hed_strings, def_dict = get_assembled(input_data, self.hed_schema, extra_def_dicts=None, defs_expanded=False)
onset_dict = {} # Temporary dictionary keeping track of temporal events that haven't ended yet.
for event_index, hed in enumerate(hed_strings):
self._extract_temporal_events(hed, event_index, onset_dict)
4 changes: 2 additions & 2 deletions hed/tools/remodeling/operations/factor_column_op.py
@@ -95,8 +95,8 @@ def validate_input_data(parameters):
names = parameters.get("factor_names", None)
values = parameters.get("factor_values", None)
if names and not values:
return ["factor_names_op: factor_names cannot be given without factor_values"]
return ["factor_names cannot be given without factor_values"]
elif names and values and len(names) != len(values):
return ["factor_names_op: factor_names must be same length as factor_values"]
return ["factor_names must be same length as factor_values"]
else:
return []
18 changes: 5 additions & 13 deletions hed/tools/remodeling/operations/factor_hed_tags_op.py
@@ -83,8 +83,10 @@ def __init__(self, parameters):
self.remove_types = parameters.get('remove_types', [])
self.expand_context = parameters.get('expand_context', True)
self.replace_defs = parameters.get('replace_defs', True)
self.query_handlers, self.query_names = get_query_handlers(self.queries,
parameters.get('query_names', None))
self.query_handlers, self.query_names, issues = \
get_query_handlers(self.queries, parameters.get('query_names', None))
if issues:
raise ValueError("FactorHedTagInvalidQueries", "\n".join(issues))

def do_op(self, dispatcher, df, name, sidecar=None):
""" Factor the column using HED tag queries.
@@ -124,15 +126,5 @@ def do_op(self, dispatcher, df, name, sidecar=None):

@staticmethod
def validate_input_data(parameters):
queries = parameters.get("queries", [])
names = parameters.get("query_names", [])
if names and queries and (len(names) != len(parameters["queries"])):
return ["factor_hed_tags_op: query_names must be same length as queries."]

issues = []
for query in queries:
try:
QueryHandler(query)
except ValueError as ex:
issues.append(f"factor_hed_tags_op: Invalid query '{query}")
queries, names, issues = get_query_handlers(parameters.get("queries", []), parameters.get("query_names", None))
return issues
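`FactorHedTagsOp` now delegates its query checks to `get_query_handlers`: the constructor raises `ValueError('FactorHedTagInvalidQueries', ...)` when any issues are reported, and `validate_input_data` simply returns the issue list. A minimal sketch of the static check, with deliberately mismatched hypothetical parameters:

```python
from hed.tools.remodeling.operations.factor_hed_tags_op import FactorHedTagsOp

# Hypothetical parameters: two queries but only one query name.
params = {"queries": ["Sensory-event", "Agent-action"], "query_names": ["only_one_name"]}
issues = FactorHedTagsOp.validate_input_data(params)
print(issues)  # a single QueryNamesLengthBad issue string produced by get_query_handlers
```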
2 changes: 1 addition & 1 deletion hed/tools/remodeling/operations/merge_consecutive_op.py
@@ -170,5 +170,5 @@ def validate_input_data(parameters):
match_columns = parameters.get("match_columns", None)
name = parameters.get("column_name", None)
if match_columns and name in match_columns:
return [f"merge_consecutive_op: column_name `{name}` cannot not be a match_column."]
return [f"column_name `{name}` cannot not be a match_column."]
return []
4 changes: 2 additions & 2 deletions hed/tools/remodeling/operations/remap_columns_op.py
@@ -146,8 +146,8 @@ def validate_input_data(parameters):
required_len = len(parameters['source_columns']) + len(parameters['destination_columns'])
for x in map_list:
if len(x) != required_len:
return [f"remap_columns_op: all map_list arrays must be of length {str(required_len)}."]
return [f"all map_list arrays must be of length {str(required_len)}."]
missing = set(parameters.get('integer_sources', [])) - set(parameters['source_columns'])
if missing:
return [f"remap_columns_op: the integer_sources {str(missing)} are missing from source_columns."]
return [f"the integer_sources {str(missing)} are missing from source_columns."]
return []
2 changes: 1 addition & 1 deletion hed/tools/remodeling/remodeler_validator.py
@@ -117,7 +117,7 @@ def validate(self, operations):
for index, operation in enumerate(operation_by_parameters):
error_strings = valid_operations[operation[0]].validate_input_data(operation[1])
for error_string in error_strings:
list_of_error_strings.append("Operation %s: %s" %(index+1, error_string))
list_of_error_strings.append("Operation %s (%s): %s" %(index+1, operation[0], error_string))

return list_of_error_strings

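The validator now includes the failing operation's name in each message, which is why the per-operation prefixes (`factor_names_op:`, `merge_consecutive_op:`, `remap_columns_op:`, ...) were dropped from the individual `validate_input_data` messages above. A minimal sketch of the new format, using hypothetical values:

```python
# Values below are illustrative; the format string matches the updated validate() call.
index = 0
operation_name = "remap_columns"
error_string = "all map_list arrays must be of length 3."
print("Operation %s (%s): %s" % (index + 1, operation_name, error_string))
# -> Operation 1 (remap_columns): all map_list arrays must be of length 3.
```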
5 changes: 2 additions & 3 deletions tests/tools/analysis/test_hed_tag_counts.py
@@ -74,9 +74,8 @@ def test_hed_tag_count(self):

def test_organize_tags(self):
counts = HedTagCounts('Base_name')
hed_strings, definitions = get_assembled(self.input_data, self.sidecar1, self.hed_schema,
extra_def_dicts=None,
shrink_defs=False, expand_defs=True)
hed_strings, definitions = get_assembled(self.input_data, self.hed_schema, extra_def_dicts=None,
defs_expanded=True)
# type_defs = input_data.get_definitions().gathered_defs
for hed in hed_strings:
counts.update_event_counts(hed, 'run-1')
4 changes: 2 additions & 2 deletions tests/tools/remodeling/operations/test_factor_hed_tags_op.py
@@ -62,14 +62,14 @@ def test_invalid_query_names(self):
params["query_names"] = ["apple", "apple"]
with self.assertRaises(ValueError) as context:
FactorHedTagsOp(params)
self.assertEqual(context.exception.args[0], 'DuplicateQueryNames')
self.assertEqual(context.exception.args[0], 'FactorHedTagInvalidQueries')

# Query names have wrong length
params = json.loads(self.json_params)
params["query_names"] = ["apple", "banana", "pear"]
with self.assertRaises(ValueError) as context:
FactorHedTagsOp(params)
self.assertEqual(context.exception.args[0], 'QueryNamesLengthBad')
self.assertEqual(context.exception.args[0], 'FactorHedTagInvalidQueries')

# Query name already a column name
params = json.loads(self.json_params)
@@ -162,14 +162,12 @@ def test_quick4(self):
'../../../data/remodel_tests/'))
data_path = os.path.realpath(os.path.join(path, 'sub-002_task-FacePerception_run-1_events.tsv'))
json_path = os.path.realpath(os.path.join(path, 'task-FacePerception_events.json'))
my_schema = load_schema_version('8.1.0')
schema = load_schema_version('8.1.0')
sidecar = Sidecar(json_path,)
input_data = TabularInput(data_path, sidecar=sidecar)
counts = HedTagCounts('myName', 2)
summary_dict = {}
hed_strings, definitions = get_assembled(input_data, sidecar, my_schema,
extra_def_dicts=None,
shrink_defs=False, expand_defs=True)
hed_strings, definitions = get_assembled(input_data, schema, extra_def_dicts=None, defs_expanded=True)
for hed in hed_strings:
counts.update_event_counts(hed, 'myName')
summary_dict['myName'] = counts
15 changes: 8 additions & 7 deletions tests/tools/remodeling/test_validator.py
@@ -124,29 +124,30 @@ def test_validate_parameter_data(self):
factor_column_validate = [deepcopy(self.remodel_file)[1]]
factor_column_validate[0]["parameters"]["factor_names"] = ["stopped"]
error_strings = self.validator.validate(factor_column_validate)
self.assertEqual(error_strings[0], "Operation 1: factor_names_op: factor_names must be same length as factor_values")
self.assertEqual(error_strings[0], "Operation 1 (factor_column): factor_names must be same length as factor_values")

factor_hed_tags_validate = [deepcopy(self.remodel_file)[2]]
factor_hed_tags_validate[0]["parameters"]["query_names"] = ["correct"]
error_strings = self.validator.validate(factor_hed_tags_validate)
self.assertEqual(error_strings[0], "Operation 1: factor_hed_tags_op: query_names must be same length as queries.")
self.assertEqual(error_strings[0], "Operation 1 (factor_hed_tags): QueryNamesLengthBad: The query_names length 1 must be empty or equalto the queries length 2.")

merge_consecutive_validate = [deepcopy(self.remodel_file)[4]]
merge_consecutive_validate[0]["parameters"]["match_columns"].append("trial_type")
error_strings = self.validator.validate(merge_consecutive_validate)
self.assertEqual(error_strings[0], "Operation 1: merge_consecutive_op: column_name `trial_type` cannot not be a match_column.")
self.assertEqual(error_strings[0], "Operation 1 (merge_consecutive): column_name `trial_type` cannot not be a match_column.")

remap_columns_validate_same_length = [deepcopy(self.remodel_file)[5]]
remap_columns_validate_same_length[0]["parameters"]["map_list"][0] = [""]
error_strings = self.validator.validate(remap_columns_validate_same_length)
self.assertEqual(error_strings[0], "Operation 1: remap_columns_op: all map_list arrays must be of length 3.")
self.assertEqual(error_strings[0], "Operation 1 (remap_columns): all map_list arrays must be of length 3.")

remap_columns_validate_right_length = [deepcopy(self.remodel_file[5])]
remap_columns_validate_right_length[0]["parameters"]["map_list"] = [["string1", "string2"], ["string3", "string4"]]
remap_columns_validate_right_length[0]["parameters"]["map_list"] = \
[["string1", "string2"], ["string3", "string4"]]
error_strings = self.validator.validate(remap_columns_validate_right_length)
self.assertEqual(error_strings[0], "Operation 1: remap_columns_op: all map_list arrays must be of length 3.")
self.assertEqual(error_strings[0], "Operation 1 (remap_columns): all map_list arrays must be of length 3.")

remap_columns_integer_sources = [deepcopy(self.remodel_file[5])]
remap_columns_integer_sources[0]["parameters"]["integer_sources"] = ["unknown_column"]
error_strings = self.validator.validate(remap_columns_integer_sources)
self.assertEqual(error_strings[0], "Operation 1: remap_columns_op: the integer_sources {'unknown_column'} are missing from source_columns.")
self.assertEqual(error_strings[0], "Operation 1 (remap_columns): the integer_sources {'unknown_column'} are missing from source_columns.")
