Skip to content

Commit

Permalink
Merge pull request #847 from VisLab/develop
Browse files Browse the repository at this point in the history
Fixed handling of optional remodeling arguments
  • Loading branch information
VisLab authored Feb 2, 2024
2 parents f7a1bdf + bd148b7 commit c9f6254
Show file tree
Hide file tree
Showing 17 changed files with 158 additions and 150 deletions.
4 changes: 3 additions & 1 deletion hed/tools/analysis/event_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,13 +167,15 @@ def get_type_defs(self, types):
""" Return a list of definition names (lower case) that correspond to one of the specified types.
Parameters:
types (list): List of tags that are treated as types such as 'Condition-variable'
types (list or None): List of tags that are treated as types such as 'Condition-variable'
Returns:
list: List of definition names (lower-case) that correspond to the specified types
"""
def_list = []
if not types:
return def_list
for this_type in types:
type_defs = HedTypeDefs(self.def_dict, type_tag=this_type)
def_list = def_list + list(type_defs.def_map.keys())
Expand Down
26 changes: 12 additions & 14 deletions hed/tools/remodeling/operations/factor_column_op.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
""" Create tabular file factor columns from column values. """
""" Append to tabular file columns of factors based on column values. """

from hed.tools.remodeling.operations.base_op import BaseOp

# TODO: Does not handle empty factor names.
# TODO: Does not handle optional return columns.
# TODO: Same length factornames and factorvalues


class FactorColumnOp(BaseOp):
""" Create tabular file factor columns from column values.
""" Append to tabular file columns of factors based on column values.
Required remodeling parameters:
- **column_name** (*str*): The name of a column in the DataFrame.
- **column_name** (*str*): The name of a column in the DataFrame to compute factors from.
Optional remodeling parameters
- **factor_names** (*list*): Names to use as the factor columns.
Expand Down Expand Up @@ -61,8 +57,8 @@ def __init__(self, parameters):
"""
super().__init__(parameters)
self.column_name = parameters['column_name']
self.factor_values = parameters['factor_values']
self.factor_names = parameters['factor_names']
self.factor_values = parameters.get('factor_values', None)
self.factor_names = parameters.get('factor_names', None)

def do_op(self, dispatcher, df, name, sidecar=None):
""" Create factor columns based on values in a specified column.
Expand Down Expand Up @@ -95,10 +91,12 @@ def do_op(self, dispatcher, df, name, sidecar=None):

@staticmethod
def validate_input_data(parameters):
if parameters.get("factor_names", False):
if len(parameters.get("factor_names")) != len(parameters.get("factor_values")):
return ["The list in factor_names, in the factor_column operation, should have the same number of items as factor_values."]
else:
return []
""" Check that factor_names and factor_values have same length if given. """
names = parameters.get("factor_names", None)
values = parameters.get("factor_values", None)
if names and not values:
return ["factor_names_op: factor_names cannot be given without factor_values"]
elif names and values and len(names) != len(values):
return ["factor_names_op: factor_names must be same length as factor_values"]
else:
return []
32 changes: 17 additions & 15 deletions hed/tools/remodeling/operations/factor_hed_tags_op.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" Create tabular file factors from tag queries. """
""" Append to tabular file columns of factors based on column values. """


import pandas as pd
Expand All @@ -12,19 +12,21 @@


class FactorHedTagsOp(BaseOp):
""" Create tabular file factors from tag queries.
""" Append to tabular file columns of factors based on column values.
Required remodeling parameters:
- **queries** (*list*): Queries to be applied successively as filters.
Optional remodeling parameters:
- **expand_context** (*bool*): Expand the context if True.
- **query_names** (*list*): Column names for the query factors.
- **remove_types** (*list*): Structural HED tags to be removed.
- **remove_types** (*list*): Structural HED tags to be removed (such as Condition-variable or Task).
- **expand_context** (*bool*): If true, expand the context based on Onset, Offset, and Duration.
Notes:
- If query names are not provided, *query1*, *query2*, ... are used.
- When the context is expanded, the effect of events for temporal extent is accounted for.
- When the context is expanded, the effect of events for temporal extent is accounted for.
"""
NAME = "factor_hed_tags"

Expand All @@ -39,9 +41,6 @@ class FactorHedTagsOp(BaseOp):
"minItems": 1,
"uniqueItems": True
},
"expand_context": {
"type": "boolean"
},
"query_names": {
"type": "array",
"items": {
Expand All @@ -57,6 +56,9 @@ class FactorHedTagsOp(BaseOp):
},
"minItems": 1,
"uniqueItems": True
},
"expand_context": {
"type": "boolean"
}
},
"required": [
Expand All @@ -74,10 +76,10 @@ def __init__(self, parameters):
"""
super().__init__(parameters)
self.queries = parameters['queries']
self.query_names = parameters['query_names']
self.remove_types = parameters['remove_types']
self.remove_types = parameters.get('remove_types', [])
self.expand_context = parameters.get('expand_context', True)
self.expression_parsers, self.query_names = get_expression_parsers(self.queries,
query_names=parameters['query_names'])
parameters.get('query_names', None))

def do_op(self, dispatcher, df, name, sidecar=None):
""" Factor the column using HED tag queries.
Expand Down Expand Up @@ -118,8 +120,8 @@ def do_op(self, dispatcher, df, name, sidecar=None):

@staticmethod
def validate_input_data(parameters):
errors = []
if parameters.get("query_names", False):
if len(parameters.get("query_names")) != len(parameters.get("queries")):
errors.append("The list in query_names, in the factor_hed_tags operation, should have the same number of items as queries.")
return errors
queries = parameters.get("queries", None)
names = parameters.get("query_names", None)
if names and queries and (len(names) != len(parameters["queries"])):
return ["factor_hed_tags_op: query_names must be same length as queries."]
return []
9 changes: 3 additions & 6 deletions hed/tools/remodeling/operations/factor_hed_type_op.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" Create tabular file factors from type variables. """
""" Append to tabular file the factors computed from type variables. """

import pandas as pd
import numpy as np
Expand All @@ -7,11 +7,8 @@
from hed.tools.analysis.event_manager import EventManager
from hed.tools.analysis.hed_type_manager import HedTypeManager

# TODO: restricted factor values are not implemented yet.


class FactorHedTypeOp(BaseOp):
""" Create tabular file factors from type variables and append to tabular data.
""" Append to tabular file the factors computed from type variables.
Required remodeling parameters:
- **type_tag** (*str*): HED tag used to find the factors (most commonly `condition-variable`).
Expand Down Expand Up @@ -52,7 +49,7 @@ def __init__(self, parameters):
"""
super().__init__(parameters)
self.type_tag = parameters["type_tag"]
self.type_values = parameters["type_values"]
self.type_values = parameters.get("type_values", None)

def do_op(self, dispatcher, df, name, sidecar=None):
""" Factor columns based on HED type and append to tabular data.
Expand Down
21 changes: 12 additions & 9 deletions hed/tools/remodeling/operations/merge_consecutive_op.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
""" Merge consecutive rows with same column value. """
""" Merge consecutive rows of a tabular file with same column value. """

import pandas as pd
from hed.tools.remodeling.operations.base_op import BaseOp


class MergeConsecutiveOp(BaseOp):
""" Merge consecutive rows with same column value.
""" Merge consecutive rows of a tabular file with same column value.
Required remodeling parameters:
- **column_name** (*str*): name of column whose consecutive values are to be compared (the merge column).
Expand All @@ -14,7 +14,10 @@ class MergeConsecutiveOp(BaseOp):
- **ignore_missing** (*bool*): If true, missing match_columns are ignored.
Optional remodeling parameters:
- **match_columns** (*list*): A list of columns whose values have to be matched for two events to be the same.
- **match_columns** (*list*): A list of columns whose values have to be matched for two events to be the same.
Notes:
This operation is meant for time-based tabular files that have an onset column.
"""
NAME = "merge_consecutive"
Expand Down Expand Up @@ -63,9 +66,9 @@ def __init__(self, parameters):
super().__init__(parameters)
self.column_name = parameters["column_name"]
self.event_code = parameters["event_code"]
self.match_columns = parameters["match_columns"]
self.set_durations = parameters["set_durations"]
self.ignore_missing = parameters["ignore_missing"]
self.match_columns = parameters.get("match_columns", None)

def do_op(self, dispatcher, df, name, sidecar=None):
""" Merge consecutive rows with the same column value.
Expand Down Expand Up @@ -164,8 +167,8 @@ def _update_durations(df_new, remove_groups):

@staticmethod
def validate_input_data(parameters):
errors = []
if parameters.get("match_columns", False):
if parameters.get("column_name") in parameters.get("match_columns"):
errors.append("The column_name in the merge_consecutive operation cannot be specified as a match_column.")
return errors
match_columns = parameters.get("match_columns", None)
name = parameters.get("column_name", None)
if match_columns and name in match_columns:
return [f"merge_consecutive_op: column_name `{name}` cannot not be a match_column."]
return []
34 changes: 15 additions & 19 deletions hed/tools/remodeling/operations/remap_columns_op.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" Map values in m columns into a new combinations in n columns. """
""" Map values in m columns in a tabular file into a new combinations in n columns. """

import pandas as pd
import numpy as np
Expand All @@ -7,7 +7,7 @@


class RemapColumnsOp(BaseOp):
""" Map values in m columns into a new combinations in n columns.
""" Map values in m columns in a tabular file into a new combinations in n columns.
Required remodeling parameters:
- **source_columns** (*list*): The key columns to map (m key columns).
Expand Down Expand Up @@ -53,7 +53,7 @@ class RemapColumnsOp(BaseOp):
"number"
]
},
"minItems" : 1
"minItems": 1
},
"minItems": 1,
"uniqueItems": True
Expand Down Expand Up @@ -88,15 +88,12 @@ def __init__(self, parameters):
"""
super().__init__(parameters)
self.source_columns = parameters['source_columns']
self.integer_sources = []
self.string_sources = self.source_columns
if "integer_sources" in parameters:
self.string_sources = list(
set(self.source_columns).difference(set(self.integer_sources)))
self.destination_columns = parameters['destination_columns']
self.map_list = parameters['map_list']
self.ignore_missing = parameters['ignore_missing']

self.string_sources = self.source_columns
self.integer_sources = parameters.get('integer_sources', [])
self.string_sources = list(set(self.source_columns).difference(set(self.integer_sources)))
self.key_map = self._make_key_map()

def _make_key_map(self):
Expand Down Expand Up @@ -145,13 +142,12 @@ def do_op(self, dispatcher, df, name, sidecar=None):

@staticmethod
def validate_input_data(parameters):
errors = []
if len(set([len(x) for x in parameters.get("map_list")])) != 1:
errors.append("The lists specified in the map_list parameter in the remap_columns operation should all have the same length.")
else:
if (len(parameters.get('source_columns')) + len(parameters.get("destination_columns"))) != len(parameters.get("map_list")[0]):
errors.append("The lists specified in the map_list parameter in the remap_columns operation should have a length equal to the number of source columns + the number of destination columns.")
if parameters.get("integer_sources", False):
if not all([(x in parameters.get("source_columns")) for x in parameters.get("integer_sources")]):
errors.append("All integer_sources in the remap_columns operation should be source_columns.")
return errors
map_list = parameters["map_list"]
required_len = len(parameters['source_columns']) + len(parameters['destination_columns'])
for x in map_list:
if len(x) != required_len:
return [f"remap_columns_op: all map_list arrays must be of length {str(required_len)}."]
missing = set(parameters.get('integer_sources', [])) - set(parameters['source_columns'])
if missing:
return [f"remap_columns_op: the integer_sources {str(missing)} are missing from source_columns."]
return []
4 changes: 2 additions & 2 deletions hed/tools/remodeling/operations/remove_rows_op.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
""" Remove rows from a tabular file. """
""" Remove rows from a tabular file based on the values in a specified row. """

from hed.tools.remodeling.operations.base_op import BaseOp


class RemoveRowsOp(BaseOp):
""" Remove rows from a tabular file.
""" Remove rows from a tabular file based on the values in a specified row.
Required remodeling parameters:
- **column_name** (*str*): The name of column to be tested.
Expand Down
2 changes: 1 addition & 1 deletion hed/tools/remodeling/operations/rename_columns_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ class RenameColumnsOp (BaseOp):
""" Rename columns in a tabular file.
Required remodeling parameters:
- **column_mapping** (*dict*): The names of the columns to be renamed.
- **column_mapping** (*dict*): The names of the columns to be renamed with values to be remapped to.
- **ignore_missing** (*bool*): If true, the names in column_mapping that are not columns should be ignored.
"""
Expand Down
21 changes: 13 additions & 8 deletions hed/tools/remodeling/operations/split_rows_op.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
""" Split rows in a tabular file into multiple rows based on a column. """
""" Split rows in a tabular file with onset and duration columns into multiple rows based on a specified column. """

import numpy as np
import pandas as pd
from hed.tools.remodeling.operations.base_op import BaseOp


class SplitRowsOp(BaseOp):
""" Split rows in a tabular file into multiple rows based on parameters.
""" Split rows in a tabular file with onset and duration columns into multiple rows based on a specified column.
Required remodeling parameters:
- **anchor_column** (*str*): The column in which the names of new items are stored.
Expand Down Expand Up @@ -106,7 +106,12 @@ def do_op(self, dispatcher, df, name, sidecar=None):
-If bad onset or duration.
"""

if 'onset' not in df.columns:
raise ValueError("MissingOnsetColumn",
f"{name}: Data must have an onset column for split_rows_op")
elif 'duration' not in df.columns:
raise ValueError("MissingDurationColumn",
f"{name}: Data must have an duration column for split_rows_op")
df_new = df.copy()

if self.anchor_column not in df_new.columns:
Expand All @@ -129,14 +134,14 @@ def _split_rows(self, df, df_list):
df_list (list): The list of split events and possibly the
"""
for event, event_parms in self.new_events.items():
for event, event_params in self.new_events.items():
add_events = pd.DataFrame([], columns=df.columns)
add_events['onset'] = self._create_onsets(
df, event_parms['onset_source'])
df, event_params['onset_source'])
add_events[self.anchor_column] = event
self._add_durations(df, add_events, event_parms['duration'])
if len(event_parms['copy_columns']) > 0:
for column in event_parms['copy_columns']:
self._add_durations(df, add_events, event_params['duration'])
if len(event_params['copy_columns']) > 0:
for column in event_params['copy_columns']:
add_events[column] = df[column]

# add_events['event_type'] = event
Expand Down
9 changes: 4 additions & 5 deletions hed/tools/remodeling/operations/summarize_column_values_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class SummarizeColumnValuesOp(BaseOp):
- **summary_filename** (*str*): Base filename of the summary.
Optional remodeling parameters:
- **append_timecode** (*bool*): If false (default), the timecode is not appended to the base filename when summary is saved, otherwise it is.
- **append_timecode** (*bool*): (**Optional**: Default false) If true append timecodes to the base filename when summary is saved.
- **max_categorical** (*int*): Maximum number of unique values to include in summary for a categorical column.
- **skip_columns** (*list*): Names of columns to skip in the summary.
- **value_columns** (*list*): Names of columns to treat as value columns rather than categorical columns.
Expand Down Expand Up @@ -81,12 +81,11 @@ def __init__(self, parameters):
super().__init__(parameters)
self.summary_name = parameters['summary_name']
self.summary_filename = parameters['summary_filename']
self.skip_columns = parameters['skip_columns']
self.value_columns = parameters['value_columns']
self.append_timecode = parameters.get('append_timecode', False)
self.max_categorical = parameters.get('max_categorical', float('inf'))
self.values_per_line = parameters.get(
'values_per_line', self.VALUES_PER_LINE)
self.skip_columns = parameters['skip_columns']
self.value_columns = parameters['value_columns']
self.values_per_line = parameters.get('values_per_line', self.VALUES_PER_LINE)

def do_op(self, dispatcher, df, name, sidecar=None):
""" Create a summary of the column values in df.
Expand Down
3 changes: 1 addition & 2 deletions hed/tools/remodeling/operations/summarize_hed_tags_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,7 @@ def __init__(self, parameters):
self.append_timecode = parameters.get('append_timecode', False)
self.include_context = parameters.get('include_context', True)
self.replace_defs = parameters.get("replace_defs", True)
self.remove_types = parameters.get(
"remove_types", ["Condition-variable", "Task"])
self.remove_types = parameters.get("remove_types", [])

def do_op(self, dispatcher, df, name, sidecar=None):
""" Summarize the HED tags present in the dataset.
Expand Down
Loading

0 comments on commit c9f6254

Please sign in to comment.