Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed handling of optional remodeling arguments #847

Merged
merged 2 commits into from
Feb 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion hed/tools/analysis/event_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,13 +167,15 @@ def get_type_defs(self, types):
""" Return a list of definition names (lower case) that correspond to one of the specified types.

Parameters:
types (list): List of tags that are treated as types such as 'Condition-variable'
types (list or None): List of tags that are treated as types such as 'Condition-variable'

Returns:
list: List of definition names (lower-case) that correspond to the specified types

"""
def_list = []
if not types:
return def_list
for this_type in types:
type_defs = HedTypeDefs(self.def_dict, type_tag=this_type)
def_list = def_list + list(type_defs.def_map.keys())
Expand Down
26 changes: 12 additions & 14 deletions hed/tools/remodeling/operations/factor_column_op.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
""" Create tabular file factor columns from column values. """
""" Append to tabular file columns of factors based on column values. """

from hed.tools.remodeling.operations.base_op import BaseOp

# TODO: Does not handle empty factor names.
# TODO: Does not handle optional return columns.
# TODO: Same length factornames and factorvalues


class FactorColumnOp(BaseOp):
""" Create tabular file factor columns from column values.
""" Append to tabular file columns of factors based on column values.

Required remodeling parameters:
- **column_name** (*str*): The name of a column in the DataFrame.
- **column_name** (*str*): The name of a column in the DataFrame to compute factors from.

Optional remodeling parameters
- **factor_names** (*list*): Names to use as the factor columns.
Expand Down Expand Up @@ -61,8 +57,8 @@ def __init__(self, parameters):
"""
super().__init__(parameters)
self.column_name = parameters['column_name']
self.factor_values = parameters['factor_values']
self.factor_names = parameters['factor_names']
self.factor_values = parameters.get('factor_values', None)
self.factor_names = parameters.get('factor_names', None)

def do_op(self, dispatcher, df, name, sidecar=None):
""" Create factor columns based on values in a specified column.
Expand Down Expand Up @@ -95,10 +91,12 @@ def do_op(self, dispatcher, df, name, sidecar=None):

@staticmethod
def validate_input_data(parameters):
if parameters.get("factor_names", False):
if len(parameters.get("factor_names")) != len(parameters.get("factor_values")):
return ["The list in factor_names, in the factor_column operation, should have the same number of items as factor_values."]
else:
return []
""" Check that factor_names and factor_values have same length if given. """
names = parameters.get("factor_names", None)
values = parameters.get("factor_values", None)
if names and not values:
return ["factor_column_op: factor_names cannot be given without factor_values"]
elif names and values and len(names) != len(values):
return ["factor_column_op: factor_names must be same length as factor_values"]
else:
return []
32 changes: 17 additions & 15 deletions hed/tools/remodeling/operations/factor_hed_tags_op.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" Create tabular file factors from tag queries. """
""" Append to tabular file columns of factors based on HED tag queries. """


import pandas as pd
Expand All @@ -12,19 +12,21 @@


class FactorHedTagsOp(BaseOp):
""" Create tabular file factors from tag queries.
""" Append to tabular file columns of factors based on HED tag queries.

Required remodeling parameters:
- **queries** (*list*): Queries to be applied successively as filters.

Optional remodeling parameters:
- **expand_context** (*bool*): Expand the context if True.
- **query_names** (*list*): Column names for the query factors.
- **remove_types** (*list*): Structural HED tags to be removed.
- **remove_types** (*list*): Structural HED tags to be removed (such as Condition-variable or Task).
- **expand_context** (*bool*): If true, expand the context based on Onset, Offset, and Duration.

Notes:
- If query names are not provided, *query1*, *query2*, ... are used.
- When the context is expanded, the effect of events for temporal extent is accounted for.
- When the context is expanded, the effect of events for temporal extent is accounted for.

"""
NAME = "factor_hed_tags"

Expand All @@ -39,9 +41,6 @@ class FactorHedTagsOp(BaseOp):
"minItems": 1,
"uniqueItems": True
},
"expand_context": {
"type": "boolean"
},
"query_names": {
"type": "array",
"items": {
Expand All @@ -57,6 +56,9 @@ class FactorHedTagsOp(BaseOp):
},
"minItems": 1,
"uniqueItems": True
},
"expand_context": {
"type": "boolean"
}
},
"required": [
Expand All @@ -74,10 +76,10 @@ def __init__(self, parameters):
"""
super().__init__(parameters)
self.queries = parameters['queries']
self.query_names = parameters['query_names']
self.remove_types = parameters['remove_types']
self.remove_types = parameters.get('remove_types', [])
self.expand_context = parameters.get('expand_context', True)
self.expression_parsers, self.query_names = get_expression_parsers(self.queries,
query_names=parameters['query_names'])
parameters.get('query_names', None))

def do_op(self, dispatcher, df, name, sidecar=None):
""" Factor the column using HED tag queries.
Expand Down Expand Up @@ -118,8 +120,8 @@ def do_op(self, dispatcher, df, name, sidecar=None):

@staticmethod
def validate_input_data(parameters):
errors = []
if parameters.get("query_names", False):
if len(parameters.get("query_names")) != len(parameters.get("queries")):
errors.append("The list in query_names, in the factor_hed_tags operation, should have the same number of items as queries.")
return errors
queries = parameters.get("queries", None)
names = parameters.get("query_names", None)
if names and queries and (len(names) != len(parameters["queries"])):
return ["factor_hed_tags_op: query_names must be same length as queries."]
return []
9 changes: 3 additions & 6 deletions hed/tools/remodeling/operations/factor_hed_type_op.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" Create tabular file factors from type variables. """
""" Append to tabular file the factors computed from type variables. """

import pandas as pd
import numpy as np
Expand All @@ -7,11 +7,8 @@
from hed.tools.analysis.event_manager import EventManager
from hed.tools.analysis.hed_type_manager import HedTypeManager

# TODO: restricted factor values are not implemented yet.


class FactorHedTypeOp(BaseOp):
""" Create tabular file factors from type variables and append to tabular data.
""" Append to tabular file the factors computed from type variables.

Required remodeling parameters:
- **type_tag** (*str*): HED tag used to find the factors (most commonly `condition-variable`).
Expand Down Expand Up @@ -52,7 +49,7 @@ def __init__(self, parameters):
"""
super().__init__(parameters)
self.type_tag = parameters["type_tag"]
self.type_values = parameters["type_values"]
self.type_values = parameters.get("type_values", None)

def do_op(self, dispatcher, df, name, sidecar=None):
""" Factor columns based on HED type and append to tabular data.
Expand Down
21 changes: 12 additions & 9 deletions hed/tools/remodeling/operations/merge_consecutive_op.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
""" Merge consecutive rows with same column value. """
""" Merge consecutive rows of a tabular file with same column value. """

import pandas as pd
from hed.tools.remodeling.operations.base_op import BaseOp


class MergeConsecutiveOp(BaseOp):
""" Merge consecutive rows with same column value.
""" Merge consecutive rows of a tabular file with same column value.

Required remodeling parameters:
- **column_name** (*str*): name of column whose consecutive values are to be compared (the merge column).
Expand All @@ -14,7 +14,10 @@ class MergeConsecutiveOp(BaseOp):
- **ignore_missing** (*bool*): If true, missing match_columns are ignored.

Optional remodeling parameters:
- **match_columns** (*list*): A list of columns whose values have to be matched for two events to be the same.
- **match_columns** (*list*): A list of columns whose values have to be matched for two events to be the same.

Notes:
This operation is meant for time-based tabular files that have an onset column.

"""
NAME = "merge_consecutive"
Expand Down Expand Up @@ -63,9 +66,9 @@ def __init__(self, parameters):
super().__init__(parameters)
self.column_name = parameters["column_name"]
self.event_code = parameters["event_code"]
self.match_columns = parameters["match_columns"]
self.set_durations = parameters["set_durations"]
self.ignore_missing = parameters["ignore_missing"]
self.match_columns = parameters.get("match_columns", None)

def do_op(self, dispatcher, df, name, sidecar=None):
""" Merge consecutive rows with the same column value.
Expand Down Expand Up @@ -164,8 +167,8 @@ def _update_durations(df_new, remove_groups):

@staticmethod
def validate_input_data(parameters):
errors = []
if parameters.get("match_columns", False):
if parameters.get("column_name") in parameters.get("match_columns"):
errors.append("The column_name in the merge_consecutive operation cannot be specified as a match_column.")
return errors
match_columns = parameters.get("match_columns", None)
name = parameters.get("column_name", None)
if match_columns and name in match_columns:
return [f"merge_consecutive_op: column_name `{name}` cannot be a match_column."]
return []
34 changes: 15 additions & 19 deletions hed/tools/remodeling/operations/remap_columns_op.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" Map values in m columns into a new combinations in n columns. """
""" Map values in m columns in a tabular file into new combinations in n columns. """

import pandas as pd
import numpy as np
Expand All @@ -7,7 +7,7 @@


class RemapColumnsOp(BaseOp):
""" Map values in m columns into a new combinations in n columns.
""" Map values in m columns in a tabular file into new combinations in n columns.

Required remodeling parameters:
- **source_columns** (*list*): The key columns to map (m key columns).
Expand Down Expand Up @@ -53,7 +53,7 @@ class RemapColumnsOp(BaseOp):
"number"
]
},
"minItems" : 1
"minItems": 1
},
"minItems": 1,
"uniqueItems": True
Expand Down Expand Up @@ -88,15 +88,12 @@ def __init__(self, parameters):
"""
super().__init__(parameters)
self.source_columns = parameters['source_columns']
self.integer_sources = []
self.string_sources = self.source_columns
if "integer_sources" in parameters:
self.string_sources = list(
set(self.source_columns).difference(set(self.integer_sources)))
self.destination_columns = parameters['destination_columns']
self.map_list = parameters['map_list']
self.ignore_missing = parameters['ignore_missing']

self.string_sources = self.source_columns
self.integer_sources = parameters.get('integer_sources', [])
self.string_sources = list(set(self.source_columns).difference(set(self.integer_sources)))
self.key_map = self._make_key_map()

def _make_key_map(self):
Expand Down Expand Up @@ -145,13 +142,12 @@ def do_op(self, dispatcher, df, name, sidecar=None):

@staticmethod
def validate_input_data(parameters):
errors = []
if len(set([len(x) for x in parameters.get("map_list")])) != 1:
errors.append("The lists specified in the map_list parameter in the remap_columns operation should all have the same length.")
else:
if (len(parameters.get('source_columns')) + len(parameters.get("destination_columns"))) != len(parameters.get("map_list")[0]):
errors.append("The lists specified in the map_list parameter in the remap_columns operation should have a length equal to the number of source columns + the number of destination columns.")
if parameters.get("integer_sources", False):
if not all([(x in parameters.get("source_columns")) for x in parameters.get("integer_sources")]):
errors.append("All integer_sources in the remap_columns operation should be source_columns.")
return errors
map_list = parameters["map_list"]
required_len = len(parameters['source_columns']) + len(parameters['destination_columns'])
for x in map_list:
if len(x) != required_len:
return [f"remap_columns_op: all map_list arrays must be of length {str(required_len)}."]
missing = set(parameters.get('integer_sources', [])) - set(parameters['source_columns'])
if missing:
return [f"remap_columns_op: the integer_sources {str(missing)} are missing from source_columns."]
return []
4 changes: 2 additions & 2 deletions hed/tools/remodeling/operations/remove_rows_op.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
""" Remove rows from a tabular file. """
""" Remove rows from a tabular file based on the values in a specified column. """

from hed.tools.remodeling.operations.base_op import BaseOp


class RemoveRowsOp(BaseOp):
""" Remove rows from a tabular file.
""" Remove rows from a tabular file based on the values in a specified column.

Required remodeling parameters:
- **column_name** (*str*): The name of column to be tested.
Expand Down
2 changes: 1 addition & 1 deletion hed/tools/remodeling/operations/rename_columns_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ class RenameColumnsOp (BaseOp):
""" Rename columns in a tabular file.

Required remodeling parameters:
- **column_mapping** (*dict*): The names of the columns to be renamed.
- **column_mapping** (*dict*): Maps the names of the columns to be renamed (keys) to their new names (values).
- **ignore_missing** (*bool*): If true, the names in column_mapping that are not columns are ignored.

"""
Expand Down
21 changes: 13 additions & 8 deletions hed/tools/remodeling/operations/split_rows_op.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
""" Split rows in a tabular file into multiple rows based on a column. """
""" Split rows in a tabular file with onset and duration columns into multiple rows based on a specified column. """

import numpy as np
import pandas as pd
from hed.tools.remodeling.operations.base_op import BaseOp


class SplitRowsOp(BaseOp):
""" Split rows in a tabular file into multiple rows based on parameters.
""" Split rows in a tabular file with onset and duration columns into multiple rows based on a specified column.

Required remodeling parameters:
- **anchor_column** (*str*): The column in which the names of new items are stored.
Expand Down Expand Up @@ -106,7 +106,12 @@ def do_op(self, dispatcher, df, name, sidecar=None):
-If bad onset or duration.

"""

if 'onset' not in df.columns:
raise ValueError("MissingOnsetColumn",
f"{name}: Data must have an onset column for split_rows_op")
elif 'duration' not in df.columns:
raise ValueError("MissingDurationColumn",
f"{name}: Data must have a duration column for split_rows_op")
df_new = df.copy()

if self.anchor_column not in df_new.columns:
Expand All @@ -129,14 +134,14 @@ def _split_rows(self, df, df_list):
df_list (list): The list of split events and possibly the

"""
for event, event_parms in self.new_events.items():
for event, event_params in self.new_events.items():
add_events = pd.DataFrame([], columns=df.columns)
add_events['onset'] = self._create_onsets(
df, event_parms['onset_source'])
df, event_params['onset_source'])
add_events[self.anchor_column] = event
self._add_durations(df, add_events, event_parms['duration'])
if len(event_parms['copy_columns']) > 0:
for column in event_parms['copy_columns']:
self._add_durations(df, add_events, event_params['duration'])
if len(event_params['copy_columns']) > 0:
for column in event_params['copy_columns']:
add_events[column] = df[column]

# add_events['event_type'] = event
Expand Down
9 changes: 4 additions & 5 deletions hed/tools/remodeling/operations/summarize_column_values_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class SummarizeColumnValuesOp(BaseOp):
- **summary_filename** (*str*): Base filename of the summary.

Optional remodeling parameters:
- **append_timecode** (*bool*): If false (default), the timecode is not appended to the base filename when summary is saved, otherwise it is.
- **append_timecode** (*bool*): (**Optional**: Default false) If true append timecodes to the base filename when summary is saved.
- **max_categorical** (*int*): Maximum number of unique values to include in summary for a categorical column.
- **skip_columns** (*list*): Names of columns to skip in the summary.
- **value_columns** (*list*): Names of columns to treat as value columns rather than categorical columns.
Expand Down Expand Up @@ -81,12 +81,11 @@ def __init__(self, parameters):
super().__init__(parameters)
self.summary_name = parameters['summary_name']
self.summary_filename = parameters['summary_filename']
self.skip_columns = parameters['skip_columns']
self.value_columns = parameters['value_columns']
self.append_timecode = parameters.get('append_timecode', False)
self.max_categorical = parameters.get('max_categorical', float('inf'))
self.values_per_line = parameters.get(
'values_per_line', self.VALUES_PER_LINE)
self.skip_columns = parameters['skip_columns']
self.value_columns = parameters['value_columns']
self.values_per_line = parameters.get('values_per_line', self.VALUES_PER_LINE)

def do_op(self, dispatcher, df, name, sidecar=None):
""" Create a summary of the column values in df.
Expand Down
3 changes: 1 addition & 2 deletions hed/tools/remodeling/operations/summarize_hed_tags_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,7 @@ def __init__(self, parameters):
self.append_timecode = parameters.get('append_timecode', False)
self.include_context = parameters.get('include_context', True)
self.replace_defs = parameters.get("replace_defs", True)
self.remove_types = parameters.get(
"remove_types", ["Condition-variable", "Task"])
self.remove_types = parameters.get("remove_types", [])

def do_op(self, dispatcher, df, name, sidecar=None):
""" Summarize the HED tags present in the dataset.
Expand Down
Loading