From e73b7a74d41af7bb002c962db7407867b4c9481c Mon Sep 17 00:00:00 2001
From: IanCa
Date: Wed, 31 Jul 2024 18:20:14 -0500
Subject: [PATCH] Add very rough start on reworking the dispatcher for a
 Boutiques-style command line and reloadable summaries

---
 hed/tools/__init__.py                         |   1 +
 hed/tools/analysis/column_name_summary.py     |  20 +-
 hed/tools/analysis/tabular_summary.py         |  56 +++--
 hed/tools/remodeling/cli/run_remodel.py       |  11 +-
 hed/tools/remodeling/cli/run_summary.py       | 194 ++++++++++++++++++
 hed/tools/remodeling/dispatcher.py            |  72 ++++++-
 .../remodeling/operations/base_summary.py     |  18 +-
 .../operations/summarize_column_names_op.py   |  13 +-
 .../operations/summarize_column_values_op.py  |  60 +++---
 .../operations/summarize_hed_tags_op.py       |   5 +
 10 files changed, 383 insertions(+), 67 deletions(-)
 create mode 100644 hed/tools/remodeling/cli/run_summary.py

diff --git a/hed/tools/__init__.py b/hed/tools/__init__.py
index 2d0bd977..3af0b1b5 100644
--- a/hed/tools/__init__.py
+++ b/hed/tools/__init__.py
@@ -52,3 +52,4 @@
 from .remodeling.cli import run_remodel
 from .remodeling.cli import run_remodel_backup
 from .remodeling.cli import run_remodel_restore
+from .remodeling.cli import run_summary
diff --git a/hed/tools/analysis/column_name_summary.py b/hed/tools/analysis/column_name_summary.py
index 2820a5b3..81035b22 100644
--- a/hed/tools/analysis/column_name_summary.py
+++ b/hed/tools/analysis/column_name_summary.py
@@ -1,5 +1,5 @@
 """ Summarize the unique column names in a dataset. """
-
+import copy
 import json
 
 
@@ -11,6 +11,24 @@ def __init__(self, name=''):
         self.file_dict = {}
         self.unique_headers = []
 
+    @staticmethod
+    def load_as_json2(json_data):
+        summary = ColumnNameSummary()
+        json_data = json_data["File summary"]
+        summary.name = json_data["Name"]
+        # summary.total_events = json_data["Total events"]
+        # summary.total_files = json_data["Total files"]
+        specifics = json_data["Specifics"]
+        all_column_data = specifics["Columns"]
+        for index, column_data in enumerate(all_column_data):
+            file_list = column_data["Files"]
+            unique_header = column_data["Column names"]
+            summary.unique_headers.append(unique_header)
+            for file in file_list:
+                summary.file_dict[file] = index
+
+        return summary
+
     def update(self, name, columns):
         """ Update the summary based on columns associated with a file.
diff --git a/hed/tools/analysis/tabular_summary.py b/hed/tools/analysis/tabular_summary.py
index 76d26c69..7bb24968 100644
--- a/hed/tools/analysis/tabular_summary.py
+++ b/hed/tools/analysis/tabular_summary.py
@@ -1,6 +1,5 @@
 """ Summarize the contents of columnar files. """
-
 import json
 from hed.errors.exceptions import HedFileError
 from hed.tools.util import data_util
@@ -74,6 +73,32 @@ def extract_sidecar_template(self):
             side_dict[column_name] = annotation_util.generate_sidecar_entry(column_name, [])
         return side_dict
 
+    @staticmethod
+    def load_as_json2(json_data):
+        summary = TabularSummary()
+        json_data = json_data["File summary"]
+        summary.name = json_data["Name"]
+        summary.total_events = json_data["Total events"]
+        summary.total_files = json_data["Total files"]
+        specifics = json_data["Specifics"]
+        # todo ian: this doesn't use value column summaries or categorical counts? What?
+        summary.categorical_info = specifics["Categorical column summaries"]
+        summary.value_info = specifics["Value column summaries"]
+        summary.skip_cols = specifics["Skip columns"]
+        # summary.files = specifics["Files"]
+
+        return summary
+
+    def _sort_internal(self):
+        categorical_cols = {}
+        for key in sorted(self.categorical_info):
+            cat_dict = self.categorical_info[key]
+            val_dict = {v_key: cat_dict[v_key] for v_key in sorted(cat_dict.keys())}
+            categorical_cols[key] = val_dict
+        value_cols = {key: self.value_info[key] for key in sorted(self.value_info)}
+        self.categorical_info = categorical_cols
+        self.value_info = value_cols
+
     def get_summary(self, as_json=False):
         """ Return the summary in dictionary format.
 
         Parameters:
             as_json (bool): If False, return as a Python dictionary, otherwise convert to a JSON dictionary.
 
         """
-        sorted_keys = sorted(self.categorical_info.keys())
-        categorical_cols = {}
-        for key in sorted_keys:
-            cat_dict = self.categorical_info[key]
-            sorted_v_keys = sorted(list(cat_dict))
-            val_dict = {}
-            for v_key in sorted_v_keys:
-                val_dict[v_key] = cat_dict[v_key]
-            categorical_cols[key] = val_dict
-        sorted_cols = sorted(map(str, list(self.value_info)))
-        value_cols = {}
-        for key in sorted_cols:
-            value_cols[key] = self.value_info[key]
-        summary = {"Name": self.name, "Total events": self.total_events, "Total files": self.total_files,
-                   "Categorical columns": categorical_cols, "Value columns": value_cols,
-                   "Skip columns": self.skip_cols, "Files": self.files}
+        self._sort_internal()
+        summary = {"Name": self.name,
+                   "Total events": self.total_events,
+                   "Total files": self.total_files,
+                   "Categorical columns": self.categorical_info,
+                   "Value columns": self.value_info,
+                   "Skip columns": self.skip_cols,
+                   "Files": self.files}
+
+        # reloaded_summary = self.load_as_json(summary)
+
         if as_json:
             return json.dumps(summary, indent=4)
         else:
@@ -198,7 +218,7 @@ def _update_dataframe(self, data, name):
             else:
                 col_values = col_values.astype(str)
                 values = col_values.value_counts(ascending=True)
-            self._update_categorical(col_name, values)
+                self._update_categorical(col_name, values)
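(For context: a rough sketch of how the load_as_json2 round trip above is meant to be used. The path and file name are hypothetical; the top-level keys, "Summary name", "Summary filename", and "File summary", match what the dispatcher's load_existing_summaries reads later in this patch.)

    import json
    from hed.tools.analysis.tabular_summary import TabularSummary

    # Hypothetical individual summary file written by an earlier remodeling run.
    summary_path = ("derivatives/remodel/summaries/column_values_summary/"
                    "individual_summaries/column_values_sub-01_2024_07_31_T_18_20_14_123.json")

    with open(summary_path) as fp:
        saved = json.load(fp)

    # load_as_json2 unwraps the "File summary" entry and rebuilds the object.
    reloaded = TabularSummary.load_as_json2(saved)
    print(reloaded.name, reloaded.total_events, reloaded.total_files)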
""" - +import copy import os import json import argparse @@ -62,6 +62,8 @@ def get_parser(): help="If given, is the path to directory for saving, otherwise derivatives/remodel is used.") parser.add_argument("-x", "--exclude-dirs", nargs="*", default=[], dest="exclude_dirs", help="Directories names to exclude from search for files.") + parser.add_argument("-a", "--analysis-level", dest="analysis_level", default="none", + choices=["participant", "group", "none"]) return parser @@ -224,13 +226,17 @@ def main(arg_list=None): for task, files in task_dict.items(): dispatch = Dispatcher(operations, data_root=args.data_dir, backup_name=backup_name, hed_versions=args.hed_versions) + if args.use_bids: run_bids_ops(dispatch, args, files) else: run_direct_ops(dispatch, args, files) + if not args.no_summaries: + # Todo ian: replace dataset_summary variable dispatch.save_summaries(args.save_formats, individual_summaries=args.individual_summaries, - summary_dir=save_dir, task_name=task) + summary_dir=save_dir, task_name=task, + dataset_summary=args.analysis_level != "participant") except Exception as ex: if args.log_dir: log_name = io_util.get_alphanumeric_path(os.path.realpath(args.data_dir)) + '_' + timestamp + '.txt' @@ -239,5 +245,6 @@ def main(arg_list=None): raise + if __name__ == '__main__': main() diff --git a/hed/tools/remodeling/cli/run_summary.py b/hed/tools/remodeling/cli/run_summary.py new file mode 100644 index 00000000..7d07231f --- /dev/null +++ b/hed/tools/remodeling/cli/run_summary.py @@ -0,0 +1,194 @@ +""" Main command-line program for running the remodeling tools. """ +import copy +import os +import json +import argparse +import logging +from hed.errors.exceptions import HedFileError + +from hed.tools.bids.bids_dataset import BidsDataset +from hed.tools.remodeling.remodeler_validator import RemodelerValidator +from hed.tools.remodeling.dispatcher import Dispatcher +from hed.tools.remodeling.backup_manager import BackupManager +from hed.tools.util import io_util + + +def get_parser(): + """ Create a parser for the run_remodel command-line arguments. + + Returns: + argparse.ArgumentParser: A parser for parsing the command line arguments. + + """ + parser = argparse.ArgumentParser(description="Converts event files based on a json file specifying operations.") + parser.add_argument("data_dir", help="Full path of dataset root directory.") + parser.add_argument("model_path", help="Full path of the file with remodeling instructions.") + parser.add_argument("-bd", "--backup_dir", default="", dest="backup_dir", + help="Directory for the backup that is being created") + parser.add_argument("-bn", "--backup_name", default=BackupManager.DEFAULT_BACKUP_NAME, dest="backup_name", + help="Name of the default backup for remodeling") + parser.add_argument("-b", "--bids-format", action='store_true', dest="use_bids", + help="If present, the dataset is in BIDS format with sidecars. 
+    parser.add_argument("-e", "--extensions", nargs="*", default=['.tsv'], dest="extensions",
+                        help="File extensions to allow in locating files.")
+    parser.add_argument("-f", "--file-suffix", dest="file_suffix", default='events',
+                        help="Filename suffix excluding file type of items to be analyzed (events by default).")
+    parser.add_argument("-i", "--individual-summaries", dest="individual_summaries", default="separate",
+                        choices=["separate", "consolidated", "none"],
+                        help="Controls individual file summaries ('none', 'separate', 'consolidated')")
+    parser.add_argument("-j", "--json-sidecar", dest="json_sidecar", nargs="?",
+                        help="Optional path to JSON sidecar with HED information")
+    parser.add_argument("-ld", "--log_dir", dest="log_dir", default="",
+                        help="Directory for storing log entries for errors.")
+#     parser.add_argument("-n", "--backup-name", default=BackupManager.DEFAULT_BACKUP_NAME, dest="backup_name",
+#                         help="Name of the default backup for remodeling")
+    parser.add_argument("-nb", "--no-backup", action='store_true', dest="no_backup",
+                        help="If present, the operations are run directly on the files with no backup.")
+    parser.add_argument("-ns", "--no-summaries", action='store_true', dest="no_summaries",
+                        help="If present, the summaries are not saved, but rather discarded.")
+    parser.add_argument("-nu", "--no-update", action='store_true', dest="no_update",
+                        help="If present, the files are not saved, but rather discarded.")
+    parser.add_argument("-r", "--hed-versions", dest="hed_versions", nargs="*", default=[],
+                        help="Optional list of HED schema versions used for annotation, include prefixes.")
+    parser.add_argument("-s", "--save-formats", nargs="*", default=['.json', '.txt'], dest="save_formats",
+                        help="Format for saving any summaries. If no summaries are to be written, " +
+                             "use the -ns option.")
+    parser.add_argument("-t", "--task-names", dest="task_names", nargs="*", default=[],
+                        help="The names of the tasks. If an empty list is given, all tasks are lumped together." +
+                             " If * is given, then tasks are found and reported individually.")
+    parser.add_argument("-v", "--verbose", action='store_true',
+                        help="If present, output informative messages as computation progresses.")
+    parser.add_argument("-w", "--work-dir", default="", dest="work_dir",
+                        help="If given, is the path to directory for saving, otherwise derivatives/remodel is used.")
+    parser.add_argument("-x", "--exclude-dirs", nargs="*", default=[], dest="exclude_dirs",
+                        help="Directory names to exclude from search for files.")
+    parser.add_argument("-a", "--analysis-level", dest="analysis_level", default="group",
+                        choices=["group"])
+    return parser
+
+
+def handle_backup(args):
+    """ Restore the backup if applicable.
+
+    Parameters:
+        args (obj): Parsed arguments as an object.
+
+    Returns:
+        str or None: Backup name if there was a backup done.
+
+    """
+    if args.no_backup:
+        backup_name = None
+    else:
+        backup_man = BackupManager(args.data_dir)
+        if not backup_man.get_backup(args.backup_name):
+            raise HedFileError("BackupDoesNotExist", f"Backup {args.backup_name} does not exist. "
+                               f"Please run run_remodel_backup first", "")
+        backup_man.restore_backup(args.backup_name, args.task_names, verbose=args.verbose)
+        backup_name = args.backup_name
+    return backup_name
+
+
+def parse_arguments(arg_list=None):
+    """ Parse the command line arguments or arg_list if given.
+
+    Parameters:
+        arg_list (list): List of command line arguments as a list.
+
+    Returns:
+        Object: Argument object.
+        List: A list of parsed operations (each operation is a dictionary).
+
+    :raises ValueError:
+        - If the operations were unable to be correctly parsed.
+
+    """
+    parser = get_parser()
+    args = parser.parse_args(arg_list)
+    if '*' in args.file_suffix:
+        args.file_suffix = None
+    if '*' in args.extensions:
+        args.extensions = None
+    args.data_dir = os.path.realpath(args.data_dir)
+    args.exclude_dirs = args.exclude_dirs + ['remodel']
+    args.model_path = os.path.realpath(args.model_path)
+    if args.verbose:
+        print(f"Data directory: {args.data_dir}\nModel path: {args.model_path}")
+    with open(args.model_path, 'r') as fp:
+        operations = json.load(fp)
+    validator = RemodelerValidator()
+    errors = validator.validate(operations)
+    if errors:
+        raise ValueError("UnableToFullyParseOperations",
+                         f"Fatal operation error, cannot continue:\n{errors}")
+    return args, operations
+
+
+def parse_tasks(files, task_args):
+    """ Parse the tasks argument to get a task list.
+
+    Parameters:
+        files (list): List of full paths of files.
+        task_args (str or list): The argument values for the task parameter.
+
+    Returns:
+        dict: Dictionary mapping task names to lists of file paths.
+
+    """
+    if not task_args:
+        return {"": files}
+    task_dict = io_util.get_task_dict(files)
+    if task_args == "*" or (isinstance(task_args, list) and task_args[0] == "*"):
+        return task_dict
+    task_dict = {key: task_dict[key] for key in task_args if key in task_dict}
+    return task_dict
+
+
+def main(arg_list=None):
+    """ The command-line program.
+
+    Parameters:
+        arg_list (list or None): Called with value None when called from the command line.
+            Otherwise, called with the command-line parameters as an argument list.
+
+    :raises HedFileError:
+        - if the data root directory does not exist.
+        - if the specified backup does not exist.
+
+    """
+    args, operations = parse_arguments(arg_list)
+
+    if args.log_dir:
+        os.makedirs(args.log_dir, exist_ok=True)
+    timestamp = io_util.get_timestamp()
+    try:
+        if not os.path.isdir(args.data_dir):
+            raise HedFileError("DataDirectoryDoesNotExist",
+                               f"The root data directory {args.data_dir} does not exist", "")
+        backup_name = handle_backup(args)
+        save_dir = None
+        if args.work_dir:
+            save_dir = os.path.realpath(os.path.join(args.work_dir, Dispatcher.REMODELING_SUMMARY_PATH))
+        files = io_util.get_file_list(args.data_dir, name_suffix=args.file_suffix, extensions=args.extensions,
+                                      exclude_dirs=args.exclude_dirs)
+        task_dict = parse_tasks(files, args.task_names)
+        for task, files in task_dict.items():
+            dispatch = Dispatcher(operations, data_root=args.data_dir, backup_name=backup_name,
+                                  hed_versions=args.hed_versions)
+
+            # next task: add a makeshift "analysis level" parameter.
+            # participant = generate individual summaries; group = reload individual summaries on load
+            # Need a way to determine WHICH run to reload options from
+
+            dispatch.load_existing_summaries(save_dir)
+
+            if not args.no_summaries:
+                dispatch.save_summaries(args.save_formats, individual_summaries=args.individual_summaries,
+                                        summary_dir=save_dir, task_name=task)
+    except Exception as ex:
+        if args.log_dir:
+            log_name = io_util.get_alphanumeric_path(os.path.realpath(args.data_dir)) + '_' + timestamp + '.txt'
+            logging.basicConfig(filename=os.path.join(args.log_dir, log_name), level=logging.ERROR)
+            logging.exception(f"{args.data_dir}: {args.model_path}")
+        raise
+
+
+
+if __name__ == '__main__':
+    main()
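A sketch of the intended two-pass flow, assuming a dataset root and a remodel file with summarize operations at hypothetical paths. The participant pass saves only the individual summaries (run_remodel passes dataset_summary=False when -a participant is given); run_summary then reloads them and writes the merged group summaries.

    from hed.tools.remodeling.cli import run_remodel, run_summary

    data_dir = "/data/ds_example"                    # hypothetical dataset root
    model_path = "/data/models/summarize_rmdl.json"  # hypothetical remodel file

    # Pass 1 (participant level): run the operations, keep individual summaries only.
    run_remodel.main([data_dir, model_path, "-nb", "-nu", "-a", "participant"])

    # Pass 2 (group level): reload the individual summaries, write the merged ones.
    run_summary.main([data_dir, model_path, "-nb"])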
diff --git a/hed/tools/remodeling/dispatcher.py b/hed/tools/remodeling/dispatcher.py
index 04e6d809..dd844532 100644
--- a/hed/tools/remodeling/dispatcher.py
+++ b/hed/tools/remodeling/dispatcher.py
@@ -4,6 +4,8 @@
 import numpy as np
 import pandas as pd
 import json
+import re
+from collections import defaultdict
 from hed.errors.exceptions import HedFileError
 from hed.schema.hed_schema_io import load_schema_version
 from hed.schema.hed_schema import HedSchema
@@ -154,8 +156,75 @@ def run_operations(self, file_path, sidecar=None, verbose=False):
             df = self.post_proc_data(df)
         return df
 
+    def load_existing_summaries(self, summary_dir):
+        if not summary_dir:
+            summary_dir = self.get_summary_save_dir()
+
+        for operation in self.parsed_ops:
+            if hasattr(operation, "get_summary_class"):
+                # todo ian: How do we carefully get the name if the names aren't unique?
+                #  Are there names in the files?
+                name = str(operation.summary_name)
+                filename = operation.summary_filename
+
+                final_dir = os.path.join(summary_dir, name, "individual_summaries")
+                if not os.path.exists(final_dir):
+                    continue
+
+                base_files = os.listdir(final_dir)
+                # Need to filter out names here -> break name into parts (is there a function for this?)
+                # possible_files = [os.path.join(final_dir, file) for file in base_files]
+                possible_files = defaultdict(list)
+                # todo ian: not robust.  Also put in a file
+                for file in base_files:
+                    if not file.endswith(".json"):
+                        continue
+
+                    final_filename = os.path.join(final_dir, file)
+                    with open(final_filename) as fp:
+                        json_data = json.load(fp)
+                    loaded_name = json_data["Summary name"]
+                    loaded_filename = json_data["Summary filename"]
+                    if loaded_name != name or loaded_filename != filename:
+                        continue
+
+                    # todo ian: this is all lazily done so far
+                    trimmed_file = file[len(filename) + 1:]
+                    trimmed_file, _ = os.path.splitext(trimmed_file)
+
+                    # Look for a timestamp ending a file.
+                    timecode_pattern = r'(.*)(\d{4}_\d{2}_\d{2}_T_\d{2}_\d{2}_\d{2}_\d{3})$'
+                    timecode_match = re.match(timecode_pattern, trimmed_file)
+                    suffix = ""
+                    if timecode_match:
+                        # Drop the trailing separator so rpartition finds the run number.
+                        trimmed_file = timecode_match.group(1).rstrip("_")
+                        suffix = timecode_match.group(2)
+                    trimmed_file, _, run_number = trimmed_file.rpartition("_")
+                    final_suffix = "_".join([run_number, suffix])
+                    possible_files[final_suffix].append(final_filename)
+                    # todo ian: need some way to identify a run here
+
+                if not possible_files:
+                    # todo: decide what error, if any, belongs here
+                    continue
+
+                actual_files = possible_files[list(possible_files)[0]]
+
+                # How do we know which one to load here?
+                # Do we do all of them?
+                summary = operation.get_summary_class()(operation)
+
+                # todo ian: this is probably an issue, don't some of these require other params?
+                sub_summary = summary.get_sub_summary_class()
+                summary.summary_dict = {}
+                for file in actual_files:
+                    with open(file) as fp:
+                        summary.summary_dict[file] = sub_summary.load_as_json2(json.load(fp))
+                self.summary_dicts[operation.summary_name] = summary
+
+
     def save_summaries(self, save_formats=['.json', '.txt'], individual_summaries="separate",
-                       summary_dir=None, task_name=""):
+                       summary_dir=None, task_name="", dataset_summary=True):
         """ Save the summary files in the specified formats.
 
         Parameters:
@@ -179,7 +248,8 @@ def save_summaries(self, save_formats=['.json', '.txt'], individual_summaries="s
             summary_dir = self.get_summary_save_dir()
         os.makedirs(summary_dir, exist_ok=True)
         for summary_name, summary_item in self.summary_dicts.items():
-            summary_item.save(summary_dir, save_formats, individual_summaries=individual_summaries, task_name=task_name)
+            summary_item.save(summary_dir, save_formats, individual_summaries=individual_summaries, task_name=task_name,
+                              dataset_summary=dataset_summary)
 
     @staticmethod
     def parse_operations(operation_list):
diff --git a/hed/tools/remodeling/operations/base_summary.py b/hed/tools/remodeling/operations/base_summary.py
index dfafd05a..0296a0e8 100644
--- a/hed/tools/remodeling/operations/base_summary.py
+++ b/hed/tools/remodeling/operations/base_summary.py
@@ -143,7 +143,8 @@ def get_text_summary(self, individual_summaries="separate"):
 
         return summary
 
-    def save(self, save_dir, file_formats=['.txt'], individual_summaries="separate", task_name=""):
+    def save(self, save_dir, file_formats=('.txt',), individual_summaries="separate", task_name="",
+             dataset_summary=True):
         """ Save the summaries using the format indicated.
 
         Parameters:
@@ -160,12 +161,13 @@ def save(self, save_dir, file_formats=['.txt'], individual_summaries="separate",
                 summary = self.get_summary(individual_summaries=individual_summaries)
             else:
                 continue
-            self._save_summary_files(save_dir, file_format, summary, individual_summaries, task_name=task_name)
+            self._save_summary_files(save_dir, file_format, summary, individual_summaries, task_name=task_name,
+                                     dataset_summary=dataset_summary)
 
         self.save_visualizations(save_dir, file_formats=file_formats, individual_summaries=individual_summaries,
                                  task_name=task_name)
 
-    def save_visualizations(self, save_dir, file_formats=['.svg'], individual_summaries="separate", task_name=""):
+    def save_visualizations(self, save_dir, file_formats=('.svg',), individual_summaries="separate", task_name=""):
         """ Save summary visualizations, if any, using the format indicated.
 
         Parameters:
@@ -177,7 +179,8 @@ def save_visualizations(self, save_dir, file_formats=['.svg'], individual_summar
         """
         pass
 
-    def _save_summary_files(self, save_dir, file_format, summary, individual_summaries, task_name=''):
+    def _save_summary_files(self, save_dir, file_format, summary, individual_summaries, task_name='',
+                            dataset_summary=True):
         """ Save the files in the appropriate format.
 
         Parameters:
@@ -200,12 +203,15 @@ def _save_summary_files(self, save_dir, file_format, summary, individual_summari
                                              self.op.summary_filename + task_name + time_stamp + file_format))
         individual = summary.get("Individual files", {})
         if individual_summaries == "none" or not individual:
-            self.dump_summary(filename, summary["Dataset"])
+            if dataset_summary:
+                self.dump_summary(filename, summary["Dataset"])
             return
         if individual_summaries == "consolidated":
             self.dump_summary(filename, summary)
             return
-        self.dump_summary(filename, summary["Dataset"])
+        # todo ian: this is very clunky, replace variable dataset_summary
+        if dataset_summary:
+            self.dump_summary(filename, summary["Dataset"])
         individual_dir = os.path.join(this_save, self.INDIVIDUAL_SUMMARIES_PATH + '/')
         os.makedirs(os.path.realpath(individual_dir), exist_ok=True)
         for name, sum_str in individual.items():
diff --git a/hed/tools/remodeling/operations/summarize_column_names_op.py b/hed/tools/remodeling/operations/summarize_column_names_op.py
index 8dbdb15b..54af74a2 100644
--- a/hed/tools/remodeling/operations/summarize_column_names_op.py
+++ b/hed/tools/remodeling/operations/summarize_column_names_op.py
@@ -57,6 +57,10 @@ def __init__(self, parameters):
         self.summary_filename = parameters['summary_filename']
         self.append_timecode = parameters.get('append_timecode', False)
 
+    @staticmethod
+    def get_summary_class():
+        return ColumnNamesSummary
+
     def do_op(self, dispatcher, df, name, sidecar=None):
         """ Create a column name summary for df.
 
@@ -74,10 +78,7 @@ def do_op(self, dispatcher, df, name, sidecar=None):
 
         """
         df_new = df.copy()
-        summary = dispatcher.summary_dicts.get(self.summary_name, None)
-        if not summary:
-            summary = ColumnNamesSummary(self)
-            dispatcher.summary_dicts[self.summary_name] = summary
+        summary = dispatcher.summary_dicts.setdefault(self.summary_name, self.get_summary_class()(self))
         summary.update_summary(
             {"name": name, "column_names": list(df_new.columns)})
         return df_new
@@ -99,6 +100,10 @@ def __init__(self, sum_op):
 
         """
         super().__init__(sum_op)
 
+    @staticmethod
+    def get_sub_summary_class():
+        return ColumnNameSummary
+
     def update_summary(self, new_info):
         """ Update the summary for a given tabular input file.
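The reload path relies on the simple pairing that this file and the next one implement: each summarize operation exposes get_summary_class(), and the summary exposes get_sub_summary_class() for the per-file object it aggregates. A minimal sketch of that contract follows, under the assumptions that the hypothetical operation list below is valid and that backup_name=None skips the backup check:

    from hed.tools.remodeling.dispatcher import Dispatcher

    operations = [{"operation": "summarize_column_names",
                   "description": "Summarize the column names.",
                   "parameters": {"summary_name": "column_names_summary",
                                  "summary_filename": "column_names_summary"}}]

    dispatch = Dispatcher(operations, data_root="/data/ds_example", backup_name=None)
    op = dispatch.parsed_ops[0]

    summary = op.get_summary_class()(op)       # a ColumnNamesSummary instance
    sub_cls = summary.get_sub_summary_class()  # ColumnNameSummary, reloadable via load_as_json2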
diff --git a/hed/tools/remodeling/operations/summarize_column_values_op.py b/hed/tools/remodeling/operations/summarize_column_values_op.py
index 612e4b5e..c76eb06d 100644
--- a/hed/tools/remodeling/operations/summarize_column_values_op.py
+++ b/hed/tools/remodeling/operations/summarize_column_values_op.py
@@ -77,6 +77,10 @@ class SummarizeColumnValuesOp(BaseOp):
     VALUES_PER_LINE = 5
     MAX_CATEGORICAL = 50
 
+    @staticmethod
+    def get_summary_class():
+        return ColumnValueSummary
+
     def __init__(self, parameters):
         """ Constructor for the summarize column values operation.
 
@@ -110,14 +114,11 @@ def do_op(self, dispatcher, df, name, sidecar=None):
 
             Updates the relevant summary.
 
         """
-
+        # todo ian: does this actually help us? Maybe? I was trying to
+        # standardize how these do_op functions create the summary.
         df_new = df.copy()
-        summary = dispatcher.summary_dicts.get(self.summary_name, None)
-        if not summary:
-            summary = ColumnValueSummary(self)
-            dispatcher.summary_dicts[self.summary_name] = summary
-        summary.update_summary(
-            {'df': dispatcher.post_proc_data(df_new), 'name': name})
+        summary = dispatcher.summary_dicts.setdefault(self.summary_name, self.get_summary_class()(self))
+        summary.update_summary({'df': dispatcher.post_proc_data(df_new), 'name': name})
         return df_new
 
     @staticmethod
@@ -138,6 +139,10 @@ def __init__(self, sum_op):
 
         """
         super().__init__(sum_op)
 
+    @staticmethod
+    def get_sub_summary_class():
+        return TabularSummary
+
     def update_summary(self, new_info):
         """ Update the summary for a given tabular input file.
 
@@ -151,9 +156,8 @@ def update_summary(self, new_info):
 
         """
         name = new_info['name']
         if name not in self.summary_dict:
-            self.summary_dict[name] = \
-                TabularSummary(value_cols=self.op.value_columns,
-                               skip_cols=self.op.skip_columns, name=name)
+            self.summary_dict[name] = TabularSummary(value_cols=self.op.value_columns, skip_cols=self.op.skip_columns,
+                                                     name=name)
         self.summary_dict[name].update(new_info['df'])
 
     def get_details_dict(self, summary):
@@ -167,17 +171,13 @@ def get_details_dict(self, summary):
 
         """
         this_summary = summary.get_summary(as_json=False)
-        unique_counts = [(key, len(count_dict)) for key,
-                         count_dict in this_summary['Categorical columns'].items()]
+        unique_counts = [(key, len(count_dict)) for key, count_dict in this_summary['Categorical columns'].items()]
         this_summary['Categorical counts'] = dict(unique_counts)
         for key, dict_entry in this_summary['Categorical columns'].items():
-            num_disp, sorted_tuples = ColumnValueSummary.sort_dict(
-                dict_entry, reverse=True)
-            this_summary['Categorical columns'][key] = dict(
-                sorted_tuples[:min(num_disp, self.op.max_categorical)])
+            num_disp, sorted_tuples = ColumnValueSummary.sort_dict(dict_entry, reverse=True)
+            this_summary['Categorical columns'][key] = dict(sorted_tuples[:min(num_disp, self.op.max_categorical)])
         return {"Name": this_summary['Name'], "Total events": this_summary["Total events"],
-                "Total files": this_summary['Total files'],
-                "Files": list(this_summary['Files'].keys()),
+                "Total files": this_summary['Total files'], "Files": list(this_summary['Files'].keys()),
                 "Specifics": {"Value columns": list(this_summary['Value columns']),
                               "Skip columns": this_summary['Skip columns'],
                               "Value column summaries": this_summary['Value columns'],
@@ -191,8 +191,7 @@ def merge_all_info(self):
 
             TabularSummary - the summary object for column values.
""" - all_sum = TabularSummary( - value_cols=self.op.value_columns, skip_cols=self.op.skip_columns, name='Dataset') + all_sum = TabularSummary(value_cols=self.op.value_columns, skip_cols=self.op.skip_columns, name='Dataset') for counts in self.summary_dict.values(): all_sum.update_summary(counts) return all_sum @@ -238,13 +237,10 @@ def _get_categorical_string(self, result, offset="", indent=" "): if not cat_dict: return "" count_dict = result['Categorical counts'] - sum_list = [ - f"{offset}{indent}Categorical column values[Events, Files]:"] + sum_list = [f"{offset}{indent}Categorical column values[Events, Files]:"] sorted_tuples = sorted(cat_dict.items(), key=lambda x: x[0]) for entry in sorted_tuples: - sum_list = sum_list + \ - self._get_categorical_col( - entry, count_dict, offset="", indent=" ") + sum_list = sum_list + self._get_categorical_col(entry, count_dict, offset="", indent=" ") return "\n".join(sum_list) def _get_detail_list(self, result, indent=BaseSummary.DISPLAY_INDENT): @@ -260,14 +256,12 @@ def _get_detail_list(self, result, indent=BaseSummary.DISPLAY_INDENT): """ sum_list = [] specifics = result["Specifics"] - cat_string = self._get_categorical_string( - specifics, offset="", indent=indent) + cat_string = self._get_categorical_string(specifics, offset="", indent=indent) if cat_string: sum_list.append(cat_string) val_dict = specifics.get("Value column summaries", {}) if val_dict: - sum_list.append(ColumnValueSummary._get_value_string( - val_dict, offset="", indent=indent)) + sum_list.append(ColumnValueSummary._get_value_string(val_dict, offset="", indent=indent)) return sum_list def _get_categorical_col(self, entry, count_dict, offset="", indent=" "): @@ -289,8 +283,7 @@ def _get_categorical_col(self, entry, count_dict, offset="", indent=" "): # Create and partition the list of individual entries value_list = [f"{item[0]}{str(item[1])}" for item in entry[1].items()] value_list = value_list[:num_disp] - part_list = ColumnValueSummary.partition_list( - value_list, self.op.values_per_line) + part_list = ColumnValueSummary.partition_list(value_list, self.op.values_per_line) return col_list + [f"{offset}{indent * 3}{ColumnValueSummary.get_list_str(item)}" for item in part_list] @staticmethod @@ -321,11 +314,10 @@ def partition_list(lst, n): def _get_value_string(val_dict, offset="", indent=""): sum_list = [f"{offset}{indent}Value columns[Events, Files]:"] for col_name, val_counts in val_dict.items(): - sum_list.append(f"{offset}{indent*2}{col_name}{str(val_counts)}") + sum_list.append(f"{offset}{indent * 2}{col_name}{str(val_counts)}") return "\n".join(sum_list) @staticmethod def sort_dict(count_dict, reverse=False): - sorted_tuples = sorted( - count_dict.items(), key=lambda x: x[1][0], reverse=reverse) + sorted_tuples = sorted(count_dict.items(), key=lambda x: x[1][0], reverse=reverse) return len(sorted_tuples), sorted_tuples diff --git a/hed/tools/remodeling/operations/summarize_hed_tags_op.py b/hed/tools/remodeling/operations/summarize_hed_tags_op.py index 89b678ee..e6b875e4 100644 --- a/hed/tools/remodeling/operations/summarize_hed_tags_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_tags_op.py @@ -235,6 +235,10 @@ def __init__(self, sum_op): super().__init__(sum_op) self.sum_op = sum_op + # @staticmethod + # def get_sub_summary_class(): + # return ColumnNameSummary + def update_summary(self, new_info): """ Update the summary for a given tabular input file. 
@@ -247,6 +251,7 @@ def update_summary(self, new_info):
         """
         counts = HedTagCounts(
             new_info['name'], total_events=len(new_info['df']))
+        # todo ian: This should maybe instead call counts.update or counts.update_summary
         input_data = TabularInput(
             new_info['df'], sidecar=new_info['sidecar'], name=new_info['name'])
         tag_man = HedTagManager(EventManager(input_data, new_info['schema']),
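As a worked example of the timestamp and run-number grouping in load_existing_summaries above (the stem is hypothetical, and the rstrip("_") mirrors the fix applied in that method):

    import re

    timecode_pattern = r'(.*)(\d{4}_\d{2}_\d{2}_T_\d{2}_\d{2}_\d{2}_\d{3})$'

    # Hypothetical stem after the summary filename prefix and ".json" are stripped.
    stem = "task-stop_run-1_2024_07_31_T_18_20_14_123"
    match = re.match(timecode_pattern, stem)
    base = match.group(1).rstrip("_")   # "task-stop_run-1"
    suffix = match.group(2)             # "2024_07_31_T_18_20_14_123"
    base, _, run_number = base.rpartition("_")
    print(base, run_number, suffix)     # task-stop run-1 2024_07_31_T_18_20_14_123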