From e73b7a74d41af7bb002c962db7407867b4c9481c Mon Sep 17 00:00:00 2001
From: IanCa
Date: Wed, 31 Jul 2024 18:20:14 -0500
Subject: [PATCH] Add very rough start on reworking the dispatcher for a
 Boutiques-style command line and reloadable summaries

---
 hed/tools/__init__.py                         |   1 +
 hed/tools/analysis/column_name_summary.py     |  20 +-
 hed/tools/analysis/tabular_summary.py         |  56 +++--
 hed/tools/remodeling/cli/run_remodel.py       |  11 +-
 hed/tools/remodeling/cli/run_summary.py       | 194 ++++++++++++++++++
 hed/tools/remodeling/dispatcher.py            |  72 ++++++-
 .../remodeling/operations/base_summary.py     |  18 +-
 .../operations/summarize_column_names_op.py   |  13 +-
 .../operations/summarize_column_values_op.py  |  60 +++---
 .../operations/summarize_hed_tags_op.py       |   5 +
 10 files changed, 383 insertions(+), 67 deletions(-)
 create mode 100644 hed/tools/remodeling/cli/run_summary.py

diff --git a/hed/tools/__init__.py b/hed/tools/__init__.py
index 2d0bd977..3af0b1b5 100644
--- a/hed/tools/__init__.py
+++ b/hed/tools/__init__.py
@@ -52,3 +52,4 @@
 from .remodeling.cli import run_remodel
 from .remodeling.cli import run_remodel_backup
 from .remodeling.cli import run_remodel_restore
+from .remodeling.cli import run_summary
diff --git a/hed/tools/analysis/column_name_summary.py b/hed/tools/analysis/column_name_summary.py
index 2820a5b3..81035b22 100644
--- a/hed/tools/analysis/column_name_summary.py
+++ b/hed/tools/analysis/column_name_summary.py
@@ -1,5 +1,5 @@
 """ Summarize the unique column names in a dataset. """
-
+import copy
 import json
 
 
@@ -11,6 +11,24 @@ def __init__(self, name=''):
         self.file_dict = {}
         self.unique_headers = []
 
+    @staticmethod
+    def load_as_json2(json_data):
+        summary = ColumnNameSummary()
+        json_data = json_data["File summary"]
+        summary.name = json_data["Name"]
+        # summary.total_events = json_data["Total events"]
+        # summary.total_files = json_data["Total files"]
+        specifics = json_data["Specifics"]
+        all_column_data = specifics["Columns"]
+        for index, column_data in enumerate(all_column_data):
+            file_list = column_data["Files"]
+            unique_header = column_data["Column names"]
+            summary.unique_headers.append(unique_header)
+            for file in file_list:
+                summary.file_dict[file] = index
+
+        return summary
+
     def update(self, name, columns):
         """ Update the summary based on columns associated with a file.
diff --git a/hed/tools/analysis/tabular_summary.py b/hed/tools/analysis/tabular_summary.py
index 76d26c69..7bb24968 100644
--- a/hed/tools/analysis/tabular_summary.py
+++ b/hed/tools/analysis/tabular_summary.py
@@ -1,6 +1,5 @@
 """ Summarize the contents of columnar files. """
-
 import json
 from hed.errors.exceptions import HedFileError
 from hed.tools.util import data_util
@@ -74,6 +73,32 @@ def extract_sidecar_template(self):
             side_dict[column_name] = annotation_util.generate_sidecar_entry(column_name, [])
         return side_dict
 
+    @staticmethod
+    def load_as_json2(json_data):
+        summary = TabularSummary()
+        json_data = json_data["File summary"]
+        summary.name = json_data["Name"]
+        summary.total_events = json_data["Total events"]
+        summary.total_files = json_data["Total files"]
+        specifics = json_data["Specifics"]
+        # todo ian: this doesn't use value column summaries or categorical counts? What?
+        summary.categorical_info = specifics["Categorical column summaries"]
+        summary.value_info = specifics["Value column summaries"]
+        summary.skip_cols = specifics["Skip columns"]
+        # summary.files = specifics["Files"]
+
+        return summary
+
+    def _sort_internal(self):
+        categorical_cols = {}
+        for key in sorted(self.categorical_info):
+            cat_dict = self.categorical_info[key]
+            val_dict = {v_key: cat_dict[v_key] for v_key in sorted(cat_dict.keys())}
+            categorical_cols[key] = val_dict
+        value_cols = {key: self.value_info[key] for key in sorted(self.value_info)}
+        self.categorical_info = categorical_cols
+        self.value_info = value_cols
+
     def get_summary(self, as_json=False):
         """ Return the summary in dictionary format.
 
         Parameters:
             as_json (bool): If False, return as a Python dictionary, otherwise convert to a JSON dictionary.
 
         """
-        sorted_keys = sorted(self.categorical_info.keys())
-        categorical_cols = {}
-        for key in sorted_keys:
-            cat_dict = self.categorical_info[key]
-            sorted_v_keys = sorted(list(cat_dict))
-            val_dict = {}
-            for v_key in sorted_v_keys:
-                val_dict[v_key] = cat_dict[v_key]
-            categorical_cols[key] = val_dict
-        sorted_cols = sorted(map(str, list(self.value_info)))
-        value_cols = {}
-        for key in sorted_cols:
-            value_cols[key] = self.value_info[key]
-        summary = {"Name": self.name, "Total events": self.total_events, "Total files": self.total_files,
-                   "Categorical columns": categorical_cols, "Value columns": value_cols,
-                   "Skip columns": self.skip_cols, "Files": self.files}
+        self._sort_internal()
+        summary = {"Name": self.name,
+                   "Total events": self.total_events,
+                   "Total files": self.total_files,
+                   "Categorical columns": self.categorical_info,
+                   "Value columns": self.value_info,
+                   "Skip columns": self.skip_cols,
+                   "Files": self.files}
+
+        # reloaded_summary = self.load_as_json(summary)
+
         if as_json:
             return json.dumps(summary, indent=4)
         else:
@@ -198,7 +218,7 @@ def _update_dataframe(self, data, name):
             else:
                 col_values = col_values.astype(str)
                 values = col_values.value_counts(ascending=True)
-            self._update_categorical(col_name, values)
+                self._update_categorical(col_name, values)
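(For context: a rough sketch of how the load_as_json2 round trip above is meant to be used. The path and file name are hypothetical; the top-level keys, "Summary name", "Summary filename", and "File summary", match what the dispatcher's load_existing_summaries reads later in this patch.)

    import json
    from hed.tools.analysis.tabular_summary import TabularSummary

    # Hypothetical individual summary file written by an earlier remodeling run.
    summary_path = ("derivatives/remodel/summaries/column_values_summary/"
                    "individual_summaries/column_values_sub-01_2024_07_31_T_18_20_14_123.json")

    with open(summary_path) as fp:
        saved = json.load(fp)

    # load_as_json2 unwraps the "File summary" entry and rebuilds the object.
    reloaded = TabularSummary.load_as_json2(saved)
    print(reloaded.name, reloaded.total_events, reloaded.total_files)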
""" - +import copy import os import json import argparse @@ -62,6 +62,8 @@ def get_parser(): help="If given, is the path to directory for saving, otherwise derivatives/remodel is used.") parser.add_argument("-x", "--exclude-dirs", nargs="*", default=[], dest="exclude_dirs", help="Directories names to exclude from search for files.") + parser.add_argument("-a", "--analysis-level", dest="analysis_level", default="none", + choices=["participant", "group", "none"]) return parser @@ -224,13 +226,17 @@ def main(arg_list=None): for task, files in task_dict.items(): dispatch = Dispatcher(operations, data_root=args.data_dir, backup_name=backup_name, hed_versions=args.hed_versions) + if args.use_bids: run_bids_ops(dispatch, args, files) else: run_direct_ops(dispatch, args, files) + if not args.no_summaries: + # Todo ian: replace dataset_summary variable dispatch.save_summaries(args.save_formats, individual_summaries=args.individual_summaries, - summary_dir=save_dir, task_name=task) + summary_dir=save_dir, task_name=task, + dataset_summary=args.analysis_level != "participant") except Exception as ex: if args.log_dir: log_name = io_util.get_alphanumeric_path(os.path.realpath(args.data_dir)) + '_' + timestamp + '.txt' @@ -239,5 +245,6 @@ def main(arg_list=None): raise + if __name__ == '__main__': main() diff --git a/hed/tools/remodeling/cli/run_summary.py b/hed/tools/remodeling/cli/run_summary.py new file mode 100644 index 00000000..7d07231f --- /dev/null +++ b/hed/tools/remodeling/cli/run_summary.py @@ -0,0 +1,194 @@ +""" Main command-line program for running the remodeling tools. """ +import copy +import os +import json +import argparse +import logging +from hed.errors.exceptions import HedFileError + +from hed.tools.bids.bids_dataset import BidsDataset +from hed.tools.remodeling.remodeler_validator import RemodelerValidator +from hed.tools.remodeling.dispatcher import Dispatcher +from hed.tools.remodeling.backup_manager import BackupManager +from hed.tools.util import io_util + + +def get_parser(): + """ Create a parser for the run_remodel command-line arguments. + + Returns: + argparse.ArgumentParser: A parser for parsing the command line arguments. + + """ + parser = argparse.ArgumentParser(description="Converts event files based on a json file specifying operations.") + parser.add_argument("data_dir", help="Full path of dataset root directory.") + parser.add_argument("model_path", help="Full path of the file with remodeling instructions.") + parser.add_argument("-bd", "--backup_dir", default="", dest="backup_dir", + help="Directory for the backup that is being created") + parser.add_argument("-bn", "--backup_name", default=BackupManager.DEFAULT_BACKUP_NAME, dest="backup_name", + help="Name of the default backup for remodeling") + parser.add_argument("-b", "--bids-format", action='store_true', dest="use_bids", + help="If present, the dataset is in BIDS format with sidecars. 
+    parser.add_argument("-e", "--extensions", nargs="*", default=['.tsv'], dest="extensions",
+                        help="File extensions to allow in locating files.")
+    parser.add_argument("-f", "--file-suffix", dest="file_suffix", default='events',
+                        help="Filename suffix excluding file type of items to be analyzed (events by default).")
+    parser.add_argument("-i", "--individual-summaries", dest="individual_summaries", default="separate",
+                        choices=["separate", "consolidated", "none"],
+                        help="Controls individual file summaries ('none', 'separate', 'consolidated')")
+    parser.add_argument("-j", "--json-sidecar", dest="json_sidecar", nargs="?",
+                        help="Optional path to JSON sidecar with HED information")
+    parser.add_argument("-ld", "--log_dir", dest="log_dir", default="",
+                        help="Directory for storing log entries for errors.")
+#     parser.add_argument("-n", "--backup-name", default=BackupManager.DEFAULT_BACKUP_NAME, dest="backup_name",
+#                         help="Name of the default backup for remodeling")
+    parser.add_argument("-nb", "--no-backup", action='store_true', dest="no_backup",
+                        help="If present, the operations are run directly on the files with no backup.")
+    parser.add_argument("-ns", "--no-summaries", action='store_true', dest="no_summaries",
+                        help="If present, the summaries are not saved, but rather discarded.")
+    parser.add_argument("-nu", "--no-update", action='store_true', dest="no_update",
+                        help="If present, the files are not saved, but rather discarded.")
+    parser.add_argument("-r", "--hed-versions", dest="hed_versions", nargs="*", default=[],
+                        help="Optional list of HED schema versions used for annotation, include prefixes.")
+    parser.add_argument("-s", "--save-formats", nargs="*", default=['.json', '.txt'], dest="save_formats",
+                        help="Format for saving any summaries. If no summaries are to be written, " +
+                             "use the -ns option.")
+    parser.add_argument("-t", "--task-names", dest="task_names", nargs="*", default=[],
+                        help="The names of the tasks. If an empty list is given, all tasks are lumped together." +
+                             " If * is given, then tasks are found and reported individually.")
+    parser.add_argument("-v", "--verbose", action='store_true',
+                        help="If present, output informative messages as computation progresses.")
+    parser.add_argument("-w", "--work-dir", default="", dest="work_dir",
+                        help="If given, is the path to directory for saving, otherwise derivatives/remodel is used.")
+    parser.add_argument("-x", "--exclude-dirs", nargs="*", default=[], dest="exclude_dirs",
+                        help="Directory names to exclude from search for files.")
+    parser.add_argument("-a", "--analysis-level", dest="analysis_level", default="group",
+                        choices=["group"])
+    return parser
+
+
+def handle_backup(args):
+    """ Restore the backup if applicable.
+
+    Parameters:
+        args (obj): Parsed arguments as an object.
+
+    Returns:
+        str or None: Backup name if there was a backup done.
+
+    """
+    if args.no_backup:
+        backup_name = None
+    else:
+        backup_man = BackupManager(args.data_dir)
+        if not backup_man.get_backup(args.backup_name):
+            raise HedFileError("BackupDoesNotExist", f"Backup {args.backup_name} does not exist. "
+                               f"Please run run_remodel_backup first", "")
+        backup_man.restore_backup(args.backup_name, args.task_names, verbose=args.verbose)
+        backup_name = args.backup_name
+    return backup_name
+
+
+def parse_arguments(arg_list=None):
+    """ Parse the command line arguments or arg_list if given.
+
+    Parameters:
+        arg_list (list): List of command line arguments as a list.
+
+    Returns:
+        Object: Argument object.
+        List: A list of parsed operations (each operation is a dictionary).
+
+    :raises ValueError:
+        - If the operations were unable to be correctly parsed.
+
+    """
+    parser = get_parser()
+    args = parser.parse_args(arg_list)
+    if '*' in args.file_suffix:
+        args.file_suffix = None
+    if '*' in args.extensions:
+        args.extensions = None
+    args.data_dir = os.path.realpath(args.data_dir)
+    args.exclude_dirs = args.exclude_dirs + ['remodel']
+    args.model_path = os.path.realpath(args.model_path)
+    if args.verbose:
+        print(f"Data directory: {args.data_dir}\nModel path: {args.model_path}")
+    with open(args.model_path, 'r') as fp:
+        operations = json.load(fp)
+    validator = RemodelerValidator()
+    errors = validator.validate(operations)
+    if errors:
+        raise ValueError("UnableToFullyParseOperations",
+                         f"Fatal operation error, cannot continue:\n{errors}")
+    return args, operations
+
+
+def parse_tasks(files, task_args):
+    """ Parse the tasks argument to get a task list.
+
+    Parameters:
+        files (list): List of full paths of files.
+        task_args (str or list): The argument values for the task parameter.
+
+    Returns:
+        dict: Dictionary mapping task names to lists of file paths.
+
+    """
+    if not task_args:
+        return {"": files}
+    task_dict = io_util.get_task_dict(files)
+    if task_args == "*" or (isinstance(task_args, list) and task_args[0] == "*"):
+        return task_dict
+    task_dict = {key: task_dict[key] for key in task_args if key in task_dict}
+    return task_dict
+
+
+def main(arg_list=None):
+    """ The command-line program.
+
+    Parameters:
+        arg_list (list or None): Called with value None when called from the command line.
+            Otherwise, called with the command-line parameters as an argument list.
+
+    :raises HedFileError:
+        - if the data root directory does not exist.
+        - if the specified backup does not exist.
+
+    """
+    args, operations = parse_arguments(arg_list)
+
+    if args.log_dir:
+        os.makedirs(args.log_dir, exist_ok=True)
+    timestamp = io_util.get_timestamp()
+    try:
+        if not os.path.isdir(args.data_dir):
+            raise HedFileError("DataDirectoryDoesNotExist",
+                               f"The root data directory {args.data_dir} does not exist", "")
+        backup_name = handle_backup(args)
+        save_dir = None
+        if args.work_dir:
+            save_dir = os.path.realpath(os.path.join(args.work_dir, Dispatcher.REMODELING_SUMMARY_PATH))
+        files = io_util.get_file_list(args.data_dir, name_suffix=args.file_suffix, extensions=args.extensions,
+                                      exclude_dirs=args.exclude_dirs)
+        task_dict = parse_tasks(files, args.task_names)
+        for task, files in task_dict.items():
+            dispatch = Dispatcher(operations, data_root=args.data_dir, backup_name=backup_name,
+                                  hed_versions=args.hed_versions)
+
+            # next task: add a makeshift "analysis level" parameter.
+            # participant = generate individual summaries; group = reload individual summaries on load
+            # Need a way to determine WHICH run to reload options from
+
+            dispatch.load_existing_summaries(save_dir)
+
+            if not args.no_summaries:
+                dispatch.save_summaries(args.save_formats, individual_summaries=args.individual_summaries,
+                                        summary_dir=save_dir, task_name=task)
+    except Exception as ex:
+        if args.log_dir:
+            log_name = io_util.get_alphanumeric_path(os.path.realpath(args.data_dir)) + '_' + timestamp + '.txt'
+            logging.basicConfig(filename=os.path.join(args.log_dir, log_name), level=logging.ERROR)
+            logging.exception(f"{args.data_dir}: {args.model_path}")
+        raise
+
+
+
+if __name__ == '__main__':
+    main()
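A sketch of the intended two-pass flow, assuming a dataset root and a remodel file with summarize operations at hypothetical paths. The participant pass saves only the individual summaries (run_remodel passes dataset_summary=False when -a participant is given); run_summary then reloads them and writes the merged group summaries.

    from hed.tools.remodeling.cli import run_remodel, run_summary

    data_dir = "/data/ds_example"                    # hypothetical dataset root
    model_path = "/data/models/summarize_rmdl.json"  # hypothetical remodel file

    # Pass 1 (participant level): run the operations, keep individual summaries only.
    run_remodel.main([data_dir, model_path, "-nb", "-nu", "-a", "participant"])

    # Pass 2 (group level): reload the individual summaries, write the merged ones.
    run_summary.main([data_dir, model_path, "-nb"])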
diff --git a/hed/tools/remodeling/dispatcher.py b/hed/tools/remodeling/dispatcher.py
index 04e6d809..dd844532 100644
--- a/hed/tools/remodeling/dispatcher.py
+++ b/hed/tools/remodeling/dispatcher.py
@@ -4,6 +4,8 @@
 import numpy as np
 import pandas as pd
 import json
+import re
+from collections import defaultdict
 from hed.errors.exceptions import HedFileError
 from hed.schema.hed_schema_io import load_schema_version
 from hed.schema.hed_schema import HedSchema
@@ -154,8 +156,75 @@ def run_operations(self, file_path, sidecar=None, verbose=False):
             df = self.post_proc_data(df)
         return df
 
+    def load_existing_summaries(self, summary_dir):
+        if not summary_dir:
+            summary_dir = self.get_summary_save_dir()
+
+        for operation in self.parsed_ops:
+            if hasattr(operation, "get_summary_class"):
+                # todo ian: How do we carefully get the name if the names aren't unique?
+                #  Are there names in the files?
+                name = str(operation.summary_name)
+                filename = operation.summary_filename
+
+                final_dir = os.path.join(summary_dir, name, "individual_summaries")
+                if not os.path.exists(final_dir):
+                    continue
+
+                base_files = os.listdir(final_dir)
+                # Need to filter out names here -> break name into parts (is there a function for this?)
+                # possible_files = [os.path.join(final_dir, file) for file in base_files]
+                possible_files = defaultdict(list)
+                # todo ian: not robust.  Also put in a file
+                for file in base_files:
+                    if not file.endswith(".json"):
+                        continue
+
+                    final_filename = os.path.join(final_dir, file)
+                    with open(final_filename) as fp:
+                        json_data = json.load(fp)
+                    loaded_name = json_data["Summary name"]
+                    loaded_filename = json_data["Summary filename"]
+                    if loaded_name != name or loaded_filename != filename:
+                        continue
+
+                    # todo ian: this is all lazily done so far
+                    trimmed_file = file[len(filename) + 1:]
+                    trimmed_file, _ = os.path.splitext(trimmed_file)
+
+                    # Look for a timestamp ending a file.
+                    timecode_pattern = r'(.*)(\d{4}_\d{2}_\d{2}_T_\d{2}_\d{2}_\d{2}_\d{3})$'
+                    timecode_match = re.match(timecode_pattern, trimmed_file)
+                    suffix = ""
+                    if timecode_match:
+                        # Drop the trailing separator so rpartition finds the run number.
+                        trimmed_file = timecode_match.group(1).rstrip("_")
+                        suffix = timecode_match.group(2)
+                    trimmed_file, _, run_number = trimmed_file.rpartition("_")
+                    final_suffix = "_".join([run_number, suffix])
+                    possible_files[final_suffix].append(final_filename)
+                    # todo ian: need some way to identify a run here
+
+                if not possible_files:
+                    # todo: decide what error, if any, belongs here
+                    continue
+
+                actual_files = possible_files[list(possible_files)[0]]
+
+                # How do we know which one to load here?
+                # Do we do all of them?
+                summary = operation.get_summary_class()(operation)
+
+                # todo ian: this is probably an issue, don't some of these require other params?
+                sub_summary = summary.get_sub_summary_class()
+                summary.summary_dict = {}
+                for file in actual_files:
+                    with open(file) as fp:
+                        summary.summary_dict[file] = sub_summary.load_as_json2(json.load(fp))
+                self.summary_dicts[operation.summary_name] = summary
+
+
     def save_summaries(self, save_formats=['.json', '.txt'], individual_summaries="separate",
-                       summary_dir=None, task_name=""):
+                       summary_dir=None, task_name="", dataset_summary=True):
         """ Save the summary files in the specified formats.
 
         Parameters:
@@ -179,7 +248,8 @@ def save_summaries(self, save_formats=['.json', '.txt'], individual_summaries="s
             summary_dir = self.get_summary_save_dir()
         os.makedirs(summary_dir, exist_ok=True)
         for summary_name, summary_item in self.summary_dicts.items():
-            summary_item.save(summary_dir, save_formats, individual_summaries=individual_summaries, task_name=task_name)
+            summary_item.save(summary_dir, save_formats, individual_summaries=individual_summaries, task_name=task_name,
+                              dataset_summary=dataset_summary)
 
     @staticmethod
     def parse_operations(operation_list):
diff --git a/hed/tools/remodeling/operations/base_summary.py b/hed/tools/remodeling/operations/base_summary.py
index dfafd05a..0296a0e8 100644
--- a/hed/tools/remodeling/operations/base_summary.py
+++ b/hed/tools/remodeling/operations/base_summary.py
@@ -143,7 +143,8 @@ def get_text_summary(self, individual_summaries="separate"):
 
         return summary
 
-    def save(self, save_dir, file_formats=['.txt'], individual_summaries="separate", task_name=""):
+    def save(self, save_dir, file_formats=('.txt',), individual_summaries="separate", task_name="",
+             dataset_summary=True):
         """ Save the summaries using the format indicated.
 
         Parameters:
@@ -160,12 +161,13 @@ def save(self, save_dir, file_formats=['.txt'], individual_summaries="separate",
                 summary = self.get_summary(individual_summaries=individual_summaries)
             else:
                 continue
-            self._save_summary_files(save_dir, file_format, summary, individual_summaries, task_name=task_name)
+            self._save_summary_files(save_dir, file_format, summary, individual_summaries, task_name=task_name,
+                                     dataset_summary=dataset_summary)
 
         self.save_visualizations(save_dir, file_formats=file_formats, individual_summaries=individual_summaries,
                                  task_name=task_name)
 
-    def save_visualizations(self, save_dir, file_formats=['.svg'], individual_summaries="separate", task_name=""):
+    def save_visualizations(self, save_dir, file_formats=('.svg',), individual_summaries="separate", task_name=""):
         """ Save summary visualizations, if any, using the format indicated.
 
         Parameters:
@@ -177,7 +179,8 @@ def save_visualizations(self, save_dir, file_formats=['.svg'], individual_summar
         """
         pass
 
-    def _save_summary_files(self, save_dir, file_format, summary, individual_summaries, task_name=''):
+    def _save_summary_files(self, save_dir, file_format, summary, individual_summaries, task_name='',
+                            dataset_summary=True):
         """ Save the files in the appropriate format.
 
         Parameters:
@@ -200,12 +203,15 @@ def _save_summary_files(self, save_dir, file_format, summary, individual_summari
                                              self.op.summary_filename + task_name + time_stamp + file_format))
         individual = summary.get("Individual files", {})
         if individual_summaries == "none" or not individual:
-            self.dump_summary(filename, summary["Dataset"])
+            if dataset_summary:
+                self.dump_summary(filename, summary["Dataset"])
             return
         if individual_summaries == "consolidated":
             self.dump_summary(filename, summary)
             return
-        self.dump_summary(filename, summary["Dataset"])
+        # todo ian: this is very clunky, replace variable dataset_summary
+        if dataset_summary:
+            self.dump_summary(filename, summary["Dataset"])
         individual_dir = os.path.join(this_save, self.INDIVIDUAL_SUMMARIES_PATH + '/')
         os.makedirs(os.path.realpath(individual_dir), exist_ok=True)
         for name, sum_str in individual.items():
diff --git a/hed/tools/remodeling/operations/summarize_column_names_op.py b/hed/tools/remodeling/operations/summarize_column_names_op.py
index 8dbdb15b..54af74a2 100644
--- a/hed/tools/remodeling/operations/summarize_column_names_op.py
+++ b/hed/tools/remodeling/operations/summarize_column_names_op.py
@@ -57,6 +57,10 @@ def __init__(self, parameters):
         self.summary_filename = parameters['summary_filename']
         self.append_timecode = parameters.get('append_timecode', False)
 
+    @staticmethod
+    def get_summary_class():
+        return ColumnNamesSummary
+
     def do_op(self, dispatcher, df, name, sidecar=None):
         """ Create a column name summary for df.
 
@@ -74,10 +78,7 @@ def do_op(self, dispatcher, df, name, sidecar=None):
 
         """
         df_new = df.copy()
-        summary = dispatcher.summary_dicts.get(self.summary_name, None)
-        if not summary:
-            summary = ColumnNamesSummary(self)
-            dispatcher.summary_dicts[self.summary_name] = summary
+        summary = dispatcher.summary_dicts.setdefault(self.summary_name, self.get_summary_class()(self))
         summary.update_summary(
             {"name": name, "column_names": list(df_new.columns)})
         return df_new
@@ -99,6 +100,10 @@ def __init__(self, sum_op):
 
         """
         super().__init__(sum_op)
 
+    @staticmethod
+    def get_sub_summary_class():
+        return ColumnNameSummary
+
     def update_summary(self, new_info):
         """ Update the summary for a given tabular input file.
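The reload path relies on the simple pairing that this file and the next one implement: each summarize operation exposes get_summary_class(), and the summary exposes get_sub_summary_class() for the per-file object it aggregates. A minimal sketch of that contract follows, under the assumptions that the hypothetical operation list below is valid and that backup_name=None skips the backup check:

    from hed.tools.remodeling.dispatcher import Dispatcher

    operations = [{"operation": "summarize_column_names",
                   "description": "Summarize the column names.",
                   "parameters": {"summary_name": "column_names_summary",
                                  "summary_filename": "column_names_summary"}}]

    dispatch = Dispatcher(operations, data_root="/data/ds_example", backup_name=None)
    op = dispatch.parsed_ops[0]

    summary = op.get_summary_class()(op)       # a ColumnNamesSummary instance
    sub_cls = summary.get_sub_summary_class()  # ColumnNameSummary, reloadable via load_as_json2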
diff --git a/hed/tools/remodeling/operations/summarize_column_values_op.py b/hed/tools/remodeling/operations/summarize_column_values_op.py
index 612e4b5e..c76eb06d 100644
--- a/hed/tools/remodeling/operations/summarize_column_values_op.py
+++ b/hed/tools/remodeling/operations/summarize_column_values_op.py
@@ -77,6 +77,10 @@ class SummarizeColumnValuesOp(BaseOp):
     VALUES_PER_LINE = 5
     MAX_CATEGORICAL = 50
 
+    @staticmethod
+    def get_summary_class():
+        return ColumnValueSummary
+
     def __init__(self, parameters):
         """ Constructor for the summarize column values operation.
 
@@ -110,14 +114,11 @@ def do_op(self, dispatcher, df, name, sidecar=None):
 
             Updates the relevant summary.
 
         """
-
+        # todo ian: does this actually help us? Maybe? I was trying to
+        # standardize how these do_op functions create the summary.
         df_new = df.copy()
-        summary = dispatcher.summary_dicts.get(self.summary_name, None)
-        if not summary:
-            summary = ColumnValueSummary(self)
-            dispatcher.summary_dicts[self.summary_name] = summary
-        summary.update_summary(
-            {'df': dispatcher.post_proc_data(df_new), 'name': name})
+        summary = dispatcher.summary_dicts.setdefault(self.summary_name, self.get_summary_class()(self))
+        summary.update_summary({'df': dispatcher.post_proc_data(df_new), 'name': name})
         return df_new
 
     @staticmethod
@@ -138,6 +139,10 @@ def __init__(self, sum_op):
 
         """
         super().__init__(sum_op)
 
+    @staticmethod
+    def get_sub_summary_class():
+        return TabularSummary
+
     def update_summary(self, new_info):
         """ Update the summary for a given tabular input file.
 
@@ -151,9 +156,8 @@ def update_summary(self, new_info):
 
         """
         name = new_info['name']
         if name not in self.summary_dict:
-            self.summary_dict[name] = \
-                TabularSummary(value_cols=self.op.value_columns,
-                               skip_cols=self.op.skip_columns, name=name)
+            self.summary_dict[name] = TabularSummary(value_cols=self.op.value_columns, skip_cols=self.op.skip_columns,
+                                                     name=name)
         self.summary_dict[name].update(new_info['df'])
 
     def get_details_dict(self, summary):
@@ -167,17 +171,13 @@ def get_details_dict(self, summary):
 
         """
         this_summary = summary.get_summary(as_json=False)
-        unique_counts = [(key, len(count_dict)) for key,
-                         count_dict in this_summary['Categorical columns'].items()]
+        unique_counts = [(key, len(count_dict)) for key, count_dict in this_summary['Categorical columns'].items()]
         this_summary['Categorical counts'] = dict(unique_counts)
         for key, dict_entry in this_summary['Categorical columns'].items():
-            num_disp, sorted_tuples = ColumnValueSummary.sort_dict(
-                dict_entry, reverse=True)
-            this_summary['Categorical columns'][key] = dict(
-                sorted_tuples[:min(num_disp, self.op.max_categorical)])
+            num_disp, sorted_tuples = ColumnValueSummary.sort_dict(dict_entry, reverse=True)
+            this_summary['Categorical columns'][key] = dict(sorted_tuples[:min(num_disp, self.op.max_categorical)])
         return {"Name": this_summary['Name'], "Total events": this_summary["Total events"],
-                "Total files": this_summary['Total files'],
-                "Files": list(this_summary['Files'].keys()),
+                "Total files": this_summary['Total files'], "Files": list(this_summary['Files'].keys()),
                 "Specifics": {"Value columns": list(this_summary['Value columns']),
                               "Skip columns": this_summary['Skip columns'],
                               "Value column summaries": this_summary['Value columns'],
@@ -191,8 +191,7 @@ def merge_all_info(self):
 
             TabularSummary - the summary object for column values.
""" - all_sum = TabularSummary( - value_cols=self.op.value_columns, skip_cols=self.op.skip_columns, name='Dataset') + all_sum = TabularSummary(value_cols=self.op.value_columns, skip_cols=self.op.skip_columns, name='Dataset') for counts in self.summary_dict.values(): all_sum.update_summary(counts) return all_sum @@ -238,13 +237,10 @@ def _get_categorical_string(self, result, offset="", indent=" "): if not cat_dict: return "" count_dict = result['Categorical counts'] - sum_list = [ - f"{offset}{indent}Categorical column values[Events, Files]:"] + sum_list = [f"{offset}{indent}Categorical column values[Events, Files]:"] sorted_tuples = sorted(cat_dict.items(), key=lambda x: x[0]) for entry in sorted_tuples: - sum_list = sum_list + \ - self._get_categorical_col( - entry, count_dict, offset="", indent=" ") + sum_list = sum_list + self._get_categorical_col(entry, count_dict, offset="", indent=" ") return "\n".join(sum_list) def _get_detail_list(self, result, indent=BaseSummary.DISPLAY_INDENT): @@ -260,14 +256,12 @@ def _get_detail_list(self, result, indent=BaseSummary.DISPLAY_INDENT): """ sum_list = [] specifics = result["Specifics"] - cat_string = self._get_categorical_string( - specifics, offset="", indent=indent) + cat_string = self._get_categorical_string(specifics, offset="", indent=indent) if cat_string: sum_list.append(cat_string) val_dict = specifics.get("Value column summaries", {}) if val_dict: - sum_list.append(ColumnValueSummary._get_value_string( - val_dict, offset="", indent=indent)) + sum_list.append(ColumnValueSummary._get_value_string(val_dict, offset="", indent=indent)) return sum_list def _get_categorical_col(self, entry, count_dict, offset="", indent=" "): @@ -289,8 +283,7 @@ def _get_categorical_col(self, entry, count_dict, offset="", indent=" "): # Create and partition the list of individual entries value_list = [f"{item[0]}{str(item[1])}" for item in entry[1].items()] value_list = value_list[:num_disp] - part_list = ColumnValueSummary.partition_list( - value_list, self.op.values_per_line) + part_list = ColumnValueSummary.partition_list(value_list, self.op.values_per_line) return col_list + [f"{offset}{indent * 3}{ColumnValueSummary.get_list_str(item)}" for item in part_list] @staticmethod @@ -321,11 +314,10 @@ def partition_list(lst, n): def _get_value_string(val_dict, offset="", indent=""): sum_list = [f"{offset}{indent}Value columns[Events, Files]:"] for col_name, val_counts in val_dict.items(): - sum_list.append(f"{offset}{indent*2}{col_name}{str(val_counts)}") + sum_list.append(f"{offset}{indent * 2}{col_name}{str(val_counts)}") return "\n".join(sum_list) @staticmethod def sort_dict(count_dict, reverse=False): - sorted_tuples = sorted( - count_dict.items(), key=lambda x: x[1][0], reverse=reverse) + sorted_tuples = sorted(count_dict.items(), key=lambda x: x[1][0], reverse=reverse) return len(sorted_tuples), sorted_tuples diff --git a/hed/tools/remodeling/operations/summarize_hed_tags_op.py b/hed/tools/remodeling/operations/summarize_hed_tags_op.py index 89b678ee..e6b875e4 100644 --- a/hed/tools/remodeling/operations/summarize_hed_tags_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_tags_op.py @@ -235,6 +235,10 @@ def __init__(self, sum_op): super().__init__(sum_op) self.sum_op = sum_op + # @staticmethod + # def get_sub_summary_class(): + # return ColumnNameSummary + def update_summary(self, new_info): """ Update the summary for a given tabular input file. 
@@ -247,6 +251,7 @@ def update_summary(self, new_info):
         """
         counts = HedTagCounts(
             new_info['name'], total_events=len(new_info['df']))
+        # todo ian: This should maybe instead call counts.update or counts.update_summary
         input_data = TabularInput(
             new_info['df'], sidecar=new_info['sidecar'], name=new_info['name'])
         tag_man = HedTagManager(EventManager(input_data, new_info['schema']),
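As a worked example of the timestamp and run-number grouping in load_existing_summaries above (the stem is hypothetical, and the rstrip("_") mirrors the fix applied in that method):

    import re

    timecode_pattern = r'(.*)(\d{4}_\d{2}_\d{2}_T_\d{2}_\d{2}_\d{2}_\d{3})$'

    # Hypothetical stem after the summary filename prefix and ".json" are stripped.
    stem = "task-stop_run-1_2024_07_31_T_18_20_14_123"
    match = re.match(timecode_pattern, stem)
    base = match.group(1).rstrip("_")   # "task-stop_run-1"
    suffix = match.group(2)             # "2024_07_31_T_18_20_14_123"
    base, _, run_number = base.rpartition("_")
    print(base, run_number, suffix)     # task-stop run-1 2024_07_31_T_18_20_14_123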