Add not even half baked start on changing dispatcher for boutiques style command line and summaries
IanCa committed Jul 31, 2024
1 parent bc007ba commit e73b7a7
Showing 10 changed files with 383 additions and 67 deletions.
1 change: 1 addition & 0 deletions hed/tools/__init__.py
@@ -52,3 +52,4 @@
from .remodeling.cli import run_remodel
from .remodeling.cli import run_remodel_backup
from .remodeling.cli import run_remodel_restore
from .remodeling.cli import run_summary
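
With this export, the new summary runner is importable like the other CLI entry points; a minimal sketch (the dataset and model paths are hypothetical):

from hed.tools import run_summary  # exported here alongside run_remodel and the backup/restore runners

run_summary.main(arg_list=["/data/ds_root", "/data/models/summarize_rmdl.json", "-nb"])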
20 changes: 19 additions & 1 deletion hed/tools/analysis/column_name_summary.py
@@ -1,5 +1,5 @@
""" Summarize the unique column names in a dataset. """

import copy
import json


@@ -11,6 +11,24 @@ def __init__(self, name=''):
self.file_dict = {}
self.unique_headers = []

@staticmethod
def load_as_json2(json_data):
summary = ColumnNameSummary()
json_data = json_data["File summary"]
summary.name = json_data["Name"]
# summary.total_events = json_data["Total events"]
# summary.total_files = json_data["Total files"]
specifics = json_data["Specifics"]
all_column_data = specifics["Columns"]
for index, column_data in enumerate(all_column_data):
file_list = column_data["Files"]
unique_header = column_data["Column names"]
summary.unique_headers.append(unique_header)
for file in file_list:
summary.file_dict[file] = index

return summary

def update(self, name, columns):
""" Update the summary based on columns associated with a file.
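For orientation, a minimal sketch of the JSON nesting that load_as_json2 reads ("File summary" → "Specifics" → "Columns"); the file and column names below are hypothetical illustrations, not output from a real dataset:

from hed.tools.analysis.column_name_summary import ColumnNameSummary

# Hypothetical saved summary with two distinct header patterns.
json_data = {
    "File summary": {
        "Name": "column_names",
        "Specifics": {
            "Columns": [
                {"Column names": ["onset", "duration", "trial_type"],
                 "Files": ["sub-01_task-stop_events.tsv"]},
                {"Column names": ["onset", "duration", "response"],
                 "Files": ["sub-02_task-stop_events.tsv"]}
            ]
        }
    }
}

summary = ColumnNameSummary.load_as_json2(json_data)
print(summary.unique_headers)  # the two header patterns, in order
print(summary.file_dict)       # each file mapped to the index of its pattern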
56 changes: 38 additions & 18 deletions hed/tools/analysis/tabular_summary.py
@@ -1,6 +1,5 @@
""" Summarize the contents of columnar files. """


import json
from hed.errors.exceptions import HedFileError
from hed.tools.util import data_util
@@ -74,29 +73,50 @@ def extract_sidecar_template(self):
side_dict[column_name] = annotation_util.generate_sidecar_entry(column_name, [])
return side_dict

@staticmethod
def load_as_json2(json_data):
summary = TabularSummary()
json_data = json_data["File summary"]
summary.name = json_data["Name"]
summary.total_events = json_data["Total events"]
summary.total_files = json_data["Total files"]
specifics = json_data["Specifics"]
# todo ian: this doesn't use value column summaries or categorical counts? What
summary.categorical_info = specifics["Categorical column summaries"]
summary.value_info = specifics["Value column summaries"]
summary.skip_cols = specifics["Skip columns"]
# summary.files = specifics["Files"]

return summary

def _sort_internal(self):
categorical_cols = {}
for key in sorted(self.categorical_info):
cat_dict = self.categorical_info[key]
val_dict = {v_key: cat_dict[v_key] for v_key in sorted(cat_dict.keys())}
categorical_cols[key] = val_dict
value_cols = {key: self.value_info[key] for key in sorted(self.value_info)}
self.categorical_info = categorical_cols
self.value_info = value_cols

def get_summary(self, as_json=False):
""" Return the summary in dictionary format.
Parameters:
as_json (bool): If False, return as a Python dictionary, otherwise convert to a JSON dictionary.
"""
sorted_keys = sorted(self.categorical_info.keys())
categorical_cols = {}
for key in sorted_keys:
cat_dict = self.categorical_info[key]
sorted_v_keys = sorted(list(cat_dict))
val_dict = {}
for v_key in sorted_v_keys:
val_dict[v_key] = cat_dict[v_key]
categorical_cols[key] = val_dict
sorted_cols = sorted(map(str, list(self.value_info)))
value_cols = {}
for key in sorted_cols:
value_cols[key] = self.value_info[key]
summary = {"Name": self.name, "Total events": self.total_events, "Total files": self.total_files,
"Categorical columns": categorical_cols, "Value columns": value_cols,
"Skip columns": self.skip_cols, "Files": self.files}
self._sort_internal()
summary = {"Name": self.name,
"Total events": self.total_events,
"Total files": self.total_files,
"Categorical columns": self.categorical_info,
"Value columns": self.value_info,
"Skip columns": self.skip_cols,
"Files": self.files}

# reloaded_summary = self.load_as_json(summary)

if as_json:
return json.dumps(summary, indent=4)
else:
@@ -198,7 +218,7 @@ def _update_dataframe(self, data, name):
else:
col_values = col_values.astype(str)
values = col_values.value_counts(ascending=True)
self._update_categorical(col_name, values)
self._update_categorical(col_name, values)

def _update_dict_categorical(self, col_dict):
""" Update this summary with the categorical information in the dictionary from another summary.
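By the same pattern, a hedged sketch of the wrapper that TabularSummary.load_as_json2 expects; only the keys the method reads are shown, and the counts and column entries are hypothetical:

from hed.tools.analysis.tabular_summary import TabularSummary

# Hypothetical saved summary matching the keys read by load_as_json2.
json_data = {
    "File summary": {
        "Name": "tabular_summary",
        "Total events": 200,
        "Total files": 2,
        "Specifics": {
            "Categorical column summaries": {"trial_type": {"go": [120, 2], "stop": [80, 2]}},
            "Value column summaries": {"response_time": [200, 2]},
            "Skip columns": ["onset", "duration"]
        }
    }
}

summary = TabularSummary.load_as_json2(json_data)
print(summary.total_events)               # 200
print(summary.get_summary(as_json=True))  # keys sorted via _sort_internal on the way out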
11 changes: 9 additions & 2 deletions hed/tools/remodeling/cli/run_remodel.py
@@ -1,5 +1,5 @@
""" Main command-line program for running the remodeling tools. """

import copy
import os
import json
import argparse
@@ -62,6 +62,8 @@ def get_parser():
help="If given, is the path to directory for saving, otherwise derivatives/remodel is used.")
parser.add_argument("-x", "--exclude-dirs", nargs="*", default=[], dest="exclude_dirs",
help="Directories names to exclude from search for files.")
parser.add_argument("-a", "--analysis-level", dest="analysis_level", default="none",
choices=["participant", "group", "none"])
return parser


@@ -224,13 +226,17 @@ def main(arg_list=None):
for task, files in task_dict.items():
dispatch = Dispatcher(operations, data_root=args.data_dir, backup_name=backup_name,
hed_versions=args.hed_versions)

if args.use_bids:
run_bids_ops(dispatch, args, files)
else:
run_direct_ops(dispatch, args, files)

if not args.no_summaries:
# Todo ian: replace dataset_summary variable
dispatch.save_summaries(args.save_formats, individual_summaries=args.individual_summaries,
summary_dir=save_dir, task_name=task)
summary_dir=save_dir, task_name=task,
dataset_summary=args.analysis_level != "participant")
except Exception as ex:
if args.log_dir:
log_name = io_util.get_alphanumeric_path(os.path.realpath(args.data_dir)) + '_' + timestamp + '.txt'
@@ -239,5 +245,6 @@
raise



if __name__ == '__main__':
main()
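
The effect of the new flag: any value other than "participant" keeps the dataset-level summary (dataset_summary=True). A hedged invocation sketch with hypothetical paths, assuming the matching dataset_summary parameter was added to Dispatcher.save_summaries elsewhere in this commit:

from hed.tools.remodeling.cli.run_remodel import main

# Hypothetical dataset and model paths; "-a participant" suppresses the
# dataset-level summary, while "group" or "none" (the default) keeps it.
main(arg_list=["/data/ds_root", "/data/summarize_columns_rmdl.json",
               "-a", "participant", "-nb", "-x", "derivatives", "stimuli"])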
194 changes: 194 additions & 0 deletions hed/tools/remodeling/cli/run_summary.py
@@ -0,0 +1,194 @@
""" Main command-line program for running the remodeling tools. """
import copy
import os
import json
import argparse
import logging
from hed.errors.exceptions import HedFileError

from hed.tools.bids.bids_dataset import BidsDataset
from hed.tools.remodeling.remodeler_validator import RemodelerValidator
from hed.tools.remodeling.dispatcher import Dispatcher
from hed.tools.remodeling.backup_manager import BackupManager
from hed.tools.util import io_util


def get_parser():
""" Create a parser for the run_remodel command-line arguments.
Returns:
argparse.ArgumentParser: A parser for parsing the command line arguments.
"""
parser = argparse.ArgumentParser(description="Converts event files based on a json file specifying operations.")
parser.add_argument("data_dir", help="Full path of dataset root directory.")
parser.add_argument("model_path", help="Full path of the file with remodeling instructions.")
parser.add_argument("-bd", "--backup_dir", default="", dest="backup_dir",
help="Directory for the backup that is being created")
parser.add_argument("-bn", "--backup_name", default=BackupManager.DEFAULT_BACKUP_NAME, dest="backup_name",
help="Name of the default backup for remodeling")
parser.add_argument("-b", "--bids-format", action='store_true', dest="use_bids",
help="If present, the dataset is in BIDS format with sidecars. HED analysis is available.")
parser.add_argument("-e", "--extensions", nargs="*", default=['.tsv'], dest="extensions",
help="File extensions to allow in locating files.")
parser.add_argument("-f", "--file-suffix", dest="file_suffix", default='events',
help="Filename suffix excluding file type of items to be analyzed (events by default).")
parser.add_argument("-i", "--individual-summaries", dest="individual_summaries", default="separate",
choices=["separate", "consolidated", "none"],
help="Controls individual file summaries ('none', 'separate', 'consolidated')")
parser.add_argument("-j", "--json-sidecar", dest="json_sidecar", nargs="?",
help="Optional path to JSON sidecar with HED information")
parser.add_argument("-ld", "--log_dir", dest="log_dir", default="",
help="Directory for storing log entries for errors.")
# parser.add_argument("-n", "--backup-name", default=BackupManager.DEFAULT_BACKUP_NAME, dest="backup_name",
# help="Name of the default backup for remodeling")
parser.add_argument("-nb", "--no-backup", action='store_true', dest="no_backup",
help="If present, the operations are run directly on the files with no backup.")
parser.add_argument("-ns", "--no-summaries", action='store_true', dest="no_summaries",
help="If present, the summaries are not saved, but rather discarded.")
parser.add_argument("-nu", "--no-update", action='store_true', dest="no_update",
help="If present, the files are not saved, but rather discarded.")
parser.add_argument("-r", "--hed-versions", dest="hed_versions", nargs="*", default=[],
help="Optional list of HED schema versions used for annotation, include prefixes.")
parser.add_argument("-s", "--save-formats", nargs="*", default=['.json', '.txt'], dest="save_formats",
help="Format for saving any summaries, if any. If no summaries are to be written," +
"use the -ns option.")
parser.add_argument("-t", "--task-names", dest="task_names", nargs="*", default=[],
help="The names of the task. If an empty list is given, all tasks are lumped together." +
" If * is given, then tasks are found and reported individually.")
parser.add_argument("-v", "--verbose", action='store_true',
help="If present, output informative messages as computation progresses.")
parser.add_argument("-w", "--work-dir", default="", dest="work_dir",
help="If given, is the path to directory for saving, otherwise derivatives/remodel is used.")
parser.add_argument("-x", "--exclude-dirs", nargs="*", default=[], dest="exclude_dirs",
help="Directories names to exclude from search for files.")
parser.add_argument("-a", "--analysis-level", dest="analysis_level", default="group",
choices=["group"])
return parser


def handle_backup(args):
""" Restore the backup if applicable.
Parameters:
args (obj): Parsed arguments as an object.
Returns:
str or None: Backup name if there was a backup done.
"""
if args.no_backup:
backup_name = None
else:
backup_man = BackupManager(args.data_dir)
if not backup_man.get_backup(args.backup_name):
raise HedFileError("BackupDoesNotExist", f"Backup {args.backup_name} does not exist. "
f"Please run_remodel_backup first", "")
backup_man.restore_backup(args.backup_name, args.task_names, verbose=args.verbose)
backup_name = args.backup_name
return backup_name


def parse_arguments(arg_list=None):
""" Parse the command line arguments or arg_list if given.
Parameters:
arg_list (list): List of command line arguments as a list.
Returns:
Object: Argument object.
List: A list of parsed operations (each operation is a dictionary).
:raises ValueError:
- If the operations were unable to be correctly parsed.
"""
parser = get_parser()
args = parser.parse_args(arg_list)
if '*' in args.file_suffix:
args.file_suffix = None
if '*' in args.extensions:
args.extensions = None
args.data_dir = os.path.realpath(args.data_dir)
args.exclude_dirs = args.exclude_dirs + ['remodel']
args.model_path = os.path.realpath(args.model_path)
if args.verbose:
print(f"Data directory: {args.data_dir}\nModel path: {args.model_path}")
with open(args.model_path, 'r') as fp:
operations = json.load(fp)
validator = RemodelerValidator()
errors = validator.validate(operations)
if errors:
raise ValueError("UnableToFullyParseOperations",
f"Fatal operation error, cannot continue:\n{errors}")
return args, operations


def parse_tasks(files, task_args):
""" Parse the tasks argument to get a task list.
Parameters:
files (list): List of full paths of files.
task_args (str or list): The argument values for the task parameter.
"""
if not task_args:
return {"": files}
task_dict = io_util.get_task_dict(files)
if task_args == "*" or isinstance(task_args, list) and task_args[0] == "*":
return task_dict
task_dict = {key: task_dict[key] for key in task_args if key in task_dict}
return task_dict


def main(arg_list=None):
""" The command-line program.
Parameters:
arg_list (list or None): Called with value None when called from the command line.
Otherwise, called with the command-line parameters as an argument list.
:raises HedFileError:
- if the data root directory does not exist.
- if the specified backup does not exist.
"""
args, operations = parse_arguments(arg_list)

if args.log_dir:
os.makedirs(args.log_dir, exist_ok=True)
timestamp = io_util.get_timestamp()
try:
if not os.path.isdir(args.data_dir):
raise HedFileError("DataDirectoryDoesNotExist",
f"The root data directory {args.data_dir} does not exist", "")
backup_name = handle_backup(args)
save_dir = None
if args.work_dir:
save_dir = os.path.realpath(os.path.join(args.work_dir, Dispatcher.REMODELING_SUMMARY_PATH))
files = io_util.get_file_list(args.data_dir, name_suffix=args.file_suffix, extensions=args.extensions,
exclude_dirs=args.exclude_dirs)
task_dict = parse_tasks(files, args.task_names)
for task, files in task_dict.items():
dispatch = Dispatcher(operations, data_root=args.data_dir, backup_name=backup_name,
hed_versions=args.hed_versions)

            # next task: add a makeshift "analysis level" parameter. participant = generate individual, group = reload individual on load
# Need a way to determine WHICH run to reload options from

dispatch.load_existing_summaries(save_dir)

if not args.no_summaries:
dispatch.save_summaries(args.save_formats, individual_summaries=args.individual_summaries,
summary_dir=save_dir, task_name=task)
except Exception as ex:
if args.log_dir:
log_name = io_util.get_alphanumeric_path(os.path.realpath(args.data_dir)) + '_' + timestamp + '.txt'
logging.basicConfig(filename=os.path.join(args.log_dir, log_name), level=logging.ERROR)
logging.exception(f"{args.data_dir}: {args.model_path}")
raise



if __name__ == '__main__':
main()
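
Taken together, a hedged usage sketch: parse_tasks groups files according to the task argument, and main then reloads previously saved summaries for each task. The file names are hypothetical, and io_util.get_task_dict is assumed to key files by their task- entity:

from hed.tools.remodeling.cli.run_summary import main, parse_tasks

files = ["/ds/sub-01/sub-01_task-stop_events.tsv",
         "/ds/sub-01/sub-01_task-go_events.tsv"]
parse_tasks(files, [])        # {"": [...]} - all files lumped together
parse_tasks(files, ["*"])     # {"stop": [...], "go": [...]} - split by task
parse_tasks(files, ["stop"])  # {"stop": [...]} - only the requested task

# Hypothetical paths; "-w" names the work directory whose saved
# summaries load_existing_summaries reloads before saving again.
main(arg_list=["/ds", "/models/summarize_rmdl.json", "-nb", "-w", "/ds/derivatives"])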
