From 730310e82870e5f438aa4a2e35db82e71af6c0af Mon Sep 17 00:00:00 2001 From: Jon Clucas Date: Thu, 9 Nov 2023 17:46:28 -0500 Subject: [PATCH 01/28] :recycle: Make correlations more antifragile So some failing correlations don't cause the whole thing to fail --- calculate_correlations.py | 399 +++++++++++++++++++++++--------------- 1 file changed, 248 insertions(+), 151 deletions(-) diff --git a/calculate_correlations.py b/calculate_correlations.py index 062d0e0..bd95a4b 100644 --- a/calculate_correlations.py +++ b/calculate_correlations.py @@ -1,53 +1,66 @@ #!/usr/bin/env python - +import argparse +from collections.abc import Generator +import itertools +import math +from multiprocessing import Pool +import os +from pathlib import Path +import pickle +import subprocess from typing import Optional, NamedTuple, Tuple, Union -from utils.html_script import body +import yaml -import os +import nibabel as nb import numpy as np import pandas as pd -from multiprocessing import Pool +from utils.html_script import body Axis = Union[int, Tuple[int, ...]] + class CorrValue(NamedTuple): + """Correlation values""" concor: np.ndarray pearson: np.ndarray + def read_yml_file(yml_filepath): - import yaml with open(yml_filepath,"r") as f: yml_dict = yaml.safe_load(f) return yml_dict + def write_yml_file(yml_dict, out_filepath): - import yaml with open(out_filepath, "wt") as f: yaml.safe_dump(yml_dict, f) + def read_pickle(pickle_file): - import pickle with open(pickle_file, "rb") as f: dct = pickle.load(f) return dct + def write_pickle(dct, out_filepath): - import pickle with open(out_filepath, "wb") as f: pickle.dump(dct, f, protocol=pickle.HIGHEST_PROTOCOL) + def read_txt_file(txt_file): with open(txt_file,"r") as f: strings = f.read().splitlines() return strings + def write_txt_file(text_lines, out_filepath): with open(out_filepath, "wt") as f: for line in text_lines: f.write("{0}\n".format(line)) + def write_dct(dct=None, text_lines=None, outname=None): if not dct: dct = {outname: text_lines} @@ -55,12 +68,13 @@ def write_dct(dct=None, text_lines=None, outname=None): dct.update({outname: text_lines}) return dct -def gather_local_filepaths(output_folder_path): - import os + +def gather_local_filepaths(output_folder_path: str) -> list[str]: + """Given a local path, return relevant paths within that directory""" filepaths = [] print("Gathering file paths from {0}\n".format(output_folder_path)) - for root, dirs, files in os.walk(output_folder_path): + for root, _dirs, files in os.walk(output_folder_path): # loops through every file in the directory for filename in files: # checks if the file is a nifti (.nii.gz) @@ -69,11 +83,21 @@ def gather_local_filepaths(output_folder_path): filepaths.append(os.path.join(root, filename)) if len(filepaths) == 0: - err = "\n\n[!] No filepaths were found given the output folder!\n\n" - raise Exception(err) + raise FileNotFoundError( + "\n\n[!] No filepaths were found given the output folder!\n\n") return filepaths + +class SummaryStats: + def __init__(self, array: np.ndarray, + axis: Optional[Union[int, str]] = None) -> None: + self.mean = np.mean(array, axis=axis, keepdims=True) + self.var = np.var(array, axis=axis, keepdims=True) + self.std = np.sqrt(self.var) + self.norm = (array - self.mean) / self.std + + def batch_correlate( x: np.ndarray, y: np.ndarray, axis: Optional[Axis] = None ) -> CorrValue: @@ -84,54 +108,83 @@ def batch_correlate( References: https://en.wikipedia.org/wiki/Concordance_correlation_coefficient """ - # Summary stats for x - x_mean = np.mean(x, axis=axis, keepdims=True) - x_var = np.var(x, axis=axis, keepdims=True) - x_std = np.sqrt(x_var) - # NOTE: Not trying to fix NaNs - x_norm = (x - x_mean) / x_std - - # Summary stats for y - y_mean = np.mean(y, axis=axis, keepdims=True) - y_var = np.var(y, axis=axis, keepdims=True) - y_std = np.sqrt(y_var) - y_norm = (y - y_mean) / y_std + # summary stats + try: + summary_stats = {'x': SummaryStats(x), 'y': SummaryStats(y)} + except ZeroDivisionError: + return CorrValue(np.nan, np.nan) # Correlation coefficients - pearson = np.mean(x_norm * y_norm, axis=axis, keepdims=True) - concor = 2 * pearson * x_std * y_std / (x_var + y_var + (x_mean - y_mean) ** 2) - + pearson = np.mean(summary_stats['x'].norm * summary_stats['y'].norm, + axis=axis, keepdims=True) + concor = (2 * pearson * summary_stats['x'].std * summary_stats['y'].std / + (summary_stats['x'].var + summary_stats['y'].var + + (summary_stats['x'].mean - summary_stats['y'].mean) ** 2)) # Squeeze reduced singleton dimensions if axis is not None: concor = np.squeeze(concor, axis=axis) pearson = np.squeeze(pearson, axis=axis) return CorrValue(concor, pearson) -def correlate_text_based(txt1, txt2): + +def determine_indices(df : pd.DataFrame) -> list: + """Determine indices of str-type columns in a DataFrame""" + return [i for i, val in + enumerate(df.applymap(lambda _: isinstance(_, str)).values[0]) if + val] + + +def correlate_text_based(txts: Union[list, tuple]) -> Generator: + delimiters = tuple(delimiter_from_filepath(path) for path in txts) # TODO: why do we drop columns containing na? - oned_one = pd.read_csv(txt1, delimiter=None, comment="#").dropna(axis=1).values - oned_two = pd.read_csv(txt2, delimiter=None, comment="#").dropna(axis=1).values - - concor, pearson = batch_correlate(oned_one, oned_two, axis=0) - concor = np.nanmean(concor) - pearson = np.nanmean(pearson) - return concor, pearson - -def create_unique_file_dict(filepaths, output_folder_path, replacements=None): - - # filepaths: - # list of output filepaths from a CPAC output directory - # output_folder_path: - # the CPAC output directory the filepaths are from - # replacements: - # (optional) a list of strings to be removed from the filepaths should - # they occur - - # output - # files_dict - # a dictionary of dictionaries, format: - # files_dict["centrality"] = - # {("centrality", midpath, nums): , ..} + initial_load = [pd.read_csv(txt, delimiter=delimiters[i], comment='#' + ).dropna(axis=1) for i, txt in enumerate(txts)] + for i, df in enumerate(initial_load): + # if we read a value-row as a header, fix that + try: + df.columns.astype(float) + initial_load[i] = pd.read_csv(txts[i], delimiter=delimiters[i], + comment='#', header=None + ).dropna(axis=1) + except ValueError: + pass + # assume string columns are indices and not values to correlate + indices = [] + for i in range(len(initial_load)): + indices.append(np.where(df.apply( + lambda _: _.dtype == np.dtypes.ObjectDType))[0]) + oned = [] + for i, index in enumerate(indices): + if index.shape[0]: + oned.append(pd.read_csv(txts[i], delimiter=delimiters[i], + comment='#', index_col=indices[i] + ).dropna(axis=1).values) + else: + oned.append(initial_load[i].values) + return (np.nanmean(measure) for measure in batch_correlate(*oned, axis=0)) + + +def create_unique_file_dict(filepaths: list[str], output_folder_path: str, + replacements: Optional[list[str]] = None + ) -> dict[str, dict[tuple, str]]: + """ + Parameters + ---------- + filepaths : list of str + list of output filepaths from a CPAC output directory + output_folder_path : str + the CPAC output directory the filepaths are from + replacements : list of str, optional + a list of strings to be removed from the filepaths should + they occur + + Returns + ------- + files_dict : dict + a dictionary of dictionaries, format: + files_dict["centrality"] = + {("centrality", midpath, nums): , ..} + """ files_dict = {} @@ -140,25 +193,25 @@ def create_unique_file_dict(filepaths, output_folder_path, replacements=None): if "_stack" in filepath: continue - if ("itk" in filepath) or ("xfm" in filepath) or ("montage" in filepath): + if ("itk" in filepath) or ("xfm" in filepath) or ( + "montage" in filepath + ): continue path_changes = [] real_filepath = filepath if replacements: for word_couple in replacements: if "," not in word_couple: - err = "\n\n[!] In the replacements text file, the old " \ - "substring and its replacement must be separated " \ - "by a comma.\n\n" - raise Exception(err) - word = word_couple.split(",")[0] - new = word_couple.split(",")[1] + raise SyntaxError( + "\n\n[!] In the replacements text file, the old " + "substring and its replacement must be separated " + "by a comma.\n\n") + word, new = word_couple.split(",") if word in filepath: - path_changes.append("old: {0}".format(filepath)) + path_changes.append(f"old: {filepath}") filepath = filepath.replace(word, new) - path_changes.append("new: {0}".format(filepath)) + path_changes.append(f"new: {filepath}") if path_changes: - import os with open(os.path.join(os.getcwd(), "path_changes.txt"), "wt") as f: for path in path_changes: f.write(path) @@ -231,11 +284,15 @@ def create_unique_file_dict(filepaths, output_folder_path, replacements=None): return files_dict -def gather_all_files(input_dct, pickle_dir, source='output_dir'): - - file_dct_list = [] +def gather_all_files(input_dct: dict, pickle_dir: str, + source: str = 'output_dir') -> tuple[dict, dict]: + """ + Given an input dictionary, a pickle directory, and (optionally) a source, + returns a pair of dicts + """ + file_dct_list = [{}, {}] - for key, pipe_dct in input_dct['pipelines'].items(): + for index, (key, pipe_dct) in enumerate(input_dct['pipelines'].items()): pipe_outdir = pipe_dct[source] @@ -256,37 +313,42 @@ def gather_all_files(input_dct, pickle_dir, source='output_dir'): # "and not a specific participant's output subdirectory either.)\n" # raise Exception(err) - output_pkl = os.path.join(pickle_dir, "{0}_{1}_paths.p".format(key, source)) + output_pkl = os.path.join(pickle_dir, f"{key}_{source}_paths.p") if os.path.exists(output_pkl): - print("Found output list pickle for {0}, skipping output file" \ - "path parsing..".format(key)) + print(f"Found output list pickle for {key}, skipping output file" + "path parsing..") pipeline_files_dct = read_pickle(output_pkl) else: pipeline_files_list = gather_local_filepaths(pipe_outdir) - - pipeline_files_dct = create_unique_file_dict(pipeline_files_list, - pipe_outdir, - pipe_dct['replacements']) - + pipeline_files_dct = create_unique_file_dict( + pipeline_files_list, pipe_outdir, pipe_dct['replacements']) write_pickle(pipeline_files_dct, output_pkl) - file_dct_list.append(pipeline_files_dct) + file_dct_list[index] = pipeline_files_dct - return (file_dct_list[0], file_dct_list[1]) + return tuple(file_dct_list) -def match_filepaths(old_files_dict, new_files_dict): - """Returns a dictionary mapping each filepath from the first CPAC run to the - second one, matched to derivative, strategy, and scan. - old_files_dict: each key is a derivative name, and each value is another - dictionary keying (derivative, mid-path, last digit in path) - tuples to a list containing the full filepath described by - the tuple that is the key - new_files_dict: same as above, but for the second CPAC run +def match_filepaths(old_files_dict: dict[str, dict[tuple, str]], + new_files_dict: dict[str, dict[tuple, str]] + ) -> dict[str, dict[tuple, ]]: + """Returns a dictionary mapping each filepath from the first C-PAC + run to the second one, matched to derivative, strategy, and scan. - matched_path_dict: same as the input dictionaries, except the list in the - sub-dictionary value has both file paths that are matched + Parameters + ---------- + old_files_dict, new_files_dict : dict + each key is a derivative name, and each value is another + dictionary keying (derivative, mid-path, last digit in path) + tuples to a list containing the full filepath described by + the tuple that is the key + + Returns + ------- + matched_path_dict : dict + same as the input dictionaries, except the list in the + sub-dictionary value has both file paths that are matched """ # file path matching @@ -330,16 +392,28 @@ def match_filepaths(old_files_dict, new_files_dict): return matched_files_dct -def calculate_correlation(args_tuple): - import os - import subprocess - import nibabel as nb - import numpy as np - import scipy.stats.mstats - import scipy.stats - import math - +def delimiter_from_filepath(filepath: Union[Path, str]) -> Optional[str]: + """ + Given a filepath, return expected value-separator delimiter + """ + if filepath.endswith('.tsv'): + return '\t' + if filepath.endswith('.csv'): + return ',' + with open(filepath, 'r', encoding='utf8') as _f: + first_line = '#' + while first_line.lstrip().startswith('#'): + first_line = _f.readline() + for delimiter in ['\t', ',', ' ']: + if delimiter in first_line: + if delimiter == ' ': + return r'\s+' + return delimiter + return None + + +def calculate_correlation(args_tuple): category = args_tuple[0] old_path = args_tuple[1] new_path = args_tuple[2] @@ -400,46 +474,48 @@ def calculate_correlation(args_tuple): if os.path.exists(old_path) and os.path.exists(new_path): if ('.csv' in old_path and '.csv' in new_path) or \ - ('spatial_map_timeseries.txt' in old_path and 'spatial_map_timeseries.txt' in new_path) or \ + ('.txt' in old_path and '.txt' in new_path) or \ ('.1D' in old_path and '.1D' in new_path) or \ ('.tsv' in old_path and '.tsv' in new_path): try: - concor, pearson = correlate_text_based(old_path, new_path) + concor, pearson = correlate_text_based((old_path, new_path)) + except Exception as e: + return category, e, (old_path, new_path) - if concor > 0.980: - corr_tuple = (category, [concor], [pearson]) - else: - corr_tuple = (category, [concor], [pearson], (old_path, new_path)) - if verbose: - print("Success - {0}".format(str(concor))) + if concor > 0.980: + corr_tuple = (category, [concor], [pearson]) + else: + corr_tuple = (category, [concor], [pearson], (old_path, new_path)) + if verbose: + print("Success - {0}".format(str(concor))) - except Exception as e: - corr_tuple = ("file reading problem: {0}".format(e), - old_path, new_path) - if verbose: - print(str(corr_tuple)) + # except Exception as e: + # corr_tuple = ("file reading problem: {0}".format(e), + # old_path, new_path) + # if verbose: + # print(str(corr_tuple)) return corr_tuple else: - try: - old_file_img = nb.load(old_path) - old_file_hdr = old_file_img.header - new_file_img = nb.load(new_path) - new_file_hdr = new_file_img.header + # try: + old_file_img = nb.load(old_path) + old_file_hdr = old_file_img.header + new_file_img = nb.load(new_path) + new_file_hdr = new_file_img.header - old_file_dims = old_file_hdr.get_zooms() - new_file_dims = new_file_hdr.get_zooms() + old_file_dims = old_file_hdr.get_zooms() + new_file_dims = new_file_hdr.get_zooms() - data_1 = nb.load(old_path).get_fdata() - data_2 = nb.load(new_path).get_fdata() + data_1 = nb.load(old_path).get_fdata() + data_2 = nb.load(new_path).get_fdata() - except Exception as e: - corr_tuple = ("file reading problem: {0}".format(e), - old_path, new_path) - if verbose: - print(str(corr_tuple)) - return corr_tuple + # except Exception as e: + # corr_tuple = ("file reading problem: {0}".format(e), + # old_path, new_path) + # if verbose: + # print(str(corr_tuple)) + # return corr_tuple ## set up and run the Pearson correlation and concordance correlation if data_1.flatten().shape == data_2.flatten().shape: @@ -485,6 +561,7 @@ def calculate_correlation(args_tuple): return corr_tuple + def run_correlations(matched_dct, input_dct, source='output_dir', quick=False, verbose=False): all_corr_dct = { @@ -540,9 +617,15 @@ def run_correlations(matched_dct, input_dct, source='output_dir', quick=False, v print("\nCorrelations of the {0} are done.\n".format(source)) + failures = [] + for corr_tuple in corr_tuple_list: if not corr_tuple: continue + if isinstance(corr_tuple[1], Exception): + failures.append((corr_tuple[0], corr_tuple[1], + ' | '.join(corr_tuple[2]))) + continue if corr_tuple[0] not in all_corr_dct['concordance'].keys(): all_corr_dct['concordance'][corr_tuple[0]] = [] if corr_tuple[0] not in all_corr_dct['pearson'].keys(): @@ -561,7 +644,8 @@ def run_correlations(matched_dct, input_dct, source='output_dir', quick=False, v except TypeError: pass - return all_corr_dct + return all_corr_dct, failures + def post180_organize_correlations(concor_dct, corr_type="concordance", quick=False): @@ -694,11 +778,14 @@ def organize_correlations(concor_dict, corr_type="concordance", quick=False): return corr_map_dict -def quick_summary(dct, corr_map_dct, output_dir): + +def quick_summary(dct, corr_map_dct, output_dir) -> dict: for corr_group in corr_map_dct["correlations"].keys(): cat_dct = {} lines = [] - for output_type, corr_vec in dict(corr_map_dct["correlations"][corr_group]).items(): + for output_type, corr_vec in dict( + corr_map_dct["correlations"][corr_group] + ).items(): try: corrmean = np.mean(np.asarray(corr_vec)) except TypeError: @@ -708,39 +795,51 @@ def quick_summary(dct, corr_map_dct, output_dir): dct = write_dct(dct, lines, output_type) return(dct) -def compare_pipelines(input_dct, dir_type='output_dir'): +def compare_pipelines(input_dct: dict, + dir_type: str = 'output_dir') -> tuple[dict, dict]: + """ + Given an input dict containing keys 'settings', gather prreviously + generated pickles or all relevant output and working files + + Returns + ------- + corr_map : dict + + pearson_map : dict + """ output_dir = input_dct['settings']['output_dir'] pickle_dir = input_dct['settings']['pickle_dir'] - corrs_pkl = os.path.join(pickle_dir, "{0}_correlations.p".format(dir_type)) - matched_pkl = os.path.join(pickle_dir, "{0}_matched_files.p".format(dir_type)) - + corrs_pkl = os.path.join(pickle_dir, f"{dir_type}_correlations.p") + failures_pkl = os.path.join(pickle_dir, f"{dir_type}_failures.p") + matched_pkl = os.path.join(pickle_dir, f"{dir_type}_matched_files.p") + all_corr_dct = None if os.path.exists(corrs_pkl): - print("\n\nFound the correlations pickle: {0}\n\n" - "Starting from there..\n".format(corrs_pkl)) + print(f"\n\nFound the correlations pickle: {corrs_pkl}\n\n" + "Starting from there..\n") all_corr_dct = read_pickle(corrs_pkl) elif os.path.exists(matched_pkl): - print("\n\nFound the matched filepaths pickle: {0}\n\n" - "Starting from there..\n".format(matched_pkl)) + print(f"\n\nFound the matched filepaths pickle: {matched_pkl}\n\n" + "Starting from there..\n") matched_dct = read_pickle(matched_pkl) else: # gather all relevant output and working files outfiles1_dct, outfiles2_dct = gather_all_files(input_dct, pickle_dir, source=dir_type) - matched_dct = match_filepaths(outfiles1_dct, outfiles2_dct) write_pickle(matched_dct, matched_pkl) if not all_corr_dct: - all_corr_dct = run_correlations(matched_dct, + all_corr_dct, failures = run_correlations(matched_dct, input_dct, source=dir_type, quick=input_dct['settings']['quick'], verbose=input_dct['settings']['verbose']) write_pickle(all_corr_dct, corrs_pkl) + write_pickle(failures, failures_pkl) if dir_type == 'work_dir': sorted_vals = [] @@ -788,14 +887,13 @@ def compare_pipelines(input_dct, dir_type='output_dir'): # pearson_map_dict["pipeline_names"], output_dir) return(corr_map, pearson_map) -def main(): - - import os - import argparse - - from multiprocessing import Pool - import itertools +def main() -> tuple: + """ + • Parse commandline arguments + • Read input YAML + • Check for already completed stuff (pickles) + """ parser = argparse.ArgumentParser() parser.add_argument("input_yaml", type=str, help="file path of the script's input YAML") @@ -811,8 +909,8 @@ def main(): input_dct = read_yml_file(args.input_yaml) # check for already completed stuff (pickles) - output_dir = os.path.join(os.getcwd(), - "correlations_{0}".format(input_dct['settings']['run_name'])) + output_dir = os.path.join( + os.getcwd(), f"correlations_{input_dct['settings']['run_name']}") pickle_dir = os.path.join(output_dir, "pickles") if not os.path.exists(pickle_dir): @@ -821,11 +919,11 @@ def main(): except: err = "\n\n[!] Could not create the output directory for the " \ "correlations. Do you have write permissions?\nAttempted " \ - "output directory: {0}\n\n".format(output_dir) + f"output directory: {output_dir}\n\n" raise Exception(err) - input_dct['settings'].update({'output_dir': output_dir}) - input_dct['settings'].update({'pickle_dir': pickle_dir}) + input_dct['settings'].update({'output_dir': output_dir, + 'pickle_dir': pickle_dir}) corr_map, pearson_map = compare_pipelines(input_dct, dir_type='output_dir') corr_map_keys = list(corr_map.keys()) @@ -840,6 +938,5 @@ def main(): if __name__ == "__main__": all_keys, data_source, branch = main() html_body = body(all_keys, data_source) - file = open(f"{data_source}_{branch}.json","w") - file.write(html_body) - file.close() \ No newline at end of file + with open(f"{data_source}_{branch}.json", "w") as file: + file.write(html_body) From 56b616c71b5b0cc1e0fec13bd5f66f785095286a Mon Sep 17 00:00:00 2001 From: Jon Clucas Date: Fri, 10 Nov 2023 20:03:54 -0500 Subject: [PATCH 02/28] :construction: :heavy_minus_sign: :heavy_plus_sign: Begin script to transition from FusionCharts to D3 Ref https://d3-graph-gallery.com/graph/heatmap_style.html Co-authored-by: Holtz Yan --- templates/heatmap.js | 126 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 templates/heatmap.js diff --git a/templates/heatmap.js b/templates/heatmap.js new file mode 100644 index 0000000..0d4ca2d --- /dev/null +++ b/templates/heatmap.js @@ -0,0 +1,126 @@ +// set the dimensions and margins of the graph +var margin = {top: 80, right: 25, bottom: 30, left: 40}, + width = 800 - margin.left - margin.right, + height = 5000 - margin.top - margin.bottom; + +// append the svg object to the body of the page +var svg = d3.select("#heatmap-container") + .html(null) + .append("svg") + .attr("width", width + margin.left + margin.right) + .attr("height", height + margin.top + margin.bottom) + .append("g") + .attr("transform", + "translate(" + margin.left + "," + margin.top + ")"); + +//Read the data +datasource = d3.json("DATAFILE"); +datasource.then(function(data) { + + data.sort(function(a, b) { return d3.descending(a.rowid, b.rowid) }); + // Labels of row and columns -> unique identifier of the column called 'group' and 'variable' + var groupedData = d3.group(data, d => d.columnid); + var myGroups = Array.from(groupedData.keys()); + var myVars = Array.from(d3.group(data, d => d.rowid).keys()); + + // Build X scales and axis: + var x = d3.scaleBand() + .domain(myGroups) + .range([0, width]) + .padding(0.05); + + svg.append("g") + .style("font-size", 15) + .attr("transform", "translate(0,0)") + .call(d3.axisTop(x).tickSize(0)) + .select(".domain").remove(); + + // Build Y scales and axis: + var y = d3.scaleBand() + .domain(myVars) + .range([height, 0]) + .padding(0.05); + + svg.append("g") + .style("font-size", 15) + .attr("transform", "translate(" + width + ",0)") + .call(d3.axisLeft(y).tickSize(0)) + .select(".domain").remove(); + + // Build color scale + var myColor = d3.scaleSequential() + .interpolator(d3.interpolateRdYlGn) + .domain([0.8, 1]); + + // Create a tooltip + var tooltip = d3.select("#my_dataviz") + .append("div") + .style("opacity", 0) + .attr("class", "tooltip") + .style("background-color", "white") + .style("border", "solid") + .style("border-width", "2px") + .style("border-radius", "5px") + .style("padding", "5px"); + + // Three functions that change the tooltip when user hovers / moves / leaves a cell + var mouseover = function(d) { + tooltip + .style("opacity", 1); + d3.select(this) + .style("stroke", "black") + .style("opacity", 1); + }; + + var mousemove = function(d) { + tooltip + .html(d.rowid + ": " + d.value) + .style("left", (d3.pointer(this)[0] + 70) + "px") + .style("top", (d3.pointer(this)[1]) + "px"); + }; + + var mouseleave = function(d) { + tooltip + .style("opacity", 0); + d3.select(this) + .style("stroke", "none") + .style("opacity", 0.8); + }; + + // Add the squares + svg.selectAll() + .data(data, function(d) {return d.columnid + ':' + d.variable;}) + .enter() + .append("rect") + .attr("x", function(d) { return x(d.columnid) + (x.bandwidth() / 2); }) + .attr("y", function(d) { return y(d.rowid); }) + .attr("rx", 4) + .attr("ry", 4) + .attr("width", y.bandwidth()) + .attr("height", y.bandwidth()) + .style("fill", function(d) { return myColor(d.value); }) + .style("stroke-width", 0) + .style("stroke", "none") + .style("opacity", 0.8) + .on("mouseover", mouseover) + .on("mousemove", mousemove) + .on("mouseleave", mouseleave); +}); + +// Add title to graph +svg.append("text") + .attr("x", 0) + .attr("y", -50) + .attr("text-anchor", "left") + .style("font-size", "22px") + .text("GRAPHTITLE"); + +// Add subtitle to graph +svg.append("text") + .attr("x", 0) + .attr("y", -20) + .attr("text-anchor", "left") + .style("font-size", "14px") + .style("fill", "grey") + .style("max-width", 400) + .text("GRAPHSUBTITLE"); \ No newline at end of file From 5b3f06f6c900675822e883991ef84c8863c289c6 Mon Sep 17 00:00:00 2001 From: Jon Clucas Date: Fri, 10 Nov 2023 20:07:00 -0500 Subject: [PATCH 03/28] :construction: :heavy_minus_sign: :heavy_plus_sign: Work toward moving from FusionCharts to D3 --- build_d3_dashboard.py | 37 +++++++++++++++++++++++++++++++++++++ templates/heatmap.html | 9 +++++++++ 2 files changed, 46 insertions(+) create mode 100644 build_d3_dashboard.py create mode 100644 templates/heatmap.html diff --git a/build_d3_dashboard.py b/build_d3_dashboard.py new file mode 100644 index 0000000..1ea05b9 --- /dev/null +++ b/build_d3_dashboard.py @@ -0,0 +1,37 @@ +import os +from shutil import copy +import click +from lxml import etree + + +@click.command() +@click.option('--json_file', required=True, help='JSON file from correlations') +@click.option('--branch', required=True, help='branch name') + + +def main(json_file=None, branch=None): + outdir = f'output/{branch}' + os.makedirs(outdir, exist_ok=True) + json_filename = os.path.basename(json_file) + copy(json_file, '/'.join([outdir, json_filename])) + name = json_filename.replace(f"_{branch}.json", '') + with open('templates/heatmap.html', 'r', encoding='utf-8') as _f: + body = etree.HTML(_f.read()) + script_element = etree.SubElement(body[0], 'script') + script_element.set('defer', 'defer') + script_element.set('src', f'./heatmap.js') + with open('templates/heatmap.js', 'r', encoding='utf-8') as _f: + with open(f'{outdir}/heatmap.js', 'w', encoding='utf=8') as _s: + _s.write(_f.read().replace( + 'DATAFILE', json_filename).replace( + 'GRAPHTITLE', branch).replace('GRAPHSUBTITLE', name)) + body = etree.tostring(body, encoding='unicode', method='html') + + with open(f'{outdir}/{name}.html', 'w', encoding='utf-8') as _f: + _f.write(body) + + return body, name, branch + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/templates/heatmap.html b/templates/heatmap.html new file mode 100644 index 0000000..0cd1893 --- /dev/null +++ b/templates/heatmap.html @@ -0,0 +1,9 @@ + + + Correlations heatmap + + + +
Correlations heatmap will load here!
+ + \ No newline at end of file From 699e5b42cc3a6d7b94927aecedcc48649b1aea72 Mon Sep 17 00:00:00 2001 From: Florian Rupprecht Date: Tue, 21 Feb 2023 16:23:56 -0500 Subject: [PATCH 04/28] Add pre-commit hooks for isort and black --- .pre-commit-config.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..0df1a99 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,11 @@ +repos: + - repo: https://github.com/pycqa/isort + rev: 5.11.5 + hooks: + - id: isort + files: "\\.(py)$" + - repo: https://github.com/psf/black + rev: 23.1.0 + hooks: + - id: black + files: "\\.(py)$" \ No newline at end of file From db1a98ed1c39d87bc00678076625aa879d6eebcc Mon Sep 17 00:00:00 2001 From: Jon Clucas Date: Mon, 13 Nov 2023 13:09:14 -0500 Subject: [PATCH 05/28] :rotating_light: Initital run of precommit hooks (isort & black) --- build_d3_dashboard.py | 42 ++-- build_dashboard.py | 29 ++- calculate_correlations.py | 498 +++++++++++++++++++++----------------- create_yml.py | 49 ++-- utils/html_script.py | 23 +- utils/parse_yaml.py | 64 ++--- 6 files changed, 394 insertions(+), 311 deletions(-) diff --git a/build_d3_dashboard.py b/build_d3_dashboard.py index 1ea05b9..72d1f24 100644 --- a/build_d3_dashboard.py +++ b/build_d3_dashboard.py @@ -1,37 +1,39 @@ import os from shutil import copy + import click from lxml import etree @click.command() -@click.option('--json_file', required=True, help='JSON file from correlations') -@click.option('--branch', required=True, help='branch name') - - +@click.option("--json_file", required=True, help="JSON file from correlations") +@click.option("--branch", required=True, help="branch name") def main(json_file=None, branch=None): - outdir = f'output/{branch}' + outdir = f"output/{branch}" os.makedirs(outdir, exist_ok=True) json_filename = os.path.basename(json_file) - copy(json_file, '/'.join([outdir, json_filename])) - name = json_filename.replace(f"_{branch}.json", '') - with open('templates/heatmap.html', 'r', encoding='utf-8') as _f: + copy(json_file, "/".join([outdir, json_filename])) + name = json_filename.replace(f"_{branch}.json", "") + with open("templates/heatmap.html", "r", encoding="utf-8") as _f: body = etree.HTML(_f.read()) - script_element = etree.SubElement(body[0], 'script') - script_element.set('defer', 'defer') - script_element.set('src', f'./heatmap.js') - with open('templates/heatmap.js', 'r', encoding='utf-8') as _f: - with open(f'{outdir}/heatmap.js', 'w', encoding='utf=8') as _s: - _s.write(_f.read().replace( - 'DATAFILE', json_filename).replace( - 'GRAPHTITLE', branch).replace('GRAPHSUBTITLE', name)) - body = etree.tostring(body, encoding='unicode', method='html') - - with open(f'{outdir}/{name}.html', 'w', encoding='utf-8') as _f: + script_element = etree.SubElement(body[0], "script") + script_element.set("defer", "defer") + script_element.set("src", f"./heatmap.js") + with open("templates/heatmap.js", "r", encoding="utf-8") as _f: + with open(f"{outdir}/heatmap.js", "w", encoding="utf=8") as _s: + _s.write( + _f.read() + .replace("DATAFILE", json_filename) + .replace("GRAPHTITLE", branch) + .replace("GRAPHSUBTITLE", name) + ) + body = etree.tostring(body, encoding="unicode", method="html") + + with open(f"{outdir}/{name}.html", "w", encoding="utf-8") as _f: _f.write(body) return body, name, branch if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/build_dashboard.py b/build_dashboard.py index 82ee895..5880235 100644 --- a/build_dashboard.py +++ b/build_dashboard.py @@ -1,24 +1,30 @@ -from utils.html_script import write_html, setup_browser - import os + import click +from utils.html_script import setup_browser, write_html + + def process_option(ctx, param, value): if value is not None: - values = value.split(',') + values = value.split(",") return [val.strip() for val in values] -@click.command() -@click.option('--json_files', required=True, - callback=process_option, help='JSON files from correlations') -@click.option('--branch', required=True, help='branch name') +@click.command() +@click.option( + "--json_files", + required=True, + callback=process_option, + help="JSON files from correlations", +) +@click.option("--branch", required=True, help="branch name") def main(json_files=None, branch=None): - body = '' + body = "" data_source = [] for json in json_files: name = os.path.basename(json) - data = name.replace(f"_{branch}.json", '') + data = name.replace(f"_{branch}.json", "") data_source.append(data) with open(json) as user_file: file_contents = user_file.read() @@ -26,12 +32,13 @@ def main(json_files=None, branch=None): body = (body.rstrip()).rstrip(",") html_body = write_html(body) - file = open('html.html', 'w') + file = open("html.html", "w") file.write(html_body) file.close() setup_browser(html_body) return body, data_source, branch + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/calculate_correlations.py b/calculate_correlations.py index bd95a4b..7edbdfc 100644 --- a/calculate_correlations.py +++ b/calculate_correlations.py @@ -1,19 +1,19 @@ #!/usr/bin/env python import argparse -from collections.abc import Generator import itertools import math -from multiprocessing import Pool import os -from pathlib import Path import pickle import subprocess -from typing import Optional, NamedTuple, Tuple, Union -import yaml +from collections.abc import Generator +from multiprocessing import Pool +from pathlib import Path +from typing import NamedTuple, Optional, Tuple, Union import nibabel as nb import numpy as np import pandas as pd +import yaml from utils.html_script import body @@ -22,12 +22,13 @@ class CorrValue(NamedTuple): """Correlation values""" + concor: np.ndarray pearson: np.ndarray def read_yml_file(yml_filepath): - with open(yml_filepath,"r") as f: + with open(yml_filepath, "r") as f: yml_dict = yaml.safe_load(f) return yml_dict @@ -50,7 +51,7 @@ def write_pickle(dct, out_filepath): def read_txt_file(txt_file): - with open(txt_file,"r") as f: + with open(txt_file, "r") as f: strings = f.read().splitlines() return strings @@ -78,20 +79,27 @@ def gather_local_filepaths(output_folder_path: str) -> list[str]: # loops through every file in the directory for filename in files: # checks if the file is a nifti (.nii.gz) - if '.nii' in filename or '.csv' in filename or '.txt' in filename \ - or '.1D' in filename or '.tsv' in filename: + if ( + ".nii" in filename + or ".csv" in filename + or ".txt" in filename + or ".1D" in filename + or ".tsv" in filename + ): filepaths.append(os.path.join(root, filename)) if len(filepaths) == 0: raise FileNotFoundError( - "\n\n[!] No filepaths were found given the output folder!\n\n") + "\n\n[!] No filepaths were found given the output folder!\n\n" + ) return filepaths class SummaryStats: - def __init__(self, array: np.ndarray, - axis: Optional[Union[int, str]] = None) -> None: + def __init__( + self, array: np.ndarray, axis: Optional[Union[int, str]] = None + ) -> None: self.mean = np.mean(array, axis=axis, keepdims=True) self.var = np.var(array, axis=axis, keepdims=True) self.std = np.sqrt(self.var) @@ -110,16 +118,25 @@ def batch_correlate( """ # summary stats try: - summary_stats = {'x': SummaryStats(x), 'y': SummaryStats(y)} + summary_stats = {"x": SummaryStats(x), "y": SummaryStats(y)} except ZeroDivisionError: return CorrValue(np.nan, np.nan) # Correlation coefficients - pearson = np.mean(summary_stats['x'].norm * summary_stats['y'].norm, - axis=axis, keepdims=True) - concor = (2 * pearson * summary_stats['x'].std * summary_stats['y'].std / - (summary_stats['x'].var + summary_stats['y'].var + - (summary_stats['x'].mean - summary_stats['y'].mean) ** 2)) + pearson = np.mean( + summary_stats["x"].norm * summary_stats["y"].norm, axis=axis, keepdims=True + ) + concor = ( + 2 + * pearson + * summary_stats["x"].std + * summary_stats["y"].std + / ( + summary_stats["x"].var + + summary_stats["y"].var + + (summary_stats["x"].mean - summary_stats["y"].mean) ** 2 + ) + ) # Squeeze reduced singleton dimensions if axis is not None: concor = np.squeeze(concor, axis=axis) @@ -127,46 +144,57 @@ def batch_correlate( return CorrValue(concor, pearson) -def determine_indices(df : pd.DataFrame) -> list: +def determine_indices(df: pd.DataFrame) -> list: """Determine indices of str-type columns in a DataFrame""" - return [i for i, val in - enumerate(df.applymap(lambda _: isinstance(_, str)).values[0]) if - val] + return [ + i + for i, val in enumerate(df.applymap(lambda _: isinstance(_, str)).values[0]) + if val + ] def correlate_text_based(txts: Union[list, tuple]) -> Generator: delimiters = tuple(delimiter_from_filepath(path) for path in txts) # TODO: why do we drop columns containing na? - initial_load = [pd.read_csv(txt, delimiter=delimiters[i], comment='#' - ).dropna(axis=1) for i, txt in enumerate(txts)] + initial_load = [ + pd.read_csv(txt, delimiter=delimiters[i], comment="#").dropna(axis=1) + for i, txt in enumerate(txts) + ] for i, df in enumerate(initial_load): # if we read a value-row as a header, fix that try: df.columns.astype(float) - initial_load[i] = pd.read_csv(txts[i], delimiter=delimiters[i], - comment='#', header=None - ).dropna(axis=1) + initial_load[i] = pd.read_csv( + txts[i], delimiter=delimiters[i], comment="#", header=None + ).dropna(axis=1) except ValueError: pass # assume string columns are indices and not values to correlate indices = [] for i in range(len(initial_load)): - indices.append(np.where(df.apply( - lambda _: _.dtype == np.dtypes.ObjectDType))[0]) + indices.append( + np.where(df.apply(lambda _: _.dtype == np.dtypes.ObjectDType))[0] + ) oned = [] for i, index in enumerate(indices): if index.shape[0]: - oned.append(pd.read_csv(txts[i], delimiter=delimiters[i], - comment='#', index_col=indices[i] - ).dropna(axis=1).values) + oned.append( + pd.read_csv( + txts[i], delimiter=delimiters[i], comment="#", index_col=indices[i] + ) + .dropna(axis=1) + .values + ) else: oned.append(initial_load[i].values) return (np.nanmean(measure) for measure in batch_correlate(*oned, axis=0)) -def create_unique_file_dict(filepaths: list[str], output_folder_path: str, - replacements: Optional[list[str]] = None - ) -> dict[str, dict[tuple, str]]: +def create_unique_file_dict( + filepaths: list[str], + output_folder_path: str, + replacements: Optional[list[str]] = None, +) -> dict[str, dict[tuple, str]]: """ Parameters ---------- @@ -182,20 +210,17 @@ def create_unique_file_dict(filepaths: list[str], output_folder_path: str, ------- files_dict : dict a dictionary of dictionaries, format: - files_dict["centrality"] = + files_dict["centrality"] = {("centrality", midpath, nums): , ..} """ files_dict = {} for filepath in filepaths: - if "_stack" in filepath: continue - if ("itk" in filepath) or ("xfm" in filepath) or ( - "montage" in filepath - ): + if ("itk" in filepath) or ("xfm" in filepath) or ("montage" in filepath): continue path_changes = [] real_filepath = filepath @@ -205,7 +230,8 @@ def create_unique_file_dict(filepaths: list[str], output_folder_path: str, raise SyntaxError( "\n\n[!] In the replacements text file, the old " "substring and its replacement must be separated " - "by a comma.\n\n") + "by a comma.\n\n" + ) word, new = word_couple.split(",") if word in filepath: path_changes.append(f"old: {filepath}") @@ -242,15 +268,15 @@ def create_unique_file_dict(filepaths: list[str], output_folder_path: str, else: tags = [] category = filename - category = category.rstrip('.gz').rstrip('.nii') + category = category.rstrip(".gz").rstrip(".nii") - excl_tags = ['sub-', 'ses-', 'task-', 'run-', 'acq-'] + excl_tags = ["sub-", "ses-", "task-", "run-", "acq-"] # len(filetag) == 1 is temporary for broken/missing ses-* tag for filetag in filename.split("_"): for exctag in excl_tags: if exctag in filetag or len(filetag) == 1: - category = category.replace(f'{filetag}_', '') + category = category.replace(f"{filetag}_", "") # this provides a way to safely identify the specific file # without relying on a full string of the filename (because @@ -259,7 +285,7 @@ def create_unique_file_dict(filepaths: list[str], output_folder_path: str, nums_in_folder = [int(s) for s in folder if s.isdigit()] nums_in_filename = [int(s) for s in filename if s.isdigit()] - file_nums = '' + file_nums = "" for num in nums_in_folder: file_nums = file_nums + str(num) @@ -280,33 +306,35 @@ def create_unique_file_dict(filepaths: list[str], output_folder_path: str, files_dict[category] = {} files_dict[category].update(temp_dict) - + return files_dict -def gather_all_files(input_dct: dict, pickle_dir: str, - source: str = 'output_dir') -> tuple[dict, dict]: +def gather_all_files( + input_dct: dict, pickle_dir: str, source: str = "output_dir" +) -> tuple[dict, dict]: """ Given an input dictionary, a pickle directory, and (optionally) a source, returns a pair of dicts """ file_dct_list = [{}, {}] - for index, (key, pipe_dct) in enumerate(input_dct['pipelines'].items()): - + for index, (key, pipe_dct) in enumerate(input_dct["pipelines"].items()): pipe_outdir = pipe_dct[source] - if input_dct['settings']['s3_creds']: + if input_dct["settings"]["s3_creds"]: if not "s3://" in pipe_outdir: - err = "\n\n[!] If pulling output files from an S3 bucket, the "\ - "output folder path must have the s3:// prefix.\n\n" + err = ( + "\n\n[!] If pulling output files from an S3 bucket, the " + "output folder path must have the s3:// prefix.\n\n" + ) raise Exception(err) else: - pipe_outdir = os.path.abspath(pipe_outdir).rstrip('/') + pipe_outdir = os.path.abspath(pipe_outdir).rstrip("/") - pipeline_name = pipe_outdir.split('/')[-1] + pipeline_name = pipe_outdir.split("/")[-1] - #if source == "output_dir" and "pipeline_" not in pipeline_name: + # if source == "output_dir" and "pipeline_" not in pipeline_name: # err = "\n\n[!] Your pipeline output directory has to be a specific " \ # "one that has the 'pipeline_' prefix.\n\n(Not the main output " \ # "directory that contains all of the 'pipeline_X' subdirectories," \ @@ -316,13 +344,16 @@ def gather_all_files(input_dct: dict, pickle_dir: str, output_pkl = os.path.join(pickle_dir, f"{key}_{source}_paths.p") if os.path.exists(output_pkl): - print(f"Found output list pickle for {key}, skipping output file" - "path parsing..") + print( + f"Found output list pickle for {key}, skipping output file" + "path parsing.." + ) pipeline_files_dct = read_pickle(output_pkl) else: pipeline_files_list = gather_local_filepaths(pipe_outdir) pipeline_files_dct = create_unique_file_dict( - pipeline_files_list, pipe_outdir, pipe_dct['replacements']) + pipeline_files_list, pipe_outdir, pipe_dct["replacements"] + ) write_pickle(pipeline_files_dct, output_pkl) file_dct_list[index] = pipeline_files_dct @@ -330,9 +361,10 @@ def gather_all_files(input_dct: dict, pickle_dir: str, return tuple(file_dct_list) -def match_filepaths(old_files_dict: dict[str, dict[tuple, str]], - new_files_dict: dict[str, dict[tuple, str]] - ) -> dict[str, dict[tuple, ]]: +def match_filepaths( + old_files_dict: dict[str, dict[tuple, str]], + new_files_dict: dict[str, dict[tuple, str]], +) -> dict[str, dict[tuple,]]: """Returns a dictionary mapping each filepath from the first C-PAC run to the second one, matched to derivative, strategy, and scan. @@ -361,15 +393,15 @@ def match_filepaths(old_files_dict: dict[str, dict[tuple, str]], if key in old_files_dict.keys(): for file_id in new_files_dict[key]: if file_id in old_files_dict[key].keys(): - if key not in matched_path_dict.keys(): matched_path_dict[key] = {} - matched_path_dict[key][file_id] = \ + matched_path_dict[key][file_id] = ( old_files_dict[key][file_id] + new_files_dict[key][file_id] + ) else: - missing_in_old.append(file_id)#new_files_dict[key][file_id]) + missing_in_old.append(file_id) # new_files_dict[key][file_id]) else: missing_in_old.append(new_files_dict[key]) @@ -380,14 +412,16 @@ def match_filepaths(old_files_dict: dict[str, dict[tuple, str]], missing_in_new.append(old_files_dict[key]) if len(matched_path_dict) == 0: - err = "\n\n[!] No output paths were successfully matched between " \ - "the two CPAC output directories!\n\n" + err = ( + "\n\n[!] No output paths were successfully matched between " + "the two CPAC output directories!\n\n" + ) raise Exception(err) matched_files_dct = { "matched": matched_path_dict, "missing_old": missing_in_old, - "missing_new": missing_in_new + "missing_new": missing_in_new, } return matched_files_dct @@ -397,23 +431,23 @@ def delimiter_from_filepath(filepath: Union[Path, str]) -> Optional[str]: """ Given a filepath, return expected value-separator delimiter """ - if filepath.endswith('.tsv'): - return '\t' - if filepath.endswith('.csv'): - return ',' - with open(filepath, 'r', encoding='utf8') as _f: - first_line = '#' - while first_line.lstrip().startswith('#'): + if filepath.endswith(".tsv"): + return "\t" + if filepath.endswith(".csv"): + return "," + with open(filepath, "r", encoding="utf8") as _f: + first_line = "#" + while first_line.lstrip().startswith("#"): first_line = _f.readline() - for delimiter in ['\t', ',', ' ']: + for delimiter in ["\t", ",", " "]: if delimiter in first_line: - if delimiter == ' ': - return r'\s+' + if delimiter == " ": + return r"\s+" return delimiter return None -def calculate_correlation(args_tuple): +def calculate_correlation(args_tuple): category = args_tuple[0] old_path = args_tuple[1] new_path = args_tuple[2] @@ -429,14 +463,16 @@ def calculate_correlation(args_tuple): if s3_creds: try: # full filepath with filename - old_local_file = os.path.join(local_dir, "s3_input_files", \ - old_path.replace("s3://","")) + old_local_file = os.path.join( + local_dir, "s3_input_files", old_path.replace("s3://", "") + ) # directory without filename - old_local_path = old_local_file.replace(old_path.split("/")[-1],"") + old_local_path = old_local_file.replace(old_path.split("/")[-1], "") - new_local_file = os.path.join(local_dir, "s3_input_files", \ - new_path.replace("s3://","")) - new_local_path = new_local_file.replace(new_path.split("/")[-1],"") + new_local_file = os.path.join( + local_dir, "s3_input_files", new_path.replace("s3://", "") + ) + new_local_path = new_local_file.replace(new_path.split("/")[-1], "") if not os.path.exists(old_local_path): os.makedirs(old_local_path) @@ -444,39 +480,44 @@ def calculate_correlation(args_tuple): os.makedirs(new_local_path) except Exception as e: - err = "\n\nLocals: {0}\n\n[!] Could not create the local S3 " \ - "download directory.\n\nError details: {1}\n\n".format((locals(), e)) + err = ( + "\n\nLocals: {0}\n\n[!] Could not create the local S3 " + "download directory.\n\nError details: {1}\n\n".format((locals(), e)) + ) raise Exception(e) try: old_path = old_local_file except Exception as e: - err = "\n\nLocals: {0}\n\n[!] Could not download the files from " \ - "the S3 bucket. \nS3 filepath: {1}\nLocal destination: {2}" \ - "\nS3 creds: {3}\n\nError details: {4}\n\n".format(locals(), - old_path, - old_local_path, - s3_creds, e) + err = ( + "\n\nLocals: {0}\n\n[!] Could not download the files from " + "the S3 bucket. \nS3 filepath: {1}\nLocal destination: {2}" + "\nS3 creds: {3}\n\nError details: {4}\n\n".format( + locals(), old_path, old_local_path, s3_creds, e + ) + ) raise Exception(e) try: new_path = new_local_file except Exception as e: - err = "\n\nLocals: {0}\n\n[!] Could not download the files from " \ - "the S3 bucket. \nS3 filepath: {1}\nLocal destination: {2}" \ - "\nS3 creds: {3}\n\nError details: {4}\n\n".format(locals(), - new_path, - new_local_path, - s3_creds, e) + err = ( + "\n\nLocals: {0}\n\n[!] Could not download the files from " + "the S3 bucket. \nS3 filepath: {1}\nLocal destination: {2}" + "\nS3 creds: {3}\n\nError details: {4}\n\n".format( + locals(), new_path, new_local_path, s3_creds, e + ) + ) raise Exception(e) ## nibabel to pull the data from the re-assembled file paths if os.path.exists(old_path) and os.path.exists(new_path): - - if ('.csv' in old_path and '.csv' in new_path) or \ - ('.txt' in old_path and '.txt' in new_path) or \ - ('.1D' in old_path and '.1D' in new_path) or \ - ('.tsv' in old_path and '.tsv' in new_path): + if ( + (".csv" in old_path and ".csv" in new_path) + or (".txt" in old_path and ".txt" in new_path) + or (".1D" in old_path and ".1D" in new_path) + or (".tsv" in old_path and ".tsv" in new_path) + ): try: concor, pearson = correlate_text_based((old_path, new_path)) except Exception as e: @@ -490,7 +531,7 @@ def calculate_correlation(args_tuple): print("Success - {0}".format(str(concor))) # except Exception as e: - # corr_tuple = ("file reading problem: {0}".format(e), + # corr_tuple = ("file reading problem: {0}".format(e), # old_path, new_path) # if verbose: # print(str(corr_tuple)) @@ -511,7 +552,7 @@ def calculate_correlation(args_tuple): data_2 = nb.load(new_path).get_fdata() # except Exception as e: - # corr_tuple = ("file reading problem: {0}".format(e), + # corr_tuple = ("file reading problem: {0}".format(e), # old_path, new_path) # if verbose: # print(str(corr_tuple)) @@ -528,8 +569,7 @@ def calculate_correlation(args_tuple): else: concor, pearson = batch_correlate(data_1, data_2) except Exception as e: - corr_tuple = ("correlating problem: {0}".format(e), - old_path, new_path) + corr_tuple = ("correlating problem: {0}".format(e), old_path, new_path) if verbose: print(str(corr_tuple)) return corr_tuple @@ -562,55 +602,54 @@ def calculate_correlation(args_tuple): return corr_tuple -def run_correlations(matched_dct, input_dct, source='output_dir', quick=False, verbose=False): - - all_corr_dct = { - 'pearson': {}, - 'concordance': {}, - 'sub_optimal': {} - } +def run_correlations( + matched_dct, input_dct, source="output_dir", quick=False, verbose=False +): + all_corr_dct = {"pearson": {}, "concordance": {}, "sub_optimal": {}} args_list = [] quick_list = [ - 'anatomical_brain', - 'anatomical_csf_mask', - 'anatomical_gm_mask', - 'anatomical_wm_mask', - 'anatomical_to_standard', - 'functional_preprocessed', - 'functional_brain_mask', - 'mean_functional_in_anat', - 'functional_nuisance_residuals', - 'functional_nuisance_regressors', - 'functional_to_standard', - 'roi_timeseries' + "anatomical_brain", + "anatomical_csf_mask", + "anatomical_gm_mask", + "anatomical_wm_mask", + "anatomical_to_standard", + "functional_preprocessed", + "functional_brain_mask", + "mean_functional_in_anat", + "functional_nuisance_residuals", + "functional_nuisance_regressors", + "functional_to_standard", + "roi_timeseries", ] - matched_path_dct = matched_dct['matched'] - output_dir = input_dct['settings']['correlations_dir'] - s3_creds = input_dct['settings']['s3_creds'] + matched_path_dct = matched_dct["matched"] + output_dir = input_dct["settings"]["correlations_dir"] + s3_creds = input_dct["settings"]["s3_creds"] for category in matched_path_dct.keys(): - if quick: if category not in quick_list: continue for file_id in matched_path_dct[category].keys(): - old_path = matched_path_dct[category][file_id][0] new_path = matched_path_dct[category][file_id][1] - if source == 'work_dir': - args_list.append((file_id, old_path, new_path, output_dir, s3_creds, verbose)) + if source == "work_dir": + args_list.append( + (file_id, old_path, new_path, output_dir, s3_creds, verbose) + ) else: - args_list.append((category, old_path, new_path, output_dir, s3_creds, verbose)) + args_list.append( + (category, old_path, new_path, output_dir, s3_creds, verbose) + ) print("\nNumber of correlations to calculate: {0}\n".format(len(args_list))) print("Running correlations...") - p = Pool(input_dct['settings']['n_cpus']) + p = Pool(input_dct["settings"]["n_cpus"]) corr_tuple_list = p.map(calculate_correlation, args_list) p.close() p.join() @@ -623,24 +662,23 @@ def run_correlations(matched_dct, input_dct, source='output_dir', quick=False, v if not corr_tuple: continue if isinstance(corr_tuple[1], Exception): - failures.append((corr_tuple[0], corr_tuple[1], - ' | '.join(corr_tuple[2]))) + failures.append((corr_tuple[0], corr_tuple[1], " | ".join(corr_tuple[2]))) continue - if corr_tuple[0] not in all_corr_dct['concordance'].keys(): - all_corr_dct['concordance'][corr_tuple[0]] = [] - if corr_tuple[0] not in all_corr_dct['pearson'].keys(): - all_corr_dct['pearson'][corr_tuple[0]] = [] - all_corr_dct['concordance'][corr_tuple[0]] += corr_tuple[1] - all_corr_dct['pearson'][corr_tuple[0]] += corr_tuple[2] + if corr_tuple[0] not in all_corr_dct["concordance"].keys(): + all_corr_dct["concordance"][corr_tuple[0]] = [] + if corr_tuple[0] not in all_corr_dct["pearson"].keys(): + all_corr_dct["pearson"][corr_tuple[0]] = [] + all_corr_dct["concordance"][corr_tuple[0]] += corr_tuple[1] + all_corr_dct["pearson"][corr_tuple[0]] += corr_tuple[2] if len(corr_tuple) > 3: - if corr_tuple[0] not in all_corr_dct['sub_optimal'].keys(): - all_corr_dct['sub_optimal'][corr_tuple[0]] = [] + if corr_tuple[0] not in all_corr_dct["sub_optimal"].keys(): + all_corr_dct["sub_optimal"][corr_tuple[0]] = [] try: - all_corr_dct['sub_optimal'][corr_tuple[0]].append("{0}:\n{1}\n{2}" - "\n\n".format(corr_tuple[1][0], - corr_tuple[3][0], - corr_tuple[3][1])) + all_corr_dct["sub_optimal"][corr_tuple[0]].append( + "{0}:\n{1}\n{2}" + "\n\n".format(corr_tuple[1][0], corr_tuple[3][0], corr_tuple[3][1]) + ) except TypeError: pass @@ -648,13 +686,12 @@ def run_correlations(matched_dct, input_dct, source='output_dir', quick=False, v def post180_organize_correlations(concor_dct, corr_type="concordance", quick=False): - corr_map_dct = {"correlations": {}} for key in concor_dct: if "problem" in key: continue # shouldn't need this - FIX - rawkey = key.replace('acq-', '').replace('run-', '') + rawkey = key.replace("acq-", "").replace("run-", "") datatype = rawkey.split("_")[-1] if datatype not in corr_map_dct["correlations"]: @@ -679,40 +716,28 @@ def organize_correlations(concor_dict, corr_type="concordance", quick=False): corr_map_dict = {} corr_map_dict["correlations"] = {} - derivs = [ - 'alff', - 'dr_tempreg', - 'reho', - 'sca_roi', - 'timeseries', - 'ndmg'] - anats = [ - 'anatomical', - 'seg' - ] + derivs = ["alff", "dr_tempreg", "reho", "sca_roi", "timeseries", "ndmg"] + anats = ["anatomical", "seg"] time_series = [ - 'functional_freq', - 'nuisance_residuals', - 'functional_preprocessed', - 'functional_to_standard', - 'ica_aroma_', - 'motion_correct', - 'slice_time', + "functional_freq", + "nuisance_residuals", + "functional_preprocessed", + "functional_to_standard", + "ica_aroma_", + "motion_correct", + "slice_time", ] - funcs = [ - 'functional', - 'displacement'] + funcs = ["functional", "displacement"] for key in concor_dict: - if quick: core[key] = concor_dict[key] continue - if 'xfm' in key or 'mixel' in key: + if "xfm" in key or "mixel" in key: continue - if 'centrality' in key or 'vmhc' in key or 'sca_tempreg' in key: + if "centrality" in key or "vmhc" in key or "sca_tempreg" in key: template_outputs[key] = concor_dict[key] continue @@ -722,7 +747,7 @@ def organize_correlations(concor_dict, corr_type="concordance", quick=False): continue for word in derivs: - if word in key and 'standard' not in key: + if word in key and "standard" not in key: native_outputs[key] = concor_dict[key] continue elif word in key: @@ -730,7 +755,7 @@ def organize_correlations(concor_dict, corr_type="concordance", quick=False): continue for word in time_series: - if word in key and 'mean' not in key and 'mask' not in key: + if word in key and "mean" not in key and "mask" not in key: timeseries[key] = concor_dict[key] continue @@ -751,7 +776,7 @@ def organize_correlations(concor_dict, corr_type="concordance", quick=False): corr_map_dict["correlations"][group] = regCorrMap else: print("No values in {0}".format(group)) - + group = "{0}_native_space_outputs".format(corr_type) if len(native_outputs.values()) > 0: corr_map_dict["correlations"][group] = native_outputs @@ -793,23 +818,24 @@ def quick_summary(dct, corr_map_dct, output_dir) -> dict: lines.append("{0}: {1}".format(output_type, corrmean)) dct = write_dct(dct, lines, output_type) - return(dct) + return dct -def compare_pipelines(input_dct: dict, - dir_type: str = 'output_dir') -> tuple[dict, dict]: +def compare_pipelines( + input_dct: dict, dir_type: str = "output_dir" +) -> tuple[dict, dict]: """ Given an input dict containing keys 'settings', gather prreviously generated pickles or all relevant output and working files - + Returns ------- corr_map : dict - + pearson_map : dict """ - output_dir = input_dct['settings']['output_dir'] - pickle_dir = input_dct['settings']['pickle_dir'] + output_dir = input_dct["settings"]["output_dir"] + pickle_dir = input_dct["settings"]["pickle_dir"] corrs_pkl = os.path.join(pickle_dir, f"{dir_type}_correlations.p") failures_pkl = os.path.join(pickle_dir, f"{dir_type}_failures.p") @@ -817,40 +843,51 @@ def compare_pipelines(input_dct: dict, all_corr_dct = None if os.path.exists(corrs_pkl): - print(f"\n\nFound the correlations pickle: {corrs_pkl}\n\n" - "Starting from there..\n") + print( + f"\n\nFound the correlations pickle: {corrs_pkl}\n\n" + "Starting from there..\n" + ) all_corr_dct = read_pickle(corrs_pkl) elif os.path.exists(matched_pkl): - print(f"\n\nFound the matched filepaths pickle: {matched_pkl}\n\n" - "Starting from there..\n") + print( + f"\n\nFound the matched filepaths pickle: {matched_pkl}\n\n" + "Starting from there..\n" + ) matched_dct = read_pickle(matched_pkl) else: # gather all relevant output and working files - outfiles1_dct, outfiles2_dct = gather_all_files(input_dct, pickle_dir, - source=dir_type) + outfiles1_dct, outfiles2_dct = gather_all_files( + input_dct, pickle_dir, source=dir_type + ) matched_dct = match_filepaths(outfiles1_dct, outfiles2_dct) write_pickle(matched_dct, matched_pkl) if not all_corr_dct: - all_corr_dct, failures = run_correlations(matched_dct, - input_dct, - source=dir_type, - quick=input_dct['settings']['quick'], - verbose=input_dct['settings']['verbose']) + all_corr_dct, failures = run_correlations( + matched_dct, + input_dct, + source=dir_type, + quick=input_dct["settings"]["quick"], + verbose=input_dct["settings"]["verbose"], + ) write_pickle(all_corr_dct, corrs_pkl) write_pickle(failures, failures_pkl) - - if dir_type == 'work_dir': + + if dir_type == "work_dir": sorted_vals = [] - #sorted_keys = sorted(all_corr_dct, key=all_corr_dct.get) - for key in all_corr_dct.keys(): #sorted_keys: - if 'file reading problem:' in key or 'different shape' in key or 'correlating problem' in key: + # sorted_keys = sorted(all_corr_dct, key=all_corr_dct.get) + for key in all_corr_dct.keys(): # sorted_keys: + if ( + "file reading problem:" in key + or "different shape" in key + or "correlating problem" in key + ): continue else: sorted_vals.append("{0}: {1}".format(all_corr_dct[key], key)) working_corrs_file = os.path.join(output_dir, "work_dir_correlations.txt") - with open(working_corrs_file, 'wt') as f: + with open(working_corrs_file, "wt") as f: for line in sorted_vals: f.write(line) f.write("\n") @@ -862,30 +899,36 @@ def compare_pipelines(input_dct: dict, else: organize = post180_organize_correlations - corr_map_dict = organize(all_corr_dct["concordance"], "concordance", - quick=input_dct['settings']['quick']) + corr_map_dict = organize( + all_corr_dct["concordance"], + "concordance", + quick=input_dct["settings"]["quick"], + ) corr_map_dict["pipeline_names"] = input_dct["pipelines"].keys() - - pearson_map_dict = organize(all_corr_dct["pearson"], "pearson", - quick=input_dct['settings']['quick']) + + pearson_map_dict = organize( + all_corr_dct["pearson"], "pearson", quick=input_dct["settings"]["quick"] + ) pearson_map_dict["pipeline_names"] = input_dct["pipelines"].keys() dct = {} corr_map = quick_summary(dct, corr_map_dict, output_dir) pearson_map = quick_summary(dct, pearson_map_dict, output_dir) - if all_corr_dct['sub_optimal']: - write_yml_file(all_corr_dct['sub_optimal'], os.path.join(output_dir, "sub_optimal.yml")) + if all_corr_dct["sub_optimal"]: + write_yml_file( + all_corr_dct["sub_optimal"], os.path.join(output_dir, "sub_optimal.yml") + ) - #for corr_group_name in corr_map_dict["correlations"].keys(): + # for corr_group_name in corr_map_dict["correlations"].keys(): # corr_group = corr_map_dict["correlations"][corr_group_name] # create_boxplot(corr_group, corr_group_name, # corr_map_dict["pipeline_names"], output_dir) - #for corr_group_name in pearson_map_dict["correlations"].keys(): + # for corr_group_name in pearson_map_dict["correlations"].keys(): # corr_group = pearson_map_dict["correlations"][corr_group_name] # create_boxplot(corr_group, corr_group_name, # pearson_map_dict["pipeline_names"], output_dir) - return(corr_map, pearson_map) + return (corr_map, pearson_map) def main() -> tuple: @@ -895,12 +938,11 @@ def main() -> tuple: • Check for already completed stuff (pickles) """ parser = argparse.ArgumentParser() - parser.add_argument("input_yaml", type=str, - help="file path of the script's input YAML") - parser.add_argument("--data_source", type=str, - help="Which site data comes from") - parser.add_argument("--branch", type=str, - help="Branch name") + parser.add_argument( + "input_yaml", type=str, help="file path of the script's input YAML" + ) + parser.add_argument("--data_source", type=str, help="Which site data comes from") + parser.add_argument("--branch", type=str, help="Branch name") args = parser.parse_args() data_source = args.data_source branch = args.branch @@ -910,27 +952,29 @@ def main() -> tuple: # check for already completed stuff (pickles) output_dir = os.path.join( - os.getcwd(), f"correlations_{input_dct['settings']['run_name']}") + os.getcwd(), f"correlations_{input_dct['settings']['run_name']}" + ) pickle_dir = os.path.join(output_dir, "pickles") if not os.path.exists(pickle_dir): try: os.makedirs(pickle_dir) except: - err = "\n\n[!] Could not create the output directory for the " \ - "correlations. Do you have write permissions?\nAttempted " \ - f"output directory: {output_dir}\n\n" + err = ( + "\n\n[!] Could not create the output directory for the " + "correlations. Do you have write permissions?\nAttempted " + f"output directory: {output_dir}\n\n" + ) raise Exception(err) - input_dct['settings'].update({'output_dir': output_dir, - 'pickle_dir': pickle_dir}) + input_dct["settings"].update({"output_dir": output_dir, "pickle_dir": pickle_dir}) - corr_map, pearson_map = compare_pipelines(input_dct, dir_type='output_dir') + corr_map, pearson_map = compare_pipelines(input_dct, dir_type="output_dir") corr_map_keys = list(corr_map.keys()) all_keys = [] for key in corr_map_keys: keys = list(corr_map[key]) - for i in keys: + for i in keys: all_keys.append(i) return all_keys, data_source, branch diff --git a/create_yml.py b/create_yml.py index a7238ea..fb4a3ed 100644 --- a/create_yml.py +++ b/create_yml.py @@ -1,29 +1,48 @@ -from utils.parse_yaml import cpac_yaml - import os + import click -@click.command() -@click.option('--pipeline1', required=True, type=str, help='Path to output directory from CPAC run ' - 'to correlate against pipeline2') -@click.option('--pipeline2', required=True, type=str, help='Path to output directory from CPAC run ' - 'to correlate against pipeline1') -@click.option('--workspace', type=str, help = 'directory to save correlations') -@click.option('--branch', type=str, help = 'branch name') -@click.option('--data_source', type=str, help = 'Data site') +from utils.parse_yaml import cpac_yaml +@click.command() +@click.option( + "--pipeline1", + required=True, + type=str, + help="Path to output directory from CPAC run " "to correlate against pipeline2", +) +@click.option( + "--pipeline2", + required=True, + type=str, + help="Path to output directory from CPAC run " "to correlate against pipeline1", +) +@click.option("--workspace", type=str, help="directory to save correlations") +@click.option("--branch", type=str, help="branch name") +@click.option("--data_source", type=str, help="Data site") def main(pipeline1, pipeline2, workspace, branch, data_source): """ Correlate outputs from regression run again another C-PAC version. """ - git_home = os.path.normpath(os.path.dirname(os.path.abspath(__file__)) + os.sep + os.pardir) - run_name = f'{branch}_{data_source}' - - cpac_yaml(pipeline1, pipeline2, f'{workspace}/correlations', run_name, 1, branch, data_source) + git_home = os.path.normpath( + os.path.dirname(os.path.abspath(__file__)) + os.sep + os.pardir + ) + run_name = f"{branch}_{data_source}" + + cpac_yaml( + pipeline1, + pipeline2, + f"{workspace}/correlations", + run_name, + 1, + branch, + data_source, + ) return + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/utils/html_script.py b/utils/html_script.py index 7a23761..c0aff5c 100644 --- a/utils/html_script.py +++ b/utils/html_script.py @@ -1,4 +1,3 @@ - def dataset(name, data_source, value): dataset = f""" {{ @@ -9,10 +8,11 @@ def dataset(name, data_source, value): """ return dataset + def body(all_keys, data_source): - data_body = '' + data_body = "" for key in all_keys: - name_value = key.split(': ') + name_value = key.split(": ") name = name_value[0] value = name_value[1] data_body += dataset(name, data_source, value) @@ -23,9 +23,9 @@ def body(all_keys, data_source): """ return data_body + def write_html(data_body): - script = \ - f""" + script = f""" Correlations @@ -92,15 +92,16 @@ def write_html(data_body): """ - return(script) + return script + def setup_browser(html_template): import tempfile import webbrowser - with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as temp_file: - temp_file.write(html_template.encode('utf-8')) - filename = 'file:///'+ temp_file.name + with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as temp_file: + temp_file.write(html_template.encode("utf-8")) + filename = "file:///" + temp_file.name webbrowser.open_new_tab(filename) - - return \ No newline at end of file + + return diff --git a/utils/parse_yaml.py b/utils/parse_yaml.py index f288dee..bc1932e 100644 --- a/utils/parse_yaml.py +++ b/utils/parse_yaml.py @@ -1,59 +1,69 @@ import os + import yaml + def get_dir(paths): if not paths: directory = None else: for root, dirs, files in os.walk(paths): for dir in dirs: - if 'pipeline_' in dir: + if "pipeline_" in dir: directory = os.path.join(root, dir) return directory -def write_pipeline_yaml(output_dir=None, working_dir=None, log_dir=None, \ - pipeline_config=None, pipeline_name=None): +def write_pipeline_yaml( + output_dir=None, + working_dir=None, + log_dir=None, + pipeline_config=None, + pipeline_name=None, +): pipeline = { pipeline_name: { "output_dir": output_dir, "work_dir": working_dir, "log_dir": log_dir, "pipe_config": pipeline_config, - "replacements": None + "replacements": None, } } return pipeline + def parse_yaml(directory=None, pipeline_name=None): - subdirs = ['log', 'working', 'output'] + subdirs = ["log", "working", "output"] paths = {} for subdir in subdirs: if os.path.isdir(os.path.join(directory, subdir)): - paths[f"{subdir}_dir"] = (os.path.join(directory, subdir)) + paths[f"{subdir}_dir"] = os.path.join(directory, subdir) else: paths[f"{subdir}_dir"] = None - log_dir = get_dir(paths['log_dir']) + log_dir = get_dir(paths["log_dir"]) - for root, dirs, files in os.walk(paths['log_dir']): + for root, dirs, files in os.walk(paths["log_dir"]): for file in files: if file.endswith("Z.yml"): pipeline_config = os.path.join(root, file) - working_dir = get_dir(paths['working_dir']) - output_dir = get_dir(paths['output_dir']) + working_dir = get_dir(paths["working_dir"]) + output_dir = get_dir(paths["output_dir"]) - pipeline_dict = write_pipeline_yaml(output_dir, working_dir, log_dir, \ - pipeline_config, pipeline_name) + pipeline_dict = write_pipeline_yaml( + output_dir, working_dir, log_dir, pipeline_config, pipeline_name + ) return pipeline_dict -def write_yaml(pipeline_1=None, pipeline_2=None, correlations_dir=None, \ - run_name=None, n_cpus=None): +def write_yaml( + pipeline_1=None, pipeline_2=None, correlations_dir=None, run_name=None, n_cpus=None +): yaml_dict = {} yaml_dict["settings"] = { "n_cpus": n_cpus, @@ -61,25 +71,25 @@ def write_yaml(pipeline_1=None, pipeline_2=None, correlations_dir=None, \ "run_name": run_name, "s3_creds": None, "quick": False, - "verbose": False + "verbose": False, } - yaml_dict["pipelines"] = { - **pipeline_1, - **pipeline_2 - } + yaml_dict["pipelines"] = {**pipeline_1, **pipeline_2} return yaml_dict -def cpac_yaml(pipeline1, pipeline2, correlations_dir, run_name, n_cpus, branch, data_source): - - pipeline_1 = parse_yaml(pipeline1, 'pipeline_1') - pipeline_2 = parse_yaml(pipeline2, 'pipeline_2') - yaml_contents = write_yaml(pipeline_1, pipeline_2, correlations_dir, - run_name, n_cpus) +def cpac_yaml( + pipeline1, pipeline2, correlations_dir, run_name, n_cpus, branch, data_source +): + pipeline_1 = parse_yaml(pipeline1, "pipeline_1") + pipeline_2 = parse_yaml(pipeline2, "pipeline_2") + + yaml_contents = write_yaml( + pipeline_1, pipeline_2, correlations_dir, run_name, n_cpus + ) - with open(f'{branch}_{data_source}.yml', 'w') as file: + with open(f"{branch}_{data_source}.yml", "w") as file: yaml.dump(yaml_contents, file, default_flow_style=False, sort_keys=False) - return \ No newline at end of file + return From 0cdde222426dc302a2be93447bfef0aa69a4f22e Mon Sep 17 00:00:00 2001 From: Jon Clucas Date: Tue, 14 Nov 2023 17:52:08 -0500 Subject: [PATCH 06/28] :truck: SSOT cpac_correlations Ref https://github.com/FCP-INDI/CPAC_regtest_pack/pull/7 --- calculate_correlations.py | 983 +------------------------------------- requirements.txt | 1 + 2 files changed, 5 insertions(+), 979 deletions(-) create mode 100644 requirements.txt diff --git a/calculate_correlations.py b/calculate_correlations.py index 7edbdfc..103f5ed 100644 --- a/calculate_correlations.py +++ b/calculate_correlations.py @@ -1,986 +1,11 @@ #!/usr/bin/env python -import argparse -import itertools -import math -import os -import pickle -import subprocess -from collections.abc import Generator -from multiprocessing import Pool -from pathlib import Path -from typing import NamedTuple, Optional, Tuple, Union - -import nibabel as nb -import numpy as np -import pandas as pd -import yaml +"""Calculate correlations and write them to D3-friendly file""" +from cpac_correlations import cpac_correlations from utils.html_script import body -Axis = Union[int, Tuple[int, ...]] - - -class CorrValue(NamedTuple): - """Correlation values""" - - concor: np.ndarray - pearson: np.ndarray - - -def read_yml_file(yml_filepath): - with open(yml_filepath, "r") as f: - yml_dict = yaml.safe_load(f) - - return yml_dict - - -def write_yml_file(yml_dict, out_filepath): - with open(out_filepath, "wt") as f: - yaml.safe_dump(yml_dict, f) - - -def read_pickle(pickle_file): - with open(pickle_file, "rb") as f: - dct = pickle.load(f) - return dct - - -def write_pickle(dct, out_filepath): - with open(out_filepath, "wb") as f: - pickle.dump(dct, f, protocol=pickle.HIGHEST_PROTOCOL) - - -def read_txt_file(txt_file): - with open(txt_file, "r") as f: - strings = f.read().splitlines() - return strings - - -def write_txt_file(text_lines, out_filepath): - with open(out_filepath, "wt") as f: - for line in text_lines: - f.write("{0}\n".format(line)) - - -def write_dct(dct=None, text_lines=None, outname=None): - if not dct: - dct = {outname: text_lines} - else: - dct.update({outname: text_lines}) - return dct - - -def gather_local_filepaths(output_folder_path: str) -> list[str]: - """Given a local path, return relevant paths within that directory""" - filepaths = [] - - print("Gathering file paths from {0}\n".format(output_folder_path)) - for root, _dirs, files in os.walk(output_folder_path): - # loops through every file in the directory - for filename in files: - # checks if the file is a nifti (.nii.gz) - if ( - ".nii" in filename - or ".csv" in filename - or ".txt" in filename - or ".1D" in filename - or ".tsv" in filename - ): - filepaths.append(os.path.join(root, filename)) - - if len(filepaths) == 0: - raise FileNotFoundError( - "\n\n[!] No filepaths were found given the output folder!\n\n" - ) - - return filepaths - - -class SummaryStats: - def __init__( - self, array: np.ndarray, axis: Optional[Union[int, str]] = None - ) -> None: - self.mean = np.mean(array, axis=axis, keepdims=True) - self.var = np.var(array, axis=axis, keepdims=True) - self.std = np.sqrt(self.var) - self.norm = (array - self.mean) / self.std - - -def batch_correlate( - x: np.ndarray, y: np.ndarray, axis: Optional[Axis] = None -) -> CorrValue: - """ - Compute a batch of concordance and Pearson correlation coefficients between - x and y along an axis (or axes). - - References: - https://en.wikipedia.org/wiki/Concordance_correlation_coefficient - """ - # summary stats - try: - summary_stats = {"x": SummaryStats(x), "y": SummaryStats(y)} - except ZeroDivisionError: - return CorrValue(np.nan, np.nan) - - # Correlation coefficients - pearson = np.mean( - summary_stats["x"].norm * summary_stats["y"].norm, axis=axis, keepdims=True - ) - concor = ( - 2 - * pearson - * summary_stats["x"].std - * summary_stats["y"].std - / ( - summary_stats["x"].var - + summary_stats["y"].var - + (summary_stats["x"].mean - summary_stats["y"].mean) ** 2 - ) - ) - # Squeeze reduced singleton dimensions - if axis is not None: - concor = np.squeeze(concor, axis=axis) - pearson = np.squeeze(pearson, axis=axis) - return CorrValue(concor, pearson) - - -def determine_indices(df: pd.DataFrame) -> list: - """Determine indices of str-type columns in a DataFrame""" - return [ - i - for i, val in enumerate(df.applymap(lambda _: isinstance(_, str)).values[0]) - if val - ] - - -def correlate_text_based(txts: Union[list, tuple]) -> Generator: - delimiters = tuple(delimiter_from_filepath(path) for path in txts) - # TODO: why do we drop columns containing na? - initial_load = [ - pd.read_csv(txt, delimiter=delimiters[i], comment="#").dropna(axis=1) - for i, txt in enumerate(txts) - ] - for i, df in enumerate(initial_load): - # if we read a value-row as a header, fix that - try: - df.columns.astype(float) - initial_load[i] = pd.read_csv( - txts[i], delimiter=delimiters[i], comment="#", header=None - ).dropna(axis=1) - except ValueError: - pass - # assume string columns are indices and not values to correlate - indices = [] - for i in range(len(initial_load)): - indices.append( - np.where(df.apply(lambda _: _.dtype == np.dtypes.ObjectDType))[0] - ) - oned = [] - for i, index in enumerate(indices): - if index.shape[0]: - oned.append( - pd.read_csv( - txts[i], delimiter=delimiters[i], comment="#", index_col=indices[i] - ) - .dropna(axis=1) - .values - ) - else: - oned.append(initial_load[i].values) - return (np.nanmean(measure) for measure in batch_correlate(*oned, axis=0)) - - -def create_unique_file_dict( - filepaths: list[str], - output_folder_path: str, - replacements: Optional[list[str]] = None, -) -> dict[str, dict[tuple, str]]: - """ - Parameters - ---------- - filepaths : list of str - list of output filepaths from a CPAC output directory - output_folder_path : str - the CPAC output directory the filepaths are from - replacements : list of str, optional - a list of strings to be removed from the filepaths should - they occur - - Returns - ------- - files_dict : dict - a dictionary of dictionaries, format: - files_dict["centrality"] = - {("centrality", midpath, nums): , ..} - """ - - files_dict = {} - - for filepath in filepaths: - if "_stack" in filepath: - continue - - if ("itk" in filepath) or ("xfm" in filepath) or ("montage" in filepath): - continue - path_changes = [] - real_filepath = filepath - if replacements: - for word_couple in replacements: - if "," not in word_couple: - raise SyntaxError( - "\n\n[!] In the replacements text file, the old " - "substring and its replacement must be separated " - "by a comma.\n\n" - ) - word, new = word_couple.split(",") - if word in filepath: - path_changes.append(f"old: {filepath}") - filepath = filepath.replace(word, new) - path_changes.append(f"new: {filepath}") - if path_changes: - with open(os.path.join(os.getcwd(), "path_changes.txt"), "wt") as f: - for path in path_changes: - f.write(path) - f.write("\n") - - filename = filepath.split("/")[-1] - - # name of the directory the file is in - folder = filepath.split("/")[-2] - - midpath = filepath.replace(output_folder_path, "") - midpath = midpath.replace(filename, "") - - pre180 = False - if pre180: - # name of the output type/derivative - try: - category = midpath.split("/")[2] - except IndexError as e: - continue - - if "eigenvector" in filepath: - category = category + ": eigenvector" - if "degree" in filepath: - category = category + ": degree" - if "lfcd" in filepath: - category = category + ": lfcd" - else: - tags = [] - category = filename - category = category.rstrip(".gz").rstrip(".nii") - - excl_tags = ["sub-", "ses-", "task-", "run-", "acq-"] - - # len(filetag) == 1 is temporary for broken/missing ses-* tag - for filetag in filename.split("_"): - for exctag in excl_tags: - if exctag in filetag or len(filetag) == 1: - category = category.replace(f"{filetag}_", "") - - # this provides a way to safely identify the specific file - # without relying on a full string of the filename (because - # this can change between versions depending on what any given - # processing tool appends to output file names) - nums_in_folder = [int(s) for s in folder if s.isdigit()] - nums_in_filename = [int(s) for s in filename if s.isdigit()] - - file_nums = "" - - for num in nums_in_folder: - file_nums = file_nums + str(num) - - for num in nums_in_filename: - file_nums = file_nums + str(num) - - # load these settings into the tuple so that the file can be - # identified without relying on its full path (as it would be - # impossible to match files from two regression tests just - # based on their filepaths) - file_tuple = (category, midpath, file_nums) - - temp_dict = {} - temp_dict[file_tuple] = [real_filepath] - - if category not in files_dict.keys(): - files_dict[category] = {} - - files_dict[category].update(temp_dict) - - return files_dict - - -def gather_all_files( - input_dct: dict, pickle_dir: str, source: str = "output_dir" -) -> tuple[dict, dict]: - """ - Given an input dictionary, a pickle directory, and (optionally) a source, - returns a pair of dicts - """ - file_dct_list = [{}, {}] - - for index, (key, pipe_dct) in enumerate(input_dct["pipelines"].items()): - pipe_outdir = pipe_dct[source] - - if input_dct["settings"]["s3_creds"]: - if not "s3://" in pipe_outdir: - err = ( - "\n\n[!] If pulling output files from an S3 bucket, the " - "output folder path must have the s3:// prefix.\n\n" - ) - raise Exception(err) - else: - pipe_outdir = os.path.abspath(pipe_outdir).rstrip("/") - - pipeline_name = pipe_outdir.split("/")[-1] - - # if source == "output_dir" and "pipeline_" not in pipeline_name: - # err = "\n\n[!] Your pipeline output directory has to be a specific " \ - # "one that has the 'pipeline_' prefix.\n\n(Not the main output " \ - # "directory that contains all of the 'pipeline_X' subdirectories," \ - # "and not a specific participant's output subdirectory either.)\n" - # raise Exception(err) - - output_pkl = os.path.join(pickle_dir, f"{key}_{source}_paths.p") - - if os.path.exists(output_pkl): - print( - f"Found output list pickle for {key}, skipping output file" - "path parsing.." - ) - pipeline_files_dct = read_pickle(output_pkl) - else: - pipeline_files_list = gather_local_filepaths(pipe_outdir) - pipeline_files_dct = create_unique_file_dict( - pipeline_files_list, pipe_outdir, pipe_dct["replacements"] - ) - write_pickle(pipeline_files_dct, output_pkl) - - file_dct_list[index] = pipeline_files_dct - - return tuple(file_dct_list) - - -def match_filepaths( - old_files_dict: dict[str, dict[tuple, str]], - new_files_dict: dict[str, dict[tuple, str]], -) -> dict[str, dict[tuple,]]: - """Returns a dictionary mapping each filepath from the first C-PAC - run to the second one, matched to derivative, strategy, and scan. - - Parameters - ---------- - old_files_dict, new_files_dict : dict - each key is a derivative name, and each value is another - dictionary keying (derivative, mid-path, last digit in path) - tuples to a list containing the full filepath described by - the tuple that is the key - - Returns - ------- - matched_path_dict : dict - same as the input dictionaries, except the list in the - sub-dictionary value has both file paths that are matched - """ - - # file path matching - matched_path_dict = {} - missing_in_old = [] - missing_in_new = [] - - for key in new_files_dict: - # for types of derivative... - if key in old_files_dict.keys(): - for file_id in new_files_dict[key]: - if file_id in old_files_dict[key].keys(): - if key not in matched_path_dict.keys(): - matched_path_dict[key] = {} - - matched_path_dict[key][file_id] = ( - old_files_dict[key][file_id] + new_files_dict[key][file_id] - ) - - else: - missing_in_old.append(file_id) # new_files_dict[key][file_id]) - else: - missing_in_old.append(new_files_dict[key]) - - # find out what is in the last version's outputs that isn't in the new - # version's outputs - for key in old_files_dict: - if new_files_dict.get(key) != None: - missing_in_new.append(old_files_dict[key]) - - if len(matched_path_dict) == 0: - err = ( - "\n\n[!] No output paths were successfully matched between " - "the two CPAC output directories!\n\n" - ) - raise Exception(err) - - matched_files_dct = { - "matched": matched_path_dict, - "missing_old": missing_in_old, - "missing_new": missing_in_new, - } - - return matched_files_dct - - -def delimiter_from_filepath(filepath: Union[Path, str]) -> Optional[str]: - """ - Given a filepath, return expected value-separator delimiter - """ - if filepath.endswith(".tsv"): - return "\t" - if filepath.endswith(".csv"): - return "," - with open(filepath, "r", encoding="utf8") as _f: - first_line = "#" - while first_line.lstrip().startswith("#"): - first_line = _f.readline() - for delimiter in ["\t", ",", " "]: - if delimiter in first_line: - if delimiter == " ": - return r"\s+" - return delimiter - return None - - -def calculate_correlation(args_tuple): - category = args_tuple[0] - old_path = args_tuple[1] - new_path = args_tuple[2] - local_dir = args_tuple[3] - s3_creds = args_tuple[4] - verbose = args_tuple[5] - - if verbose: - print("Calculating correlation between {0} and {1}".format(old_path, new_path)) - - corr_tuple = None - - if s3_creds: - try: - # full filepath with filename - old_local_file = os.path.join( - local_dir, "s3_input_files", old_path.replace("s3://", "") - ) - # directory without filename - old_local_path = old_local_file.replace(old_path.split("/")[-1], "") - - new_local_file = os.path.join( - local_dir, "s3_input_files", new_path.replace("s3://", "") - ) - new_local_path = new_local_file.replace(new_path.split("/")[-1], "") - - if not os.path.exists(old_local_path): - os.makedirs(old_local_path) - if not os.path.exists(new_local_path): - os.makedirs(new_local_path) - - except Exception as e: - err = ( - "\n\nLocals: {0}\n\n[!] Could not create the local S3 " - "download directory.\n\nError details: {1}\n\n".format((locals(), e)) - ) - raise Exception(e) - - try: - old_path = old_local_file - except Exception as e: - err = ( - "\n\nLocals: {0}\n\n[!] Could not download the files from " - "the S3 bucket. \nS3 filepath: {1}\nLocal destination: {2}" - "\nS3 creds: {3}\n\nError details: {4}\n\n".format( - locals(), old_path, old_local_path, s3_creds, e - ) - ) - raise Exception(e) - - try: - new_path = new_local_file - except Exception as e: - err = ( - "\n\nLocals: {0}\n\n[!] Could not download the files from " - "the S3 bucket. \nS3 filepath: {1}\nLocal destination: {2}" - "\nS3 creds: {3}\n\nError details: {4}\n\n".format( - locals(), new_path, new_local_path, s3_creds, e - ) - ) - raise Exception(e) - - ## nibabel to pull the data from the re-assembled file paths - if os.path.exists(old_path) and os.path.exists(new_path): - if ( - (".csv" in old_path and ".csv" in new_path) - or (".txt" in old_path and ".txt" in new_path) - or (".1D" in old_path and ".1D" in new_path) - or (".tsv" in old_path and ".tsv" in new_path) - ): - try: - concor, pearson = correlate_text_based((old_path, new_path)) - except Exception as e: - return category, e, (old_path, new_path) - - if concor > 0.980: - corr_tuple = (category, [concor], [pearson]) - else: - corr_tuple = (category, [concor], [pearson], (old_path, new_path)) - if verbose: - print("Success - {0}".format(str(concor))) - - # except Exception as e: - # corr_tuple = ("file reading problem: {0}".format(e), - # old_path, new_path) - # if verbose: - # print(str(corr_tuple)) - - return corr_tuple - - else: - # try: - old_file_img = nb.load(old_path) - old_file_hdr = old_file_img.header - new_file_img = nb.load(new_path) - new_file_hdr = new_file_img.header - - old_file_dims = old_file_hdr.get_zooms() - new_file_dims = new_file_hdr.get_zooms() - - data_1 = nb.load(old_path).get_fdata() - data_2 = nb.load(new_path).get_fdata() - - # except Exception as e: - # corr_tuple = ("file reading problem: {0}".format(e), - # old_path, new_path) - # if verbose: - # print(str(corr_tuple)) - # return corr_tuple - - ## set up and run the Pearson correlation and concordance correlation - if data_1.flatten().shape == data_2.flatten().shape: - try: - if len(old_file_dims) > 3: - axis = tuple(range(3, len(old_file_dims))) - concor, pearson = batch_correlate(data_1, data_2, axis=axis) - concor = np.nanmean(concor) - pearson = np.nanmean(pearson) - else: - concor, pearson = batch_correlate(data_1, data_2) - except Exception as e: - corr_tuple = ("correlating problem: {0}".format(e), old_path, new_path) - if verbose: - print(str(corr_tuple)) - return corr_tuple - if concor > 0.980: - corr_tuple = (category, [concor], [pearson]) - else: - corr_tuple = (category, [concor], [pearson], (old_path, new_path)) - if verbose: - print("Success - {0}".format(str(concor))) - else: - corr_tuple = ("different shape", old_path, new_path) - if verbose: - print(str(corr_tuple)) - - else: - if not os.path.exists(old_path): - corr_tuple = ("file doesn't exist", [old_path], None) - if verbose: - print(str(corr_tuple)) - if not os.path.exists(new_path): - if not corr_tuple: - corr_tuple = ("file doesn't exist", [new_path], None) - if verbose: - print(str(corr_tuple)) - else: - corr_tuple = ("file doesn't exist", old_path, new_path) - if verbose: - print(str(corr_tuple)) - - return corr_tuple - - -def run_correlations( - matched_dct, input_dct, source="output_dir", quick=False, verbose=False -): - all_corr_dct = {"pearson": {}, "concordance": {}, "sub_optimal": {}} - - args_list = [] - - quick_list = [ - "anatomical_brain", - "anatomical_csf_mask", - "anatomical_gm_mask", - "anatomical_wm_mask", - "anatomical_to_standard", - "functional_preprocessed", - "functional_brain_mask", - "mean_functional_in_anat", - "functional_nuisance_residuals", - "functional_nuisance_regressors", - "functional_to_standard", - "roi_timeseries", - ] - - matched_path_dct = matched_dct["matched"] - output_dir = input_dct["settings"]["correlations_dir"] - s3_creds = input_dct["settings"]["s3_creds"] - - for category in matched_path_dct.keys(): - if quick: - if category not in quick_list: - continue - - for file_id in matched_path_dct[category].keys(): - old_path = matched_path_dct[category][file_id][0] - new_path = matched_path_dct[category][file_id][1] - - if source == "work_dir": - args_list.append( - (file_id, old_path, new_path, output_dir, s3_creds, verbose) - ) - else: - args_list.append( - (category, old_path, new_path, output_dir, s3_creds, verbose) - ) - - print("\nNumber of correlations to calculate: {0}\n".format(len(args_list))) - - print("Running correlations...") - p = Pool(input_dct["settings"]["n_cpus"]) - corr_tuple_list = p.map(calculate_correlation, args_list) - p.close() - p.join() - - print("\nCorrelations of the {0} are done.\n".format(source)) - - failures = [] - - for corr_tuple in corr_tuple_list: - if not corr_tuple: - continue - if isinstance(corr_tuple[1], Exception): - failures.append((corr_tuple[0], corr_tuple[1], " | ".join(corr_tuple[2]))) - continue - if corr_tuple[0] not in all_corr_dct["concordance"].keys(): - all_corr_dct["concordance"][corr_tuple[0]] = [] - if corr_tuple[0] not in all_corr_dct["pearson"].keys(): - all_corr_dct["pearson"][corr_tuple[0]] = [] - all_corr_dct["concordance"][corr_tuple[0]] += corr_tuple[1] - all_corr_dct["pearson"][corr_tuple[0]] += corr_tuple[2] - - if len(corr_tuple) > 3: - if corr_tuple[0] not in all_corr_dct["sub_optimal"].keys(): - all_corr_dct["sub_optimal"][corr_tuple[0]] = [] - try: - all_corr_dct["sub_optimal"][corr_tuple[0]].append( - "{0}:\n{1}\n{2}" - "\n\n".format(corr_tuple[1][0], corr_tuple[3][0], corr_tuple[3][1]) - ) - except TypeError: - pass - - return all_corr_dct, failures - - -def post180_organize_correlations(concor_dct, corr_type="concordance", quick=False): - corr_map_dct = {"correlations": {}} - for key in concor_dct: - if "problem" in key: - continue - # shouldn't need this - FIX - rawkey = key.replace("acq-", "").replace("run-", "") - datatype = rawkey.split("_")[-1] - - if datatype not in corr_map_dct["correlations"]: - corr_map_dct["correlations"][datatype] = {} - corr_map_dct["correlations"][datatype][rawkey] = concor_dct[key] - - return corr_map_dct - - -def organize_correlations(concor_dict, corr_type="concordance", quick=False): - # break up all of the correlations into groups - each group of derivatives - # will go into its own boxplot - - regCorrMap = {} - native_outputs = {} - template_outputs = {} - timeseries = {} - functionals = {} - - core = {} - - corr_map_dict = {} - corr_map_dict["correlations"] = {} - - derivs = ["alff", "dr_tempreg", "reho", "sca_roi", "timeseries", "ndmg"] - anats = ["anatomical", "seg"] - time_series = [ - "functional_freq", - "nuisance_residuals", - "functional_preprocessed", - "functional_to_standard", - "ica_aroma_", - "motion_correct", - "slice_time", - ] - funcs = ["functional", "displacement"] - - for key in concor_dict: - if quick: - core[key] = concor_dict[key] - continue - - if "xfm" in key or "mixel" in key: - continue - - if "centrality" in key or "vmhc" in key or "sca_tempreg" in key: - template_outputs[key] = concor_dict[key] - continue - - for word in anats: - if word in key: - regCorrMap[key] = concor_dict[key] - continue - - for word in derivs: - if word in key and "standard" not in key: - native_outputs[key] = concor_dict[key] - continue - elif word in key: - template_outputs[key] = concor_dict[key] - continue - - for word in time_series: - if word in key and "mean" not in key and "mask" not in key: - timeseries[key] = concor_dict[key] - continue - - for word in funcs: - if word in key: - functionals[key] = concor_dict[key] - - if quick: - group = "{0}_core_outputs".format(corr_type) - if len(core.values()) > 0: - corr_map_dict["correlations"][group] = core - else: - print("No values in {0}".format(group)) - return corr_map_dict - - group = "{0}_registration_and_segmentation".format(corr_type) - if len(regCorrMap.values()) > 0: - corr_map_dict["correlations"][group] = regCorrMap - else: - print("No values in {0}".format(group)) - - group = "{0}_native_space_outputs".format(corr_type) - if len(native_outputs.values()) > 0: - corr_map_dict["correlations"][group] = native_outputs - else: - print("No values in {0}".format(group)) - - group = "{0}_template_space_outputs".format(corr_type) - if len(template_outputs.values()) > 0: - corr_map_dict["correlations"][group] = template_outputs - else: - print("No values in {0}".format(group)) - - group = "{0}_timeseries_outputs".format(corr_type) - if len(timeseries.values()) > 0: - corr_map_dict["correlations"][group] = timeseries - else: - print("No values in {0}".format(group)) - - group = "{0}_functional_outputs".format(corr_type) - if len(functionals.values()) > 0: - corr_map_dict["correlations"][group] = functionals - else: - print("No values in {0}".format(group)) - - return corr_map_dict - - -def quick_summary(dct, corr_map_dct, output_dir) -> dict: - for corr_group in corr_map_dct["correlations"].keys(): - cat_dct = {} - lines = [] - for output_type, corr_vec in dict( - corr_map_dct["correlations"][corr_group] - ).items(): - try: - corrmean = np.mean(np.asarray(corr_vec)) - except TypeError: - continue - lines.append("{0}: {1}".format(output_type, corrmean)) - - dct = write_dct(dct, lines, output_type) - return dct - - -def compare_pipelines( - input_dct: dict, dir_type: str = "output_dir" -) -> tuple[dict, dict]: - """ - Given an input dict containing keys 'settings', gather prreviously - generated pickles or all relevant output and working files - - Returns - ------- - corr_map : dict - - pearson_map : dict - """ - output_dir = input_dct["settings"]["output_dir"] - pickle_dir = input_dct["settings"]["pickle_dir"] - - corrs_pkl = os.path.join(pickle_dir, f"{dir_type}_correlations.p") - failures_pkl = os.path.join(pickle_dir, f"{dir_type}_failures.p") - matched_pkl = os.path.join(pickle_dir, f"{dir_type}_matched_files.p") - - all_corr_dct = None - if os.path.exists(corrs_pkl): - print( - f"\n\nFound the correlations pickle: {corrs_pkl}\n\n" - "Starting from there..\n" - ) - all_corr_dct = read_pickle(corrs_pkl) - elif os.path.exists(matched_pkl): - print( - f"\n\nFound the matched filepaths pickle: {matched_pkl}\n\n" - "Starting from there..\n" - ) - matched_dct = read_pickle(matched_pkl) - - else: - # gather all relevant output and working files - outfiles1_dct, outfiles2_dct = gather_all_files( - input_dct, pickle_dir, source=dir_type - ) - matched_dct = match_filepaths(outfiles1_dct, outfiles2_dct) - write_pickle(matched_dct, matched_pkl) - - if not all_corr_dct: - all_corr_dct, failures = run_correlations( - matched_dct, - input_dct, - source=dir_type, - quick=input_dct["settings"]["quick"], - verbose=input_dct["settings"]["verbose"], - ) - write_pickle(all_corr_dct, corrs_pkl) - write_pickle(failures, failures_pkl) - - if dir_type == "work_dir": - sorted_vals = [] - # sorted_keys = sorted(all_corr_dct, key=all_corr_dct.get) - for key in all_corr_dct.keys(): # sorted_keys: - if ( - "file reading problem:" in key - or "different shape" in key - or "correlating problem" in key - ): - continue - else: - sorted_vals.append("{0}: {1}".format(all_corr_dct[key], key)) - working_corrs_file = os.path.join(output_dir, "work_dir_correlations.txt") - with open(working_corrs_file, "wt") as f: - for line in sorted_vals: - f.write(line) - f.write("\n") - - else: - pre180 = False - if pre180: - organize = organize_correlations - else: - organize = post180_organize_correlations - - corr_map_dict = organize( - all_corr_dct["concordance"], - "concordance", - quick=input_dct["settings"]["quick"], - ) - corr_map_dict["pipeline_names"] = input_dct["pipelines"].keys() - - pearson_map_dict = organize( - all_corr_dct["pearson"], "pearson", quick=input_dct["settings"]["quick"] - ) - pearson_map_dict["pipeline_names"] = input_dct["pipelines"].keys() - dct = {} - corr_map = quick_summary(dct, corr_map_dict, output_dir) - pearson_map = quick_summary(dct, pearson_map_dict, output_dir) - - if all_corr_dct["sub_optimal"]: - write_yml_file( - all_corr_dct["sub_optimal"], os.path.join(output_dir, "sub_optimal.yml") - ) - - # for corr_group_name in corr_map_dict["correlations"].keys(): - # corr_group = corr_map_dict["correlations"][corr_group_name] - # create_boxplot(corr_group, corr_group_name, - # corr_map_dict["pipeline_names"], output_dir) - - # for corr_group_name in pearson_map_dict["correlations"].keys(): - # corr_group = pearson_map_dict["correlations"][corr_group_name] - # create_boxplot(corr_group, corr_group_name, - # pearson_map_dict["pipeline_names"], output_dir) - return (corr_map, pearson_map) - - -def main() -> tuple: - """ - • Parse commandline arguments - • Read input YAML - • Check for already completed stuff (pickles) - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "input_yaml", type=str, help="file path of the script's input YAML" - ) - parser.add_argument("--data_source", type=str, help="Which site data comes from") - parser.add_argument("--branch", type=str, help="Branch name") - args = parser.parse_args() - data_source = args.data_source - branch = args.branch - - # get the input info - input_dct = read_yml_file(args.input_yaml) - - # check for already completed stuff (pickles) - output_dir = os.path.join( - os.getcwd(), f"correlations_{input_dct['settings']['run_name']}" - ) - pickle_dir = os.path.join(output_dir, "pickles") - - if not os.path.exists(pickle_dir): - try: - os.makedirs(pickle_dir) - except: - err = ( - "\n\n[!] Could not create the output directory for the " - "correlations. Do you have write permissions?\nAttempted " - f"output directory: {output_dir}\n\n" - ) - raise Exception(err) - - input_dct["settings"].update({"output_dir": output_dir, "pickle_dir": pickle_dir}) - - corr_map, pearson_map = compare_pipelines(input_dct, dir_type="output_dir") - corr_map_keys = list(corr_map.keys()) - all_keys = [] - for key in corr_map_keys: - keys = list(corr_map[key]) - for i in keys: - all_keys.append(i) - return all_keys, data_source, branch - - if __name__ == "__main__": - all_keys, data_source, branch = main() + all_keys, data_source, branch = cpac_correlations() html_body = body(all_keys, data_source) - with open(f"{data_source}_{branch}.json", "w") as file: + with open(f"{data_source}_{branch}.json", "w", encoding="utf-8") as file: file.write(html_body) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..9b3ef36 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +git+https://github.com/FCP-INDI/CPAC_regtest_pack.git@package_correlations#subdirectory=cpac_correlations \ No newline at end of file From 164a8a656807afeac080e7da8b06964423668976 Mon Sep 17 00:00:00 2001 From: Jon Clucas Date: Thu, 28 Dec 2023 22:43:16 -0500 Subject: [PATCH 07/28] :package: Package repo for portable installation --- .gitignore | 1 + .pre-commit-config.yaml | 72 +++++++++++++--- pyproject.toml | 81 ++++++++++++++++++ requirements.txt | 1 - src/regression_dashboard/__init__.py | 0 .../build_d3_dashboard.py | 2 +- .../regression_dashboard/build_dashboard.py | 3 +- .../calculate_correlations.py | 4 +- .../regression_dashboard/create_yml.py | 15 +--- src/regression_dashboard/generate_comment.py | 3 + .../templates}/heatmap.html | 2 +- .../templates}/heatmap.js | 2 +- .../__pycache__/parse_yaml.cpython-311.pyc | Bin .../utils}/html_script.py | 22 ++--- .../regression_dashboard/utils}/parse_yaml.py | 18 ++-- 15 files changed, 170 insertions(+), 56 deletions(-) create mode 100644 .gitignore create mode 100644 pyproject.toml delete mode 100644 requirements.txt create mode 100644 src/regression_dashboard/__init__.py rename build_d3_dashboard.py => src/regression_dashboard/build_d3_dashboard.py (96%) rename build_dashboard.py => src/regression_dashboard/build_dashboard.py (93%) rename calculate_correlations.py => src/regression_dashboard/calculate_correlations.py (75%) rename create_yml.py => src/regression_dashboard/create_yml.py (71%) create mode 100644 src/regression_dashboard/generate_comment.py rename {templates => src/regression_dashboard/templates}/heatmap.html (96%) rename {templates => src/regression_dashboard/templates}/heatmap.js (99%) rename {utils => src/regression_dashboard/utils}/__pycache__/parse_yaml.cpython-311.pyc (100%) rename {utils => src/regression_dashboard/utils}/html_script.py (92%) rename {utils => src/regression_dashboard/utils}/parse_yaml.py (89%) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bee8a64 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0df1a99..267a008 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,63 @@ +fail_fast: false + repos: - - repo: https://github.com/pycqa/isort - rev: 5.11.5 - hooks: - - id: isort - files: "\\.(py)$" - - repo: https://github.com/psf/black - rev: 23.1.0 - hooks: - - id: black - files: "\\.(py)$" \ No newline at end of file +- repo: https://github.com/python-poetry/poetry + rev: 1.7.0 + hooks: + - id: poetry-check + +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.1.9 + hooks: + - id: ruff + args: [--fix] + - id: ruff-format + +- repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.8.0 + hooks: + - id: mypy + args: [--ignore-missing-imports] + additional_dependencies: + - types-toml + - types-PyYAML +- repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks + rev: v2.12.0 + hooks: + - id: pretty-format-yaml + args: + - --autofix + - --indent=2 + - id: pretty-format-toml + exclude: ^poetry.lock$ + args: + - --autofix + - --indent=2 + - --no-sort + +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: check-case-conflict + - id: end-of-file-fixer + - id: mixed-line-ending + args: + - --fix=lf + - id: trailing-whitespace + - id: pretty-format-json + args: + - --autofix + - --indent=4 + - --no-sort-keys + - id: check-merge-conflict + - id: check-yaml + - id: check-json + - id: check-toml + +- repo: local + hooks: + - id: yaml-file-extension + name: Prefer .yaml over .yml. + entry: YAML files must have .yaml extension. + language: fail + files: \.yml$ diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..fbcdc4a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,81 @@ +[tool.poetry] +name = "cpac_regression_dashboard" +version = "1.0.0" +description = "Generate a dashboard for C-PAC regression tests" +authors = [ + "Amy Gutierrez <58920810+amygutierrez@users.noreply.github.com>", + "Jon Clucas =1.2.0"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 9b3ef36..0000000 --- a/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -git+https://github.com/FCP-INDI/CPAC_regtest_pack.git@package_correlations#subdirectory=cpac_correlations \ No newline at end of file diff --git a/src/regression_dashboard/__init__.py b/src/regression_dashboard/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/build_d3_dashboard.py b/src/regression_dashboard/build_d3_dashboard.py similarity index 96% rename from build_d3_dashboard.py rename to src/regression_dashboard/build_d3_dashboard.py index 72d1f24..0cc47c9 100644 --- a/build_d3_dashboard.py +++ b/src/regression_dashboard/build_d3_dashboard.py @@ -18,7 +18,7 @@ def main(json_file=None, branch=None): body = etree.HTML(_f.read()) script_element = etree.SubElement(body[0], "script") script_element.set("defer", "defer") - script_element.set("src", f"./heatmap.js") + script_element.set("src", "./heatmap.js") with open("templates/heatmap.js", "r", encoding="utf-8") as _f: with open(f"{outdir}/heatmap.js", "w", encoding="utf=8") as _s: _s.write( diff --git a/build_dashboard.py b/src/regression_dashboard/build_dashboard.py similarity index 93% rename from build_dashboard.py rename to src/regression_dashboard/build_dashboard.py index 5880235..f00da15 100644 --- a/build_dashboard.py +++ b/src/regression_dashboard/build_dashboard.py @@ -2,13 +2,14 @@ import click -from utils.html_script import setup_browser, write_html +from .utils.html_script import setup_browser, write_html def process_option(ctx, param, value): if value is not None: values = value.split(",") return [val.strip() for val in values] + return [] @click.command() diff --git a/calculate_correlations.py b/src/regression_dashboard/calculate_correlations.py similarity index 75% rename from calculate_correlations.py rename to src/regression_dashboard/calculate_correlations.py index 103f5ed..d0b7326 100644 --- a/calculate_correlations.py +++ b/src/regression_dashboard/calculate_correlations.py @@ -1,8 +1,8 @@ #!/usr/bin/env python -"""Calculate correlations and write them to D3-friendly file""" +"""Calculate correlations and write them to D3-friendly file.""" from cpac_correlations import cpac_correlations -from utils.html_script import body +from .utils.html_script import body if __name__ == "__main__": all_keys, data_source, branch = cpac_correlations() diff --git a/create_yml.py b/src/regression_dashboard/create_yml.py similarity index 71% rename from create_yml.py rename to src/regression_dashboard/create_yml.py index fb4a3ed..b7eed2c 100644 --- a/create_yml.py +++ b/src/regression_dashboard/create_yml.py @@ -2,7 +2,7 @@ import click -from utils.parse_yaml import cpac_yaml +from .utils.parse_yaml import cpac_yaml @click.command() @@ -21,14 +21,9 @@ @click.option("--workspace", type=str, help="directory to save correlations") @click.option("--branch", type=str, help="branch name") @click.option("--data_source", type=str, help="Data site") -def main(pipeline1, pipeline2, workspace, branch, data_source): - """ - Correlate outputs from regression run again another C-PAC version. - """ - - git_home = os.path.normpath( - os.path.dirname(os.path.abspath(__file__)) + os.sep + os.pardir - ) +def main(pipeline1, pipeline2, workspace, branch, data_source) -> None: + """Correlate outputs from regression run again another C-PAC version.""" + os.path.normpath(os.path.dirname(os.path.abspath(__file__)) + os.sep + os.pardir) run_name = f"{branch}_{data_source}" cpac_yaml( @@ -41,8 +36,6 @@ def main(pipeline1, pipeline2, workspace, branch, data_source): data_source, ) - return - if __name__ == "__main__": main() diff --git a/src/regression_dashboard/generate_comment.py b/src/regression_dashboard/generate_comment.py new file mode 100644 index 0000000..7bff39a --- /dev/null +++ b/src/regression_dashboard/generate_comment.py @@ -0,0 +1,3 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""Gather generated PNGs and link to heatmap in a GitHub-flavored Markdown string.""" diff --git a/templates/heatmap.html b/src/regression_dashboard/templates/heatmap.html similarity index 96% rename from templates/heatmap.html rename to src/regression_dashboard/templates/heatmap.html index 0cd1893..2f5fc29 100644 --- a/templates/heatmap.html +++ b/src/regression_dashboard/templates/heatmap.html @@ -6,4 +6,4 @@
Correlations heatmap will load here!
- \ No newline at end of file + diff --git a/templates/heatmap.js b/src/regression_dashboard/templates/heatmap.js similarity index 99% rename from templates/heatmap.js rename to src/regression_dashboard/templates/heatmap.js index 0d4ca2d..c36bcd6 100644 --- a/templates/heatmap.js +++ b/src/regression_dashboard/templates/heatmap.js @@ -123,4 +123,4 @@ svg.append("text") .style("font-size", "14px") .style("fill", "grey") .style("max-width", 400) - .text("GRAPHSUBTITLE"); \ No newline at end of file + .text("GRAPHSUBTITLE"); diff --git a/utils/__pycache__/parse_yaml.cpython-311.pyc b/src/regression_dashboard/utils/__pycache__/parse_yaml.cpython-311.pyc similarity index 100% rename from utils/__pycache__/parse_yaml.cpython-311.pyc rename to src/regression_dashboard/utils/__pycache__/parse_yaml.cpython-311.pyc diff --git a/utils/html_script.py b/src/regression_dashboard/utils/html_script.py similarity index 92% rename from utils/html_script.py rename to src/regression_dashboard/utils/html_script.py index c0aff5c..99d60a6 100644 --- a/utils/html_script.py +++ b/src/regression_dashboard/utils/html_script.py @@ -1,12 +1,11 @@ -def dataset(name, data_source, value): - dataset = f""" +def dataset(name, data_source, value) -> str: + return f""" {{ "rowid": "{name}", "columnid": "{data_source}", "value": "{value}" }}, """ - return dataset def body(all_keys, data_source): @@ -16,16 +15,11 @@ def body(all_keys, data_source): name = name_value[0] value = name_value[1] data_body += dataset(name, data_source, value) - out = f""" - {{"data": [ - {data_body} - ]}} - """ return data_body -def write_html(data_body): - script = f""" +def write_html(data_body) -> str: + return f""" Correlations @@ -90,12 +84,10 @@ def write_html(data_body):
Correlations heatmap will load here!
- """ - - return script + """ # noqa: E501 -def setup_browser(html_template): +def setup_browser(html_template) -> None: import tempfile import webbrowser @@ -103,5 +95,3 @@ def setup_browser(html_template): temp_file.write(html_template.encode("utf-8")) filename = "file:///" + temp_file.name webbrowser.open_new_tab(filename) - - return diff --git a/utils/parse_yaml.py b/src/regression_dashboard/utils/parse_yaml.py similarity index 89% rename from utils/parse_yaml.py rename to src/regression_dashboard/utils/parse_yaml.py index bc1932e..73af52f 100644 --- a/utils/parse_yaml.py +++ b/src/regression_dashboard/utils/parse_yaml.py @@ -8,9 +8,9 @@ def get_dir(paths): directory = None else: for root, dirs, files in os.walk(paths): - for dir in dirs: - if "pipeline_" in dir: - directory = os.path.join(root, dir) + for _dir in dirs: + if "pipeline_" in _dir: + directory = os.path.join(root, _dir) return directory @@ -21,7 +21,7 @@ def write_pipeline_yaml( pipeline_config=None, pipeline_name=None, ): - pipeline = { + return { pipeline_name: { "output_dir": output_dir, "work_dir": working_dir, @@ -31,8 +31,6 @@ def write_pipeline_yaml( } } - return pipeline - def parse_yaml(directory=None, pipeline_name=None): subdirs = ["log", "working", "output"] @@ -54,12 +52,10 @@ def parse_yaml(directory=None, pipeline_name=None): working_dir = get_dir(paths["working_dir"]) output_dir = get_dir(paths["output_dir"]) - pipeline_dict = write_pipeline_yaml( + return write_pipeline_yaml( output_dir, working_dir, log_dir, pipeline_config, pipeline_name ) - return pipeline_dict - def write_yaml( pipeline_1=None, pipeline_2=None, correlations_dir=None, run_name=None, n_cpus=None @@ -81,7 +77,7 @@ def write_yaml( def cpac_yaml( pipeline1, pipeline2, correlations_dir, run_name, n_cpus, branch, data_source -): +) -> None: pipeline_1 = parse_yaml(pipeline1, "pipeline_1") pipeline_2 = parse_yaml(pipeline2, "pipeline_2") @@ -91,5 +87,3 @@ def cpac_yaml( with open(f"{branch}_{data_source}.yml", "w") as file: yaml.dump(yaml_contents, file, default_flow_style=False, sort_keys=False) - - return From f87094d8236874df0f315831cb1df869a4a79a34 Mon Sep 17 00:00:00 2001 From: Jon Clucas Date: Wed, 3 Jan 2024 16:21:41 -0500 Subject: [PATCH 08/28] :sparkles: Post boxplots and correlation coefficients to comments and PRs --- pyproject.toml | 2 + src/regression_dashboard/generate_comment.py | 144 +++++++++++++++++++ 2 files changed, 146 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index fbcdc4a..dd44ec6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,7 @@ packages = [{include = "regression_dashboard", from = "src"}] [tool.poetry.dependencies] python = ">=3.9" +PyGithub = "*" cpac-correlations = {git = "https://github.com/FCP-INDI/CPAC_regtest_pack.git", rev = "package_correlations", subdirectory = "cpac_correlations"} [tool.poetry.group.dev.dependencies] @@ -24,6 +25,7 @@ ruff = "^0.1.7" [tool.poetry.scripts] cpac_regsuite_create_yaml = 'regression_dashboard.create_yml:main' cpac_regsuite_create_yml = 'regression_dashboard.create_yml:main' +cpac_regsuite_generate_comment = 'regression_dashboard.generate_comment:main' [tool.pytest.ini_options] pythonpath = [ diff --git a/src/regression_dashboard/generate_comment.py b/src/regression_dashboard/generate_comment.py index 7bff39a..da79b60 100644 --- a/src/regression_dashboard/generate_comment.py +++ b/src/regression_dashboard/generate_comment.py @@ -1,3 +1,147 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """Gather generated PNGs and link to heatmap in a GitHub-flavored Markdown string.""" +from dataclasses import dataclass +import os +from pathlib import Path +import sys +from typing import Generator + +from github import Github + + +@dataclass +class EnvVars: + """Dataclass for environment variables.""" + + github_token: str + owner: str + repo: str + sha: str + testing_owner: str + + def __init__(self) -> None: + """Initialize the dataclass from the environment.""" + attrs = ["github_token", "owner", "repo", "sha", "testing_owner"] + for attr in attrs: + setattr(self, attr, os.environ.get(attr.upper(), "")) + + +_ENV = EnvVars() + + +def gather_images(path: Path) -> Generator[Path, None, None]: + """Gather the images. + + Parameters + ---------- + path : Path + The path to the correlations directory.. + + Yields + ------ + image : Path + The path to an image. + """ + return path.glob("*.png") + + +def gather_text(path: Path) -> str: + """Gathers and concatenates all text files in the given directory. + + Parameters + ---------- + path : Path + The path to the correlations directory. + + Returns + ------- + str + The concatenated text. + """ + text = "" + for file in path.glob("*.txt"): + with open(file, "r", encoding="utf=8") as _f: + text += _f.read() + text += "\n" + return text.strip() + + +def generate_comment(path: Path) -> str: + """Generate the comment. + + Parameters + ---------- + path : Path + The path to the correlations directory. + + Returns + ------- + str : The comment. + """ + comment = "" + for image in gather_images(path): + raw_image_path = _raw_image_path(_ENV.testing_owner, _ENV.repo, _ENV.sha, image) + comment += f"![{image.stem}]({raw_image_path})\n" + return comment + gather_text(path) + + +def main() -> None: + """Generate and post a comment on a GitHub commit. + + Also post the comment to any open PR in which the commit is the most recent. + """ + if len(sys.argv) > 1: + if sys.argv[1] in ["-h", "--help"]: + print("Usage: cpac_regsuite_generate_comment [path]") + print("If no path is given, the current working directory is used.") + print("Required environment variables:") + print( + "GITHUB_TOKEN: A personal access token with scope to write to " + "comments and pull requests." + ) + print("OWNER: The owner of the repository.") + print("REPO: The name of the repository.") + print("SHA: The SHA of the commit.") + print("TESTING_OWNER: The owner of the testing repository.") + sys.exit(0) + path = Path(sys.argv[1]) + else: + path = Path(os.getcwd()) + personal_access_token = os.environ.get("GITHUB_TOKEN") + g = Github(personal_access_token) + repo = g.get_repo(f"{_ENV.owner}/{_ENV.repo}") + commit = repo.get_commit(_ENV.sha) + comment = generate_comment(path) + commit.create_comment(comment) + for pr in repo.get_pulls(state="open", sort="created"): + if pr.head.sha == _ENV.sha: + pr.create_issue_comment(comment) + + +def _raw_image_path(owner: str, repo: str, sha: str, image: Path) -> str: + """Generate the raw image path. + + Parameters + ---------- + owner : str + The owner of the repository. + + repo : str + The name of the repository. + + sha : str + The SHA of the commit. + + image : Path + The path to the image. + + Returns + ------- + str : The raw image path. + """ + return f"https://raw.githubusercontent.com/{owner}/regtest-runlogs/{repo}_{sha}/{image.name}" + + +if __name__ == "__main__": + main() From 0465513cff02f7a9d6f10205e5fb0de5c8eb37e4 Mon Sep 17 00:00:00 2001 From: Jon Clucas Date: Fri, 5 Jan 2024 01:02:19 -0500 Subject: [PATCH 09/28] :sparkles: Add script to post a comment to a given commit and relevant PR(s) --- pyproject.toml | 18 ++- src/cpac_regression_dashboard/__init__.py | 3 + src/cpac_regression_dashboard/_version.py | 9 ++ .../build_d3_dashboard.py | 0 .../build_dashboard.py | 0 .../calculate_correlations.py | 0 .../create_yml.py | 0 .../generate_comment.py | 107 +++++++++++++++++- .../templates/heatmap.html | 0 .../templates/heatmap.js | 0 .../__pycache__/parse_yaml.cpython-311.pyc | Bin .../utils/html_script.py | 0 .../utils/parse_yaml.py | 0 src/regression_dashboard/__init__.py | 0 14 files changed, 125 insertions(+), 12 deletions(-) create mode 100644 src/cpac_regression_dashboard/__init__.py create mode 100644 src/cpac_regression_dashboard/_version.py rename src/{regression_dashboard => cpac_regression_dashboard}/build_d3_dashboard.py (100%) rename src/{regression_dashboard => cpac_regression_dashboard}/build_dashboard.py (100%) rename src/{regression_dashboard => cpac_regression_dashboard}/calculate_correlations.py (100%) rename src/{regression_dashboard => cpac_regression_dashboard}/create_yml.py (100%) rename src/{regression_dashboard => cpac_regression_dashboard}/generate_comment.py (52%) rename src/{regression_dashboard => cpac_regression_dashboard}/templates/heatmap.html (100%) rename src/{regression_dashboard => cpac_regression_dashboard}/templates/heatmap.js (100%) rename src/{regression_dashboard => cpac_regression_dashboard}/utils/__pycache__/parse_yaml.cpython-311.pyc (100%) rename src/{regression_dashboard => cpac_regression_dashboard}/utils/html_script.py (100%) rename src/{regression_dashboard => cpac_regression_dashboard}/utils/parse_yaml.py (100%) delete mode 100644 src/regression_dashboard/__init__.py diff --git a/pyproject.toml b/pyproject.toml index dd44ec6..fb14f3a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "cpac_regression_dashboard" -version = "1.0.0" +version = "1.0.0.dev1" description = "Generate a dashboard for C-PAC regression tests" authors = [ "Amy Gutierrez <58920810+amygutierrez@users.noreply.github.com>", @@ -8,11 +8,14 @@ authors = [ ] license = "LGPL-2.1" readme = "README.md" -packages = [{include = "regression_dashboard", from = "src"}] +packages = [{from = "src", include = "cpac_regression_dashboard"}] [tool.poetry.dependencies] python = ">=3.9" +cairosvg = "*" +gitpython = "*" PyGithub = "*" +pyppeteer = "*" cpac-correlations = {git = "https://github.com/FCP-INDI/CPAC_regtest_pack.git", rev = "package_correlations", subdirectory = "cpac_correlations"} [tool.poetry.group.dev.dependencies] @@ -23,9 +26,12 @@ pytest-cov = "^4.1.0" ruff = "^0.1.7" [tool.poetry.scripts] -cpac_regsuite_create_yaml = 'regression_dashboard.create_yml:main' -cpac_regsuite_create_yml = 'regression_dashboard.create_yml:main' -cpac_regsuite_generate_comment = 'regression_dashboard.generate_comment:main' +cpac_regsuite_create_yaml = 'cpac_regression_dashboard.create_yml:main' +cpac_regsuite_create_yml = 'cpac_regression_dashboard.create_yml:main' +cpac_regsuite_generate_comment = 'cpac_regression_dashboard.generate_comment:main' + +[tool.poetry.urls] +"Source Code" = "https://github.com/FCP-INDI/C-PAC_regression_dashboard" [tool.pytest.ini_options] pythonpath = [ @@ -40,7 +46,7 @@ src = ["src"] target-version = "py39" [tool.ruff.lint] -select = ["ANN", "D", "E", "F", "I"] +select = ["ANN", "D", "E", "F", "I", "Q"] ignore = [ "ANN101", # self should not be annotated. "ANN102" # cls should not be annotated. diff --git a/src/cpac_regression_dashboard/__init__.py b/src/cpac_regression_dashboard/__init__.py new file mode 100644 index 0000000..75013dc --- /dev/null +++ b/src/cpac_regression_dashboard/__init__.py @@ -0,0 +1,3 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +"""Create a dashboard of regression test results.""" diff --git a/src/cpac_regression_dashboard/_version.py b/src/cpac_regression_dashboard/_version.py new file mode 100644 index 0000000..f220d89 --- /dev/null +++ b/src/cpac_regression_dashboard/_version.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +"""Get version from packaging metadata.""" +from importlib.metadata import PackageNotFoundError, version + +try: + __version__ = version("cpac_regression_dashboard") +except PackageNotFoundError: + __version__ = "unknown" diff --git a/src/regression_dashboard/build_d3_dashboard.py b/src/cpac_regression_dashboard/build_d3_dashboard.py similarity index 100% rename from src/regression_dashboard/build_d3_dashboard.py rename to src/cpac_regression_dashboard/build_d3_dashboard.py diff --git a/src/regression_dashboard/build_dashboard.py b/src/cpac_regression_dashboard/build_dashboard.py similarity index 100% rename from src/regression_dashboard/build_dashboard.py rename to src/cpac_regression_dashboard/build_dashboard.py diff --git a/src/regression_dashboard/calculate_correlations.py b/src/cpac_regression_dashboard/calculate_correlations.py similarity index 100% rename from src/regression_dashboard/calculate_correlations.py rename to src/cpac_regression_dashboard/calculate_correlations.py diff --git a/src/regression_dashboard/create_yml.py b/src/cpac_regression_dashboard/create_yml.py similarity index 100% rename from src/regression_dashboard/create_yml.py rename to src/cpac_regression_dashboard/create_yml.py diff --git a/src/regression_dashboard/generate_comment.py b/src/cpac_regression_dashboard/generate_comment.py similarity index 52% rename from src/regression_dashboard/generate_comment.py rename to src/cpac_regression_dashboard/generate_comment.py index da79b60..3ddffa2 100644 --- a/src/regression_dashboard/generate_comment.py +++ b/src/cpac_regression_dashboard/generate_comment.py @@ -1,13 +1,21 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """Gather generated PNGs and link to heatmap in a GitHub-flavored Markdown string.""" +import asyncio from dataclasses import dataclass +from importlib.metadata import metadata import os from pathlib import Path import sys +import tempfile from typing import Generator +from cairosvg import svg2png +from git import Repo from github import Github +from pyppeteer import launch + +from ._version import __version__ @dataclass @@ -30,6 +38,45 @@ def __init__(self) -> None: _ENV = EnvVars() +@dataclass +class Heatmap: + """Heatmap dataclass.""" + + filename: str + content: str + + +def add_heatmap_to_branch(file: Heatmap) -> None: + """Add a heatmap to a branch. + + Parameters + ---------- + file : Heatmap + The heatmap file to add. + + Returns + ------- + None + """ + personal_access_token = os.environ.get("GITHUB_TOKEN") + g = Github(personal_access_token) + repo = g.get_repo(f"{_ENV.testing_owner}/regtest-runlogs") + branch_name = f"{_ENV.repo}_{_ENV.sha}" + with tempfile.TemporaryDirectory() as _temp_dir: + temp_dir = Path(_temp_dir) + local_repo = Repo.clone_from( + repo.clone_url, temp_dir, branch=branch_name, depth=1 + ) + svg_path = temp_dir / f"{file.filename}.svg" + png_path = temp_dir / f"{file.filename}.png" + with open(svg_path, "w") as _f: + _f.write(file.content) + svg2png(background_color="white", url=str(svg_path), write_to=str(png_path)) + local_repo.index.add([png_path]) + local_repo.index.commit(":loud_sound: Add heatmap image") + local_repo.remotes.origin.push(branch_name) + + def gather_images(path: Path) -> Generator[Path, None, None]: """Gather the images. @@ -59,15 +106,15 @@ def gather_text(path: Path) -> str: str The concatenated text. """ - text = "" + text = "|feature|coefficient|\n|---|---|\n" for file in path.glob("*.txt"): with open(file, "r", encoding="utf=8") as _f: - text += _f.read() - text += "\n" + for line in _f.readlines(): + text += f"|{'|'.join(_.strip() for _ in line.split(':', 1))}|\n" return text.strip() -def generate_comment(path: Path) -> str: +async def generate_comment(path: Path) -> str: """Generate the comment. Parameters @@ -79,13 +126,53 @@ def generate_comment(path: Path) -> str: ------- str : The comment. """ - comment = "" + project_urls = metadata(__package__).get_all("Project-URL", []) + source_url = None + for _url in project_urls: + if _url.startswith("Source Code, "): + source_url = _url.split(",")[1].strip() + break + if source_url is None: + comment = f"Generated by {__name__} {__version__}\n\n" + else: + _packageless_name = __name__.replace(__package__, "").lstrip(".") + comment = ( + f"Generated by [{__package__}]({source_url})." + f"{_packageless_name} {__version__}\n\n" + ) + comment += await get_heatmap() for image in gather_images(path): raw_image_path = _raw_image_path(_ENV.testing_owner, _ENV.repo, _ENV.sha, image) comment += f"![{image.stem}]({raw_image_path})\n" return comment + gather_text(path) +async def get_heatmap() -> str: + """Get a heatmap image.""" + url = f"https://{_ENV.testing_owner}.github.io/dashboard/?data_sha={_ENV.sha}" + browser = await launch() + page = await browser.newPage() + await page.goto(url, waitUntil="networkidle0") + svg_string = await page.evaluate( + """() => { + let svg = document.querySelector('svg'); + return svg ? svg.outerHTML : null; + }""" + ) + if svg_string is not None: + _heatmap = Heatmap("heatmap", svg_string) + add_heatmap_to_branch(_heatmap) + heatmap = _raw_image_path( + _ENV.testing_owner, _ENV.repo, _ENV.sha, Path(f"{_heatmap.filename}.png") + ) + heatmap = f"[![heatmap]({heatmap})]({url})" + else: + heatmap = "" + + await browser.close() + return heatmap + + def main() -> None: """Generate and post a comment on a GitHub commit. @@ -105,14 +192,22 @@ def main() -> None: print("SHA: The SHA of the commit.") print("TESTING_OWNER: The owner of the testing repository.") sys.exit(0) + elif sys.argv[1] in ["-v", "--version"]: + print(f"{__name__} version {__version__}") + sys.exit(0) path = Path(sys.argv[1]) else: path = Path(os.getcwd()) + asyncio.run(post_comment(path)) + + +async def post_comment(path: Path) -> None: + """Post a comment on a GitHub commit and relevant PR.""" personal_access_token = os.environ.get("GITHUB_TOKEN") g = Github(personal_access_token) repo = g.get_repo(f"{_ENV.owner}/{_ENV.repo}") commit = repo.get_commit(_ENV.sha) - comment = generate_comment(path) + comment = await generate_comment(path) commit.create_comment(comment) for pr in repo.get_pulls(state="open", sort="created"): if pr.head.sha == _ENV.sha: diff --git a/src/regression_dashboard/templates/heatmap.html b/src/cpac_regression_dashboard/templates/heatmap.html similarity index 100% rename from src/regression_dashboard/templates/heatmap.html rename to src/cpac_regression_dashboard/templates/heatmap.html diff --git a/src/regression_dashboard/templates/heatmap.js b/src/cpac_regression_dashboard/templates/heatmap.js similarity index 100% rename from src/regression_dashboard/templates/heatmap.js rename to src/cpac_regression_dashboard/templates/heatmap.js diff --git a/src/regression_dashboard/utils/__pycache__/parse_yaml.cpython-311.pyc b/src/cpac_regression_dashboard/utils/__pycache__/parse_yaml.cpython-311.pyc similarity index 100% rename from src/regression_dashboard/utils/__pycache__/parse_yaml.cpython-311.pyc rename to src/cpac_regression_dashboard/utils/__pycache__/parse_yaml.cpython-311.pyc diff --git a/src/regression_dashboard/utils/html_script.py b/src/cpac_regression_dashboard/utils/html_script.py similarity index 100% rename from src/regression_dashboard/utils/html_script.py rename to src/cpac_regression_dashboard/utils/html_script.py diff --git a/src/regression_dashboard/utils/parse_yaml.py b/src/cpac_regression_dashboard/utils/parse_yaml.py similarity index 100% rename from src/regression_dashboard/utils/parse_yaml.py rename to src/cpac_regression_dashboard/utils/parse_yaml.py diff --git a/src/regression_dashboard/__init__.py b/src/regression_dashboard/__init__.py deleted file mode 100644 index e69de29..0000000 From e3c5aada83e2351123ce8a09afefbcb1df365f8a Mon Sep 17 00:00:00 2001 From: Jon Clucas Date: Fri, 19 Jan 2024 18:29:18 -0500 Subject: [PATCH 10/28] :necktie: Only walk log_dir if exists --- .../utils/parse_yaml.py | 66 +++++++++++++------ 1 file changed, 45 insertions(+), 21 deletions(-) diff --git a/src/cpac_regression_dashboard/utils/parse_yaml.py b/src/cpac_regression_dashboard/utils/parse_yaml.py index 73af52f..0aa504d 100644 --- a/src/cpac_regression_dashboard/utils/parse_yaml.py +++ b/src/cpac_regression_dashboard/utils/parse_yaml.py @@ -1,9 +1,17 @@ +"""From a pair of CPAC output directories, write a YAML file for regression.""" import os +from typing import Optional, Union import yaml +_PIPELINE_DICT = dict[Optional[str], dict[str, Optional[Union[str, int]]]] +_FULL_YAML_DICT = dict[ + str, Union[dict[str, Union[bool, int, Optional[str]]], _PIPELINE_DICT] +] -def get_dir(paths): + +def get_dir(paths: str) -> Optional[str]: + """Get the full path to a ``pipeline_*`` directory.""" if not paths: directory = None else: @@ -15,12 +23,13 @@ def get_dir(paths): def write_pipeline_yaml( - output_dir=None, - working_dir=None, - log_dir=None, - pipeline_config=None, - pipeline_name=None, -): + output_dir: Optional[str] = None, + working_dir: Optional[str] = None, + log_dir: Optional[str] = None, + pipeline_config: Optional[str] = None, + pipeline_name: Optional[str] = None, +) -> _PIPELINE_DICT: + """Collect paths and strings to write.""" return { pipeline_name: { "output_dir": output_dir, @@ -32,24 +41,27 @@ def write_pipeline_yaml( } -def parse_yaml(directory=None, pipeline_name=None): +def parse_yaml(directory: str, pipeline_name: str) -> _PIPELINE_DICT: + """Parse a CPAC output directory for pipeline information.""" subdirs = ["log", "working", "output"] - paths = {} + paths: dict[str, Optional[str]] = {} for subdir in subdirs: if os.path.isdir(os.path.join(directory, subdir)): paths[f"{subdir}_dir"] = os.path.join(directory, subdir) else: paths[f"{subdir}_dir"] = None - - log_dir = get_dir(paths["log_dir"]) - - for root, dirs, files in os.walk(paths["log_dir"]): - for file in files: - if file.endswith("Z.yml"): - pipeline_config = os.path.join(root, file) - + assert isinstance(paths["log_dir"], str) + log_dir: Optional[str] = get_dir(paths["log_dir"]) + + if log_dir is not None: + for root, _dirs, files in os.walk(paths["log_dir"]): + for file in files: + if file.endswith("Z.yml"): + pipeline_config = os.path.join(root, file) + assert isinstance(paths["working_dir"], str) working_dir = get_dir(paths["working_dir"]) + assert isinstance(paths["output_dir"], str) output_dir = get_dir(paths["output_dir"]) return write_pipeline_yaml( @@ -58,9 +70,14 @@ def parse_yaml(directory=None, pipeline_name=None): def write_yaml( - pipeline_1=None, pipeline_2=None, correlations_dir=None, run_name=None, n_cpus=None -): - yaml_dict = {} + pipeline_1: _PIPELINE_DICT, + pipeline_2: _PIPELINE_DICT, + correlations_dir: Optional[str] = None, + run_name: Optional[str] = None, + n_cpus: Optional[int] = None, +) -> _FULL_YAML_DICT: + """Combine settings and both pipelines into a single dictionary.""" + yaml_dict: _FULL_YAML_DICT = {} yaml_dict["settings"] = { "n_cpus": n_cpus, "correlations_dir": correlations_dir, @@ -76,8 +93,15 @@ def write_yaml( def cpac_yaml( - pipeline1, pipeline2, correlations_dir, run_name, n_cpus, branch, data_source + pipeline1: str, + pipeline2: str, + correlations_dir: str, + run_name: str, + n_cpus: int, + branch: str, + data_source: str, ) -> None: + """Write a YAML file for the regression run.""" pipeline_1 = parse_yaml(pipeline1, "pipeline_1") pipeline_2 = parse_yaml(pipeline2, "pipeline_2") From 71bb23bbbcb550a8effbd1135d395e56fafc1e63 Mon Sep 17 00:00:00 2001 From: Jon Clucas Date: Fri, 19 Jan 2024 18:34:19 -0500 Subject: [PATCH 11/28] :package: Make `cpac_regsuite_correlate` a CLI script --- pyproject.toml | 1 + src/cpac_regression_dashboard/calculate_correlations.py | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index fb14f3a..0b785d7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ pytest-cov = "^4.1.0" ruff = "^0.1.7" [tool.poetry.scripts] +cpac_regsuite_correlate = 'cpac_regression_dashboard.cpac_correlations:main' cpac_regsuite_create_yaml = 'cpac_regression_dashboard.create_yml:main' cpac_regsuite_create_yml = 'cpac_regression_dashboard.create_yml:main' cpac_regsuite_generate_comment = 'cpac_regression_dashboard.generate_comment:main' diff --git a/src/cpac_regression_dashboard/calculate_correlations.py b/src/cpac_regression_dashboard/calculate_correlations.py index d0b7326..2477282 100644 --- a/src/cpac_regression_dashboard/calculate_correlations.py +++ b/src/cpac_regression_dashboard/calculate_correlations.py @@ -4,8 +4,15 @@ from .utils.html_script import body -if __name__ == "__main__": + +def main() -> None: # noqa: D103 all_keys, data_source, branch = cpac_correlations() html_body = body(all_keys, data_source) with open(f"{data_source}_{branch}.json", "w", encoding="utf-8") as file: file.write(html_body) + + +main.__doc__ = __doc__ + +if __name__ == "__main__": + main() From b7e3c0d6e30d9dca88e7e93592cd15df82171a44 Mon Sep 17 00:00:00 2001 From: Jon Clucas Date: Mon, 24 Jun 2024 14:21:38 -0400 Subject: [PATCH 12/28] :alien: FCP-INDI/CPAC_regtest_pack#7 is merged --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0b785d7..c884bf3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ cairosvg = "*" gitpython = "*" PyGithub = "*" pyppeteer = "*" -cpac-correlations = {git = "https://github.com/FCP-INDI/CPAC_regtest_pack.git", rev = "package_correlations", subdirectory = "cpac_correlations"} +cpac-correlations = {git = "https://github.com/FCP-INDI/CPAC_regtest_pack.git", subdirectory = "cpac_correlations"} [tool.poetry.group.dev.dependencies] pytest = "^7.4.3" From eb56a7916e70db3bc76cd98cc1f0f069dbee00c7 Mon Sep 17 00:00:00 2001 From: Jon Clucas Date: Mon, 5 Aug 2024 11:40:38 -0400 Subject: [PATCH 13/28] :necktie: Return path to regression YAML when generating --- src/cpac_regression_dashboard/utils/parse_yaml.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/cpac_regression_dashboard/utils/parse_yaml.py b/src/cpac_regression_dashboard/utils/parse_yaml.py index 0aa504d..bf93449 100644 --- a/src/cpac_regression_dashboard/utils/parse_yaml.py +++ b/src/cpac_regression_dashboard/utils/parse_yaml.py @@ -1,5 +1,6 @@ """From a pair of CPAC output directories, write a YAML file for regression.""" import os +from pathlib import Path from typing import Optional, Union import yaml @@ -100,14 +101,17 @@ def cpac_yaml( n_cpus: int, branch: str, data_source: str, -) -> None: +) -> Path: """Write a YAML file for the regression run.""" - pipeline_1 = parse_yaml(pipeline1, "pipeline_1") - pipeline_2 = parse_yaml(pipeline2, "pipeline_2") + pipeline_1: _PIPELINE_DICT = parse_yaml(pipeline1, "pipeline_1") + pipeline_2: _PIPELINE_DICT = parse_yaml(pipeline2, "pipeline_2") - yaml_contents = write_yaml( + yaml_contents: _FULL_YAML_DICT = write_yaml( pipeline_1, pipeline_2, correlations_dir, run_name, n_cpus ) - with open(f"{branch}_{data_source}.yml", "w") as file: + yaml_path: Path = Path(f"{branch}_{data_source}.yml") + """Path to YAML file for regression correlation.""" + with yaml_path.open("w") as file: yaml.dump(yaml_contents, file, default_flow_style=False, sort_keys=False) + return yaml_path From 63ee4764cc5dc12f7b579ce932ad87398c632480 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Theodore=20=F0=9F=90=88=F0=9F=A4=96?= Date: Mon, 12 Aug 2024 12:15:15 -0400 Subject: [PATCH 14/28] :alien: Update regtest dev branch --- pyproject.toml | 2 +- .../utils/__pycache__/parse_yaml.cpython-311.pyc | Bin 3382 -> 0 bytes 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 src/cpac_regression_dashboard/utils/__pycache__/parse_yaml.cpython-311.pyc diff --git a/pyproject.toml b/pyproject.toml index c884bf3..15a8fa3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ cairosvg = "*" gitpython = "*" PyGithub = "*" pyppeteer = "*" -cpac-correlations = {git = "https://github.com/FCP-INDI/CPAC_regtest_pack.git", subdirectory = "cpac_correlations"} +cpac-correlations = {git = "https://github.com/FCP-INDI/CPAC_regtest_pack.git", subdirectory = "cpac_correlations", branch = "correlate_from_python"} [tool.poetry.group.dev.dependencies] pytest = "^7.4.3" diff --git a/src/cpac_regression_dashboard/utils/__pycache__/parse_yaml.cpython-311.pyc b/src/cpac_regression_dashboard/utils/__pycache__/parse_yaml.cpython-311.pyc deleted file mode 100644 index 206699b974352d7157f108b335e0e879d71cedac..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3382 zcmbtWO>7&-6`tY#aF>6wC`zJaNl|4>3>8_d;vdO>MchhuYL?{NILGN zv%7C*-n@A`^X9$p&5K~rhoCTj`@8ge5utz4PQ7?)jn_Yg#x|0WgtKU|TH{45dA>x8 z9ubL1@=DB?c#)9=iG`Z2@i=I65)V?mgg-%X@!wS8LK@ff(u#+Vf$wz*$TrfE%1Fq- zxmm4AYNgTE(Q|0OZ1iW*ZxX+T`G&_}3|42LKW1dIEPQGfh>0x!83*y) zj1`G)iksr`kIwD(?sflZXm`j#j5y7NRhF+hIY;!bdAD$ zI6SiJeGzyT*t@@fu{=Co8J;fApEw*9=24*gfQ`I{|DiNTX{~snb`P~58gQtrj&uwL zhNG466E1@dPUl>+#yQ)k6tsN7-GeQxD`b`K#`b2FWw+s{c2B02+)`%Q4iGt?O{Hbm zFn>XDo^-(^sd`KvA{4MfQiC9L}QE$y>vT#Qr~RI-|3H zK$7RF=LRfpYe_=K=mPYo4N(_& zXt2FeB(`nut#EN_^&hq*uZyjgG<;h#TesTw$Gvrb{x(Vet&#zaI%cit0o|{2x=+W? z!6q*ob}@o_5F*OE5NB)#V#oH@V#4OD@ny3vO3>(d#4S#4tYr!PqZ>eRflx(+dZx`~ zR0<~oWl&5Z)Oxlc=OlGKqdjzEAs!+$ZIH8c$g36}P)H;6Xl=gQbHkAMsZfBDARS{V z01^rcr~kH3BU8$xHI;^06&UNZ8kfFP*duX}p&_YWgHr;i3ymMRuX)D`gv`04aG!%% zZrmlC{GmU%HU0SJS2v%ApY;B5;Ol{%%zjU~Z>G{WQ}$o2_^(nd96ua;cX#NiW}vAN3F+~!>ISH%Q<%7KxCz{t^Pyq$F%>MqKq`AW}NITWjeV&U8($fv&&nO6m5xyd;o$Yd zk@LH;7ZcAW_O$)`<&kTZk!!^c3cTX=?*!D3l)8&!iPdX4#aMQF5va z=K?9@l2r&+FDBDOmLMAb{&U>g+P-kHS%FV8V(FK<>0U zd#tXg6Z8)94gzA%Z?2TtJ}1l!`K(aXVW2*Onu8j~R?BR^!o4d6X0y4 zW`xrr3$)>hw`ClR-!?92O~|Bsb%7of;Cm}Bh+5MD>l~mnqw_%7r>xX=;kk?qpVqYA z6}G-`UDSONFY$l%J_p{*1|QSQAUv%r!zkxJho+Fqu6usRe2&&JMxP@LkAwiDgavXD z6cg%WHcmcbDHz#!{dU}Ahb4I_Rmf_|rL3}^RJDz)Y>TQwwB)M1p~g8k^xb2#StT#$ zY?huQ+^-62`MCeoU#3o*Z}ztM8YF^d@rT&D@qxnxal}EbWT+Pd;I@+Y(a-Q5Y>kaO TOqi-_CGYyN-$8E@cftP!;c@}m From 89fc4880de0b980d097976f1f1ea186cb5fdfbad Mon Sep 17 00:00:00 2001 From: Jon Clucas Date: Tue, 27 Aug 2024 12:33:49 -0400 Subject: [PATCH 15/28] :bug: Make `directory = paths` if `paths` is an empty dir --- src/cpac_regression_dashboard/utils/parse_yaml.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/cpac_regression_dashboard/utils/parse_yaml.py b/src/cpac_regression_dashboard/utils/parse_yaml.py index bf93449..2593f30 100644 --- a/src/cpac_regression_dashboard/utils/parse_yaml.py +++ b/src/cpac_regression_dashboard/utils/parse_yaml.py @@ -13,9 +13,8 @@ def get_dir(paths: str) -> Optional[str]: """Get the full path to a ``pipeline_*`` directory.""" - if not paths: - directory = None - else: + directory = paths + if directory: for root, dirs, files in os.walk(paths): for _dir in dirs: if "pipeline_" in _dir: From 7447bd6a542e15530b4f32448a8d32fb48bde987 Mon Sep 17 00:00:00 2001 From: Jon Clucas Date: Tue, 27 Aug 2024 13:40:44 -0400 Subject: [PATCH 16/28] :goal_net: Coerce `n_cpus` to int --- .../utils/parse_yaml.py | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/src/cpac_regression_dashboard/utils/parse_yaml.py b/src/cpac_regression_dashboard/utils/parse_yaml.py index 2593f30..2b97856 100644 --- a/src/cpac_regression_dashboard/utils/parse_yaml.py +++ b/src/cpac_regression_dashboard/utils/parse_yaml.py @@ -1,14 +1,12 @@ """From a pair of CPAC output directories, write a YAML file for regression.""" import os from pathlib import Path -from typing import Optional, Union +from typing import cast, Optional import yaml -_PIPELINE_DICT = dict[Optional[str], dict[str, Optional[Union[str, int]]]] -_FULL_YAML_DICT = dict[ - str, Union[dict[str, Union[bool, int, Optional[str]]], _PIPELINE_DICT] -] +_PIPELINE_DICT = dict[Optional[str], dict[str, Optional[int | str]]] +_FULL_YAML_DICT = dict[str, dict[str, bool | int | Optional[str]] | _PIPELINE_DICT] def get_dir(paths: str) -> Optional[str]: @@ -72,20 +70,23 @@ def parse_yaml(directory: str, pipeline_name: str) -> _PIPELINE_DICT: def write_yaml( pipeline_1: _PIPELINE_DICT, pipeline_2: _PIPELINE_DICT, - correlations_dir: Optional[str] = None, - run_name: Optional[str] = None, - n_cpus: Optional[int] = None, + correlations_dir: str, + run_name: str, + n_cpus: int = 1, ) -> _FULL_YAML_DICT: """Combine settings and both pipelines into a single dictionary.""" yaml_dict: _FULL_YAML_DICT = {} - yaml_dict["settings"] = { - "n_cpus": n_cpus, - "correlations_dir": correlations_dir, - "run_name": run_name, - "s3_creds": None, - "quick": False, - "verbose": False, - } + yaml_dict["settings"] = cast( + dict[str, bool | int | Optional[str]] | _PIPELINE_DICT, + { + "n_cpus": int(n_cpus), + "correlations_dir": correlations_dir, + "run_name": run_name, + "s3_creds": None, + "quick": False, + "verbose": False, + }, + ) yaml_dict["pipelines"] = {**pipeline_1, **pipeline_2} From 8b71ac982daebcc11f9318dec6232c4959992f8d Mon Sep 17 00:00:00 2001 From: Jon Clucas Date: Tue, 27 Aug 2024 14:41:45 -0400 Subject: [PATCH 17/28] :packaging: Fix corrleation script target --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 15a8fa3..0259653 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ pytest-cov = "^4.1.0" ruff = "^0.1.7" [tool.poetry.scripts] -cpac_regsuite_correlate = 'cpac_regression_dashboard.cpac_correlations:main' +cpac_regsuite_correlate = 'cpac_regression_dashboard.calculate_correlations:main' cpac_regsuite_create_yaml = 'cpac_regression_dashboard.create_yml:main' cpac_regsuite_create_yml = 'cpac_regression_dashboard.create_yml:main' cpac_regsuite_generate_comment = 'cpac_regression_dashboard.generate_comment:main' From c4d6fed9ebbafc1bbf04d9f676f7e9e23f395e75 Mon Sep 17 00:00:00 2001 From: Jon Clucas Date: Wed, 28 Aug 2024 16:01:45 -0400 Subject: [PATCH 18/28] :necktie: Make JSON an Array --- src/cpac_regression_dashboard/utils/html_script.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/cpac_regression_dashboard/utils/html_script.py b/src/cpac_regression_dashboard/utils/html_script.py index 99d60a6..8442e4e 100644 --- a/src/cpac_regression_dashboard/utils/html_script.py +++ b/src/cpac_regression_dashboard/utils/html_script.py @@ -1,4 +1,4 @@ -def dataset(name, data_source, value) -> str: +def dataset(name: str, data_source: str, value: float | int | str) -> str: return f""" {{ "rowid": "{name}", @@ -8,13 +8,14 @@ def dataset(name, data_source, value) -> str: """ -def body(all_keys, data_source): - data_body = "" +def body(all_keys: list[str], data_source: str) -> str: + data_body: str = "[" for key in all_keys: name_value = key.split(": ") name = name_value[0] value = name_value[1] data_body += dataset(name, data_source, value) + data_body += "]" return data_body From 2fe78c3c0e63179cf9ae3f1bf797c3a4ff3ed70a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Theodore=20=F0=9F=90=88=F0=9F=A4=96?= Date: Fri, 20 Sep 2024 20:33:12 -0400 Subject: [PATCH 19/28] :necktie: Make correlations D3-readble JSON --- src/cpac_regression_dashboard/calculate_correlations.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/cpac_regression_dashboard/calculate_correlations.py b/src/cpac_regression_dashboard/calculate_correlations.py index 2477282..ec21582 100644 --- a/src/cpac_regression_dashboard/calculate_correlations.py +++ b/src/cpac_regression_dashboard/calculate_correlations.py @@ -1,15 +1,18 @@ #!/usr/bin/env python """Calculate correlations and write them to D3-friendly file.""" +import json + from cpac_correlations import cpac_correlations from .utils.html_script import body def main() -> None: # noqa: D103 + """Gather correlation coefficients and write them to D3-readable JSON.""" all_keys, data_source, branch = cpac_correlations() html_body = body(all_keys, data_source) with open(f"{data_source}_{branch}.json", "w", encoding="utf-8") as file: - file.write(html_body) + file.write(json.dumps(json.loads(f"[{html_body.strip().strip(',')}]"))) main.__doc__ = __doc__ From de1ba8f68044f5cbc4af7689416fe86962d6c871 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Theodore=20=F0=9F=90=88=F0=9F=A4=96?= Date: Wed, 25 Sep 2024 13:37:22 -0400 Subject: [PATCH 20/28] :art: Clean up JSON format --- src/cpac_regression_dashboard/utils/html_script.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/cpac_regression_dashboard/utils/html_script.py b/src/cpac_regression_dashboard/utils/html_script.py index 8442e4e..020c090 100644 --- a/src/cpac_regression_dashboard/utils/html_script.py +++ b/src/cpac_regression_dashboard/utils/html_script.py @@ -1,3 +1,6 @@ +import json + + def dataset(name: str, data_source: str, value: float | int | str) -> str: return f""" {{ @@ -15,8 +18,9 @@ def body(all_keys: list[str], data_source: str) -> str: name = name_value[0] value = name_value[1] data_body += dataset(name, data_source, value) + data_body = data_body.strip() data_body += "]" - return data_body + return json.dumps(json.loads(data_body)) def write_html(data_body) -> str: From c92a7281583722f0c51c98cbcb8d5e5fab23d43e Mon Sep 17 00:00:00 2001 From: Jon Clucas Date: Thu, 26 Sep 2024 13:58:43 -0400 Subject: [PATCH 21/28] :goal_net: Specify AssertionErrors --- src/cpac_regression_dashboard/utils/parse_yaml.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cpac_regression_dashboard/utils/parse_yaml.py b/src/cpac_regression_dashboard/utils/parse_yaml.py index 2b97856..8d50688 100644 --- a/src/cpac_regression_dashboard/utils/parse_yaml.py +++ b/src/cpac_regression_dashboard/utils/parse_yaml.py @@ -49,7 +49,7 @@ def parse_yaml(directory: str, pipeline_name: str) -> _PIPELINE_DICT: paths[f"{subdir}_dir"] = os.path.join(directory, subdir) else: paths[f"{subdir}_dir"] = None - assert isinstance(paths["log_dir"], str) + assert isinstance(paths["log_dir"], str), f"log_dir: {paths['log_dir']}" log_dir: Optional[str] = get_dir(paths["log_dir"]) if log_dir is not None: @@ -57,9 +57,9 @@ def parse_yaml(directory: str, pipeline_name: str) -> _PIPELINE_DICT: for file in files: if file.endswith("Z.yml"): pipeline_config = os.path.join(root, file) - assert isinstance(paths["working_dir"], str) + assert isinstance(paths["working_dir"], str), f"working_dir: {paths['working_dir']}" working_dir = get_dir(paths["working_dir"]) - assert isinstance(paths["output_dir"], str) + assert isinstance(paths["output_dir"], str), f"output_dir: {paths['output_dir']}" output_dir = get_dir(paths["output_dir"]) return write_pipeline_yaml( From 17eb0a8f3337f7c303c389873c6ffb2c75251745 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Theodore=20=F0=9F=90=88=F0=9F=A4=96?= Date: Fri, 27 Sep 2024 11:07:23 -0400 Subject: [PATCH 22/28] :heavy_plus_sign:/:heavy_minus_sign: Replace `pyppeteer` with `playwright` --- pyproject.toml | 2 +- .../generate_comment.py | 42 ++++++++++--------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0259653..4e07978 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ python = ">=3.9" cairosvg = "*" gitpython = "*" PyGithub = "*" -pyppeteer = "*" +playwright = "*" cpac-correlations = {git = "https://github.com/FCP-INDI/CPAC_regtest_pack.git", subdirectory = "cpac_correlations", branch = "correlate_from_python"} [tool.poetry.group.dev.dependencies] diff --git a/src/cpac_regression_dashboard/generate_comment.py b/src/cpac_regression_dashboard/generate_comment.py index 3ddffa2..933633a 100644 --- a/src/cpac_regression_dashboard/generate_comment.py +++ b/src/cpac_regression_dashboard/generate_comment.py @@ -13,7 +13,7 @@ from cairosvg import svg2png from git import Repo from github import Github -from pyppeteer import launch +from playwright.async_api import async_playwright from ._version import __version__ @@ -150,26 +150,30 @@ async def generate_comment(path: Path) -> str: async def get_heatmap() -> str: """Get a heatmap image.""" url = f"https://{_ENV.testing_owner}.github.io/dashboard/?data_sha={_ENV.sha}" - browser = await launch() - page = await browser.newPage() - await page.goto(url, waitUntil="networkidle0") - svg_string = await page.evaluate( - """() => { - let svg = document.querySelector('svg'); - return svg ? svg.outerHTML : null; - }""" - ) - if svg_string is not None: - _heatmap = Heatmap("heatmap", svg_string) - add_heatmap_to_branch(_heatmap) - heatmap = _raw_image_path( - _ENV.testing_owner, _ENV.repo, _ENV.sha, Path(f"{_heatmap.filename}.png") + async with async_playwright() as p: + browser = await p.chromium.launch(headless=True) + page = await browser.new_page() + await page.goto(url, wait_until="networkidle") + svg_string = await page.evaluate( + """() => { + let svg = document.querySelector('svg'); + return svg ? svg.outerHTML : null; + }""" ) - heatmap = f"[![heatmap]({heatmap})]({url})" - else: - heatmap = "" + if svg_string is not None: + _heatmap = Heatmap("heatmap", svg_string) + add_heatmap_to_branch(_heatmap) + heatmap = _raw_image_path( + _ENV.testing_owner, + _ENV.repo, + _ENV.sha, + Path(f"{_heatmap.filename}.png"), + ) + heatmap = f"[![heatmap]({heatmap})]({url})" + else: + heatmap = "" - await browser.close() + await browser.close() return heatmap From 814e02461ee229ca12317816e72972cf63403f75 Mon Sep 17 00:00:00 2001 From: Jon Clucas Date: Fri, 27 Sep 2024 14:52:57 -0400 Subject: [PATCH 23/28] :goal_net: Warn if playwright + chromium fails --- .../generate_comment.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/cpac_regression_dashboard/generate_comment.py b/src/cpac_regression_dashboard/generate_comment.py index 933633a..ba004f1 100644 --- a/src/cpac_regression_dashboard/generate_comment.py +++ b/src/cpac_regression_dashboard/generate_comment.py @@ -151,15 +151,22 @@ async def get_heatmap() -> str: """Get a heatmap image.""" url = f"https://{_ENV.testing_owner}.github.io/dashboard/?data_sha={_ENV.sha}" async with async_playwright() as p: - browser = await p.chromium.launch(headless=True) - page = await browser.new_page() - await page.goto(url, wait_until="networkidle") - svg_string = await page.evaluate( - """() => { - let svg = document.querySelector('svg'); - return svg ? svg.outerHTML : null; - }""" - ) + try: + browser = await p.chromium.launch(headless=True) + page = await browser.new_page() + await page.goto(url, wait_until="networkidle") + svg_string = await page.evaluate( + """() => { + let svg = document.querySelector('svg'); + return svg ? svg.outerHTML : null; +}""" + ) + except Exception as exception: + from warnings import warn + + warn( + f"{exception}\n\nAre playwright and chromium installed?", RuntimeWarning + ) if svg_string is not None: _heatmap = Heatmap("heatmap", svg_string) add_heatmap_to_branch(_heatmap) From 9eaaf35c68798ae3f3b69d7861fd27017abf8d8f Mon Sep 17 00:00:00 2001 From: Jon Clucas Date: Fri, 18 Oct 2024 11:21:47 -0400 Subject: [PATCH 24/28] :memo: Document `$PLAYWRIGHT_BROWSERS_PATH` --- pyproject.toml | 1 + src/cpac_regression_dashboard/generate_comment.py | 1 + 2 files changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 4e07978..3f093e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ cpac_regsuite_correlate = 'cpac_regression_dashboard.calculate_correlations:main cpac_regsuite_create_yaml = 'cpac_regression_dashboard.create_yml:main' cpac_regsuite_create_yml = 'cpac_regression_dashboard.create_yml:main' cpac_regsuite_generate_comment = 'cpac_regression_dashboard.generate_comment:main' +"cpac-regsuite-generate-comment" = 'cpac_regression_dashboard.generate_comment:main' [tool.poetry.urls] "Source Code" = "https://github.com/FCP-INDI/C-PAC_regression_dashboard" diff --git a/src/cpac_regression_dashboard/generate_comment.py b/src/cpac_regression_dashboard/generate_comment.py index ba004f1..7643719 100644 --- a/src/cpac_regression_dashboard/generate_comment.py +++ b/src/cpac_regression_dashboard/generate_comment.py @@ -199,6 +199,7 @@ def main() -> None: "comments and pull requests." ) print("OWNER: The owner of the repository.") + print("PLAYWRIGHT_BROWSERS_PATH: The path for Playwright browsers.") print("REPO: The name of the repository.") print("SHA: The SHA of the commit.") print("TESTING_OWNER: The owner of the testing repository.") From 646a47e6bde86a1664b725490bb701f255092a12 Mon Sep 17 00:00:00 2001 From: Jon Clucas Date: Fri, 18 Oct 2024 16:19:39 -0400 Subject: [PATCH 25/28] :necktie: Set personal access token in `git.Repo` --- src/cpac_regression_dashboard/generate_comment.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/cpac_regression_dashboard/generate_comment.py b/src/cpac_regression_dashboard/generate_comment.py index 7643719..e890c2e 100644 --- a/src/cpac_regression_dashboard/generate_comment.py +++ b/src/cpac_regression_dashboard/generate_comment.py @@ -58,14 +58,18 @@ def add_heatmap_to_branch(file: Heatmap) -> None: ------- None """ - personal_access_token = os.environ.get("GITHUB_TOKEN") - g = Github(personal_access_token) + g = Github(_ENV.github_token) repo = g.get_repo(f"{_ENV.testing_owner}/regtest-runlogs") branch_name = f"{_ENV.repo}_{_ENV.sha}" with tempfile.TemporaryDirectory() as _temp_dir: temp_dir = Path(_temp_dir) local_repo = Repo.clone_from( - repo.clone_url, temp_dir, branch=branch_name, depth=1 + repo.clone_url.replace( + "https://", f"https://${_ENV.github_token}:x-oauth-basic@" + ), + temp_dir, + branch=branch_name, + depth=1, ) svg_path = temp_dir / f"{file.filename}.svg" png_path = temp_dir / f"{file.filename}.png" From e9443814a386e4e3cf43e492a982a03b7c134677 Mon Sep 17 00:00:00 2001 From: Jon Clucas Date: Mon, 21 Oct 2024 12:35:35 -0400 Subject: [PATCH 26/28] :necktie: Adjust git fetch-pull-(force)push flow --- src/cpac_regression_dashboard/generate_comment.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/cpac_regression_dashboard/generate_comment.py b/src/cpac_regression_dashboard/generate_comment.py index e890c2e..29d2abe 100644 --- a/src/cpac_regression_dashboard/generate_comment.py +++ b/src/cpac_regression_dashboard/generate_comment.py @@ -12,6 +12,7 @@ from cairosvg import svg2png from git import Repo +from git.exc import GitCommandError from github import Github from playwright.async_api import async_playwright @@ -71,6 +72,9 @@ def add_heatmap_to_branch(file: Heatmap) -> None: branch=branch_name, depth=1, ) + # make sure branch is up to date + local_repo.remotes.origin.fetch("+refs/heads/*:refs/remotes/origin/*") + local_repo.remotes.origin.pull(branch_name) svg_path = temp_dir / f"{file.filename}.svg" png_path = temp_dir / f"{file.filename}.png" with open(svg_path, "w") as _f: @@ -78,7 +82,10 @@ def add_heatmap_to_branch(file: Heatmap) -> None: svg2png(background_color="white", url=str(svg_path), write_to=str(png_path)) local_repo.index.add([png_path]) local_repo.index.commit(":loud_sound: Add heatmap image") - local_repo.remotes.origin.push(branch_name) + try: + local_repo.remotes.origin.push(branch_name) + except GitCommandError: + local_repo.remotes.origin.push(branch_name, force=True) def gather_images(path: Path) -> Generator[Path, None, None]: From 0cc9df65bfa2d8d24d29954e30f04d03aa5c67d9 Mon Sep 17 00:00:00 2001 From: Jon Clucas Date: Tue, 22 Oct 2024 22:11:11 -0400 Subject: [PATCH 27/28] :construction_worker: Update Chromium before building heatmap + repost comment on any open PRs --- .pre-commit-config.yaml | 1 + .../generate_comment.py | 36 +++++++++++++++++-- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 267a008..4d6b15e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,6 +19,7 @@ repos: - id: mypy args: [--ignore-missing-imports] additional_dependencies: + - types-requests - types-toml - types-PyYAML - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks diff --git a/src/cpac_regression_dashboard/generate_comment.py b/src/cpac_regression_dashboard/generate_comment.py index 29d2abe..8e14f1a 100644 --- a/src/cpac_regression_dashboard/generate_comment.py +++ b/src/cpac_regression_dashboard/generate_comment.py @@ -6,15 +6,18 @@ from importlib.metadata import metadata import os from pathlib import Path +import subprocess import sys import tempfile -from typing import Generator +from typing import Generator, Optional from cairosvg import svg2png from git import Repo from git.exc import GitCommandError from github import Github +from github.Repository import Repository from playwright.async_api import async_playwright +import requests from ._version import __version__ @@ -160,6 +163,9 @@ async def generate_comment(path: Path) -> str: async def get_heatmap() -> str: """Get a heatmap image.""" + subprocess.run( + "playwright install chromium".split(" "), check=False + ) # update chromium url = f"https://{_ENV.testing_owner}.github.io/dashboard/?data_sha={_ENV.sha}" async with async_playwright() as p: try: @@ -224,9 +230,35 @@ def main() -> None: asyncio.run(post_comment(path)) +def repost_comment_on_pull_request( + repo: Repository, comment: str, pr: dict[str, str] +) -> None: + """Repost a commit comment on a PR containing that commit.""" + pr_number = pr["number"] + issue = repo.get_issue(number=pr_number) + issue.create_comment(comment) + + +def repost_comment_on_pull_requests(repo: Repository, comment: str) -> None: + """Repost a commit comment on all PR containing that commit.""" + pr_url: str = f"https://api.github.com/repos/{_ENV.owner}/{_ENV.repo}/commits/{_ENV.sha}/pulls" + headers: dict[str, str] = { + "Authorization": f"Bearer {_ENV.github_token}", + "Accept": "application/vnd.github.v3+json", + } + + response: requests.Response = requests.get(pr_url, headers=headers) + success_response = 200 + if response.status_code == success_response: + pull_requests: Optional[list[dict]] = response.json() + if pull_requests: + for pr in pull_requests: + repost_comment_on_pull_request(repo, comment, pr) + + async def post_comment(path: Path) -> None: """Post a comment on a GitHub commit and relevant PR.""" - personal_access_token = os.environ.get("GITHUB_TOKEN") + personal_access_token = _ENV.github_token g = Github(personal_access_token) repo = g.get_repo(f"{_ENV.owner}/{_ENV.repo}") commit = repo.get_commit(_ENV.sha) From c689b91ce5d4043ebf054facb5ed8072554cb351 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Theodore=20=F0=9F=90=88=F0=9F=A4=96?= Date: Wed, 6 Nov 2024 12:41:24 -0500 Subject: [PATCH 28/28] :bug: Remove lingering `$` --- src/cpac_regression_dashboard/generate_comment.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cpac_regression_dashboard/generate_comment.py b/src/cpac_regression_dashboard/generate_comment.py index 8e14f1a..04577d8 100644 --- a/src/cpac_regression_dashboard/generate_comment.py +++ b/src/cpac_regression_dashboard/generate_comment.py @@ -69,7 +69,7 @@ def add_heatmap_to_branch(file: Heatmap) -> None: temp_dir = Path(_temp_dir) local_repo = Repo.clone_from( repo.clone_url.replace( - "https://", f"https://${_ENV.github_token}:x-oauth-basic@" + "https://", f"https://{_ENV.github_token}:x-oauth-basic@" ), temp_dir, branch=branch_name, @@ -85,6 +85,7 @@ def add_heatmap_to_branch(file: Heatmap) -> None: svg2png(background_color="white", url=str(svg_path), write_to=str(png_path)) local_repo.index.add([png_path]) local_repo.index.commit(":loud_sound: Add heatmap image") + try: local_repo.remotes.origin.push(branch_name) except GitCommandError: