From f001a6ae4616c28d625d1deb079ae345790ff57f Mon Sep 17 00:00:00 2001 From: ntalluri Date: Tue, 9 Jul 2024 14:38:01 -0500 Subject: [PATCH 01/18] start of evaluation implementation --- Snakefile | 7 +++++++ config/config.yaml | 9 +++++++++ spras/evaluation.py | 46 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+) create mode 100644 spras/evaluation.py diff --git a/Snakefile b/Snakefile index 71a8a6ed..4e00a7c1 100644 --- a/Snakefile +++ b/Snakefile @@ -102,6 +102,9 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) + # if _config.config.evaluation_include: + # final_input.extend(expand('{out_dir}{sep}{dataset}-{goldstandard}.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) + if len(final_input) == 0: # No analysis added yet, so add reconstruction output files if they exist. # (if analysis is specified, these should be implicitly run). @@ -153,6 +156,10 @@ rule merge_input: dataset_dict = get_dataset(_config.config.datasets, wildcards.dataset) runner.merge_input(dataset_dict, output.dataset_file) +# TODO: add a merge input for gold standard data? +# may need to update runner.py to add a merge_gs_input function + + # The checkpoint is like a rule but can be used in dynamic workflows # The workflow directed acyclic graph is re-evaluated after the checkpoint job runs # If the checkpoint has not executed for the provided wildcard values, it will be run and then the rest of the diff --git a/config/config.yaml b/config/config.yaml index 741d8ca9..bcff35f2 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -115,6 +115,13 @@ datasets: other_files: [] # Relative path from the spras directory data_dir: "input" + +gold_standard: + - + label: gs + node_files: ["gs_nodes.txt"] + # edge_files: [] TODO: later iteration + data_dir: "input" # If we want to reconstruct then we should set run to true. # TODO: if include is true above but run is false here, algs are not run. @@ -156,3 +163,5 @@ analysis: linkage: 'ward' # 'euclidean', 'manhattan', 'cosine' metric: 'euclidean' + evaluation: + include: true diff --git a/spras/evaluation.py b/spras/evaluation.py new file mode 100644 index 00000000..e3df89fe --- /dev/null +++ b/spras/evaluation.py @@ -0,0 +1,46 @@ +import os +import pickle as pkl +import warnings + +import pandas as pd + +class Evaluation: + + def __init__(self, gold_standard_dict): + self.label = None + self.node_table = None + # self.edge_table = None TODO: later iteration + self.load_files_from_dict(gold_standard_dict) + return + + def to_file(self, file_name): + """ + Saves dataset object to pickle file + """ + with open(file_name, "wb") as f: + pkl.dump(self, f) + + @classmethod + def from_file(cls, file_name): + """ + Loads dataset object from a pickle file. + Usage: dataset = Dataset.from_file(pickle_file) + """ + with open(file_name, "rb") as f: + return pkl.load(f) + + def load_files_from_dict(self, gold_standard_dict): + + self.label = gold_standard_dict["label"] + node_data_files = gold_standard_dict["node_files"] + data_loc = gold_standard_dict["data_dir"] + + single_node_table = pd.read_table(os.path.join(data_loc, node_file)) + self.node_table = single_node_table + + # self.node_table = pd.DataFrame(node_set, columns=[self.NODE_ID]) + # for loop? and read in node dataset into a pandas df + +def percision_recall(): + None + # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html From 0e1a89da1c3c7d6b0ba5156da1f5af41a030f91c Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 15 Jul 2024 12:16:17 -0500 Subject: [PATCH 02/18] in progress --- input/gs_nodes.txt | 1 + spras/config.py | 7 ++++++- spras/evaluation.py | 4 ++-- 3 files changed, 9 insertions(+), 3 deletions(-) create mode 100644 input/gs_nodes.txt diff --git a/input/gs_nodes.txt b/input/gs_nodes.txt new file mode 100644 index 00000000..8c7e5a66 --- /dev/null +++ b/input/gs_nodes.txt @@ -0,0 +1 @@ +A \ No newline at end of file diff --git a/spras/config.py b/spras/config.py index 91676ca5..0faf0755 100644 --- a/spras/config.py +++ b/spras/config.py @@ -69,6 +69,8 @@ def __init__(self, raw_config): self.unpack_singularity = False # A dictionary to store configured datasets against which SPRAS will be run self.datasets = None + # a dictionary to store configured gold standard data against ouptut of SPRAS runs + self.gold_standard = None # The hash length SPRAS will use to identify parameter combinations. Default is 7 self.hash_length = DEFAULT_HASH_LENGTH # The list of algorithms to run in the workflow. Each is a dict with 'name' as an expected key. @@ -95,6 +97,7 @@ def __init__(self, raw_config): # A Boolean specifying whether to run the ML analysis self.analysis_include_ml = None + _raw_config = copy.deepcopy(raw_config) self.process_config(_raw_config) @@ -139,7 +142,9 @@ def process_config(self, raw_config): # When Snakemake parses the config file it loads the datasets as OrderedDicts not dicts # Convert to dicts to simplify the yaml logging self.datasets = {dataset["label"]: dict(dataset) for dataset in raw_config["datasets"]} - + print(self.datasets) + self.gold_standard = {goldstandard["label"]: dict(goldstandard) for goldstandard in raw_config["gold_standard"]} + print(self.gold_standard) # Code snipped from Snakefile that may be useful for assigning default labels # dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)] # Maps from the dataset label to the dataset list index diff --git a/spras/evaluation.py b/spras/evaluation.py index e3df89fe..46459314 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -38,9 +38,9 @@ def load_files_from_dict(self, gold_standard_dict): single_node_table = pd.read_table(os.path.join(data_loc, node_file)) self.node_table = single_node_table - # self.node_table = pd.DataFrame(node_set, columns=[self.NODE_ID]) - # for loop? and read in node dataset into a pandas df def percision_recall(): None # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html + +def \ No newline at end of file From fd495bd431264762169c368b84a30370adf4b51e Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 15 Jul 2024 12:24:47 -0500 Subject: [PATCH 03/18] still in progress --- Snakefile | 13 +++++++++++-- spras/config.py | 4 ++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/Snakefile b/Snakefile index 4e00a7c1..4ded1e3b 100644 --- a/Snakefile +++ b/Snakefile @@ -34,6 +34,8 @@ def get_dataset(_datasets, label): algorithms = list(algorithm_params) algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()] dataset_labels = list(_config.config.datasets.keys()) +gold_standard_labels = list(_config.config.gold_standard.keys()) +print("gold standard labels", gold_standard_labels) #TODO delete # Get algorithms that are running multiple parameter combinations def algo_has_mult_param_combos(algo): @@ -102,8 +104,9 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) - # if _config.config.evaluation_include: - # final_input.extend(expand('{out_dir}{sep}{dataset}-{goldstandard}.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) + if _config.config.evaluation_include: + # final_input.extend(expand('{out_dir}{sep}{dataset}-{goldstandard}.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-{gold_standard}-evaluation.txt',out_dir=out_dir, sep=SEP,dataset=dataset_labels,gold_standard=gold_standard_labels, algorithm_params=algorithms_with_params)) if len(final_input) == 0: # No analysis added yet, so add reconstruction output files if they exist. @@ -329,6 +332,12 @@ rule ml_analysis_aggregate_algo: ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params) ml.ensemble_network(summary_df, output.ensemble_network_file) +rule evaluation: + input: dataset_file = SEP.join([out_dir,'{dataset}-merged.pickle']) + output: eval_file = SEP.join([out_dir, "{dataset}-{gold_standard}-evaluation.txt"]) + run: + "touch $(eval_file)" + # Remove the output directory rule clean: shell: f'rm -rf {out_dir}' diff --git a/spras/config.py b/spras/config.py index 0faf0755..7982ae91 100644 --- a/spras/config.py +++ b/spras/config.py @@ -142,9 +142,7 @@ def process_config(self, raw_config): # When Snakemake parses the config file it loads the datasets as OrderedDicts not dicts # Convert to dicts to simplify the yaml logging self.datasets = {dataset["label"]: dict(dataset) for dataset in raw_config["datasets"]} - print(self.datasets) self.gold_standard = {goldstandard["label"]: dict(goldstandard) for goldstandard in raw_config["gold_standard"]} - print(self.gold_standard) # Code snipped from Snakefile that may be useful for assigning default labels # dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)] # Maps from the dataset label to the dataset list index @@ -229,3 +227,5 @@ def process_config(self, raw_config): self.analysis_include_ml_aggregate_algo = False else: self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"] + + \ No newline at end of file From ddacaa8d53fbfe1151747303c5e17ad08f240b76 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 15 Jul 2024 14:43:42 -0500 Subject: [PATCH 04/18] first full pass at implementing evaluation code --- Snakefile | 25 ++++++++++++++++++------- config/config.yaml | 4 ++-- input/gs_nodes.txt | 3 ++- spras/config.py | 10 ++++++++-- spras/evaluation.py | 39 ++++++++++++++++++++++++++++++--------- spras/runner.py | 10 ++++++++++ test/test_config.py | 4 ++++ 7 files changed, 74 insertions(+), 21 deletions(-) diff --git a/Snakefile b/Snakefile index 4ded1e3b..4ca0aaf6 100644 --- a/Snakefile +++ b/Snakefile @@ -3,6 +3,7 @@ from spras import runner import shutil import yaml from spras.dataset import Dataset +from spras.evaluation import Evaluation from spras.analysis import ml, summary, graphspace, cytoscape import spras.config as _config @@ -35,7 +36,6 @@ algorithms = list(algorithm_params) algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()] dataset_labels = list(_config.config.datasets.keys()) gold_standard_labels = list(_config.config.gold_standard.keys()) -print("gold standard labels", gold_standard_labels) #TODO delete # Get algorithms that are running multiple parameter combinations def algo_has_mult_param_combos(algo): @@ -104,8 +104,7 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) - if _config.config.evaluation_include: - # final_input.extend(expand('{out_dir}{sep}{dataset}-{goldstandard}.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) + if _config.config.analysis_include_evalution: final_input.extend(expand('{out_dir}{sep}{dataset}-{gold_standard}-evaluation.txt',out_dir=out_dir, sep=SEP,dataset=dataset_labels,gold_standard=gold_standard_labels, algorithm_params=algorithms_with_params)) if len(final_input) == 0: @@ -159,9 +158,18 @@ rule merge_input: dataset_dict = get_dataset(_config.config.datasets, wildcards.dataset) runner.merge_input(dataset_dict, output.dataset_file) -# TODO: add a merge input for gold standard data? -# may need to update runner.py to add a merge_gs_input function +def get_gs_dependencies(wildcards): + gs = _config.config.gold_standard[wildcards.gold_standard] + all_files = gs["node_files"] + all_files = [gs["data_dir"] + SEP + data_file for data_file in all_files] + return all_files +rule merge_gs_input: + input: get_gs_dependencies + output: gs_file = SEP.join([out_dir, '{gold_standard}-merged.pickle']) + run: + gs_dict = get_dataset(_config.config.gold_standard, wildcards.gold_standard) + runner.merge_gold_standard_input(gs_dict, output.gs_file) # The checkpoint is like a rule but can be used in dynamic workflows # The workflow directed acyclic graph is re-evaluated after the checkpoint job runs @@ -333,10 +341,13 @@ rule ml_analysis_aggregate_algo: ml.ensemble_network(summary_df, output.ensemble_network_file) rule evaluation: - input: dataset_file = SEP.join([out_dir,'{dataset}-merged.pickle']) + input: + gs_file = SEP.join([out_dir,'{gold_standard}-merged.pickle']), + pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params) output: eval_file = SEP.join([out_dir, "{dataset}-{gold_standard}-evaluation.txt"]) run: - "touch $(eval_file)" + node_table = Evaluation.from_file(input.gs_file).node_table + Evaluation.precision(input.pathways, node_table, output.eval_file) # Remove the output directory rule clean: diff --git a/config/config.yaml b/config/config.yaml index bcff35f2..646e0570 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -115,10 +115,10 @@ datasets: other_files: [] # Relative path from the spras directory data_dir: "input" - + gold_standard: - - label: gs + label: gs1 node_files: ["gs_nodes.txt"] # edge_files: [] TODO: later iteration data_dir: "input" diff --git a/input/gs_nodes.txt b/input/gs_nodes.txt index 8c7e5a66..2fe4ab08 100644 --- a/input/gs_nodes.txt +++ b/input/gs_nodes.txt @@ -1 +1,2 @@ -A \ No newline at end of file +A +B \ No newline at end of file diff --git a/spras/config.py b/spras/config.py index 7982ae91..5e3b600b 100644 --- a/spras/config.py +++ b/spras/config.py @@ -15,6 +15,7 @@ import copy as copy import itertools as it import os +import re import numpy as np import yaml @@ -142,7 +143,13 @@ def process_config(self, raw_config): # When Snakemake parses the config file it loads the datasets as OrderedDicts not dicts # Convert to dicts to simplify the yaml logging self.datasets = {dataset["label"]: dict(dataset) for dataset in raw_config["datasets"]} + self.gold_standard = {goldstandard["label"]: dict(goldstandard) for goldstandard in raw_config["gold_standard"]} + for key in self.gold_standard: + pattern = r'^\w+$' + if not bool(re.match(pattern, key)): + raise ValueError(f"Gold Standard label \'{key}\' contains invalid values. Gold Standard labels can only contain letters, numbers, or underscores.") + # Code snipped from Snakefile that may be useful for assigning default labels # dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)] # Maps from the dataset label to the dataset list index @@ -222,10 +229,9 @@ def process_config(self, raw_config): self.analysis_include_graphspace = raw_config["analysis"]["graphspace"]["include"] self.analysis_include_cytoscape = raw_config["analysis"]["cytoscape"]["include"] self.analysis_include_ml = raw_config["analysis"]["ml"]["include"] + self.analysis_include_evalution = raw_config["analysis"]["evaluation"]["include"] if 'aggregate_per_algorithm' not in self.ml_params: self.analysis_include_ml_aggregate_algo = False else: self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"] - - \ No newline at end of file diff --git a/spras/evaluation.py b/spras/evaluation.py index 46459314..caab8783 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -1,18 +1,23 @@ import os import pickle as pkl import warnings +from pathlib import Path +from typing import Iterable import pandas as pd +from sklearn.metrics import precision_score + class Evaluation: - + NODE_ID = "NODEID" + def __init__(self, gold_standard_dict): self.label = None self.node_table = None # self.edge_table = None TODO: later iteration self.load_files_from_dict(gold_standard_dict) return - + def to_file(self, file_name): """ Saves dataset object to pickle file @@ -30,17 +35,33 @@ def from_file(cls, file_name): return pkl.load(f) def load_files_from_dict(self, gold_standard_dict): - + self.label = gold_standard_dict["label"] - node_data_files = gold_standard_dict["node_files"] + node_data_files = gold_standard_dict["node_files"][0] # TODO: single file for now data_loc = gold_standard_dict["data_dir"] - single_node_table = pd.read_table(os.path.join(data_loc, node_file)) + single_node_table = pd.read_table(os.path.join(data_loc, node_data_files), header=None) + single_node_table.columns = [self.NODE_ID] self.node_table = single_node_table + # TODO: are we allowing multiple node files or single in node_files for gs + # if yes, a for loop is needed + + def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str): + + y_true = node_table['NODEID'].tolist() + results = [] + + for file in file_paths: + df = pd.read_table(file, sep="\t", header = 0, usecols=["Node1", "Node2"]) + y_pred = list(set(df['Node1']).union(set(df['Node2']))) + all_nodes = set(y_true).union(set(y_pred)) + y_true_binary = [1 if node in y_true else 0 for node in all_nodes] + y_pred_binary = [1 if node in y_pred else 0 for node in all_nodes] + + precision = precision_score(y_true_binary, y_pred_binary, zero_division=0.0) -def percision_recall(): - None - # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html + results.append({"Pathway": file, "Precision": precision}) -def \ No newline at end of file + precision_df = pd.DataFrame(results) + precision_df.to_csv(output_file, sep="\t", index=False) diff --git a/spras/runner.py b/spras/runner.py index 6ef26496..8cbc7b96 100644 --- a/spras/runner.py +++ b/spras/runner.py @@ -2,6 +2,7 @@ from spras.allpairs import AllPairs as allpairs from spras.dataset import Dataset from spras.domino import DOMINO as domino +from spras.evaluation import Evaluation from spras.meo import MEO as meo from spras.mincostflow import MinCostFlow as mincostflow from spras.omicsintegrator1 import OmicsIntegrator1 as omicsintegrator1 @@ -42,6 +43,15 @@ def merge_input(dataset_dict, dataset_file): dataset = Dataset(dataset_dict) dataset.to_file(dataset_file) +def merge_gold_standard_input(gs_dict, gs_file): + """ + Merge files listed for this gold standard dataset and write the dataset to disk + @param gs_dict: gold standard dataset to process + @param gs_file: output filename + """ + gs_dataset = Evaluation(gs_dict) + gs_dataset.to_file(gs_file) + def prepare_inputs(algorithm, data_file, filename_map): """ diff --git a/test/test_config.py b/test/test_config.py index 602f95af..ef1c1ebd 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -20,6 +20,7 @@ def get_test_config(): } }, "datasets": [{"label":"alg1"}, {"label":"alg2"}], + "gold_standard": [{"label":"gs1"}], "algorithms": [{"params": ["param2", "param2"]}], "analysis": { "summary": { @@ -34,6 +35,9 @@ def get_test_config(): "cytoscape": { "include": False }, + "evaluation": { + "include": False + }, }, } From 99691a9836b779dc66dc6343f8c468e9e57483b4 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Wed, 17 Jul 2024 10:05:49 -0500 Subject: [PATCH 05/18] ideas for adding gold standard to specific dataset pairings --- Snakefile | 4 ++++ config/config.yaml | 1 + spras/config.py | 2 ++ spras/evaluation.py | 3 +++ 4 files changed, 10 insertions(+) diff --git a/Snakefile b/Snakefile index 4ca0aaf6..dc6c3fb7 100644 --- a/Snakefile +++ b/Snakefile @@ -37,6 +37,8 @@ algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, par dataset_labels = list(_config.config.datasets.keys()) gold_standard_labels = list(_config.config.gold_standard.keys()) +# TODO: create something that will be gs to dataset pairing + # Get algorithms that are running multiple parameter combinations def algo_has_mult_param_combos(algo): return len(algorithm_params.get(algo, {})) > 1 @@ -105,6 +107,7 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) if _config.config.analysis_include_evalution: + # TODO: update to using gs to specific dataset pairing final_input.extend(expand('{out_dir}{sep}{dataset}-{gold_standard}-evaluation.txt',out_dir=out_dir, sep=SEP,dataset=dataset_labels,gold_standard=gold_standard_labels, algorithm_params=algorithms_with_params)) if len(final_input) == 0: @@ -340,6 +343,7 @@ rule ml_analysis_aggregate_algo: ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params) ml.ensemble_network(summary_df, output.ensemble_network_file) +# update to use specific gs to dataset pairing rule evaluation: input: gs_file = SEP.join([out_dir,'{gold_standard}-merged.pickle']), diff --git a/config/config.yaml b/config/config.yaml index 646e0570..ed7bfcae 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -122,6 +122,7 @@ gold_standard: node_files: ["gs_nodes.txt"] # edge_files: [] TODO: later iteration data_dir: "input" + # TODO: dataset: [] # If we want to reconstruct then we should set run to true. # TODO: if include is true above but run is false here, algs are not run. diff --git a/spras/config.py b/spras/config.py index 5e3b600b..74df4f51 100644 --- a/spras/config.py +++ b/spras/config.py @@ -97,6 +97,8 @@ def __init__(self, raw_config): self.analysis_include_cytoscape = None # A Boolean specifying whether to run the ML analysis self.analysis_include_ml = None + # A Boolean specifying whether to run the Evaluation analysis + self.analysis_include_evalution = None _raw_config = copy.deepcopy(raw_config) diff --git a/spras/evaluation.py b/spras/evaluation.py index caab8783..de66cdd2 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -16,6 +16,7 @@ def __init__(self, gold_standard_dict): self.node_table = None # self.edge_table = None TODO: later iteration self.load_files_from_dict(gold_standard_dict) + # TODO add a self.dataset_somthing = None return def to_file(self, file_name): @@ -37,6 +38,8 @@ def from_file(cls, file_name): def load_files_from_dict(self, gold_standard_dict): self.label = gold_standard_dict["label"] + # TODO: set self.datasets + node_data_files = gold_standard_dict["node_files"][0] # TODO: single file for now data_loc = gold_standard_dict["data_dir"] From 2c6d1a5780714cd55bb4639fcb10717866f90f11 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Wed, 17 Jul 2024 11:31:03 -0500 Subject: [PATCH 06/18] second pass of implementing evaluation code, added dataset-goldstandard pairs --- Snakefile | 25 ++++++++++++++++++------- config/config.yaml | 11 ++++++++--- input/{gs_nodes.txt => gs_nodes0.txt} | 0 input/gs_nodes1.txt | 1 + spras/evaluation.py | 8 +++++--- 5 files changed, 32 insertions(+), 13 deletions(-) rename input/{gs_nodes.txt => gs_nodes0.txt} (100%) create mode 100644 input/gs_nodes1.txt diff --git a/Snakefile b/Snakefile index dc6c3fb7..01155352 100644 --- a/Snakefile +++ b/Snakefile @@ -37,7 +37,10 @@ algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, par dataset_labels = list(_config.config.datasets.keys()) gold_standard_labels = list(_config.config.gold_standard.keys()) -# TODO: create something that will be gs to dataset pairing +dataset_gs_pairs_tuples = [(gs_values['label'], dataset) for gs_values in _config.config.gold_standard.values() for dataset in gs_values['datasets']] +# am I able to send tuples around? +dataset_gs_pairs_formatted = [f"{dataset}-{gs_values['label']}" for gs_values in _config.config.gold_standard.values() for dataset in gs_values['datasets']] +# prefomatting makes it easier to send around but requires more functions to use # Get algorithms that are running multiple parameter combinations def algo_has_mult_param_combos(algo): @@ -107,8 +110,7 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) if _config.config.analysis_include_evalution: - # TODO: update to using gs to specific dataset pairing - final_input.extend(expand('{out_dir}{sep}{dataset}-{gold_standard}-evaluation.txt',out_dir=out_dir, sep=SEP,dataset=dataset_labels,gold_standard=gold_standard_labels, algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-evaluation.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gs_pairs_formatted,algorithm_params=algorithms_with_params)) if len(final_input) == 0: # No analysis added yet, so add reconstruction output files if they exist. @@ -343,12 +345,21 @@ rule ml_analysis_aggregate_algo: ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params) ml.ensemble_network(summary_df, output.ensemble_network_file) -# update to use specific gs to dataset pairing +def get_gs_pickle_file(wildcards): + parts = wildcards.dataset_gs_pairs_formatted.split('-') + gs = parts[1] + return SEP.join([out_dir, f'{gs}-merged.pickle']) + +def get_dataset_label(wildcards): + parts = wildcards.dataset_gs_pairs_formatted.split('-') + dataset = parts[0] + return dataset + rule evaluation: input: - gs_file = SEP.join([out_dir,'{gold_standard}-merged.pickle']), - pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params) - output: eval_file = SEP.join([out_dir, "{dataset}-{gold_standard}-evaluation.txt"]) + gs_file = get_gs_pickle_file, + pathways = expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=get_dataset_label), + output: eval_file = SEP.join([out_dir, "{dataset_gs_pairs_formatted}-evaluation.txt"]) run: node_table = Evaluation.from_file(input.gs_file).node_table Evaluation.precision(input.pathways, node_table, output.eval_file) diff --git a/config/config.yaml b/config/config.yaml index ed7bfcae..583b45ea 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -118,11 +118,16 @@ datasets: gold_standard: - - label: gs1 - node_files: ["gs_nodes.txt"] + label: gs0 + node_files: ["gs_nodes0.txt"] # edge_files: [] TODO: later iteration data_dir: "input" - # TODO: dataset: [] + datasets: ["data0"] + - + label: gs1 + node_files: ["gs_nodes1.txt"] + data_dir: "input" + datasets: ["data1", "data0"] # If we want to reconstruct then we should set run to true. # TODO: if include is true above but run is false here, algs are not run. diff --git a/input/gs_nodes.txt b/input/gs_nodes0.txt similarity index 100% rename from input/gs_nodes.txt rename to input/gs_nodes0.txt diff --git a/input/gs_nodes1.txt b/input/gs_nodes1.txt new file mode 100644 index 00000000..96d80cd6 --- /dev/null +++ b/input/gs_nodes1.txt @@ -0,0 +1 @@ +C \ No newline at end of file diff --git a/spras/evaluation.py b/spras/evaluation.py index de66cdd2..bfb7ff56 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -16,7 +16,7 @@ def __init__(self, gold_standard_dict): self.node_table = None # self.edge_table = None TODO: later iteration self.load_files_from_dict(gold_standard_dict) - # TODO add a self.dataset_somthing = None + self.datasets = None return def to_file(self, file_name): @@ -38,8 +38,8 @@ def from_file(cls, file_name): def load_files_from_dict(self, gold_standard_dict): self.label = gold_standard_dict["label"] - # TODO: set self.datasets - + self.datasets = gold_standard_dict["datasets"] + node_data_files = gold_standard_dict["node_files"][0] # TODO: single file for now data_loc = gold_standard_dict["data_dir"] @@ -50,6 +50,8 @@ def load_files_from_dict(self, gold_standard_dict): # TODO: are we allowing multiple node files or single in node_files for gs # if yes, a for loop is needed + # TODO: later iteration - chose between node and edge file, or allow both + def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str): y_true = node_table['NODEID'].tolist() From 860d840b87446a485d2710d2f97585daf68e2592 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 19 Jul 2024 14:58:14 -0500 Subject: [PATCH 07/18] cleaned up and added commenting --- Snakefile | 46 +++++++++++++++++++++++---------------------- config/config.yaml | 1 + spras/config.py | 3 ++- spras/evaluation.py | 23 +++++++++++++++++++---- 4 files changed, 46 insertions(+), 27 deletions(-) diff --git a/Snakefile b/Snakefile index 01155352..a63edc0a 100644 --- a/Snakefile +++ b/Snakefile @@ -28,19 +28,15 @@ hac_params = _config.config.hac_params FRAMEWORK = _config.config.container_framework print(f"Running {FRAMEWORK} containers") -# Return the dataset dictionary from the config file given the label -def get_dataset(_datasets, label): +# Return the dataset or goldstandard dictionary from the config file given the label +def get_dict(_datasets, label): + print(_datasets, label) return _datasets[label] algorithms = list(algorithm_params) algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()] dataset_labels = list(_config.config.datasets.keys()) -gold_standard_labels = list(_config.config.gold_standard.keys()) - -dataset_gs_pairs_tuples = [(gs_values['label'], dataset) for gs_values in _config.config.gold_standard.values() for dataset in gs_values['datasets']] -# am I able to send tuples around? -dataset_gs_pairs_formatted = [f"{dataset}-{gs_values['label']}" for gs_values in _config.config.gold_standard.values() for dataset in gs_values['datasets']] -# prefomatting makes it easier to send around but requires more functions to use +dataset_goldstandard_pairs = [f"{dataset}-{gs_values['label']}" for gs_values in _config.config.gold_standard.values() for dataset in gs_values['datasets']] # Get algorithms that are running multiple parameter combinations def algo_has_mult_param_combos(algo): @@ -63,7 +59,7 @@ def write_parameter_log(algorithm, param_label, logfile): # Log the dataset contents specified in the config file in a yaml file def write_dataset_log(dataset, logfile): - dataset_contents = get_dataset(_config.config.datasets,dataset) + dataset_contents = get_dict(_config.config.datasets,dataset) # safe_dump gives RepresenterError for an OrderedDict # config file has to convert the dataset from OrderedDict to dict to avoid this @@ -110,7 +106,7 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) if _config.config.analysis_include_evalution: - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-evaluation.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gs_pairs_formatted,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-evaluation.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_goldstandard_pairs,algorithm_params=algorithms_with_params)) if len(final_input) == 0: # No analysis added yet, so add reconstruction output files if they exist. @@ -160,21 +156,23 @@ rule merge_input: output: dataset_file = SEP.join([out_dir, '{dataset}-merged.pickle']) run: # Pass the dataset to PRRunner where the files will be merged and written to disk (i.e. pickled) - dataset_dict = get_dataset(_config.config.datasets, wildcards.dataset) + dataset_dict = get_dict(_config.config.datasets, wildcards.dataset) runner.merge_input(dataset_dict, output.dataset_file) -def get_gs_dependencies(wildcards): +# Return all files used in the gold standard +def get_goldstandard_dependencies(wildcards): gs = _config.config.gold_standard[wildcards.gold_standard] all_files = gs["node_files"] all_files = [gs["data_dir"] + SEP + data_file for data_file in all_files] return all_files +# Merge all node files for a goldstandard into a single node table rule merge_gs_input: - input: get_gs_dependencies - output: gs_file = SEP.join([out_dir, '{gold_standard}-merged.pickle']) + input: get_goldstandard_dependencies + output: goldstandard_file = SEP.join([out_dir, '{gold_standard}-merged.pickle']) run: - gs_dict = get_dataset(_config.config.gold_standard, wildcards.gold_standard) - runner.merge_gold_standard_input(gs_dict, output.gs_file) + goldstandard_dict = get_dict(_config.config.gold_standard, wildcards.gold_standard) + runner.merge_gold_standard_input(goldstandard_dict, output.goldstandard_file) # The checkpoint is like a rule but can be used in dynamic workflows # The workflow directed acyclic graph is re-evaluated after the checkpoint job runs @@ -326,6 +324,7 @@ def collect_pathways_per_algo(wildcards): filtered_algo_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param] return expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_algo_params) +# Cluster the output pathways per algorithm for each dataset rule ml_analysis_aggregate_algo: input: pathways = collect_pathways_per_algo @@ -345,23 +344,26 @@ rule ml_analysis_aggregate_algo: ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params) ml.ensemble_network(summary_df, output.ensemble_network_file) -def get_gs_pickle_file(wildcards): - parts = wildcards.dataset_gs_pairs_formatted.split('-') +# Return the gold standard pickle file for a specific gold standard +def get_goldstandard_pickle_file(wildcards): + parts = wildcards.dataset_goldstandard_pairs.split('-') gs = parts[1] return SEP.join([out_dir, f'{gs}-merged.pickle']) +# Returns the dataset corresponding to the gold standard pair def get_dataset_label(wildcards): - parts = wildcards.dataset_gs_pairs_formatted.split('-') + parts = wildcards.dataset_goldstandard_pairs.split('-') dataset = parts[0] return dataset +# Run evaluation code for a specific dataset's pathway outputs against its paired gold standard rule evaluation: input: - gs_file = get_gs_pickle_file, + goldstandard_file = get_goldstandard_pickle_file, pathways = expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=get_dataset_label), - output: eval_file = SEP.join([out_dir, "{dataset_gs_pairs_formatted}-evaluation.txt"]) + output: eval_file = SEP.join([out_dir, "{dataset_goldstandard_pairs}-evaluation.txt"]) run: - node_table = Evaluation.from_file(input.gs_file).node_table + node_table = Evaluation.from_file(input.goldstandard_file).node_table Evaluation.precision(input.pathways, node_table, output.eval_file) # Remove the output directory diff --git a/config/config.yaml b/config/config.yaml index 583b45ea..4267ba9e 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -122,6 +122,7 @@ gold_standard: node_files: ["gs_nodes0.txt"] # edge_files: [] TODO: later iteration data_dir: "input" + # Set of datasets to compare with the specific gold standard dataset datasets: ["data0"] - label: gs1 diff --git a/spras/config.py b/spras/config.py index 74df4f51..d8247371 100644 --- a/spras/config.py +++ b/spras/config.py @@ -70,7 +70,7 @@ def __init__(self, raw_config): self.unpack_singularity = False # A dictionary to store configured datasets against which SPRAS will be run self.datasets = None - # a dictionary to store configured gold standard data against ouptut of SPRAS runs + # A dictionary to store configured gold standard data against ouptut of SPRAS runs self.gold_standard = None # The hash length SPRAS will use to identify parameter combinations. Default is 7 self.hash_length = DEFAULT_HASH_LENGTH @@ -146,6 +146,7 @@ def process_config(self, raw_config): # Convert to dicts to simplify the yaml logging self.datasets = {dataset["label"]: dict(dataset) for dataset in raw_config["datasets"]} + # TODO: turn into try except self.gold_standard = {goldstandard["label"]: dict(goldstandard) for goldstandard in raw_config["gold_standard"]} for key in self.gold_standard: pattern = r'^\w+$' diff --git a/spras/evaluation.py b/spras/evaluation.py index bfb7ff56..45e3db4e 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -21,7 +21,7 @@ def __init__(self, gold_standard_dict): def to_file(self, file_name): """ - Saves dataset object to pickle file + Saves gold standard object to pickle file """ with open(file_name, "wb") as f: pkl.dump(self, f) @@ -29,14 +29,22 @@ def to_file(self, file_name): @classmethod def from_file(cls, file_name): """ - Loads dataset object from a pickle file. - Usage: dataset = Dataset.from_file(pickle_file) + Loads gold standard object from a pickle file. + Usage: gold_standard = Evaluation.from_file(pickle_file) """ with open(file_name, "rb") as f: return pkl.load(f) def load_files_from_dict(self, gold_standard_dict): + """ + Loads gold standard files from gold_standard_dict, which is one gold standard dataset + dictionary from the list in the config file with the fields in the config file. + Populates node_table. + + node_table is a single column of nodes pandas table. + returns: none + """ self.label = gold_standard_dict["label"] self.datasets = gold_standard_dict["datasets"] @@ -53,7 +61,14 @@ def load_files_from_dict(self, gold_standard_dict): # TODO: later iteration - chose between node and edge file, or allow both def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str): - + """ + Takes in file paths for a specific dataset and an associated gold standard node table. + Calculates precision for each pathway file + Returns output back to output_file + @param file_paths: file paths of pathway reconstruction algorithm outputs + @param node_table: the gold standard nodes + @param output_file: the filename to save the precision of each pathway + """ y_true = node_table['NODEID'].tolist() results = [] From f635e99c2d5917eb4501911ac125fa3875fe8543 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 19 Jul 2024 14:59:57 -0500 Subject: [PATCH 08/18] precommit --- spras/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config.py b/spras/config.py index d8247371..10f8f398 100644 --- a/spras/config.py +++ b/spras/config.py @@ -146,7 +146,7 @@ def process_config(self, raw_config): # Convert to dicts to simplify the yaml logging self.datasets = {dataset["label"]: dict(dataset) for dataset in raw_config["datasets"]} - # TODO: turn into try except + # TODO: turn into try except self.gold_standard = {goldstandard["label"]: dict(goldstandard) for goldstandard in raw_config["gold_standard"]} for key in self.gold_standard: pattern = r'^\w+$' From 0a1e305a149cefecc2d2ee1206d0be358b49ac70 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 22 Jul 2024 12:06:19 -0500 Subject: [PATCH 09/18] updated code to work optionally, updated code based on PR comments --- Snakefile | 39 +++++++++++++++++++-------------------- config/config.yaml | 2 +- config/egfr.yaml | 2 ++ spras/config.py | 18 ++++++++++++++---- spras/evaluation.py | 10 ++++++++++ spras/runner.py | 10 ---------- 6 files changed, 46 insertions(+), 35 deletions(-) diff --git a/Snakefile b/Snakefile index a63edc0a..3035aa58 100644 --- a/Snakefile +++ b/Snakefile @@ -28,15 +28,14 @@ hac_params = _config.config.hac_params FRAMEWORK = _config.config.container_framework print(f"Running {FRAMEWORK} containers") -# Return the dataset or goldstandard dictionary from the config file given the label -def get_dict(_datasets, label): - print(_datasets, label) +# Return the dataset or gold_standard dictionary from the config file given the label +def get_dataset(_datasets, label): return _datasets[label] algorithms = list(algorithm_params) algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()] dataset_labels = list(_config.config.datasets.keys()) -dataset_goldstandard_pairs = [f"{dataset}-{gs_values['label']}" for gs_values in _config.config.gold_standard.values() for dataset in gs_values['datasets']] +dataset_gold_standard_pairs = [f"{dataset}-{gs_values['label']}" for gs_values in _config.config.gold_standards.values() for dataset in gs_values['datasets']] # Get algorithms that are running multiple parameter combinations def algo_has_mult_param_combos(algo): @@ -59,7 +58,7 @@ def write_parameter_log(algorithm, param_label, logfile): # Log the dataset contents specified in the config file in a yaml file def write_dataset_log(dataset, logfile): - dataset_contents = get_dict(_config.config.datasets,dataset) + dataset_contents = get_dataset(_config.config.datasets,dataset) # safe_dump gives RepresenterError for an OrderedDict # config file has to convert the dataset from OrderedDict to dict to avoid this @@ -106,7 +105,7 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) if _config.config.analysis_include_evalution: - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-evaluation.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_goldstandard_pairs,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-evaluation.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params)) if len(final_input) == 0: # No analysis added yet, so add reconstruction output files if they exist. @@ -156,23 +155,23 @@ rule merge_input: output: dataset_file = SEP.join([out_dir, '{dataset}-merged.pickle']) run: # Pass the dataset to PRRunner where the files will be merged and written to disk (i.e. pickled) - dataset_dict = get_dict(_config.config.datasets, wildcards.dataset) + dataset_dict = get_dataset(_config.config.datasets, wildcards.dataset) runner.merge_input(dataset_dict, output.dataset_file) # Return all files used in the gold standard -def get_goldstandard_dependencies(wildcards): - gs = _config.config.gold_standard[wildcards.gold_standard] +def get_gold_standard_dependencies(wildcards): + gs = _config.config.gold_standards[wildcards.gold_standard] all_files = gs["node_files"] all_files = [gs["data_dir"] + SEP + data_file for data_file in all_files] return all_files -# Merge all node files for a goldstandard into a single node table +# Merge all node files for a gold_standard into a single node table rule merge_gs_input: - input: get_goldstandard_dependencies - output: goldstandard_file = SEP.join([out_dir, '{gold_standard}-merged.pickle']) + input: get_gold_standard_dependencies + output: gold_standard_file = SEP.join([out_dir, '{gold_standard}-merged.pickle']) run: - goldstandard_dict = get_dict(_config.config.gold_standard, wildcards.gold_standard) - runner.merge_gold_standard_input(goldstandard_dict, output.goldstandard_file) + gold_standard_dict = get_dataset(_config.config.gold_standards, wildcards.gold_standard) + Evaluation.merge_gold_standard_input(gold_standard_dict, output.gold_standard_file) # The checkpoint is like a rule but can be used in dynamic workflows # The workflow directed acyclic graph is re-evaluated after the checkpoint job runs @@ -345,25 +344,25 @@ rule ml_analysis_aggregate_algo: ml.ensemble_network(summary_df, output.ensemble_network_file) # Return the gold standard pickle file for a specific gold standard -def get_goldstandard_pickle_file(wildcards): - parts = wildcards.dataset_goldstandard_pairs.split('-') +def get_gold_standard_pickle_file(wildcards): + parts = wildcards.dataset_gold_standard_pairs.split('-') gs = parts[1] return SEP.join([out_dir, f'{gs}-merged.pickle']) # Returns the dataset corresponding to the gold standard pair def get_dataset_label(wildcards): - parts = wildcards.dataset_goldstandard_pairs.split('-') + parts = wildcards.dataset_gold_standard_pairs.split('-') dataset = parts[0] return dataset # Run evaluation code for a specific dataset's pathway outputs against its paired gold standard rule evaluation: input: - goldstandard_file = get_goldstandard_pickle_file, + gold_standard_file = get_gold_standard_pickle_file, pathways = expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=get_dataset_label), - output: eval_file = SEP.join([out_dir, "{dataset_goldstandard_pairs}-evaluation.txt"]) + output: eval_file = SEP.join([out_dir, "{dataset_gold_standard_pairs}-evaluation.txt"]) run: - node_table = Evaluation.from_file(input.goldstandard_file).node_table + node_table = Evaluation.from_file(input.gold_standard_file).node_table Evaluation.precision(input.pathways, node_table, output.eval_file) # Remove the output directory diff --git a/config/config.yaml b/config/config.yaml index 4267ba9e..9db7dfc1 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -122,7 +122,7 @@ gold_standard: node_files: ["gs_nodes0.txt"] # edge_files: [] TODO: later iteration data_dir: "input" - # Set of datasets to compare with the specific gold standard dataset + # Set of datasets (dataset labels) to compare with the specific gold standard dataset datasets: ["data0"] - label: gs1 diff --git a/config/egfr.yaml b/config/egfr.yaml index 71282905..ce7b64cf 100644 --- a/config/egfr.yaml +++ b/config/egfr.yaml @@ -89,3 +89,5 @@ analysis: include: true ml: include: false + evaluation: + include: false diff --git a/spras/config.py b/spras/config.py index 10f8f398..ccc507b5 100644 --- a/spras/config.py +++ b/spras/config.py @@ -71,7 +71,7 @@ def __init__(self, raw_config): # A dictionary to store configured datasets against which SPRAS will be run self.datasets = None # A dictionary to store configured gold standard data against ouptut of SPRAS runs - self.gold_standard = None + self.gold_standards = None # The hash length SPRAS will use to identify parameter combinations. Default is 7 self.hash_length = DEFAULT_HASH_LENGTH # The list of algorithms to run in the workflow. Each is a dict with 'name' as an expected key. @@ -147,11 +147,15 @@ def process_config(self, raw_config): self.datasets = {dataset["label"]: dict(dataset) for dataset in raw_config["datasets"]} # TODO: turn into try except - self.gold_standard = {goldstandard["label"]: dict(goldstandard) for goldstandard in raw_config["gold_standard"]} - for key in self.gold_standard: + try: + self.gold_standards = {gold_standard["label"]: dict(gold_standard) for gold_standard in raw_config["gold_standard"]} + except: + self.gold_standards = {} + + for key in self.gold_standards: pattern = r'^\w+$' if not bool(re.match(pattern, key)): - raise ValueError(f"Gold Standard label \'{key}\' contains invalid values. Gold Standard labels can only contain letters, numbers, or underscores.") + raise ValueError(f"Gold standard label \'{key}\' contains invalid values. Gold standard labels can only contain letters, numbers, or underscores.") # Code snipped from Snakefile that may be useful for assigning default labels # dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)] @@ -234,6 +238,12 @@ def process_config(self, raw_config): self.analysis_include_ml = raw_config["analysis"]["ml"]["include"] self.analysis_include_evalution = raw_config["analysis"]["evaluation"]["include"] + # the code will run correctly without this section below + # TODO: decide if this part is needed + if self.gold_standards == {} and self.analysis_include_evalution == True: + print("Gold standard data not provided. Evaluation analysis cannot run.") + self.analysis_include_evalution = False + if 'aggregate_per_algorithm' not in self.ml_params: self.analysis_include_ml_aggregate_algo = False else: diff --git a/spras/evaluation.py b/spras/evaluation.py index 45e3db4e..49d060dd 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -19,6 +19,15 @@ def __init__(self, gold_standard_dict): self.datasets = None return + def merge_gold_standard_input(gs_dict, gs_file): + """ + Merge files listed for this gold standard dataset and write the dataset to disk + @param gs_dict: gold standard dataset to process + @param gs_file: output filename + """ + gs_dataset = Evaluation(gs_dict) + gs_dataset.to_file(gs_file) + def to_file(self, file_name): """ Saves gold standard object to pickle file @@ -60,6 +69,7 @@ def load_files_from_dict(self, gold_standard_dict): # TODO: later iteration - chose between node and edge file, or allow both + # TODO: move outside of Evaluation class? def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str): """ Takes in file paths for a specific dataset and an associated gold standard node table. diff --git a/spras/runner.py b/spras/runner.py index 8cbc7b96..b1bd6101 100644 --- a/spras/runner.py +++ b/spras/runner.py @@ -43,16 +43,6 @@ def merge_input(dataset_dict, dataset_file): dataset = Dataset(dataset_dict) dataset.to_file(dataset_file) -def merge_gold_standard_input(gs_dict, gs_file): - """ - Merge files listed for this gold standard dataset and write the dataset to disk - @param gs_dict: gold standard dataset to process - @param gs_file: output filename - """ - gs_dataset = Evaluation(gs_dict) - gs_dataset.to_file(gs_file) - - def prepare_inputs(algorithm, data_file, filename_map): """ Prepare general dataset files for this algorithm From c3ae809ddc5d51dce13c15c5580da062041871a1 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Wed, 24 Jul 2024 11:10:26 -0500 Subject: [PATCH 10/18] commenting what causes the code to break when certain attributes are left out of an evaluation set --- spras/config.py | 3 +-- spras/evaluation.py | 14 ++++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/spras/config.py b/spras/config.py index ccc507b5..4f97dfba 100644 --- a/spras/config.py +++ b/spras/config.py @@ -146,7 +146,6 @@ def process_config(self, raw_config): # Convert to dicts to simplify the yaml logging self.datasets = {dataset["label"]: dict(dataset) for dataset in raw_config["datasets"]} - # TODO: turn into try except try: self.gold_standards = {gold_standard["label"]: dict(gold_standard) for gold_standard in raw_config["gold_standard"]} except: @@ -238,7 +237,7 @@ def process_config(self, raw_config): self.analysis_include_ml = raw_config["analysis"]["ml"]["include"] self.analysis_include_evalution = raw_config["analysis"]["evaluation"]["include"] - # the code will run correctly without this section below + # COMMENT: the code will run correctly without this section below due to empty dict in try except above # TODO: decide if this part is needed if self.gold_standards == {} and self.analysis_include_evalution == True: print("Gold standard data not provided. Evaluation analysis cannot run.") diff --git a/spras/evaluation.py b/spras/evaluation.py index 49d060dd..bde5a316 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -13,10 +13,10 @@ class Evaluation: def __init__(self, gold_standard_dict): self.label = None + self.datasets = None self.node_table = None # self.edge_table = None TODO: later iteration self.load_files_from_dict(gold_standard_dict) - self.datasets = None return def merge_gold_standard_input(gs_dict, gs_file): @@ -54,10 +54,16 @@ def load_files_from_dict(self, gold_standard_dict): returns: none """ - self.label = gold_standard_dict["label"] - self.datasets = gold_standard_dict["datasets"] + self.label = gold_standard_dict["label"] # COMMENT: cannot be empty, will break with a NoneType exception + self.datasets = gold_standard_dict["datasets"] # COMMENT: can be empty, snakemake will not run evaluation due to dataset_gold_standard_pairs in snakemake file + + try: + # COMMENT: cannot be empty, snakemake will run evaluation even if empty + node_data_files = gold_standard_dict["node_files"][0] # TODO: single file for now + except: + if not gold_standard_dict["node_files"]: + raise ValueError (f"Node_files for {self.label} is an empty list, cannot run evalution") - node_data_files = gold_standard_dict["node_files"][0] # TODO: single file for now data_loc = gold_standard_dict["data_dir"] single_node_table = pd.read_table(os.path.join(data_loc, node_data_files), header=None) From fe5811a8bcbb4feeabbc8baa2fe9f7081b9754e2 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Wed, 24 Jul 2024 11:16:00 -0500 Subject: [PATCH 11/18] clean up --- spras/config.py | 2 +- spras/evaluation.py | 11 +++-------- spras/runner.py | 1 - 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/spras/config.py b/spras/config.py index 4f97dfba..cd765ded 100644 --- a/spras/config.py +++ b/spras/config.py @@ -237,7 +237,7 @@ def process_config(self, raw_config): self.analysis_include_ml = raw_config["analysis"]["ml"]["include"] self.analysis_include_evalution = raw_config["analysis"]["evaluation"]["include"] - # COMMENT: the code will run correctly without this section below due to empty dict in try except above + # COMMENT: the code will run correctly without this section below due to empty dict in try except above # TODO: decide if this part is needed if self.gold_standards == {} and self.analysis_include_evalution == True: print("Gold standard data not provided. Evaluation analysis cannot run.") diff --git a/spras/evaluation.py b/spras/evaluation.py index bde5a316..66893f07 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -54,15 +54,11 @@ def load_files_from_dict(self, gold_standard_dict): returns: none """ - self.label = gold_standard_dict["label"] # COMMENT: cannot be empty, will break with a NoneType exception + self.label = gold_standard_dict["label"] # COMMENT: cannot be empty, will break with a NoneType exception self.datasets = gold_standard_dict["datasets"] # COMMENT: can be empty, snakemake will not run evaluation due to dataset_gold_standard_pairs in snakemake file - try: - # COMMENT: cannot be empty, snakemake will run evaluation even if empty - node_data_files = gold_standard_dict["node_files"][0] # TODO: single file for now - except: - if not gold_standard_dict["node_files"]: - raise ValueError (f"Node_files for {self.label} is an empty list, cannot run evalution") + # COMMENT: cannot be empty, snakemake will run evaluation even if empty + node_data_files = gold_standard_dict["node_files"][0] # TODO: single file for now data_loc = gold_standard_dict["data_dir"] @@ -75,7 +71,6 @@ def load_files_from_dict(self, gold_standard_dict): # TODO: later iteration - chose between node and edge file, or allow both - # TODO: move outside of Evaluation class? def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str): """ Takes in file paths for a specific dataset and an associated gold standard node table. diff --git a/spras/runner.py b/spras/runner.py index b1bd6101..fbf5ad5d 100644 --- a/spras/runner.py +++ b/spras/runner.py @@ -2,7 +2,6 @@ from spras.allpairs import AllPairs as allpairs from spras.dataset import Dataset from spras.domino import DOMINO as domino -from spras.evaluation import Evaluation from spras.meo import MEO as meo from spras.mincostflow import MinCostFlow as mincostflow from spras.omicsintegrator1 import OmicsIntegrator1 as omicsintegrator1 From b381e9a0569d8f52f457f3c37b95f0d91263f09b Mon Sep 17 00:00:00 2001 From: ntalluri Date: Thu, 15 Aug 2024 14:05:44 -0500 Subject: [PATCH 12/18] precommit --- spras/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config.py b/spras/config.py index 34894c75..554198b5 100644 --- a/spras/config.py +++ b/spras/config.py @@ -145,7 +145,7 @@ def process_config(self, raw_config): # When Snakemake parses the config file it loads the datasets as OrderedDicts not dicts # Convert to dicts to simplify the yaml logging self.datasets = {dataset["label"]: dict(dataset) for dataset in raw_config["datasets"]} - + for key in self.datasets: pattern = r'^\w+$' if not bool(re.match(pattern, key)): From 1bb19e3a8d392b7e5d4c44574f25917f37fdcb59 Mon Sep 17 00:00:00 2001 From: Neha Talluri <78840540+ntalluri@users.noreply.github.com> Date: Tue, 20 Aug 2024 09:25:27 -0500 Subject: [PATCH 13/18] Update spras/config.py comment Co-authored-by: Anthony Gitter --- spras/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config.py b/spras/config.py index 554198b5..3fad39e9 100644 --- a/spras/config.py +++ b/spras/config.py @@ -70,7 +70,7 @@ def __init__(self, raw_config): self.unpack_singularity = False # A dictionary to store configured datasets against which SPRAS will be run self.datasets = None - # A dictionary to store configured gold standard data against ouptut of SPRAS runs + # A dictionary to store configured gold standard data against output of SPRAS runs self.gold_standards = None # The hash length SPRAS will use to identify parameter combinations. Default is 7 self.hash_length = DEFAULT_HASH_LENGTH From 1efd15072ab536dabe63a4a70740f8031cc762e1 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Tue, 20 Aug 2024 10:58:48 -0500 Subject: [PATCH 14/18] made changes based on comments --- Snakefile | 2 +- config/config.yaml | 6 ++++-- spras/config.py | 14 ++++++++++---- spras/evaluation.py | 12 ++++++------ 4 files changed, 21 insertions(+), 13 deletions(-) diff --git a/Snakefile b/Snakefile index 3035aa58..c65e738b 100644 --- a/Snakefile +++ b/Snakefile @@ -35,7 +35,7 @@ def get_dataset(_datasets, label): algorithms = list(algorithm_params) algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()] dataset_labels = list(_config.config.datasets.keys()) -dataset_gold_standard_pairs = [f"{dataset}-{gs_values['label']}" for gs_values in _config.config.gold_standards.values() for dataset in gs_values['datasets']] +dataset_gold_standard_pairs = [f"{dataset}-{gs_values['label']}" for gs_values in _config.config.gold_standards.values() for dataset in gs_values['dataset_labels']] # Get algorithms that are running multiple parameter combinations def algo_has_mult_param_combos(algo): diff --git a/config/config.yaml b/config/config.yaml index ce832f15..20d2e4cd 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -119,17 +119,19 @@ datasets: gold_standard: - + # Labels can only contain letters, numbers, or underscores + label: data0 label: gs0 node_files: ["gs_nodes0.txt"] # edge_files: [] TODO: later iteration data_dir: "input" # Set of datasets (dataset labels) to compare with the specific gold standard dataset - datasets: ["data0"] + dataset_labels: ["data0"] - label: gs1 node_files: ["gs_nodes1.txt"] data_dir: "input" - datasets: ["data1", "data0"] + dataset_labels: ["data1", "data0"] # If we want to reconstruct then we should set run to true. # TODO: if include is true above but run is false here, algs are not run. diff --git a/spras/config.py b/spras/config.py index 3fad39e9..8fa96694 100644 --- a/spras/config.py +++ b/spras/config.py @@ -151,16 +151,25 @@ def process_config(self, raw_config): if not bool(re.match(pattern, key)): raise ValueError(f"Dataset label \'{key}\' contains invalid values. Dataset labels can only contain letters, numbers, or underscores.") + # parse gold standard information try: self.gold_standards = {gold_standard["label"]: dict(gold_standard) for gold_standard in raw_config["gold_standard"]} except: self.gold_standards = {} + # check that gold_standard labels are formatted correctly for key in self.gold_standards: pattern = r'^\w+$' if not bool(re.match(pattern, key)): raise ValueError(f"Gold standard label \'{key}\' contains invalid values. Gold standard labels can only contain letters, numbers, or underscores.") + # check that all the dataset labels in the gold standards are existing datasets labels + dataset_labels = set(self.datasets.keys()) + gold_standard_dataset_labels = {dataset_label for value in self.gold_standards.values() for dataset_label in value['dataset_labels']} + for label in gold_standard_dataset_labels: + if label not in dataset_labels: + raise ValueError(f"Dataset label '{label}' provided in gold standards does not exist in the existing dataset labels.") + # Code snipped from Snakefile that may be useful for assigning default labels # dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)] # Maps from the dataset label to the dataset list index @@ -242,11 +251,8 @@ def process_config(self, raw_config): self.analysis_include_ml = raw_config["analysis"]["ml"]["include"] self.analysis_include_evalution = raw_config["analysis"]["evaluation"]["include"] - # COMMENT: the code will run correctly without this section below due to empty dict in try except above - # TODO: decide if this part is needed if self.gold_standards == {} and self.analysis_include_evalution == True: - print("Gold standard data not provided. Evaluation analysis cannot run.") - self.analysis_include_evalution = False + raise ValueError("Evaluation analysis cannot run as gold standard data not provided. Please set evaluation include to false or provide gold standard data.") if 'aggregate_per_algorithm' not in self.ml_params: self.analysis_include_ml_aggregate_algo = False diff --git a/spras/evaluation.py b/spras/evaluation.py index 66893f07..a6e1a916 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -54,10 +54,10 @@ def load_files_from_dict(self, gold_standard_dict): returns: none """ - self.label = gold_standard_dict["label"] # COMMENT: cannot be empty, will break with a NoneType exception - self.datasets = gold_standard_dict["datasets"] # COMMENT: can be empty, snakemake will not run evaluation due to dataset_gold_standard_pairs in snakemake file + self.label = gold_standard_dict["label"] # cannot be empty, will break with a NoneType exception + self.datasets = gold_standard_dict["dataset_labels"] # can be empty, snakemake will not run evaluation due to dataset_gold_standard_pairs in snakemake file - # COMMENT: cannot be empty, snakemake will run evaluation even if empty + # cannot be empty, snakemake will run evaluation even if empty node_data_files = gold_standard_dict["node_files"][0] # TODO: single file for now data_loc = gold_standard_dict["data_dir"] @@ -80,13 +80,13 @@ def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: @param node_table: the gold standard nodes @param output_file: the filename to save the precision of each pathway """ - y_true = node_table['NODEID'].tolist() + y_true = set(node_table['NODEID']) results = [] for file in file_paths: df = pd.read_table(file, sep="\t", header = 0, usecols=["Node1", "Node2"]) - y_pred = list(set(df['Node1']).union(set(df['Node2']))) - all_nodes = set(y_true).union(set(y_pred)) + y_pred = set(df['Node1']).union(set(df['Node2'])) + all_nodes = y_true.union(y_pred) y_true_binary = [1 if node in y_true else 0 for node in all_nodes] y_pred_binary = [1 if node in y_pred else 0 for node in all_nodes] From bbac74445eeea450972e330ab090f2d04651d240 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Tue, 20 Aug 2024 11:13:25 -0500 Subject: [PATCH 15/18] fixed label mistake --- config/config.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 20d2e4cd..04cd1655 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -120,12 +120,11 @@ datasets: gold_standard: - # Labels can only contain letters, numbers, or underscores - label: data0 label: gs0 node_files: ["gs_nodes0.txt"] # edge_files: [] TODO: later iteration data_dir: "input" - # Set of datasets (dataset labels) to compare with the specific gold standard dataset + # List of dataset labels to compare with the specific gold standard dataset dataset_labels: ["data0"] - label: gs1 From cfd3c03b30d803654a10fb3b9bc43321df8b467a Mon Sep 17 00:00:00 2001 From: ntalluri Date: Tue, 20 Aug 2024 11:13:42 -0500 Subject: [PATCH 16/18] updated test_config --- test/test_config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_config.py b/test/test_config.py index b00f39ee..51d61e15 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -20,7 +20,7 @@ def get_test_config(): } }, "datasets": [{"label":"alg1"}, {"label":"alg2"}], - "gold_standard": [{"label":"gs1"}], + "gold_standard": [{"label":"gs1", "dataset_labels":[]}], "algorithms": [{"params": ["param2", "param2"]}], "analysis": { "summary": { @@ -118,6 +118,7 @@ def test_error_dataset_label(self): def test_correct_dataset_label(self): test_config = get_test_config() + print(test_config) correct_test_dicts = [{"label":"test"}, {"label":"123"}, {"label":"test123"}, {"label":"123test"}, {"label":"_"}, {"label":"test_test"}, {"label":"_test"}, {"label":"test_"}] for test_dict in correct_test_dicts: From c30e7edb65b8f00fffdb31370b3ab4b62acca678 Mon Sep 17 00:00:00 2001 From: Anthony Gitter Date: Fri, 23 Aug 2024 12:24:14 -0500 Subject: [PATCH 17/18] Add test cases and minor reformatting --- Snakefile | 2 +- spras/config.py | 14 +++++++------- spras/evaluation.py | 22 ++++++++++++---------- spras/runner.py | 1 + test/test_config.py | 35 +++++++++++++++++++++++++++-------- 5 files changed, 48 insertions(+), 26 deletions(-) diff --git a/Snakefile b/Snakefile index c65e738b..999f1969 100644 --- a/Snakefile +++ b/Snakefile @@ -104,7 +104,7 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) - if _config.config.analysis_include_evalution: + if _config.config.analysis_include_evaluation: final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-evaluation.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params)) if len(final_input) == 0: diff --git a/spras/config.py b/spras/config.py index 8fa96694..f6448a63 100644 --- a/spras/config.py +++ b/spras/config.py @@ -92,14 +92,13 @@ def __init__(self, raw_config): # A Boolean specifying whether to run the summary analysis self.analysis_include_summary = None # A Boolean specifying whether to run the GraphSpace analysis - self.analysis_include_graphspace = None + self.analysis_include_graphspace = None # A Boolean specifying whether to run the Cytoscape analysis - self.analysis_include_cytoscape = None + self.analysis_include_cytoscape = None # A Boolean specifying whether to run the ML analysis self.analysis_include_ml = None # A Boolean specifying whether to run the Evaluation analysis - self.analysis_include_evalution = None - + self.analysis_include_evaluation = None _raw_config = copy.deepcopy(raw_config) self.process_config(_raw_config) @@ -249,10 +248,11 @@ def process_config(self, raw_config): self.analysis_include_graphspace = raw_config["analysis"]["graphspace"]["include"] self.analysis_include_cytoscape = raw_config["analysis"]["cytoscape"]["include"] self.analysis_include_ml = raw_config["analysis"]["ml"]["include"] - self.analysis_include_evalution = raw_config["analysis"]["evaluation"]["include"] + self.analysis_include_evaluation = raw_config["analysis"]["evaluation"]["include"] - if self.gold_standards == {} and self.analysis_include_evalution == True: - raise ValueError("Evaluation analysis cannot run as gold standard data not provided. Please set evaluation include to false or provide gold standard data.") + if self.gold_standards == {} and self.analysis_include_evaluation: + raise ValueError("Evaluation analysis cannot run as gold standard data not provided. " + "Please set evaluation include to false or provide gold standard data.") if 'aggregate_per_algorithm' not in self.ml_params: self.analysis_include_ml_aggregate_algo = False diff --git a/spras/evaluation.py b/spras/evaluation.py index a6e1a916..5d00e7d4 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -1,8 +1,7 @@ import os import pickle as pkl -import warnings from pathlib import Path -from typing import Iterable +from typing import Dict, Iterable import pandas as pd from sklearn.metrics import precision_score @@ -11,7 +10,7 @@ class Evaluation: NODE_ID = "NODEID" - def __init__(self, gold_standard_dict): + def __init__(self, gold_standard_dict: Dict): self.label = None self.datasets = None self.node_table = None @@ -19,6 +18,7 @@ def __init__(self, gold_standard_dict): self.load_files_from_dict(gold_standard_dict) return + @staticmethod def merge_gold_standard_input(gs_dict, gs_file): """ Merge files listed for this gold standard dataset and write the dataset to disk @@ -35,8 +35,8 @@ def to_file(self, file_name): with open(file_name, "wb") as f: pkl.dump(self, f) - @classmethod - def from_file(cls, file_name): + @staticmethod + def from_file(file_name): """ Loads gold standard object from a pickle file. Usage: gold_standard = Evaluation.from_file(pickle_file) @@ -44,7 +44,7 @@ def from_file(cls, file_name): with open(file_name, "rb") as f: return pkl.load(f) - def load_files_from_dict(self, gold_standard_dict): + def load_files_from_dict(self, gold_standard_dict: Dict): """ Loads gold standard files from gold_standard_dict, which is one gold standard dataset dictionary from the list in the config file with the fields in the config file. @@ -54,11 +54,11 @@ def load_files_from_dict(self, gold_standard_dict): returns: none """ - self.label = gold_standard_dict["label"] # cannot be empty, will break with a NoneType exception - self.datasets = gold_standard_dict["dataset_labels"] # can be empty, snakemake will not run evaluation due to dataset_gold_standard_pairs in snakemake file + self.label = gold_standard_dict["label"] # cannot be empty, will break with a NoneType exception + self.datasets = gold_standard_dict["dataset_labels"] # can be empty, snakemake will not run evaluation due to dataset_gold_standard_pairs in snakemake file # cannot be empty, snakemake will run evaluation even if empty - node_data_files = gold_standard_dict["node_files"][0] # TODO: single file for now + node_data_files = gold_standard_dict["node_files"][0] # TODO: single file for now data_loc = gold_standard_dict["data_dir"] @@ -71,6 +71,7 @@ def load_files_from_dict(self, gold_standard_dict): # TODO: later iteration - chose between node and edge file, or allow both + @staticmethod def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str): """ Takes in file paths for a specific dataset and an associated gold standard node table. @@ -84,12 +85,13 @@ def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: results = [] for file in file_paths: - df = pd.read_table(file, sep="\t", header = 0, usecols=["Node1", "Node2"]) + df = pd.read_table(file, sep="\t", header=0, usecols=["Node1", "Node2"]) y_pred = set(df['Node1']).union(set(df['Node2'])) all_nodes = y_true.union(y_pred) y_true_binary = [1 if node in y_true else 0 for node in all_nodes] y_pred_binary = [1 if node in y_pred else 0 for node in all_nodes] + # default to 0.0 if there is a divide by 0 error precision = precision_score(y_true_binary, y_pred_binary, zero_division=0.0) results.append({"Pathway": file, "Precision": precision}) diff --git a/spras/runner.py b/spras/runner.py index fbf5ad5d..6ef26496 100644 --- a/spras/runner.py +++ b/spras/runner.py @@ -42,6 +42,7 @@ def merge_input(dataset_dict, dataset_file): dataset = Dataset(dataset_dict) dataset.to_file(dataset_file) + def prepare_inputs(algorithm, data_file, filename_map): """ Prepare general dataset files for this algorithm diff --git a/test/test_config.py b/test/test_config.py index 51d61e15..a0178400 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -19,8 +19,8 @@ def get_test_config(): "reconstruction_dir": "my_dir" } }, - "datasets": [{"label":"alg1"}, {"label":"alg2"}], - "gold_standard": [{"label":"gs1", "dataset_labels":[]}], + "datasets": [{"label": "alg1"}, {"label": "alg2"}], + "gold_standard": [{"label": "gs1", "dataset_labels": []}], "algorithms": [{"params": ["param2", "param2"]}], "analysis": { "summary": { @@ -43,6 +43,7 @@ def get_test_config(): return test_raw_config + class TestConfig: """ Tests various parts of the configuration mechanism @@ -109,18 +110,36 @@ def test_config_container_registry(self): def test_error_dataset_label(self): test_config = get_test_config() - error_test_dicts = [{"label":"test$"}, {"label":"@test'"}, {"label":"[test]"}, {"label":"test-test"}, {"label":"✉"}] + error_test_dicts = [{"label": "test$"}, {"label": "@test'"}, {"label": "[test]"}, {"label": "test-test"}, + {"label": "✉"}] for test_dict in error_test_dicts: - test_config["datasets"]= [test_dict] - with pytest.raises(ValueError): #raises error if any chars other than letters, numbers, or underscores are in dataset label + test_config["datasets"] = [test_dict] + with pytest.raises(ValueError): # raises error if any chars other than letters, numbers, or underscores are in dataset label config.init_global(test_config) def test_correct_dataset_label(self): test_config = get_test_config() print(test_config) - correct_test_dicts = [{"label":"test"}, {"label":"123"}, {"label":"test123"}, {"label":"123test"}, {"label":"_"}, {"label":"test_test"}, {"label":"_test"}, {"label":"test_"}] + correct_test_dicts = [{"label": "test"}, {"label": "123"}, {"label": "test123"}, {"label": "123test"}, {"label": "_"}, + {"label": "test_test"}, {"label": "_test"}, {"label": "test_"}] for test_dict in correct_test_dicts: - test_config["datasets"]= [test_dict] - config.init_global(test_config) # no error should be raised + test_config["datasets"] = [test_dict] + config.init_global(test_config) # no error should be raised + + def test_error_gs_label(self): + test_config = get_test_config() + error_labels = ["test$", "@test'"] + + for test_label in error_labels: + test_config["gold_standard"][0]["label"] = test_label + with pytest.raises(ValueError): # raises error if any chars other than letters, numbers, or underscores are in gs label + config.init_global(test_config) + + def test_error_gs_dataset_mismatch(self): + test_config = get_test_config() + test_config["gold_standard"] = [{"label": "gs1", "dataset_labels": ["mismatch"]}] + + with pytest.raises(ValueError): + config.init_global(test_config) From 522e66c78f041ab975ca5a5b5655b2d8c64bb00c Mon Sep 17 00:00:00 2001 From: Anthony Gitter Date: Fri, 23 Aug 2024 12:32:25 -0500 Subject: [PATCH 18/18] Make gold standards plural in config --- config/config.yaml | 2 +- spras/config.py | 2 +- test/test_config.py | 7 +++---- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 04cd1655..b87bcd45 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -117,7 +117,7 @@ datasets: # Relative path from the spras directory data_dir: "input" -gold_standard: +gold_standards: - # Labels can only contain letters, numbers, or underscores label: gs0 diff --git a/spras/config.py b/spras/config.py index f6448a63..f7664a46 100644 --- a/spras/config.py +++ b/spras/config.py @@ -152,7 +152,7 @@ def process_config(self, raw_config): # parse gold standard information try: - self.gold_standards = {gold_standard["label"]: dict(gold_standard) for gold_standard in raw_config["gold_standard"]} + self.gold_standards = {gold_standard["label"]: dict(gold_standard) for gold_standard in raw_config["gold_standards"]} except: self.gold_standards = {} diff --git a/test/test_config.py b/test/test_config.py index a0178400..bf13cd6e 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -20,7 +20,7 @@ def get_test_config(): } }, "datasets": [{"label": "alg1"}, {"label": "alg2"}], - "gold_standard": [{"label": "gs1", "dataset_labels": []}], + "gold_standards": [{"label": "gs1", "dataset_labels": []}], "algorithms": [{"params": ["param2", "param2"]}], "analysis": { "summary": { @@ -120,7 +120,6 @@ def test_error_dataset_label(self): def test_correct_dataset_label(self): test_config = get_test_config() - print(test_config) correct_test_dicts = [{"label": "test"}, {"label": "123"}, {"label": "test123"}, {"label": "123test"}, {"label": "_"}, {"label": "test_test"}, {"label": "_test"}, {"label": "test_"}] @@ -133,13 +132,13 @@ def test_error_gs_label(self): error_labels = ["test$", "@test'"] for test_label in error_labels: - test_config["gold_standard"][0]["label"] = test_label + test_config["gold_standards"][0]["label"] = test_label with pytest.raises(ValueError): # raises error if any chars other than letters, numbers, or underscores are in gs label config.init_global(test_config) def test_error_gs_dataset_mismatch(self): test_config = get_test_config() - test_config["gold_standard"] = [{"label": "gs1", "dataset_labels": ["mismatch"]}] + test_config["gold_standards"] = [{"label": "gs1", "dataset_labels": ["mismatch"]}] with pytest.raises(ValueError): config.init_global(test_config)