diff --git a/Snakefile b/Snakefile index 3035aa58..c65e738b 100644 --- a/Snakefile +++ b/Snakefile @@ -35,7 +35,7 @@ def get_dataset(_datasets, label): algorithms = list(algorithm_params) algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()] dataset_labels = list(_config.config.datasets.keys()) -dataset_gold_standard_pairs = [f"{dataset}-{gs_values['label']}" for gs_values in _config.config.gold_standards.values() for dataset in gs_values['datasets']] +dataset_gold_standard_pairs = [f"{dataset}-{gs_values['label']}" for gs_values in _config.config.gold_standards.values() for dataset in gs_values['dataset_labels']] # Get algorithms that are running multiple parameter combinations def algo_has_mult_param_combos(algo): diff --git a/config/config.yaml b/config/config.yaml index ce832f15..20d2e4cd 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -119,17 +119,18 @@ datasets: gold_standard: - + # Labels can only contain letters, numbers, or underscores label: gs0 node_files: ["gs_nodes0.txt"] # edge_files: [] TODO: later iteration data_dir: "input" # Set of datasets (dataset labels) to compare with the specific gold standard dataset - datasets: ["data0"] + dataset_labels: ["data0"] - label: gs1 node_files: ["gs_nodes1.txt"] data_dir: "input" - datasets: ["data1", "data0"] + dataset_labels: ["data1", "data0"] # If we want to reconstruct then we should set run to true. # TODO: if include is true above but run is false here, algs are not run. diff --git a/spras/config.py b/spras/config.py index 3fad39e9..8fa96694 100644 --- a/spras/config.py +++ b/spras/config.py @@ -151,16 +151,25 @@ def process_config(self, raw_config): if not bool(re.match(pattern, key)): raise ValueError(f"Dataset label \'{key}\' contains invalid values. 
Dataset labels can only contain letters, numbers, or underscores.") + # parse gold standard information try: self.gold_standards = {gold_standard["label"]: dict(gold_standard) for gold_standard in raw_config["gold_standard"]} except: self.gold_standards = {} + # check that gold_standard labels are formatted correctly for key in self.gold_standards: pattern = r'^\w+$' if not bool(re.match(pattern, key)): raise ValueError(f"Gold standard label \'{key}\' contains invalid values. Gold standard labels can only contain letters, numbers, or underscores.") + # check that all the dataset labels in the gold standards are existing dataset labels + dataset_labels = set(self.datasets.keys()) + gold_standard_dataset_labels = {dataset_label for value in self.gold_standards.values() for dataset_label in value['dataset_labels']} + for label in gold_standard_dataset_labels: + if label not in dataset_labels: + raise ValueError(f"Dataset label '{label}' provided in gold standards does not exist in the existing dataset labels.") + # Code snipped from Snakefile that may be useful for assigning default labels # dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)] # Maps from the dataset label to the dataset list index @@ -242,11 +251,8 @@ def process_config(self, raw_config): self.analysis_include_ml = raw_config["analysis"]["ml"]["include"] self.analysis_include_evalution = raw_config["analysis"]["evaluation"]["include"] - # COMMENT: the code will run correctly without this section below due to empty dict in try except above - # TODO: decide if this part is needed if self.gold_standards == {} and self.analysis_include_evalution == True: - print("Gold standard data not provided. Evaluation analysis cannot run.") - self.analysis_include_evalution = False + raise ValueError("Evaluation analysis cannot run as gold standard data is not provided. 
Please set evaluation include to false or provide gold standard data.") if 'aggregate_per_algorithm' not in self.ml_params: self.analysis_include_ml_aggregate_algo = False diff --git a/spras/evaluation.py b/spras/evaluation.py index 66893f07..a6e1a916 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -54,10 +54,10 @@ def load_files_from_dict(self, gold_standard_dict): returns: none """ - self.label = gold_standard_dict["label"] # COMMENT: cannot be empty, will break with a NoneType exception - self.datasets = gold_standard_dict["datasets"] # COMMENT: can be empty, snakemake will not run evaluation due to dataset_gold_standard_pairs in snakemake file + self.label = gold_standard_dict["label"] # cannot be empty, will break with a NoneType exception + self.datasets = gold_standard_dict["dataset_labels"] # can be empty, snakemake will not run evaluation due to dataset_gold_standard_pairs in snakemake file - # COMMENT: cannot be empty, snakemake will run evaluation even if empty + # cannot be empty, snakemake will run evaluation even if empty node_data_files = gold_standard_dict["node_files"][0] # TODO: single file for now data_loc = gold_standard_dict["data_dir"] @@ -80,13 +80,13 @@ def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: @param node_table: the gold standard nodes @param output_file: the filename to save the precision of each pathway """ - y_true = node_table['NODEID'].tolist() + y_true = set(node_table['NODEID']) results = [] for file in file_paths: df = pd.read_table(file, sep="\t", header = 0, usecols=["Node1", "Node2"]) - y_pred = list(set(df['Node1']).union(set(df['Node2']))) - all_nodes = set(y_true).union(set(y_pred)) + y_pred = set(df['Node1']).union(set(df['Node2'])) + all_nodes = y_true.union(y_pred) y_true_binary = [1 if node in y_true else 0 for node in all_nodes] y_pred_binary = [1 if node in y_pred else 0 for node in all_nodes]