Skip to content

Commit

Permalink
made changes based on comments
Browse files Browse the repository at this point in the history
  • Loading branch information
ntalluri committed Aug 20, 2024
1 parent 1bb19e3 commit 1efd150
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 13 deletions.
2 changes: 1 addition & 1 deletion Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def get_dataset(_datasets, label):
algorithms = list(algorithm_params)
algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()]
dataset_labels = list(_config.config.datasets.keys())
dataset_gold_standard_pairs = [f"{dataset}-{gs_values['label']}" for gs_values in _config.config.gold_standards.values() for dataset in gs_values['datasets']]
dataset_gold_standard_pairs = [f"{dataset}-{gs_values['label']}" for gs_values in _config.config.gold_standards.values() for dataset in gs_values['dataset_labels']]

# Get algorithms that are running multiple parameter combinations
def algo_has_mult_param_combos(algo):
Expand Down
6 changes: 4 additions & 2 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -119,17 +119,19 @@ datasets:

gold_standard:
-
# Labels can only contain letters, numbers, or underscores
label: data0
label: gs0
node_files: ["gs_nodes0.txt"]
# edge_files: [] TODO: later iteration
data_dir: "input"
# Set of datasets (dataset labels) to compare with the specific gold standard dataset
datasets: ["data0"]
dataset_labels: ["data0"]
-
label: gs1
node_files: ["gs_nodes1.txt"]
data_dir: "input"
datasets: ["data1", "data0"]
dataset_labels: ["data1", "data0"]

# If we want to reconstruct then we should set run to true.
# TODO: if include is true above but run is false here, algs are not run.
Expand Down
14 changes: 10 additions & 4 deletions spras/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,16 +151,25 @@ def process_config(self, raw_config):
if not bool(re.match(pattern, key)):
raise ValueError(f"Dataset label \'{key}\' contains invalid values. Dataset labels can only contain letters, numbers, or underscores.")

# parse gold standard information
try:
self.gold_standards = {gold_standard["label"]: dict(gold_standard) for gold_standard in raw_config["gold_standard"]}
except:
self.gold_standards = {}

# check that gold_standard labels are formatted correctly
for key in self.gold_standards:
pattern = r'^\w+$'
if not bool(re.match(pattern, key)):
raise ValueError(f"Gold standard label \'{key}\' contains invalid values. Gold standard labels can only contain letters, numbers, or underscores.")

# check that every dataset label referenced by a gold standard is an existing dataset label
dataset_labels = set(self.datasets.keys())
gold_standard_dataset_labels = {dataset_label for value in self.gold_standards.values() for dataset_label in value['dataset_labels']}
for label in gold_standard_dataset_labels:
if label not in dataset_labels:
raise ValueError(f"Dataset label '{label}' provided in gold standards does not exist in the existing dataset labels.")

# Code snipped from Snakefile that may be useful for assigning default labels
# dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)]
# Maps from the dataset label to the dataset list index
Expand Down Expand Up @@ -242,11 +251,8 @@ def process_config(self, raw_config):
self.analysis_include_ml = raw_config["analysis"]["ml"]["include"]
self.analysis_include_evalution = raw_config["analysis"]["evaluation"]["include"]

# COMMENT: the code will run correctly without this section below due to empty dict in try except above
# TODO: decide if this part is needed
if self.gold_standards == {} and self.analysis_include_evalution == True:
print("Gold standard data not provided. Evaluation analysis cannot run.")
self.analysis_include_evalution = False
raise ValueError("Evaluation analysis cannot run as gold standard data not provided. Please set evaluation include to false or provide gold standard data.")

if 'aggregate_per_algorithm' not in self.ml_params:
self.analysis_include_ml_aggregate_algo = False
Expand Down
12 changes: 6 additions & 6 deletions spras/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,10 @@ def load_files_from_dict(self, gold_standard_dict):
returns: none
"""
self.label = gold_standard_dict["label"] # COMMENT: cannot be empty, will break with a NoneType exception
self.datasets = gold_standard_dict["datasets"] # COMMENT: can be empty, snakemake will not run evaluation due to dataset_gold_standard_pairs in snakemake file
self.label = gold_standard_dict["label"] # cannot be empty, will break with a NoneType exception
self.datasets = gold_standard_dict["dataset_labels"] # can be empty, snakemake will not run evaluation due to dataset_gold_standard_pairs in snakemake file

# COMMENT: cannot be empty, snakemake will run evaluation even if empty
# cannot be empty, snakemake will run evaluation even if empty
node_data_files = gold_standard_dict["node_files"][0] # TODO: single file for now

data_loc = gold_standard_dict["data_dir"]
Expand All @@ -80,13 +80,13 @@ def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file:
@param node_table: the gold standard nodes
@param output_file: the filename to save the precision of each pathway
"""
y_true = node_table['NODEID'].tolist()
y_true = set(node_table['NODEID'])
results = []

for file in file_paths:
df = pd.read_table(file, sep="\t", header = 0, usecols=["Node1", "Node2"])
y_pred = list(set(df['Node1']).union(set(df['Node2'])))
all_nodes = set(y_true).union(set(y_pred))
y_pred = set(df['Node1']).union(set(df['Node2']))
all_nodes = y_true.union(y_pred)
y_true_binary = [1 if node in y_true else 0 for node in all_nodes]
y_pred_binary = [1 if node in y_pred else 0 for node in all_nodes]

Expand Down

0 comments on commit 1efd150

Please sign in to comment.