From 6de1811eb658221bcf06d74c30233f601c8a68ff Mon Sep 17 00:00:00 2001 From: sapiris Date: Sun, 11 Dec 2022 22:42:23 +0200 Subject: [PATCH 1/2] Adjusting the import and default paths to the current directories --- conf/minimal-configuration-script.json | 55 ++++++++ conf/minimal-configuration.json | 5 +- graph_generation/generate_neo4j_multi_hpf.py | 10 +- grim/grim.py | 32 +++-- grim/run_impute_def.py | 127 +++++++++++++++++++ scripts/parallel-imputation.py | 12 +- scripts/runfile.py | 14 +- 7 files changed, 227 insertions(+), 28 deletions(-) create mode 100644 conf/minimal-configuration-script.json create mode 100644 grim/run_impute_def.py diff --git a/conf/minimal-configuration-script.json b/conf/minimal-configuration-script.json new file mode 100644 index 0000000..45ddca5 --- /dev/null +++ b/conf/minimal-configuration-script.json @@ -0,0 +1,55 @@ +{ + "populations": [ + "CAU" + ], + "freq_trim_threshold": 1e-5, + "priority": { + "alpha": 0.4999999, + "eta": 0, + "beta": 1e-7, + "gamma": 1e-7, + "delta": 0.4999999 + }, + "UNK_priors": "SR", + "FULL_LOCI": "ABCQR", + "loci_map": { + "A": 1, + "B": 2, + "C": 3, + "DQB1": 4, + "DRB1": 5 + }, + + "factor_missing_data": 0.0001, + "Plan_B_Matrix": [ + [[1, 2, 3, 4, 5]], + [[1, 2, 3], [4, 5]], + [[1], [2, 3], [4, 5]], + [[1, 2, 3], [4], [5]], + [[1], [2, 3], [4], [5]], + [[1], [2], [3], [4], [5]] + ], + "planb": true, + "number_of_options_threshold": 100000, + "epsilon": 1e-3, + "number_of_results": 10, + "number_of_pop_results": 100, + "output_MUUG": true, + "output_haplotypes": true, + "freq_data_dir": "data/freqs" , + "freq_file": "output/hpf.csv" , + "graph_files_path": "graph_generation/output/csv/" , + "node_csv_file": "nodes.csv", + "edges_csv_file": "edges.csv", + "info_node_csv_file": "info_node.csv", + "top_links_csv_file": "top_links.csv", + "imputation_in_file": "data/subjects/donor.csv", + "imputation_out_umug_freq_filename": "don.umug", + "imputation_out_umug_pops_filename": "don.umug.pops", + "imputation_out_hap_freq_filename": "don.pmug", + "imputation_out_hap_pops_filename": "don.pmug.pops", + "imputation_out_miss_filename": "don.miss", + "imputation_out_problem_filename": "don.problem", + "max_haplotypes_number_in_phase": 100, + "imuptation_out_path": "output" +} diff --git a/conf/minimal-configuration.json b/conf/minimal-configuration.json index 4dc8590..96565a3 100644 --- a/conf/minimal-configuration.json +++ b/conf/minimal-configuration.json @@ -37,9 +37,8 @@ "output_MUUG": true, "output_haplotypes": true, "freq_data_dir": "data/freqs" , - "pops_count_file": "graph_generation/output/pop_ratio.txt" , - "freq_file": "graph_generation/output/hpf.csv" , - "graph_files_path": "graph_generation/output/csv/" , + "freq_file": "output/hpf.csv" , + "graph_files_path": "output/csv/" , "node_csv_file": "nodes.csv", "edges_csv_file": "edges.csv", "info_node_csv_file": "info_node.csv", diff --git a/graph_generation/generate_neo4j_multi_hpf.py b/graph_generation/generate_neo4j_multi_hpf.py index abb3c87..d32a63d 100644 --- a/graph_generation/generate_neo4j_multi_hpf.py +++ b/graph_generation/generate_neo4j_multi_hpf.py @@ -205,7 +205,7 @@ def loci_order(loc_values): def generate_graph( - config_file="../../conf/minimal-configuration.json", em_pop=None, em=False + config_file="../conf/minimal-configuration-script.json", em_pop=None, em=False, use_default_path = False ): ############################################################################## # Configure @@ -215,7 +215,9 @@ def generate_graph( # Input file # freq_file = path + freq_file - + path = "" + if use_default_path: + path = os.path.dirname(os.path.realpath(__file__)) + "/" parser = argparse.ArgumentParser() parser.add_argument( "-c", @@ -243,10 +245,10 @@ def generate_graph( pops = em_pop freq_trim = conf.get("freq_trim_threshold") - freq_file = conf.get("freq_file") + freq_file = path + conf.get("freq_file") dict_count_of_pop = {} - pop_ratio_dir = conf.get("pops_count_file") + pop_ratio_dir = path + conf.get("pops_count_file", "") path = pathlib.Path(pop_ratio_dir) if em or not path.is_file(): diff --git a/grim/grim.py b/grim/grim.py index 7660e20..039ec81 100644 --- a/grim/grim.py +++ b/grim/grim.py @@ -23,39 +23,53 @@ # -from .validation import runfile -from .imputation.graph_generation import generate_neo4j_multi_hpf + + from .imputation.impute import Imputation from .imputation.networkx_graph import Graph + +import sys import os +# adding Folder_2 to the system path +sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)).replace("/grim", "")) + + +from graph_generation import generate_neo4j_multi_hpf +from grim.run_impute_def import run_impute + + def graph_freqs(conf_file="", for_em=False, em_pop=None): + use_default_path = False if conf_file == "": + use_default_path = True conf_file = ( - os.path.dirname(os.path.realpath(__file__)) + os.path.dirname(os.path.realpath(__file__)).replace("/grim", "") + "/conf/minimal-configuration.json" ) generate_neo4j_multi_hpf.generate_graph( - config_file=conf_file, em_pop=em_pop, em=for_em + config_file=conf_file, em_pop=em_pop, em=for_em, use_default_path = use_default_path ) def impute(conf_file=""): + project_dir_in_file, project_dir_graph = "", "" if conf_file == "": + conf_file = ( - os.path.dirname(os.path.realpath(__file__)) + os.path.dirname(os.path.realpath(__file__)).replace("/grim", "") + "/conf/minimal-configuration.json" ) project_dir_graph = ( - os.path.dirname(os.path.realpath(__file__)) - + "/imputation/graph_generation/" + os.path.dirname(os.path.realpath(__file__)).replace("/grim", "") + + "/graph_generation/" ) - project_dir_in_file = os.path.dirname(os.path.realpath(__file__)) + "/" - runfile.run_impute(conf_file, project_dir_graph, project_dir_in_file) + project_dir_in_file = os.path.dirname(os.path.realpath(__file__)).replace("/grim", "") + "/" + run_impute(conf_file, project_dir_graph, project_dir_in_file) def impute_instance(config, graph, count_by_prob=None): diff --git a/grim/run_impute_def.py b/grim/run_impute_def.py new file mode 100644 index 0000000..4f9a22b --- /dev/null +++ b/grim/run_impute_def.py @@ -0,0 +1,127 @@ +import argparse +import cProfile +import json +import pathlib +import sys +import os + +sys.path.insert(0, os.path.dirname(os.path.realpath(__file__))) + +from .imputation.impute import Imputation +from .imputation.networkx_graph import Graph + +# Profiler start +#pr = cProfile.Profile() +#pr.enable() + +def run_impute(conf_file = "../conf/minimal-configuration.json", project_dir_graph = "", project_dir_in_file = ""): + + configuration_file = conf_file + + #project_dir = ""# "../" + #output_dir = "output/" + + + # Read configuration file and load properties + with open(configuration_file) as f: + json_conf = json.load(f) + + graph_files_path = json_conf.get("graph_files_path") + if graph_files_path[-1] != '/': + graph_files_path += '/' + output_dir = json_conf.get("imuptation_out_path", "output") + if output_dir[-1] != '/': + output_dir += '/' + config = { + "planb": json_conf.get('planb', True), + "pops": json_conf.get('populations'), + "priority": json_conf.get('priority'), + "epsilon": json_conf.get('epsilon', 1e-3), + "number_of_results": json_conf.get('number_of_results', 1000), + "number_of_pop_results": json_conf.get('number_of_pop_results', 100), + "output_MUUG": json_conf.get("output_MUUG", True), + "output_haplotypes": json_conf.get("output_haplotypes", False), + "node_file": project_dir_graph + graph_files_path + json_conf.get("node_csv_file"), + "top_links_file": project_dir_graph + graph_files_path + json_conf.get("top_links_csv_file"), + "edges_file": project_dir_graph + graph_files_path +json_conf.get("edges_csv_file"), + "imputation_input_file": project_dir_in_file + json_conf.get("imputation_in_file"), + "imputation_out_umug_freq_file": output_dir + json_conf.get("imputation_out_umug_freq_filename"), + "imputation_out_umug_pops_file": output_dir + json_conf.get("imputation_out_umug_pops_filename"), + "imputation_out_hap_freq_file": output_dir + json_conf.get("imputation_out_hap_freq_filename"), + "imputation_out_hap_pops_file": output_dir + json_conf.get("imputation_out_hap_pops_filename"), + "imputation_out_miss_file": output_dir + json_conf.get("imputation_out_miss_filename"), + "imputation_out_problem_file": output_dir + json_conf.get("imputation_out_problem_filename"), + "factor_missing_data": json_conf.get("factor_missing_data", 0.01), + "loci_map": json_conf.get("loci_map", {"A": 1, "B":3, "C": 2, "DQB1": 4, "DRB1": 5} ), + "matrix_planb": json_conf.get("Plan_B_Matrix", [ + [[1, 2, 3, 4, 5]], + [[1, 2, 3], [4, 5]], + [[1], [2, 3], [4, 5]], + [[1, 2, 3], [4], [5]], + [[1], [2, 3], [4], [5]], + [[1], [2], [3], [4], [5]] + ]), + "pops_count_file": project_dir_graph + json_conf.get("pops_count_file",'' ), + "use_pops_count_file": json_conf.get("pops_count_file",False), + "number_of_options_threshold": json_conf.get("number_of_options_threshold", 100000), + "max_haplotypes_number_in_phase": json_conf.get("max_haplotypes_number_in_phase",100 ), + "bin_imputation_input_file": project_dir_in_file + json_conf.get("bin_imputation_in_file", "None"), + "nodes_for_plan_A": json_conf.get("Plan_A_Matrix", []), + "save_mode": json_conf.get("save_space_mode", False), + "UNK_priors" : json_conf.get("UNK_priors", "MR") + + } + + # Display the configurations we are using + print('****************************************************************************************************') + print("Performing imputation based on:") + print("\tPopulation: {}".format(config["pops"])) + print("\tPriority: {}".format(config["priority"])) + print("\tUNK priority: {}".format(config["UNK_priors"])) + print("\tEpsilon: {}".format(config["epsilon"])) + print("\tPlan B: {}".format(config["planb"])) + print("\tNumber of Results: {}".format(config["number_of_results"])) + print("\tNumber of Population Results: {}".format(config["number_of_pop_results"])) + print("\tNodes File: {}".format(config["node_file"])) + print("\tTop Links File: {}".format(config["edges_file"])) + print("\tInput File: {}".format(config["imputation_input_file"])) + print("\tOutput UMUG Format: {}".format(config["output_MUUG"])) + print("\tOutput UMUG Freq Filename: {}".format(config["imputation_out_umug_freq_file"])) + print("\tOutput UMUG Pops Filename: {}".format(config["imputation_out_umug_pops_file"])) + print("\tOutput Haplotype Format: {}".format(config["output_haplotypes"])) + print("\tOutput HAP Freq Filename: {}".format(config["imputation_out_hap_freq_file"])) + print("\tOutput HAP Pops Filename: {}".format(config["imputation_out_hap_pops_file"])) + print("\tOutput Miss Filename: {}".format(config["imputation_out_miss_file"])) + print("\tOutput Problem Filename: {}".format(config["imputation_out_problem_file"])) + print("\tFactor Missing Data: {}".format(config["factor_missing_data"])) + print("\tLoci Map: {}".format(config["loci_map"])) + print("\tPlan B Matrix: {}".format(config["matrix_planb"])) + print("\tPops Count File: {}".format(config["pops_count_file"])) + print("\tUse Pops Count File: {}".format(config["use_pops_count_file"])) + print("\tNumber of Options Threshold: {}".format(config["number_of_options_threshold"])) + print("\tMax Number of haplotypes in phase: {}".format(config["max_haplotypes_number_in_phase"])) + if config["nodes_for_plan_A"]: + print("\tNodes in plan A: {}".format(config["nodes_for_plan_A"])) + print("\tSave space mode: {}".format(config["save_mode"])) + print('****************************************************************************************************') + + + all_loci_set = set() + for _, val in config["loci_map"].items(): + all_loci_set.add(str(val)) + + config["full_loci"] = ''.join(sorted(all_loci_set)) + # Perform imputation + graph = Graph(config) + graph.build_graph(config["node_file"], config["top_links_file"], config["edges_file"]) + imputation = Imputation(graph, config) + + # Create output directory if it doesn't exist + pathlib.Path(output_dir).mkdir(parents=False, exist_ok=True) + + # Write out the results from imputation + imputation.impute_file(config) + + # Profiler end + #pr.disable() + #pr.print_stats(sort="time") diff --git a/scripts/parallel-imputation.py b/scripts/parallel-imputation.py index 7efc7cb..3ad0404 100755 --- a/scripts/parallel-imputation.py +++ b/scripts/parallel-imputation.py @@ -8,10 +8,10 @@ import pathlib from multiprocessing.pool import Pool -from imputegl import Imputation +from grim.imputation.impute import Imputation from imputegl.impute import write_best_prob, write_best_prob_genotype -from imputegl.networkx_graph import Graph +from grim.imputation.networkx_graph import Graph # Profiler start pr = cProfile.Profile() @@ -22,7 +22,7 @@ "-c", "--config", required=False, - default="../minimal-configuration.json", + default="../minimal-configuration-script.json", help="Configuration JSON file", type=str, ) @@ -56,9 +56,9 @@ "number_of_pop_results": json_conf.get("number_of_pop_results", 100), "output_MUUG": json_conf.get("output_MUUG", True), "output_haplotypes": json_conf.get("output_haplotypes", False), - "node_file": project_dir + json_conf.get("node_csv_file"), - "top_links_file": project_dir + json_conf.get("top_links_csv_file"), - "edges_file": project_dir + json_conf.get("edges_csv_file"), + "node_file": project_dir + json_conf.get("graph_files_path") + json_conf.get("node_csv_file"), + "top_links_file": project_dir + json_conf.get("graph_files_path") + json_conf.get("top_links_csv_file"), + "edges_file": project_dir + json_conf.get("graph_files_path") + json_conf.get("edges_csv_file"), "imputation_input_file": project_dir + json_conf.get("imputation_in_file"), "imputation_out_umug_freq_file": output_dir + json_conf.get("imputation_out_umug_freq_filename"), diff --git a/scripts/runfile.py b/scripts/runfile.py index ab0d22a..94c0c56 100755 --- a/scripts/runfile.py +++ b/scripts/runfile.py @@ -6,6 +6,8 @@ import sys import os +sys.path.insert(0, os.path.join("..")) + from grim.imputation.impute import Imputation from grim.imputation.networkx_graph import Graph @@ -18,7 +20,7 @@ "-c", "--config", required=False, - default="../conf/minimal-configuration.json", + default="../conf/minimal-configuration-script.json", help="Configuration JSON file", type=str, ) @@ -28,6 +30,7 @@ # read the config file output_dir = "output/" +project_dir = "../" # Read configuration file and load properties with open(configuration_file) as f: @@ -42,11 +45,10 @@ "number_of_pop_results": json_conf.get("number_of_pop_results", 100), "output_MUUG": json_conf.get("output_MUUG", True), "output_haplotypes": json_conf.get("output_haplotypes", False), - "node_file": json_conf.get("graph_files_path") + json_conf.get("node_csv_file"), - "top_links_file": json_conf.get("graph_files_path") - + json_conf.get("top_links_csv_file"), - "edges_file": json_conf.get("graph_files_path") + json_conf.get("edges_csv_file"), - "imputation_input_file": json_conf.get("imputation_in_file"), + "node_file": project_dir + json_conf.get("graph_files_path") + json_conf.get("node_csv_file"), + "top_links_file": project_dir + json_conf.get("graph_files_path") + json_conf.get("top_links_csv_file"), + "edges_file": project_dir + json_conf.get("graph_files_path") + json_conf.get("edges_csv_file"), + "imputation_input_file": project_dir + json_conf.get("imputation_in_file"), "imputation_out_umug_freq_file": output_dir + json_conf.get("imputation_out_umug_freq_filename"), "imputation_out_umug_pops_file": output_dir From 0b5e8880299a0f16238617f1b753dbe809fd7228 Mon Sep 17 00:00:00 2001 From: sapiris Date: Mon, 12 Dec 2022 12:14:40 +0200 Subject: [PATCH 2/2] Adjusting the import and default paths to the current directories --- MANIFEST.in | 4 ++++ setup.py | 9 +++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index bc6d0e1..f979886 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -7,3 +7,7 @@ recursive-include grim *.txt recursive-include grim *.json recursive-include grim *.pyx recursive-include grim *.pyd +recursive-include conf *.json +recursive-include data *.csv +recursive-include graph_generation *.csv +recursive-include graph_generation *.py diff --git a/setup.py b/setup.py index 8c5a4f4..2856644 100644 --- a/setup.py +++ b/setup.py @@ -77,10 +77,11 @@ include=[ "grim", "grim.imputation", - "grim.imputation.imputegl", - "grim.imputation.graph_generation", - "grim.validation", - "grim.conf", + "graph_generation", + "graph_generation.output", + "data", + "data.subjects", + "conf" ] ), test_suite="tests",