Skip to content

Commit

Permalink
Merge branch 'sapiris-master'
Browse files Browse the repository at this point in the history
  • Loading branch information
pbashyal-nmdp committed Dec 12, 2022
2 parents a08fd16 + 289f912 commit 6b5db7d
Show file tree
Hide file tree
Showing 9 changed files with 236 additions and 32 deletions.
4 changes: 4 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,7 @@ recursive-include grim *.txt
recursive-include grim *.json
recursive-include grim *.pyx
recursive-include grim *.pyd
recursive-include conf *.json
recursive-include data *.csv
recursive-include graph_generation *.csv
recursive-include graph_generation *.py
55 changes: 55 additions & 0 deletions conf/minimal-configuration-script.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
{
"populations": [
"CAU"
],
"freq_trim_threshold": 1e-5,
"priority": {
"alpha": 0.4999999,
"eta": 0,
"beta": 1e-7,
"gamma": 1e-7,
"delta": 0.4999999
},
"UNK_priors": "SR",
"FULL_LOCI": "ABCQR",
"loci_map": {
"A": 1,
"B": 2,
"C": 3,
"DQB1": 4,
"DRB1": 5
},

"factor_missing_data": 0.0001,
"Plan_B_Matrix": [
[[1, 2, 3, 4, 5]],
[[1, 2, 3], [4, 5]],
[[1], [2, 3], [4, 5]],
[[1, 2, 3], [4], [5]],
[[1], [2, 3], [4], [5]],
[[1], [2], [3], [4], [5]]
],
"planb": true,
"number_of_options_threshold": 100000,
"epsilon": 1e-3,
"number_of_results": 10,
"number_of_pop_results": 100,
"output_MUUG": true,
"output_haplotypes": true,
"freq_data_dir": "data/freqs" ,
"freq_file": "output/hpf.csv" ,
"graph_files_path": "graph_generation/output/csv/" ,
"node_csv_file": "nodes.csv",
"edges_csv_file": "edges.csv",
"info_node_csv_file": "info_node.csv",
"top_links_csv_file": "top_links.csv",
"imputation_in_file": "data/subjects/donor.csv",
"imputation_out_umug_freq_filename": "don.umug",
"imputation_out_umug_pops_filename": "don.umug.pops",
"imputation_out_hap_freq_filename": "don.pmug",
"imputation_out_hap_pops_filename": "don.pmug.pops",
"imputation_out_miss_filename": "don.miss",
"imputation_out_problem_filename": "don.problem",
"max_haplotypes_number_in_phase": 100,
"imuptation_out_path": "output"
}
5 changes: 2 additions & 3 deletions conf/minimal-configuration.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,8 @@
"output_MUUG": true,
"output_haplotypes": true,
"freq_data_dir": "data/freqs" ,
"pops_count_file": "graph_generation/output/pop_ratio.txt" ,
"freq_file": "graph_generation/output/hpf.csv" ,
"graph_files_path": "graph_generation/output/csv/" ,
"freq_file": "output/hpf.csv" ,
"graph_files_path": "output/csv/" ,
"node_csv_file": "nodes.csv",
"edges_csv_file": "edges.csv",
"info_node_csv_file": "info_node.csv",
Expand Down
10 changes: 6 additions & 4 deletions graph_generation/generate_neo4j_multi_hpf.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def loci_order(loc_values):


def generate_graph(
config_file="../../conf/minimal-configuration.json", em_pop=None, em=False
config_file="../conf/minimal-configuration-script.json", em_pop=None, em=False, use_default_path = False
):
##############################################################################
# Configure
Expand All @@ -215,7 +215,9 @@ def generate_graph(

# Input file
# freq_file = path + freq_file

path = ""
if use_default_path:
path = os.path.dirname(os.path.realpath(__file__)) + "/"
parser = argparse.ArgumentParser()
parser.add_argument(
"-c",
Expand Down Expand Up @@ -243,10 +245,10 @@ def generate_graph(
pops = em_pop
freq_trim = conf.get("freq_trim_threshold")

freq_file = conf.get("freq_file")
freq_file = path + conf.get("freq_file")
dict_count_of_pop = {}

pop_ratio_dir = conf.get("pops_count_file")
pop_ratio_dir = path + conf.get("pops_count_file", "")
path = pathlib.Path(pop_ratio_dir)

if em or not path.is_file():
Expand Down
32 changes: 23 additions & 9 deletions grim/grim.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,39 +23,53 @@
#


from .validation import runfile
from .imputation.graph_generation import generate_neo4j_multi_hpf



from .imputation.impute import Imputation
from .imputation.networkx_graph import Graph

import sys
import os

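# Add the repository root (the parent of the grim package directory) to sys.path so the top-level graph_generation package can be imported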
sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)).replace("/grim", ""))


from graph_generation import generate_neo4j_multi_hpf
from grim.run_impute_def import run_impute



def graph_freqs(conf_file="", for_em=False, em_pop=None):
    """Generate the frequency graph files via ``generate_neo4j_multi_hpf``.

    Args:
        conf_file: Path to the JSON configuration file. When left as the
            empty string, ``conf/minimal-configuration.json`` under the
            repository root (the parent of this package's directory) is used
            and ``use_default_path=True`` is forwarded to the generator.
        for_em: Forwarded to ``generate_graph`` as ``em``.
        em_pop: Forwarded to ``generate_graph`` as ``em_pop``.
    """
    use_default_path = conf_file == ""
    if use_default_path:
        # Take the parent of the package directory explicitly. A textual
        # .replace("/grim", "") would also strip "/grim" from any earlier
        # path component (e.g. "/home/grim-user/...").
        repo_root = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
        conf_file = repo_root + "/conf/minimal-configuration.json"

    generate_neo4j_multi_hpf.generate_graph(
        config_file=conf_file,
        em_pop=em_pop,
        em=for_em,
        use_default_path=use_default_path,
    )


def impute(conf_file=""):
    """Run file-based imputation through ``run_impute``.

    Args:
        conf_file: Path to the JSON configuration file. When left as the
            empty string, ``conf/minimal-configuration.json`` under the
            repository root is used, and the graph and input-file path
            prefixes are pointed at the repository's ``graph_generation/``
            directory and the repository root respectively. When a path is
            given, both prefixes stay empty, so paths in the configuration
            are used as written.
    """
    project_dir_in_file, project_dir_graph = "", ""
    if conf_file == "":
        # Take the parent of the package directory explicitly. A textual
        # .replace("/grim", "") would also strip "/grim" from any earlier
        # path component (e.g. "/home/grim-user/...").
        repo_root = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
        conf_file = repo_root + "/conf/minimal-configuration.json"
        # Trailing slashes matter: run_impute concatenates these prefixes
        # directly with the relative paths from the configuration.
        project_dir_graph = repo_root + "/graph_generation/"
        project_dir_in_file = repo_root + "/"
    run_impute(conf_file, project_dir_graph, project_dir_in_file)


def impute_instance(config, graph, count_by_prob=None):
Expand Down
127 changes: 127 additions & 0 deletions grim/run_impute_def.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import argparse
import cProfile
import json
import pathlib
import sys
import os

sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)))

from .imputation.impute import Imputation
from .imputation.networkx_graph import Graph

# Profiler start
#pr = cProfile.Profile()
#pr.enable()

def _with_trailing_slash(path):
    """Return *path* with "/" appended when it is non-empty and lacks one."""
    if path and not path.endswith("/"):
        return path + "/"
    return path


def run_impute(conf_file="../conf/minimal-configuration.json", project_dir_graph="", project_dir_in_file=""):
    """Run imputation as described by a JSON configuration file.

    Reads the configuration, prints the effective settings, builds the
    ``Graph`` from the node / top-links / edges CSV files, creates the output
    directory if needed and calls ``Imputation.impute_file``.

    Args:
        conf_file: Path to the JSON configuration file.
        project_dir_graph: Prefix prepended to the graph CSV paths and to
            ``pops_count_file``.
        project_dir_in_file: Prefix prepended to ``imputation_in_file`` and
            ``bin_imputation_in_file``.

    Raises:
        TypeError: If a file-name key that has no default (for example
            ``node_csv_file`` or ``imputation_in_file``) is missing from the
            configuration, because its ``None`` value is concatenated to a
            string.
    """
    # Read configuration file and load properties
    with open(conf_file) as f:
        json_conf = json.load(f)

    # A missing or empty graph_files_path means "no sub-directory"; indexing
    # [-1] on it would raise instead.
    graph_files_path = _with_trailing_slash(json_conf.get("graph_files_path") or "")
    # NOTE: the key is spelled "imuptation_out_path" in the configuration
    # files, so the spelling must be kept here.
    output_dir = _with_trailing_slash(json_conf.get("imuptation_out_path", "output") or "")
    graph_prefix = project_dir_graph + graph_files_path

    config = {
        "planb": json_conf.get("planb", True),
        "pops": json_conf.get("populations"),
        "priority": json_conf.get("priority"),
        "epsilon": json_conf.get("epsilon", 1e-3),
        "number_of_results": json_conf.get("number_of_results", 1000),
        "number_of_pop_results": json_conf.get("number_of_pop_results", 100),
        "output_MUUG": json_conf.get("output_MUUG", True),
        "output_haplotypes": json_conf.get("output_haplotypes", False),
        "node_file": graph_prefix + json_conf.get("node_csv_file"),
        "top_links_file": graph_prefix + json_conf.get("top_links_csv_file"),
        "edges_file": graph_prefix + json_conf.get("edges_csv_file"),
        "imputation_input_file": project_dir_in_file + json_conf.get("imputation_in_file"),
        "imputation_out_umug_freq_file": output_dir + json_conf.get("imputation_out_umug_freq_filename"),
        "imputation_out_umug_pops_file": output_dir + json_conf.get("imputation_out_umug_pops_filename"),
        "imputation_out_hap_freq_file": output_dir + json_conf.get("imputation_out_hap_freq_filename"),
        "imputation_out_hap_pops_file": output_dir + json_conf.get("imputation_out_hap_pops_filename"),
        "imputation_out_miss_file": output_dir + json_conf.get("imputation_out_miss_filename"),
        "imputation_out_problem_file": output_dir + json_conf.get("imputation_out_problem_filename"),
        "factor_missing_data": json_conf.get("factor_missing_data", 0.01),
        "loci_map": json_conf.get("loci_map", {"A": 1, "B": 3, "C": 2, "DQB1": 4, "DRB1": 5}),
        "matrix_planb": json_conf.get("Plan_B_Matrix", [
            [[1, 2, 3, 4, 5]],
            [[1, 2, 3], [4, 5]],
            [[1], [2, 3], [4, 5]],
            [[1, 2, 3], [4], [5]],
            [[1], [2, 3], [4], [5]],
            [[1], [2], [3], [4], [5]],
        ]),
        "pops_count_file": project_dir_graph + json_conf.get("pops_count_file", ""),
        # Holds the configured path (truthy) or False when the key is absent.
        "use_pops_count_file": json_conf.get("pops_count_file", False),
        "number_of_options_threshold": json_conf.get("number_of_options_threshold", 100000),
        "max_haplotypes_number_in_phase": json_conf.get("max_haplotypes_number_in_phase", 100),
        "bin_imputation_input_file": project_dir_in_file + json_conf.get("bin_imputation_in_file", "None"),
        "nodes_for_plan_A": json_conf.get("Plan_A_Matrix", []),
        "save_mode": json_conf.get("save_space_mode", False),
        "UNK_priors": json_conf.get("UNK_priors", "MR"),
    }

    # Display the configurations we are using
    banner = '****************************************************************************************************'
    report = [
        ("Population", "pops"),
        ("Priority", "priority"),
        ("UNK priority", "UNK_priors"),
        ("Epsilon", "epsilon"),
        ("Plan B", "planb"),
        ("Number of Results", "number_of_results"),
        ("Number of Population Results", "number_of_pop_results"),
        ("Nodes File", "node_file"),
        # Previously the edges path was printed under the "Top Links File"
        # label and the top-links path was never shown.
        ("Top Links File", "top_links_file"),
        ("Edges File", "edges_file"),
        ("Input File", "imputation_input_file"),
        ("Output UMUG Format", "output_MUUG"),
        ("Output UMUG Freq Filename", "imputation_out_umug_freq_file"),
        ("Output UMUG Pops Filename", "imputation_out_umug_pops_file"),
        ("Output Haplotype Format", "output_haplotypes"),
        ("Output HAP Freq Filename", "imputation_out_hap_freq_file"),
        ("Output HAP Pops Filename", "imputation_out_hap_pops_file"),
        ("Output Miss Filename", "imputation_out_miss_file"),
        ("Output Problem Filename", "imputation_out_problem_file"),
        ("Factor Missing Data", "factor_missing_data"),
        ("Loci Map", "loci_map"),
        ("Plan B Matrix", "matrix_planb"),
        ("Pops Count File", "pops_count_file"),
        ("Use Pops Count File", "use_pops_count_file"),
        ("Number of Options Threshold", "number_of_options_threshold"),
        ("Max Number of haplotypes in phase", "max_haplotypes_number_in_phase"),
    ]
    print(banner)
    print("Performing imputation based on:")
    for label, key in report:
        print("\t{}: {}".format(label, config[key]))
    if config["nodes_for_plan_A"]:
        print("\tNodes in plan A: {}".format(config["nodes_for_plan_A"]))
    print("\tSave space mode: {}".format(config["save_mode"]))
    print(banner)

    # full_loci is the sorted, de-duplicated loci_map values joined as text.
    config["full_loci"] = "".join(sorted({str(val) for val in config["loci_map"].values()}))

    # Perform imputation
    graph = Graph(config)
    graph.build_graph(config["node_file"], config["top_links_file"], config["edges_file"])
    imputation = Imputation(graph, config)

    # Create output directory if it doesn't exist; parents=True so a nested
    # output path such as "results/run1" works as well.
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Write out the results from imputation
    imputation.impute_file(config)

# Profiler end
#pr.disable()
#pr.print_stats(sort="time")
12 changes: 6 additions & 6 deletions scripts/parallel-imputation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@
import pathlib
from multiprocessing.pool import Pool

from imputegl import Imputation
from grim.imputation.impute import Imputation
from imputegl.impute import write_best_prob, write_best_prob_genotype

from imputegl.networkx_graph import Graph
from grim.imputation.networkx_graph import Graph

# Profiler start
pr = cProfile.Profile()
Expand All @@ -22,7 +22,7 @@
"-c",
"--config",
required=False,
default="../minimal-configuration.json",
default="../minimal-configuration-script.json",
help="Configuration JSON file",
type=str,
)
Expand Down Expand Up @@ -56,9 +56,9 @@
"number_of_pop_results": json_conf.get("number_of_pop_results", 100),
"output_MUUG": json_conf.get("output_MUUG", True),
"output_haplotypes": json_conf.get("output_haplotypes", False),
"node_file": project_dir + json_conf.get("node_csv_file"),
"top_links_file": project_dir + json_conf.get("top_links_csv_file"),
"edges_file": project_dir + json_conf.get("edges_csv_file"),
"node_file": project_dir + json_conf.get("graph_files_path") + json_conf.get("node_csv_file"),
"top_links_file": project_dir + json_conf.get("graph_files_path") + json_conf.get("top_links_csv_file"),
"edges_file": project_dir + json_conf.get("graph_files_path") + json_conf.get("edges_csv_file"),
"imputation_input_file": project_dir + json_conf.get("imputation_in_file"),
"imputation_out_umug_freq_file": output_dir
+ json_conf.get("imputation_out_umug_freq_filename"),
Expand Down
14 changes: 8 additions & 6 deletions scripts/runfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import sys
import os

sys.path.insert(0, os.path.join(".."))

from grim.imputation.impute import Imputation
from grim.imputation.networkx_graph import Graph

Expand All @@ -18,7 +20,7 @@
"-c",
"--config",
required=False,
default="../conf/minimal-configuration.json",
default="../conf/minimal-configuration-script.json",
help="Configuration JSON file",
type=str,
)
Expand All @@ -28,6 +30,7 @@

# read the config file
output_dir = "output/"
project_dir = "../"

# Read configuration file and load properties
with open(configuration_file) as f:
Expand All @@ -42,11 +45,10 @@
"number_of_pop_results": json_conf.get("number_of_pop_results", 100),
"output_MUUG": json_conf.get("output_MUUG", True),
"output_haplotypes": json_conf.get("output_haplotypes", False),
"node_file": json_conf.get("graph_files_path") + json_conf.get("node_csv_file"),
"top_links_file": json_conf.get("graph_files_path")
+ json_conf.get("top_links_csv_file"),
"edges_file": json_conf.get("graph_files_path") + json_conf.get("edges_csv_file"),
"imputation_input_file": json_conf.get("imputation_in_file"),
"node_file": project_dir + json_conf.get("graph_files_path") + json_conf.get("node_csv_file"),
"top_links_file": project_dir + json_conf.get("graph_files_path") + json_conf.get("top_links_csv_file"),
"edges_file": project_dir + json_conf.get("graph_files_path") + json_conf.get("edges_csv_file"),
"imputation_input_file": project_dir + json_conf.get("imputation_in_file"),
"imputation_out_umug_freq_file": output_dir
+ json_conf.get("imputation_out_umug_freq_filename"),
"imputation_out_umug_pops_file": output_dir
Expand Down
9 changes: 5 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,11 @@
include=[
"grim",
"grim.imputation",
"grim.imputation.imputegl",
"grim.imputation.graph_generation",
"grim.validation",
"grim.conf",
"graph_generation",
"graph_generation.output",
"data",
"data.subjects",
"conf"
]
),
test_suite="tests",
Expand Down

0 comments on commit 6b5db7d

Please sign in to comment.