Skip to content

Commit

Permalink
Merge pull request #23 from ArnovanHilten/dev
Browse files Browse the repository at this point in the history
Topology Update
  • Loading branch information
ArnovanHilten authored Sep 28, 2020
2 parents 33408bd + 44a63d1 commit 94e134e
Show file tree
Hide file tree
Showing 6 changed files with 226 additions and 32 deletions.
32 changes: 29 additions & 3 deletions GenNet.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from GenNet_utils.Create_plots import plot
from GenNet_utils.Train_network import train_classification, train_regression
from GenNet_utils.Convert import convert
from GenNet_utils.Topology import topology


def main(args):
Expand All @@ -23,6 +24,9 @@ def main(args):
plot(args)
if args.mode == 'convert':
convert(args)
if args.mode == "topology":
topology(args)



if __name__ == '__main__':
Expand All @@ -37,7 +41,7 @@ def main(args):
parser_convert.add_argument('-variants', type=str,
help="Path to file with row numbers of variants to include, if none is "
"given all variants will be used", default=None)
parser_convert.add_argument("-o", "--out", type=str, required=True, help="path to save result folder")
parser_convert.add_argument("-o", "--out", type=str, default=os.getcwd() + '/processed_data/', help="path for saving the results, default ./processed_data")
parser_convert.add_argument('-ID', action='store_true', default=False,
help='Flag to convert minimac data to genotype per subject files first (default False)')

Expand Down Expand Up @@ -116,7 +120,29 @@ def main(args):
metavar="Layer_number:",
default=0
)

parser_topology = subparsers.add_parser("topology", help="Create standard topology files")
parser_topology.add_argument(
"type",
default='create_annovar_input', type=str,
choices=['create_annovar_input', 'create_gene_network'],
help="Create annovar input, create gene network topology from annovar output"
)
parser_topology.add_argument(
"path",
type=str,
help="Path to the input data. For create_annovar_input this is the folder containing hase: genotype, "
"probes and individuals "
)
parser_topology.add_argument(
'study_name',
type=str,
help='Study name used in Convert. Name of the files in the genotype individuals and probe folders'
)
parser_topology.add_argument(
"-out",
type=str,
help="Path. Where to save the result, default ./processed_data",
default=os.getcwd() + '/processed_data/'
)
args = parser.parse_args()

main(args)
25 changes: 22 additions & 3 deletions GenNet_utils/Convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,24 +201,43 @@ def transpose_genotype(args, hdf_name):
print("Completed", args.study_name)


def exclude_variants_probes(args):
    """Store the probes of the selected variants in ``<study_name>_selected.h5``.

    Reads the variant selection from the headerless CSV ``args.variants`` and the
    probes table from ``<args.outfolder>/probes/<args.study_name>.h5``, keeps the
    selected rows and writes them next to the original probes file under the
    key ``probes``.

    The output file is opened in write mode so that running the conversion
    again replaces an earlier selection instead of appending the same probes a
    second time.
    """
    probes_folder = args.outfolder + '/probes/'

    used_indices = pd.read_csv(args.variants, header=None)
    # NOTE(review): indexing the row index with the flattened file content looks
    # like it expects one boolean per variant - confirm against the file format.
    used_indices = used_indices.index.values[used_indices.values.flatten()]

    probes = pd.read_hdf(probes_folder + args.study_name + '.h5', mode="r")
    print("Probes shape", probes.shape)
    print("Selecting variants..")
    probes = probes.iloc[used_indices]
    print("Probes shape", probes.shape)

    # mode='w' truncates a file left behind by a previous run; append=True is kept
    # so the table is still created through the same code path as before.
    probes.to_hdf(probes_folder + args.study_name + '_selected.h5', key='probes', mode='w',
                  format='table', data_columns=True, append=True,
                  complib='zlib', complevel=9, min_itemsize=45)

def convert(args):
    """Run the conversion pipeline for one study.

    Steps, in order: HASE conversion (skipped when the output folder already
    holds the probes, genotype and individuals folders), merging, imputation,
    optional variant selection and finally transposing the genotype matrix.

    ``args.out`` and ``args.study_name`` may each be a single value or a list;
    for a list the first element is used. The resolved output folder is stored
    on ``args.outfolder`` and the resolved name back on ``args.study_name``.
    """
    # 1. hase
    args.outfolder = args.out[0] if isinstance(args.out, list) else args.out

    hase_folders = [args.outfolder + subfolder for subfolder in ('/probes/', '/genotype/', '/individuals/')]
    if all(os.path.exists(folder) for folder in hase_folders):
        print("The folders: probes, genotype and individuals already exist. Data seems already in HASE format. Delete "
              "the folders if the files are not converted properly. Continuing with the current files:")
    else:
        # Only convert when the HASE folders are missing; converting
        # unconditionally first would make the check above always succeed.
        hase_convert(args)

    # 2. converting multiple lists into single string
    if isinstance(args.study_name, list):
        args.study_name = args.study_name[0]

    merge_hdf5_hase(args)
    hdf5_name = impute_hase_hdf5(args)

    # The variant selection needs the file given by -variants, so both the
    # genotype and the probes are only filtered when it was provided.
    if args.variants is not None:
        hdf5_name = exclude_variants(args)
        exclude_variants_probes(args)

    transpose_genotype(args, hdf_name=hdf5_name)
14 changes: 9 additions & 5 deletions GenNet_utils/Create_network.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import tensorflow as tf
import tensorflow.keras as K
import scipy

import tables
tf.keras.backend.set_epsilon(0.0000001)
tf_version = tf.__version__ # ToDo use packaging.version
if tf_version <= '1.13.1':
Expand All @@ -38,7 +38,9 @@ def layer_block(model, mask, i):
columns = list(network_csv.columns.values)
network_csv = network_csv.sort_values(by=columns, ascending=True)

inputsize = len(network_csv)
h5file = tables.open_file(datapath + "genotype.h5", "r")
inputsize = h5file.root.data.shape[1]
h5file.close()

input_layer = K.Input((inputsize,), name='input_layer')
model = K.layers.Reshape(input_shape=(inputsize,), target_shape=(inputsize, 1))(input_layer)
Expand All @@ -47,9 +49,11 @@ def layer_block(model, mask, i):
network_csv2 = network_csv.drop_duplicates(columns[i])
matrix_ones = np.ones(len(network_csv2[[columns[i], columns[i + 1]]]), np.bool)
matrix_coord = (network_csv2[columns[i]].values, network_csv2[columns[i + 1]].values)
mask = scipy.sparse.coo_matrix(((matrix_ones), matrix_coord),
shape=(network_csv2[columns[i]].max() + 1,
network_csv2[columns[i + 1]].max() + 1))
if i == 0:
matrixshape = (inputsize, network_csv2[columns[i + 1]].max() + 1)
else:
matrixshape = (network_csv2[columns[i]].max() + 1, network_csv2[columns[i + 1]].max() + 1)
mask = scipy.sparse.coo_matrix(((matrix_ones), matrix_coord), shape = matrixshape)
masks.append(mask)
model = layer_block(model, mask, i)

Expand Down
64 changes: 43 additions & 21 deletions GenNet_utils/Create_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,6 @@
from GenNet_utils.Utility_functions import query_yes_no, get_paths


def plot(args):
    """Create the plot selected by ``args.type`` for the experiment ``args.ID``.

    Loads ``connection_weights.csv`` from the result path returned by
    ``get_paths`` and hands it to the matching plot function. An unknown type
    is reported on stdout and ends the program.
    """
    folder, resultpath = get_paths(args.ID)
    importance_csv = pd.read_csv(resultpath + "/connection_weights.csv", index_col=0)
    layer = args.layer_n

    plot_type = args.type
    if plot_type == "layer_weight":
        plot_layer_weight(resultpath, importance_csv, layer=layer, num_annotated=10)
        return
    if plot_type == "circos":
        cicos_plot(resultpath=resultpath, importance_csv=importance_csv, plot_weights=False, plot_arrows=True)
        return
    if plot_type == "raw_importance":
        manhattan_importance(resultpath=resultpath, importance_csv=importance_csv)
        return

    print("invalid type:", args.type)
    exit()


def cicos_plot(resultpath, importance_csv, plot_weights=True, plot_arrows=False):
print("in progress...")
colormap = ['#7dcfe2', '#4b78b5', 'darkgrey', 'dimgray'] * 1000
Expand Down Expand Up @@ -81,7 +66,11 @@ def plot_layer_weight(resultpath, importance_csv, layer=0, num_annotated=10):

plt.figure(figsize=(20, 10))
colormap = ['#7dcfe2', '#4b78b5', 'darkgrey', 'dimgray'] * 1000
color_end = np.sort(csv_file.groupby("node_layer_" + str(layer + 1))["node_layer_" + str(layer)].max().values)

if "chr" in csv_file.columns:
color_end = np.sort(csv_file.groupby("chr")["node_layer_" + str(layer)].max().values)
else:
color_end = np.sort(csv_file.groupby("node_layer_" + str(layer + 1))["node_layer_" + str(layer)].max().values)
color_end = np.insert(color_end, 0, 0)

csv_file = csv_file[["node_layer_" + str(layer), "node_layer_" + str(layer + 1), "weights_" + str(layer),
Expand Down Expand Up @@ -142,21 +131,38 @@ def plot_layer_weight(resultpath, importance_csv, layer=0, num_annotated=10):
def manhattan_importance(resultpath, importance_csv, num_annotated=10):
csv_file = importance_csv.copy()
plt.figure(figsize=(20, 10))
colormap = ['#7dcfe2', '#4b78b5', 'darkgrey', 'dimgray'] * 1000
color_end = np.sort(csv_file.groupby("node_layer_1")["node_layer_0"].max().values)
color_end = np.insert(color_end, 0, 0)

gene_middle = []

if "chr" in csv_file.columns:
color_end = np.sort(csv_file.groupby("chr")["node_layer_0"].max().values)
print('coloring per chromosome')
color_end = np.insert(color_end, 0, 0)
for i in range(len(color_end) - 1):
gene_middle.append((color_end[i] + color_end[i + 1]) / 2)
else:
color_end = np.sort(csv_file.groupby("node_layer_1")["node_layer_0"].max().values)
color_end = np.insert(color_end, 0, 0)
print("no chr information continuing by coloring per group in node_layer_1")

weights = abs(csv_file["raw_importance"])
weights = weights / max(weights)
x = np.arange(len(weights))

print(len(color_end), "color groups")
colormap = ['#7dcfe2', '#4b78b5', 'darkgrey', 'dimgray'] * len(color_end)

for i in range(len(color_end) - 1):
plt.scatter(x[color_end[i]:color_end[i + 1]], weights[color_end[i]:color_end[i + 1]], c=colormap[i])

plt.ylim(bottom=0, top=1.2)
plt.xlim(0, len(weights) + int(len(weights) / 100))
plt.title("Raw importance for each path", size=36)
plt.xlabel("Path", size=18)
plt.title("Raw Importance Manhattan", size=36)
if len(gene_middle) > 1:
plt.xticks(gene_middle, np.arange(len(gene_middle)) + 1, size=16)
plt.xlabel("Chromosome", size=18)
else:
plt.xlabel("Chromosome position", size=18)
plt.ylabel("Weights", size=18)

csv_file["pos"] = x
Expand All @@ -179,3 +185,19 @@ def manhattan_importance(resultpath, importance_csv, num_annotated=10):

plt.savefig(resultpath + "Path_importance.png", bbox_inches='tight', pad_inches=0)
plt.show()


def plot(args):
    """Create the plot selected by ``args.type`` for the experiment ``args.ID``.

    Loads ``connection_weights.csv`` from the result path returned by
    ``get_paths``, prints that path and hands the table to the matching plot
    function. An unknown type is reported on stdout and ends the program.
    """
    folder, resultpath = get_paths(args.ID)
    importance_csv = pd.read_csv(resultpath + "/connection_weights.csv", index_col=0)
    print(resultpath)
    layer = args.layer_n

    plot_type = args.type
    if plot_type == "layer_weight":
        plot_layer_weight(resultpath, importance_csv, layer=layer, num_annotated=10)
        return
    if plot_type == "circos":
        cicos_plot(resultpath=resultpath, importance_csv=importance_csv, plot_weights=False, plot_arrows=True)
        return
    if plot_type == "raw_importance":
        manhattan_importance(resultpath=resultpath, importance_csv=importance_csv)
        return

    print("invalid type:", args.type)
    exit()
119 changes: 119 additions & 0 deletions GenNet_utils/Topology.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import os

import numpy as np
import pandas as pd


def _read_hash_table(probes_folder, studyname):
    """Read the allele hash table of a study, preferring the gzipped file."""
    gz_path = probes_folder + studyname + '_hash_table.csv.gz'
    if os.path.exists(gz_path):
        return pd.read_csv(gz_path, compression="gzip", sep='\t')
    return pd.read_csv(probes_folder + studyname + '_hash_table.csv', sep='\t')


def _unhash_allele(probes, hashtable, column):
    """Replace the hashed values in ``column`` by the ``allele`` they map to."""
    # Work on a copy with the key stored under the column name so the hash
    # table itself stays untouched and can be reused for the next column.
    lookup = hashtable.assign(**{column: hashtable['keys']})
    unhashed = probes.merge(lookup, on=column, how="left")
    unhashed = unhashed.drop(columns=["keys", column])
    return unhashed.rename(columns={'allele': column})


def Create_Annovar_input(args):
    """Write the Annovar input file for a study and print how to run Annovar.

    Reads the probes from ``<args.path>/probes/`` (the ``_selected.h5`` file
    when it exists, otherwise ``<study_name>.h5``), translates the hashed
    ``allele1`` and ``allele2`` columns back through the study's hash table and
    writes the columns CHR, bp, bp (again), allele1, allele2 and the row index
    as a tab separated file without header to
    ``<args.out>/annovar_input_<study_name>.csv``.

    The output folder is created when it does not exist yet.
    """
    hasepath = args.path
    studyname = args.study_name
    savepath = args.out

    probes_folder = hasepath + '/probes/'
    selected_path = probes_folder + studyname + '_selected.h5'
    if os.path.exists(selected_path):
        probes = pd.read_hdf(selected_path, mode="r")
    else:
        probes = pd.read_hdf(probes_folder + studyname + '.h5', mode="r")
    print(probes.shape)

    hashtable = _read_hash_table(probes_folder, studyname)
    unhashed_probes = probes
    for allele_column in ("allele1", "allele2"):
        unhashed_probes = _unhash_allele(unhashed_probes, hashtable, allele_column)

    # clean
    annovar_input = unhashed_probes.drop(columns=["ID", "distance"])
    annovar_input["bp2"] = annovar_input["bp"]
    annovar_input["index_col"] = annovar_input.index
    annovar_input = annovar_input[['CHR', 'bp', "bp2", "allele1", "allele2", "index_col"]]

    print('Number of variants', annovar_input.shape)

    # The default output folder is not guaranteed to exist; to_csv does not create it.
    os.makedirs(savepath, exist_ok=True)
    annovar_input_path = savepath + '/annovar_input_' + studyname + '.csv'
    annovar_input.to_csv(annovar_input_path, sep="\t", index=False, header=False)

    print('\n')
    print('Annovar input files ready \n')
    print("Install annovar: https://doc-openbio.readthedocs.io/projects/annovar/en/latest/user-guide/download/")
    print("Navigate to annovar, e.g cd /home/charlesdarwin/annovar/")
    print("Update annovar:\n perl annotate_variation.pl -buildver hg19 -downdb -webfrom annovar refGene humandb/")
    print("Run:\n perl annotate_variation.pl -geneanno -dbtype refGene -buildver hg19 " + str(
        savepath) + "/annovar_input_" + str(studyname) + ".csv humandb --outfile " + str(savepath) + "/" + str(
        studyname) + "_RefGene")
    print('\n')
    print(
        'After obtaining the Annovar annotations, run topology create_gene_network to get the topology file for the SNPs-gene-output network:')


def Create_gene_network_topology(args):
    """Build the SNP-to-gene topology file from an Annovar gene annotation.

    Reads ``<args.path>/<study_name>_RefGene.variant_function`` (tab separated,
    no header), reduces every gene annotation to its first gene name, drops
    variants annotated as ``NONE`` and rows with missing values, numbers the
    genes in order of chromosome and start position, and writes two files to
    ``args.out``: ``gene_network_description.csv`` (the full annotation) and
    ``topology.csv`` (columns chr, layer0_node, layer0_name, layer1_node,
    layer1_name).

    The output folder is created when it does not exist yet.
    """
    datapath = args.path + '/'
    studyname = args.study_name
    savepath = args.out + '/'

    print(args.study_name)

    gene_annotation = pd.read_csv(datapath + str(studyname) + "_RefGene.variant_function", sep='\t', header=None)
    gene_annotation.columns = ['into/exonic', 'gene', 'chr', 'bps', 'bpe', "mutation1", "mutation2", 'index_col']

    # Strip, in this order: everything from the first comma, a parenthesised
    # suffix, an unclosed parenthesis and everything from the first semicolon.
    # regex=True is required: newer pandas treats the pattern as a literal by
    # default, which would silently leave the gene names untouched.
    gene_names = gene_annotation['gene']
    for pattern in (r"\,.*", r"\(.*\)", r"\(.*", r"\;.*"):
        gene_names = gene_names.str.replace(pattern, "", regex=True)
    gene_annotation['gene'] = gene_names

    gene_annotation = gene_annotation[gene_annotation['gene'] != "NONE"]
    gene_annotation = gene_annotation.dropna()

    # One row per gene, numbered by chromosome and then by start position.
    gene_list = gene_annotation.drop_duplicates("gene")
    gene_list = gene_list.sort_values(by=["chr", "bps"], ascending=[True, True])
    gene_list = gene_list.assign(gene_id=np.arange(len(gene_list)))[["gene", "gene_id"]]

    gene_annotation = gene_annotation.merge(gene_list, on="gene")
    gene_annotation = gene_annotation.sort_values(by="index_col", ascending=True)

    gene_annotation = gene_annotation.assign(
        chrbp='chr' + gene_annotation.chr.astype(str) + ':' + gene_annotation.bps.astype(str))

    # The default output folder is not guaranteed to exist; to_csv does not create it.
    os.makedirs(savepath, exist_ok=True)
    gene_annotation.to_csv(savepath + "/gene_network_description.csv")

    # Named network_topology so the module-level topology() function is not shadowed.
    network_topology = gene_annotation[["chr", "index_col", "chrbp", "gene_id", "gene"]]
    print(network_topology['index_col'].max())
    network_topology.columns = ['chr', 'layer0_node', 'layer0_name', 'layer1_node', 'layer1_name']

    network_topology.to_csv(savepath + "/topology.csv")

    print('Topology file saved:', savepath + "/topology.csv")


def topology(args):
    """Run the topology step named by ``args.type``.

    ``create_annovar_input`` writes the Annovar input file and
    ``create_gene_network`` builds the gene network topology. Any other value
    is reported on stdout and ends the program.
    """
    requested = args.type
    if requested not in ('create_annovar_input', 'create_gene_network'):
        print("invalid type:", args.type)
        exit()

    if requested == 'create_annovar_input':
        Create_Annovar_input(args)
    else:
        Create_gene_network_topology(args)
4 changes: 4 additions & 0 deletions GenNet_utils/Utility_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,10 @@ def create_importance_csv(datapath, model, masks):
coordinate_list = []
for i, mask in zip(np.arange(len(masks)), masks):
coordinates = pd.DataFrame([])

if (i == 0):
if 'chr' in network_csv.columns:
coordinates["chr"] = network_csv["chr"]
coordinates["node_layer_" + str(i)] = mask.row
coordinates["node_layer_" + str(i + 1)] = mask.col
coordinates = coordinates.sort_values("node_layer_" + str(i), ascending=True)
Expand Down

0 comments on commit 94e134e

Please sign in to comment.