forked from emreg00/toolbox
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_cmap.py
114 lines (109 loc) · 3.88 KB
/
parse_cmap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import numpy
def main(base_dir="/home/emre/arastirma/data/drug/cmap"):
    """Parse the CMap files found under *base_dir*.

    Args:
        base_dir: Directory holding ``cmap_instances_02.csv``,
            ``rankMatrix.txt`` and ``probe_mapping.txt``. Defaults to the
            location that used to be hard-coded here, so ``main()`` with no
            arguments reads the same files as before.

    Returns:
        Whatever ``get_cmap_info`` returns for those three files (called with
        its default ``n_top``). Earlier versions computed this value and then
        discarded it.
    """
    desc_file = base_dir + "/cmap_instances_02.csv"
    matrix_file = base_dir + "/rankMatrix.txt"
    probe_mapping_file = base_dir + "/probe_mapping.txt"
    return get_cmap_info(desc_file, probe_mapping_file, matrix_file)
def _read_instances(desc_file):
    """Read the instance description table into ``{instance_id: values}``."""
    instance_to_values = {}
    with open(desc_file) as f:
        header = f.readline().strip().split("\t")
        # Column names are matched case-insensitively.
        header_to_idx = dict((name.lower(), i) for i, name in enumerate(header))
        for line in f:
            words = line.strip().split("\t")
            instance = words[header_to_idx["instance_id"]]
            instance_to_values[instance] = (
                words[header_to_idx["cmap_name"]].lower(),
                float(words[header_to_idx["concentration (m)"]]),
                float(words[header_to_idx["duration (h)"]]),
                words[header_to_idx["cell2"]],
            )
    return instance_to_values


def _read_probe_mapping(probe_mapping_file):
    """Read the two-column probe table into ``{probe: geneid}``."""
    probe_to_geneid = {}
    with open(probe_mapping_file) as f:
        f.readline()  # the first line is a header and is skipped
        for line in f:
            probe, geneid = line.strip().split("\t")
            probe_to_geneid[probe] = geneid
    return probe_to_geneid


def _read_rank_matrix(matrix_file, probe_to_geneid):
    """Read the rank matrix; return ``(geneid array, {instance: [ranks]})``."""
    geneids = []
    instance_to_ranks = {}
    with open(matrix_file) as f:
        # The first header cell labels the probe column; the remaining cells
        # are instance ids.
        # NOTE(review): the header is stripped before splitting, so a header
        # whose first cell is empty would shift the instance ids by one
        # column -- confirm the file labels its first column.
        instances = f.readline().strip().split("\t")[1:]
        for line in f:
            words = line.strip().split("\t")
            probe = words[0]
            # Probes without a mapping are kept under their own probe id.
            geneids.append(probe_to_geneid.get(probe, probe))
            for rank, instance in zip(words[1:], instances):
                instance_to_ranks.setdefault(instance, []).append(int(rank))
    return numpy.array(geneids), instance_to_ranks


def get_cmap_info(desc_file, probe_mapping_file, matrix_file, n_top=250):
    """Collect the top-ranked gene ids of every instance, grouped by drug.

    Args:
        desc_file: Tab-separated instance table with a header row. The
            columns ``cmap_name``, ``instance_id``, ``concentration (M)``,
            ``duration (h)`` and ``cell2`` are used (header names are
            compared lower-cased).
        probe_mapping_file: Tab-separated ``probe<TAB>geneid`` table whose
            first line is a header.
        matrix_file: Tab-separated matrix whose header row lists instance
            ids after the first column, and whose rows hold a probe id
            followed by one integer rank per instance.
        n_top: Number of gene ids to keep from each end of the ranking.

    Returns:
        ``{drug: {(concentration, duration, cell_type): (up, down)}}`` where
        ``up`` lists the gene ids with the ``n_top`` smallest rank values in
        ascending rank order, and ``down`` lists the gene ids with the
        ``n_top`` largest rank values, largest first. Drug names are
        lower-cased. If two instances of one drug share the same parameters,
        the one processed last wins.

    Raises:
        KeyError: If a required column is missing from *desc_file*, or the
            matrix references an instance id absent from *desc_file*.
        ValueError: If a concentration, duration or rank cannot be parsed, or
            a probe mapping line does not have exactly two fields.
    """
    instance_to_values = _read_instances(desc_file)
    probe_to_geneid = _read_probe_mapping(probe_mapping_file)
    geneids, instance_to_ranks = _read_rank_matrix(matrix_file, probe_to_geneid)

    drug_to_parameters_to_top_geneids = {}
    # .items() rather than the Python-2-only .iteritems(), so this runs on
    # both interpreter lines.
    for instance, ranks in instance_to_ranks.items():
        ordered = geneids[numpy.argsort(ranks)]
        up_geneids = list(ordered[:n_top])
        # Reverse first, then truncate: slicing with [-n_top:] would return
        # the whole array when n_top is 0.
        down_geneids = list(ordered[::-1][:n_top])
        drug, concentration, duration, cell_type = instance_to_values[instance]
        parameters = (concentration, duration, cell_type)
        drug_to_parameters_to_top_geneids.setdefault(drug, {})[parameters] = (
            up_geneids,
            down_geneids,
        )
    return drug_to_parameters_to_top_geneids
# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
    main()