-
Notifications
You must be signed in to change notification settings - Fork 8
/
extract-and-annotate-changed.py
95 lines (73 loc) · 2.66 KB
/
extract-and-annotate-changed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#! /usr/bin/env python
import screed
import argparse
import csv
P=0.05
def main():
parser = argparse.ArgumentParser()
parser.add_argument('changed_file')
parser.add_argument('transcript_file')
parser.add_argument('output')
args = parser.parse_args()
print 'Loading changed list from:', args.changed_file
tr_dict = {}
fp = open(args.changed_file, 'rb')
r = csv.DictReader(fp, delimiter='\t',
fieldnames=['tr', 'p_equal', 'p_diff', 'postfc', 'realfc'])
r.next()
for row in r:
tr = row['tr']
tr_dict[tr] = row
print 'Loading transcript matrix from:', args.transcript_file
annots = {}
for record in screed.open(args.transcript_file):
tr = record.name.split('.')[2]
assert tr.startswith('tr')
# find the location of the x_of_y string; before that is the
# annotation.
annot_loc = record.description.find('_of_')
assert annot_loc > 0
while annot_loc > 0 and record.description[annot_loc] != ' ':
annot_loc -= 1
assert annot_loc >= 0, annot_loc
# if we got anything, record it.
if annot_loc > 0:
annot = record.description[:annot_loc].strip()
# eliminate 'transcript family' annotations since the
# original annotations will be gathered.
if annot.startswith('transcript family'):
continue
x = annots.get(tr, [])
x.append(annot)
annots[tr] = x
def sort_by_ortho(a, b):
if a.startswith('ortho'):
return -1
return 0
for n, k in enumerate(annots):
annot = sorted(set(annots[k]), cmp=sort_by_ortho)
annots[k] = "; ".join(annot)
###
diff_tr = []
for tr in tr_dict:
row = tr_dict[tr]
p_equal = float(row['p_equal'])
if p_equal > P: # ignore!
continue
postfc = float(row['postfc'])
realfc = float(row['realfc'])
diff_tr.append((p_equal, postfc, realfc, tr))
diff_tr.sort()
###
print '%d of %d total genes diff expressed at P > %f' % (len(diff_tr),
len(tr_dict),
1-P)
print 'Writing differentially expressed genes list to', args.output
outfp = open(args.output, 'w')
w = csv.writer(outfp)
w.writerow(['p diff expr', 'PostFC', 'RealFC', 'transcript family', 'annotation'])
for p_equal, postfc, realfc, tr in diff_tr:
w.writerow([1.0-p_equal, postfc, realfc, tr, annots.get(tr, '')])
outfp.close()
if __name__ == '__main__':
main()