forked from Paramecium-aurelia/20170214-Gene-Deletion-WGD
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pipepjfpid_ss5040_rr5040.py
221 lines (217 loc) · 8.39 KB
/
pipepjfpid_ss5040_rr5040.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
from __future__ import division
import re, sys, math, itertools, string, subprocess, operator, copy,collections
from itertools import chain
import numpy as np
from itertools import groupby
def ProcessCLI(args):
pslout=createpsl(args[1])
pbidict=readgff(args[2])[1]
enout=includeScaffTopsl(pslout,pbidict,args[3])
sssd=getSSS(enout,pbidict)[1]
anotssd=sssCheck(args[4],pbidict)
printSSS(anotssd,'ohnologs5040')
printSSS(sssd,'biscaffolds5040')
candidates=findCandidates(get_rbsssh(enout,pbidict,sssd))
bestpairs,bpL=getbesthits(candidates)
rbbh,oneways=getrbbsh(bestpairs)
hits=rbbh+oneways
printHits(hits,'bihits5040')
def printSSS(d,out):
o_file=open(out,'w')
for k in d.keys():
if type(d[k])is dict:
for k2 in d[k].keys():
o_file.write(k +'\t'+k2+'\t'+str(d[k][k2])+'\n')
else:
o_file.write(k +'\t'+str(d[k])+'\n')
o_file.close()
def printHits(hits,out):
o_file=open(out,'w')
for h in hits:
o_file.write(h +'\n')
o_file.close()
def getrbbsh(bestpairs):
rbbsh=[]
oneways=[]
for t in bestpairs.keys():
if (t[1],t[0]) in bestpairs.keys():
rbbsh.append(' '.join([str(i) for i in list(list(t)+list(bestpairs[t]))]))
else:
oneways.append(' '.join([str(i) for i in list(list(t)+list(bestpairs[t]))]))
return rbbsh,oneways
def getbesthits(candidates):
bestpairs=[]
bpairs={}
for k in candidates:
bestsss=200.0
bestpair=None
k2=None
bestbit=0.0
for v in candidates[k].keys():
if float(candidates[k][v][2])<=bestsss and float(candidates[k][v][0])>bestbit:
bestbit=float(candidates[k][v][0])
bestsss=float(candidates[k][v][2])
k2=v
bestpairs.append((k,k2,candidates[k][k2]))
bpairs[(k,k2)]=candidates[k][k2]
return bpairs, bestpairs
## Get candidates for each protein
def findCandidates(rbsssh):
candidates={}
for p1 in rbsssh.keys():
minval=min([float(rbsssh[p1][p2][1]) for p2 in rbsssh[p1].keys()])
uplimit=float(minval*10**15)
check=0
for p2 in rbsssh[p1].keys():
if float(rbsssh[p1][p2][1])<=uplimit:
if p1 not in candidates.keys():
candidates[p1]={}
candidates[p1][p2]=rbsssh[p1][p2]
else:
candidates[p1][p2]=rbsssh[p1][p2]
return candidates
## Get Protein(key1):key2:(bscore,eval,sss)
def get_rbsssh(en_pslout,pbidict,sssd2):
ppevsssd=collections.OrderedDict()
for line in en_pslout:
line=line.rstrip().split()
jfident=(float(line[-4])/(min(float(line[5]),float(line[6]))))
jfidentl=(float(line[-5])/(min(float(line[5]),float(line[6]))))
ov=max([ int(i) for i in re.split(r'(\d+)',line[-1]) if len(i)>0 and i.isdigit()])
if jfidentl>0.5 and float(line[4])>40.0:
if line[0]!=line[2] and line[1]!=line[3]:
sc=findSSS(pbidict[line[0]],pbidict[line[2]],sssd2)
if line[0] not in ppevsssd.keys():
ppevsssd[line[0]]={}
ppevsssd[line[0]][line[2]]=(float(line[-3]),float(line[-2]),sc)
elif line[0] in ppevsssd.keys() and line[2] not in ppevsssd[line[0]].keys():
ppevsssd[line[0]][line[2]]=(float(line[-3]),float(line[-2]),sc)
elif float(ppevsssd[line[0]][line[2]][0])> float(line[-3]):
continue
else:
ppevsssd[line[0]][line[2]]=(float(line[-3]),float(line[-2]),sc)
return ppevsssd
## Get ScaScaSc (SSS)::(sca_sca_list,Sca1(key):list of (sca1,sca2,sss),sssList)
def getSSS(en_pslout,pbidict):
sssd=collections.OrderedDict()
sca_List_sca=collections.OrderedDict()
sssList=[]
for line in en_pslout:
line=line.rstrip().split()
jfident=(float(line[-4])/(min(float(line[5]),float(line[6]))))
jfidentl=(float(line[-5])/(min(float(line[5]),float(line[6]))))
ov=max([ int(i) for i in re.split(r'(\d+)',line[-1]) if len(i)>0 and i.isdigit()])
if jfidentl>0.5 and float(line[4])>40.0:
if line[0]!=line[2] and line[1]!=line[3]:
if line[1] not in sca_List_sca.keys():
sca_List_sca[line[1]]=[line[3]]
else:
sca_List_sca[line[1]].append(line[3])
#Find the scaffold_scaffold score pairs
for key in sca_List_sca:
kval=collections.Counter(sca_List_sca[key])
sca_List_sca[key]=dict([a,-math.log(float(x)/sum(kval.values()))] for a, x in kval.iteritems())
for k1 in sca_List_sca.keys():
sssd[k1]=[]
for k2 in sca_List_sca[k1].keys():
score1=0
score2=0
try:
score1=sca_List_sca[k1][k2]
except:
score1=100
try:
score2=sca_List_sca[k2][k1]
except:
score2=100
score=score1+score2
sssd[k1].append((k1,k2,score))
sssList.append((k1,k2,score))
return sca_List_sca, makeDict(sortDictTu(sssd,2)),sortDictTu(sssList,2)
## Run command line
def subprocess_cmd(command):
process = subprocess.Popen(command,stdout=subprocess.PIPE, shell=True)
proc_stdout = process.communicate()[0].strip()
return proc_stdout
def sortDictTu(indic,n):
if type(indic)==dict:
for k in indic:
indic[k]=sorted(indic[k],key=lambda tup:(tup[n]))
elif type(indic)==list:
indic=sorted(indic,key=lambda tup:(tup[n]))
return indic
def makeDict(sssd):
newd={}
for k in sssd.keys():
newd[k]={}
for k2 in sssd[k]:
newd[k][k2[1]]=k2[2]
return newd
def findSSS(s1,s2,sssd2):
try:
x=float(sssd2[s1][s2])
y=float(sssd2[s2][s1])
return y
except KeyError:
return 200.0
def getInt(pb):
return [int(i) for i in re.split(r'(\d+)',pb) if len(i) >0 and i.isdigit()][0]
## Get enriched output : scaffolds included
def includeScaffTopsl(pslout,pbidict,en_output):
outpsl=open(en_output, 'w')
output=[]
pslout=pslout.split('\n')
for line in pslout:
line=line.rstrip().split()
outpsl.write(' '.join([line[0],pbidict[line[0]],line[1],pbidict[line[1]],' \t'.join(line[2\
:])])+'\n')
output.append(' '.join([line[0],pbidict[line[0]],line[1],pbidict[line[1]],' \t'.join(line[2:])]))
outpsl.close()
return output
### run blast and get blast output
def createpsl(infile):
out=' -outfmt \"6 qseqid sseqid pident qlen slen qstart qend sstart send length nident bitscore evalue btop\"'
pslout=subprocess_cmd('blastp -query '+ infile + ' -subject '+ infile + out)
return pslout
### Read the gff3 files and create a dictionary -->scaffolds:list of proteins and pbi:scaffold, the scaff details info
def readgff(gff):
filename = open(gff, "r")
scaffolds=collections.OrderedDict()
pbi=collections.OrderedDict()
scaff=collections.OrderedDict()
for line in filename:
line=line.strip().split()
if line[0][:8]=='scaffold' and line[2]=='mRNA':
pbi[re.split(r'[=;\s]',line[8])[1]]=line[0]
if line[0] not in scaffolds:
scaffolds[line[0]]=[re.split(r'[=;\s]',line[8])[1]]
else:
scaffolds[line[0]].append(re.split(r'[=;\s]',line[8])[1])
if line[0] not in scaff.keys():
scaff[line[0]]=[]
else:
scaff[line[0]]=[str(0),line[4]]
return scaffolds, pbi, scaff
def sssCheck(infile,pbi):
sca_List_sca=collections.OrderedDict()
sssd={}
sssList=[]
filename = open(infile,"r")
for line in filename:
if len(line)>0 and 'hno' not in line and "." not in line:
line=line.strip().split()
if pbi[GtoT(line[0])] not in sca_List_sca.keys():
sca_List_sca[pbi[GtoT(line[0])]]=[pbi[GtoT(line[1])]]
else:
sca_List_sca[pbi[GtoT(line[0])]].append(pbi[GtoT(line[1])])
for key in sca_List_sca:
kval=collections.Counter(sca_List_sca[key])
sca_List_sca[key]=dict([a,-math.log(float(x)/sum(kval.values()))] for a, x in kval.iteritems())
return sca_List_sca
def GtoT(pbid):
pbid=list(pbid)
if len(pbid)>5 and pbid[5]=='G':
pbid[5]='T'
return ''.join(pbid)
if __name__ == '__main__':
ProcessCLI(sys.argv)