forked from Paramecium-aurelia/20170214-Gene-Deletion-WGD
-
Notifications
You must be signed in to change notification settings - Fork 0
/
withoutlogHitsOriginal.py
executable file
·69 lines (59 loc) · 2.34 KB
/
withoutlogHitsOriginal.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from __future__ import division
import re, sys, math, operator, copy,collections
import numpy as np
def ProcessCLI(args):
#scaf_pb=readgff(args[1])
bestHits= readpsl(args[1])
for k in bestHits:
print ' \t'.join([str(i) for i in list(k)])
#Read in and update the psl output and find hits
def readpsl(pslout):
bhits=[]
hits=collections.OrderedDict()
phits=collections.OrderDict()
with open(pslout,'r') as file:
for line in file:
line=line.rstrip().split()
## Here are the filtering conditions: pident has to be than 50% and the minimum common subsequence overlap of the querry and subject >=40%
## These criteria could be tuned and changed as desired
if float(line[4])>50.0 and max([ int(i) for i in re.split(r'(\d+)',line[-1]) if len(i)>0 and i.isdigit()])/min(float(line[5]),float(line[6]))>=0.40:
if line[0]!=line[2]:
if line[1] not in hits.keys():
hits[line[1]]=[line[3]]
else:
hits[line[1]].append(line[3])
# print hits['scaffold_0001']
# return
for key in hits:
#hits[key]=dict([a,x] for a, x in collections.Counter(hits[key]).iteritems())
hits[key]=dict([a,-math.log(float(x)/sum(collections.Counter(hits[key]).values()))] for a, x in collections.Counter(hits[key]).iteritems\
())
# print hits['scaffold_0269']
for k1 in hits:
for k2 in hits[k1]:
score1=0
score2=0
try:
score1=hits[k1][k2]
except:
score1=10
#print k1,k2,score1
try:
score2=hits[k2][k1]
except:
score2=10
score=score1+score2
bhits.append((k1,k2,score))
return sorted(bhits, key=lambda tup: (tup[2]) )#bhits
#Read he gff3 files and create a dictionary: scaffolds: list of proteins
def readgff(f):
filename = open(f, "r")
scaffolds={}
pbi={}
for line in filename:
line=line.strip().split()
if line[0][:8]=='scaffold' and line[2]=='mRNA':
pbi[re.split(r'[=;\s]',line[8])[1]]=line[0]
return pbi
if __name__ == '__main__':
ProcessCLI(sys.argv)