-
Notifications
You must be signed in to change notification settings - Fork 0
/
rearrange_unique_qseq.py
executable file
·90 lines (75 loc) · 2.14 KB
/
rearrange_unique_qseq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#coding: utf-8
# **author: zhongjie wang**
#对unique后的marker基因blast结果,按照每行一个基因组,每列一个marker的格式进行重排
#用法:python rearrange_unique_qseq.py unique_relation.tsv rearrange_qseq.tsv rearrange_num.tsv final_good_qseq.txt
import sys
from collections import defaultdict
f1 = open(sys.argv[1], 'r')
f2 = open(sys.argv[2], 'w')
f3 = open(sys.argv[3], 'w')
f4 = open(sys.argv[4], 'w')
hitlist=f1.readlines()
binlist,markerlist=[],[]
mydict = defaultdict(dict)
for hit in hitlist:
qseq,marker,piden=hit.split('\t')[:3]
if 'gi|' in qseq:
genome=qseq.split('_')[0].split('|')[-2][:4]
if mydict[marker].get(genome):
mydict[marker][genome].append(';;;'.join([qseq,piden]))
else:
mydict[marker][genome]=[]
mydict[marker][genome].append(';;;'.join([qseq,piden]))
if genome not in binlist:
binlist.append(genome)
else:
genome=qseq.split('_')[0]
if mydict[marker].get(genome):
mydict[marker][genome].append(';;;'.join([qseq,piden]))
else:
mydict[marker][genome]=[]
mydict[marker][genome].append(';;;'.join([qseq,piden]))
if genome not in binlist:
binlist.append(genome)
if marker not in markerlist:
markerlist.append(marker)
markerlist.sort()
for marker in markerlist:
f2.write('\t'+marker)
f3.write('\t'+marker)
f2.write('\n')
f3.write('\n')
for genome in binlist:
print genome
f2.write(genome+'\t')
f3.write(genome+'\t')
for marker in markerlist:
if genome in mydict[marker]:
num=len(mydict[marker][genome])
seqs=mydict[marker][genome]
for seq in seqs:
qseq=seq.split(';;;')[0]
f2.write(str(seq)+',')
f2.write('\t')
f3.write(str(num)+'\t')
else:
f2.write('\t')
f3.write('0\t')
f2.write('\n')
f3.write('\n')
for marker in mydict:
for genome in mydict[marker]:
num=len(mydict[marker][genome])
seqs=mydict[marker][genome]
max_qseq,max_piden=mydict[marker][genome][0].split(';;;')[:]
if num>1:
for seq in seqs:
qseq,piden=seq.split(';;;')[:]
if float(piden)>float(max_piden):
max_piden,max_qseq=piden,qseq
f4.write(max_qseq+'\t'+marker+'\n')
else:
f4.write(max_qseq+'\t'+marker+'\n')
f1.close()
f2.close()
f3.close()