forked from fjruizruano/ngs-protocols
-
Notifications
You must be signed in to change notification settings - Fork 0
/
rm_getseq_annot.py
executable file
·62 lines (53 loc) · 1.42 KB
/
rm_getseq_annot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/python
import sys
from Bio import SeqIO
from Bio.Seq import Seq
print "Usage: rm_getseq.py FastaFile RepeatMaskerOut [LenMinimum]"
try:
fafile = sys.argv[1]
except:
fafile = raw_input("Introduce FASTA file: ")
try:
rmfile = sys.argv[2]
except:
rmfile = raw_input("Introduce RepeatMasker out file: ")
try:
lenlimit = int(sys.argv[3])
except:
lenlimit = 0
dict_seq = {}
seqs = SeqIO.parse(fafile,"fasta")
for s in seqs:
dict_seq[str(s.id)] = str(s.seq)
rmout = open(rmfile).readlines()
out = open(rmfile+".fas", "w")
for line in rmout[3:]:
line = line.replace("(","")
line = line.replace(")","")
info = line.split()
name = info[4]
begin_q = int(info[5])
end_q = int(info[6])
sense = info[8]
id = info[9]
begin_r = int(info[11])
end_r = int(info[12])
left_r = int(info[13])
try:
double = info[15]
except:
double = ""
secu = ""
if sense == "+" and double == "":
len_rep = end_r-begin_r+1
if len_rep >= lenlimit:
secu = dict_seq[name][begin_q-1:end_q]
out.write(">%s_%s\n%s\n" % (id,name,secu))
elif sense == "C" and double == "":
len_rep = end_r-left_r+1
if len_rep >= lenlimit:
secu = dict_seq[name][begin_q-1:end_q]
secu = Seq(secu)
secu_inv = secu.reverse_complement()
out.write(">%s_%s\n%s\n" % (id,name,secu_inv))
out.close()