-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathgenerate_fragments.py
executable file
·117 lines (95 loc) · 3.94 KB
/
generate_fragments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env python2
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import sys, csv, StringIO, random, decimal, argparse
def printf(format, *args):
sys.stdout.write(format % args)
parser = argparse.ArgumentParser(description='Generate variable-length single metagenomic fragments.')
parser.add_argument('-r', metavar='<reference_fasta>', dest="ref", help="Multi-FASTA file containing genomic sequences from which reads will be sampled.")
parser.add_argument('-a', metavar='<abundance_file>', dest="abund", help="Tab-delimited abundance file with an abundance value for each corre- sponding genome sequence in <reference fasta>")
parser.add_argument('-o', metavar='<output_file>', dest="output", help="Name for output file containing simulated uniform-length reads")
parser.add_argument('-t', metavar='<total_reads>', type=int, dest="total", help="The total number of fragments to sample from all genomes")
parser.add_argument('-i', metavar='<insert_mean_length>', type=int, dest="insert", default="0", help="Average length of inserts.")
parser.add_argument('-s', metavar='<insert_stddev>', type=int, dest="stddev", default="0", help="Standard deviation of insert length" )
parser.add_argument('-d', '--direction', action='store_true', dest="direction", help="Use this switch to generate reads in both forward and reverse orientations" )
args = parser.parse_args()
#Reference metagenome database file (FASTA)
f1 = open(args.ref);
#abundance file (tab-delimited .txt)
f2 = open(args.abund);
total_reads = args.total
insert_avg = args.insert
insert_stddev = args.stddev
if(insert_avg):
f4 = open(args.output + '.1.fasta', 'w')
# f5 = open(args.output + '.2.fasta', 'w')
else:
f4 = open(args.output, 'w')
frags=[]
div_file = csv.reader(f2, delimiter='\t')
species=[]
diversity=[]
lengths=[]
freqs=[]
comp = {'A':'T', 'T':'A', 'G':'C', 'C':'G', 'N':'N', 'W':'W', 'R':'Y', 'Y':'R', 'S':'S', 'K':'M', 'M':'K', 'B':'V', 'V':'B', 'D':'H', 'H':'D'}
for row in div_file:
species.append(row[0][1:])
diversity.append(decimal.Decimal(row[1]))
for i in SeqIO.parse(f1, 'fasta') :
genome_num=0
while(not(species[genome_num] in i.description)) :
genome_num+=1
if(species[genome_num] in i.description) :
coverage=max(1, int((decimal.Decimal(diversity[genome_num])*total_reads)))
limit=len(i.seq)
for j in range(0, coverage) :
rand = random.random()
rand_length = 0
numLen = len(lengths)-1
if( (insert_avg != 0) & (insert_stddev != 0)):
cur_insert = int(random.gauss(insert_avg, insert_stddev))
if(limit > cur_insert):
start1 = random.randint(0, limit-cur_insert)
end1 = start1 + cur_insert
else:
start1 = 0
end1 = limit
read1 = i.seq[start1:end1]
if(args.direction):
check = random.random()
if(check < 0.5): #forward orientation
f4.write(">%s\n" % i.description)
f4.write("%s\n" % read1)
else: #reverse orientation
f4.write(">%s\n" % i.description)
f4.write("%s\n" % read1[::-1])
if(args.direction and random.random() < 0.5):
#reverse orientation
f4.write(">%s\n" % i.description)
f4.write("%s\n" % read1[::-1])
else:
#forward orientation
f4.write(">%s\n" % i.description)
f4.write("%s\n" % read1)
else:
if(limit > cur_insert) :
start=random.randint(0, limit-max_read_length)
end=start+max_read_length
else:
start=0
end=limit
read = i.seq[start:end]
if(args.direction and random.random() < 0.5):
#reverse orientation
read = ''.join([comp[b] for b in i.seq[start:end][::-1]])
f4.write(">%s\n" % i.description)
f4.write("%s\n" % read)
else:
#forward orientation
f4.write(">%s\n" % i.description)
f4.write("%s\n" % read)
if (genome_num >= len(species) ) :
break;
f1.close()
f2.close()
f4.close()