#!/usr/bin/env python2
"""
EMIRGE: Expectation-Maximization Iterative Reconstruction of Genes from the Environment
Copyright (C) 2010-2012 Christopher S. Miller ([email protected])
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>
https://github.com/csmiller/EMIRGE
for help, type:
python emirge_amplicon.py --help
"""
USAGE = \
"""usage: %prog DIR <required_parameters> [options]
This version of EMIRGE (%prog) attempts to reconstruct rRNA SSU genes
from Illumina amplicon data. It can handle up to a few million rRNA
reads at a time.
DIR is the working directory to process data in.
Use --help to see a list of required and optional arguments
Additional information:
https://groups.google.com/group/emirge-users
https://github.com/csmiller/EMIRGE/wiki
A manuscript describing the use of EMIRGE on amplicon data has been
submitted. For now, if you use this version of EMIRGE in your work,
please cite:
Miller, C.S., B. J. Baker, B. C. Thomas, S. W. Singer and
J. F. Banfield (2011)."EMIRGE: reconstruction of full-length ribosomal
genes from microbial community short read sequencing data." Genome
Biology 12(5): R44.
"""
import sys
import os
import re
import csv
from optparse import OptionParser, OptionGroup, SUPPRESS_HELP
import pysam
import numpy
from scipy import sparse
from subprocess import Popen, PIPE, check_call, CalledProcessError
from time import ctime, time
from datetime import timedelta
import gzip
import cPickle
import _emirge_amplicon as _emirge
# from ctrie import Trie
# from pykseq import Kseq
BOWTIE_l = 20
BOWTIE_e = 300
BOWTIE_ASCII_OFFSET = 33 # currently, bowtie writes quals with an ascii offset of 33
class Record:
"""
stripped down FASTA record class with same members as
Biopython FASTA Record Class
title -- with > removed
sequence -- as string
"""
_colwidth = 60 # colwidth specifies the number of residues to put on each line when generating FASTA format.
def __init__(self, title = "", sequence = ""):
"""
Create a new Record.
"""
self.title = title
self.sequence = sequence
def __str__(self):
return ">%s\n%s\n"%(self.title, "\n".join(re.findall(".{1,%s}"%self._colwidth, self.sequence)))
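# Illustrative use of Record (a minimal sketch, not called by the pipeline):
# >>> rec = Record(title="seq1", sequence="ACGT" * 20)   # 80 nt
# >>> print str(rec),
# >seq1
# <first 60 residues on one line>
# <remaining 20 residues on the next line>
# i.e. __str__ emits a FASTA entry wrapped at _colwidth (60) residues per line.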
def FastIterator(filehandle, dummyParser = None, record = None):
"""
a generator to return records one at a time. Maybe 160% faster on test case.
MAY RAISE MemoryError ON LARGE FASTA FILES
IN: file object
dummyParser is a placeholder for RecordParser from Biopython. Not used.
record is a Record instance to reuse for yielding; if None, an empty one is created with the standard init.
NOTE: this fasta iterator is fast, but it breaks easily with nonstandard input, for example
if there are "\r" in endlines.
"""
if record is None:
record = Record()
for recordstring in re.split('\n>', filehandle.read()[1:]):
record.title, record.sequence = recordstring.split('\n',1)
record.sequence = record.sequence.replace('\n','').replace(' ','')
yield record
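# Illustrative use of FastIterator (a sketch; "some.fasta" is a made-up filename).
# Note that the same Record object is reused and mutated on every yield, so copy
# the fields if you need to keep them between iterations:
# >>> for rec in FastIterator(file("some.fasta")):
# ...     print rec.title, len(rec.sequence)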
class EM(object):
"""
driver class for EM algorithm
"""
_VERBOSE = True
base2i = {"A":0,"T":1,"C":2,"G":3}
i2base = dict([(v,k) for k,v in base2i.iteritems()])
# asciibase2i = {65:0,84:1,67:2,71:3}
clustermark_pat = re.compile(r'(\d+\|.?\|)?(.*)') # cludgey code associated with this should go into a method: get_seq_i()
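# For example (illustrative name): clustermark_pat.match("3|x|SomeSeq").groups()
# returns ('3|x|', 'SomeSeq'), while a name with no cluster mark gives (None, name).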
DEFAULT_ERROR = 0.05
def __init__(self, reads1_filepath, reads2_filepath,
insert_mean,
insert_sd,
n_cpus = 1,
cwd = os.getcwd(), max_read_length = 76,
iterdir_prefix = "iter.", cluster_thresh = 0.97,
mapping_nice = None,
reads_ascii_offset = 64,
expected_coverage_thresh = 10,
rewrite_reads = True):
"""
n_cpus is how many processors to use for multithreaded steps (currently only the bowtie mapping)
mapping_nice is nice value to add to mapping program
"""
if reads2_filepath is not None:
assert not reads2_filepath.endswith('.gz'), "Only read1 file is allowed to be gzipped"
self.reads1_filepath = reads1_filepath
self.reads2_filepath = reads2_filepath
self.insert_mean = insert_mean
self.insert_sd = insert_sd
self.n_cpus = n_cpus
self.mapping_nice = mapping_nice
self.reads_ascii_offset = reads_ascii_offset
self.iteration_i = None # keeps track of which iteration we are on.
self.cwd = cwd
self.max_read_length = max_read_length
self.iterdir_prefix = iterdir_prefix
self.cluster_thresh = cluster_thresh # if two sequences evolve to be >= cluster_thresh identical (via usearch), then merge them. [0, 1.0]
assert self.cluster_thresh >= 0 and self.cluster_thresh <= 1.0
self.expected_coverage_thresh = expected_coverage_thresh
# Single numpy array. Has the shape: (numsequences x numreads) [numreads can be numpairs]
self.likelihoods = None # = Pr(R_i|S_i), the likelihood of generating read R_i given sequence S_i
# list of numpy arrays. list index is iteration number. Each numpy array has the shape: (numsequences,)
self.priors = [] # = Pr(S_i), the prior probability that sequence S generated any read
# list of numpy arrays. list index is iteration number. Each numpy array has the shape: (numsequences x numreads)
self.posteriors = [] # = Pr(S_i|R_i), the posterior probability that sequence S_i generated read R_i
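# A rough sketch of the shapes involved (illustrative numbers): with n_sequences = 3
# and n_reads = 1000, likelihoods is a 3 x 1000 sparse matrix, priors[-1] has shape
# (3,), and posteriors[-1] is sparse with one row per sequence and one column per read
# (allocated with one extra row and column in read_bam).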
# dict's and list keeping id mappings between sequence names and internal indices (seq_i)
# index is stable between iterations. If sequence_i2sequence value is None, means this sequence was abandoned in a previous round
self.sequence_name2sequence_i = {}
self.sequence_i2sequence_name = [] # list index is iteration number.
self.split_seq_first_appeared = {} # seq_i --> iteration first seen. Useful for keeping track of when a sequence first appeared, and not allowing merging of recently split out sequences
# similar to above except for reads -- deprecated
# self.read_name2read_i = {} # Trie()
# self.read_i2read_name = numpy.array([], dtype=numpy.uint) -- DEPRECATED
self.n_reads = 0 # in fastq input (number of reads **or number of read pairs**)
self.n_reads_mapped = 0
self.n_sequences = 0
# other constants, potentially changeable, tunable later, or could incorporate into probabilistic model.
self.min_depth = 5.0 # minimum depth to keep sequence around for next round
self.min_prior = None # minimum prior probability for a sequence to keep it around for next round (alternative to depth, which is
# a little weird when you allow mappings to more than one place. NOT YET IMPLEMENTED
self.base_coverages = [] # list of numpy arrays -- per base coverage values.
self.min_length_coverage_def = 1 # EXPERIMENTAL: Minimum coverage in order to be counted in min_length_coverage
self.min_length_coverage = None # EXPERIMENTAL. Fraction of length that has to be covered by >= min_length_coverage_def
self.snp_minor_prob_thresh = 0.10 # if prob(N) for minor allele base N is >= this threshold, call site a minor allele
self.snp_percentage_thresh = 0.10 # if >= this percentage of bases are minor alleles (according to self.snp_minor_prob_thresh),
# then split this sequence into two sequences.
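# Worked example of the split thresholds (illustrative length): for a 1500 bp sequence
# with the defaults above, a position counts as a minor allele if its second-best base
# has prob >= 0.10, and a split is considered only if >= 150 positions (10%) qualify.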
# rewrite reads for index mapping, set self.n_reads
self.temporary_files = [] # to remove at end of run
if rewrite_reads:
self.rewrite_reads() # also sets self.n_reads
else: # hidden option in main to avoid rewriting reads from big files more than necessary
# if file already has correct integer read names, then simply count reads in file
if self._VERBOSE:
sys.stderr.write("Counting reads in input files at %s...\n"%(ctime()))
start_time = time()
cmd = "cat %s | wc -l"%(self.reads1_filepath)
if self.reads1_filepath.endswith('.gz'):
cmd = "z" + cmd
p = Popen(cmd, shell=True, stdout=PIPE)
stdoutdata, stderrdata = p.communicate()
self.n_reads = int(stdoutdata.strip())
if self._VERBOSE:
sys.stderr.write("DONE Counting reads in input files at %s [%s]...\n"%(ctime(), timedelta(seconds = time()-start_time)))
if self._VERBOSE:
sys.stderr.write("Number of reads (or read pairs) in input file(s): %d\n"%(self.n_reads))
self.reads_seen = numpy.zeros(self.n_reads, dtype=numpy.uint8) # bool matrix of reads seen mapped at any iteration
# where 1st dimension is read index (from rewritten file headers)
# and second dimension is read number (0 or 1 ==> read /1 or read /2)
# 3rd dimension for reads and quals is max_readlen
self.reads = numpy.empty((self.n_reads, 2, self.max_read_length), dtype=numpy.uint8)
self.quals = numpy.empty_like(self.reads)
self.readlengths = numpy.empty((self.n_reads, 2), dtype = numpy.uint16)
# read through reads file again, fill these.
if self._VERBOSE:
sys.stderr.write("Preallocating reads and quals in memory at %s...\n"%(ctime()))
start_time = time()
_emirge.populate_reads_arrays(self)
if self._VERBOSE:
sys.stderr.write("DONE Preallocating reads and quals in memory at %s [%s]...\n"%(ctime(), timedelta(seconds = time()-start_time)))
return
def rewrite_reads(self):
"""
rewrite reads files with indices as only info in header.
Though this requires an inefficient rewrite of the fastq file,
it means that reading of bam files do not require a costly separate
id lookup step on the read name.
also: sets self.n_reads
initialize self.reads_seen # bool matrix of reads seen mapped at any iteration
"""
if self._VERBOSE:
sys.stderr.write("Rewriting reads with indices in headers at %s...\n"%(ctime()))
start_time = time()
tmp_n_reads_file_path = os.path.join(self.cwd, "emirge_tmp_n_reads.txt")
for i in (1, 2):
reads_filepath = getattr(self, "reads%s_filepath"%i)
if reads_filepath is None: # if not paired end, then self.reads2_filepath should be None
continue
new_reads_filepath = os.path.join(self.cwd, "emirge_tmp_reads_%s.fastq"%i)
self.temporary_files.append(new_reads_filepath)
setattr(self, "reads%s_filepath"%i, new_reads_filepath)
# first try awk, which is fast:
try:
cmd = """cat %s | awk 'BEGIN {i=0} {if ((NR-1)%%4==0) {print "@"i; i++} else print $0} END {print i > "%s"} ' > %s"""%(reads_filepath, tmp_n_reads_file_path, new_reads_filepath)
if reads_filepath.endswith('.gz'):
cmd = 'z' + cmd
check_call(cmd, shell=True, stdout = sys.stdout, stderr = sys.stderr)
self.n_reads = int(file(tmp_n_reads_file_path).readline().strip())
os.remove(tmp_n_reads_file_path)
continue # awk code worked
except CalledProcessError:
if self._VERBOSE:
sys.stderr.write("\tawk rewrite of reads failed! Is awk installed?\n")
raise
# sys.stderr.write("\tawk rewrite failed, falling back to pykseq...\n")
# COMMENTED OUT FOR THE TIME BEING. REASONABLE TO EXPECT AWK
# if code reaches here, means awk failed, so use pykseq instead (about 2X slower)
# outf = file(new_reads_filepath, 'w')
# outf_write = outf.write
# ks = Kseq(reads_filepath)
# i = 0
# while 1:
# t = ks.read_sequence_and_quals()
# if t is None:
# break
# else:
# outf_write("@%s\n%s\n+\n%s\n" % (i, t[1], t[2]))
# i += 1
# outf.close()
# del ks
# self.n_reads = i
if self._VERBOSE:
sys.stderr.write("DONE Rewriting reads with indexes in headers at %s [%s]...\n"%(ctime(), timedelta(seconds = time()-start_time)))
return
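# Illustration of the header rewrite above (made-up read name): the first record's
# header line, e.g. "@HWI-ST1234:8:1101:1492:1963 1:N:0:ACGT", becomes simply "@0",
# the next becomes "@1", and so on; sequence, "+" and quality lines pass through
# unchanged, and the total read count is written to emirge_tmp_n_reads.txt.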
def read_bam(self, bam_filename, reference_fasta_filename):
"""
reads a bam file and...
updates:
self.sequence_i2sequence_name # a numpy array
self.sequence_name2sequence_i # a dict
self.read_name2read_i # a dict
self.probN
doesn't do anything with these anymore, they should be populated and stable with _emirge.populate_reads_arrays
self.reads
self.quals
self.readlengths
creates a new EMPTY entry each iteration (appends to list, removes t-2):
self.priors
self.posteriors
creates new each iteration (overwrites):
self.likelihoods
self.unmapped_bases
self.coverage
self.bamfile_data
This MUST maintain seq_i to name and read_i to name mappings between iterations, so that a single
name always maintains the same index from one iteration to the next. One result of this requirement
is that the various matrices can always get larger in a later t, but never smaller (as reads or seqs are added)
"""
if self._VERBOSE:
sys.stderr.write("Reading bam file %s at %s...\n"%(bam_filename, ctime()))
start_time = time()
initial_iteration = self.iteration_i < 0 # this is initial iteration
self.current_bam_filename = bam_filename
self.current_reference_fasta_filename = reference_fasta_filename
self.fastafile = pysam.Fastafile(self.current_reference_fasta_filename)
# set here:
# self.sequence_name2sequence_i
# self.sequence_i2sequence_name
# self.bamfile_data numpy array with (seq_i, read_i, pair_i, rlen, pos, is_reverse)
_emirge.process_bamfile(self, BOWTIE_ASCII_OFFSET)
self.n_sequences = len(self.sequence_name2sequence_i)
t_check = time()
self.priors.append(numpy.zeros(self.n_sequences, dtype = numpy.float))
self.likelihoods = sparse.coo_matrix((self.n_sequences, self.n_reads), dtype = numpy.float) # init all to zero.
self.posteriors.append(sparse.lil_matrix((self.n_sequences+1, self.n_reads+1), dtype=numpy.float))
self.probN = [None for x in range(self.n_sequences)] # TODO: is this necessary any more? or is bookkeeping with probN good enough now.
self.unmapped_bases = [None for x in self.probN]
self.mean_read_length = numpy.mean(self.readlengths)
# reset probN for valid sequences (from current_reference_fasta_filename).
# is this still necessary? Or do I keep probN bookkeeping in order already?
t_check = time()
_emirge.reset_probN(self) # also updates coverage values and culls via fraction of length covered
# print >> sys.stderr, "DEBUG: reset_probN loop time: %s"%(timedelta(seconds = time()-t_check))
for d in [self.priors, self.posteriors]:
if len(d) > 2:
trash = d.pop(0) # no longer care about t-2
del trash
if self._VERBOSE:
sys.stderr.write("DONE Reading bam file %s at %s [%s]...\n"%(bam_filename, ctime(), timedelta(seconds = time()-start_time)))
return
def initialize_EM(self, bam_filename, reference_fasta_filename, randomize_priors = False):
"""
Set up EM with two things so that first iteration can proceed:
- Initial guesses of Pr(S) are made purely based on read counts, where each read is only allowed to
map only once to a single best reference (**if more than one alignment reported per read, raise exception!**).
- Initial guess of Pr(N=n) (necessary for the likelihood in Pr(S|R)) is also calculated simply, with the assumption
of 1 read (the best again) mapped to exactly 1 sequence. Thus Pr(N=n) only takes the base call errors
into account. This is actually not done here, but rather the first time self.calc_probN is called.
- bamfile for iteration 0 is assumed to have just one ("best") mapping per read.
- there is no t-1 for t = 0, hence the need to set up Pr(S)
if randomize_priors == True, then after calculating priors,
shuffle them randomly. This is useful for debugging
purposes, to test effect of initialization, robustness of
results, and how often the algorithm gets stuck in local
maxima.
"""
if self._VERBOSE:
sys.stderr.write("Beginning initialization at %s...\n"%(ctime()))
self.iteration_i = -1
self.read_bam(bam_filename, reference_fasta_filename)
# initialize priors. Here just adding a count for each read mapped to each reference sequence
# since bowtie run with --best and reporting just 1 alignment at random, there is some stochasticity here.
for (seq_i, read_i, pair_i, rlen, pos, is_reverse) in self.bamfile_data:
# if self.probN[seq_i] is not None:
self.priors[-1][seq_i] += 1
# this shouldn't be necessary with way I do initial mapping right now (all seq_i in priors should be nonzero initially)
nonzero_indices = numpy.nonzero(self.priors[-1]) # only divide cells with at least one count. Set all others to Pr(S) = 0
self.priors[-1] = self.priors[-1][nonzero_indices] / self.priors[-1][nonzero_indices].sum() # turn these into probabilities
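# Worked example (illustrative counts): if 20 reads map 2/6/12 to three references,
# the initial priors become [0.1, 0.3, 0.6].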
if randomize_priors:
numpy.random.shuffle(self.priors[-1])
self.priors.append(self.priors[-1].copy()) # push this back to t-1 (index == -2)
# write priors as special case:
self.print_priors(os.path.join(self.cwd, "priors.initialized.txt"))
if self._VERBOSE:
sys.stderr.write("DONE with initialization at %s...\n"%(ctime()))
return
def do_iteration(self, bam_filename, reference_fasta_filename):
"""
This starts with the M-step, so it requires that Pr(S) and Pr(N=n) from previous round are set.
Pr(S) is used from the previous round's E-step.
Pr(N=n) partially depends on the previous round's M-step.
Once M-step is done, then E-step calculates Pr(S) based upon the just-calculated M-step.
"""
self.iteration_i += 1
if self._VERBOSE:
sys.stderr.write("Starting iteration %d at %s...\n"%(self.iteration_i, ctime()))
start_time = time()
self.iterdir = os.path.join(self.cwd, "%s%02d"%(self.iterdir_prefix, self.iteration_i))
check_call("mkdir -p %s"%(self.iterdir), shell=True)
self.read_bam(bam_filename, reference_fasta_filename) # initializes all data structures.
# m-step
self.calc_likelihoods()
self.calc_posteriors()
# now e-step
self.calc_priors()
# now write a new fasta file. Cull sequences below self.min_depth
consensus_filename = os.path.join(self.iterdir, "iter.%02d.cons.fasta"%(self.iteration_i))
self.write_consensus(consensus_filename) # culls and splits
self.cluster_sequences(consensus_filename) # merges sequences that have evolved to be the same (USEARCH)
# leave a few things around for later. Note that print_priors also leaves sequence_name2sequence_i mapping, basically.
if self._VERBOSE:
sys.stderr.write("Writing priors and probN to disk for iteration %d at %s...\n"%(self.iteration_i, ctime()))
self.print_priors()
# python gzip.GzipFile is slow. Use system call to gzip instead
pickled_filename = os.path.join(self.iterdir, 'probN.pkl')
cPickle.dump(self.probN, file(pickled_filename, 'w'), cPickle.HIGHEST_PROTOCOL)
check_call("gzip -f %s"%(pickled_filename), shell=True, stdout = sys.stdout, stderr = sys.stderr)
if self._VERBOSE:
sys.stderr.write("DONE Writing priors and probN to disk for iteration %d at %s...\n"%(self.iteration_i, ctime()))
# delete bamfile from previous round (keep -- and convert to
# compressed bam -- initial iteration mapping in the
# background)
if self.iteration_i == 0 and self.current_bam_filename.endswith(".u.bam"): # initial iteration
renamed = self.current_bam_filename[:-len(".u.bam")] + ".bam" # strip the ".u.bam" suffix (rstrip would strip a character set, not a suffix)
# self.initial_compress_process = Popen(["samtools", "view", "-h", "-b", self.current_bam_filename, "-o", renamed], stdout = sys.stdout, stderr = sys.stderr) # child process runs in background
self.initial_compress_process = Popen("samtools view -h -b %s > %s"%(self.current_bam_filename, renamed), shell=True, stderr = sys.stderr) # child process runs in background
self.initial_bam_filename_to_remove = self.current_bam_filename
if self.iteration_i >= 1:
os.remove(self.current_bam_filename)
# check up on initial mapping compression background process once per iteration here
if self.initial_compress_process is not None:
poll = self.initial_compress_process.poll()
if poll == 0: # completed successfully
os.remove(self.initial_bam_filename_to_remove)
self.initial_compress_process = None # don't bother in future
elif poll is None:
if self.iteration_i == self.max_iterations - 1: # shouldn't happen... but to be correct
print >> sys.stderr, "Waiting for initial bamfile to compress before finishing...",
self.initial_compress_process.wait()
print >> sys.stderr, "DONE"
else:
pass
else: # poll() returned something bad.
print >> sys.stderr, "WARNING: Failed to compress initial mapping bamfile %s.\nWARNING: Failure with exit code: %s.\nWARNING: File remains uncompressed: %s"%(poll, self.initial_bam_filename_to_remove)
self.initial_compress_process = None # don't bother in future
# now do a new mapping run for next iteration
self.do_mapping(consensus_filename, nice = self.mapping_nice)
if self._VERBOSE:
sys.stderr.write("Finished iteration %d at %s...\n"%(self.iteration_i, ctime()))
sys.stderr.write("Total time for iteration %d: %s\n"%(self.iteration_i, timedelta(seconds = time()-start_time)))
return
def print_priors(self, ofname = None):
"""
leave a file in directory with nonzero priors printed out.
"""
if ofname is not None:
of = file(ofname, 'w')
else:
of = file(os.path.join(self.iterdir, "priors.iter.%02d.txt"%(self.iteration_i)), 'w')
sequence_i2sequence_name_array = numpy.array(self.sequence_i2sequence_name) # faster slicing?
for seq_i, prior in enumerate(self.priors[-1]):
seqname = sequence_i2sequence_name_array[seq_i]
of.write("%d\t%s\t%.10f\n"%(seq_i, seqname, prior))
of.close()
def calc_priors(self):
"""
calculates priors [ Pr(S) ] based on
Pr(S|R) (current posteriors from previous M step, this iteration)
"""
# here we do have column summing with the posteriors
# therefore, should be csc sparse type for efficient summing
self.posteriors[-1] = self.posteriors[-1].tocsc()
self.priors[-1] = numpy.asarray(self.posteriors[-1].sum(axis = 1)).flatten() / self.posteriors[-1].sum()
return
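# Worked sketch of calc_priors (illustrative numbers): if posteriors[-1] for two
# sequences and three reads is [[0.9, 0.5, 0.0], [0.1, 0.5, 1.0]], the row sums are
# [1.4, 1.6] and the matrix total is 3.0, giving priors of [0.467, 0.533].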
def write_consensus(self, outputfilename):
"""
writes a consensus, taking the most probable base at each position, according to current
values in Pr(N=n) (self.probN)
only write sequences with coverage above self.min_depth (culling)
split sequences with many minor alleles:
self.snp_minor_prob_thresh # if prob(N) for minor allele base N is >= this threshold, call site a minor allele
self.snp_percentage_thresh # if >= this percentage of bases are minor alleles (according to self.snp_minor_prob_thresh),
# then split this sequence into two sequences.
"""
if self._VERBOSE:
sys.stderr.write("Writing consensus for iteration %d at %s...\n"%(self.iteration_i, ctime()))
sys.stderr.write("\tsnp_minor_prob_thresh = %.3f\n"%(self.snp_minor_prob_thresh))
sys.stderr.write("\tsnp_percentage_thresh = %.3f\n"%(self.snp_percentage_thresh))
t0 = time()
splitcount = 0
cullcount = 0
of = file(outputfilename, 'w')
times_split = [] # DEBUG
times_posteriors = [] # DEBUG
seqs_to_process = len(self.probN) # DEBUG
i2base = self.i2base
rows_to_add = [] # these are for updating posteriors at end with new minor strains
cols_to_add = []
data_to_add = []
probNtoadd = [] # for newly split out sequences
self.posteriors[-1] = self.posteriors[-1].tolil() # just to make sure this is in row-access-friendly format
loop_t0 = time()
for seq_i in range(len(self.probN)):
seq_i_t0 = time()
if self.probN[seq_i] is None: # means this sequence is no longer present in this iteration or was culled in reset_probN
continue
# FOLLOWING CULLING RULES REMOVED in favor of length-coverage culling in reset_probN()
# check if coverage passes self.min_depth, if not don't write it (culling happens here)
# if self.min_depth is not None and self.coverage[seq_i] < self.min_depth: # and self.iteration_i > 5:
# # could adjust priors and posteriors here, but because
# # prior will already be low (b/c of low coverage) and
# # because next round will have 0 mappings (no sequence
# # in reference file to map to), this seems
# # unneccesary.
# # probNarray = None # NOT PASSED BY REF, assignment is only local?
# self.probN[seq_i] = None
# cullcount += 1
# continue # continue == don't write it to consensus.
# else passes culling thresholds
title = self.sequence_i2sequence_name[seq_i]
consensus = numpy.array([i2base.get(x, "N") for x in numpy.argsort(self.probN[seq_i])[:,-1]])
# check for minor allele consensus, SPLIT sequence into two candidate sequences if passes thresholds.
minor_indices = numpy.argwhere((self.probN[seq_i] >= self.snp_minor_prob_thresh).sum(axis=1) >= 2)[:,0]
if minor_indices.shape[0] > 0:
minor_fraction_avg = numpy.mean(self.probN[seq_i][(minor_indices, numpy.argsort(self.probN[seq_i][minor_indices])[:, -2])])
else:
minor_fraction_avg = 0.0
# NEW rule: only split sequence if *expected* coverage
# of newly split minor sequence (assuming uniform read
# coverage over reconstructed sequence) is > some
# threshold. Here, expected coverage is calculated
# based on:
# Prior(seq_i) * minor allele fraction * number of MAPPED reads * avg read length / sequence length (x2 if reads are paired)
expected_coverage_minor = ( self.priors[-1][seq_i] * minor_fraction_avg * self.n_reads_mapped * self.mean_read_length ) / self.probN[seq_i].shape[0]
expected_coverage_major = ( self.priors[-1][seq_i] * (1-minor_fraction_avg) * self.n_reads_mapped * self.mean_read_length ) / self.probN[seq_i].shape[0]
if self.reads2_filepath is not None: # multiply by 2 because n_reads_mapped is actually number of mapped pairs
expected_coverage_minor = expected_coverage_minor * 2.0
expected_coverage_major = expected_coverage_major * 2.0
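# Worked example of this rule (illustrative numbers): with prior = 0.01,
# minor_fraction_avg = 0.2, 1e6 mapped pairs, mean read length 100 and a 1500 bp
# sequence, expected_coverage_minor = 0.01 * 0.2 * 1e6 * 100 / 1500 * 2 ~= 267x,
# well above the default expected_coverage_thresh of 10.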
if minor_indices.shape[0] / float(self.probN[seq_i].shape[0]) >= self.snp_percentage_thresh and \
expected_coverage_minor >= self.expected_coverage_thresh:
# We split!
splitcount += 1
if self._VERBOSE:
t0_split = time()
major_fraction_avg = 1.-minor_fraction_avg # (if there are >=3 alleles, the major allele keeps the prob of the other minors)
minor_bases = numpy.array([i2base.get(x, "N") for x in numpy.argsort(self.probN[seq_i][minor_indices])[:,-2]]) # -2 gets second most probable base
minor_consensus = consensus.copy() # get a copy of the consensus
minor_consensus[minor_indices] = minor_bases # replace the bases that pass minor threshold
# now deal with naming.
title_root = re.search(r'(.+)(_m(\d+))$', title)
if title_root is None: # no _m00 on this name
title_root = title[:]
else:
title_root = title_root.groups()[0]
# now check for any known name with same root and a _m on it.
previous_m_max = max([0] + [int(x) for x in re.findall(r'%s_m(\d+)'%re.escape(title_root), " ".join(self.sequence_i2sequence_name))])
m_title = "%s_m%02d"%(title_root, previous_m_max+1)
# also split out Priors and Posteriors (which will be used in next round), split with average ratio of major to minor alleles.
# updating priors first:
old_prior = self.priors[-1][seq_i]
self.priors[-1][seq_i] = old_prior * major_fraction_avg
seq_i_minor = self.n_sequences
self.n_sequences += 1
self.sequence_i2sequence_name.append(m_title)
assert len(self.sequence_i2sequence_name) == self.n_sequences
assert len(self.sequence_i2sequence_name) == seq_i_minor + 1
self.sequence_name2sequence_i[m_title] = seq_i_minor
self.split_seq_first_appeared[seq_i] = self.iteration_i
# how I adjust probN here for newly split seq doesn't really matter,
# as it is re-calculated next iter.
# this only matters for probN.pkl.gz file left behind for this iteration.
# for now just set prob(major base) = 0 and redistribute prob to other bases for minor,
# and set prob(minor base) = 0 and redistribute prob to other bases for major
# MINOR
major_base_i = numpy.argsort(self.probN[seq_i][minor_indices])[:, -1]
newprobNarray = self.probN[seq_i].copy()
newprobNarray[(minor_indices, major_base_i)] = 0
newprobNarray = newprobNarray / numpy.sum(newprobNarray, axis=1).reshape(newprobNarray.shape[0], 1)
probNtoadd.append(newprobNarray)
self.base_coverages.append(numpy.zeros_like(self.base_coverages[seq_i]))
# MAJOR
minor_base_i = numpy.argsort(self.probN[seq_i][minor_indices])[:, -2]
self.probN[seq_i][(minor_indices, minor_base_i)] = 0
self.probN[seq_i] = self.probN[seq_i] / numpy.sum(self.probN[seq_i], axis=1).reshape(self.probN[seq_i].shape[0], 1)
new_priors = numpy.zeros(seq_i_minor+1, dtype=self.priors[-1].dtype)
new_priors[:-1] = self.priors[-1].copy()
new_priors[seq_i_minor] = old_prior * minor_fraction_avg
trash = self.priors.pop()
del trash
self.priors.append(new_priors)
# keep track of all new minor data to add and add it
# once at end for ALL split sequences with one coo
# matrix construction, instead of each iteration.
t_posterior = time()
# new_read_probs, new_rows, new_cols = adjust_posteriors_for_split(AAAA, BBBB, CCCC) # TODO: could move to Cython
# updating posteriors. for each seq-read pair with prob > 0, split prob out to major and minor seq.
new_cols = self.posteriors[-1].rows[seq_i] # col in coo format
new_read_probs = [x * minor_fraction_avg for x in self.posteriors[-1].data[seq_i]] # data in coo format
new_rows = [seq_i_minor for x in new_cols] # row in coo format
# add new read probs to cache of new read probs to add at end of loop
rows_to_add.extend(new_rows)
cols_to_add.extend(new_cols)
data_to_add.extend(new_read_probs)
# adjust old read probs to reflect major strain
self.posteriors[-1].data[seq_i] = [x * major_fraction_avg for x in self.posteriors[-1].data[seq_i]]
times_posteriors.append(time() - t_posterior)
# adjust self.unmapped_bases (used in clustering). For now give same pattern as parent
self.unmapped_bases.append(self.unmapped_bases[seq_i].copy())
# write out minor strain consensus
of.write(">%s\n"%(m_title))
of.write("%s\n"%("".join(minor_consensus)))
if self._VERBOSE:
sys.stderr.write("splitting sequence %d (%s) to %d (%s)...\n"%(seq_i, title,
seq_i_minor, m_title))
times_split.append(time()-seq_i_t0)
# now write major strain consensus, regardless of whether there was a minor strain consensus
of.write(">%s\n"%(title))
of.write("%s\n"%("".join(consensus)))
# END LOOP
loop_t_total = time() - loop_t0
# update posteriors matrix with newly added minor sequences new_posteriors via coo, then convert to csr.
new_posteriors = self.posteriors[-1].tocoo() # first make a copy in coo format
# then create new coo matrix with new shape, appending new row, col, data to old row, col, data
new_posteriors = sparse.coo_matrix((numpy.concatenate((new_posteriors.data, data_to_add)),
(numpy.concatenate((new_posteriors.row, rows_to_add)),
numpy.concatenate((new_posteriors.col, cols_to_add)))),
shape=(self.n_sequences, self.posteriors[-1].shape[1]),
dtype=new_posteriors.dtype).tocsr()
# finally, exchange in this new matrix
trash = self.posteriors.pop()
del trash
self.posteriors.append(new_posteriors)
# update probN array:
self.probN.extend(probNtoadd)
if self._VERBOSE:
total_time = time()-t0
sys.stderr.write("\tSplit out %d new minor strain sequences.\n"%(splitcount))
if splitcount > 0:
sys.stderr.write("\tAverage time for split sequence: [%.6f seconds]\n"%numpy.mean(times_split))
sys.stderr.write("\tAverage time for posterior update: [%.6f seconds]\n"%numpy.mean(times_posteriors))
sys.stderr.write("\tAverage time for non-split sequences: [%.6f seconds]\n"%((loop_t_total - sum(times_split)) / (seqs_to_process - len(times_split))))
# sys.stderr.write("\tCulled %d sequences\n"%(cullcount))
sys.stderr.write("DONE Writing consensus for iteration %d at %s [%s]...\n"%(self.iteration_i, ctime(), timedelta(seconds = total_time)))
return
def write_consensus_with_mask(self, output_fastafilename, mask):
"""
write a consensus sequence to output_fastafilename for each
sequence in probN where unmapped bases are replaced with:
mask == "hard" --> N
mask == "soft" --> lowercase letters
this is useful prior to usearch clustering.
OUT: number of sequences processed
"""
n_seqs = 0
i2base_get = self.i2base.get # for speed
of = file(output_fastafilename, 'w')
for seq_i in range(len(self.probN)):
if self.probN[seq_i] is None:
continue
title = self.sequence_i2sequence_name[seq_i]
consensus = numpy.array([i2base_get(x, "N") for x in numpy.argsort(self.probN[seq_i])[:,-1]])
# now replace consensus bases with no read support with N
unmapped_indices = numpy.where(self.unmapped_bases[seq_i] == 1)
if mask == "hard":
consensus[unmapped_indices] = 'N'
elif mask == "soft":
consensus[unmapped_indices] = [letter.lower() for letter in consensus[unmapped_indices]]
else:
raise ValueError, "Invalid valud for mask: %s (choose one of {soft, hard}"%mask
of.write(">%s\n"%(title))
of.write("%s\n"%("".join(consensus)))
n_seqs += 1
of.close()
return n_seqs
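# Illustration of the two masks on a made-up 10 bp consensus ACGTACGGCA whose
# positions 5-7 are unmapped:
# mask == "hard" -> ACGTNNNGCA
# mask == "soft" -> ACGTacgGCA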
def cluster_sequences(self, fastafilename):
"""
Right now, this simply calls cluster_sequences_usearch, which
uses USEARCH. Could swap in other functions here if there
were faster or just alternative clustering methods to try out
called function should also adjust Pr(S) [prior] and Pr(S_t-1)
[posteriors] as needed after merging.
"""
return self.cluster_sequences_usearch(fastafilename)
def cluster_sequences_usearch(self, fastafilename):
"""
uses Edgar's USEARCH to merge sequences above self.cluster_thresh %ID over the
length of the shorter sequence
"Search and clustering orders of magnitude faster than BLAST"
Robert C. Edgar
Bioinformatics 2010
Merge two sequences if the *NON-GAPPED* positions have %
identity >= self.cluster_thresh
also adjusts Pr(S) [prior] and Pr(S_t-1) [posteriors] as needed after merging.
"""
if self._VERBOSE:
sys.stderr.write("Clustering sequences for iteration %d at %s...\n"%(self.iteration_i, ctime()))
sys.stderr.write("\tcluster threshold = %.3f\n"%(self.cluster_thresh))
start_time = time()
tocleanup = [] # list of temporary files to remove after done
# get posteriors ready for slicing (just prior to this call, is csr matrix?):
self.posteriors[-1] = self.posteriors[-1].tolil()
# NOTE that in this fasta file, positions with no mapped bases are
# soft-masked (lowercase, via write_consensus_with_mask below), so that
# they can be excluded from the %ID calculation over mapped columns
tmp_fastafilename = fastafilename + ".tmp.fasta"
num_seqs = self.write_consensus_with_mask(tmp_fastafilename, mask="soft")
tocleanup.append(tmp_fastafilename)
tmp_fastafile = pysam.Fastafile(tmp_fastafilename)
tocleanup.append("%s.fai"%(tmp_fastafilename))
# do global alignments with USEARCH/UCLUST.
# I don't use --cluster because it doesn't report alignments
# usearch is fast but will sometimes miss things -- I've tried to tune params as best as I can.
# and I use different parameters depending on how many input sequences there are
# Also, I use a lower %ID thresh than specified for joining because I really calculate %ID over *mapped* sequence positions.
sens_string = "--maxaccepts 8 --maxrejects 256"
uclust_id = 0.80
algorithm="-usearch_global"
# uclust_id = self.cluster_thresh - 0.05
# if em.iteration_i > 10:
# num_seqs = len([x for x in self.probN if x is not None])
assert num_seqs == len([x for x in self.probN if x is not None])
if num_seqs < 1000:
sens_string = "--maxaccepts 16 --maxrejects 256"
if num_seqs < 500:
sens_string = "--maxaccepts 32 --maxrejects 256"
if num_seqs < 150:
algorithm="-search_global"
sens_string = "--maxaccepts 0 --maxrejects 0" # slower, but more sensitive.
# if there are only a few seqs, there is no reason not to do full Smith-Waterman / Needleman-Wunsch alignments
if num_seqs < 50:
algorithm="-search_global"
sens_string = "-fulldp"
# there is a bug in usearch threads that I can't figure out (fails with many threads). Currently limiting to max 6 threads
usearch_threads = min(6, self.n_cpus)
cmd = "usearch %s %s --db %s --id %.3f -quicksort -query_cov 0.5 -target_cov 0.5 -strand plus --userout %s.us.txt --userfields query+target+id+caln+qlo+qhi+tlo+thi -threads %d %s"%\
(algorithm,
tmp_fastafilename, tmp_fastafilename,
uclust_id,
tmp_fastafilename,
usearch_threads,
sens_string)
if self._VERBOSE:
sys.stderr.write("usearch command was:\n%s\n"%(cmd))
check_call(cmd, shell=True, stdout = sys.stdout, stderr = sys.stderr)
# read clustering file to adjust Priors and Posteriors, summing merged reference sequences
tocleanup.append("%s.us.txt"%tmp_fastafilename)
nummerged = 0
alnstring_pat = re.compile(r'(\d*)([MDI])')
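# Example of the compressed alignment ("caln") strings parsed below (illustrative):
# alnstring_pat.findall("120M2I15M3D200M") returns
# [('120', 'M'), ('2', 'I'), ('15', 'M'), ('3', 'D'), ('200', 'M')]; a bare letter
# such as "M" yields ('', 'M'), since usearch omits the count when it is 1.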
already_removed = set() # seq_ids
# this is a bit slow and almost certainly could be sped up with algorithmic improvements.
times = [] # DEBUG
for row in csv.reader(file("%s.us.txt"%tmp_fastafilename), delimiter='\t'):
# each row an alignment in userout file
t0 = time()
# member == query
member_name = row[0]
seed_name = row[1]
if member_name == seed_name:
continue # usearch allows self-hits, which we don't care about
member_seq_id = self.sequence_name2sequence_i.get(member_name)
seed_seq_id = self.sequence_name2sequence_i.get(seed_name)
if member_seq_id in already_removed or seed_seq_id in already_removed:
continue
# decide if these pass the cluster_thresh *over non-gapped, mapped columns*
member_fasta_seq = tmp_fastafile.fetch(member_name)
seed_fasta_seq = tmp_fastafile.fetch(seed_name)
member_unmapped = self.unmapped_bases[member_seq_id] # unmapped positions (default prob)
seed_unmapped = self.unmapped_bases[seed_seq_id]
# query+target+id+caln+qlo+qhi+tlo+thi %s"%\
# 0 1 2 3 4 5 6 7
member_start = int(row[4]) - 1 # printed as 1-based by usearch now
seed_start = int(row[6]) - 1
t0 = time()
# print >> sys.stderr, "DEBUG", alnstring_pat.findall(row[3])
aln_columns, matches = _emirge.count_cigar_aln(tmp_fastafile.fetch(seed_name),
tmp_fastafile.fetch(member_name),
self.unmapped_bases[seed_seq_id],
self.unmapped_bases[member_seq_id],
seed_start,
member_start,
alnstring_pat.findall(row[3]))
## print >> sys.stderr, "DEBUG: %.6e seconds"%(time()-t0)# timedelta(seconds = time()-t0)
# if alignment is less than 500 columns, or identity over those columns is not above the threshold, then skip this pair
seed_n_mapped_bases = self.unmapped_bases[seed_seq_id].shape[0] - self.unmapped_bases[seed_seq_id].sum()
member_n_mapped_bases = self.unmapped_bases[member_seq_id].shape[0] - self.unmapped_bases[member_seq_id].sum()
if (aln_columns < 500) \
or ((float(matches) / aln_columns) < self.cluster_thresh):
# or (float(aln_columns) / min(seed_n_mapped_bases, member_n_mapped_bases) < 0.9)
continue
minimum_residence_time = -1 # how many iters does a newly split out seq have to be around before it's allowed to merge again. -1 to turn this off.
member_first_appeared = self.split_seq_first_appeared.get(member_seq_id)
if member_first_appeared is not None and self.iteration_i - member_first_appeared <= minimum_residence_time:
continue
seed_first_appeared = self.split_seq_first_appeared.get(seed_seq_id)
if seed_first_appeared is not None and self.iteration_i - seed_first_appeared <= minimum_residence_time:
continue
if self._VERBOSE and num_seqs < 50:
print >> sys.stderr, "\t\t%s|%s vs %s|%s %.3f over %s aligned columns (usearch %%ID: %s)"%(member_seq_id, member_name, seed_seq_id, seed_name, float(matches) / aln_columns, aln_columns, row[2])
# if above thresh, then first decide which sequence to keep, (one with higher prior probability).
percent_id = (float(matches) / aln_columns) * 100.
t0 = time()
if self.priors[-1][seed_seq_id] > self.priors[-1][member_seq_id]:
keep_seq_id = seed_seq_id
remove_seq_id = member_seq_id
keep_name = seed_name
remove_name = member_name
else:
keep_seq_id = member_seq_id
remove_seq_id = seed_seq_id
keep_name = member_name
remove_name = seed_name
# merge priors (add remove_seq_id probs to keep_seq_id probs).
self.priors[-1][keep_seq_id] += self.priors[-1][remove_seq_id]
self.priors[-1][remove_seq_id] = 0.0
# now merge posteriors (all removed probs from remove_seq_id go to keep_seq_id).
# self.posteriors[-1] at this point is lil_matrix
# some manipulations of underlying sparse matrix data structures for efficiency here.
# 1st, do addition in csr format (fast), convert to lil format, and store result in temporary array.
new_row = (self.posteriors[-1].getrow(keep_seq_id).tocsr() + self.posteriors[-1].getrow(remove_seq_id).tocsr()).tolil()
# then change linked lists directly in the posteriors data structure -- this is very fast
self.posteriors[-1].data[keep_seq_id] = new_row.data[0]
self.posteriors[-1].rows[keep_seq_id] = new_row.rows[0]
# these two lines remove the row from the linked list (or rather, make them empty rows), essentially setting all elements to 0
self.posteriors[-1].rows[remove_seq_id] = []
self.posteriors[-1].data[remove_seq_id] = []
# set self.probN[removed] to be None -- note that this doesn't really matter, except for
# writing out probN.pkl.gz every iteration, as probN is recalculated from bam file
# with each iteration
self.probN[remove_seq_id] = None
already_removed.add(remove_seq_id)
nummerged += 1
if self._VERBOSE:
times.append(time()-t0)
sys.stderr.write("\t...merging %d|%s into %d|%s (%.2f%% ID over %d columns) in %.3f seconds\n"%\
(remove_seq_id, remove_name,
keep_seq_id, keep_name,
percent_id, aln_columns,
times[-1]))
# if len(times) and self._VERBOSE: # DEBUG
# sys.stderr.write("merges: %d\n"%(len(times)))
# sys.stderr.write("total time for all merges: %.3f seconds\n"%(numpy.sum(times)))
# sys.stderr.write("average time per merge: %.3f seconds\n"%(numpy.mean(times)))
# sys.stderr.write("min time per merge: %.3f seconds\n"%(numpy.min(times)))
# sys.stderr.write("max time per merge: %.3f seconds\n"%(numpy.max(times)))
# write new fasta file with only new sequences
if self._VERBOSE:
sys.stderr.write("Writing new fasta file for iteration %d at %s...\n"%(self.iteration_i, ctime()))
tmp_fastafile.close()
recordstrings=""
num_seqs = 0
for record in FastIterator(file(fastafilename)): # read through file again, overwriting orig file if we keep the seq
seqname = record.title.split()[0]
seq_id = self.sequence_name2sequence_i.get(seqname)
if seq_id not in already_removed:
recordstrings += str(record)
num_seqs += 1
outfile = file(fastafilename, 'w')
outfile.write(recordstrings)
outfile.close()
# clean up.
for fn in tocleanup:
os.remove(fn)
if self._VERBOSE:
sys.stderr.write("\tremoved %d sequences after merging\n"%(nummerged))
sys.stderr.write("\tsequences remaining for iteration %02d: %d\n"%(self.iteration_i, num_seqs))
sys.stderr.write("DONE Clustering sequences for iteration %d at %s [%s]...\n"%(self.iteration_i, ctime(), timedelta(seconds = time()-start_time)))
return
def do_mapping(self, full_fasta_path, nice = None):
"""
IN: path of fasta file to map reads to
run external mapping program to produce bam file
right now this is bowtie
should also set self.n_alignments and self.current_bam_filename
"""
if self._VERBOSE:
sys.stderr.write("Starting read mapping for iteration %d at %s...\n"%(self.iteration_i, ctime()))
start_time = time()
self.do_mapping_bowtie(full_fasta_path, nice = nice)
if self._VERBOSE:
sys.stderr.write("DONE with read mapping for iteration %d at %s [%s]...\n"%(self.iteration_i, ctime(), timedelta(seconds = time()-start_time)))
return
def do_mapping_bowtie(self, full_fasta_path, nice = None):
"""
run bowtie to produce bam file for next iteration
sets self.n_alignments
sets self.current_bam_filename
"""
bowtie_index = os.path.join(self.iterdir, "bowtie.index.iter.%02d"%(self.iteration_i))
bowtie_logfile = os.path.join(self.iterdir, "bowtie.iter.%02d.log"%(self.iteration_i))
# 1. build index
cmd = "bowtie-build -o 3 %s %s > %s"%(full_fasta_path , bowtie_index, bowtie_logfile) # -o 3 for speed? magnitude of speedup untested!
# note: just send stdout to log file, as stderr captured in emirge stderr
if self._VERBOSE:
sys.stderr.write("\tbowtie-build command:\n")
sys.stderr.write("\t%s\n"%cmd)
check_call(cmd, shell=True, stdout = sys.stdout, stderr = sys.stderr)
sys.stdout.flush()
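# For reference, bowtie-build (bowtie 1) writes six index files alongside the prefix:
# <prefix>.1.ebwt through <prefix>.4.ebwt plus <prefix>.rev.1.ebwt and <prefix>.rev.2.ebwt.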