diff --git a/taxon_filter.py b/taxon_filter.py
index f5e106a72..8a959042c 100755
--- a/taxon_filter.py
+++ b/taxon_filter.py
@@ -47,8 +47,7 @@ def parser_deplete_human(parser=argparse.ArgumentParser()):
     parser.add_argument('bmtaggerBam', help='Output BAM: depleted of human reads with BMTagger.')
     parser.add_argument('rmdupBam', help='Output BAM: bmtaggerBam run through M-Vicuna duplicate removal.')
     parser.add_argument(
-        'blastnBam',
-        help='Output BAM: rmdupBam run through another depletion of human reads with BLASTN.'
+        'blastnBam', help='Output BAM: rmdupBam run through another depletion of human reads with BLASTN.'
     )
     parser.add_argument(
         '--taxfiltBam',
@@ -89,9 +88,7 @@ def main_deplete_human(args):
     ''' Run the entire depletion pipeline: bmtagger, mvicuna, blastn.
         Optionally, use lastal to select a specific taxon of interest.'''
     tools.picard.RevertSamTool().execute(
-        args.inBam,
-        args.revertBam,
-        picardOptions=['SORT_ORDER=queryname', 'SANITIZE=true']
+        args.inBam, args.revertBam, picardOptions=['SORT_ORDER=queryname', 'SANITIZE=true']
     )
     multi_db_deplete_bam(
         args.revertBam,
@@ -102,7 +99,14 @@
         JVMmemory=args.JVMmemory
     )
     read_utils.rmdup_mvicuna_bam(args.bmtaggerBam, args.rmdupBam, JVMmemory=args.JVMmemory)
-    multi_db_deplete_bam(args.rmdupBam, args.blastDbs, deplete_blastn_bam, args.blastnBam, threads=args.threads, JVMmemory=args.JVMmemory)
+    multi_db_deplete_bam(
+        args.rmdupBam,
+        args.blastDbs,
+        deplete_blastn_bam,
+        args.blastnBam,
+        threads=args.threads,
+        JVMmemory=args.JVMmemory
+    )
     if args.taxfiltBam and args.lastDb:
         filter_lastal_bam(args.blastnBam, args.lastDb, args.taxfiltBam, JVMmemory=args.JVMmemory)
     return 0
@@ -115,7 +119,18 @@ def main_deplete_human(args):
 # ==========================
 
 
-def trimmomatic(inFastq1, inFastq2, pairedOutFastq1, pairedOutFastq2, clipFasta):
+def trimmomatic(
+    inFastq1,
+    inFastq2,
+    pairedOutFastq1,
+    pairedOutFastq2,
+    clipFasta,
+    leading_q_cutoff=15,
+    trailing_q_cutoff=15,
+    minlength_to_keep=30,
+    sliding_window_size=4,
+    sliding_window_q_cutoff=25
+):
     '''Trim read sequences with Trimmomatic.'''
     trimmomaticPath = tools.trimmomatic.TrimmomaticTool().install_and_get_path()
     tmpUnpaired1 = mkstempfname()
@@ -137,8 +152,15 @@ def trimmomatic(inFastq1, inFastq2, pairedOutFastq1, pairedOutFastq2, clipFasta)
 
     javaCmd.extend(
         [
-            inFastq1, inFastq2, pairedOutFastq1, tmpUnpaired1, pairedOutFastq2, tmpUnpaired2, 'LEADING:20',
-            'TRAILING:20', 'SLIDINGWINDOW:4:25', 'MINLEN:30', 'ILLUMINACLIP:{}:2:30:12'.format(clipFasta)
+            inFastq1, inFastq2, pairedOutFastq1, tmpUnpaired1, pairedOutFastq2, tmpUnpaired2,
+            'LEADING:{leading_q_cutoff}'.format(leading_q_cutoff=leading_q_cutoff),
+            'TRAILING:{trailing_q_cutoff}'.format(trailing_q_cutoff=trailing_q_cutoff),
+            'SLIDINGWINDOW:{sliding_window_size}:{sliding_window_q_cutoff}'.format(
+                sliding_window_size=sliding_window_size,
+                sliding_window_q_cutoff=sliding_window_q_cutoff,
+            ),
+            'MINLEN:{minlength_to_keep}'.format(minlength_to_keep=minlength_to_keep),
+            'ILLUMINACLIP:{clipFasta}:2:30:12'.format(clipFasta=clipFasta)
         ]
     )
 
@@ -153,6 +175,41 @@ def parser_trim_trimmomatic(parser=argparse.ArgumentParser()):
     parser.add_argument("inFastq2", help="Input reads 2")
     parser.add_argument("pairedOutFastq1", help="Paired output 1")
     parser.add_argument("pairedOutFastq2", help="Paired output 2")
+    parser.add_argument(
+        '--leadingQCutoff',
+        dest="leading_q_cutoff",
+        help='minimum quality required to keep a base on the leading end (default: %(default)s)',
+        type=int,
+        default=15
+    )
+    parser.add_argument(
+        '--trailingQCutoff',
+        dest="trailing_q_cutoff",
+        help='minimum quality required to keep a base on the trailing end (default: %(default)s)',
+        type=int,
+        default=15
+    )
+    parser.add_argument(
+        '--minlengthToKeep',
+        dest="minlength_to_keep",
+        help='minimum length of reads to be kept (default: %(default)s)',
+        type=int,
+        default=30
+    )
+    parser.add_argument(
+        '--slidingWindowSize',
+        dest="sliding_window_size",
+        help='the number of bases to average across (default: %(default)s)',
+        type=int,
+        default=4
+    )
+    parser.add_argument(
+        '--slidingWindowQCutoff',
+        dest="sliding_window_q_cutoff",
+        help='specifies the average quality required in the sliding window (default: %(default)s)',
+        type=int,
+        default=25
+    )
     parser.add_argument("clipFasta", help="Fasta file with adapters, PCR sequences, etc. to clip off")
     util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
     util.cmd.attach_main(parser, trimmomatic, split_args=True)
@@ -165,16 +222,17 @@ def parser_trim_trimmomatic(parser=argparse.ArgumentParser()):
 # *** filter_lastal ***
 # =======================
 
+
 def lastal_chunked_fastq(
-    inFastq, 
-    db, 
-    outFastq, 
+    inFastq,
+    db,
+    outFastq,
     max_gapless_alignments_per_position=1,
     min_length_for_initial_matches=5,
     max_length_for_initial_matches=50,
     max_initial_matches_per_position=100,
     chunk_size=100000
-    ):
+):
     lastal_path = tools.last.Lastal().install_and_get_path()
     mafsort_path = tools.last.MafSort().install_and_get_path()
 
@@ -249,12 +307,12 @@ def lastal_get_hits(
 ):
     filteredFastq = mkstempfname('.filtered.fastq')
     lastal_chunked_fastq(
-        inFastq, 
-        db, 
+        inFastq,
+        db,
         filteredFastq,
-        max_gapless_alignments_per_position=max_gapless_alignments_per_position, 
-        min_length_for_initial_matches=min_length_for_initial_matches, 
-        max_length_for_initial_matches=max_length_for_initial_matches, 
+        max_gapless_alignments_per_position=max_gapless_alignments_per_position,
+        min_length_for_initial_matches=min_length_for_initial_matches,
+        max_length_for_initial_matches=max_length_for_initial_matches,
         max_initial_matches_per_position=max_initial_matches_per_position
     )
 
@@ -383,12 +441,12 @@ def filter_lastal(
     filtered_fastq = mkstempfname('.filtered.fastq')
 
     lastal_chunked_fastq(
-        inFastq, 
-        refDb, 
+        inFastq,
+        refDb,
         filtered_fastq,
-        max_gapless_alignments_per_position=max_gapless_alignments_per_position, 
-        min_length_for_initial_matches=min_length_for_initial_matches, 
-        max_length_for_initial_matches=max_length_for_initial_matches, 
+        max_gapless_alignments_per_position=max_gapless_alignments_per_position,
+        min_length_for_initial_matches=min_length_for_initial_matches,
+        max_length_for_initial_matches=max_length_for_initial_matches,
         max_initial_matches_per_position=max_initial_matches_per_position
     )
 
@@ -456,15 +514,12 @@ def deplete_bmtagger_bam(inBam, db, outBam, threads=None, JVMmemory=None):
     bmtaggerConf = mkstempfname('.bmtagger.conf')
     with open(bmtaggerConf, 'w') as f:
         # Default srprismopts: "-b 100000000 -n 5 -R 0 -r 1 -M 7168"
-        print(
-            'srprismopts="-b 100000000 -n 5 -R 0 -r 1 -M 7168 --paired false"',
-            file=f)
+        print('srprismopts="-b 100000000 -n 5 -R 0 -r 1 -M 7168 --paired false"', file=f)
     tempDir = tempfile.mkdtemp()
     matchesFile = mkstempfname('.txt')
     cmdline = [
-        bmtaggerPath, '-b', db + '.bitmask', '-C', bmtaggerConf,
-        '-x', db + '.srprism', '-T', tempDir, '-q1', '-1', inReads1,
-        '-2', inReads2, '-o', matchesFile
+        bmtaggerPath, '-b', db + '.bitmask', '-C', bmtaggerConf, '-x', db + '.srprism', '-T', tempDir, '-q1', '-1',
+        inReads1, '-2', inReads2, '-o', matchesFile
     ]
     log.debug(' '.join(cmdline))
     util.misc.run_and_print(cmdline, check=True)
@@ -679,7 +734,14 @@ def parser_deplete_bam_bmtagger(parser=argparse.ArgumentParser()):
 
 def main_deplete_bam_bmtagger(args):
     '''Use bmtagger to deplete input reads against several databases.'''
-    multi_db_deplete_bam(args.inBam, args.refDbs, deplete_bmtagger_bam, args.outBam, threads=args.threads, JVMmemory=args.JVMmemory)
+    multi_db_deplete_bam(
+        args.inBam,
+        args.refDbs,
+        deplete_bmtagger_bam,
+        args.outBam,
+        threads=args.threads,
+        JVMmemory=args.JVMmemory
+    )
 
 
 __commands__.append(('deplete_bam_bmtagger', parser_deplete_bam_bmtagger))
@@ -701,18 +763,20 @@ def multi_db_deplete_bam(inBam, refDbs, deplete_method, outBam, threads=1, JVMme
 # *** deplete_blastn ***
 # ========================
 
+
 def run_blastn(blastn_path, db, input_fasta, blast_threads=1):
-    """ run blastn on the input fasta file. this is intended to be run in parallel """
-    chunk_hits = mkstempfname('.hits.txt')
-    blastnCmd = [
-        blastn_path, '-db', db, '-word_size', '16', '-num_threads', str(blast_threads), '-evalue', '1e-6', '-outfmt', '6', '-max_target_seqs', '2',
-        '-query', input_fasta, '-out', chunk_hits
-    ]
-    log.debug(' '.join(blastnCmd))
-    util.misc.run_and_print(blastnCmd, check=True)
+    """ run blastn on the input fasta file. this is intended to be run in parallel """
+    chunk_hits = mkstempfname('.hits.txt')
+    blastnCmd = [
+        blastn_path, '-db', db, '-word_size', '16', '-num_threads', str(blast_threads), '-evalue', '1e-6', '-outfmt',
+        '6', '-max_target_seqs', '2', '-query', input_fasta, '-out', chunk_hits
+    ]
+    log.debug(' '.join(blastnCmd))
+    util.misc.run_and_print(blastnCmd, check=True)
+
+    os.unlink(input_fasta)
+    return chunk_hits
 
-    os.unlink(input_fasta)
-    return chunk_hits
 
 def blastn_chunked_fasta(fasta, db, chunkSize=1000000, threads=1):
     """
@@ -730,18 +794,18 @@ def blastn_chunked_fasta(fasta, db, chunkSize=1000000, threads=1):
     blastnPath = tools.blast.BlastnTool().install_and_get_path()
 
     # clamp threadcount to number of CPUs minus one
-    threads = min(util.misc.available_cpu_count()-1, threads)
+    threads = min(util.misc.available_cpu_count() - 1, threads)
 
     # determine size of input data; records in fasta file
     number_of_reads = util.file.fasta_length(fasta)
     log.debug("number of reads in fasta file %s" % number_of_reads)
-    if number_of_reads==0:
+    if number_of_reads == 0:
         return [mkstempfname('.hits.txt')]
 
     # divide (max, single-thread) chunksize by thread count
     # to find the absolute max chunk size per thread
     chunk_max_size_per_thread = chunkSize // threads
-    
+
     # find the chunk size if evenly divided among blast threads
     reads_per_thread = number_of_reads // threads
 
@@ -759,7 +823,7 @@
     # decrease the chunk size until the last chunk is 80%
     # this is bounded by the MIN_CHUNK_SIZE
     while (number_of_reads / chunkSize) % 1 < 0.8 and chunkSize > MIN_CHUNK_SIZE:
-        chunkSize = chunkSize-1
+        chunkSize = chunkSize - 1
 
     log.debug("blastn chunk size %s" % chunkSize)
     log.debug("number of chunks to create %s" % (number_of_reads / chunkSize))
@@ -771,7 +835,7 @@
     record_iter = SeqIO.parse(fastaFile, "fasta")
     for batch in util.misc.batch_iterator(record_iter, chunkSize):
         chunk_fasta = mkstempfname('.fasta')
-        
+
         with open(chunk_fasta, "wt") as handle:
             SeqIO.write(batch, handle, "fasta")
         batch = None
@@ -786,9 +850,12 @@ def blastn_chunked_fasta(fasta, db, chunkSize=1000000, threads=1):
         # divide extra cpus evenly among chunks where possible
         # rounding to 1 if there are more chunks than extra threads
         cpus_leftover = (threads - len(input_fastas))
-        blast_threads = max(1, int(cpus_leftover/len(input_fastas)) )
+        blast_threads = max(1, int(cpus_leftover / len(input_fastas)))
 
-        futs = [executor.submit(functools.partial(run_blastn, blastnPath, db, input_fasta, blast_threads)) for input_fasta in input_fastas]
+        futs = [
+            executor.submit(functools.partial(run_blastn, blastnPath, db, input_fasta, blast_threads))
+            for input_fasta in input_fastas
+        ]
         hits_files = [fut.result() for fut in concurrent.futures.as_completed(futs)]
 
     return hits_files
@@ -1014,11 +1081,11 @@ def parser_lastal_build_db(parser=argparse.ArgumentParser()):
 
 __commands__.append(('lastal_build_db', parser_lastal_build_db))
 
 
-
 # ========================
 # *** blastn_build_db ***
 # ========================
 
+
 def blastn_build_db(inputFasta, outputDirectory, outputFilePrefix):
     """ Create a database for use with blastn from an input reference FASTA file """
@@ -1047,11 +1114,11 @@ def parser_blastn_build_db(parser=argparse.ArgumentParser()):
 
 __commands__.append(('blastn_build_db', parser_blastn_build_db))
 
-
 # ========================
 # *** bmtagger_build_db ***
 # ========================
 
+
 def bmtagger_build_db(inputFasta, outputDirectory, outputFilePrefix):
     """ Create a database for use with Bmtagger from an input FASTA file.
     """
@@ -1062,12 +1129,20 @@ def bmtagger_build_db(inputFasta, outputDirectory, outputFilePrefix):
     fileNameSansExtension = os.path.splitext(baseName)[0]
     outPrefix = fileNameSansExtension
 
-    bmtooldb_path = tools.bmtagger.BmtoolTool().build_database(inputFasta, os.path.join(outputDirectory, outPrefix+".bitmask"))
-    srprismdb_path = tools.bmtagger.SrprismTool().build_database(inputFasta, os.path.join(outputDirectory, outPrefix+".srprism"))
+    bmtooldb_path = tools.bmtagger.BmtoolTool().build_database(
+        inputFasta, os.path.join(outputDirectory, outPrefix + ".bitmask")
+    )
+    srprismdb_path = tools.bmtagger.SrprismTool().build_database(
+        inputFasta, os.path.join(outputDirectory, outPrefix + ".srprism")
+    )
+
 
 def parser_bmtagger_build_db(parser=argparse.ArgumentParser()):
     parser.add_argument('inputFasta', help='Location of the input FASTA file')
-    parser.add_argument('outputDirectory', help='Location for the output files (Where *.bitmask and *.srprism files will be stored)')
+    parser.add_argument(
+        'outputDirectory',
+        help='Location for the output files (Where *.bitmask and *.srprism files will be stored)'
+    )
     parser.add_argument(
         '--outputFilePrefix',
         help='Prefix for the output file name (default: inputFasta name, sans ".fasta" extension)'
@@ -1079,8 +1154,6 @@ def parser_bmtagger_build_db(parser=argparse.ArgumentParser()):
 
 __commands__.append(('bmtagger_build_db', parser_bmtagger_build_db))
 
-
-
 # ==========================
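For reviewers: the five new knobs in `trimmomatic()` map one-to-one onto Trimmomatic's LEADING, TRAILING, SLIDINGWINDOW, and MINLEN steps, reachable either as keyword arguments or via the new `--leadingQCutoff`-style flags of the `trim_trimmomatic` subcommand (whose `dest` values match the parameter names). A minimal usage sketch with the argparse defaults; the file paths are hypothetical, not part of the diff:

```python
# Sketch only: paths are placeholders.
from taxon_filter import trimmomatic

trimmomatic(
    'reads.1.fastq', 'reads.2.fastq',      # input read pairs
    'trimmed.1.fastq', 'trimmed.2.fastq',  # paired survivors
    'adapters.fasta',                      # ILLUMINACLIP reference
    leading_q_cutoff=15,                   # LEADING:15
    trailing_q_cutoff=15,                  # TRAILING:15
    minlength_to_keep=30,                  # MINLEN:30
    sliding_window_size=4,                 # SLIDINGWINDOW:4:25
    sliding_window_q_cutoff=25,
)
```

Note this changes behavior as well as style: the old hardcoded `LEADING:20`/`TRAILING:20` become `LEADING:15`/`TRAILING:15` by default.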
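The chunk-sizing logic touched in `blastn_chunked_fasta` is easiest to sanity-check in isolation: the loop decrements `chunkSize` until the fractional part of `number_of_reads / chunkSize` reaches 0.8, i.e. until the trailing chunk is at least 80% full, bounded below by `MIN_CHUNK_SIZE`. A self-contained rerun of that loop with made-up numbers (`MIN_CHUNK_SIZE` is assumed here; the real constant lives elsewhere in the module):

```python
# Toy reproduction of the balancing loop from blastn_chunked_fasta.
MIN_CHUNK_SIZE = 10000    # assumed value for this sketch
number_of_reads = 100000  # records in the input fasta
chunkSize = 30000         # candidate size before balancing

# 100000 / 30000 = 3.33..., so the 4th chunk would be only a third full;
# shrink until the fractional part of reads/chunkSize reaches 0.8.
while (number_of_reads / chunkSize) % 1 < 0.8 and chunkSize > MIN_CHUNK_SIZE:
    chunkSize = chunkSize - 1

print(chunkSize)                    # 26315
print(number_of_reads / chunkSize)  # ~3.8, so the last chunk is ~80% full
```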