Merge pull request #358 from broadinstitute/ct-trimmomatic-cutoff-values
relax and expose trimmomatic cutoff values
tomkinsc authored Jun 20, 2016
2 parents 9f7bebd + 426757d commit 426ced8
Showing 1 changed file with 127 additions and 54 deletions.
181 changes: 127 additions & 54 deletions taxon_filter.py
@@ -47,8 +47,7 @@ def parser_deplete_human(parser=argparse.ArgumentParser()):
     parser.add_argument('bmtaggerBam', help='Output BAM: depleted of human reads with BMTagger.')
     parser.add_argument('rmdupBam', help='Output BAM: bmtaggerBam run through M-Vicuna duplicate removal.')
     parser.add_argument(
-        'blastnBam',
-        help='Output BAM: rmdupBam run through another depletion of human reads with BLASTN.'
+        'blastnBam', help='Output BAM: rmdupBam run through another depletion of human reads with BLASTN.'
     )
     parser.add_argument(
         '--taxfiltBam',
@@ -89,9 +88,7 @@ def main_deplete_human(args):
     ''' Run the entire depletion pipeline: bmtagger, mvicuna, blastn.
        Optionally, use lastal to select a specific taxon of interest.'''
     tools.picard.RevertSamTool().execute(
-        args.inBam,
-        args.revertBam,
-        picardOptions=['SORT_ORDER=queryname', 'SANITIZE=true']
+        args.inBam, args.revertBam, picardOptions=['SORT_ORDER=queryname', 'SANITIZE=true']
     )
     multi_db_deplete_bam(
         args.revertBam,
@@ -102,7 +99,14 @@ def main_deplete_human(args):
         JVMmemory=args.JVMmemory
     )
     read_utils.rmdup_mvicuna_bam(args.bmtaggerBam, args.rmdupBam, JVMmemory=args.JVMmemory)
-    multi_db_deplete_bam(args.rmdupBam, args.blastDbs, deplete_blastn_bam, args.blastnBam, threads=args.threads, JVMmemory=args.JVMmemory)
+    multi_db_deplete_bam(
+        args.rmdupBam,
+        args.blastDbs,
+        deplete_blastn_bam,
+        args.blastnBam,
+        threads=args.threads,
+        JVMmemory=args.JVMmemory
+    )
     if args.taxfiltBam and args.lastDb:
         filter_lastal_bam(args.blastnBam, args.lastDb, args.taxfiltBam, JVMmemory=args.JVMmemory)
     return 0
@@ -115,7 +119,18 @@ def main_deplete_human(args):
 # ==========================
 
 
-def trimmomatic(inFastq1, inFastq2, pairedOutFastq1, pairedOutFastq2, clipFasta):
+def trimmomatic(
+    inFastq1,
+    inFastq2,
+    pairedOutFastq1,
+    pairedOutFastq2,
+    clipFasta,
+    leading_q_cutoff=15,
+    trailing_q_cutoff=15,
+    minlength_to_keep=30,
+    sliding_window_size=4,
+    sliding_window_q_cutoff=25
+):
     '''Trim read sequences with Trimmomatic.'''
     trimmomaticPath = tools.trimmomatic.TrimmomaticTool().install_and_get_path()
     tmpUnpaired1 = mkstempfname()
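The cutoffs that were hard-coded in the old one-line signature are now keyword arguments with relaxed defaults. A minimal sketch of a direct call under the new signature (file names are placeholders, not part of this commit):

    # Hypothetical call to the updated trimmomatic(); paths are illustrative only.
    trimmomatic(
        'reads.1.fastq', 'reads.2.fastq',
        'trimmed.1.fastq', 'trimmed.2.fastq',
        'adapters.fasta',
        leading_q_cutoff=20,   # reproduces the old hard-coded LEADING:20
        trailing_q_cutoff=20,  # reproduces the old hard-coded TRAILING:20
    )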
@@ -137,8 +152,15 @@ def trimmomatic(inFastq1, inFastq2, pairedOutFastq1, pairedOutFastq2, clipFasta)
 
     javaCmd.extend(
         [
-            inFastq1, inFastq2, pairedOutFastq1, tmpUnpaired1, pairedOutFastq2, tmpUnpaired2, 'LEADING:20',
-            'TRAILING:20', 'SLIDINGWINDOW:4:25', 'MINLEN:30', 'ILLUMINACLIP:{}:2:30:12'.format(clipFasta)
+            inFastq1, inFastq2, pairedOutFastq1, tmpUnpaired1, pairedOutFastq2, tmpUnpaired2,
+            'LEADING:{leading_q_cutoff}'.format(leading_q_cutoff=leading_q_cutoff),
+            'TRAILING:{trailing_q_cutoff}'.format(trailing_q_cutoff=trailing_q_cutoff),
+            'SLIDINGWINDOW:{sliding_window_size}:{sliding_window_q_cutoff}'.format(
+                sliding_window_size=sliding_window_size,
+                sliding_window_q_cutoff=sliding_window_q_cutoff,
+            ),
+            'MINLEN:{minlength_to_keep}'.format(minlength_to_keep=minlength_to_keep),
+            'ILLUMINACLIP:{clipFasta}:2:30:12'.format(clipFasta=clipFasta)
         ]
     )
 
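With the default keyword values above, the format() calls render the same step syntax Trimmomatic expects on its command line; a quick sketch of the rendered arguments (the clip file path is a placeholder):

    # Rendered Trimmomatic steps under this commit's defaults:
    #   'LEADING:15', 'TRAILING:15', 'SLIDINGWINDOW:4:25', 'MINLEN:30',
    #   'ILLUMINACLIP:adapters.fasta:2:30:12'
    'SLIDINGWINDOW:{}:{}'.format(4, 25)  # -> 'SLIDINGWINDOW:4:25'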
@@ -153,6 +175,41 @@ def parser_trim_trimmomatic(parser=argparse.ArgumentParser()):
     parser.add_argument("inFastq2", help="Input reads 2")
     parser.add_argument("pairedOutFastq1", help="Paired output 1")
     parser.add_argument("pairedOutFastq2", help="Paired output 2")
+    parser.add_argument(
+        '--leadingQCutoff',
+        dest="leading_q_cutoff",
+        help='minimum quality required to keep a base on the leading end (default: %(default)s)',
+        type=int,
+        default=15
+    )
+    parser.add_argument(
+        '--trailingQCutoff',
+        dest="trailing_q_cutoff",
+        help='minimum quality required to keep a base on the trailing end (default: %(default)s)',
+        type=int,
+        default=15
+    )
+    parser.add_argument(
+        '--minlengthToKeep',
+        dest="minlength_to_keep",
+        help='minimum length of reads to be kept (default: %(default)s)',
+        type=int,
+        default=30
+    )
+    parser.add_argument(
+        '--slidingWindowSize',
+        dest="sliding_window_size",
+        help='the number of bases to average across (default: %(default)s)',
+        type=int,
+        default=4
+    )
+    parser.add_argument(
+        '--slidingWindowQCutoff',
+        dest="sliding_window_q_cutoff",
+        help='specifies the average quality required in the sliding window (default: %(default)s)',
+        type=int,
+        default=25
+    )
     parser.add_argument("clipFasta", help="Fasta file with adapters, PCR sequences, etc. to clip off")
     util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
     util.cmd.attach_main(parser, trimmomatic, split_args=True)
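Because each flag sets dest= to the matching keyword argument and attach_main() is called with split_args=True, the parsed options flow straight into trimmomatic(). A sketch of exercising the new flags through this parser, assuming the function returns the parser and defines an inFastq1 positional just above the lines shown (values and file names illustrative):

    # Hypothetical parser round-trip; file names are placeholders.
    parser = parser_trim_trimmomatic()
    args = parser.parse_args([
        'reads.1.fastq', 'reads.2.fastq', 'trimmed.1.fastq', 'trimmed.2.fastq', 'adapters.fasta',
        '--leadingQCutoff', '20', '--slidingWindowQCutoff', '20'
    ])
    assert args.leading_q_cutoff == 20 and args.sliding_window_q_cutoff == 20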
@@ -165,16 +222,17 @@ def parser_trim_trimmomatic(parser=argparse.ArgumentParser()):
 # *** filter_lastal ***
 # =======================
 
 
 def lastal_chunked_fastq(
     inFastq,
     db,
     outFastq,
     max_gapless_alignments_per_position=1,
     min_length_for_initial_matches=5,
     max_length_for_initial_matches=50,
     max_initial_matches_per_position=100,
     chunk_size=100000
 ):
 
     lastal_path = tools.last.Lastal().install_and_get_path()
     mafsort_path = tools.last.MafSort().install_and_get_path()
@@ -249,12 +307,12 @@ def lastal_get_hits(
 ):
     filteredFastq = mkstempfname('.filtered.fastq')
     lastal_chunked_fastq(
         inFastq,
         db,
         filteredFastq,
         max_gapless_alignments_per_position=max_gapless_alignments_per_position,
         min_length_for_initial_matches=min_length_for_initial_matches,
         max_length_for_initial_matches=max_length_for_initial_matches,
         max_initial_matches_per_position=max_initial_matches_per_position
     )
 
@@ -383,12 +441,12 @@ def filter_lastal(
 
     filtered_fastq = mkstempfname('.filtered.fastq')
     lastal_chunked_fastq(
         inFastq,
         refDb,
         filtered_fastq,
         max_gapless_alignments_per_position=max_gapless_alignments_per_position,
         min_length_for_initial_matches=min_length_for_initial_matches,
         max_length_for_initial_matches=max_length_for_initial_matches,
         max_initial_matches_per_position=max_initial_matches_per_position
     )
 
@@ -456,15 +514,12 @@ def deplete_bmtagger_bam(inBam, db, outBam, threads=None, JVMmemory=None):
     bmtaggerConf = mkstempfname('.bmtagger.conf')
     with open(bmtaggerConf, 'w') as f:
         # Default srprismopts: "-b 100000000 -n 5 -R 0 -r 1 -M 7168"
-        print(
-            'srprismopts="-b 100000000 -n 5 -R 0 -r 1 -M 7168 --paired false"',
-            file=f)
+        print('srprismopts="-b 100000000 -n 5 -R 0 -r 1 -M 7168 --paired false"', file=f)
     tempDir = tempfile.mkdtemp()
     matchesFile = mkstempfname('.txt')
     cmdline = [
-        bmtaggerPath, '-b', db + '.bitmask', '-C', bmtaggerConf,
-        '-x', db + '.srprism', '-T', tempDir, '-q1', '-1', inReads1,
-        '-2', inReads2, '-o', matchesFile
+        bmtaggerPath, '-b', db + '.bitmask', '-C', bmtaggerConf, '-x', db + '.srprism', '-T', tempDir, '-q1', '-1',
+        inReads1, '-2', inReads2, '-o', matchesFile
     ]
     log.debug(' '.join(cmdline))
     util.misc.run_and_print(cmdline, check=True)
@@ -679,7 +734,14 @@ def parser_deplete_bam_bmtagger(parser=argparse.ArgumentParser()):
 
 def main_deplete_bam_bmtagger(args):
     '''Use bmtagger to deplete input reads against several databases.'''
-    multi_db_deplete_bam(args.inBam, args.refDbs, deplete_bmtagger_bam, args.outBam, threads=args.threads, JVMmemory=args.JVMmemory)
+    multi_db_deplete_bam(
+        args.inBam,
+        args.refDbs,
+        deplete_bmtagger_bam,
+        args.outBam,
+        threads=args.threads,
+        JVMmemory=args.JVMmemory
+    )
 
 
 __commands__.append(('deplete_bam_bmtagger', parser_deplete_bam_bmtagger))
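The body of multi_db_deplete_bam is collapsed below, but its call sites imply one depletion pass per database, each pass feeding the next. A hedged sketch of that shape, which is assumed rather than the collapsed body (mkstempfname as used elsewhere in this file):

    # Assumed shape of multi-database depletion (illustrative only, not the real body):
    def multi_db_deplete_sketch(inBam, refDbs, deplete_method, outBam, **kwargs):
        tmpBam = inBam
        for i, db in enumerate(refDbs):
            # last database writes the final output; earlier passes go to temp BAMs
            outFile = outBam if i == len(refDbs) - 1 else mkstempfname('.bam')
            deplete_method(tmpBam, db, outFile, **kwargs)
            tmpBam = outFile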
@@ -701,18 +763,20 @@ def multi_db_deplete_bam(inBam, refDbs, deplete_method, outBam, threads=1, JVMmemory=None):
 # *** deplete_blastn ***
 # ========================
 
 
 def run_blastn(blastn_path, db, input_fasta, blast_threads=1):
     """ run blastn on the input fasta file. this is intended to be run in parallel """
     chunk_hits = mkstempfname('.hits.txt')
     blastnCmd = [
-        blastn_path, '-db', db, '-word_size', '16', '-num_threads', str(blast_threads), '-evalue', '1e-6', '-outfmt', '6', '-max_target_seqs', '2',
-        '-query', input_fasta, '-out', chunk_hits
+        blastn_path, '-db', db, '-word_size', '16', '-num_threads', str(blast_threads), '-evalue', '1e-6', '-outfmt',
+        '6', '-max_target_seqs', '2', '-query', input_fasta, '-out', chunk_hits
     ]
     log.debug(' '.join(blastnCmd))
     util.misc.run_and_print(blastnCmd, check=True)
 
     os.unlink(input_fasta)
     return chunk_hits
 
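run_blastn writes tabular hits via -outfmt 6, whose rows begin with the query sequence ID. A small sketch (not part of this commit) of reading those IDs back from a hits file:

    # Sketch: collect query IDs from a BLAST -outfmt 6 (tab-separated) hits file.
    def read_hit_ids(hits_path):
        with open(hits_path) as f:
            return set(line.split('\t')[0] for line in f if line.strip())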
 def blastn_chunked_fasta(fasta, db, chunkSize=1000000, threads=1):
     """
@@ -730,18 +794,18 @@ def blastn_chunked_fasta(fasta, db, chunkSize=1000000, threads=1):
     blastnPath = tools.blast.BlastnTool().install_and_get_path()
 
     # clamp threadcount to number of CPUs minus one
-    threads = min(util.misc.available_cpu_count()-1, threads)
+    threads = min(util.misc.available_cpu_count() - 1, threads)
 
     # determine size of input data; records in fasta file
     number_of_reads = util.file.fasta_length(fasta)
     log.debug("number of reads in fasta file %s" % number_of_reads)
-    if number_of_reads==0:
+    if number_of_reads == 0:
         return [mkstempfname('.hits.txt')]
 
     # divide (max, single-thread) chunksize by thread count
     # to find the absolute max chunk size per thread
     chunk_max_size_per_thread = chunkSize // threads
 
     # find the chunk size if evenly divided among blast threads
     reads_per_thread = number_of_reads // threads
 
@@ … @@ def blastn_chunked_fasta(fasta, db, chunkSize=1000000, threads=1):
     # decrease the chunk size until the last chunk is 80%
     # this is bounded by the MIN_CHUNK_SIZE
     while (number_of_reads / chunkSize) % 1 < 0.8 and chunkSize > MIN_CHUNK_SIZE:
-        chunkSize = chunkSize-1
+        chunkSize = chunkSize - 1
 
     log.debug("blastn chunk size %s" % chunkSize)
     log.debug("number of chunks to create %s" % (number_of_reads / chunkSize))
@@ … @@ def blastn_chunked_fasta(fasta, db, chunkSize=1000000, threads=1):
         record_iter = SeqIO.parse(fastaFile, "fasta")
         for batch in util.misc.batch_iterator(record_iter, chunkSize):
             chunk_fasta = mkstempfname('.fasta')
 
             with open(chunk_fasta, "wt") as handle:
                 SeqIO.write(batch, handle, "fasta")
             batch = None
@@ … @@ def blastn_chunked_fasta(fasta, db, chunkSize=1000000, threads=1):
         # divide extra cpus evenly among chunks where possible
         # rounding to 1 if there are more chunks than extra threads
         cpus_leftover = (threads - len(input_fastas))
-        blast_threads = max(1, int(cpus_leftover/len(input_fastas)) )
+        blast_threads = max(1, int(cpus_leftover / len(input_fastas)))
 
-        futs = [executor.submit(functools.partial(run_blastn, blastnPath, db, input_fasta, blast_threads)) for input_fasta in input_fastas]
+        futs = [
+            executor.submit(functools.partial(run_blastn, blastnPath, db, input_fasta, blast_threads))
+            for input_fasta in input_fastas
+        ]
         hits_files = [fut.result() for fut in concurrent.futures.as_completed(futs)]
 
     return hits_files
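The chunk-size loop above can be checked in isolation; a worked sketch with made-up numbers (MIN_CHUNK_SIZE is defined elsewhere in this file, so the value here is illustrative only):

    # Illustration of the "last chunk at least 80% full" loop.
    MIN_CHUNK_SIZE = 1000      # placeholder value
    number_of_reads = 2500
    chunkSize = 2000           # 2500/2000 = 1.25 -> last chunk only 25% full
    while (number_of_reads / chunkSize) % 1 < 0.8 and chunkSize > MIN_CHUNK_SIZE:
        chunkSize = chunkSize - 1
    print(chunkSize)           # 1388, since 2500/1388 ~= 1.801 (fractional part >= 0.8)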
@@ -1014,11 +1081,11 @@ def parser_lastal_build_db(parser=argparse.ArgumentParser()):
 
 __commands__.append(('lastal_build_db', parser_lastal_build_db))
 
 
 # ========================
 # *** blastn_build_db ***
 # ========================
 
 
 def blastn_build_db(inputFasta, outputDirectory, outputFilePrefix):
     """ Create a database for use with blastn from an input reference FASTA file
     """
@@ -1047,11 +1114,11 @@ def parser_blastn_build_db(parser=argparse.ArgumentParser()):
 
 __commands__.append(('blastn_build_db', parser_blastn_build_db))
 
 
 # ========================
 # *** bmtagger_build_db ***
 # ========================
 
 
 def bmtagger_build_db(inputFasta, outputDirectory, outputFilePrefix):
     """ Create a database for use with Bmtagger from an input FASTA file.
     """
@@ -1062,12 +1129,20 @@ def bmtagger_build_db(inputFasta, outputDirectory, outputFilePrefix):
     fileNameSansExtension = os.path.splitext(baseName)[0]
     outPrefix = fileNameSansExtension
 
-    bmtooldb_path = tools.bmtagger.BmtoolTool().build_database(inputFasta, os.path.join(outputDirectory, outPrefix+".bitmask"))
-    srprismdb_path = tools.bmtagger.SrprismTool().build_database(inputFasta, os.path.join(outputDirectory, outPrefix+".srprism"))
+    bmtooldb_path = tools.bmtagger.BmtoolTool().build_database(
+        inputFasta, os.path.join(outputDirectory, outPrefix + ".bitmask")
+    )
+    srprismdb_path = tools.bmtagger.SrprismTool().build_database(
+        inputFasta, os.path.join(outputDirectory, outPrefix + ".srprism")
+    )
 
 
 def parser_bmtagger_build_db(parser=argparse.ArgumentParser()):
     parser.add_argument('inputFasta', help='Location of the input FASTA file')
-    parser.add_argument('outputDirectory', help='Location for the output files (Where *.bitmask and *.srprism files will be stored)')
+    parser.add_argument(
+        'outputDirectory',
+        help='Location for the output files (Where *.bitmask and *.srprism files will be stored)'
+    )
     parser.add_argument(
         '--outputFilePrefix',
         help='Prefix for the output file name (default: inputFasta name, sans ".fasta" extension)'
@@ -1079,8 +1154,6 @@ def parser_bmtagger_build_db(parser=argparse.ArgumentParser()):
 
 __commands__.append(('bmtagger_build_db', parser_bmtagger_build_db))
 
-
-
 # ========================
 
 
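bmtagger_build_db writes the two index files BMTagger needs side by side in outputDirectory: a .bitmask via bmtool and a .srprism via srprism. A hedged sketch of a direct call, with paths and prefix as placeholders and prefix handling as suggested by the --outputFilePrefix help text:

    # Hypothetical call; should yield <outputDirectory>/<prefix>.bitmask and .srprism.
    bmtagger_build_db('human_ref.fasta', '/data/dbs', 'human')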
