Skip to content

Commit

Permalink
Merge pull request #696 from broadinstitute/is-readd-gapfill-to-pipeline
Browse files Browse the repository at this point in the history
readd gapfill to pipeline
  • Loading branch information
dpark01 authored Oct 19, 2017
2 parents db53e0f + 4d1ef2d commit f2fcfe6
Show file tree
Hide file tree
Showing 13 changed files with 271 additions and 585 deletions.
148 changes: 92 additions & 56 deletions assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def __init__(self, reason):
super(DenovoAssemblyError, self).__init__(reason)


def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000, trim_opts=None):
''' Take reads through Trimmomatic, Prinseq, and subsampling.
This should probably move over to read_utils.
'''
Expand Down Expand Up @@ -87,7 +87,8 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
trimfq[1],
clipDb,
unpairedOutFastq1=trimfq_unpaired[0],
unpairedOutFastq2=trimfq_unpaired[1]
unpairedOutFastq2=trimfq_unpaired[1],
**(trim_opts or {})
)

n_trim = max(map(util.file.count_fastq_reads, trimfq)) # count is pairs
Expand Down Expand Up @@ -198,7 +199,7 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):

n_final_individual_reads = samtools.count(outBam)

log.info("Pre-Trinity read filters: ")
log.info("Pre-DeNovoAssembly read filters: ")
log.info(" {} read pairs at start ".format(n_input))
log.info(
" {} read pairs after Trimmomatic {}".format(
Expand All @@ -220,7 +221,7 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
else:
log.info(" Paired read count sufficient to reach threshold ({})".format(n_reads))
log.info(
" {} individual reads for trinity ({}{})".format(
" {} individual reads for de novo assembly ({}{})".format(
n_output, "paired subsampled {} -> {}".format(n_rmdup_paired, n_paired_subsamp)
if not did_include_subsampled_unpaired_reads else "{} read pairs".format(n_rmdup_paired),
" + unpaired subsampled {} -> {}".format(n_rmdup_unpaired, n_unpaired_subsamp)
Expand All @@ -232,7 +233,7 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
if did_include_subsampled_unpaired_reads:
if n_final_individual_reads < n_reads:
log.warning(
"NOTE: Even with unpaired reads included, there are fewer unique trimmed reads than requested for Trinity input."
"NOTE: Even with unpaired reads included, there are fewer unique trimmed reads than requested for de novo assembly input."
)

# clean up temp files
Expand Down Expand Up @@ -340,72 +341,70 @@ def parser_assemble_trinity(parser=argparse.ArgumentParser()):

__commands__.append(('assemble_trinity', parser_assemble_trinity))

def gapfill_gap2seq(in_scaffold, in_bam, out_scaffold, gap2seq_opts='', threads=1, mem_limit_gb=4):
''' This step runs the Gap2Seq tool to close gaps between contigs in a scaffold.
'''
tools.gap2seq.Gap2SeqTool().gapfill(in_scaffold, in_bam, out_scaffold, gap2seq_opts=gap2seq_opts, threads=threads,
mem_limit_gb=mem_limit_gb)

def parser_gapfill_gap2seq(parser=argparse.ArgumentParser(description='Close gaps between contigs in a scaffold')):
parser.add_argument('inScaffold', help='FASTA file containing the scaffold. Each FASTA record corresponds to one '
'segment (for multi-segment genomes). Contigs within each segment are separated by Ns.')
parser.add_argument('inBam', help='Input unaligned reads, BAM format.')
parser.add_argument('outScaffold', help='Output assembly.')
parser.add_argument('--threads', default=0, type=int, help='Number of threads (default: %(default)s); 0 means use all available cores')
parser.add_argument('--memLimitGb', dest='mem_limit_gb', default=4.0, help='Max memory to use, in gigabytes %(default)s')
parser.add_argument('--timeSoftLimitMinutes', dest='time_soft_limit_minutes', default=60.0,
help='Stop trying to close more gaps after this many minutes (default: %(default)s); this is a soft/advisory limit')
parser.add_argument('--gap2seqOpts', dest='gap2seq_opts', default='', help='(advanced) Extra command-line options to pass to Gap2Seq')

util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, gapfill_gap2seq, split_args=True)
return parser


__commands__.append(('gapfill_gap2seq', parser_gapfill_gap2seq))


def assemble_spades(
inBam,
outFasta,
in_bam,
clip_db,
out_fasta,
spades_opts='',
previously_assembled_contigs=None,
mem_limit_gb=4,
contigs_trusted=None, contigs_untrusted=None,
filter_contigs=False,
kmer_sizes=(55,65),
n_reads=10000000,
mask_errors=False,
max_kmer_sizes=1,
mem_limit_gb=8,
threads=0,
):
''' De novo RNA-seq assembly with the SPAdes assembler.
'''De novo RNA-seq assembly with the SPAdes assembler.
Inputs:
inBam - reads to assemble. May include both paired and unpaired reads.
previously_assembled_contigs - (optional) already-assembled contigs from the same sample.
inBam: reads to assemble. May include both paired and unpaired reads.
clip_db: Trimmomatic clip DB
trusted_contigs, untrusted_contigs: (optional) already-assembled contigs from the same sample. trusted contigs must be
high-quality, untrusted_contigs may have some errors.
Outputs:
outFasta - the assembled contigs. Note that, since this is RNA-seq assembly, for each assembled genomic region there may be
outFasta: the assembled contigs. Note that, since this is RNA-seq assembly, for each assembled genomic region there may be
several contigs representing different variants of that region.
Params:
mem_limit_gb - max memory to use
threads - number of threads to use (0 means use all available CPUs)
spades_opts - (advanced) custom command-line options to pass to the SPAdes assembler
n_reads: before assembly, subsample the reads to at most this many
filter_contigs: drop lesser-quality contigs from output
mem_limit_gb: max memory to use
threads: number of threads to use (0 means use all available CPUs)
spades_opts: (advanced) custom command-line options to pass to the SPAdes assembler
'''

with tools.picard.SamToFastqTool().execute_tmp(inBam, includeUnpaired=True,
JVMmemory=str(mem_limit_gb)+'g') as (reads_fwd, reads_bwd, reads_unpaired):
try:
tools.spades.SpadesTool().assemble(reads_fwd=reads_fwd, reads_bwd=reads_bwd, reads_unpaired=reads_unpaired,
contigs_untrusted=previously_assembled_contigs,
contigs_out=outFasta, spades_opts=spades_opts, mem_limit_gb=mem_limit_gb,
threads=threads)
except subprocess.CalledProcessError as e:
raise DenovoAssemblyError('SPAdes assembler failed: ' + str(e))
with util.file.tempfname(suffix='.bam', prefix='trim_rmdup_for_spades') as trim_rmdup_bam:
trim_rmdup_subsamp_reads(in_bam, clip_db, trim_rmdup_bam, n_reads=n_reads,
trim_opts=dict(maxinfo_target_length=35, maxinfo_strictness=.2))

with tools.picard.SamToFastqTool().execute_tmp(trim_rmdup_bam, includeUnpaired=True, illuminaClipping=True,
JVMmemory=str(mem_limit_gb)+'g') as (reads_fwd, reads_bwd, reads_unpaired):
try:
tools.spades.SpadesTool().assemble(reads_fwd=reads_fwd, reads_bwd=reads_bwd, reads_unpaired=reads_unpaired,
contigs_untrusted=contigs_untrusted, contigs_trusted=contigs_trusted,
contigs_out=out_fasta, filter_contigs=filter_contigs,
kmer_sizes=kmer_sizes, mask_errors=mask_errors, max_kmer_sizes=max_kmer_sizes,
spades_opts=spades_opts, mem_limit_gb=mem_limit_gb,
threads=threads)
except subprocess.CalledProcessError as e:
raise DenovoAssemblyError('SPAdes assembler failed: ' + str(e))

def parser_assemble_spades(parser=argparse.ArgumentParser()):
parser.add_argument('inBam', help='Input unaligned reads, BAM format.')
parser.add_argument('outFasta', help='Output assembly.')
parser.add_argument('--previously_assembled_contigs', help='Contigs previously assembled from the same sample')
parser.add_argument('--spades_opts', default='', help='(advanced) Extra flags to pass to the SPAdes assembler')
parser.add_argument('--mem_limit_gb', default=4, type=int, help='Max memory to use, in GB (default: %(default)s)')
parser.add_argument('in_bam', help='Input unaligned reads, BAM format.')
parser.add_argument('clip_db', help='Trimmomatic clip db')
parser.add_argument('out_fasta', help='Output assembly.')
parser.add_argument('--contigsTrusted', dest='contigs_trusted',
help='Contigs of high quality previously assembled from the same sample')
parser.add_argument('--contigsUntrusted', dest='contigs_untrusted',
help='Contigs of high medium quality previously assembled from the same sample')
parser.add_argument('--nReads', dest='n_reads', type=int, default=10000000,
help='Before assembly, subsample the reads to at most this many')
parser.add_argument('--filterContigs', dest='filter_contigs', default=False, action='store_true',
help='only output contigs SPAdes is sure of')
parser.add_argument('--spadesOpts', dest='spades_opts', default='', help='(advanced) Extra flags to pass to the SPAdes assembler')
parser.add_argument('--memLimitGb', dest='mem_limit_gb', default=4, type=int, help='Max memory to use, in GB (default: %(default)s)')
parser.add_argument('--threads', default=0, type=int, help='Number of threads, or 0 to use all CPUs (default: %(default)s)')
util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, assemble_spades, split_args=True)
Expand All @@ -415,6 +414,43 @@ def parser_assemble_spades(parser=argparse.ArgumentParser()):
__commands__.append(('assemble_spades', parser_assemble_spades))


def gapfill_gap2seq(in_scaffold, in_bam, out_scaffold, threads=1, mem_limit_gb=4, time_soft_limit_minutes=60.0,
random_seed=0, gap2seq_opts='', mask_errors=False):
''' This step runs the Gap2Seq tool to close gaps between contigs in a scaffold.
'''
try:
tools.gap2seq.Gap2SeqTool().gapfill(in_scaffold, in_bam, out_scaffold, gap2seq_opts=gap2seq_opts, threads=threads,
mem_limit_gb=mem_limit_gb, time_soft_limit_minutes=time_soft_limit_minutes,
random_seed=random_seed)
except Exception as e:
if not mask_errors:
raise
log.warning('Gap-filling failed (%s); ignoring error, emulating successful run where simply no gaps were filled.')
shutil.copyfile(in_scaffold, out_scaffold)

def parser_gapfill_gap2seq(parser=argparse.ArgumentParser(description='Close gaps between contigs in a scaffold')):
parser.add_argument('in_scaffold', help='FASTA file containing the scaffold. Each FASTA record corresponds to one '
'segment (for multi-segment genomes). Contigs within each segment are separated by Ns.')
parser.add_argument('in_bam', help='Input unaligned reads, BAM format.')
parser.add_argument('out_scaffold', help='Output assembly.')
parser.add_argument('--threads', default=0, type=int, help='Number of threads (default: %(default)s); 0 means use all available cores')
parser.add_argument('--memLimitGb', dest='mem_limit_gb', default=4.0, help='Max memory to use, in gigabytes %(default)s')
parser.add_argument('--timeSoftLimitMinutes', dest='time_soft_limit_minutes', default=60.0,
help='Stop trying to close more gaps after this many minutes (default: %(default)s); this is a soft/advisory limit')
parser.add_argument('--maskErrors', dest='mask_errors', default=False, action='store_true',
help='In case of any error, just copy in_scaffold to out_scaffold, emulating a successful run that simply could not '
'fill any gaps.')
parser.add_argument('--gap2seqOpts', dest='gap2seq_opts', default='', help='(advanced) Extra command-line options to pass to Gap2Seq')
parser.add_argument('--randomSeed', dest='random_seed', default=0, help='Random seed; 0 means use current time')

util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, gapfill_gap2seq, split_args=True)
return parser


__commands__.append(('gapfill_gap2seq', parser_gapfill_gap2seq))


def order_and_orient(inFasta, inReference, outFasta,
outAlternateContigs=None,
breaklen=None, # aligner='nucmer', circular=False, trimmed_contigs=None,
Expand Down
11 changes: 11 additions & 0 deletions pipes/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,10 @@ samples_per_run: "samples-runs.txt"
# to insufficient input data, consider expanding the list of accessions given to lastal.
accessions_file_for_lastal_db_build: ""

# De novo assembler to use. Current options are "trinity", "spades" and "trinity-spades"
# (the "trinity-spades" option gives contigs produced by Trinity as auxiliary input to SPAdes).
assembly_assembler: "trinity"

# The minimum length an assembled sequence must have
# to be considered acceptible quality, specified as
# a fraction of the length of the reference sequene.
Expand Down Expand Up @@ -192,6 +196,10 @@ mafft_maxiters: 1000
# See: https://github.com/trinityrnaseq/trinityrnaseq/wiki/Trinity-FAQ#ques_why_so_many_transcripts
trinity_n_reads: 250000

# The number of reads to be subsampled from input bam files
# and used as input to assembly via SPAdes
spades_n_reads: 10000000

# Minimum number of reads on each strand
vphaser_min_reads_each: 5

Expand All @@ -205,6 +213,9 @@ vphaser_max_bins: 10
# allele freq > 0.005. If false, no filtering is performed.
vcf_merge_naive_filter: false

# Random seed, for non-deterministic steps. 0 means use current time.
random_seed: 0

# |----------------------- Data storage locations ---------------------------

# The parent directory containing data sub-directories.
Expand Down
Loading

0 comments on commit f2fcfe6

Please sign in to comment.