
Merge pull request #80 from broadinstitute/dpark-dev
new reports and slight pipeline tweaks
dpark01 committed Jan 27, 2015
2 parents 5b59122 + 353d392 commit eaec753
Showing 11 changed files with 252 additions and 61 deletions.
36 changes: 30 additions & 6 deletions assembly.py
@@ -44,6 +44,11 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
    read_utils.purge_unmated(rmdupfq[0], rmdupfq[1], purgefq[0], purgefq[1])
    os.unlink(rmdupfq[0])
    os.unlink(rmdupfq[1])

    # Log count
    with open(purgefq[0], 'rt') as inf:
        n = int(sum(1 for line in inf)/4)
    log.info("PRE-SUBSAMPLE COUNT: {} read pairs".format(n))

    # Subsample
    subsampfq = list(map(util.file.mkstempfname, ['.subsamp.1.fastq', '.subsamp.2.fastq']))
@@ -58,19 +63,38 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
    os.unlink(purgefq[1])

    # Fastq -> BAM
    # Note: this destroys RG IDs! We should instead just pull the list
    # of IDs out of purge_unmated, random.sample them ourselves, and
    # use FilterSamReadsTool to go straight from inBam -> outBam
    # Note: this destroys RG IDs! We should instead run the BAM->fastq step in a way
    # that breaks out the read groups and perform the above steps in a way that preserves
    # the RG IDs.
    tmp_bam = util.file.mkstempfname('.subsamp.bam')
    tmp_header = util.file.mkstempfname('.header.sam')
    tools.picard.FastqToSamTool().execute(
        subsampfq[0], subsampfq[1], 'Dummy', tmp_bam)
    tools.samtools.SamtoolsTool().dumpHeader(inBam, tmp_header)
    tools.samtools.SamtoolsTool().reheader(tmp_bam, tmp_header, outBam)
    if n == 0:
        # FastqToSam cannot deal with empty input
        # but Picard SamFormatConverter can deal with empty files
        opts = ['INPUT='+tmp_header, 'OUTPUT='+outBam, 'VERBOSITY=ERROR']
        tools.picard.PicardTools().execute('SamFormatConverter', opts, JVMmemory='50m')
    else:
        tools.picard.FastqToSamTool().execute(
            subsampfq[0], subsampfq[1], 'Dummy', tmp_bam)
        tools.samtools.SamtoolsTool().reheader(tmp_bam, tmp_header, outBam)
    os.unlink(tmp_bam)
    os.unlink(tmp_header)
    os.unlink(subsampfq[0])
    os.unlink(subsampfq[1])
def parser_trim_rmdup_subsamp(parser=argparse.ArgumentParser()):
    parser.add_argument('inBam',
        help='Input reads, unaligned BAM format.')
    parser.add_argument('clipDb',
        help='Trimmomatic clip DB.')
    parser.add_argument('outBam',
        help='Output reads, unaligned BAM format (currently, read groups and other header information are destroyed in this process).')
    parser.add_argument('--n_reads', default=100000, type=int,
        help='Subsample reads to no more than this many pairs. (default %(default)s)')
    util.cmd.common_args(parser, (('loglevel',None), ('version',None), ('tmpDir',None)))
    util.cmd.attach_main(parser, trim_rmdup_subsamp_reads, split_args=True)
    return parser
__commands__.append(('trim_rmdup_subsamp', parser_trim_rmdup_subsamp))


def assemble_trinity(inBam, outFasta, clipDb, n_reads=100000, outReads=None):
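A note on the new logic above: a FASTQ record is exactly four lines (name, sequence, '+', qualities), so the pre-subsample count is simply the line count divided by four, and when that count is zero the code skips Picard FastqToSam (which, per the in-code comment, cannot handle empty input) and instead runs SamFormatConverter on the dumped header alone, producing a valid read-less BAM. The step is also exposed on the command line as "assembly.py trim_rmdup_subsamp inBam clipDb outBam --n_reads N" via the parser_trim_rmdup_subsamp added above. A self-contained sketch of the counting idea (the count_read_pairs helper is illustrative, not part of the repo):

def count_read_pairs(fastq_path):
    """Count records in an uncompressed FASTQ file; each record is exactly 4 lines."""
    with open(fastq_path, 'rt') as inf:
        n_lines = sum(1 for _ in inf)
    if n_lines % 4:
        raise ValueError("FASTQ looks truncated: {} lines in {}".format(n_lines, fastq_path))
    return n_lines // 4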
10 changes: 6 additions & 4 deletions intrahost.py
@@ -76,12 +76,14 @@ def vphaser_to_vcf(inFile, refFasta, multiAlignment, outVcf):
    '''

    # read in multiple alignments of consensus sequences
    aln = Bio.AlignIO.read(open(multiAlignment, 'rt'), 'fasta')
    with open(multiAlignment, 'rt') as inf:
        aln = Bio.AlignIO.read(inf, 'fasta')

    # open reference genome and set ref as a BioPython SeqRecord
    ref = list(Bio.SeqIO.parse(open(refFasta, 'rt'), 'fasta'))
    assert len(ref)==1
    ref = ref[0]
    with open(refFasta, 'rt') as inf:
        ref = list(Bio.SeqIO.parse(inf, 'fasta'))
        assert len(ref)==1
        ref = ref[0]

    # prepare sample list
    samples = list(util.misc.unique(row['patient'] for row in util.file.read_tabfile_dict(inFile)))
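The intrahost.py change is purely about file-handle hygiene: the bare open() calls passed to Bio.AlignIO.read and Bio.SeqIO.parse were never closed, so both reads are now wrapped in with-blocks. A minimal sketch of the same pattern (the helper name is ours; assumes Biopython is installed):

from Bio import SeqIO

def load_single_record(fasta_path):
    # Read a FASTA expected to hold exactly one sequence, closing the handle promptly.
    with open(fasta_path, 'rt') as inf:
        records = list(SeqIO.parse(inf, 'fasta'))
    if len(records) != 1:
        raise ValueError("expected exactly 1 sequence, found {}".format(len(records)))
    return records[0]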
71 changes: 39 additions & 32 deletions pipes/rules/assembly.rules
@@ -2,11 +2,6 @@
This is a basic framework for assembly of viral genomes, currently
tailored for EBOV. Some generalization work needed to expand this
to generic viral genomes with an arbitrary number of segments/chromosomes.

note: all this runs in ~3hrs, no step takes more than 4GB RAM, and the
DAG is linear per sample, so we could conslidate most of this into a
single Bellini-like script (once we actually Tool-ify all of the critical
pieces like Trinity, MOSIAK, VFAT, Novoalign, GATK)
"""

__author__ = 'Kristian Andersen <[email protected]>, Daniel Park <[email protected]>'
@@ -24,6 +19,10 @@ rule all_assemble:
        # create BAMs of aligned reads to own consensus
        expand("{dataDir}/{subdir}/{sample}.bam",
            dataDir=config["dataDir"], subdir=config["subdirs"]["align_self"],
            sample=read_samples_file(config["samples_assembly"])),
        # assembly reports too
        expand("{reportsDir}/assembly/{sample}.txt",
            reportsDir=config["reportsDir"],
            sample=read_samples_file(config["samples_assembly"]))
    params: LSF="-N"

@@ -32,6 +31,9 @@ rule all_assemble_failures:
        expand("{dataDir}/{subdir}/{sample}.fasta",
            dataDir=config["dataDir"], subdir=config["subdirs"]["assembly"],
            sample=read_samples_file(config.get("samples_assembly_failures"))),
        expand("{reportsDir}/assembly/{sample}.txt",
            reportsDir=config["reportsDir"],
            sample=read_samples_file(config["samples_assembly"]))
    params: LSF="-N"


@@ -42,38 +44,42 @@ rule assemble_trinity:
        and random subsample to no more than 100k reads.
    '''
    input: config["dataDir"]+'/'+config["subdirs"]["per_sample"]+'/{sample}.taxfilt.bam'
    output: config["tmpDir"] +'/'+config["subdirs"]["assembly"]+'/{sample}.assembly1-trinity.fasta',
            config["tmpDir"] +'/'+config["subdirs"]["assembly"]+'/{sample}.subsamp.bam'
    output: config["tmpDir"] +'/'+config["subdirs"]["assembly"]+'/{sample}.assembly1-trinity.fasta'
    resources: mem=3
    params: LSF=config.get('LSF_queues', {}).get('short', '-W 4:00'),
            n_reads="100000",
            logid="{sample}",
            clipDb=config["trim_clipDb"]
            clipDb=config["trim_clipDb"],
            subsamp_bam=config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.subsamp.bam'
    run:
        makedirs(expand("{dir}/{subdir}",
            dir=[config["dataDir"],config["tmpDir"]],
            subdir=config["subdirs"]["assembly"]))
        shell("{config[binDir]}/assembly.py assemble_trinity {input} {params.clipDb} {output[0]} --n_reads={params.n_reads} --outReads {output[1]}")
        shell("{config[binDir]}/assembly.py assemble_trinity {input} {params.clipDb} {output} --n_reads={params.n_reads} --outReads {params.subsamp_bam}")

rule orient_and_impute:
    ''' This step cleans up the Trinity assembly with a known reference genome.
        VFAT (maybe Bellini later): take the Trinity contigs, align them to
        the known reference genome, switch it to the same strand as the
        reference, and produce chromosome-level assemblies (with runs of
        N's in between the Trinity contigs).
        filter_short_seqs: We then toss out all assemblies that come out to
        < 15kb or < 95% unambiguous and fail otherwise.
        modify_contig: Finally, we trim off anything at the end that exceeds
        order_and_orient (VFAT, maybe Bellini later): take the de novo contigs,
        align them to the known reference genome, switch them to the same
        strand as the reference, and produce chromosome-level assemblies
        (with runs of N's in between the de novo contigs).
        filter_short_seqs: Fail on assemblies that come out to
        < 15kb or < 95% unambiguous.
        impute_from_reference: trim off anything at the end that exceeds
        the length of the known reference assembly. We also replace all
        Ns and everything within 55bp of the chromosome ends with the
        reference sequence. This is clearly incorrect consensus sequence,
        but it allows downstream steps to map reads in parts of the genome
        that would otherwise be Ns, and we will correct all of the inferred
        positions with two steps of read-based refinement (below), and
        revert positions back to Ns where read support is lacking.
        revert positions back to Ns where read support is lacking. The
        de novo step is still important because it allows for significant
        structural / indel changes (or significant substitution distances)
        which will be captured in this output.
        This is the only step in the pipeline that needs generalization to
        multi-chromosome genomes.
    '''
    input: config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.assembly1-trinity.fasta',
           config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.subsamp.bam'
    input: config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.assembly1-trinity.fasta'
    output: config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.assembly2-vfat.fasta',
            config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.assembly3-modify.fasta'
    resources: mem=3
@@ -83,9 +89,10 @@ rule orient_and_impute:
            min_unambig = str(config["assembly_min_unambig"]),
            renamed_prefix="",
            replace_length="55",
            logid="{sample}"
            logid="{sample}",
            subsamp_bam=config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.subsamp.bam'
    run:
        shell("{config[binDir]}/assembly.py order_and_orient {input[0]} {params.refGenome} {output[0]} --inReads {input[1]}")
        shell("{config[binDir]}/assembly.py order_and_orient {input} {params.refGenome} {output[0]} --inReads {params.subsamp_bam}")
        shell("{config[binDir]}/assembly.py impute_from_reference {output[0]} {params.refGenome} {output[1]} --newName {params.renamed_prefix}{wildcards.sample} --replaceLength {params.replace_length} --minLength {params.length} --minUnambig {params.min_unambig}")

rule refine_assembly_1:
@@ -115,15 +122,13 @@ rule refine_assembly_2:
    ''' This is a second pass refinement step very similar to the first.
        The only differences are that Novoalign mapping parameters are
        more conservative and the input consensus sequence has already
        been refined once and the input reads are the full raw read
        set (instead of the cleaned read set). We also require higher
        minimum read coverage (3 instead of 2) in order to call a
        non-ambiguous base.
        been refined once. We also require higher minimum read coverage
        (3 instead of 2) in order to call a non-ambiguous base.
        The output of this step is the final assembly for this sample.
        Final FASTA file is indexed for Picard, Samtools, and Novoalign.
    '''
    input: config["tmpDir"] +'/'+config["subdirs"]["assembly"]+'/{sample}.assembly4-refined.fasta',
           config["dataDir"]+'/'+config["subdirs"]["per_sample"]+'/{sample}.raw.bam'
           config["dataDir"]+'/'+config["subdirs"]["per_sample"]+'/{sample}.cleaned.bam'
    output: config["dataDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.fasta',
            config["tmpDir"] +'/'+config["subdirs"]["assembly"]+'/{sample}.assembly4.vcf.gz'
    resources: mem=4
@@ -136,19 +141,21 @@
rule map_reads_to_self:
    ''' After the final assembly is produced, we also produce BAM files with all reads
        mapped back to its own consensus. Outputs several BAM files, sorted and indexed:
            {sample}.bam - all raw reads
            {sample}.mapped.bam - only mapped reads
            {sample}.rmdup.bam - only mapped reads, with duplicates removed by Picard
            {sample}.realigned.bam - mapped/rmdup reads, realigned with GATK IndelRealigner
            {sample}.bam - all raw reads aligned to self
            {sample}.mapped.bam - only mapped reads, duplicates removed by Picard,
                realigned with GATK IndelRealigner
    '''
    input: config["dataDir"]+'/'+config["subdirs"]["per_sample"]+'/{sample}.cleaned.bam',
           config["dataDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.fasta'
    output: config["dataDir"]+'/'+config["subdirs"]["align_self"]+'/{sample}.bam',
            config["dataDir"]+'/'+config["subdirs"]["align_self"]+'/{sample}.mapped.bam'
            config["dataDir"]+'/'+config["subdirs"]["align_self"]+'/{sample}.mapped.bam',
            config["reportsDir"]+'/assembly/{sample}.txt'
    resources: mem=4
    params: LSF=config.get('LSF_queues', {}).get('short', '-W 4:00'),
            logid="{sample}",
            novoalign_options = "-r Random -l 40 -g 40 -x 20 -t 100 -k -c 3"
    run:
        makedirs(os.path.join(config["dataDir"], config["subdirs"]["align_self"]))
        makedirs([os.path.join(config["dataDir"], config["subdirs"]["align_self"]),
                  os.path.join(config["reportsDir"], "assembly")])
        shell("{config[binDir]}/read_utils.py align_and_fix {input} --outBamAll {output[0]} --outBamFiltered {output[1]} --novoalign_options '{params.novoalign_options}'")
        shell("{config[binDir]}/reports.py assembly_stats {wildcards.sample} {output[2]} --assembly_dir {config[dataDir]}/{config[subdirs][assembly]} --assembly_tmp {config[tmpDir]}/{config[subdirs][assembly]} --align_dir {config[dataDir]}/{config[subdirs][align_self]}")
13 changes: 7 additions & 6 deletions pipes/rules/hs_deplete.rules
@@ -16,15 +16,15 @@ rule all_deplete:
        expand("{dataDir}/{subdir}/{sample}.{adjective}.bam",
            dataDir=config["dataDir"],
            subdir=config["subdirs"]["per_sample"],
            adjective=['raw','cleaned','taxfilt'],
            adjective=['cleaned','taxfilt'],
            sample=read_samples_file(config["samples_depletion"])),
    params: LSF="-N"


"""
rule revert_bam:
    input: config["dataDir"]+'/'+config["subdirs"]["source"]+'/{sample}.bam'
    output: config["dataDir"]+'/'+config["subdirs"]["depletion"]+'/{sample}.raw.bam'
    output: config["tmpDir"]+'/'+config["subdirs"]["depletion"]+'/{sample}.raw.bam'
    resources: mem=3
    params: LSF=config.get('LSF_queues', {}).get('short', '-W 4:00'),
            logid="{sample}"
@@ -38,7 +38,7 @@ rule deplete_bmtagger:
    ''' This step depletes human reads using BMTagger
        This sometimes takes just over 4 hrs for highly dense lanes
    '''
    input: config["dataDir"]+'/'+config["subdirs"]["depletion"]+'/{sample}.raw.bam'
    input: config["tmpDir"]+'/'+config["subdirs"]["depletion"]+'/{sample}.raw.bam'
    #       expand("{dbdir}/{db}.{suffix}",
    #           dbdir=config["bmTaggerDbDir"],
    #           db=config["bmTaggerDbs_remove"],
@@ -82,20 +82,21 @@ rule depletion:
    ''' Runs a full human read depletion pipeline and removes PCR duplicates
    '''
    input: config["dataDir"]+'/'+config["subdirs"]["source"]+'/{sample}.bam'
    output: config["dataDir"]+'/'+config["subdirs"]["depletion"]+'/{sample}.raw.bam',
            config["tmpDir"]+ '/'+config["subdirs"]["depletion"]+'/{sample}.bmtagger_depleted.bam',
    output: config["tmpDir"] +'/'+config["subdirs"]["depletion"]+'/{sample}.bmtagger_depleted.bam',
            config["tmpDir"] +'/'+config["subdirs"]["depletion"]+'/{sample}.rmdup.bam',
            config["dataDir"]+'/'+config["subdirs"]["depletion"]+'/{sample}.cleaned.bam'
    resources: mem=15
    params: LSF=config.get('LSF_queues', {}).get('long', '-q forest'),
            bmtaggerDbs = expand("{dbdir}/{db}", dbdir=config["bmTaggerDbDir"], db=config["bmTaggerDbs_remove"]),
            blastDbs = expand("{dbdir}/{db}", dbdir=config["blastDbDir"], db=config["blastDb_remove"]),
            revert_bam = config["tmpDir"] +'/'+config["subdirs"]["depletion"]+'/{sample}.raw.bam',
            logid="{sample}"
    run:
        makedirs(expand("{dir}/{subdir}",
            dir=[config["dataDir"],config["tmpDir"]],
            subdir=config["subdirs"]["depletion"]))
        shell("{config[binDir]}/taxon_filter.py deplete_human {input} {output} --bmtaggerDbs {params.bmtaggerDbs} --blastDbs {params.blastDbs}")
        shell("{config[binDir]}/taxon_filter.py deplete_human {input} {params.revert_bam} {output} --bmtaggerDbs {params.bmtaggerDbs} --blastDbs {params.blastDbs}")
        os.unlink(params.revert_bam)

rule filter_to_taxon:
''' This step reduces the read set to a specific taxon (usually the genus
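The depletion rule follows the same idea: the reverted {sample}.raw.bam now lives under tmpDir as a params entry instead of being published as an output, and the run block deletes it as soon as deplete_human has written the cleaned BAM. Roughly, with illustrative paths and the deplete_human invocation abbreviated (the real rule also passes the bmtagger/rmdup outputs and the database flags):

rule depletion_sketch:
    input:  "data/00_raw/{sample}.bam"
    output: "data/01_per_sample/{sample}.cleaned.bam"
    params: revert_bam="tmp/01_depletion/{sample}.raw.bam"
    run:
        shell("taxon_filter.py deplete_human {input} {params.revert_bam} {output}")
        os.unlink(params.revert_bam)   # drop the large intermediate immediately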
2 changes: 1 addition & 1 deletion pipes/rules/interhost.rules
@@ -37,7 +37,7 @@ rule all_ref_guided:
config["reportsDir"]+'/summary.coverage_ref.txt.gz'

rule ref_guided_consensus:
input: config["dataDir"]+'/'+config["subdirs"]["depletion"]+'/{sample}.raw.bam'
input: config["dataDir"]+'/'+config["subdirs"]["source"]+'/{sample}.bam'
output: config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.realigned.bam',
config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.vcf.gz',
config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.fasta'
56 changes: 56 additions & 0 deletions pipes/rules/reports_only
@@ -0,0 +1,56 @@
"""
Just the assembly reports and nothing else
"""

__author__ = 'Daniel Park <[email protected]>'

import os

configfile: "config.json"

def read_file(fname):
    with open(fname, 'rt') as inf:
        for line in inf:
            yield line.strip()

def cat_files_with_header(inFiles, outFile):
    with open(outFile, 'wt') as outf:
        header = None
        for f in inFiles:
            with open(f, 'rt') as inf:
                h = None
                for line in inf:
                    if h == None:
                        # this is the header line
                        h = line
                        if header == None:
                            # this is the first file
                            outf.write(line)
                            header = h
                        else:
                            # this is not the first file
                            if h != header:
                                raise Exception("headers do not match")
                    else:
                        outf.write(line)


rule all_assembly_reports:
    input:
        expand("{reportsDir}/assembly/{sample}.txt",
            reportsDir=config["reportsDir"],
            sample=read_file(config["samples_depletion"]))
    output:
        config["reportsDir"]+"/summary.assembly.txt"
    params: LSF="-N"
    run:
        cat_files_with_header(input, output[0])


rule assembly_report:
    output: config["reportsDir"]+'/assembly/{sample}.txt'
    resources: mem=2
    params: LSF=config.get('LSF_queues', {}).get('short', '-W 4:00'),
            logid="{sample}"
    shell: "{config[binDir]}/reports.py assembly_stats {wildcards.sample} {output} --assembly_dir {config[dataDir]}/{config[subdirs][assembly]} --assembly_tmp {config[tmpDir]}/{config[subdirs][assembly]} --align_dir {config[dataDir]}/{config[subdirs][align_self]}"
