Merge pull request #80 from broadinstitute/dpark-dev
new reports and slight pipeline tweaks
Showing 11 changed files with 252 additions and 61 deletions.
@@ -2,11 +2,6 @@
This is a basic framework for assembly of viral genomes, currently
tailored for EBOV. Some generalization work is needed to expand this
to generic viral genomes with an arbitrary number of segments/chromosomes.

note: all this runs in ~3hrs, no step takes more than 4GB RAM, and the
DAG is linear per sample, so we could consolidate most of this into a
single Bellini-like script (once we actually Tool-ify all of the critical
pieces like Trinity, MOSAIK, VFAT, Novoalign, GATK)
"""

__author__ = 'Kristian Andersen <[email protected]>, Daniel Park <[email protected]>'

@@ -24,6 +19,10 @@ rule all_assemble:
        # create BAMs of aligned reads to own consensus
        expand("{dataDir}/{subdir}/{sample}.bam",
            dataDir=config["dataDir"], subdir=config["subdirs"]["align_self"],
            sample=read_samples_file(config["samples_assembly"])),
        # assembly reports too
        expand("{reportsDir}/assembly/{sample}.txt",
            reportsDir=config["reportsDir"],
            sample=read_samples_file(config["samples_assembly"]))
    params: LSF="-N"

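For readers less familiar with Snakemake, the expand() calls added above just fill the filename pattern with the cross product of the named value lists, yielding one report target per sample. A minimal sketch of that behavior in plain Python, using hypothetical config values and sample names rather than the pipeline's real config:

# Illustrative only: a simplified stand-in for Snakemake's expand(), with
# hypothetical values ("reports", two made-up sample names).
from itertools import product

def expand_like(pattern, **lists):
    # Treat bare strings as single-element lists, then take the cross product
    # of all named values and fill the pattern once per combination.
    lists = {k: ([v] if isinstance(v, str) else list(v)) for k, v in lists.items()}
    keys = list(lists)
    return [pattern.format(**dict(zip(keys, combo)))
            for combo in product(*(lists[k] for k in keys))]

print(expand_like("{reportsDir}/assembly/{sample}.txt",
                  reportsDir="reports", sample=["G1234", "G5678"]))
# ['reports/assembly/G1234.txt', 'reports/assembly/G5678.txt']
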
@@ -32,6 +31,9 @@ rule all_assemble_failures:
        expand("{dataDir}/{subdir}/{sample}.fasta",
            dataDir=config["dataDir"], subdir=config["subdirs"]["assembly"],
            sample=read_samples_file(config.get("samples_assembly_failures"))),
        expand("{reportsDir}/assembly/{sample}.txt",
            reportsDir=config["reportsDir"],
            sample=read_samples_file(config["samples_assembly"]))
    params: LSF="-N"

@@ -42,38 +44,42 @@ rule assemble_trinity:
        and random subsample to no more than 100k reads.
    '''
    input: config["dataDir"]+'/'+config["subdirs"]["per_sample"]+'/{sample}.taxfilt.bam'
    output: config["tmpDir"] +'/'+config["subdirs"]["assembly"]+'/{sample}.assembly1-trinity.fasta',
            config["tmpDir"] +'/'+config["subdirs"]["assembly"]+'/{sample}.subsamp.bam'
    output: config["tmpDir"] +'/'+config["subdirs"]["assembly"]+'/{sample}.assembly1-trinity.fasta'
    resources: mem=3
    params: LSF=config.get('LSF_queues', {}).get('short', '-W 4:00'),
            n_reads="100000",
            logid="{sample}",
            clipDb=config["trim_clipDb"]
            clipDb=config["trim_clipDb"],
            subsamp_bam=config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.subsamp.bam'
    run:
            makedirs(expand("{dir}/{subdir}",
                dir=[config["dataDir"],config["tmpDir"]],
                subdir=config["subdirs"]["assembly"]))
            shell("{config[binDir]}/assembly.py assemble_trinity {input} {params.clipDb} {output[0]} --n_reads={params.n_reads} --outReads {output[1]}")
            shell("{config[binDir]}/assembly.py assemble_trinity {input} {params.clipDb} {output} --n_reads={params.n_reads} --outReads {params.subsamp_bam}")

rule orient_and_impute:
    ''' This step cleans up the Trinity assembly with a known reference genome.
        VFAT (maybe Bellini later): take the Trinity contigs, align them to
            the known reference genome, switch it to the same strand as the
            reference, and produce chromosome-level assemblies (with runs of
            N's in between the Trinity contigs).
        filter_short_seqs: We then toss out all assemblies that come out to
            < 15kb or < 95% unambiguous and fail otherwise.
        modify_contig: Finally, we trim off anything at the end that exceeds
        order_and_orient (VFAT, maybe Bellini later): take the de novo contigs,
            align them to the known reference genome, switch them to the same
            strand as the reference, and produce chromosome-level assemblies
            (with runs of N's in between the de novo contigs).
        filter_short_seqs: Fail on assemblies that come out to
            < 15kb or < 95% unambiguous.
        impute_from_reference: trim off anything at the end that exceeds
            the length of the known reference assembly. We also replace all
            Ns and everything within 55bp of the chromosome ends with the
            reference sequence. This is clearly incorrect consensus sequence,
            but it allows downstream steps to map reads in parts of the genome
            that would otherwise be Ns, and we will correct all of the inferred
            positions with two steps of read-based refinement (below), and
            revert positions back to Ns where read support is lacking.
            revert positions back to Ns where read support is lacking. The
            de novo step is still important because it allows for significant
            structural / indel changes (or significant substitution distances)
            which will be captured in this output.
        This is the only step in the pipeline that needs generalization to
            multi-chromosome genomes.
    '''
    input: config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.assembly1-trinity.fasta',
           config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.subsamp.bam'
    input: config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.assembly1-trinity.fasta'
    output: config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.assembly2-vfat.fasta',
            config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.assembly3-modify.fasta'
    resources: mem=3

@@ -83,9 +89,10 @@ rule orient_and_impute:
            min_unambig = str(config["assembly_min_unambig"]),
            renamed_prefix="",
            replace_length="55",
            logid="{sample}"
            logid="{sample}",
            subsamp_bam=config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.subsamp.bam'
    run:
            shell("{config[binDir]}/assembly.py order_and_orient {input[0]} {params.refGenome} {output[0]} --inReads {input[1]}")
            shell("{config[binDir]}/assembly.py order_and_orient {input} {params.refGenome} {output[0]} --inReads {params.subsamp_bam}")
            shell("{config[binDir]}/assembly.py impute_from_reference {output[0]} {params.refGenome} {output[1]} --newName {params.renamed_prefix}{wildcards.sample} --replaceLength {params.replace_length} --minLength {params.length} --minUnambig {params.min_unambig}")

rule refine_assembly_1:

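The thresholds mentioned in the orient_and_impute docstring (fail assemblies shorter than 15kb or less than 95% unambiguous) amount to a simple per-sequence acceptance test. A minimal sketch of that test, assuming Biopython for FASTA parsing; it is illustrative only and not the pipeline's assembly.py implementation:

# Minimal sketch of the length / unambiguity test described in the
# orient_and_impute docstring above. Not the pipeline's assembly.py code;
# the default thresholds and the Bio.SeqIO usage are illustrative assumptions.
from Bio import SeqIO

def passes_assembly_filter(fasta_path, min_length=15000, min_unambig=0.95):
    for rec in SeqIO.parse(fasta_path, "fasta"):
        seq = str(rec.seq).upper()
        unambig = sum(seq.count(b) for b in "ACGT") / max(len(seq), 1)
        if len(seq) < min_length or unambig < min_unambig:
            return False   # too short or too many Ns/ambiguity codes: fail the sample
    return True
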
@@ -115,15 +122,13 @@ rule refine_assembly_2:
    ''' This is a second-pass refinement step very similar to the first.
        The only differences are that Novoalign mapping parameters are
        more conservative and the input consensus sequence has already
        been refined once and the input reads are the full raw read
        set (instead of the cleaned read set). We also require higher
        minimum read coverage (3 instead of 2) in order to call a
        non-ambiguous base.
        been refined once. We also require higher minimum read coverage
        (3 instead of 2) in order to call a non-ambiguous base.
        The output of this step is the final assembly for this sample.
        The final FASTA file is indexed for Picard, Samtools, and Novoalign.
    '''
    input: config["tmpDir"] +'/'+config["subdirs"]["assembly"]+'/{sample}.assembly4-refined.fasta',
           config["dataDir"]+'/'+config["subdirs"]["per_sample"]+'/{sample}.raw.bam'
           config["dataDir"]+'/'+config["subdirs"]["per_sample"]+'/{sample}.cleaned.bam'
    output: config["dataDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.fasta',
            config["tmpDir"] +'/'+config["subdirs"]["assembly"]+'/{sample}.assembly4.vcf.gz'
    resources: mem=4

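The higher minimum read coverage in this second pass (3 instead of 2) is a consensus-calling rule: positions with insufficient read support stay ambiguous. A toy sketch of the idea, not the pipeline's GATK/assembly.py logic; calls and depths are hypothetical per-position arrays:

# Illustrative sketch of the coverage rule mentioned above: positions with
# read depth below min_coverage are reported as 'N' rather than a base call.
def mask_low_coverage(calls, depths, min_coverage=3):
    assert len(calls) == len(depths)
    return ''.join(base if depth >= min_coverage else 'N'
                   for base, depth in zip(calls, depths))

# e.g. mask_low_coverage("ACGT", [5, 2, 3, 0]) -> "ANGN"
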
@@ -136,19 +141,21 @@ rule refine_assembly_2:
rule map_reads_to_self:
    ''' After the final assembly is produced, we also produce BAM files with all reads
        mapped back to the sample's own consensus. Outputs several BAM files, sorted and indexed:
            {sample}.bam - all raw reads
            {sample}.mapped.bam - only mapped reads
            {sample}.rmdup.bam - only mapped reads, with duplicates removed by Picard
            {sample}.realigned.bam - mapped/rmdup reads, realigned with GATK IndelRealigner
            {sample}.bam - all raw reads aligned to self
            {sample}.mapped.bam - only mapped reads, duplicates removed by Picard,
                realigned with GATK IndelRealigner
    '''
    input: config["dataDir"]+'/'+config["subdirs"]["per_sample"]+'/{sample}.cleaned.bam',
           config["dataDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.fasta'
    output: config["dataDir"]+'/'+config["subdirs"]["align_self"]+'/{sample}.bam',
            config["dataDir"]+'/'+config["subdirs"]["align_self"]+'/{sample}.mapped.bam'
            config["dataDir"]+'/'+config["subdirs"]["align_self"]+'/{sample}.mapped.bam',
            config["reportsDir"]+'/assembly/{sample}.txt'
    resources: mem=4
    params: LSF=config.get('LSF_queues', {}).get('short', '-W 4:00'),
            logid="{sample}",
            novoalign_options = "-r Random -l 40 -g 40 -x 20 -t 100 -k -c 3"
    run:
            makedirs(os.path.join(config["dataDir"], config["subdirs"]["align_self"]))
            makedirs([os.path.join(config["dataDir"], config["subdirs"]["align_self"]),
                      os.path.join(config["reportsDir"], "assembly")])
            shell("{config[binDir]}/read_utils.py align_and_fix {input} --outBamAll {output[0]} --outBamFiltered {output[1]} --novoalign_options '{params.novoalign_options}'")
            shell("{config[binDir]}/reports.py assembly_stats {wildcards.sample} {output[2]} --assembly_dir {config[dataDir]}/{config[subdirs][assembly]} --assembly_tmp {config[tmpDir]}/{config[subdirs][assembly]} --align_dir {config[dataDir]}/{config[subdirs][align_self]}")
@@ -0,0 +1,56 @@
"""
Just the assembly reports and nothing else
"""

__author__ = 'Daniel Park <[email protected]>'

import os

configfile: "config.json"

def read_file(fname):
    with open(fname, 'rt') as inf:
        for line in inf:
            yield line.strip()

def cat_files_with_header(inFiles, outFile):
    with open(outFile, 'wt') as outf:
        header = None
        for f in inFiles:
            with open(f, 'rt') as inf:
                h = None
                for line in inf:
                    if h is None:
                        # this is the header line
                        h = line
                        if header is None:
                            # this is the first file
                            outf.write(line)
                            header = h
                        else:
                            # this is not the first file
                            if h != header:
                                raise Exception("headers do not match")
                    else:
                        outf.write(line)

rule all_assembly_reports:
    input:
            expand("{reportsDir}/assembly/{sample}.txt",
                reportsDir=config["reportsDir"],
                sample=read_file(config["samples_depletion"]))
    output:
            config["reportsDir"]+"/summary.assembly.txt"
    params: LSF="-N"
    run:
            cat_files_with_header(input, output[0])


rule assembly_report:
    output: config["reportsDir"]+'/assembly/{sample}.txt'
    resources: mem=2
    params: LSF=config.get('LSF_queues', {}).get('short', '-W 4:00'),
            logid="{sample}"
    shell: "{config[binDir]}/reports.py assembly_stats {wildcards.sample} {output} --assembly_dir {config[dataDir]}/{config[subdirs][assembly]} --assembly_tmp {config[tmpDir]}/{config[subdirs][assembly]} --align_dir {config[dataDir]}/{config[subdirs][align_self]}"

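A usage sketch of cat_files_with_header as the all_assembly_reports rule invokes it; the file names are hypothetical examples, not pipeline paths:

# The header row from the first per-sample report is written once; every other
# file must carry an identical header (otherwise an exception is raised), and
# only its data rows are appended to the summary.
per_sample_reports = ["reports/assembly/G1234.txt", "reports/assembly/G5678.txt"]
cat_files_with_header(per_sample_reports, "reports/summary.assembly.txt")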