add workflows for isnvs_merge_to_vcf (#864)

* add workflows for isnvs_merge_to_vcf and isnvs_merge_to_vcf_filtered * consolidate isnv workflows * include multiple alignment as part of isnv merge_to_vcf * isnvs_per_sample: also remove ".mapped" from bam file used to infer sample name * make emaill address optional for isnvs_vcf WDL (and for snpEff) * infer snpEff accessions from ref fasta if not provided * wdl corrections * WIP * chain commands with &&; accession parse correction * set -ex -o pipefail * comment change * update snpEff 4.1l -> 4.3.1t * add snpEff integration test * comment out custom tmpdir test fixtures * try older tmpdir fixtures * replace tmpdir_function fixture with tempfile.gettempdir() * revert; incorporate changes from 0191d68 (@notestaff) * only check final snp_eff tabular output * disable stdout capture by pytest for snpeff test * ignore BOM in open_or_gzopen https://en.wikipedia.org/wiki/Byte_order_mark * typo correction * debug * io open * kwarg reorder * debug * revert utf-8-sig * remove debug * correct git merge butchery * skip test_snpeff for now errors occur on Travis that so far have not been reproducible locally * dev notes * allow passage of stderr handle in run_and_print allow passage of stderr handle in run_and_print, defaulting to stdout redirection as before if not specified * stderr=subprocess.STDOUT default in run_and_print * py27 doesn't have subprocess.DEVNULL (thanks, @notestaff) * correcting late night mistakes * assertAlmostEqual for floats in snpEff test * maintain consistent order of inferred sample name w/r/t isnv files * WIP * add --emailAddress syntax to snpEff snakemake rule * WIP * WIP * remove test skipIf
broadinstitute · Jul 25, 2018 · 03b8368 · 03b8368
1 parent e2515d3
commit 03b8368
Show file tree

Hide file tree

Showing 25 changed files with 981 additions and 103 deletions.
diff --git a/DEVELOPMENT_NOTES.md b/DEVELOPMENT_NOTES.md
@@ -20,6 +20,7 @@ When upgrading the GATK to a new version:
 - in tools/gatk.py change TOOL_VERSION_TUPLE at the top
 - in travis/install-gatk.sh change GATK_VERSION at the top
 - in easy-deploy-script/easy-deploy-viral-ngs.sh 
+- in docker/rundocker.sh 
 
 ### (Automated) testing 
 [Travis CI](https://travis-ci.org/broadinstitute/viral-ngs) performs automated unit and integration tests for viral-ngs on each branch and pull request. Unit tests are run on each new branch commit, and longer integration tests are performed on pull requests to help ensure the stability of the `master` branch. Pull requests are gated to ensure merging to `master` is allowed only if all tests pass. The Travis configuration is specified in `.travis.yml`, and relies on files stored within `viral-ngs/travis/`.

diff --git a/conftest.py b/conftest.py
@@ -120,4 +120,4 @@ def pytest_terminal_summary(self, terminalreporter, exitstatus):
         widths = [max(map(len, col)) for col in zip(*rows)]
         for row in rows:
             writer.write(" ".join((val.ljust(width) for val, width in zip(row, widths))))
-            writer.line()
+            writer.line()
diff --git a/docker/rundocker.sh b/docker/rundocker.sh
@@ -3,7 +3,7 @@
 # A wrapper script to run viral-ngs docker images
 # The following paths have to be modified according to end-user environment
 NOVOALIGN_PATH="/opt/novocraft" # Directory where novoalign.lic license file can befound
-GATK_PATH="/opt/GenomeAnalysisTK-3.6" # Directory where the correct GATK jar file can be found
+GATK_PATH="/opt/GenomeAnalysisTK-3.8" # Directory where the correct GATK jar file can be found
 IMAGE_HASH_OR_TAG="local/viral-ngs:1.16.0" # This can be found by running this command 'docker images'
 DATA_DIR="$1"; shift
 GID=$(id -g $USER)

diff --git a/illumina.py b/illumina.py
@@ -553,7 +553,6 @@ def get_flowcell(self):
             log.warn("The provided flowcell ID is longer than 15 characters. Is that correct?")
         return fc
 
-    @util.misc.memoize
     def _get_rundate_obj(self):
         """
             Access the text of the <Date> node in the RunInfo.xml file

diff --git a/interhost.py b/interhost.py
@@ -328,7 +328,7 @@ def parser_snpEff(parser=argparse.ArgumentParser()):
     parser.add_argument("inVcf", help="Input VCF file")
     parser.add_argument("genomes", nargs='+', help="genome name (snpEff db name, or NCBI accessions)")
     parser.add_argument("outVcf", help="Output VCF file")
-    parser.add_argument("emailAddress",
+    parser.add_argument("--emailAddress",
                         help="""Your email address. To access the Genbank CoreNucleotide database,
         NCBI requires you to specify your email address with each request.
         In case of excessive usage of the E-utilities, NCBI will attempt to contact

diff --git a/intrahost.py b/intrahost.py
@@ -485,7 +485,31 @@ def merge_to_vcf(
         guessed_samples = samplenames_from_isnvs + list(samplenames_from_alignments-(refnames|set(samplenames_from_isnvs)))
         log.info("guessed sample names %s" % guessed_samples)
 
-    samples = samples or guessed_samples
+    samples = samples if samples is not None and len(samples)>0 else guessed_samples
+
+    samp_to_isnv = {}
+    # if we had to guess sample names, match them up to isnv files
+    if len(guessed_samples)>0:
+        matched_samples = []
+        matched_isnv_files = []
+        for sample in samples:
+            sample_found=False
+            for isnvs_file in isnvs:
+                for row in util.file.read_tabfile(isnvs_file):
+                    if sample==sampleIDMatch(row[0]):
+                        samp_to_isnv[sample] = isnvs_file
+                        sample_found=True
+                        matched_samples.append(sample)
+                        matched_isnv_files.append(isnvs_file)
+                        break
+                if sample_found:
+                    break
+        samples = matched_samples
+        isnvs = matched_isnv_files
+    else:
+        samp_to_isnv = dict(zip(samples, isnvs))
+
+    log.info(samp_to_isnv)
 
     # get IDs and sequence lengths for reference sequence
     with util.file.open_or_gzopen(refFasta, 'r') as inf:
@@ -567,22 +591,14 @@ def merge_to_vcf(
                 # to the assemblies
 
                 # if we had to guess samples only check that the number of isnv files == number of alignments
-                if len(guessed_samples)>0:
-                    if not (number_of_aligned_sequences - 1) == num_isnv_files:
-                        raise LookupError(
-                            """The number of isnv files provided (%s) and must equal the number of sequences
-                            seen in the alignment (%s) (plus an extra reference record in the alignment).
-                            %s does not have the right number of sequences""" % (num_isnv_files,number_of_aligned_sequences - 1,fileName))
-                else:
+                if len(guessed_samples)==0:
                     if not (number_of_aligned_sequences - 1) == num_isnv_files == len(samples):
                         raise LookupError(
                             """The number of isnv files provided (%s) and must equal the number of sequences
                             seen in the alignment (%s) (plus an extra reference record in the alignment), 
                             as well as the number of sample names provided (%s)
                             %s does not have the right number of sequences""" % (num_isnv_files,number_of_aligned_sequences - 1,len(samples),fileName))
 
-        samp_to_isnv = dict(zip(samples, isnvs))
-
         # one reference chrom at a time
         with open(refFasta, 'r') as inf:
             for ref_sequence in Bio.SeqIO.parse(inf, 'fasta'):
@@ -611,6 +627,7 @@ def merge_to_vcf(
                         for sampleName in samplesToUse:
                             if seq.id == sampleName:
                                 samp_to_seqIndex[sampleName] = seq.seq.ungap('-')
+                                break
 
                 if not len(samp_to_seqIndex) == len(samplesToUse):
                     raise LookupError(

diff --git a/pipes/WDL/workflows/isnvs_merge_to_vcf.wdl b/pipes/WDL/workflows/isnvs_merge_to_vcf.wdl
@@ -0,0 +1,18 @@
+import "tasks_interhost.wdl" as interhost
+import "tasks_intrahost.wdl" as tasks_intrahost
+
+workflow isnvs_merge_to_vcf {
+    File          reference_fasta
+    Array[File]+  assemblies_fasta     # one per genome
+
+    call interhost.multi_align_mafft_ref as mafft {
+        input:
+            reference_fasta = reference_fasta,
+            assemblies_fasta = assemblies_fasta
+    }
+    call tasks_intrahost.isnvs_vcf {
+        input:
+            perSegmentMultiAlignments = mafft.alignments_by_chr,
+            reference_fasta = reference_fasta
+    }
+}
diff --git a/pipes/WDL/workflows/tasks/tasks_intrahost.wdl b/pipes/WDL/workflows/tasks/tasks_intrahost.wdl
@@ -7,7 +7,7 @@ task isnvs_per_sample {
   Int? minReadsPerStrand
   Int? maxBias
 
-  String sample_name = basename(basename(mapped_bam, ".bam"), ".all")
+  String sample_name = basename(basename(basename(mapped_bam, ".bam"), ".all"), ".mapped")
 
   command {
     intrahost.py vphaser_one_sample \
@@ -29,61 +29,31 @@ task isnvs_per_sample {
   }
 }
 
+
 task isnvs_vcf {
   Array[File] vphaser2Calls # vphaser output; ex. vphaser2.${sample}.txt.gz
   Array[File] perSegmentMultiAlignments # aligned_##.fasta, where ## is segment number
   File reference_fasta
 
-  Array[String] snpEffRef # list of accessions to build/find snpEff database
+  Array[String]? snpEffRef # list of accessions to build/find snpEff database
   Array[String]? sampleNames # list of sample names
-  String emailAddress # email address passed to NCBI if we need to download reference sequences
+  String? emailAddress # email address passed to NCBI if we need to download reference sequences
+  Boolean naiveFilter=false
 
   command {
+    set -ex -o pipefail
+
     SAMPLES="${sep=' ' sampleNames}"
     if [ -n "$SAMPLES" ]; then SAMPLES="--samples $SAMPLES"; fi
 
-    intrahost.py merge_to_vcf  \
-        ${reference_fasta} \
-        isnvs.vcf.gz \
-        $SAMPLES \
-        --isnvs ${sep=' ' vphaser2Calls} \
-        --alignments ${sep=' ' perSegmentMultiAlignments} \
-        --strip_chr_version \
-        --parse_accession
-
-    interhost.py snpEff \
-        isnvs.vcf.gz \
-        ${sep=' ' snpEffRef} \
-        isnvs.annot.vcf.gz \
-        ${emailAddress}
-
-    intrahost.py iSNV_table \
-        isnvs.annot.vcf.gz \
-        isnvs.annot.txt.gz        
-  }
-
-  output {
-    Array[File] isnvFiles = ["isnvs.vcf.gz", "isnvs.vcf.gz.tbi", "isnvs.annot.vcf.gz", "isnvs.annot.txt.gz", "isnvs.annot.vcf.gz.tbi"]
-  }
-  runtime {
-    memory: "4 GB"
-    docker: "quay.io/broadinstitute/viral-ngs"
-  }
-}
-
-task isnvs_vcf_filtered {
-  Array[File] vphaser2Calls # vphaser output; ex. vphaser2.${sample}.txt.gz
-  Array[File] perSegmentMultiAlignments # aligned_##.fasta, where ## is segment number
-  File reference_fasta
+    providedSnpRefAccessions="${sep=' ' snpEffRef}"
+    if [ -n "$providedSnpRefAccessions" ]; then 
+      snpRefAccessions="$providedSnpRefAccessions";
+    else
+      snpRefAccessions="$(python -c "from Bio import SeqIO; print(' '.join(list(s.id for s in SeqIO.parse('${reference_fasta}', 'fasta'))))")"
+    fi
 
-  Array[String] snpEffRef # list of accessions to build/find snpEff database
-  Array[String]? sampleNames # list of sample names
-  String emailAddress # email address passed to NCBI if we need to download reference sequences
-  Boolean naiveFilter
-
-  command {
-    SAMPLES="${sep=' ' sampleNames}"
-    if [ -n "$SAMPLES" ]; then SAMPLES="--samples $SAMPLES"; fi
+    echo "snpRefAccessions: $snpRefAccessions"
 
     intrahost.py merge_to_vcf \
         ${reference_fasta} \
@@ -92,18 +62,18 @@ task isnvs_vcf_filtered {
         --isnvs ${sep=' ' vphaser2Calls} \
         --alignments ${sep=' ' perSegmentMultiAlignments} \
         --strip_chr_version \
-        ${'--naive_filter' + naiveFilter} \
+        ${true="--naive_filter" false="" naiveFilter} \
         --parse_accession
-
+        
     interhost.py snpEff \
         isnvs.vcf.gz \
-        ${sep=' ' snpEffRef} \
+        $snpRefAccessions \
         isnvs.annot.vcf.gz \
-        ${emailAddress}
+        ${'--emailAddress=' + emailAddress}
 
     intrahost.py iSNV_table \
         isnvs.annot.vcf.gz \
-        isnvs.annot.txt.gz \
+        isnvs.annot.txt.gz
   }
 
   output {

diff --git a/pipes/rules/intrahost.rules b/pipes/rules/intrahost.rules
@@ -56,7 +56,7 @@ rule isnvs_vcf:
         UGER          = config.get('UGER_queues', {}).get('short', '-l h_rt=04:00:00'),
         logid         = "all",
         snpEff_ref    = " ".join(config["accessions_for_ref_genome_build"]),
-        email_address = config["email_point_of_contact_for_ncbi"]
+        email_address = "--emailAddress "+config["email_point_of_contact_for_ncbi"] if config["email_point_of_contact_for_ncbi"] else ""
     run:
         shell("{config[bin_dir]}/intrahost.py merge_to_vcf {input.ref_genome} {output.raw_vcf}"
             + " --samples " + " ".join(read_samples_file(config["samples_assembly"]))
@@ -92,7 +92,7 @@ rule isnvs_vcf_filtered:
         UGER         = config.get('UGER_queues', {}).get('short', '-l h_rt=04:00:00'),
         logid        = "all",
         snpEff_ref   = " ".join(config["accessions_for_ref_genome_build"]),
-        emailAddress = config["email_point_of_contact_for_ncbi"],
+        email_address = "--emailAddress "+config["email_point_of_contact_for_ncbi"] if config["email_point_of_contact_for_ncbi"] else "",
         naiveFilter  = "--naive_filter" if config["vcf_merge_naive_filter"] else ""
     run:
         shell("{config[bin_dir]}/intrahost.py merge_to_vcf {input.ref_genome} {output.raw_vcf}"

diff --git a/read_utils.py b/read_utils.py
@@ -754,11 +754,11 @@ def main_reheader_bam(args):
     ''' Copy a BAM file (inBam to outBam) while renaming elements of the BAM header.
         The mapping file specifies which (key, old value, new value) mappings. For
         example:
-            LB  lib1  lib_one
-            SM  sample1 Sample_1
-            SM  sample2 Sample_2
-            SM  sample3 Sample_3
-            CN  broad   BI
+            LB	lib1	lib_one
+            SM	sample1	Sample_1
+            SM	sample2	Sample_2
+            SM	sample3	Sample_3
+            CN	broad	BI
     '''
     # read mapping file
     mapper = dict((a + ':' + b, a + ':' + c) for a, b, c in util.file.read_tabfile(args.rgMap))

diff --git a/requirements-conda.txt b/requirements-conda.txt
@@ -22,7 +22,7 @@ picard=2.18.9
 pigz=2.4
 prinseq=0.20.4
 samtools=1.7
-snpeff=4.1l
+snpeff=4.3.1t
 spades=3.11.1
 tbl2asn=25.6
 trimmomatic=0.38

diff --git a/test/__init__.py b/test/__init__.py
@@ -118,4 +118,4 @@ def inputs(self, *fnames):
 def assert_none_executable():
     testDir = os.path.dirname(__file__)
     assert all(not os.access(os.path.join(testDir, filename), os.X_OK) for filename in os.listdir(testDir)
-               if filename.endswith('.py'))
+               if filename.endswith('.py'))