Merge pull request #800 from broadinstitute/dp-genbank

genbank and other new WDL workflows
broadinstitute · Mar 29, 2018 · f61c71c · f61c71c
2 parents 933667f + b660097
commit f61c71c
Show file tree

Hide file tree

Showing 12 changed files with 223 additions and 145 deletions.
diff --git a/ncbi.py b/ncbi.py
@@ -381,6 +381,7 @@ def fasta2fsa(infname, outdir, biosample=None):
 def make_structured_comment_file(cmt_fname, name=None, seq_tech=None, coverage=None):
     with open(cmt_fname, 'wt') as outf:
         outf.write("StructuredCommentPrefix\t##Genome-Assembly-Data-START##\n")
+        # note: the <tool name> v. <version name> format is required by NCBI, don't remove the " v. "
         outf.write("Assembly Method\tgithub.com/broadinstitute/viral-ngs v. {}\n".format(util.version.get_version()))
         if name:
             outf.write("Assembly Name\t{}\n".format(name))
@@ -448,7 +449,7 @@ def prep_genbank_files(templateFile, fasta_files, annotDir,
                             outf.write(inf.readline())
                             for line in inf:
                                 row = line.rstrip('\n').split('\t')
-                                if row[0] == sample_base:
+                                if row[0] == sample_base or row[0] == sample:
                                     row[0] = sample
                                     outf.write('\t'.join(row) + '\n')
 

diff --git a/pipes/WDL/workflows/align_and_annot.wdl b/pipes/WDL/workflows/align_and_annot.wdl
diff --git a/pipes/WDL/workflows/coverage_table.wdl b/pipes/WDL/workflows/coverage_table.wdl
@@ -0,0 +1,5 @@
+import "reports.wdl" as reports
+
+workflow coverage_table {
+  call reports.coverage_report
+}
diff --git a/pipes/WDL/workflows/genbank.wdl b/pipes/WDL/workflows/genbank.wdl
@@ -0,0 +1,29 @@
+import "interhost.wdl" as interhost
+import "ncbi.wdl" as ncbi
+
+workflow genbank {
+
+  File          reference_fasta
+  Array[File]+  assemblies_fasta     # one per genome
+  Array[File]+  ref_annotations_tbl  # one per chromosome
+
+  call interhost.multi_align_mafft_ref as mafft {
+    input:
+      reference_fasta = reference_fasta,
+      assemblies_fasta = assemblies_fasta
+  }
+
+  call ncbi.annot_transfer as annot {
+    input:
+      multi_aln_fasta = mafft.alignments_by_chr,
+      reference_fasta = reference_fasta,
+      reference_feature_table = ref_annotations_tbl
+  }
+
+  call ncbi.prepare_genbank as prep_genbank {
+    input:
+      assemblies_fasta = assemblies_fasta,
+      annotations_tbl = annot.transferred_feature_tables
+  }
+
+}
diff --git a/pipes/WDL/workflows/mafft.wdl b/pipes/WDL/workflows/mafft.wdl
@@ -0,0 +1,5 @@
+import "interhost.wdl" as interhost
+
+workflow mafft {
+  call interhost.multi_align_mafft
+}
diff --git a/pipes/WDL/workflows/merge_bams.wdl b/pipes/WDL/workflows/merge_bams.wdl
@@ -0,0 +1,5 @@
+import "demux.wdl" as demux
+
+workflow merge_bams {
+  call demux.merge_and_reheader_bams
+}
diff --git a/pipes/WDL/workflows/tasks/demux.wdl b/pipes/WDL/workflows/tasks/demux.wdl
@@ -107,3 +107,38 @@ task illumina_demux {
     preemptible: 0  # this is the very first operation before scatter, so let's get it done quickly & reliably
   }
 }
+
+task merge_and_reheader_bams {
+  Array[File]+  in_bams
+  File?         reheader_table # tsv with 3 cols: field, old value, new value
+  String        out_basename
+
+  command {
+    set -ex -o pipefail
+
+    if [ ${length(in_bams)} -gt 1 ]; then
+      read_utils.py merge_bams ${sep=' ' in_bams} merged.bam --loglevel DEBUG
+    else
+      echo "Skipping merge, only one input file"
+      ln -s ${select_first(in_bams)} merged.bam
+    fi    
+
+    if [ -f ${reheader_table} ]; then
+      read_utils.py merged.bam ${reheader_table} ${out_basename}.bam --loglevel DEBUG
+    else
+      echo "Skipping reheader, no mapping table specified"
+      ln -s merged.bam ${out_basename}.bam
+    fi
+  }
+
+  output {
+    File  out_bam = "${out_basename}.bam"
+  }
+
+  runtime {
+    docker: "quay.io/broadinstitute/viral-ngs"
+    memory: "2000 MB"
+    cpu: 2
+    dx_instance_type: "mem1_ssd2_x4"
+  }
+}
diff --git a/pipes/WDL/workflows/tasks/interhost.wdl b/pipes/WDL/workflows/tasks/interhost.wdl
@@ -1,92 +1,18 @@
-task ref_guided_consensus {
-  String sample
-
-  File referenceGenome #fasta
-  File inBam
-  # TODO: input settings must compute chr_names
-  Array[String] chrNames # sampleName-1, sampleName-2, ...
-
-  Int? threads
-  Int? minCoverage
-
-  String? novoalignOptions
-
-  command {
-    assembly.py refine_assembly \
-      "${referenceGenome}" \
-      "${inBam}" \
-      "${sample}.fasta" \
-      --outBam "${sample}.realigned.bam" \
-      --outVcf "${sample}.vcf.gz" \
-      "${'--min_coverage' + minCoverage || '3'}" \
-      "${'--novo_params' + novoalignOptions || '-r Random -l 40 -g 40 -x 20 -t 100'}" \
-      --keep_all_reads \
-      --chr_names "${sep=' ' chrNames+}" \
-      "${'--threads' + threads}"
-  }
-
-  output {
-    File assembly = "${sample}.fasta"
-    File realignedBam = "${sample}.realigned.bam"
-    File variantCalls = "${sample}.vcf.gz"
-  }
-  runtime {
-    memory: "4 GB"
-    docker: "quay.io/broadinstitute/viral-ngs"
-  }
-}
-
-task ref_guided_consensus_aligned_with_dups {
-  String sample
-
-  File realignedBam
-
-  command {
-    samtools view -b -F 4 -o "${sample}.realigned.only_aligned.bam" "${realignedBam}"
-  }
-
-  output {
-    File onlyAlignedReadsBam = "${sample}.realigned.only_aligned.bam"
-  }
-  runtime {
-    memory: "8 GB"
-    docker: "quay.io/broadinstitute/viral-ngs"
-  }
-}
-
-#task ref_guided_diversity {
-#  String sample
-#
-#  File assembly # .fasta
-#  File variantCalls # .vcf.gz
-#  File referenceGenome # .fasta, but also .fasta.fai and .dict
-#
-#  command {
-#    GenomeAnalysisTK.jar
-#      -T CombineVariants -R "${referenceGenome}" {inFilesString} -o {outFile}
-#      --genotypemergeoption UNIQUIFY
-#  }
-#
-#  output {
-#
-#  }
-#  runtime {
-#    docker: "quay.io/broadinstitute/viral-ngs"
-#  }
-#}
 
 task multi_align_mafft_ref {
   File           reference_fasta
   Array[File]+   assemblies_fasta # fasta files, one per sample, multiple chrs per file okay
-  String?        out_prefix = basename(reference_fasta, '.fasta')
+  String         out_prefix = basename(reference_fasta, '.fasta')
   Int?           mafft_maxIters
-  Int?           mafft_ep
+  Float?         mafft_ep
+  Float?         mafft_gapOpeningPenalty
 
   command {
     interhost.py multichr_mafft \
       ${reference_fasta} ${sep=' ' assemblies_fasta} \
       . \
       ${'--ep' + mafft_ep} \
+      ${'--gapOpeningPenalty' + mafft_gapOpeningPenalty} \
       ${'--maxiters' + mafft_maxIters} \
       --outFilePrefix ${out_prefix} \
       --preservecase \
@@ -96,29 +22,31 @@ task multi_align_mafft_ref {
   }
 
   output {
-    File        sampleNamesFile   = "${out_prefix}-sample_names.txt"
-    Array[File] alignments_by_chr = glob("${out_prefix}*.fasta")
+    File         sampleNamesFile   = "${out_prefix}-sample_names.txt"
+    Array[File]+ alignments_by_chr = glob("${out_prefix}*.fasta")
   }
 
   runtime {
     docker: "quay.io/broadinstitute/viral-ngs"
     memory: "7 GB"
-    cpu: 4
-    dx_instance_type: "mem1_ssd1_x4"
+    cpu: 8
+    dx_instance_type: "mem1_ssd1_x8"
   }
 }
 
 task multi_align_mafft {
   Array[File]+   assemblies_fasta # fasta files, one per sample, multiple chrs per file okay
   String         out_prefix
   Int?           mafft_maxIters
-  Int?           mafft_ep
+  Float?         mafft_ep
+  Float?         mafft_gapOpeningPenalty
 
   command {
     interhost.py multichr_mafft \
       ${sep=' ' assemblies_fasta} \
       . \
       ${'--ep' + mafft_ep} \
+      ${'--gapOpeningPenalty' + mafft_gapOpeningPenalty} \
       ${'--maxiters' + mafft_maxIters} \
       --outFilePrefix ${out_prefix} \
       --preservecase \
@@ -135,8 +63,8 @@ task multi_align_mafft {
   runtime {
     docker: "quay.io/broadinstitute/viral-ngs"
     memory: "7 GB"
-    cpu: 4
-    dx_instance_type: "mem1_ssd1_x4"
+    cpu: 8
+    dx_instance_type: "mem1_ssd1_x8"
   }
 }
 

diff --git a/pipes/WDL/workflows/tasks/ncbi.wdl b/pipes/WDL/workflows/tasks/ncbi.wdl
@@ -58,22 +58,31 @@ task download_annotations {
 }
 
 task annot_transfer {
-  File chr_mutli_aln_fasta # fasta; multiple alignments of sample sequences for a single chr
-  File reference_fasta # fasta (may contain multiple chrs, only one with the same name as reference_feature_table will be used)
-  File reference_feature_table # feature table corresponding to the chr in the alignment
+  Array[File]+ multi_aln_fasta         # fasta; multiple alignments of sample sequences for each chromosome
+  File         reference_fasta         # fasta; all chromosomes in one file
+  Array[File]+ reference_feature_table # tbl; feature table corresponding to each chromosome in the alignment
+
+  Array[Int]   chr_nums=range(length(multi_aln_fasta))
 
   command {
-    ncbi.py tbl_transfer_prealigned \
-        ${chr_mutli_aln_fasta} \
-        ${reference_fasta} \
-        ${reference_feature_table} \
-        . \
-        --oob_clip \
-        --loglevel DEBUG
+    set -ex -o pipefail
+    echo ${sep=' ' multi_aln_fasta} > alignments.txt
+    echo ${sep=' ' reference_feature_table} > tbls.txt
+    for i in ${sep=' ' chr_nums}; do
+      _alignment_fasta=`cat alignments.txt | cut -f $(($i+1)) -d ' '`
+      _feature_tbl=`cat tbls.txt | cut -f $(($i+1)) -d ' '`
+      ncbi.py tbl_transfer_prealigned \
+          $_alignment_fasta \
+          ${reference_fasta} \
+          $_feature_tbl \
+          . \
+          --oob_clip \
+          --loglevel DEBUG
+    done
   }
 
   output {
-    Array[File] transferred_feature_tables = glob("*.tbl")
+    Array[File]+ transferred_feature_tables = glob("*.tbl")
   }
   runtime {
     docker: "quay.io/broadinstitute/viral-ngs"
@@ -87,12 +96,13 @@ task prepare_genbank {
   Array[File]+ assemblies_fasta
   Array[File]+ annotations_tbl
   File         authors_sbt
-  File?        coverage_table # summary.assembly.txt (from Snakemake)
-  File?        genbankSourceTable
-  File?        biosampleMap
-  String?      sequencingTech
-  String?      comment
-  String       out_prefix = "ncbi_package"
+  File         biosampleMap
+  File         genbankSourceTable
+  File?        coverage_table # summary.assembly.txt (from Snakemake) -- change this to accept a list of mapped bam files and we can create this table ourselves
+  String       sequencingTech
+  String       comment # TO DO: make this optional
+  String       organism
+  String       molType = "cRNA"
 
   command {
     set -ex -o pipefail
@@ -101,19 +111,25 @@ task prepare_genbank {
         ${authors_sbt} \
         ${sep=' ' assemblies_fasta} \
         . \
-        ${'--master_source_table=' + genbankSourceTable} \
-        ${'--sequencing_tech=' + sequencingTech} \
-        ${'--biosample_map=' + biosampleMap} \
-        ${'--coverage_table=' + coverage_table} \
-        ${'--comment=' + comment} \
+        --mol_type ${molType} \
+        --organism "${organism}" \
+        --biosample_map ${biosampleMap} \
+        --master_source_table ${genbankSourceTable} \
+        ${'--coverage_table ' + coverage_table} \
+        --comment "${comment}" \
+        --sequencing_tech "${sequencingTech}" \
         --loglevel DEBUG
-    tar -czpvf ${out_prefix}.tar.gz *.val *.cmt *.fsa *.gbf *.sqn *.src *.tbl
+    mv errorsummary.val errorsummary.val.txt # to keep it separate from the glob
   }
 
   output {
     Array[File] sequin_files = glob("*.sqn")
-    File        ncbi_package = "${out_prefix}.tar.gz"
-    File        errorSummary = "errorsummary.val"
+    Array[File] structured_comment_files = glob("*.cmt")
+    Array[File] genbank_preview_files = glob("*.gbf")
+    Array[File] source_table_files = glob("*.src")
+    Array[File] fasta_per_chr_files = glob("*.fsa")
+    Array[File] validation_files = glob("*.val")
+    File        errorSummary = "errorsummary.val.txt"
   }
 
   runtime {
@@ -122,4 +138,4 @@ task prepare_genbank {
     cpu: 2
     dx_instance_type: "mem1_ssd1_x2"
   }
-}
+}