Skip to content

Commit

Permalink
Merge pull request #800 from broadinstitute/dp-genbank
Browse files Browse the repository at this point in the history
genbank and other new WDL workflows
  • Loading branch information
dpark01 authored Mar 29, 2018
2 parents 933667f + b660097 commit f61c71c
Show file tree
Hide file tree
Showing 12 changed files with 223 additions and 145 deletions.
3 changes: 2 additions & 1 deletion ncbi.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,7 @@ def fasta2fsa(infname, outdir, biosample=None):
def make_structured_comment_file(cmt_fname, name=None, seq_tech=None, coverage=None):
with open(cmt_fname, 'wt') as outf:
outf.write("StructuredCommentPrefix\t##Genome-Assembly-Data-START##\n")
# note: the <tool name> v. <version name> format is required by NCBI, don't remove the " v. "
outf.write("Assembly Method\tgithub.com/broadinstitute/viral-ngs v. {}\n".format(util.version.get_version()))
if name:
outf.write("Assembly Name\t{}\n".format(name))
Expand Down Expand Up @@ -448,7 +449,7 @@ def prep_genbank_files(templateFile, fasta_files, annotDir,
outf.write(inf.readline())
for line in inf:
row = line.rstrip('\n').split('\t')
if row[0] == sample_base:
if row[0] == sample_base or row[0] == sample:
row[0] = sample
outf.write('\t'.join(row) + '\n')

Expand Down
32 changes: 0 additions & 32 deletions pipes/WDL/workflows/align_and_annot.wdl

This file was deleted.

5 changes: 5 additions & 0 deletions pipes/WDL/workflows/coverage_table.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import "reports.wdl" as reports

workflow coverage_table {
call reports.coverage_report
}
29 changes: 29 additions & 0 deletions pipes/WDL/workflows/genbank.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import "interhost.wdl" as interhost
import "ncbi.wdl" as ncbi

workflow genbank {

File reference_fasta
Array[File]+ assemblies_fasta # one per genome
Array[File]+ ref_annotations_tbl # one per chromosome
call interhost.multi_align_mafft_ref as mafft {
input:
reference_fasta = reference_fasta,
assemblies_fasta = assemblies_fasta
}

call ncbi.annot_transfer as annot {
input:
multi_aln_fasta = mafft.alignments_by_chr,
reference_fasta = reference_fasta,
reference_feature_table = ref_annotations_tbl
}

call ncbi.prepare_genbank as prep_genbank {
input:
assemblies_fasta = assemblies_fasta,
annotations_tbl = annot.transferred_feature_tables
}

}
5 changes: 5 additions & 0 deletions pipes/WDL/workflows/mafft.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import "interhost.wdl" as interhost

workflow mafft {
call interhost.multi_align_mafft
}
5 changes: 5 additions & 0 deletions pipes/WDL/workflows/merge_bams.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import "demux.wdl" as demux

workflow merge_bams {
call demux.merge_and_reheader_bams
}
35 changes: 35 additions & 0 deletions pipes/WDL/workflows/tasks/demux.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,38 @@ task illumina_demux {
preemptible: 0 # this is the very first operation before scatter, so let's get it done quickly & reliably
}
}

task merge_and_reheader_bams {
Array[File]+ in_bams
File? reheader_table # tsv with 3 cols: field, old value, new value
String out_basename

command {
set -ex -o pipefail

if [ ${length(in_bams)} -gt 1 ]; then
read_utils.py merge_bams ${sep=' ' in_bams} merged.bam --loglevel DEBUG
else
echo "Skipping merge, only one input file"
ln -s ${select_first(in_bams)} merged.bam
fi

if [ -f ${reheader_table} ]; then
read_utils.py merged.bam ${reheader_table} ${out_basename}.bam --loglevel DEBUG
else
echo "Skipping reheader, no mapping table specified"
ln -s merged.bam ${out_basename}.bam
fi
}

output {
File out_bam = "${out_basename}.bam"
}

runtime {
docker: "quay.io/broadinstitute/viral-ngs"
memory: "2000 MB"
cpu: 2
dx_instance_type: "mem1_ssd2_x4"
}
}
98 changes: 13 additions & 85 deletions pipes/WDL/workflows/tasks/interhost.wdl
Original file line number Diff line number Diff line change
@@ -1,92 +1,18 @@
task ref_guided_consensus {
String sample

File referenceGenome #fasta
File inBam
# TODO: input settings must compute chr_names
Array[String] chrNames # sampleName-1, sampleName-2, ...
Int? threads
Int? minCoverage

String? novoalignOptions

command {
assembly.py refine_assembly \
"${referenceGenome}" \
"${inBam}" \
"${sample}.fasta" \
--outBam "${sample}.realigned.bam" \
--outVcf "${sample}.vcf.gz" \
"${'--min_coverage' + minCoverage || '3'}" \
"${'--novo_params' + novoalignOptions || '-r Random -l 40 -g 40 -x 20 -t 100'}" \
--keep_all_reads \
--chr_names "${sep=' ' chrNames+}" \
"${'--threads' + threads}"
}

output {
File assembly = "${sample}.fasta"
File realignedBam = "${sample}.realigned.bam"
File variantCalls = "${sample}.vcf.gz"
}
runtime {
memory: "4 GB"
docker: "quay.io/broadinstitute/viral-ngs"
}
}

task ref_guided_consensus_aligned_with_dups {
String sample

File realignedBam

command {
samtools view -b -F 4 -o "${sample}.realigned.only_aligned.bam" "${realignedBam}"
}

output {
File onlyAlignedReadsBam = "${sample}.realigned.only_aligned.bam"
}
runtime {
memory: "8 GB"
docker: "quay.io/broadinstitute/viral-ngs"
}
}

#task ref_guided_diversity {
# String sample
#
# File assembly # .fasta
# File variantCalls # .vcf.gz
# File referenceGenome # .fasta, but also .fasta.fai and .dict
#
# command {
# GenomeAnalysisTK.jar
# -T CombineVariants -R "${referenceGenome}" {inFilesString} -o {outFile}
# --genotypemergeoption UNIQUIFY
# }
#
# output {
#
# }
# runtime {
# docker: "quay.io/broadinstitute/viral-ngs"
# }
#}

task multi_align_mafft_ref {
File reference_fasta
Array[File]+ assemblies_fasta # fasta files, one per sample, multiple chrs per file okay
String? out_prefix = basename(reference_fasta, '.fasta')
String out_prefix = basename(reference_fasta, '.fasta')
Int? mafft_maxIters
Int? mafft_ep
Float? mafft_ep
Float? mafft_gapOpeningPenalty

command {
interhost.py multichr_mafft \
${reference_fasta} ${sep=' ' assemblies_fasta} \
. \
${'--ep' + mafft_ep} \
${'--gapOpeningPenalty' + mafft_gapOpeningPenalty} \
${'--maxiters' + mafft_maxIters} \
--outFilePrefix ${out_prefix} \
--preservecase \
Expand All @@ -96,29 +22,31 @@ task multi_align_mafft_ref {
}

output {
File sampleNamesFile = "${out_prefix}-sample_names.txt"
Array[File] alignments_by_chr = glob("${out_prefix}*.fasta")
File sampleNamesFile = "${out_prefix}-sample_names.txt"
Array[File]+ alignments_by_chr = glob("${out_prefix}*.fasta")
}

runtime {
docker: "quay.io/broadinstitute/viral-ngs"
memory: "7 GB"
cpu: 4
dx_instance_type: "mem1_ssd1_x4"
cpu: 8
dx_instance_type: "mem1_ssd1_x8"
}
}

task multi_align_mafft {
Array[File]+ assemblies_fasta # fasta files, one per sample, multiple chrs per file okay
String out_prefix
Int? mafft_maxIters
Int? mafft_ep
Float? mafft_ep
Float? mafft_gapOpeningPenalty

command {
interhost.py multichr_mafft \
${sep=' ' assemblies_fasta} \
. \
${'--ep' + mafft_ep} \
${'--gapOpeningPenalty' + mafft_gapOpeningPenalty} \
${'--maxiters' + mafft_maxIters} \
--outFilePrefix ${out_prefix} \
--preservecase \
Expand All @@ -135,8 +63,8 @@ task multi_align_mafft {
runtime {
docker: "quay.io/broadinstitute/viral-ngs"
memory: "7 GB"
cpu: 4
dx_instance_type: "mem1_ssd1_x4"
cpu: 8
dx_instance_type: "mem1_ssd1_x8"
}
}

Expand Down
68 changes: 42 additions & 26 deletions pipes/WDL/workflows/tasks/ncbi.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -58,22 +58,31 @@ task download_annotations {
}

task annot_transfer {
File chr_mutli_aln_fasta # fasta; multiple alignments of sample sequences for a single chr
File reference_fasta # fasta (may contain multiple chrs, only one with the same name as reference_feature_table will be used)
File reference_feature_table # feature table corresponding to the chr in the alignment
Array[File]+ multi_aln_fasta # fasta; multiple alignments of sample sequences for each chromosome
File reference_fasta # fasta; all chromosomes in one file
Array[File]+ reference_feature_table # tbl; feature table corresponding to each chromosome in the alignment
Array[Int] chr_nums=range(length(multi_aln_fasta))

command {
ncbi.py tbl_transfer_prealigned \
${chr_mutli_aln_fasta} \
${reference_fasta} \
${reference_feature_table} \
. \
--oob_clip \
--loglevel DEBUG
set -ex -o pipefail
echo ${sep=' ' multi_aln_fasta} > alignments.txt
echo ${sep=' ' reference_feature_table} > tbls.txt
for i in ${sep=' ' chr_nums}; do
_alignment_fasta=`cat alignments.txt | cut -f $(($i+1)) -d ' '`
_feature_tbl=`cat tbls.txt | cut -f $(($i+1)) -d ' '`
ncbi.py tbl_transfer_prealigned \
$_alignment_fasta \
${reference_fasta} \
$_feature_tbl \
. \
--oob_clip \
--loglevel DEBUG
done
}

output {
Array[File] transferred_feature_tables = glob("*.tbl")
Array[File]+ transferred_feature_tables = glob("*.tbl")
}
runtime {
docker: "quay.io/broadinstitute/viral-ngs"
Expand All @@ -87,12 +96,13 @@ task prepare_genbank {
Array[File]+ assemblies_fasta
Array[File]+ annotations_tbl
File authors_sbt
File? coverage_table # summary.assembly.txt (from Snakemake)
File? genbankSourceTable
File? biosampleMap
String? sequencingTech
String? comment
String out_prefix = "ncbi_package"
File biosampleMap
File genbankSourceTable
File? coverage_table # summary.assembly.txt (from Snakemake) -- change this to accept a list of mapped bam files and we can create this table ourselves
String sequencingTech
String comment # TO DO: make this optional
String organism
String molType = "cRNA"

command {
set -ex -o pipefail
Expand All @@ -101,19 +111,25 @@ task prepare_genbank {
${authors_sbt} \
${sep=' ' assemblies_fasta} \
. \
${'--master_source_table=' + genbankSourceTable} \
${'--sequencing_tech=' + sequencingTech} \
${'--biosample_map=' + biosampleMap} \
${'--coverage_table=' + coverage_table} \
${'--comment=' + comment} \
--mol_type ${molType} \
--organism "${organism}" \
--biosample_map ${biosampleMap} \
--master_source_table ${genbankSourceTable} \
${'--coverage_table ' + coverage_table} \
--comment "${comment}" \
--sequencing_tech "${sequencingTech}" \
--loglevel DEBUG
tar -czpvf ${out_prefix}.tar.gz *.val *.cmt *.fsa *.gbf *.sqn *.src *.tbl
mv errorsummary.val errorsummary.val.txt # to keep it separate from the glob
}

output {
Array[File] sequin_files = glob("*.sqn")
File ncbi_package = "${out_prefix}.tar.gz"
File errorSummary = "errorsummary.val"
Array[File] structured_comment_files = glob("*.cmt")
Array[File] genbank_preview_files = glob("*.gbf")
Array[File] source_table_files = glob("*.src")
Array[File] fasta_per_chr_files = glob("*.fsa")
Array[File] validation_files = glob("*.val")
File errorSummary = "errorsummary.val.txt"
}

runtime {
Expand All @@ -122,4 +138,4 @@ task prepare_genbank {
cpu: 2
dx_instance_type: "mem1_ssd1_x2"
}
}
}
Loading

0 comments on commit f61c71c

Please sign in to comment.