diff --git a/ncbi.py b/ncbi.py index 8db5bc229..e8a29e652 100755 --- a/ncbi.py +++ b/ncbi.py @@ -381,6 +381,7 @@ def fasta2fsa(infname, outdir, biosample=None): def make_structured_comment_file(cmt_fname, name=None, seq_tech=None, coverage=None): with open(cmt_fname, 'wt') as outf: outf.write("StructuredCommentPrefix\t##Genome-Assembly-Data-START##\n") + # note: the v. format is required by NCBI, don't remove the " v. " outf.write("Assembly Method\tgithub.com/broadinstitute/viral-ngs v. {}\n".format(util.version.get_version())) if name: outf.write("Assembly Name\t{}\n".format(name)) @@ -448,7 +449,7 @@ def prep_genbank_files(templateFile, fasta_files, annotDir, outf.write(inf.readline()) for line in inf: row = line.rstrip('\n').split('\t') - if row[0] == sample_base: + if row[0] == sample_base or row[0] == sample: row[0] = sample outf.write('\t'.join(row) + '\n') diff --git a/pipes/WDL/workflows/align_and_annot.wdl b/pipes/WDL/workflows/align_and_annot.wdl deleted file mode 100644 index 14939bf48..000000000 --- a/pipes/WDL/workflows/align_and_annot.wdl +++ /dev/null @@ -1,32 +0,0 @@ -import "interhost.wdl" as interhost -import "ncbi.wdl" as ncbi - -# DX_SKIP_WORKFLOW - -workflow align_and_annot { - - File reference_fasta - Array[File]+ assemblies_fasta - Array[File]+ annotations_tbl - - call interhost.multi_align_mafft_ref as mafft { - input: - reference_fasta = reference_fasta, - assemblies_fasta = assemblies_fasta - } - - scatter(by_chr in zip(mafft.alignments_by_chr, annotations_tbl)) { - call ncbi.annot_transfer as annot { - input: - chr_mutli_aln_fasta = by_chr.left, - reference_fasta = reference_fasta, - reference_feature_table = by_chr.right - } - call ncbi.prepare_genbank as genbank { - input: - assemblies_fasta = assemblies_fasta, - annotations_tbl = annot.transferred_feature_tables, - out_prefix = basename(by_chr.right, '.tbl') - } - } -} diff --git a/pipes/WDL/workflows/coverage_table.wdl b/pipes/WDL/workflows/coverage_table.wdl new file mode 100644 index 000000000..fe956bc93 --- /dev/null +++ b/pipes/WDL/workflows/coverage_table.wdl @@ -0,0 +1,5 @@ +import "reports.wdl" as reports + +workflow coverage_table { + call reports.coverage_report +} diff --git a/pipes/WDL/workflows/genbank.wdl b/pipes/WDL/workflows/genbank.wdl new file mode 100644 index 000000000..1b7ceae8d --- /dev/null +++ b/pipes/WDL/workflows/genbank.wdl @@ -0,0 +1,29 @@ +import "interhost.wdl" as interhost +import "ncbi.wdl" as ncbi + +workflow genbank { + + File reference_fasta + Array[File]+ assemblies_fasta # one per genome + Array[File]+ ref_annotations_tbl # one per chromosome + + call interhost.multi_align_mafft_ref as mafft { + input: + reference_fasta = reference_fasta, + assemblies_fasta = assemblies_fasta + } + + call ncbi.annot_transfer as annot { + input: + multi_aln_fasta = mafft.alignments_by_chr, + reference_fasta = reference_fasta, + reference_feature_table = ref_annotations_tbl + } + + call ncbi.prepare_genbank as prep_genbank { + input: + assemblies_fasta = assemblies_fasta, + annotations_tbl = annot.transferred_feature_tables + } + +} diff --git a/pipes/WDL/workflows/mafft.wdl b/pipes/WDL/workflows/mafft.wdl new file mode 100644 index 000000000..1111f5c69 --- /dev/null +++ b/pipes/WDL/workflows/mafft.wdl @@ -0,0 +1,5 @@ +import "interhost.wdl" as interhost + +workflow mafft { + call interhost.multi_align_mafft +} diff --git a/pipes/WDL/workflows/merge_bams.wdl b/pipes/WDL/workflows/merge_bams.wdl new file mode 100644 index 000000000..1e1a7105c --- /dev/null +++ b/pipes/WDL/workflows/merge_bams.wdl @@ -0,0 +1,5 @@ +import "demux.wdl" as demux + +workflow merge_bams { + call demux.merge_and_reheader_bams +} diff --git a/pipes/WDL/workflows/tasks/demux.wdl b/pipes/WDL/workflows/tasks/demux.wdl index 340879625..89c636a75 100644 --- a/pipes/WDL/workflows/tasks/demux.wdl +++ b/pipes/WDL/workflows/tasks/demux.wdl @@ -107,3 +107,38 @@ task illumina_demux { preemptible: 0 # this is the very first operation before scatter, so let's get it done quickly & reliably } } + +task merge_and_reheader_bams { + Array[File]+ in_bams + File? reheader_table # tsv with 3 cols: field, old value, new value + String out_basename + + command { + set -ex -o pipefail + + if [ ${length(in_bams)} -gt 1 ]; then + read_utils.py merge_bams ${sep=' ' in_bams} merged.bam --loglevel DEBUG + else + echo "Skipping merge, only one input file" + ln -s ${select_first(in_bams)} merged.bam + fi + + if [ -f ${reheader_table} ]; then + read_utils.py merged.bam ${reheader_table} ${out_basename}.bam --loglevel DEBUG + else + echo "Skipping reheader, no mapping table specified" + ln -s merged.bam ${out_basename}.bam + fi + } + + output { + File out_bam = "${out_basename}.bam" + } + + runtime { + docker: "quay.io/broadinstitute/viral-ngs" + memory: "2000 MB" + cpu: 2 + dx_instance_type: "mem1_ssd2_x4" + } +} diff --git a/pipes/WDL/workflows/tasks/interhost.wdl b/pipes/WDL/workflows/tasks/interhost.wdl index f4e1ce423..c7b2ccf1b 100644 --- a/pipes/WDL/workflows/tasks/interhost.wdl +++ b/pipes/WDL/workflows/tasks/interhost.wdl @@ -1,92 +1,18 @@ -task ref_guided_consensus { - String sample - - File referenceGenome #fasta - File inBam - # TODO: input settings must compute chr_names - Array[String] chrNames # sampleName-1, sampleName-2, ... - - Int? threads - Int? minCoverage - - String? novoalignOptions - - command { - assembly.py refine_assembly \ - "${referenceGenome}" \ - "${inBam}" \ - "${sample}.fasta" \ - --outBam "${sample}.realigned.bam" \ - --outVcf "${sample}.vcf.gz" \ - "${'--min_coverage' + minCoverage || '3'}" \ - "${'--novo_params' + novoalignOptions || '-r Random -l 40 -g 40 -x 20 -t 100'}" \ - --keep_all_reads \ - --chr_names "${sep=' ' chrNames+}" \ - "${'--threads' + threads}" - } - - output { - File assembly = "${sample}.fasta" - File realignedBam = "${sample}.realigned.bam" - File variantCalls = "${sample}.vcf.gz" - } - runtime { - memory: "4 GB" - docker: "quay.io/broadinstitute/viral-ngs" - } -} - -task ref_guided_consensus_aligned_with_dups { - String sample - - File realignedBam - - command { - samtools view -b -F 4 -o "${sample}.realigned.only_aligned.bam" "${realignedBam}" - } - - output { - File onlyAlignedReadsBam = "${sample}.realigned.only_aligned.bam" - } - runtime { - memory: "8 GB" - docker: "quay.io/broadinstitute/viral-ngs" - } -} - -#task ref_guided_diversity { -# String sample -# -# File assembly # .fasta -# File variantCalls # .vcf.gz -# File referenceGenome # .fasta, but also .fasta.fai and .dict -# -# command { -# GenomeAnalysisTK.jar -# -T CombineVariants -R "${referenceGenome}" {inFilesString} -o {outFile} -# --genotypemergeoption UNIQUIFY -# } -# -# output { -# -# } -# runtime { -# docker: "quay.io/broadinstitute/viral-ngs" -# } -#} task multi_align_mafft_ref { File reference_fasta Array[File]+ assemblies_fasta # fasta files, one per sample, multiple chrs per file okay - String? out_prefix = basename(reference_fasta, '.fasta') + String out_prefix = basename(reference_fasta, '.fasta') Int? mafft_maxIters - Int? mafft_ep + Float? mafft_ep + Float? mafft_gapOpeningPenalty command { interhost.py multichr_mafft \ ${reference_fasta} ${sep=' ' assemblies_fasta} \ . \ ${'--ep' + mafft_ep} \ + ${'--gapOpeningPenalty' + mafft_gapOpeningPenalty} \ ${'--maxiters' + mafft_maxIters} \ --outFilePrefix ${out_prefix} \ --preservecase \ @@ -96,15 +22,15 @@ task multi_align_mafft_ref { } output { - File sampleNamesFile = "${out_prefix}-sample_names.txt" - Array[File] alignments_by_chr = glob("${out_prefix}*.fasta") + File sampleNamesFile = "${out_prefix}-sample_names.txt" + Array[File]+ alignments_by_chr = glob("${out_prefix}*.fasta") } runtime { docker: "quay.io/broadinstitute/viral-ngs" memory: "7 GB" - cpu: 4 - dx_instance_type: "mem1_ssd1_x4" + cpu: 8 + dx_instance_type: "mem1_ssd1_x8" } } @@ -112,13 +38,15 @@ task multi_align_mafft { Array[File]+ assemblies_fasta # fasta files, one per sample, multiple chrs per file okay String out_prefix Int? mafft_maxIters - Int? mafft_ep + Float? mafft_ep + Float? mafft_gapOpeningPenalty command { interhost.py multichr_mafft \ ${sep=' ' assemblies_fasta} \ . \ ${'--ep' + mafft_ep} \ + ${'--gapOpeningPenalty' + mafft_gapOpeningPenalty} \ ${'--maxiters' + mafft_maxIters} \ --outFilePrefix ${out_prefix} \ --preservecase \ @@ -135,8 +63,8 @@ task multi_align_mafft { runtime { docker: "quay.io/broadinstitute/viral-ngs" memory: "7 GB" - cpu: 4 - dx_instance_type: "mem1_ssd1_x4" + cpu: 8 + dx_instance_type: "mem1_ssd1_x8" } } diff --git a/pipes/WDL/workflows/tasks/ncbi.wdl b/pipes/WDL/workflows/tasks/ncbi.wdl index 2005e05e2..0c46b29d3 100644 --- a/pipes/WDL/workflows/tasks/ncbi.wdl +++ b/pipes/WDL/workflows/tasks/ncbi.wdl @@ -58,22 +58,31 @@ task download_annotations { } task annot_transfer { - File chr_mutli_aln_fasta # fasta; multiple alignments of sample sequences for a single chr - File reference_fasta # fasta (may contain multiple chrs, only one with the same name as reference_feature_table will be used) - File reference_feature_table # feature table corresponding to the chr in the alignment + Array[File]+ multi_aln_fasta # fasta; multiple alignments of sample sequences for each chromosome + File reference_fasta # fasta; all chromosomes in one file + Array[File]+ reference_feature_table # tbl; feature table corresponding to each chromosome in the alignment + + Array[Int] chr_nums=range(length(multi_aln_fasta)) command { - ncbi.py tbl_transfer_prealigned \ - ${chr_mutli_aln_fasta} \ - ${reference_fasta} \ - ${reference_feature_table} \ - . \ - --oob_clip \ - --loglevel DEBUG + set -ex -o pipefail + echo ${sep=' ' multi_aln_fasta} > alignments.txt + echo ${sep=' ' reference_feature_table} > tbls.txt + for i in ${sep=' ' chr_nums}; do + _alignment_fasta=`cat alignments.txt | cut -f $(($i+1)) -d ' '` + _feature_tbl=`cat tbls.txt | cut -f $(($i+1)) -d ' '` + ncbi.py tbl_transfer_prealigned \ + $_alignment_fasta \ + ${reference_fasta} \ + $_feature_tbl \ + . \ + --oob_clip \ + --loglevel DEBUG + done } output { - Array[File] transferred_feature_tables = glob("*.tbl") + Array[File]+ transferred_feature_tables = glob("*.tbl") } runtime { docker: "quay.io/broadinstitute/viral-ngs" @@ -87,12 +96,13 @@ task prepare_genbank { Array[File]+ assemblies_fasta Array[File]+ annotations_tbl File authors_sbt - File? coverage_table # summary.assembly.txt (from Snakemake) - File? genbankSourceTable - File? biosampleMap - String? sequencingTech - String? comment - String out_prefix = "ncbi_package" + File biosampleMap + File genbankSourceTable + File? coverage_table # summary.assembly.txt (from Snakemake) -- change this to accept a list of mapped bam files and we can create this table ourselves + String sequencingTech + String comment # TO DO: make this optional + String organism + String molType = "cRNA" command { set -ex -o pipefail @@ -101,19 +111,25 @@ task prepare_genbank { ${authors_sbt} \ ${sep=' ' assemblies_fasta} \ . \ - ${'--master_source_table=' + genbankSourceTable} \ - ${'--sequencing_tech=' + sequencingTech} \ - ${'--biosample_map=' + biosampleMap} \ - ${'--coverage_table=' + coverage_table} \ - ${'--comment=' + comment} \ + --mol_type ${molType} \ + --organism "${organism}" \ + --biosample_map ${biosampleMap} \ + --master_source_table ${genbankSourceTable} \ + ${'--coverage_table ' + coverage_table} \ + --comment "${comment}" \ + --sequencing_tech "${sequencingTech}" \ --loglevel DEBUG - tar -czpvf ${out_prefix}.tar.gz *.val *.cmt *.fsa *.gbf *.sqn *.src *.tbl + mv errorsummary.val errorsummary.val.txt # to keep it separate from the glob } output { Array[File] sequin_files = glob("*.sqn") - File ncbi_package = "${out_prefix}.tar.gz" - File errorSummary = "errorsummary.val" + Array[File] structured_comment_files = glob("*.cmt") + Array[File] genbank_preview_files = glob("*.gbf") + Array[File] source_table_files = glob("*.src") + Array[File] fasta_per_chr_files = glob("*.fsa") + Array[File] validation_files = glob("*.val") + File errorSummary = "errorsummary.val.txt" } runtime { @@ -122,4 +138,4 @@ task prepare_genbank { cpu: 2 dx_instance_type: "mem1_ssd1_x2" } -} \ No newline at end of file +} diff --git a/pipes/WDL/workflows/tasks/reports.wdl b/pipes/WDL/workflows/tasks/reports.wdl index 1f6c7f3cc..c431c1728 100644 --- a/pipes/WDL/workflows/tasks/reports.wdl +++ b/pipes/WDL/workflows/tasks/reports.wdl @@ -97,6 +97,31 @@ task plot_coverage { } +task coverage_report { + Array[File]+ mapped_bams + Array[File] mapped_bam_idx # optional.. speeds it up if you provide it, otherwise we auto-index + String out_report_name="coverage_report.txt" + + command { + reports.py coverage_only \ + ${sep=' ' mapped_bams} \ + ${out_report_name} \ + --loglevel DEBUG + } + + output { + File coverage_report = "${out_report_name}" + } + + runtime { + docker: "quay.io/broadinstitute/viral-ngs" + memory: "2000 MB" + cpu: 2 + dx_instance_type: "mem1_ssd2_x4" + } +} + + task fastqc { File reads_bam diff --git a/reports.py b/reports.py index 6a112f693..3594b561f 100755 --- a/reports.py +++ b/reports.py @@ -133,6 +133,19 @@ def get_assembly_stats(sample, return (header, out) +def genome_coverage_stats_only(mapped_bam, chr_name=None, cov_thresholds=(1, 5, 20, 100)): + out = {} + with pysam.AlignmentFile(mapped_bam, 'rb') as bam: + coverages = list([pcol.nsegments for pcol in bam.pileup(chr_name)]) + if coverages: + out['aln2self_cov_median'] = median(coverages) + out['aln2self_cov_mean'] = "%0.3f" % mean(coverages) + out['aln2self_cov_mean_non0'] = "%0.3f" % mean([n for n in coverages if n > 0]) + for thresh in cov_thresholds: + out['aln2self_cov_%dX' % thresh] = sum(1 for n in coverages if n >= thresh) + return out + + def assembly_stats(samples, outFile, cov_thresholds, assembly_dir, assembly_tmp, align_dir, reads_dir, raw_reads_dir): ''' Fetch assembly-level statistics for a given sample ''' header_written = False @@ -183,6 +196,54 @@ def parser_assembly_stats(parser=argparse.ArgumentParser()): __commands__.append(('assembly_stats', parser_assembly_stats)) +def _get_samples_from_bam(bam): + with pysam.AlignmentFile(bam) as af: + return set(rg['SM'] for rg in af.header['RG']) +def _get_chrs_from_bam(bam): + with pysam.AlignmentFile(bam) as af: + return list(sq['SN'] for sq in af.header['SQ']) + +def parser_coverage_only(parser=argparse.ArgumentParser()): + parser.add_argument('mapped_bams', nargs='+', help='Aligned-to-self mapped bam files.') + parser.add_argument('out_report', help='Output report file.') + parser.add_argument('--cov_thresholds', + nargs='+', + type=int, + default=(1, 5, 20, 100), + help='Genome coverage thresholds to report on. (default: %(default)s)') + util.cmd.common_args(parser, (('loglevel', None), ('version', None))) + util.cmd.attach_main(parser, coverage_only, split_args=True) + return parser +def coverage_only(mapped_bams, out_report, cov_thresholds=(1, 5, 20, 100)): + header = ['sample','aln2self_cov_median', 'aln2self_cov_mean', 'aln2self_cov_mean_non0'] + header += ['aln2self_cov_%dX' % thresh for thresh in cov_thresholds] + with open(out_report, 'wt') as outf: + outf.write('\t'.join(header)+'\n') + for bam in mapped_bams: + # check for index and auto-create if needed + with pysam.AlignmentFile(bam) as af: + is_indexed = af.has_index() + if not is_indexed: + pysam.index(bam) + # get unique sample name + samples = _get_samples_from_bam(bam) + if len(samples) != 1: + raise Exception("input bam file {} has {} unique samples: {} (require one unique sample)".format(bam, len(samples), str(samples))) + sample_name = samples.pop() + # get and write coverage stats + row = genome_coverage_stats_only(bam, cov_thresholds=cov_thresholds) + row['sample'] = sample_name + outf.write('\t'.join([str(row.get(h,'')) for h in header])+'\n') + # for multi-seg genomes, also do per-chr stats + chrs = _get_chrs_from_bam(bam) + if len(chrs) > 1: + for i in range(len(chrs)): + row = genome_coverage_stats_only(bam, chr_name=chrs[i], cov_thresholds=cov_thresholds) + row['sample'] = "{}-{}".format(sample_name, i+1) + outf.write('\t'.join([str(row.get(h,'')) for h in header])+'\n') +__commands__.append(('coverage_only', parser_coverage_only)) + + def alignment_summary(inFastaFileOne, inFastaFileTwo, outfileName=None, printCounts=False): """ Write or print pairwise alignment summary information for sequences in two FASTA files, including SNPs, ambiguous bases, and indels. diff --git a/tools/mafft.py b/tools/mafft.py index 830a79e2d..97191e82b 100644 --- a/tools/mafft.py +++ b/tools/mafft.py @@ -106,7 +106,7 @@ def execute( if not (retree or localpair or globalpair): tool_cmd.append("--auto") - tool_cmd.extend(["--thread", "{}".format(util.misc.sanitize_thread_count(threads, tool_max_cores_value=-1))]) + tool_cmd.extend(["--thread", "{}".format(util.misc.sanitize_thread_count(threads))]) if localpair and globalpair: raise Exception("Alignment type must be either local or global, not both.")