diff --git a/Dockerfile b/Dockerfile index 78a486f..bb268c2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,7 +17,6 @@ RUN apt-get update && apt-get install -y \ RUN pip3 install pandas==0.24.2 RUN pip3 install pysam==0.15.3 - # Add mountpoint directory RUN mkdir /data # Stick to Jin's way of organizing the directory structure @@ -56,9 +55,9 @@ ENV PATH="/software/RSEM-1.2.31:${PATH}" RUN git clone https://github.com/ENCODE-DCC/kentutils_v385_bin_bulkrna.git ENV PATH=${PATH}:/software/kentutils_v385_bin_bulkrna/ -# Install qc-utils 19.8.1 +# Install qc-utils 19.8.1 and ptools -RUN pip3 install qc-utils==19.8.1 +RUN pip3 install qc-utils==19.8.1 ptools_bin==0.0.7 RUN mkdir -p rna-seq-pipeline/src COPY /src rna-seq-pipeline/src diff --git a/docs/reference.md b/docs/reference.md index d214db8..37c6400 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -9,6 +9,7 @@ This document contains more detailed information on the inputs, outputs and the [Inputs](reference.md#inputs) [Resource Considerations](reference.md#note-about-resources) [Outputs](reference.md#outputs) +[pBAM](reference.md#pBAM) ## Software @@ -157,6 +158,20 @@ Kallisto quantifier makes use of average fragment lenghts and standard deviation If you do not have this data available, or if you for some other reason want to omit running kallisto you can use the following parameter: * `rna.run_kallisto` Boolean defaulting to `true`. If set to `false` kallisto will not be run, and you do not need to provide values for any kallisto related parameters. + +## pBAM + +This pipeline offers an option to also output the alignments in privacy-preserving BAM, also known as pBAM format. For more details about the format, see [Cell article by Gursoy et al.](https://pubmed.ncbi.nlm.nih.gov/33186529/). Note, that also the usual BAM is output, and making sure only the privacy-preserving BAM is published is up to you. All the downstream operations (RSEM quantification, signal track generation, MAD QC and RNA QC) are based on pBAMs. Note that kallisto step, that is directly based on fastqs will not be changed when using this option, and kallisto quantifications might leak some information. Safest way to handle possible leakage from kallisto is to just skip it by setting `rna.run_kallisto` to `false`. + +#### Enabling the pBAM option +To enable pBAM producing the following additional inputs are required: + +* `rna.produce_pbams` needs to be set to `true`. +* `rna.reference_genome` The reference genome fasta compressed with gzip. The human reference genome used by ENCODE can be found [here](https://www.encodeproject.org/files/GRCh38_no_alt_analysis_set_GCA_000001405.15/). +* `rna.reference_transcriptome` The reference transcriptome fasta compressed with gzip. The version should match the genome annotation version. The human version 29 transcriptome corresponding to ENCODE4 data can be found [here](https://www.encodeproject.org/files/ENCFF088TTQ/). +* `rna.reference_annotations` An array of annotation gtf files compressed with gzip. Typically this would be [the genome annotation](https://www.encodeproject.org/files/gencode.v29.primary_assembly.annotation_UCSC_names/) and [tRNA annotation](https://www.encodeproject.org/files/gencode.v29.tRNAs/). + + ## Outputs `Cromwell`: `Cromwell` will store outputs for each task under directory `cromwell-executions/[WORKFLOW_ID]/call-[TASK_NAME]/shard-[IDX]`. For all tasks `[IDX]` means a zero-based index for each replicate. In addition to the actual pipeline outputs, these directories contain a plethora of operational Cromwell-specific files. Use [croo](https://github.com/ENCODE-DCC/croo) to find and organize the pipeline outputs. diff --git a/make_index_wdl/build_genome_index.wdl b/make_index_wdl/build_genome_index.wdl index 7971727..9b86881 100644 --- a/make_index_wdl/build_genome_index.wdl +++ b/make_index_wdl/build_genome_index.wdl @@ -2,13 +2,12 @@ version 1.0 # ENCODE DCC RNA-Seq pipeline build_genome_index -#CAPER docker encodedcc/rna-seq-pipeline:v1.2.1 -#CAPER singularity docker://encodedcc/rna-seq-pipeline:v1.2.1 - workflow build_index { meta { author: "Otto Jolanki" - version: "v1.2.1" + version: "1.2.4" + caper_docker: "encodedcc/rna-seq-pipeline:1.2.4" + caper_singularity: "encodedcc/rna-seq-pipeline:1.2.4" } input { diff --git a/make_index_wdl/merge_anno.wdl b/make_index_wdl/merge_anno.wdl index 9e0a2f5..002d061 100644 --- a/make_index_wdl/merge_anno.wdl +++ b/make_index_wdl/merge_anno.wdl @@ -2,13 +2,13 @@ version 1.0 #ENCODE DCC RNA-Seq pipeline merge-annotation -#CAPER docker encodedcc/rna-seq-pipeline:v1.2.1 -#CAPER singularity docker://encodedcc/rna-seq-pipeline:v1.2.1 workflow merge_anno { meta { author: "Otto Jolanki" - version: "v1.2.1" + version: "1.2.4" + caper_docker: "encodedcc/rna-seq-pipeline:1.2.4" + caper_singularity:"encodedcc/rna-seq-pipeline:1.2.4" } input { diff --git a/rna-seq-pipeline.wdl b/rna-seq-pipeline.wdl index 46d2af0..697b634 100644 --- a/rna-seq-pipeline.wdl +++ b/rna-seq-pipeline.wdl @@ -1,13 +1,16 @@ version 1.0 # ENCODE DCC RNA-seq pipeline +import "wdl/tasks/cat.wdl" +import "wdl/tasks/gzip.wdl" +import "wdl/tasks/pbam.wdl" workflow rna { meta { author: "Otto Jolanki" - version: "1.2.3" - caper_docker: "encodedcc/rna-seq-pipeline:1.2.3" - caper_singularity: "encodedcc/rna-seq-pipeline:v1.2.3" + version: "1.2.4" + caper_docker: "encodedcc/rna-seq-pipeline:1.2.4" + caper_singularity: "encodedcc/rna-seq-pipeline:1.2.4" croo_out_def: "https://storage.googleapis.com/encode-pipeline-output-definition/bulkrna.output_definition.json" } @@ -55,6 +58,17 @@ workflow rna { String? mad_qc_disk String? rna_qc_disk + # Following inputs should only be defined if you want to produce + # privacy preserving pBAM files, and use those for signal_track + # generation and quantification. + Boolean produce_pbams = false + File? reference_genome + File? reference_transcriptome + # Usually for ENCODE experiments this would be the usual annotation file + the tRNAs in gtf.gz format + Array[File] reference_annotations = [] + Int pbam_ncpus = 2 + Int pbam_ramGB = 6 + String pbam_disks = "local-disk 200 SSD" # These are for internal use, leave undefined Int? kallisto_fragment_length_undefined Float? kallisto_sd_undefined @@ -63,6 +77,23 @@ workflow rna { # dummy variable value for the single-ended case Array[Array[File]] fastqs_R2_ = if (endedness == "single") then fastqs_R1 else fastqs_R2 + if (produce_pbams) { + call cat.cat as combined_gtf_gz { input: + files=reference_annotations, + output_filename="combined_annotation.gtf.gz", + ncpus=2, + ramGB=4, + disks="local-disk 100 SSD", + } + call gzip.decompress as combined_gtf { input: + input_file=combined_gtf_gz.concatenated, + output_filename="combined_annotation.gtf", + ncpus=2, + ramGB=4, + disks="local-disk 100 SSD", + } + } + scatter (i in range(length(fastqs_R1))) { call align { input: endedness=endedness, @@ -89,8 +120,46 @@ workflow rna { disks=bam_to_signals_disk, } + if (produce_pbams) { + + call gzip.decompress as reference_genome_decompressed { input: + input_file = select_first([reference_genome]), + ncpus = 2, + ramGB = 4, + disks = "local-disk 40 SSD", + } + + call gzip.decompress as reference_transcriptome_decompressed { input: + input_file = select_first([reference_transcriptome]), + ncpus = 2, + ramGB = 4, + disks = "local-disk 40 SSD", + } + + call pbam.make_genome_pbam as genome_pbam { input: + bam=align.genomebam, + reference_genome=reference_genome_decompressed.out, + ncpus=pbam_ncpus, + ramGB=pbam_ramGB, + disks=pbam_disks, + } + + call pbam.make_transcriptome_pbam as anno_pbam { input: + bam = align.annobam, + reference_genome=reference_genome_decompressed.out, + reference_transcriptome=reference_transcriptome_decompressed.out, + reference_annotation=select_first([combined_gtf.out]), + ncpus=pbam_ncpus, + ramGB=pbam_ramGB, + disks=pbam_disks, + } + } + + File genome_alignment = select_first([genome_pbam.out, align.genomebam]) + File transcriptome_alignment = select_first([anno_pbam.out, align.annobam]) + call bam_to_signals { input: - input_bam=align.genomebam, + input_bam=genome_alignment, chrom_sizes=chrom_sizes, strandedness=strandedness, bamroot="rep"+(i+1)+bamroot+"_genome", @@ -102,7 +171,7 @@ workflow rna { call rsem_quant { input: rsem_index=rsem_index, rnd_seed=rnd_seed, - anno_bam=align.annobam, + anno_bam=transcriptome_alignment, endedness=endedness, read_strand=strandedness_direction, ncpus=rsem_ncpus, @@ -143,8 +212,11 @@ workflow rna { } scatter (i in range(length(align.annobam))) { + # if pbams are present they will be in the beginning and thus used, and if not the bams directly from alignment are used + Array[File] annobams = select_all(flatten([anno_pbam.out, align.annobam])) + call rna_qc { input: - input_bam=align.annobam[i], + input_bam=select_first([annobams[i]]), tr_id_to_gene_type_tsv=rna_qc_tr_id_to_gene_type_tsv, output_filename="rep"+(i+1)+bamroot+"_qc.json", disks=rna_qc_disk, diff --git a/wdl/tasks/cat.wdl b/wdl/tasks/cat.wdl new file mode 100644 index 0000000..5ecbff1 --- /dev/null +++ b/wdl/tasks/cat.wdl @@ -0,0 +1,28 @@ +version 1.0 + + +task cat { + input { + Array[File] files + String output_filename = "concatenated" + Int ncpus + Int ramGB + String disks + } + + command { + cat \ + ~{sep=" " files} \ + > ~{output_filename} + } + + output { + File concatenated = output_filename + } + + runtime { + cpu: ncpus + memory: "~{ramGB} GB" + disks: disks + } +} diff --git a/wdl/tasks/gzip.wdl b/wdl/tasks/gzip.wdl new file mode 100644 index 0000000..017aba4 --- /dev/null +++ b/wdl/tasks/gzip.wdl @@ -0,0 +1,30 @@ +version 1.0 + + +task decompress { + input { + File input_file + String output_filename = "out" + Int ncpus + Int ramGB + String disks + } + + command { + gzip \ + -d \ + -c \ + ~{input_file} \ + > ~{output_filename} + } + + output { + File out = output_filename + } + + runtime { + cpu: ncpus + memory: "~{ramGB} GB" + disks: disks + } +} diff --git a/wdl/tasks/pbam.wdl b/wdl/tasks/pbam.wdl new file mode 100644 index 0000000..e0d45da --- /dev/null +++ b/wdl/tasks/pbam.wdl @@ -0,0 +1,62 @@ +version 1.0 + + +task make_genome_pbam { + input { + File bam + File reference_genome + Int ncpus + Int ramGB + String disks + } + + String bam_base = basename(bam, ".bam") + + command { + makepBAM_genome.sh \ + ~{bam} \ + ~{reference_genome} + } + + output { + File out = "~{bam_base}.sorted.p.bam" + } + + runtime { + cpu: ncpus + memory: "~{ramGB} GB" + disks: disks + } +} + +task make_transcriptome_pbam { + input { + File bam + File reference_genome + File reference_transcriptome + File reference_annotation + Int ncpus + Int ramGB + String disks + } + + String bam_base = basename(bam, ".bam") + + command { + makepBAM_transcriptome.sh \ + ~{bam} \ + ~{reference_genome} \ + ~{reference_transcriptome} \ + ~{reference_annotation} + } + + output { + File out = "~{bam_base}.p.bam" + } + + runtime { + cpu: ncpus + memory: "~{ramGB} GB" + disks: disks + } +}