PIP-1583-pbam

ENCODE-DCC · Jul 26, 2021 · 375bc7e · 375bc7e
1 parent f60fa59
commit 375bc7e
Show file tree

Hide file tree

Showing 8 changed files with 221 additions and 16 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -17,7 +17,6 @@ RUN apt-get update && apt-get install -y \
 
 RUN pip3 install pandas==0.24.2
 RUN pip3 install pysam==0.15.3
-
 # Add mountpoint directory
 RUN mkdir /data
 # Stick to Jin's way of organizing the directory structure
@@ -56,9 +55,9 @@ ENV PATH="/software/RSEM-1.2.31:${PATH}"
 RUN git clone https://github.com/ENCODE-DCC/kentutils_v385_bin_bulkrna.git
 ENV PATH=${PATH}:/software/kentutils_v385_bin_bulkrna/
 
-# Install qc-utils 19.8.1
+# Install qc-utils 19.8.1 and ptools
 
-RUN pip3 install qc-utils==19.8.1
+RUN pip3 install qc-utils==19.8.1 ptools_bin==0.0.7
 
 RUN mkdir -p rna-seq-pipeline/src
 COPY /src rna-seq-pipeline/src

diff --git a/docs/reference.md b/docs/reference.md
@@ -9,6 +9,7 @@ This document contains more detailed information on the inputs, outputs and the
 [Inputs](reference.md#inputs)  
 [Resource Considerations](reference.md#note-about-resources)  
 [Outputs](reference.md#outputs)  
+[pBAM](reference.md#pBAM)
 
 ## Software
 
@@ -157,6 +158,20 @@ Kallisto quantifier makes use of average fragment lenghts and standard deviation
 If you do not have this data available, or if you for some other reason want to omit running kallisto you can use the following parameter:
 
 * `rna.run_kallisto` Boolean defaulting to `true`. If set to `false` kallisto will not be run, and you do not need to provide values for any kallisto related parameters.
+
+## pBAM
+
+This pipeline offers an option to also output the alignments in privacy-preserving BAM, also known as pBAM format. For more details about the format, see [Cell article by Gursoy et al.](https://pubmed.ncbi.nlm.nih.gov/33186529/). Note, that also the usual BAM is output, and making sure only the privacy-preserving BAM is published is up to you. All the downstream operations (RSEM quantification, signal track generation, MAD QC and RNA QC) are based on pBAMs. Note that kallisto step, that is directly based on fastqs will not be changed when using this option, and kallisto quantifications might leak some information. Safest way to handle possible leakage from kallisto is to just skip it by setting `rna.run_kallisto` to `false`.
+
+#### Enabling the pBAM option
+To enable pBAM producing the following additional inputs are required:
+
+* `rna.produce_pbams` needs to be set to `true`.
+* `rna.reference_genome` The reference genome fasta compressed with gzip. The human reference genome used by ENCODE can be found [here](https://www.encodeproject.org/files/GRCh38_no_alt_analysis_set_GCA_000001405.15/).
+* `rna.reference_transcriptome` The reference transcriptome fasta compressed with gzip. The version should match the genome annotation version. The human version 29 transcriptome corresponding to ENCODE4 data can be found [here](https://www.encodeproject.org/files/ENCFF088TTQ/).
+* `rna.reference_annotations` An array of annotation gtf files compressed with gzip. Typically this would be [the genome annotation](https://www.encodeproject.org/files/gencode.v29.primary_assembly.annotation_UCSC_names/) and [tRNA annotation](https://www.encodeproject.org/files/gencode.v29.tRNAs/).
+
+
 ## Outputs
 
 `Cromwell`: `Cromwell` will store outputs for each task under directory `cromwell-executions/[WORKFLOW_ID]/call-[TASK_NAME]/shard-[IDX]`. For all tasks `[IDX]` means a zero-based index for each replicate. In addition to the actual pipeline outputs, these directories contain a plethora of operational Cromwell-specific files. Use [croo](https://github.com/ENCODE-DCC/croo) to find and organize the pipeline outputs.

diff --git a/make_index_wdl/build_genome_index.wdl b/make_index_wdl/build_genome_index.wdl
@@ -2,13 +2,12 @@ version 1.0
 
 # ENCODE DCC RNA-Seq pipeline build_genome_index
 
-#CAPER docker encodedcc/rna-seq-pipeline:v1.2.1
-#CAPER singularity docker://encodedcc/rna-seq-pipeline:v1.2.1
-
 workflow build_index {
     meta {
         author: "Otto Jolanki"
-        version: "v1.2.1"
+        version: "1.2.4"
+        caper_docker: "encodedcc/rna-seq-pipeline:1.2.4"
+        caper_singularity: "encodedcc/rna-seq-pipeline:1.2.4"
     }
 
     input {

diff --git a/make_index_wdl/merge_anno.wdl b/make_index_wdl/merge_anno.wdl
@@ -2,13 +2,13 @@ version 1.0
 
 #ENCODE DCC RNA-Seq pipeline merge-annotation
 
-#CAPER docker encodedcc/rna-seq-pipeline:v1.2.1
-#CAPER singularity docker://encodedcc/rna-seq-pipeline:v1.2.1
 
 workflow merge_anno {
     meta {
         author: "Otto Jolanki"
-        version: "v1.2.1"
+        version: "1.2.4"
+        caper_docker: "encodedcc/rna-seq-pipeline:1.2.4"
+        caper_singularity:"encodedcc/rna-seq-pipeline:1.2.4"
     }
 
     input {

diff --git a/rna-seq-pipeline.wdl b/rna-seq-pipeline.wdl
@@ -1,13 +1,16 @@
 version 1.0
 
 # ENCODE DCC RNA-seq pipeline
+import "wdl/tasks/cat.wdl"
+import "wdl/tasks/gzip.wdl"
+import "wdl/tasks/pbam.wdl"
 
 workflow rna {
     meta {
         author: "Otto Jolanki"
-        version: "1.2.3"
-        caper_docker: "encodedcc/rna-seq-pipeline:1.2.3"
-        caper_singularity: "encodedcc/rna-seq-pipeline:v1.2.3"
+        version: "1.2.4"
+        caper_docker: "encodedcc/rna-seq-pipeline:1.2.4"
+        caper_singularity: "encodedcc/rna-seq-pipeline:1.2.4"
         croo_out_def: "https://storage.googleapis.com/encode-pipeline-output-definition/bulkrna.output_definition.json"
     }
 
@@ -55,6 +58,17 @@ workflow rna {
         String? mad_qc_disk
         String? rna_qc_disk
 
+        # Following inputs should only be defined if you want to produce
+        # privacy preserving pBAM files, and use those for signal_track
+        # generation and quantification.
+        Boolean produce_pbams = false
+        File? reference_genome
+        File? reference_transcriptome
+        # Usually for ENCODE experiments this would be the usual annotation file + the tRNAs in gtf.gz format
+        Array[File] reference_annotations = []
+        Int pbam_ncpus = 2
+        Int pbam_ramGB = 6
+        String pbam_disks = "local-disk 200 SSD"
         # These are for internal use, leave undefined
         Int? kallisto_fragment_length_undefined
         Float? kallisto_sd_undefined
@@ -63,6 +77,23 @@ workflow rna {
     # dummy variable value for the single-ended case
     Array[Array[File]] fastqs_R2_ = if (endedness == "single") then fastqs_R1 else fastqs_R2
 
+    if (produce_pbams) {
+        call cat.cat as combined_gtf_gz { input:
+            files=reference_annotations,
+            output_filename="combined_annotation.gtf.gz",
+            ncpus=2,
+            ramGB=4,
+            disks="local-disk 100 SSD",
+        }
+        call gzip.decompress as combined_gtf { input:
+            input_file=combined_gtf_gz.concatenated,
+            output_filename="combined_annotation.gtf",
+            ncpus=2,
+            ramGB=4,
+            disks="local-disk 100 SSD",
+        }
+    }
+
     scatter (i in range(length(fastqs_R1))) {
         call align { input:
             endedness=endedness,
@@ -89,8 +120,46 @@ workflow rna {
             disks=bam_to_signals_disk,
         }
 
+        if (produce_pbams) {
+
+            call gzip.decompress as reference_genome_decompressed { input:
+                input_file = select_first([reference_genome]),
+                ncpus = 2,
+                ramGB = 4,
+                disks = "local-disk 40 SSD",
+            }
+
+            call gzip.decompress as reference_transcriptome_decompressed { input:
+                input_file = select_first([reference_transcriptome]),
+                ncpus = 2,
+                ramGB = 4,
+                disks = "local-disk 40 SSD",
+            }
+
+            call pbam.make_genome_pbam as genome_pbam { input:
+                bam=align.genomebam,
+                reference_genome=reference_genome_decompressed.out,
+                ncpus=pbam_ncpus,
+                ramGB=pbam_ramGB,
+                disks=pbam_disks,
+            }
+
+            call pbam.make_transcriptome_pbam as anno_pbam { input:
+                bam = align.annobam,
+                reference_genome=reference_genome_decompressed.out,
+                reference_transcriptome=reference_transcriptome_decompressed.out,
+                reference_annotation=select_first([combined_gtf.out]),
+                ncpus=pbam_ncpus,
+                ramGB=pbam_ramGB,
+                disks=pbam_disks,
+            }
+        }
+
+        File genome_alignment = select_first([genome_pbam.out, align.genomebam])
+        File transcriptome_alignment = select_first([anno_pbam.out, align.annobam])
+
         call bam_to_signals { input:
-            input_bam=align.genomebam,
+            input_bam=genome_alignment,
             chrom_sizes=chrom_sizes,
             strandedness=strandedness,
             bamroot="rep"+(i+1)+bamroot+"_genome",
@@ -102,7 +171,7 @@ workflow rna {
         call rsem_quant { input:
             rsem_index=rsem_index,
             rnd_seed=rnd_seed,
-            anno_bam=align.annobam,
+            anno_bam=transcriptome_alignment,
             endedness=endedness,
             read_strand=strandedness_direction,
             ncpus=rsem_ncpus,
@@ -143,8 +212,11 @@ workflow rna {
     }
 
     scatter (i in range(length(align.annobam))) {
+        # if pbams are present they will be in the beginning and thus used, and if not the bams directly from alignment are used
+        Array[File] annobams = select_all(flatten([anno_pbam.out, align.annobam]))
+
         call rna_qc { input:
-            input_bam=align.annobam[i],
+            input_bam=select_first([annobams[i]]),
             tr_id_to_gene_type_tsv=rna_qc_tr_id_to_gene_type_tsv,
             output_filename="rep"+(i+1)+bamroot+"_qc.json",
             disks=rna_qc_disk,

diff --git a/wdl/tasks/cat.wdl b/wdl/tasks/cat.wdl
@@ -0,0 +1,28 @@
+version 1.0
+
+
+task cat {
+    input {
+        Array[File] files
+        String output_filename = "concatenated"
+        Int ncpus
+        Int ramGB
+        String disks
+    }
+
+    command {
+        cat \
+            ~{sep=" " files} \
+            > ~{output_filename}
+    }
+
+    output {
+        File concatenated = output_filename
+    }
+
+    runtime {
+        cpu: ncpus
+        memory: "~{ramGB} GB"
+        disks: disks
+    }
+}
diff --git a/wdl/tasks/gzip.wdl b/wdl/tasks/gzip.wdl
@@ -0,0 +1,30 @@
+version 1.0
+
+
+task decompress {
+    input {
+        File input_file
+        String output_filename = "out"
+        Int ncpus
+        Int ramGB
+        String disks
+    }
+
+    command {
+        gzip \
+            -d \
+            -c \
+            ~{input_file} \
+            > ~{output_filename}
+    }
+
+    output {
+        File out = output_filename
+    }
+
+    runtime {
+        cpu: ncpus
+        memory: "~{ramGB} GB"
+        disks: disks
+    }
+}
diff --git a/wdl/tasks/pbam.wdl b/wdl/tasks/pbam.wdl
@@ -0,0 +1,62 @@
+version 1.0
+
+
+task make_genome_pbam {
+    input {
+        File bam
+        File reference_genome
+        Int ncpus
+        Int ramGB
+        String disks
+    }
+
+    String bam_base = basename(bam, ".bam")
+
+    command {
+        makepBAM_genome.sh \
+            ~{bam} \
+            ~{reference_genome}
+    }
+
+    output {
+        File out = "~{bam_base}.sorted.p.bam"
+    }
+
+    runtime {
+        cpu: ncpus
+        memory: "~{ramGB} GB"
+        disks: disks
+    }
+}
+
+task make_transcriptome_pbam {
+    input {
+        File bam
+        File reference_genome
+        File reference_transcriptome
+        File reference_annotation
+        Int ncpus
+        Int ramGB
+        String disks
+    }
+
+    String bam_base = basename(bam, ".bam")
+
+    command {
+        makepBAM_transcriptome.sh \
+            ~{bam} \
+            ~{reference_genome} \
+            ~{reference_transcriptome} \
+            ~{reference_annotation}
+    }
+
+    output {
+        File out = "~{bam_base}.p.bam"
+    }
+
+    runtime {
+        cpu: ncpus
+        memory: "~{ramGB} GB"
+        disks: disks
+    }
+}