v1.2.0 (#64)

ENCODE-DCC · Jul 8, 2020 · 8a29a99 · 8a29a99
1 parent fb345be
commit 8a29a99
Show file tree

Hide file tree

Showing 13 changed files with 346 additions and 484 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -28,12 +28,12 @@ make_tag: &make_tag
 get_star_index: &get_star_index
   name: get star index for test
   command: |
-    curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz
+    curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz
 
 get_kallisto_index: &get_kallisto_index
   name: get kallisto index for test
   command: |
-    curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx
+    curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx
 
 install_singularity: &install_singularity
   name: install singularity
@@ -280,76 +280,6 @@ jobs:
             python3 -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data['match_overall']))" < test_align_SE_input.result.json
           no_output_timeout: 30m
 
-  test_PE_kallisto_docker:
-    <<: *machine_defaults
-    steps:
-      - checkout
-      - run: *make_tag
-      - run: *get_kallisto_index
-      - run:
-          command: |
-            pyenv global 3.5.2
-            source ${BASH_ENV}
-            test/test_workflow/test.sh test/test_task/test_kallisto.wdl test/test_task/test_kallisto_PE_input.json $TAG docker
-            python3 src/compare_md5.py --keys_to_inspect test_kallisto.kallisto.quants --metadata_json test_kallisto_PE_input.metadata.json --reference_json test/test_task/test_kallisto_PE_reference_md5.json --outfile test_kallisto_PE_input.result.json
-            cat test_kallisto_PE_input.result.json
-            python3 -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data['match_overall']))" < test_kallisto_PE_input.result.json
-          no_output_timeout: 30m
-
-  test_PE_kallisto_singularity:
-    <<: *machine_defaults
-    steps:
-      - checkout
-      - run: *make_tag
-      - run: sudo apt-get update
-      - singularity/install-go
-      - singularity/debian-install-3
-      - run: *get_kallisto_index
-      - run:
-          command: |
-            pyenv global 3.5.2
-            source ${BASH_ENV}
-            test/test_workflow/test.sh test/test_task/test_kallisto.wdl test/test_task/test_kallisto_PE_input.json $TAG singularity
-            python3 src/compare_md5.py --keys_to_inspect test_kallisto.kallisto.quants --metadata_json test_kallisto_PE_input.metadata.json --reference_json test/test_task/test_kallisto_PE_reference_md5.json --outfile test_kallisto_PE_input.result.json
-            cat test_kallisto_PE_input.result.json
-            python3 -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data['match_overall']))" < test_kallisto_PE_input.result.json
-          no_output_timeout: 30m
-
-  test_SE_kallisto_docker:
-    <<: *machine_defaults
-    steps:
-      - checkout
-      - run: *make_tag
-      - run: *get_kallisto_index
-      - run:
-          command: |
-            pyenv global 3.5.2
-            source ${BASH_ENV}
-            test/test_workflow/test.sh test/test_task/test_kallisto.wdl test/test_task/test_kallisto_SE_input.json $TAG docker
-            python3 src/compare_md5.py --keys_to_inspect test_kallisto.kallisto.quants --metadata_json test_kallisto_SE_input.metadata.json --reference_json test/test_task/test_kallisto_SE_reference_md5.json --outfile test_kallisto_SE_input.result.json
-            cat test_kallisto_SE_input.result.json
-            python3 -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data['match_overall']))" < test_kallisto_SE_input.result.json
-          no_output_timeout: 30m
-
-  test_SE_kallisto_singularity:
-    <<: *machine_defaults
-    steps:
-      - checkout
-      - run: *make_tag
-      - run: sudo apt-get update
-      - singularity/install-go
-      - singularity/debian-install-3
-      - run: *get_kallisto_index
-      - run:
-          command: |
-            pyenv global 3.5.2
-            source ${BASH_ENV}
-            test/test_workflow/test.sh test/test_task/test_kallisto.wdl test/test_task/test_kallisto_SE_input.json $TAG singularity
-            python3 src/compare_md5.py --keys_to_inspect test_kallisto.kallisto.quants --metadata_json test_kallisto_SE_input.metadata.json --reference_json test/test_task/test_kallisto_SE_reference_md5.json --outfile test_kallisto_SE_input.result.json
-            cat test_kallisto_SE_input.result.json
-            python3 -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data['match_overall']))" < test_kallisto_SE_input.result.json
-          no_output_timeout: 30m
-
 # Workflow
 workflows:
   build_workflow:
@@ -388,18 +318,6 @@ workflows:
       - test_SE_align_singularity:
           requires:
             - build
-      - test_PE_kallisto_docker:
-          requires:
-            - build
-      - test_PE_kallisto_singularity:
-          requires:
-            - build
-      - test_SE_kallisto_docker:
-          requires:
-            - build
-      - test_SE_kallisto_singularity:
-          requires:
-            - build
       - push_template:
           requires:
             - unittests
@@ -413,7 +331,3 @@ workflows:
             - test_PE_align_singularity
             - test_SE_align_docker
             - test_SE_align_singularity
-            - test_PE_kallisto_docker
-            - test_PE_kallisto_singularity
-            - test_SE_kallisto_docker
-            - test_SE_kallisto_singularity
diff --git a/docs/howto.md b/docs/howto.md
@@ -31,8 +31,8 @@ Make sure you have completed the steps for installation and Google Cloud setup d
 2. Get STAR and kallisto index files:
 
 ```bash
-  $ curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz
-  $ curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx
+  $ curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz
+  $ curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx
 ```
 
 3. Copy indexes and input data into the cloud:
@@ -110,8 +110,8 @@ The goal is to run a single-end, non-strand-specific experiment on a local compu
 2. Get STAR and kallisto index files:
 
 ```bash
-  $ curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz
-  $ curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx
+  $ curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz
+  $ curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx
 ```
 
 The other data that is required to complete this recipe is included in the repository within test_data directory.
@@ -137,8 +137,8 @@ The other data that is required to complete this recipe is included in the repos
     "rna.rsem_ramGB" : 4,
     "rna.kallisto_number_of_threads" : 2,
     "rna.kallisto_ramGB" : 4,
-    "rna.kallisto_fragment_length" : 250,
-    "rna.kallisto_sd_of_fragment_length" : 10,
+    "rna.kallisto_fragment_length" : [250],
+    "rna.kallisto_sd_of_fragment_length" : [10],
     "rna.rna_qc_tr_id_to_gene_type_tsv" : "[PATH_TO_REPO]/rna-seq-pipeline/transcript_id_to_gene_type_mappings/gencodeV24pri-tRNAs-ERCC-phiX.transcript_id_to_genes.tsv",
     "rna.bam_to_signals_ncpus" : 1,
     "rna.bam_to_signals_ramGB" : 2,
@@ -185,8 +185,8 @@ The goal is to run a single-end non-strand-specific experiment locally using sin
 3. Get STAR and kallisto index files:
 
 ```bash
-  $ curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz
-  $ curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx
+  $ curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz
+  $ curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx
 ```
 
 4. Run the pipeline using caper:

diff --git a/docs/reference.md b/docs/reference.md
@@ -149,9 +149,9 @@ Assume you want to allocate 100 gigabytes of spinning hard drive. In this case y
 
 Kallisto quantifier makes use of average fragment lengths and standard deviations of those lengths. In the case of paired-end experiments, those values can be calculated from the data, but in the case of single-ended experiments those values must be provided.
 
-* `rna.kallisto_fragment_length` Is the average fragment length. Required only if `rna.run_kallisto` is `true` (Default is `true`).
+* `rna.kallisto_fragment_length` Is an array of average fragment lengths as integers, one per replicate. Required only if `rna.run_kallisto` is `true` (Default is `true`).
 .
-* `rna.kallisto_sd_of_fragment_length` Is the standard deviation of the fragment lengths. Required only if `rna.run_kallisto` is `true` (Default is `true`).
+* `rna.kallisto_sd_of_fragment_length` Is an array of standard deviations of the fragment lengths as floats, one per replicate. Required only if `rna.run_kallisto` is `true` (Default is `true`).
 
 
 If you do not have this data available, or if you for some other reason want to omit running kallisto you can use the following parameter:

diff --git a/make_index_wdl/build_genome_index.wdl b/make_index_wdl/build_genome_index.wdl
@@ -1,61 +1,70 @@
+version 1.0
+
 # ENCODE DCC RNA-Seq pipeline build_genome_index
-# Maintainer: Otto Jolanki
 
-#CAPER docker quay.io/encode-dcc/rna-seq-pipeline:v1.1
-#CAPER singularity docker://quay.io/encode-dcc/rna-seq-pipeline:v1.1
+#CAPER docker quay.io/encode-dcc/rna-seq-pipeline:v1.2.0
+#CAPER singularity docker://quay.io/encode-dcc/rna-seq-pipeline:v1.2.0
 
 workflow build_index {
-    # Inputs
-    # reference genome or transcriptome (in prep_kallisto mode)in gzipped fasta
-    File reference_sequence
-    # spikeins in gzipped fasta
-    File? spikeins
-    # annotation in gzipped gtf
-    File? annotation
-    # annotation version (e.g 'v24')
-    String? anno_version
-    # genome (e.g 'GRCh38')
-    String? genome
-    # Flavor of the index that gets built
-    # available options:
-    # prep_rsem, prep_srna, prep_star, prep_kallisto
-    String index_type
-    Int ncpu = 8
-    Int? memGB
-    String? disks
+    meta {
+        author: "Otto Jolanki"
+        version: "v1.2.0"
+    }
+
+    input {
+        # reference genome or transcriptome (in prep_kallisto mode) in gzipped fasta
+        File reference_sequence
+        # spikeins in gzipped fasta
+        File? spikeins
+        # annotation in gzipped gtf
+        File? annotation
+        # annotation version (e.g 'v24')
+        String? anno_version
+        # genome (e.g 'GRCh38')
+        String? genome
+        # Flavor of the index that gets built
+        # available options:
+        # prep_rsem, prep_srna, prep_star, prep_kallisto
+        String index_type
+        Int ncpu = 8
+        Int? memGB
+        String? disks
+    }
 
     call make_index { input:
-        reference_sequence = reference_sequence,
-        spikeins = spikeins,
-        annotation = annotation,
-        anno_version = anno_version,
-        genome = genome,
-        index_type = index_type,
-        ncpu = ncpu,
-        memGB = memGB,
-        disks = disks,
+        reference_sequence=reference_sequence,
+        spikeins=spikeins,
+        annotation=annotation,
+        anno_version=anno_version,
+        genome=genome,
+        index_type=index_type,
+        ncpu=ncpu,
+        memGB=memGB,
+        disks=disks,
     }
 }
 
 task make_index {
-    File reference_sequence
-    File? spikeins
-    File? annotation
-    String? anno_version
-    String? genome
-    String index_type
-    Int ncpu
-    Int? memGB
-    String? disks
+    input {
+        File reference_sequence
+        File? spikeins
+        File? annotation
+        String? anno_version
+        String? genome
+        String index_type
+        Int ncpu
+        Int? memGB
+        String? disks
+    }
 
     command {
-        $(which ${index_type + ".sh"}) \
-            ${reference_sequence} \
-            ${spikeins} \
-            ${annotation} \
-            ${anno_version} \
-            ${genome} \
-            ${ncpu}
+        $(which ~{index_type + ".sh"}) \
+            ~{reference_sequence} \
+            ~{spikeins} \
+            ~{annotation} \
+            ~{anno_version} \
+            ~{genome} \
+            ~{ncpu}
     }
 
     output {
@@ -64,7 +73,7 @@ task make_index {
 
     runtime {
         cpu : ncpu
-        memory : "${select_first([memGB,'8'])} GB"
+        memory : "~{select_first([memGB,'8'])} GB"
         disks : select_first([disks,"local-disk 100 SSD"])
     }
 }
diff --git a/make_index_wdl/merge_anno.wdl b/make_index_wdl/merge_anno.wdl
@@ -1,50 +1,58 @@
+version 1.0
+
 #ENCODE DCC RNA-Seq pipeline merge-annotation
-#Maintainer: Otto Jolanki
 
-#CAPER docker quay.io/encode-dcc/rna-seq-pipeline:v1.1
-#CAPER singularity docker://quay.io/encode-dcc/rna-seq-pipeline:v1.1
+#CAPER docker quay.io/encode-dcc/rna-seq-pipeline:v1.2.0
+#CAPER singularity docker://quay.io/encode-dcc/rna-seq-pipeline:v1.2.0
 
 workflow merge_anno {
-    # input filenames
-    File annotation
-    File tRNA
-    File spikeins
-    # output filename
-    String output_filename
-    Int? cpu
-    Int? memGB
-    String? disks
+    meta {
+        author: "Otto Jolanki"
+        version: "v1.2.0"
+    }
+
+    input {
+        File annotation
+        File tRNA
+        File spikeins
+        String output_filename
+        Int? cpu
+        Int? memGB
+        String? disks
+    }
 
     call merge_annotation { input :
-        annotation = annotation,
-        tRNA = tRNA,
-        spikeins = spikeins,
-        output_filename = output_filename,
+        annotation=annotation,
+        tRNA=tRNA,
+        spikeins=spikeins,
+        output_filename=output_filename,
     }
 }
 
 task merge_annotation {
-    File annotation
-    File tRNA
-    File spikeins
-    String output_filename
-    Int? cpu
-    Int? memGB
-    String? disks
+    input {
+        File annotation
+        File tRNA
+        File spikeins
+        String output_filename
+        Int? cpu
+        Int? memGB
+        String? disks
+    }
 
     command {
         python3 $(which merge_annotation.py) \
-            ${"--annotation " + annotation} \
-            ${"--tRNA " + tRNA} \
-            ${"--spikeins " + spikeins} \
-            ${"--output_filename " + output_filename}
+            ~{"--annotation " + annotation} \
+            ~{"--tRNA " + tRNA} \
+            ~{"--spikeins " + spikeins} \
+            ~{"--output_filename " + output_filename}
     }
     output {
-        File merged_annotation = glob("${output_filename}")[0]
+        File merged_annotation = output_filename
     }
     runtime {
         cpu : select_first([cpu,2])
-        memory : "${select_first([memGB,'8'])} GB"
+        memory : "~{select_first([memGB,'8'])} GB"
         disks : select_first([disks,"local-disk 100 SSD"])
     }
 }