From 8a29a9970226dc687597c4e7e8bac7fe314dac0f Mon Sep 17 00:00:00 2001
From: ottojolanki <ojolanki@stanford.edu>
Date: Wed, 8 Jul 2020 04:54:31 -0700
Subject: [PATCH] v1.2.0 (#64)

---
 .circleci/config.yml                          |  90 +---
 docs/howto.md                                 |  16 +-
 docs/reference.md                             |   4 +-
 make_index_wdl/build_genome_index.wdl         | 103 +++--
 make_index_wdl/merge_anno.wdl                 |  66 +--
 rna-seq-pipeline.wdl                          | 421 +++++++++---------
 test/test_task/test_align.wdl                 |  37 +-
 test/test_task/test_kallisto.wdl              |  33 --
 test/test_task/test_kallisto_PE_input.json    |  21 -
 .../test_kallisto_PE_reference_md5.json       |   3 -
 test/test_task/test_kallisto_SE_input.json    |  22 -
 .../test_kallisto_SE_reference_md5.json       |   4 -
 test/test_workflow/SE_unstranded_input.json   |  10 +-
 13 files changed, 346 insertions(+), 484 deletions(-)
 delete mode 100644 test/test_task/test_kallisto.wdl
 delete mode 100644 test/test_task/test_kallisto_PE_input.json
 delete mode 100644 test/test_task/test_kallisto_PE_reference_md5.json
 delete mode 100644 test/test_task/test_kallisto_SE_input.json
 delete mode 100644 test/test_task/test_kallisto_SE_reference_md5.json

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 29e1726..fa4dde3 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -28,12 +28,12 @@ make_tag: &make_tag
 get_star_index: &get_star_index
   name: get star index for test
   command: |
-    curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz
+    curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz
 
 get_kallisto_index: &get_kallisto_index
   name: get kallisto index for test
   command: |
-    curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx
+    curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx
 
 install_singularity: &install_singularity
   name: install singularity
@@ -280,76 +280,6 @@ jobs:
             python3 -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data['match_overall']))" < test_align_SE_input.result.json
           no_output_timeout: 30m
 
-  test_PE_kallisto_docker:
-    <<: *machine_defaults
-    steps:
-      - checkout
-      - run: *make_tag
-      - run: *get_kallisto_index
-      - run:
-          command: |
-            pyenv global 3.5.2
-            source ${BASH_ENV}
-            test/test_workflow/test.sh test/test_task/test_kallisto.wdl test/test_task/test_kallisto_PE_input.json $TAG docker
-            python3 src/compare_md5.py --keys_to_inspect test_kallisto.kallisto.quants --metadata_json test_kallisto_PE_input.metadata.json --reference_json test/test_task/test_kallisto_PE_reference_md5.json --outfile test_kallisto_PE_input.result.json
-            cat test_kallisto_PE_input.result.json
-            python3 -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data['match_overall']))" < test_kallisto_PE_input.result.json
-          no_output_timeout: 30m
-
-  test_PE_kallisto_singularity:
-    <<: *machine_defaults
-    steps:
-      - checkout
-      - run: *make_tag
-      - run: sudo apt-get update
-      - singularity/install-go
-      - singularity/debian-install-3
-      - run: *get_kallisto_index
-      - run:
-          command: |
-            pyenv global 3.5.2
-            source ${BASH_ENV}
-            test/test_workflow/test.sh test/test_task/test_kallisto.wdl test/test_task/test_kallisto_PE_input.json $TAG singularity
-            python3 src/compare_md5.py --keys_to_inspect test_kallisto.kallisto.quants --metadata_json test_kallisto_PE_input.metadata.json --reference_json test/test_task/test_kallisto_PE_reference_md5.json --outfile test_kallisto_PE_input.result.json
-            cat test_kallisto_PE_input.result.json
-            python3 -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data['match_overall']))" < test_kallisto_PE_input.result.json
-          no_output_timeout: 30m
-
-  test_SE_kallisto_docker:
-    <<: *machine_defaults
-    steps:
-      - checkout
-      - run: *make_tag
-      - run: *get_kallisto_index
-      - run:
-          command: |
-            pyenv global 3.5.2
-            source ${BASH_ENV}
-            test/test_workflow/test.sh test/test_task/test_kallisto.wdl test/test_task/test_kallisto_SE_input.json $TAG docker
-            python3 src/compare_md5.py --keys_to_inspect test_kallisto.kallisto.quants --metadata_json test_kallisto_SE_input.metadata.json --reference_json test/test_task/test_kallisto_SE_reference_md5.json --outfile test_kallisto_SE_input.result.json
-            cat test_kallisto_SE_input.result.json
-            python3 -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data['match_overall']))" < test_kallisto_SE_input.result.json
-          no_output_timeout: 30m
-
-  test_SE_kallisto_singularity:
-    <<: *machine_defaults
-    steps:
-      - checkout
-      - run: *make_tag
-      - run: sudo apt-get update
-      - singularity/install-go
-      - singularity/debian-install-3
-      - run: *get_kallisto_index
-      - run:
-          command: |
-            pyenv global 3.5.2
-            source ${BASH_ENV}
-            test/test_workflow/test.sh test/test_task/test_kallisto.wdl test/test_task/test_kallisto_SE_input.json $TAG singularity
-            python3 src/compare_md5.py --keys_to_inspect test_kallisto.kallisto.quants --metadata_json test_kallisto_SE_input.metadata.json --reference_json test/test_task/test_kallisto_SE_reference_md5.json --outfile test_kallisto_SE_input.result.json
-            cat test_kallisto_SE_input.result.json
-            python3 -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data['match_overall']))" < test_kallisto_SE_input.result.json
-          no_output_timeout: 30m
-
 # Workflow
 workflows:
   build_workflow:
@@ -388,18 +318,6 @@ workflows:
       - test_SE_align_singularity:
           requires:
             - build
-      - test_PE_kallisto_docker:
-          requires:
-            - build
-      - test_PE_kallisto_singularity:
-          requires:
-            - build
-      - test_SE_kallisto_docker:
-          requires:
-            - build
-      - test_SE_kallisto_singularity:
-          requires:
-            - build
       - push_template:
           requires:
             - unittests
@@ -413,7 +331,3 @@ workflows:
             - test_PE_align_singularity
             - test_SE_align_docker
             - test_SE_align_singularity
-            - test_PE_kallisto_docker
-            - test_PE_kallisto_singularity
-            - test_SE_kallisto_docker
-            - test_SE_kallisto_singularity
diff --git a/docs/howto.md b/docs/howto.md
index ebce468..43a5418 100644
--- a/docs/howto.md
+++ b/docs/howto.md
@@ -31,8 +31,8 @@ Make sure you have completed the steps for installation and Google Cloud setup d
 2. Get STAR and kallisto index files:
 
 ```bash
-  $ curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz
-  $ curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx
+  $ curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz
+  $ curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx
 ```
 
 3. Copy indexes and input data into the cloud:
@@ -110,8 +110,8 @@ The goal is to run a single-end, non-strand-specific experiment on a local compu
 2. Get STAR and kallisto index files:
 
 ```bash
-  $ curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz
-  $ curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx
+  $ curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz
+  $ curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx
 ```
 
 The other data that is required to complete this recipe is included in the repository within test_data directory.
@@ -137,8 +137,8 @@ The other data that is required to complete this recipe is included in the repos
     "rna.rsem_ramGB" : 4,
     "rna.kallisto_number_of_threads" : 2,
     "rna.kallisto_ramGB" : 4,
-    "rna.kallisto_fragment_length" : 250,
-    "rna.kallisto_sd_of_fragment_length" : 10,
+    "rna.kallisto_fragment_length" : [250],
+    "rna.kallisto_sd_of_fragment_length" : [10],
     "rna.rna_qc_tr_id_to_gene_type_tsv" : "[PATH_TO_REPO]/rna-seq-pipeline/transcript_id_to_gene_type_mappings/gencodeV24pri-tRNAs-ERCC-phiX.transcript_id_to_genes.tsv",
     "rna.bam_to_signals_ncpus" : 1,
     "rna.bam_to_signals_ramGB" : 2,
@@ -185,8 +185,8 @@ The goal is to run a single-end non-strand-specific experiment locally using sin
 3. Get STAR and kallisto index files:
 
 ```bash
-  $ curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz
-  $ curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx
+  $ curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz
+  $ curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx
 ```
 
 4. Run the pipeline using caper:
diff --git a/docs/reference.md b/docs/reference.md
index 6303419..e519337 100644
--- a/docs/reference.md
+++ b/docs/reference.md
@@ -149,9 +149,9 @@ Assume you want to allocate 100 gigabytes of spinning hard drive. In this case y
 
 Kallisto quantifier makes use of average fragment lenghts and standard deviations of those lengths. In the case of paired end experiments, those values can be calculated from the data, but in case of single-ended experiment those values must be provided.
 
-* `rna.kallisto_fragment_length` Is the average fragment length. Required only if `rna.run_kallisto` is `true` (Default is `true`).
+* `rna.kallisto_fragment_length` Is an array of average fragment lengths as integers, one per replicate. Required only if `rna.run_kallisto` is `true` (Default is `true`).
 .
-* `rna.kallisto_sd_of_fragment_length` Is the standard deviation of the fragment lengths. Required only if `rna.run_kallisto` is `true` (Default is `true`).
+* `rna.kallisto_sd_of_fragment_length` Is an array of standard deviations of the fragment lengths as floats, one per replicate. Required only if `rna.run_kallisto` is `true` (Default is `true`).
 
 
 If you do not have this data available, or if you for some other reason want to omit running kallisto you can use the following parameter:
diff --git a/make_index_wdl/build_genome_index.wdl b/make_index_wdl/build_genome_index.wdl
index 98226cf..0432d72 100644
--- a/make_index_wdl/build_genome_index.wdl
+++ b/make_index_wdl/build_genome_index.wdl
@@ -1,61 +1,70 @@
+version 1.0
+
 # ENCODE DCC RNA-Seq pipeline build_genome_index
-# Maintainer: Otto Jolanki
 
-#CAPER docker quay.io/encode-dcc/rna-seq-pipeline:v1.1
-#CAPER singularity docker://quay.io/encode-dcc/rna-seq-pipeline:v1.1
+#CAPER docker quay.io/encode-dcc/rna-seq-pipeline:v1.2.0
+#CAPER singularity docker://quay.io/encode-dcc/rna-seq-pipeline:v1.2.0
 
 workflow build_index {
-    # Inputs
-    # reference genome or transcriptome (in prep_kallisto mode)in gzipped fasta
-    File reference_sequence
-    # spikeins in gzipped fasta
-    File? spikeins
-    # annotation in gzipped gtf
-    File? annotation
-    # annotation version (e.g 'v24')
-    String? anno_version
-    # genome (e.g 'GRCh38')
-    String? genome
-    # Flavor of the index that gets built
-    # available options:
-    # prep_rsem, prep_srna, prep_star, prep_kallisto
-    String index_type
-    Int ncpu = 8
-    Int? memGB
-    String? disks
+    meta {
+        author: "Otto Jolanki"
+        version: "v1.2.0"
+    }
+
+    input {
+        # reference genome or transcriptome (in prep_kallisto mode) in gzipped fasta
+        File reference_sequence
+        # spikeins in gzipped fasta
+        File? spikeins
+        # annotation in gzipped gtf
+        File? annotation
+        # annotation version (e.g 'v24')
+        String? anno_version
+        # genome (e.g 'GRCh38')
+        String? genome
+        # Flavor of the index that gets built
+        # available options:
+        # prep_rsem, prep_srna, prep_star, prep_kallisto
+        String index_type
+        Int ncpu = 8
+        Int? memGB
+        String? disks
+    }
 
     call make_index { input:
-        reference_sequence = reference_sequence,
-        spikeins = spikeins,
-        annotation = annotation,
-        anno_version = anno_version,
-        genome = genome,
-        index_type = index_type,
-        ncpu = ncpu,
-        memGB = memGB,
-        disks = disks,
+        reference_sequence=reference_sequence,
+        spikeins=spikeins,
+        annotation=annotation,
+        anno_version=anno_version,
+        genome=genome,
+        index_type=index_type,
+        ncpu=ncpu,
+        memGB=memGB,
+        disks=disks,
     }
 }
 
 task make_index {
-    File reference_sequence
-    File? spikeins
-    File? annotation
-    String? anno_version
-    String? genome
-    String index_type
-    Int ncpu
-    Int? memGB
-    String? disks
+    input {
+        File reference_sequence
+        File? spikeins
+        File? annotation
+        String? anno_version
+        String? genome
+        String index_type
+        Int ncpu
+        Int? memGB
+        String? disks
+    }
 
     command {
-        $(which ${index_type + ".sh"}) \
-            ${reference_sequence} \
-            ${spikeins} \
-            ${annotation} \
-            ${anno_version} \
-            ${genome} \
-            ${ncpu}
+        $(which ~{index_type + ".sh"}) \
+            ~{reference_sequence} \
+            ~{spikeins} \
+            ~{annotation} \
+            ~{anno_version} \
+            ~{genome} \
+            ~{ncpu}
     }
 
     output {
@@ -64,7 +73,7 @@ task make_index {
 
     runtime {
         cpu : ncpu
-        memory : "${select_first([memGB,'8'])} GB"
+        memory : "~{select_first([memGB,'8'])} GB"
         disks : select_first([disks,"local-disk 100 SSD"])
     }
 }
diff --git a/make_index_wdl/merge_anno.wdl b/make_index_wdl/merge_anno.wdl
index d74b0e8..c593fdd 100644
--- a/make_index_wdl/merge_anno.wdl
+++ b/make_index_wdl/merge_anno.wdl
@@ -1,50 +1,58 @@
+version 1.0
+
 #ENCODE DCC RNA-Seq pipeline merge-annotation
-#Maintainer: Otto Jolanki
 
-#CAPER docker quay.io/encode-dcc/rna-seq-pipeline:v1.1
-#CAPER singularity docker://quay.io/encode-dcc/rna-seq-pipeline:v1.1
+#CAPER docker quay.io/encode-dcc/rna-seq-pipeline:v1.2.0
+#CAPER singularity docker://quay.io/encode-dcc/rna-seq-pipeline:v1.2.0
 
 workflow merge_anno {
-    # input filenames
-    File annotation
-    File tRNA
-    File spikeins
-    # output filename
-    String output_filename
-    Int? cpu
-    Int? memGB
-    String? disks
+    meta {
+        author: "Otto Jolanki"
+        version: "v1.2.0"
+    }
+
+    input {
+        File annotation
+        File tRNA
+        File spikeins
+        String output_filename
+        Int? cpu
+        Int? memGB
+        String? disks
+    }
 
     call merge_annotation { input :
-        annotation = annotation,
-        tRNA = tRNA,
-        spikeins = spikeins,
-        output_filename = output_filename,
+        annotation=annotation,
+        tRNA=tRNA,
+        spikeins=spikeins,
+        output_filename=output_filename,
     }
 }
 
 task merge_annotation {
-    File annotation
-    File tRNA
-    File spikeins
-    String output_filename
-    Int? cpu
-    Int? memGB
-    String? disks
+    input {
+        File annotation
+        File tRNA
+        File spikeins
+        String output_filename
+        Int? cpu
+        Int? memGB
+        String? disks
+    }
 
     command {
         python3 $(which merge_annotation.py) \
-            ${"--annotation " + annotation} \
-            ${"--tRNA " + tRNA} \
-            ${"--spikeins " + spikeins} \
-            ${"--output_filename " + output_filename}
+            ~{"--annotation " + annotation} \
+            ~{"--tRNA " + tRNA} \
+            ~{"--spikeins " + spikeins} \
+            ~{"--output_filename " + output_filename}
     }
     output {
-        File merged_annotation = glob("${output_filename}")[0]
+        File merged_annotation = output_filename
     }
     runtime {
         cpu : select_first([cpu,2])
-        memory : "${select_first([memGB,'8'])} GB"
+        memory : "~{select_first([memGB,'8'])} GB"
         disks : select_first([disks,"local-disk 100 SSD"])
     }
 }
diff --git a/rna-seq-pipeline.wdl b/rna-seq-pipeline.wdl
index ef384b0..c80d94d 100644
--- a/rna-seq-pipeline.wdl
+++ b/rna-seq-pipeline.wdl
@@ -1,125 +1,119 @@
+version 1.0
+
 # ENCODE DCC RNA-seq pipeline
-# Maintainer: Otto Jolanki
 
-#CAPER docker quay.io/encode-dcc/rna-seq-pipeline:v1.1
-#CAPER singularity docker://quay.io/encode-dcc/rna-seq-pipeline:v1.1
+#CAPER docker quay.io/encode-dcc/rna-seq-pipeline:v1.2.0
+#CAPER singularity docker://quay.io/encode-dcc/rna-seq-pipeline:v1.2.0
 #CROO out_def https://storage.googleapis.com/encode-pipeline-output-definition/bulkrna.output_definition.json
 
 workflow rna {
-    # endedness: paired or single
-    String endedness
-    # fastqs_R1: fastq.gz files for Read1 (only these if single-ended)
-    Array[Array[File]] fastqs_R1
-    # fastqs_R2: fastq.gz files for Read2 (omit if single-ended) in order
-    # corresponding to fastqs_R1
-    Array[Array[File]] fastqs_R2 = []
-    # bamroot: root name for output bams. For example foo_bar will
-    # create foo_bar_genome.bam and foo_bar_anno.bam
-    String bamroot
-    # strandedness: is the library strand specific (stranded or unstranded)
-    String strandedness
-    # strandedness_direction (forward, reverse, unstranded)
-    String strandedness_direction
-    # chrom_sizes: chromosome sizes file
-    File chrom_sizes
-    # Switch to false to not run kallisto
-    Boolean run_kallisto = true
-    ## task level variables that are defined globally to make them visible to DNANexus UI
-
-    # ALIGN
-    # index: aligner index archive (tar.gz)
-    File align_index
-    Int align_ncpus
-    Int align_ramGB
-    String? align_disk
-
-    # KALLISTO
-
-    Int? kallisto_number_of_threads
-    Int? kallisto_ramGB
-    File? kallisto_index
-    Int? kallisto_fragment_length
-    Float? kallisto_sd_of_fragment_length
-    String? kallisto_disk
-
-    # BAM_TO_SIGNALS
-
-    Int bam_to_signals_ncpus
-    Int bam_to_signals_ramGB
-    String? bam_to_signals_disk
-
-    # RSEM_QUANT
-
-    # rsem_index: RSEM index archive (tar.gz)
-    File rsem_index
-    # rnd_seed: random seed used for rsem
-    Int rnd_seed = 12345
-    Int rsem_ncpus
-    Int rsem_ramGB
-    String? rsem_disk
-
-    # RNA_QC
-
-    File rna_qc_tr_id_to_gene_type_tsv
-    String? rna_qc_disk
-
-    # MAD_QC
-
-    String? mad_qc_disk
-
-    ## WORKFLOW BEGINS
+    meta {
+        author: "Otto Jolanki"
+        version: "v1.2.0"
+    }
+
+    input {
+        # endedness: paired or single
+        String endedness
+        # fastqs_R1: fastq.gz files for Read1 (only these if single-ended)
+        Array[Array[File]] fastqs_R1
+        # fastqs_R2: fastq.gz files for Read2 (omit if single-ended) in order
+        # corresponding to fastqs_R1
+        Array[Array[File]] fastqs_R2 = []
+        # bamroot: root name for output bams. For example foo_bar will
+        # create foo_bar_genome.bam and foo_bar_anno.bam
+        String bamroot
+        # strandedness: is the library strand specific (stranded or unstranded)
+        String strandedness
+        # strandedness_direction (forward, reverse, unstranded)
+        String strandedness_direction
+        # chrom_sizes: chromosome sizes file
+        File chrom_sizes
+        # Switch to false to not run kallisto
+        Boolean run_kallisto = true
+        # index: aligner index archive (tar.gz)
+        File align_index
+        Int align_ncpus
+        Int align_ramGB
+        String? align_disk
+        Int? kallisto_number_of_threads
+        Int? kallisto_ramGB
+        File? kallisto_index
+        Array[Int] kallisto_fragment_length = []
+        Array[Float] kallisto_sd_of_fragment_length = []
+        String? kallisto_disk
+        Int bam_to_signals_ncpus
+        Int bam_to_signals_ramGB
+        String? bam_to_signals_disk
+        # rsem_index: RSEM index archive (tar.gz)
+        File rsem_index
+        # rnd_seed: random seed used for rsem
+        Int rnd_seed = 12345
+        Int rsem_ncpus
+        Int rsem_ramGB
+        String? rsem_disk
+        File rna_qc_tr_id_to_gene_type_tsv
+        String? mad_qc_disk
+        String? rna_qc_disk
+
+        # These are for internal use, leave undefined
+        Int? kallisto_fragment_length_undefined
+        Float? kallisto_sd_undefined
+    }
 
     # dummy variable value for the single-ended case
     Array[Array[File]] fastqs_R2_ = if (endedness == "single") then fastqs_R1 else fastqs_R2
 
     scatter (i in range(length(fastqs_R1))) {
         call align { input:
-            endedness = endedness,
-            fastqs_R1 = fastqs_R1[i],
-            fastqs_R2 = fastqs_R2_[i],
-            index = align_index,
-            bamroot = "rep"+(i+1)+bamroot,
-            ncpus = align_ncpus,
-            ramGB = align_ramGB,
-            disks = align_disk,
+            endedness=endedness,
+            fastqs_R1=fastqs_R1[i],
+            fastqs_R2=fastqs_R2_[i],
+            index=align_index,
+            bamroot="rep"+(i+1)+bamroot,
+            ncpus=align_ncpus,
+            ramGB=align_ramGB,
+            disks=align_disk,
         }
 
         call bam_to_signals { input:
-            input_bam = align.genomebam,
-            chrom_sizes = chrom_sizes,
-            strandedness = strandedness,
-            bamroot = "rep"+(i+1)+bamroot+"_genome",
-            ncpus = bam_to_signals_ncpus,
-            ramGB = bam_to_signals_ramGB,
-            disks = bam_to_signals_disk,
+            input_bam=align.genomebam,
+            chrom_sizes=chrom_sizes,
+            strandedness=strandedness,
+            bamroot="rep"+(i+1)+bamroot+"_genome",
+            ncpus=bam_to_signals_ncpus,
+            ramGB=bam_to_signals_ramGB,
+            disks=bam_to_signals_disk,
         }
 
         call rsem_quant { input:
-            rsem_index = rsem_index,
-            rnd_seed = rnd_seed,
-            anno_bam = align.annobam,
-            endedness = endedness,
-            read_strand = strandedness_direction,
-            ncpus = rsem_ncpus,
-            ramGB = rsem_ramGB,
-            disks = rsem_disk,
+            rsem_index=rsem_index,
+            rnd_seed=rnd_seed,
+            anno_bam=align.annobam,
+            endedness=endedness,
+            read_strand=strandedness_direction,
+            ncpus=rsem_ncpus,
+            ramGB=rsem_ramGB,
+            disks=rsem_disk,
         }
     }
 
     if (run_kallisto) {
       scatter (i in range(length(fastqs_R1))) {
+          Float? kallisto_sd = if (length(kallisto_sd_of_fragment_length) > 0) then kallisto_sd_of_fragment_length[i] else kallisto_sd_undefined
+          Int? kallisto_fl = if (length(kallisto_fragment_length) > 0) then kallisto_fragment_length[i] else kallisto_fragment_length_undefined
           call kallisto { input:
-              fastqs_R1 = fastqs_R1[i],
-              fastqs_R2 = fastqs_R2_[i],
-              endedness = endedness,
-              strandedness_direction = strandedness_direction,
-              kallisto_index = kallisto_index,
-              number_of_threads = kallisto_number_of_threads,
-              ramGB = kallisto_ramGB,
-              fragment_length = kallisto_fragment_length,
-              sd_of_fragment_length = kallisto_sd_of_fragment_length,
-              disks = kallisto_disk,
-              out_prefix = "rep"+(i+1)+bamroot,
+              fastqs_R1=fastqs_R1[i],
+              fastqs_R2=fastqs_R2_[i],
+              endedness=endedness,
+              strandedness_direction=strandedness_direction,
+              kallisto_index=select_first([kallisto_index]),
+              number_of_threads=select_first([kallisto_number_of_threads]),
+              ramGB=select_first([kallisto_ramGB]),
+              fragment_length=kallisto_fl,
+              sd_of_fragment_length=kallisto_sd,
+              disks=kallisto_disk,
+              out_prefix="rep"+(i+1)+bamroot,
           }
       }
     }
@@ -129,188 +123,197 @@ workflow rna {
 # if there are exactly two replicates, calculate the madQC metrics and draw a plot
     if (length(fastqs_R1) == 2) {
         call mad_qc { input:
-            quants1 = rsem_quant.genes_results[0],
-            quants2 = rsem_quant.genes_results[1],
-            disks = mad_qc_disk,
+            quants1=rsem_quant.genes_results[0],
+            quants2=rsem_quant.genes_results[1],
+            disks=mad_qc_disk,
         }
     }
 
     scatter (i in range(length(align.annobam))) {
         call rna_qc { input:
-            input_bam = align.annobam[i],
-            tr_id_to_gene_type_tsv = rna_qc_tr_id_to_gene_type_tsv,
-            output_filename = "rep"+(i+1)+bamroot+"_qc.json",
-            disks = rna_qc_disk,
+            input_bam=align.annobam[i],
+            tr_id_to_gene_type_tsv=rna_qc_tr_id_to_gene_type_tsv,
+            output_filename="rep"+(i+1)+bamroot+"_qc.json",
+            disks=rna_qc_disk,
         }
     }
 }
 
-## tasks
+
 task align {
-    Array[File] fastqs_R1
-    Array[File] fastqs_R2
-    String endedness
-    File index
-    String bamroot
-    Int ncpus
-    Int ramGB
-    String? disks
+    input {
+        Array[File] fastqs_R1
+        Array[File] fastqs_R2
+        String endedness
+        File index
+        String bamroot
+        Int ncpus
+        Int ramGB
+        String? disks
+    }
 
     command {
         python3 $(which align.py) \
-            --fastqs_R1 ${sep=' ' fastqs_R1} \
-            --fastqs_R2 ${sep=' ' fastqs_R2} \
-            --endedness ${endedness} \
-            --index ${index} \
-            ${"--bamroot " + bamroot} \
-            ${"--ncpus " + ncpus} \
-            ${"--ramGB " + ramGB}
+            --fastqs_R1 ~{sep=' ' fastqs_R1} \
+            --fastqs_R2 ~{sep=' ' fastqs_R2} \
+            --endedness ~{endedness} \
+            --index ~{index} \
+            ~{"--bamroot " + bamroot} \
+            ~{"--ncpus " + ncpus} \
+            ~{"--ramGB " + ramGB}
     }
 
     output {
-        File genomebam = glob("*_genome.bam")[0]
-        File annobam = glob("*_anno.bam")[0]
-        File genome_flagstat = glob("*_genome_flagstat.txt")[0]
-        File anno_flagstat = glob("*_anno_flagstat.txt")[0]
-        File log = glob("*_Log.final.out")[0]
-        File genome_flagstat_json = glob("*_genome_flagstat.json")[0]
-        File anno_flagstat_json = glob("*_anno_flagstat.json")[0]
-        File log_json = glob("*_Log.final.json")[0]
-        File python_log = glob("align.log")[0]
+        File genomebam = "~{bamroot}_genome.bam"
+        File annobam = "~{bamroot}_anno.bam"
+        File genome_flagstat = "~{bamroot}_genome_flagstat.txt"
+        File anno_flagstat = "~{bamroot}_anno_flagstat.txt"
+        File log = "~{bamroot}_Log.final.out"
+        File genome_flagstat_json = "~{bamroot}_genome_flagstat.json"
+        File anno_flagstat_json = "~{bamroot}_anno_flagstat.json"
+        File log_json = "~{bamroot}_Log.final.json"
+        File python_log = "align.log"
     }
 
     runtime {
       cpu: ncpus
-      memory: "${ramGB} GB"
+      memory: "~{ramGB} GB"
       disks : select_first([disks,"local-disk 100 SSD"])
     }
 }
 
 task  bam_to_signals {
-    File? null
-    File input_bam
-    File chrom_sizes
-    String strandedness
-    String bamroot
-    Int ncpus
-    Int ramGB
-    String? disks
-
+    input {
+        File? null
+        File input_bam
+        File chrom_sizes
+        String strandedness
+        String bamroot
+        Int ncpus
+        Int ramGB
+        String? disks
+    }
 
     command {
         python3 $(which bam_to_signals.py) \
-            --bamfile ${input_bam} \
-            --chrom_sizes ${chrom_sizes} \
-            --strandedness ${strandedness} \
-            --bamroot ${bamroot}
+            --bamfile ~{input_bam} \
+            --chrom_sizes ~{chrom_sizes} \
+            --strandedness ~{strandedness} \
+            --bamroot ~{bamroot}
     }
 
     output {
-        File? unique_unstranded = if (strandedness == "unstranded") then glob("*genome_uniq.bw")[0] else null
-        File? all_unstranded = if (strandedness == "unstranded") then glob("*genome_all.bw")[0] else null
-        File? unique_plus = if (strandedness == "stranded") then glob("*genome_plusUniq.bw")[0] else null
-        File? unique_minus = if (strandedness == "stranded") then glob("*genome_minusUniq.bw")[0] else null
-        File? all_plus = if (strandedness == "stranded") then glob("*genome_plusAll.bw")[0] else null
-        File? all_minus = if (strandedness == "stranded") then glob("*genome_minusAll.bw")[0] else null
-        File python_log = glob("bam_to_signals.log")[0]
+        File? unique_unstranded = if (strandedness == "unstranded") then glob("*_genome_uniq.bw")[0] else null
+        File? all_unstranded = if (strandedness == "unstranded") then glob("*_genome_all.bw")[0] else null
+        File? unique_plus = if (strandedness == "stranded") then glob("*_genome_plusUniq.bw")[0] else null
+        File? unique_minus = if (strandedness == "stranded") then glob("*_genome_minusUniq.bw")[0] else null
+        File? all_plus = if (strandedness == "stranded") then glob("*_genome_plusAll.bw")[0] else null
+        File? all_minus = if (strandedness == "stranded") then glob("*_genome_minusAll.bw")[0] else null
+        File python_log = "bam_to_signals.log"
     }
 
     runtime {
         cpu: ncpus
-        memory: "${ramGB} GB"
+        memory: "~{ramGB} GB"
         disks : select_first([disks,"local-disk 100 SSD"])
     }
 }
 
 task rsem_quant {
-    File rsem_index
-    File anno_bam
-    String endedness
-    String read_strand
-    Int rnd_seed
-    Int ncpus
-    Int ramGB
-    String? disks
+    input {
+        File rsem_index
+        File anno_bam
+        String endedness
+        String read_strand
+        Int rnd_seed
+        Int ncpus
+        Int ramGB
+        String? disks
+    }
 
     command {
         python3 $(which rsem_quant.py) \
-            --rsem_index ${rsem_index} \
-            --anno_bam ${anno_bam} \
-            --endedness ${endedness} \
-            --read_strand ${read_strand} \
-            --rnd_seed ${rnd_seed} \
-            --ncpus ${ncpus} \
-            --ramGB ${ramGB}
+            --rsem_index ~{rsem_index} \
+            --anno_bam ~{anno_bam} \
+            --endedness ~{endedness} \
+            --read_strand ~{read_strand} \
+            --rnd_seed ~{rnd_seed} \
+            --ncpus ~{ncpus} \
+            --ramGB ~{ramGB}
     }
 
     output {
         File genes_results = glob("*.genes.results")[0]
         File isoforms_results = glob("*.isoforms.results")[0]
-        File python_log = glob("rsem_quant.log")[0]
+        File python_log = "rsem_quant.log"
         File number_of_genes = glob("*_number_of_genes_detected.json")[0]
     }
 
     runtime {
         cpu: ncpus
-        memory: "${ramGB} GB"
+        memory: "~{ramGB} GB"
         disks : select_first([disks,"local-disk 100 SSD"])
     }
 }
 
 task kallisto {
-    Array[File] fastqs_R1
-    Array[File] fastqs_R2
-    File kallisto_index
-    String endedness
-    String strandedness_direction
-    Int number_of_threads
-    Int ramGB
-    String out_prefix
-    Int? fragment_length
-    Float? sd_of_fragment_length
-    String? disks
+    input {
+        Array[File] fastqs_R1
+        Array[File] fastqs_R2
+        File kallisto_index
+        String endedness
+        String strandedness_direction
+        Int number_of_threads
+        Int ramGB
+        String out_prefix
+        Int? fragment_length
+        Float? sd_of_fragment_length
+        String? disks
+    }
 
     command {
         python3 $(which kallisto_quant.py) \
-            --fastqs_R1 ${sep=' ' fastqs_R1} \
-            --fastqs_R2 ${sep=' ' fastqs_R2} \
-            --number_of_threads ${number_of_threads} \
-            --strandedness ${strandedness_direction} \
-            --path_to_index ${kallisto_index} \
-            --endedness ${endedness} \
-            ${"--fragment_length " + fragment_length} \
-            ${"--sd_of_fragment_length " + sd_of_fragment_length} \
-            ${"--out_prefix " + out_prefix}
+            --fastqs_R1 ~{sep=' ' fastqs_R1} \
+            --fastqs_R2 ~{sep=' ' fastqs_R2} \
+            --number_of_threads ~{number_of_threads} \
+            --strandedness ~{strandedness_direction} \
+            --path_to_index ~{kallisto_index} \
+            --endedness ~{endedness} \
+            ~{"--fragment_length " + fragment_length} \
+            ~{"--sd_of_fragment_length " + sd_of_fragment_length} \
+            ~{"--out_prefix " + out_prefix}
     }
 
     output {
-        File quants = glob("kallisto_out/*_abundance.tsv")[0]
-        File python_log = glob("kallisto_quant.log")[0]
+        File quants = "kallisto_out/~{out_prefix}_abundance.tsv"
+        File python_log = "kallisto_quant.log"
     }
 
     runtime {
         cpu: number_of_threads
-        memory: "${ramGB} GB"
+        memory: "~{ramGB} GB"
         disks: select_first([disks, "local-disk 100 SSD"])
     }
 }
 
 task mad_qc {
-    File quants1
-    File quants2
-    String? disks
+    input {
+        File quants1
+        File quants2
+        String? disks
+    }
 
     command {
         python3 $(which mad_qc.py) \
-            --quants1 ${quants1} \
-            --quants2 ${quants2} \
+            --quants1 ~{quants1} \
+            --quants2 ~{quants2} \
             --MAD_R_path $(which MAD.R)
     }
 
     output {
         File madQCplot = glob("*_mad_plot.png")[0]
         File madQCmetrics = glob("*_mad_qc_metrics.json")[0]
-        File python_log = glob("mad_qc.log")[0]
+        File python_log = "mad_qc.log"
     }
 
     runtime {
@@ -321,21 +324,23 @@ task mad_qc {
 }
 
 task rna_qc {
-    File input_bam
-    File tr_id_to_gene_type_tsv
-    String output_filename
-    String? disks
+    input {
+        File input_bam
+        File tr_id_to_gene_type_tsv
+        String output_filename
+        String? disks
+    }
 
     command {
         python3 $(which rna_qc.py) \
-            --input_bam ${input_bam} \
-            --tr_id_to_gene_type_tsv ${tr_id_to_gene_type_tsv} \
-            --output_filename ${output_filename}
+            --input_bam ~{input_bam} \
+            --tr_id_to_gene_type_tsv ~{tr_id_to_gene_type_tsv} \
+            --output_filename ~{output_filename}
     }
 
     output {
-        File rnaQC = glob("*_qc.json")[0]
-        File python_log = glob("rna_qc.log")[0]
+        File rnaQC = output_filename
+        File python_log = "rna_qc.log"
     }
 
     runtime {
diff --git a/test/test_task/test_align.wdl b/test/test_task/test_align.wdl
index 14dbd1e..528392f 100644
--- a/test/test_task/test_align.wdl
+++ b/test/test_task/test_align.wdl
@@ -1,28 +1,31 @@
+version 1.0
+
 import "../../rna-seq-pipeline.wdl" as rna
 
 workflow test_align {
-    String endedness
-    Array[Array[File]] fastqs_R1
-    Array[Array[File]] fastqs_R2 = []
-    String bamroot
-    File align_index
-    Int align_ncpus
-    Int align_ramGB
-    String? align_disk
+    input {
+        String endedness
+        Array[Array[File]] fastqs_R1
+        Array[Array[File]] fastqs_R2 = []
+        String bamroot
+        File align_index
+        Int align_ncpus
+        Int align_ramGB
+        String? align_disk
+    }
 
     Array[Array[File]] fastqs_R2_ = if (endedness == "single") then fastqs_R1 else fastqs_R2
 
     scatter (i in range(length(fastqs_R1))) {
         call rna.align { input:
-            endedness = endedness,
-            fastqs_R1 = fastqs_R1[i],
-            fastqs_R2 = fastqs_R2_[i],
-            index = align_index,
-            bamroot = "rep"+(i+1)+bamroot,
-            ncpus = align_ncpus,
-            ramGB = align_ramGB,
-            disks = align_disk,
-
+            endedness=endedness,
+            fastqs_R1=fastqs_R1[i],
+            fastqs_R2=fastqs_R2_[i],
+            index=align_index,
+            bamroot="rep"+(i+1)+bamroot,
+            ncpus=align_ncpus,
+            ramGB=align_ramGB,
+            disks=align_disk,
         }
     }
 }
diff --git a/test/test_task/test_kallisto.wdl b/test/test_task/test_kallisto.wdl
deleted file mode 100644
index 4760d3f..0000000
--- a/test/test_task/test_kallisto.wdl
+++ /dev/null
@@ -1,33 +0,0 @@
-import "../../rna-seq-pipeline.wdl" as rna
-
-workflow test_kallisto {
-    Array[Array[File]] fastqs_R1
-    Array[Array[File]] fastqs_R2 = []
-    File kallisto_index
-    String endedness
-    String strandedness_direction
-    Int kallisto_number_of_threads
-    Int kallisto_ramGB
-    String out_prefix
-    String kallisto_disk
-    Int? kallisto_fragment_length
-    Float? kallisto_sd_of_fragment_length
-
-    Array[Array[File]] fastqs_R2_ = if (endedness == "single") then fastqs_R1 else fastqs_R2
-
-    scatter (i in range(length(fastqs_R1))) {
-        call rna.kallisto { input:
-            fastqs_R1 = fastqs_R1[i],
-            fastqs_R2 = fastqs_R2_[i],
-            endedness = endedness,
-            strandedness_direction = strandedness_direction,
-            kallisto_index = kallisto_index,
-            number_of_threads = kallisto_number_of_threads,
-            ramGB = kallisto_ramGB,
-            fragment_length = kallisto_fragment_length,
-            sd_of_fragment_length = kallisto_sd_of_fragment_length,
-            disks = kallisto_disk,
-            out_prefix = "rep"+(i+1)+out_prefix,
-        }
-    }
-}
diff --git a/test/test_task/test_kallisto_PE_input.json b/test/test_task/test_kallisto_PE_input.json
deleted file mode 100644
index 75b2069..0000000
--- a/test/test_task/test_kallisto_PE_input.json
+++ /dev/null
@@ -1,21 +0,0 @@
-{
-  "test_kallisto.endedness": "paired",
-  "test_kallisto.fastqs_R1": [
-    [
-      "test_data/ENCSR142YZV_chr19only_10000_reads_R1_part1.fastq.gz",
-      "test_data/ENCSR142YZV_chr19only_10000_reads_R1_part2.fastq.gz"
-    ]
-  ],
-  "test_kallisto.fastqs_R2": [
-    [
-      "test_data/ENCSR142YZV_chr19only_10000_reads_R2_part1.fastq.gz",
-      "test_data/ENCSR142YZV_chr19only_10000_reads_R2_part2.fastq.gz"
-    ]
-  ],
-  "test_kallisto.kallisto_disk": "local-disk 20 HDD",
-  "test_kallisto.kallisto_index": "test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx",
-  "test_kallisto.kallisto_number_of_threads": 2,
-  "test_kallisto.kallisto_ramGB": 4,
-  "test_kallisto.out_prefix": "PE_unstranded",
-  "test_kallisto.strandedness_direction": "unstranded"
-}
diff --git a/test/test_task/test_kallisto_PE_reference_md5.json b/test/test_task/test_kallisto_PE_reference_md5.json
deleted file mode 100644
index cc2e572..0000000
--- a/test/test_task/test_kallisto_PE_reference_md5.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
-  "rep1PE_unstranded_abundance.tsv": "2e319b57db723e97a1df81547854fed9"
-}
diff --git a/test/test_task/test_kallisto_SE_input.json b/test/test_task/test_kallisto_SE_input.json
deleted file mode 100644
index 33a9a59..0000000
--- a/test/test_task/test_kallisto_SE_input.json
+++ /dev/null
@@ -1,22 +0,0 @@
-{
-  "test_kallisto.endedness": "single",
-  "test_kallisto.fastqs_R1": [
-    [
-      "test_data/rep1_ENCSR510QZW_chr19only_10000_reads_part1.fastq.gz",
-      "test_data/rep1_ENCSR510QZW_chr19only_10000_reads_part2.fastq.gz"
-    ],
-    [
-      "test_data/rep2_ENCSR510QZW_chr19only_10000_reads_part1.fastq.gz",
-      "test_data/rep2_ENCSR510QZW_chr19only_10000_reads_part2.fastq.gz"
-    ]
-  ],
-  "test_kallisto.fastqs_R2": [],
-  "test_kallisto.kallisto_disk": "local-disk 20 HDD",
-  "test_kallisto.kallisto_fragment_length": 250,
-  "test_kallisto.kallisto_index": "test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx",
-  "test_kallisto.kallisto_number_of_threads": 2,
-  "test_kallisto.kallisto_ramGB": 4,
-  "test_kallisto.kallisto_sd_of_fragment_length": 10,
-  "test_kallisto.out_prefix": "kallisto_SE",
-  "test_kallisto.strandedness_direction": "unstranded"
-}
diff --git a/test/test_task/test_kallisto_SE_reference_md5.json b/test/test_task/test_kallisto_SE_reference_md5.json
deleted file mode 100644
index b5d13c3..0000000
--- a/test/test_task/test_kallisto_SE_reference_md5.json
+++ /dev/null
@@ -1,4 +0,0 @@
-{
-  "rep1kallisto_SE_abundance.tsv": "07de1950ee4bf5fabd4daa92c0ccd423",
-  "rep2kallisto_SE_abundance.tsv": "1efd28ac45dc29b609a01342554d2a9f"
-}
diff --git a/test/test_workflow/SE_unstranded_input.json b/test/test_workflow/SE_unstranded_input.json
index b380338..e0f9e1a 100644
--- a/test/test_workflow/SE_unstranded_input.json
+++ b/test/test_workflow/SE_unstranded_input.json
@@ -18,11 +18,17 @@
     ]
   ],
   "rna.kallisto_disk": "local-disk 20 HDD",
-  "rna.kallisto_fragment_length": 250,
+  "rna.kallisto_fragment_length": [
+    250,
+    250
+  ],
   "rna.kallisto_index": "test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx",
   "rna.kallisto_number_of_threads": 2,
   "rna.kallisto_ramGB": 4,
-  "rna.kallisto_sd_of_fragment_length": 10,
+  "rna.kallisto_sd_of_fragment_length": [
+    10,
+    10
+  ],
   "rna.mad_qc_disk": "local-disk 20 HDD",
   "rna.rna_qc_disk": "local-disk 20 HDD",
   "rna.rna_qc_tr_id_to_gene_type_tsv": "transcript_id_to_gene_type_mappings/gencodeV24pri-tRNAs-ERCC-phiX.transcript_id_to_genes.tsv",