From 8a29a9970226dc687597c4e7e8bac7fe314dac0f Mon Sep 17 00:00:00 2001 From: ottojolanki Date: Wed, 8 Jul 2020 04:54:31 -0700 Subject: [PATCH] v1.2.0 (#64) --- .circleci/config.yml | 90 +--- docs/howto.md | 16 +- docs/reference.md | 4 +- make_index_wdl/build_genome_index.wdl | 103 +++-- make_index_wdl/merge_anno.wdl | 66 +-- rna-seq-pipeline.wdl | 421 +++++++++--------- test/test_task/test_align.wdl | 37 +- test/test_task/test_kallisto.wdl | 33 -- test/test_task/test_kallisto_PE_input.json | 21 - .../test_kallisto_PE_reference_md5.json | 3 - test/test_task/test_kallisto_SE_input.json | 22 - .../test_kallisto_SE_reference_md5.json | 4 - test/test_workflow/SE_unstranded_input.json | 10 +- 13 files changed, 346 insertions(+), 484 deletions(-) delete mode 100644 test/test_task/test_kallisto.wdl delete mode 100644 test/test_task/test_kallisto_PE_input.json delete mode 100644 test/test_task/test_kallisto_PE_reference_md5.json delete mode 100644 test/test_task/test_kallisto_SE_input.json delete mode 100644 test/test_task/test_kallisto_SE_reference_md5.json diff --git a/.circleci/config.yml b/.circleci/config.yml index 29e1726..fa4dde3 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -28,12 +28,12 @@ make_tag: &make_tag get_star_index: &get_star_index name: get star index for test command: | - curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz + curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz get_kallisto_index: &get_kallisto_index name: get kallisto index for test command: | - curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx + curl 
https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx install_singularity: &install_singularity name: install singularity @@ -280,76 +280,6 @@ jobs: python3 -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data['match_overall']))" < test_align_SE_input.result.json no_output_timeout: 30m - test_PE_kallisto_docker: - <<: *machine_defaults - steps: - - checkout - - run: *make_tag - - run: *get_kallisto_index - - run: - command: | - pyenv global 3.5.2 - source ${BASH_ENV} - test/test_workflow/test.sh test/test_task/test_kallisto.wdl test/test_task/test_kallisto_PE_input.json $TAG docker - python3 src/compare_md5.py --keys_to_inspect test_kallisto.kallisto.quants --metadata_json test_kallisto_PE_input.metadata.json --reference_json test/test_task/test_kallisto_PE_reference_md5.json --outfile test_kallisto_PE_input.result.json - cat test_kallisto_PE_input.result.json - python3 -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data['match_overall']))" < test_kallisto_PE_input.result.json - no_output_timeout: 30m - - test_PE_kallisto_singularity: - <<: *machine_defaults - steps: - - checkout - - run: *make_tag - - run: sudo apt-get update - - singularity/install-go - - singularity/debian-install-3 - - run: *get_kallisto_index - - run: - command: | - pyenv global 3.5.2 - source ${BASH_ENV} - test/test_workflow/test.sh test/test_task/test_kallisto.wdl test/test_task/test_kallisto_PE_input.json $TAG singularity - python3 src/compare_md5.py --keys_to_inspect test_kallisto.kallisto.quants --metadata_json test_kallisto_PE_input.metadata.json --reference_json test/test_task/test_kallisto_PE_reference_md5.json --outfile test_kallisto_PE_input.result.json - cat test_kallisto_PE_input.result.json - python3 -c "import sys; import json; data=json.loads(sys.stdin.read()); 
sys.exit(int(not data['match_overall']))" < test_kallisto_PE_input.result.json - no_output_timeout: 30m - - test_SE_kallisto_docker: - <<: *machine_defaults - steps: - - checkout - - run: *make_tag - - run: *get_kallisto_index - - run: - command: | - pyenv global 3.5.2 - source ${BASH_ENV} - test/test_workflow/test.sh test/test_task/test_kallisto.wdl test/test_task/test_kallisto_SE_input.json $TAG docker - python3 src/compare_md5.py --keys_to_inspect test_kallisto.kallisto.quants --metadata_json test_kallisto_SE_input.metadata.json --reference_json test/test_task/test_kallisto_SE_reference_md5.json --outfile test_kallisto_SE_input.result.json - cat test_kallisto_SE_input.result.json - python3 -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data['match_overall']))" < test_kallisto_SE_input.result.json - no_output_timeout: 30m - - test_SE_kallisto_singularity: - <<: *machine_defaults - steps: - - checkout - - run: *make_tag - - run: sudo apt-get update - - singularity/install-go - - singularity/debian-install-3 - - run: *get_kallisto_index - - run: - command: | - pyenv global 3.5.2 - source ${BASH_ENV} - test/test_workflow/test.sh test/test_task/test_kallisto.wdl test/test_task/test_kallisto_SE_input.json $TAG singularity - python3 src/compare_md5.py --keys_to_inspect test_kallisto.kallisto.quants --metadata_json test_kallisto_SE_input.metadata.json --reference_json test/test_task/test_kallisto_SE_reference_md5.json --outfile test_kallisto_SE_input.result.json - cat test_kallisto_SE_input.result.json - python3 -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data['match_overall']))" < test_kallisto_SE_input.result.json - no_output_timeout: 30m - # Workflow workflows: build_workflow: @@ -388,18 +318,6 @@ workflows: - test_SE_align_singularity: requires: - build - - test_PE_kallisto_docker: - requires: - - build - - test_PE_kallisto_singularity: - requires: - - build - - test_SE_kallisto_docker: - 
requires: - - build - - test_SE_kallisto_singularity: - requires: - - build - push_template: requires: - unittests @@ -413,7 +331,3 @@ workflows: - test_PE_align_singularity - test_SE_align_docker - test_SE_align_singularity - - test_PE_kallisto_docker - - test_PE_kallisto_singularity - - test_SE_kallisto_docker - - test_SE_kallisto_singularity diff --git a/docs/howto.md b/docs/howto.md index ebce468..43a5418 100644 --- a/docs/howto.md +++ b/docs/howto.md @@ -31,8 +31,8 @@ Make sure you have completed the steps for installation and Google Cloud setup d 2. Get STAR and kallisto index files: ```bash - $ curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz - $ curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx + $ curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz + $ curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx ``` 3. Copy indexes and input data into the cloud: @@ -110,8 +110,8 @@ The goal is to run a single-end, non-strand-specific experiment on a local compu 2. 
Get STAR and kallisto index files: ```bash - $ curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz - $ curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx + $ curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz + $ curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx ``` The other data that is required to complete this recipe is included in the repository within test_data directory. @@ -137,8 +137,8 @@ The other data that is required to complete this recipe is included in the repos "rna.rsem_ramGB" : 4, "rna.kallisto_number_of_threads" : 2, "rna.kallisto_ramGB" : 4, - "rna.kallisto_fragment_length" : 250, - "rna.kallisto_sd_of_fragment_length" : 10, + "rna.kallisto_fragment_length" : [250], + "rna.kallisto_sd_of_fragment_length" : [10], "rna.rna_qc_tr_id_to_gene_type_tsv" : "[PATH_TO_REPO]/rna-seq-pipeline/transcript_id_to_gene_type_mappings/gencodeV24pri-tRNAs-ERCC-phiX.transcript_id_to_genes.tsv", "rna.bam_to_signals_ncpus" : 1, "rna.bam_to_signals_ramGB" : 2, @@ -185,8 +185,8 @@ The goal is to run a single-end non-strand-specific experiment locally using sin 3. 
Get STAR and kallisto index files: ```bash - $ curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz - $ curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx + $ curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz + $ curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx ``` 4. Run the pipeline using caper: diff --git a/docs/reference.md b/docs/reference.md index 6303419..e519337 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -149,9 +149,9 @@ Assume you want to allocate 100 gigabytes of spinning hard drive. In this case y Kallisto quantifier makes use of average fragment lenghts and standard deviations of those lengths. In the case of paired end experiments, those values can be calculated from the data, but in case of single-ended experiment those values must be provided. -* `rna.kallisto_fragment_length` Is the average fragment length. Required only if `rna.run_kallisto` is `true` (Default is `true`). +* `rna.kallisto_fragment_length` Is an array of average fragment lengths as integers, one per replicate. Required only if `rna.run_kallisto` is `true` (Default is `true`). . -* `rna.kallisto_sd_of_fragment_length` Is the standard deviation of the fragment lengths. Required only if `rna.run_kallisto` is `true` (Default is `true`). +* `rna.kallisto_sd_of_fragment_length` Is an array of standard deviations of the fragment lengths as floats, one per replicate. 
Required only if `rna.run_kallisto` is `true` (Default is `true`). If you do not have this data available, or if you for some other reason want to omit running kallisto you can use the following parameter: diff --git a/make_index_wdl/build_genome_index.wdl b/make_index_wdl/build_genome_index.wdl index 98226cf..0432d72 100644 --- a/make_index_wdl/build_genome_index.wdl +++ b/make_index_wdl/build_genome_index.wdl @@ -1,61 +1,70 @@ +version 1.0 + # ENCODE DCC RNA-Seq pipeline build_genome_index -# Maintainer: Otto Jolanki -#CAPER docker quay.io/encode-dcc/rna-seq-pipeline:v1.1 -#CAPER singularity docker://quay.io/encode-dcc/rna-seq-pipeline:v1.1 +#CAPER docker quay.io/encode-dcc/rna-seq-pipeline:v1.2.0 +#CAPER singularity docker://quay.io/encode-dcc/rna-seq-pipeline:v1.2.0 workflow build_index { - # Inputs - # reference genome or transcriptome (in prep_kallisto mode)in gzipped fasta - File reference_sequence - # spikeins in gzipped fasta - File? spikeins - # annotation in gzipped gtf - File? annotation - # annotation version (e.g 'v24') - String? anno_version - # genome (e.g 'GRCh38') - String? genome - # Flavor of the index that gets built - # available options: - # prep_rsem, prep_srna, prep_star, prep_kallisto - String index_type - Int ncpu = 8 - Int? memGB - String? disks + meta { + author: "Otto Jolanki" + version: "v1.2.0" + } + + input { + # reference genome or transcriptome (in prep_kallisto mode)in gzipped fasta + File reference_sequence + # spikeins in gzipped fasta + File? spikeins + # annotation in gzipped gtf + File? annotation + # annotation version (e.g 'v24') + String? anno_version + # genome (e.g 'GRCh38') + String? genome + # Flavor of the index that gets built + # available options: + # prep_rsem, prep_srna, prep_star, prep_kallisto + String index_type + Int ncpu = 8 + Int? memGB + String? 
disks + } call make_index { input: - reference_sequence = reference_sequence, - spikeins = spikeins, - annotation = annotation, - anno_version = anno_version, - genome = genome, - index_type = index_type, - ncpu = ncpu, - memGB = memGB, - disks = disks, + reference_sequence=reference_sequence, + spikeins=spikeins, + annotation=annotation, + anno_version=anno_version, + genome=genome, + index_type=index_type, + ncpu=ncpu, + memGB=memGB, + disks=disks, } } task make_index { - File reference_sequence - File? spikeins - File? annotation - String? anno_version - String? genome - String index_type - Int ncpu - Int? memGB - String? disks + input { + File reference_sequence + File? spikeins + File? annotation + String? anno_version + String? genome + String index_type + Int ncpu + Int? memGB + String? disks + } command { - $(which ${index_type + ".sh"}) \ - ${reference_sequence} \ - ${spikeins} \ - ${annotation} \ - ${anno_version} \ - ${genome} \ - ${ncpu} + $(which ~{index_type + ".sh"}) \ + ~{reference_sequence} \ + ~{spikeins} \ + ~{annotation} \ + ~{anno_version} \ + ~{genome} \ + ~{ncpu} } output { @@ -64,7 +73,7 @@ task make_index { runtime { cpu : ncpu - memory : "${select_first([memGB,'8'])} GB" + memory : "~{select_first([memGB,'8'])} GB" disks : select_first([disks,"local-disk 100 SSD"]) } } diff --git a/make_index_wdl/merge_anno.wdl b/make_index_wdl/merge_anno.wdl index d74b0e8..c593fdd 100644 --- a/make_index_wdl/merge_anno.wdl +++ b/make_index_wdl/merge_anno.wdl @@ -1,50 +1,58 @@ +version 1.0 + #ENCODE DCC RNA-Seq pipeline merge-annotation -#Maintainer: Otto Jolanki -#CAPER docker quay.io/encode-dcc/rna-seq-pipeline:v1.1 -#CAPER singularity docker://quay.io/encode-dcc/rna-seq-pipeline:v1.1 +#CAPER docker quay.io/encode-dcc/rna-seq-pipeline:v1.2.0 +#CAPER singularity docker://quay.io/encode-dcc/rna-seq-pipeline:v1.2.0 workflow merge_anno { - # input filenames - File annotation - File tRNA - File spikeins - # output filename - String output_filename - Int? 
cpu - Int? memGB - String? disks + meta { + author: "Otto Jolanki" + version: "v1.2.0" + } + + input { + File annotation + File tRNA + File spikeins + String output_filename + Int? cpu + Int? memGB + String? disks + } call merge_annotation { input : - annotation = annotation, - tRNA = tRNA, - spikeins = spikeins, - output_filename = output_filename, + annotation=annotation, + tRNA=tRNA, + spikeins=spikeins, + output_filename=output_filename, } } task merge_annotation { - File annotation - File tRNA - File spikeins - String output_filename - Int? cpu - Int? memGB - String? disks + input { + File annotation + File tRNA + File spikeins + String output_filename + Int? cpu + Int? memGB + String? disks + } command { python3 $(which merge_annotation.py) \ - ${"--annotation " + annotation} \ - ${"--tRNA " + tRNA} \ - ${"--spikeins " + spikeins} \ - ${"--output_filename " + output_filename} + ~{"--annotation " + annotation} \ + ~{"--tRNA " + tRNA} \ + ~{"--spikeins " + spikeins} \ + ~{"--output_filename " + output_filename} } output { - File merged_annotation = glob("${output_filename}")[0] + File merged_annotation = output_filename } runtime { cpu : select_first([cpu,2]) - memory : "${select_first([memGB,'8'])} GB" + memory : "~{select_first([memGB,'8'])} GB" disks : select_first([disks,"local-disk 100 SSD"]) } } diff --git a/rna-seq-pipeline.wdl b/rna-seq-pipeline.wdl index ef384b0..c80d94d 100644 --- a/rna-seq-pipeline.wdl +++ b/rna-seq-pipeline.wdl @@ -1,125 +1,119 @@ +version 1.0 + # ENCODE DCC RNA-seq pipeline -# Maintainer: Otto Jolanki -#CAPER docker quay.io/encode-dcc/rna-seq-pipeline:v1.1 -#CAPER singularity docker://quay.io/encode-dcc/rna-seq-pipeline:v1.1 +#CAPER docker quay.io/encode-dcc/rna-seq-pipeline:v1.2.0 +#CAPER singularity docker://quay.io/encode-dcc/rna-seq-pipeline:v1.2.0 #CROO out_def https://storage.googleapis.com/encode-pipeline-output-definition/bulkrna.output_definition.json workflow rna { - # endedness: paired or single - String endedness - # 
fastqs_R1: fastq.gz files for Read1 (only these if single-ended) - Array[Array[File]] fastqs_R1 - # fastqs_R2: fastq.gz files for Read2 (omit if single-ended) in order - # corresponding to fastqs_R1 - Array[Array[File]] fastqs_R2 = [] - # bamroot: root name for output bams. For example foo_bar will - # create foo_bar_genome.bam and foo_bar_anno.bam - String bamroot - # strandedness: is the library strand specific (stranded or unstranded) - String strandedness - # strandedness_direction (forward, reverse, unstranded) - String strandedness_direction - # chrom_sizes: chromosome sizes file - File chrom_sizes - # Switch to false to not run kallisto - Boolean run_kallisto = true - ## task level variables that are defined globally to make them visible to DNANexus UI - - # ALIGN - # index: aligner index archive (tar.gz) - File align_index - Int align_ncpus - Int align_ramGB - String? align_disk - - # KALLISTO - - Int? kallisto_number_of_threads - Int? kallisto_ramGB - File? kallisto_index - Int? kallisto_fragment_length - Float? kallisto_sd_of_fragment_length - String? kallisto_disk - - # BAM_TO_SIGNALS - - Int bam_to_signals_ncpus - Int bam_to_signals_ramGB - String? bam_to_signals_disk - - # RSEM_QUANT - - # rsem_index: RSEM index archive (tar.gz) - File rsem_index - # rnd_seed: random seed used for rsem - Int rnd_seed = 12345 - Int rsem_ncpus - Int rsem_ramGB - String? rsem_disk - - # RNA_QC - - File rna_qc_tr_id_to_gene_type_tsv - String? rna_qc_disk - - # MAD_QC - - String? mad_qc_disk - - ## WORKFLOW BEGINS + meta { + author: "Otto Jolanki" + version: "v1.2.0" + } + + input { + # endedness: paired or single + String endedness + # fastqs_R1: fastq.gz files for Read1 (only these if single-ended) + Array[Array[File]] fastqs_R1 + # fastqs_R2: fastq.gz files for Read2 (omit if single-ended) in order + # corresponding to fastqs_R1 + Array[Array[File]] fastqs_R2 = [] + # bamroot: root name for output bams. 
For example foo_bar will + # create foo_bar_genome.bam and foo_bar_anno.bam + String bamroot + # strandedness: is the library strand specific (stranded or unstranded) + String strandedness + # strandedness_direction (forward, reverse, unstranded) + String strandedness_direction + # chrom_sizes: chromosome sizes file + File chrom_sizes + # Switch to false to not run kallisto + Boolean run_kallisto = true + # index: aligner index archive (tar.gz) + File align_index + Int align_ncpus + Int align_ramGB + String? align_disk + Int? kallisto_number_of_threads + Int? kallisto_ramGB + File? kallisto_index + Array[Int] kallisto_fragment_length = [] + Array[Float] kallisto_sd_of_fragment_length = [] + String? kallisto_disk + Int bam_to_signals_ncpus + Int bam_to_signals_ramGB + String? bam_to_signals_disk + # rsem_index: RSEM index archive (tar.gz) + File rsem_index + # rnd_seed: random seed used for rsem + Int rnd_seed = 12345 + Int rsem_ncpus + Int rsem_ramGB + String? rsem_disk + File rna_qc_tr_id_to_gene_type_tsv + String? mad_qc_disk + String? rna_qc_disk + + # These are for internal use, leave undefined + Int? kallisto_fragment_length_undefined + Float? 
kallisto_sd_undefined + } # dummy variable value for the single-ended case Array[Array[File]] fastqs_R2_ = if (endedness == "single") then fastqs_R1 else fastqs_R2 scatter (i in range(length(fastqs_R1))) { call align { input: - endedness = endedness, - fastqs_R1 = fastqs_R1[i], - fastqs_R2 = fastqs_R2_[i], - index = align_index, - bamroot = "rep"+(i+1)+bamroot, - ncpus = align_ncpus, - ramGB = align_ramGB, - disks = align_disk, + endedness=endedness, + fastqs_R1=fastqs_R1[i], + fastqs_R2=fastqs_R2_[i], + index=align_index, + bamroot="rep"+(i+1)+bamroot, + ncpus=align_ncpus, + ramGB=align_ramGB, + disks=align_disk, } call bam_to_signals { input: - input_bam = align.genomebam, - chrom_sizes = chrom_sizes, - strandedness = strandedness, - bamroot = "rep"+(i+1)+bamroot+"_genome", - ncpus = bam_to_signals_ncpus, - ramGB = bam_to_signals_ramGB, - disks = bam_to_signals_disk, + input_bam=align.genomebam, + chrom_sizes=chrom_sizes, + strandedness=strandedness, + bamroot="rep"+(i+1)+bamroot+"_genome", + ncpus=bam_to_signals_ncpus, + ramGB=bam_to_signals_ramGB, + disks=bam_to_signals_disk, } call rsem_quant { input: - rsem_index = rsem_index, - rnd_seed = rnd_seed, - anno_bam = align.annobam, - endedness = endedness, - read_strand = strandedness_direction, - ncpus = rsem_ncpus, - ramGB = rsem_ramGB, - disks = rsem_disk, + rsem_index=rsem_index, + rnd_seed=rnd_seed, + anno_bam=align.annobam, + endedness=endedness, + read_strand=strandedness_direction, + ncpus=rsem_ncpus, + ramGB=rsem_ramGB, + disks=rsem_disk, } } if (run_kallisto) { scatter (i in range(length(fastqs_R1))) { + Float? kallisto_sd = if (length(kallisto_sd_of_fragment_length) > 0) then kallisto_sd_of_fragment_length[i] else kallisto_sd_undefined + Int? 
kallisto_fl = if (length(kallisto_fragment_length) > 0) then kallisto_fragment_length[i] else kallisto_fragment_length_undefined call kallisto { input: - fastqs_R1 = fastqs_R1[i], - fastqs_R2 = fastqs_R2_[i], - endedness = endedness, - strandedness_direction = strandedness_direction, - kallisto_index = kallisto_index, - number_of_threads = kallisto_number_of_threads, - ramGB = kallisto_ramGB, - fragment_length = kallisto_fragment_length, - sd_of_fragment_length = kallisto_sd_of_fragment_length, - disks = kallisto_disk, - out_prefix = "rep"+(i+1)+bamroot, + fastqs_R1=fastqs_R1[i], + fastqs_R2=fastqs_R2_[i], + endedness=endedness, + strandedness_direction=strandedness_direction, + kallisto_index=select_first([kallisto_index]), + number_of_threads=select_first([kallisto_number_of_threads]), + ramGB=select_first([kallisto_ramGB]), + fragment_length=kallisto_fl, + sd_of_fragment_length=kallisto_sd, + disks=kallisto_disk, + out_prefix="rep"+(i+1)+bamroot, } } } @@ -129,188 +123,197 @@ workflow rna { # if there are exactly two replicates, calculate the madQC metrics and draw a plot if (length(fastqs_R1) == 2) { call mad_qc { input: - quants1 = rsem_quant.genes_results[0], - quants2 = rsem_quant.genes_results[1], - disks = mad_qc_disk, + quants1=rsem_quant.genes_results[0], + quants2=rsem_quant.genes_results[1], + disks=mad_qc_disk, } } scatter (i in range(length(align.annobam))) { call rna_qc { input: - input_bam = align.annobam[i], - tr_id_to_gene_type_tsv = rna_qc_tr_id_to_gene_type_tsv, - output_filename = "rep"+(i+1)+bamroot+"_qc.json", - disks = rna_qc_disk, + input_bam=align.annobam[i], + tr_id_to_gene_type_tsv=rna_qc_tr_id_to_gene_type_tsv, + output_filename="rep"+(i+1)+bamroot+"_qc.json", + disks=rna_qc_disk, } } } -## tasks + task align { - Array[File] fastqs_R1 - Array[File] fastqs_R2 - String endedness - File index - String bamroot - Int ncpus - Int ramGB - String? 
disks + input { + Array[File] fastqs_R1 + Array[File] fastqs_R2 + String endedness + File index + String bamroot + Int ncpus + Int ramGB + String? disks + } command { python3 $(which align.py) \ - --fastqs_R1 ${sep=' ' fastqs_R1} \ - --fastqs_R2 ${sep=' ' fastqs_R2} \ - --endedness ${endedness} \ - --index ${index} \ - ${"--bamroot " + bamroot} \ - ${"--ncpus " + ncpus} \ - ${"--ramGB " + ramGB} + --fastqs_R1 ~{sep=' ' fastqs_R1} \ + --fastqs_R2 ~{sep=' ' fastqs_R2} \ + --endedness ~{endedness} \ + --index ~{index} \ + ~{"--bamroot " + bamroot} \ + ~{"--ncpus " + ncpus} \ + ~{"--ramGB " + ramGB} } output { - File genomebam = glob("*_genome.bam")[0] - File annobam = glob("*_anno.bam")[0] - File genome_flagstat = glob("*_genome_flagstat.txt")[0] - File anno_flagstat = glob("*_anno_flagstat.txt")[0] - File log = glob("*_Log.final.out")[0] - File genome_flagstat_json = glob("*_genome_flagstat.json")[0] - File anno_flagstat_json = glob("*_anno_flagstat.json")[0] - File log_json = glob("*_Log.final.json")[0] - File python_log = glob("align.log")[0] + File genomebam = "~{bamroot}_genome.bam" + File annobam = "~{bamroot}_anno.bam" + File genome_flagstat = "~{bamroot}_genome_flagstat.txt" + File anno_flagstat = "~{bamroot}_anno_flagstat.txt" + File log = "~{bamroot}_Log.final.out" + File genome_flagstat_json = "~{bamroot}_genome_flagstat.json" + File anno_flagstat_json = "~{bamroot}_anno_flagstat.json" + File log_json = "~{bamroot}_Log.final.json" + File python_log = "align.log" } runtime { cpu: ncpus - memory: "${ramGB} GB" + memory: "~{ramGB} GB" disks : select_first([disks,"local-disk 100 SSD"]) } } task bam_to_signals { - File? null - File input_bam - File chrom_sizes - String strandedness - String bamroot - Int ncpus - Int ramGB - String? disks - + input { + File? null + File input_bam + File chrom_sizes + String strandedness + String bamroot + Int ncpus + Int ramGB + String? 
disks + } command { python3 $(which bam_to_signals.py) \ - --bamfile ${input_bam} \ - --chrom_sizes ${chrom_sizes} \ - --strandedness ${strandedness} \ - --bamroot ${bamroot} + --bamfile ~{input_bam} \ + --chrom_sizes ~{chrom_sizes} \ + --strandedness ~{strandedness} \ + --bamroot ~{bamroot} } output { - File? unique_unstranded = if (strandedness == "unstranded") then glob("*genome_uniq.bw")[0] else null - File? all_unstranded = if (strandedness == "unstranded") then glob("*genome_all.bw")[0] else null - File? unique_plus = if (strandedness == "stranded") then glob("*genome_plusUniq.bw")[0] else null - File? unique_minus = if (strandedness == "stranded") then glob("*genome_minusUniq.bw")[0] else null - File? all_plus = if (strandedness == "stranded") then glob("*genome_plusAll.bw")[0] else null - File? all_minus = if (strandedness == "stranded") then glob("*genome_minusAll.bw")[0] else null - File python_log = glob("bam_to_signals.log")[0] + File? unique_unstranded = if (strandedness == "unstranded") then glob("*_genome_uniq.bw")[0] else null + File? all_unstranded = if (strandedness == "unstranded") then glob("*_genome_all.bw")[0] else null + File? unique_plus = if (strandedness == "stranded") then glob("*_genome_plusUniq.bw")[0] else null + File? unique_minus = if (strandedness == "stranded") then glob("*_genome_minusUniq.bw")[0] else null + File? all_plus = if (strandedness == "stranded") then glob("*_genome_plusAll.bw")[0] else null + File? all_minus = if (strandedness == "stranded") then glob("*_genome_minusAll.bw")[0] else null + File python_log = "bam_to_signals.log" } runtime { cpu: ncpus - memory: "${ramGB} GB" + memory: "~{ramGB} GB" disks : select_first([disks,"local-disk 100 SSD"]) } } task rsem_quant { - File rsem_index - File anno_bam - String endedness - String read_strand - Int rnd_seed - Int ncpus - Int ramGB - String? 
disks + input { + File rsem_index + File anno_bam + String endedness + String read_strand + Int rnd_seed + Int ncpus + Int ramGB + String? disks + } command { python3 $(which rsem_quant.py) \ - --rsem_index ${rsem_index} \ - --anno_bam ${anno_bam} \ - --endedness ${endedness} \ - --read_strand ${read_strand} \ - --rnd_seed ${rnd_seed} \ - --ncpus ${ncpus} \ - --ramGB ${ramGB} + --rsem_index ~{rsem_index} \ + --anno_bam ~{anno_bam} \ + --endedness ~{endedness} \ + --read_strand ~{read_strand} \ + --rnd_seed ~{rnd_seed} \ + --ncpus ~{ncpus} \ + --ramGB ~{ramGB} } output { File genes_results = glob("*.genes.results")[0] File isoforms_results = glob("*.isoforms.results")[0] - File python_log = glob("rsem_quant.log")[0] + File python_log = "rsem_quant.log" File number_of_genes = glob("*_number_of_genes_detected.json")[0] } runtime { cpu: ncpus - memory: "${ramGB} GB" + memory: "~{ramGB} GB" disks : select_first([disks,"local-disk 100 SSD"]) } } task kallisto { - Array[File] fastqs_R1 - Array[File] fastqs_R2 - File kallisto_index - String endedness - String strandedness_direction - Int number_of_threads - Int ramGB - String out_prefix - Int? fragment_length - Float? sd_of_fragment_length - String? disks + input { + Array[File] fastqs_R1 + Array[File] fastqs_R2 + File kallisto_index + String endedness + String strandedness_direction + Int number_of_threads + Int ramGB + String out_prefix + Int? fragment_length + Float? sd_of_fragment_length + String? 
disks + } command { python3 $(which kallisto_quant.py) \ - --fastqs_R1 ${sep=' ' fastqs_R1} \ - --fastqs_R2 ${sep=' ' fastqs_R2} \ - --number_of_threads ${number_of_threads} \ - --strandedness ${strandedness_direction} \ - --path_to_index ${kallisto_index} \ - --endedness ${endedness} \ - ${"--fragment_length " + fragment_length} \ - ${"--sd_of_fragment_length " + sd_of_fragment_length} \ - ${"--out_prefix " + out_prefix} + --fastqs_R1 ~{sep=' ' fastqs_R1} \ + --fastqs_R2 ~{sep=' ' fastqs_R2} \ + --number_of_threads ~{number_of_threads} \ + --strandedness ~{strandedness_direction} \ + --path_to_index ~{kallisto_index} \ + --endedness ~{endedness} \ + ~{"--fragment_length " + fragment_length} \ + ~{"--sd_of_fragment_length " + sd_of_fragment_length} \ + ~{"--out_prefix " + out_prefix} } output { - File quants = glob("kallisto_out/*_abundance.tsv")[0] - File python_log = glob("kallisto_quant.log")[0] + File quants = "kallisto_out/~{out_prefix}_abundance.tsv" + File python_log = "kallisto_quant.log" } runtime { cpu: number_of_threads - memory: "${ramGB} GB" + memory: "~{ramGB} GB" disks: select_first([disks, "local-disk 100 SSD"]) } } task mad_qc { - File quants1 - File quants2 - String? disks + input { + File quants1 + File quants2 + String? disks + } command { python3 $(which mad_qc.py) \ - --quants1 ${quants1} \ - --quants2 ${quants2} \ + --quants1 ~{quants1} \ + --quants2 ~{quants2} \ --MAD_R_path $(which MAD.R) } output { File madQCplot = glob("*_mad_plot.png")[0] File madQCmetrics = glob("*_mad_qc_metrics.json")[0] - File python_log = glob("mad_qc.log")[0] + File python_log = "mad_qc.log" } runtime { @@ -321,21 +324,23 @@ task mad_qc { } task rna_qc { - File input_bam - File tr_id_to_gene_type_tsv - String output_filename - String? disks + input { + File input_bam + File tr_id_to_gene_type_tsv + String output_filename + String? 
disks + } command { python3 $(which rna_qc.py) \ - --input_bam ${input_bam} \ - --tr_id_to_gene_type_tsv ${tr_id_to_gene_type_tsv} \ - --output_filename ${output_filename} + --input_bam ~{input_bam} \ + --tr_id_to_gene_type_tsv ~{tr_id_to_gene_type_tsv} \ + --output_filename ~{output_filename} } output { - File rnaQC = glob("*_qc.json")[0] - File python_log = glob("rna_qc.log")[0] + File rnaQC = output_filename + File python_log = "rna_qc.log" } runtime { diff --git a/test/test_task/test_align.wdl b/test/test_task/test_align.wdl index 14dbd1e..528392f 100644 --- a/test/test_task/test_align.wdl +++ b/test/test_task/test_align.wdl @@ -1,28 +1,31 @@ +version 1.0 + import "../../rna-seq-pipeline.wdl" as rna workflow test_align { - String endedness - Array[Array[File]] fastqs_R1 - Array[Array[File]] fastqs_R2 = [] - String bamroot - File align_index - Int align_ncpus - Int align_ramGB - String? align_disk + input { + String endedness + Array[Array[File]] fastqs_R1 + Array[Array[File]] fastqs_R2 = [] + String bamroot + File align_index + Int align_ncpus + Int align_ramGB + String? 
align_disk + } Array[Array[File]] fastqs_R2_ = if (endedness == "single") then fastqs_R1 else fastqs_R2 scatter (i in range(length(fastqs_R1))) { call rna.align { input: - endedness = endedness, - fastqs_R1 = fastqs_R1[i], - fastqs_R2 = fastqs_R2_[i], - index = align_index, - bamroot = "rep"+(i+1)+bamroot, - ncpus = align_ncpus, - ramGB = align_ramGB, - disks = align_disk, - + endedness=endedness, + fastqs_R1=fastqs_R1[i], + fastqs_R2=fastqs_R2_[i], + index=align_index, + bamroot="rep"+(i+1)+bamroot, + ncpus=align_ncpus, + ramGB=align_ramGB, + disks=align_disk, } } } diff --git a/test/test_task/test_kallisto.wdl b/test/test_task/test_kallisto.wdl deleted file mode 100644 index 4760d3f..0000000 --- a/test/test_task/test_kallisto.wdl +++ /dev/null @@ -1,33 +0,0 @@ -import "../../rna-seq-pipeline.wdl" as rna - -workflow test_kallisto { - Array[Array[File]] fastqs_R1 - Array[Array[File]] fastqs_R2 = [] - File kallisto_index - String endedness - String strandedness_direction - Int kallisto_number_of_threads - Int kallisto_ramGB - String out_prefix - String kallisto_disk - Int? kallisto_fragment_length - Float? 
kallisto_sd_of_fragment_length - - Array[Array[File]] fastqs_R2_ = if (endedness == "single") then fastqs_R1 else fastqs_R2 - - scatter (i in range(length(fastqs_R1))) { - call rna.kallisto { input: - fastqs_R1 = fastqs_R1[i], - fastqs_R2 = fastqs_R2_[i], - endedness = endedness, - strandedness_direction = strandedness_direction, - kallisto_index = kallisto_index, - number_of_threads = kallisto_number_of_threads, - ramGB = kallisto_ramGB, - fragment_length = kallisto_fragment_length, - sd_of_fragment_length = kallisto_sd_of_fragment_length, - disks = kallisto_disk, - out_prefix = "rep"+(i+1)+out_prefix, - } - } -} diff --git a/test/test_task/test_kallisto_PE_input.json b/test/test_task/test_kallisto_PE_input.json deleted file mode 100644 index 75b2069..0000000 --- a/test/test_task/test_kallisto_PE_input.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "test_kallisto.endedness": "paired", - "test_kallisto.fastqs_R1": [ - [ - "test_data/ENCSR142YZV_chr19only_10000_reads_R1_part1.fastq.gz", - "test_data/ENCSR142YZV_chr19only_10000_reads_R1_part2.fastq.gz" - ] - ], - "test_kallisto.fastqs_R2": [ - [ - "test_data/ENCSR142YZV_chr19only_10000_reads_R2_part1.fastq.gz", - "test_data/ENCSR142YZV_chr19only_10000_reads_R2_part2.fastq.gz" - ] - ], - "test_kallisto.kallisto_disk": "local-disk 20 HDD", - "test_kallisto.kallisto_index": "test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx", - "test_kallisto.kallisto_number_of_threads": 2, - "test_kallisto.kallisto_ramGB": 4, - "test_kallisto.out_prefix": "PE_unstranded", - "test_kallisto.strandedness_direction": "unstranded" -} diff --git a/test/test_task/test_kallisto_PE_reference_md5.json b/test/test_task/test_kallisto_PE_reference_md5.json deleted file mode 100644 index cc2e572..0000000 --- a/test/test_task/test_kallisto_PE_reference_md5.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "rep1PE_unstranded_abundance.tsv": "2e319b57db723e97a1df81547854fed9" -} diff --git a/test/test_task/test_kallisto_SE_input.json 
b/test/test_task/test_kallisto_SE_input.json deleted file mode 100644 index 33a9a59..0000000 --- a/test/test_task/test_kallisto_SE_input.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "test_kallisto.endedness": "single", - "test_kallisto.fastqs_R1": [ - [ - "test_data/rep1_ENCSR510QZW_chr19only_10000_reads_part1.fastq.gz", - "test_data/rep1_ENCSR510QZW_chr19only_10000_reads_part2.fastq.gz" - ], - [ - "test_data/rep2_ENCSR510QZW_chr19only_10000_reads_part1.fastq.gz", - "test_data/rep2_ENCSR510QZW_chr19only_10000_reads_part2.fastq.gz" - ] - ], - "test_kallisto.fastqs_R2": [], - "test_kallisto.kallisto_disk": "local-disk 20 HDD", - "test_kallisto.kallisto_fragment_length": 250, - "test_kallisto.kallisto_index": "test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx", - "test_kallisto.kallisto_number_of_threads": 2, - "test_kallisto.kallisto_ramGB": 4, - "test_kallisto.kallisto_sd_of_fragment_length": 10, - "test_kallisto.out_prefix": "kallisto_SE", - "test_kallisto.strandedness_direction": "unstranded" -} diff --git a/test/test_task/test_kallisto_SE_reference_md5.json b/test/test_task/test_kallisto_SE_reference_md5.json deleted file mode 100644 index b5d13c3..0000000 --- a/test/test_task/test_kallisto_SE_reference_md5.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "rep1kallisto_SE_abundance.tsv": "07de1950ee4bf5fabd4daa92c0ccd423", - "rep2kallisto_SE_abundance.tsv": "1efd28ac45dc29b609a01342554d2a9f" -} diff --git a/test/test_workflow/SE_unstranded_input.json b/test/test_workflow/SE_unstranded_input.json index b380338..e0f9e1a 100644 --- a/test/test_workflow/SE_unstranded_input.json +++ b/test/test_workflow/SE_unstranded_input.json @@ -18,11 +18,17 @@ ] ], "rna.kallisto_disk": "local-disk 20 HDD", - "rna.kallisto_fragment_length": 250, + "rna.kallisto_fragment_length": [ + 250, + 250 + ], "rna.kallisto_index": "test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx", "rna.kallisto_number_of_threads": 2, "rna.kallisto_ramGB": 4, - 
"rna.kallisto_sd_of_fragment_length": 10, + "rna.kallisto_sd_of_fragment_length": [ + 10, + 10 + ], "rna.mad_qc_disk": "local-disk 20 HDD", "rna.rna_qc_disk": "local-disk 20 HDD", "rna.rna_qc_tr_id_to_gene_type_tsv": "transcript_id_to_gene_type_mappings/gencodeV24pri-tRNAs-ERCC-phiX.transcript_id_to_genes.tsv",