Skip to content

Commit

Permalink
v1.2.0 (#64)
Browse files Browse the repository at this point in the history
  • Loading branch information
ottojolanki authored Jul 8, 2020
1 parent fb345be commit 8a29a99
Show file tree
Hide file tree
Showing 13 changed files with 346 additions and 484 deletions.
90 changes: 2 additions & 88 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,12 @@ make_tag: &make_tag
get_star_index: &get_star_index
name: get star index for test
command: |
curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz
curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz
get_kallisto_index: &get_kallisto_index
name: get kallisto index for test
command: |
curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx
curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx
install_singularity: &install_singularity
name: install singularity
Expand Down Expand Up @@ -280,76 +280,6 @@ jobs:
python3 -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data['match_overall']))" < test_align_SE_input.result.json
no_output_timeout: 30m

test_PE_kallisto_docker:
<<: *machine_defaults
steps:
- checkout
- run: *make_tag
- run: *get_kallisto_index
- run:
command: |
pyenv global 3.5.2
source ${BASH_ENV}
test/test_workflow/test.sh test/test_task/test_kallisto.wdl test/test_task/test_kallisto_PE_input.json $TAG docker
python3 src/compare_md5.py --keys_to_inspect test_kallisto.kallisto.quants --metadata_json test_kallisto_PE_input.metadata.json --reference_json test/test_task/test_kallisto_PE_reference_md5.json --outfile test_kallisto_PE_input.result.json
cat test_kallisto_PE_input.result.json
python3 -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data['match_overall']))" < test_kallisto_PE_input.result.json
no_output_timeout: 30m

test_PE_kallisto_singularity:
<<: *machine_defaults
steps:
- checkout
- run: *make_tag
- run: sudo apt-get update
- singularity/install-go
- singularity/debian-install-3
- run: *get_kallisto_index
- run:
command: |
pyenv global 3.5.2
source ${BASH_ENV}
test/test_workflow/test.sh test/test_task/test_kallisto.wdl test/test_task/test_kallisto_PE_input.json $TAG singularity
python3 src/compare_md5.py --keys_to_inspect test_kallisto.kallisto.quants --metadata_json test_kallisto_PE_input.metadata.json --reference_json test/test_task/test_kallisto_PE_reference_md5.json --outfile test_kallisto_PE_input.result.json
cat test_kallisto_PE_input.result.json
python3 -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data['match_overall']))" < test_kallisto_PE_input.result.json
no_output_timeout: 30m

test_SE_kallisto_docker:
<<: *machine_defaults
steps:
- checkout
- run: *make_tag
- run: *get_kallisto_index
- run:
command: |
pyenv global 3.5.2
source ${BASH_ENV}
test/test_workflow/test.sh test/test_task/test_kallisto.wdl test/test_task/test_kallisto_SE_input.json $TAG docker
python3 src/compare_md5.py --keys_to_inspect test_kallisto.kallisto.quants --metadata_json test_kallisto_SE_input.metadata.json --reference_json test/test_task/test_kallisto_SE_reference_md5.json --outfile test_kallisto_SE_input.result.json
cat test_kallisto_SE_input.result.json
python3 -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data['match_overall']))" < test_kallisto_SE_input.result.json
no_output_timeout: 30m

test_SE_kallisto_singularity:
<<: *machine_defaults
steps:
- checkout
- run: *make_tag
- run: sudo apt-get update
- singularity/install-go
- singularity/debian-install-3
- run: *get_kallisto_index
- run:
command: |
pyenv global 3.5.2
source ${BASH_ENV}
test/test_workflow/test.sh test/test_task/test_kallisto.wdl test/test_task/test_kallisto_SE_input.json $TAG singularity
python3 src/compare_md5.py --keys_to_inspect test_kallisto.kallisto.quants --metadata_json test_kallisto_SE_input.metadata.json --reference_json test/test_task/test_kallisto_SE_reference_md5.json --outfile test_kallisto_SE_input.result.json
cat test_kallisto_SE_input.result.json
python3 -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data['match_overall']))" < test_kallisto_SE_input.result.json
no_output_timeout: 30m

# Workflow
workflows:
build_workflow:
Expand Down Expand Up @@ -388,18 +318,6 @@ workflows:
- test_SE_align_singularity:
requires:
- build
- test_PE_kallisto_docker:
requires:
- build
- test_PE_kallisto_singularity:
requires:
- build
- test_SE_kallisto_docker:
requires:
- build
- test_SE_kallisto_singularity:
requires:
- build
- push_template:
requires:
- unittests
Expand All @@ -413,7 +331,3 @@ workflows:
- test_PE_align_singularity
- test_SE_align_docker
- test_SE_align_singularity
- test_PE_kallisto_docker
- test_PE_kallisto_singularity
- test_SE_kallisto_docker
- test_SE_kallisto_singularity
16 changes: 8 additions & 8 deletions docs/howto.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ Make sure you have completed the steps for installation and Google Cloud setup d
2. Get STAR and kallisto index files:

```bash
$ curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz
$ curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx
$ curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz
$ curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx
```

3. Copy indexes and input data into the cloud:
Expand Down Expand Up @@ -110,8 +110,8 @@ The goal is to run a single-end, non-strand-specific experiment on a local compu
2. Get STAR and kallisto index files:

```bash
$ curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz
$ curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx
$ curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz
$ curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx
```

The other data that is required to complete this recipe is included in the repository within test_data directory.
Expand All @@ -137,8 +137,8 @@ The other data that is required to complete this recipe is included in the repos
"rna.rsem_ramGB" : 4,
"rna.kallisto_number_of_threads" : 2,
"rna.kallisto_ramGB" : 4,
"rna.kallisto_fragment_length" : 250,
"rna.kallisto_sd_of_fragment_length" : 10,
"rna.kallisto_fragment_length" : [250],
"rna.kallisto_sd_of_fragment_length" : [10],
"rna.rna_qc_tr_id_to_gene_type_tsv" : "[PATH_TO_REPO]/rna-seq-pipeline/transcript_id_to_gene_type_mappings/gencodeV24pri-tRNAs-ERCC-phiX.transcript_id_to_genes.tsv",
"rna.bam_to_signals_ncpus" : 1,
"rna.bam_to_signals_ramGB" : 2,
Expand Down Expand Up @@ -185,8 +185,8 @@ The goal is to run a single-end non-strand-specific experiment locally using sin
3. Get STAR and kallisto index files:

```bash
$ curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz
$ curl https://storage.googleapis.com/star-rsem-runs/reference-genomes/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx
$ curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz -o test_data/GRCh38_v24_ERCC_phiX_starIndex_chr19only.tgz
$ curl https://storage.googleapis.com/circle_ci_test_data/rna-seq-pipeline/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx -o test_data/Homo_sapiens.GRCh38.cdna.all.chr19_ERCC_phix_k31_kallisto.idx
```

4. Run the pipeline using caper:
Expand Down
4 changes: 2 additions & 2 deletions docs/reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,9 @@ Assume you want to allocate 100 gigabytes of spinning hard drive. In this case y

Kallisto quantifier makes use of average fragment lengths and standard deviations of those lengths. In the case of paired-end experiments, those values can be calculated from the data, but in the case of single-ended experiments those values must be provided.

* `rna.kallisto_fragment_length` Is the average fragment length. Required only if `rna.run_kallisto` is `true` (Default is `true`).
* `rna.kallisto_fragment_length` Is an array of average fragment lengths as integers, one per replicate. Required only if `rna.run_kallisto` is `true` (Default is `true`).
.
* `rna.kallisto_sd_of_fragment_length` Is the standard deviation of the fragment lengths. Required only if `rna.run_kallisto` is `true` (Default is `true`).
* `rna.kallisto_sd_of_fragment_length` Is an array of standard deviations of the fragment lengths as floats, one per replicate. Required only if `rna.run_kallisto` is `true` (Default is `true`).


If you do not have this data available, or if you for some other reason want to omit running kallisto you can use the following parameter:
Expand Down
103 changes: 56 additions & 47 deletions make_index_wdl/build_genome_index.wdl
Original file line number Diff line number Diff line change
@@ -1,61 +1,70 @@
version 1.0

# ENCODE DCC RNA-Seq pipeline build_genome_index
# Maintainer: Otto Jolanki
#CAPER docker quay.io/encode-dcc/rna-seq-pipeline:v1.1
#CAPER singularity docker://quay.io/encode-dcc/rna-seq-pipeline:v1.1
#CAPER docker quay.io/encode-dcc/rna-seq-pipeline:v1.2.0
#CAPER singularity docker://quay.io/encode-dcc/rna-seq-pipeline:v1.2.0
workflow build_index {
# Inputs
# reference genome or transcriptome (in prep_kallisto mode)in gzipped fasta
File reference_sequence
# spikeins in gzipped fasta
File? spikeins
# annotation in gzipped gtf
File? annotation
# annotation version (e.g 'v24')
String? anno_version
# genome (e.g 'GRCh38')
String? genome
# Flavor of the index that gets built
# available options:
# prep_rsem, prep_srna, prep_star, prep_kallisto
String index_type
Int ncpu = 8
Int? memGB
String? disks
meta {
author: "Otto Jolanki"
version: "v1.2.0"
}

input {
# reference genome or transcriptome (in prep_kallisto mode) in gzipped fasta
File reference_sequence
# spikeins in gzipped fasta
File? spikeins
# annotation in gzipped gtf
File? annotation
# annotation version (e.g 'v24')
String? anno_version
# genome (e.g 'GRCh38')
String? genome
# Flavor of the index that gets built
# available options:
# prep_rsem, prep_srna, prep_star, prep_kallisto
String index_type
Int ncpu = 8
Int? memGB
String? disks
}

call make_index { input:
reference_sequence = reference_sequence,
spikeins = spikeins,
annotation = annotation,
anno_version = anno_version,
genome = genome,
index_type = index_type,
ncpu = ncpu,
memGB = memGB,
disks = disks,
reference_sequence=reference_sequence,
spikeins=spikeins,
annotation=annotation,
anno_version=anno_version,
genome=genome,
index_type=index_type,
ncpu=ncpu,
memGB=memGB,
disks=disks,
}
}

task make_index {
File reference_sequence
File? spikeins
File? annotation
String? anno_version
String? genome
String index_type
Int ncpu
Int? memGB
String? disks
input {
File reference_sequence
File? spikeins
File? annotation
String? anno_version
String? genome
String index_type
Int ncpu
Int? memGB
String? disks
}

command {
$(which ${index_type + ".sh"}) \
${reference_sequence} \
${spikeins} \
${annotation} \
${anno_version} \
${genome} \
${ncpu}
$(which ~{index_type + ".sh"}) \
~{reference_sequence} \
~{spikeins} \
~{annotation} \
~{anno_version} \
~{genome} \
~{ncpu}
}

output {
Expand All @@ -64,7 +73,7 @@ task make_index {

runtime {
cpu : ncpu
memory : "${select_first([memGB,'8'])} GB"
memory : "~{select_first([memGB,'8'])} GB"
disks : select_first([disks,"local-disk 100 SSD"])
}
}
66 changes: 37 additions & 29 deletions make_index_wdl/merge_anno.wdl
Original file line number Diff line number Diff line change
@@ -1,50 +1,58 @@
version 1.0

#ENCODE DCC RNA-Seq pipeline merge-annotation
#Maintainer: Otto Jolanki
#CAPER docker quay.io/encode-dcc/rna-seq-pipeline:v1.1
#CAPER singularity docker://quay.io/encode-dcc/rna-seq-pipeline:v1.1
#CAPER docker quay.io/encode-dcc/rna-seq-pipeline:v1.2.0
#CAPER singularity docker://quay.io/encode-dcc/rna-seq-pipeline:v1.2.0
workflow merge_anno {
# input filenames
File annotation
File tRNA
File spikeins
# output filename
String output_filename
Int? cpu
Int? memGB
String? disks
meta {
author: "Otto Jolanki"
version: "v1.2.0"
}

input {
File annotation
File tRNA
File spikeins
String output_filename
Int? cpu
Int? memGB
String? disks
}

call merge_annotation { input :
annotation = annotation,
tRNA = tRNA,
spikeins = spikeins,
output_filename = output_filename,
annotation=annotation,
tRNA=tRNA,
spikeins=spikeins,
output_filename=output_filename,
}
}
task merge_annotation {
File annotation
File tRNA
File spikeins
String output_filename
Int? cpu
Int? memGB
String? disks
input {
File annotation
File tRNA
File spikeins
String output_filename
Int? cpu
Int? memGB
String? disks
}

command {
python3 $(which merge_annotation.py) \
${"--annotation " + annotation} \
${"--tRNA " + tRNA} \
${"--spikeins " + spikeins} \
${"--output_filename " + output_filename}
~{"--annotation " + annotation} \
~{"--tRNA " + tRNA} \
~{"--spikeins " + spikeins} \
~{"--output_filename " + output_filename}
}
output {
File merged_annotation = glob("${output_filename}")[0]
File merged_annotation = output_filename
}
runtime {
cpu : select_first([cpu,2])
memory : "${select_first([memGB,'8'])} GB"
memory : "~{select_first([memGB,'8'])} GB"
disks : select_first([disks,"local-disk 100 SSD"])
}
}
Loading

0 comments on commit 8a29a99

Please sign in to comment.