From bc9cc0a6ce4e0b87a4ce47561b4812b449e101ca Mon Sep 17 00:00:00 2001 From: Emma Rousseau Date: Thu, 19 Sep 2024 05:48:45 +0200 Subject: [PATCH] Kallisto quant (#152) * initial commit dedup * Revert "initial commit dedup" This reverts commit 38f586bec0ac9e4312b016e29c3aa0bd53f292b2. * complete component * Update changelog * add help.txt * apply suggested changes (changelog, config) --- CHANGELOG.md | 16 ++- src/kallisto/kallisto_quant/config.vsh.yaml | 105 ++++++++++++++++++ src/kallisto/kallisto_quant/help.txt | 33 ++++++ src/kallisto/kallisto_quant/script.sh | 46 ++++++++ src/kallisto/kallisto_quant/test.sh | 53 +++++++++ .../kallisto_quant/test_data/abundance_1.tsv | 2 + .../kallisto_quant/test_data/abundance_2.tsv | 2 + .../test_data/index/transcriptome.idx | Bin 0 -> 1583 bytes .../kallisto_quant/test_data/reads/A_R1.fastq | 4 + .../kallisto_quant/test_data/reads/A_R2.fastq | 4 + .../kallisto_quant/test_data/script.sh | 11 ++ 11 files changed, 270 insertions(+), 6 deletions(-) create mode 100644 src/kallisto/kallisto_quant/config.vsh.yaml create mode 100644 src/kallisto/kallisto_quant/help.txt create mode 100644 src/kallisto/kallisto_quant/script.sh create mode 100644 src/kallisto/kallisto_quant/test.sh create mode 100644 src/kallisto/kallisto_quant/test_data/abundance_1.tsv create mode 100644 src/kallisto/kallisto_quant/test_data/abundance_2.tsv create mode 100644 src/kallisto/kallisto_quant/test_data/index/transcriptome.idx create mode 100644 src/kallisto/kallisto_quant/test_data/reads/A_R1.fastq create mode 100644 src/kallisto/kallisto_quant/test_data/reads/A_R2.fastq create mode 100755 src/kallisto/kallisto_quant/test_data/script.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 9bfb5606..5d380e54 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -65,6 +65,16 @@ * `fastqc`: High throughput sequence quality control analysis tool (PR #92). +* `sortmerna`: Local sequence alignment tool for mapping, clustering, and filtering rRNA from + metatranscriptomic data (PR #146). + +* `fq_subsample`: Sample a subset of records from single or paired FASTQ files (PR #147). + +* `kallisto`: + - `kallisto_index`: Create a kallisto index (PR #149). + - `kallisto_quant`: Quantifying abundances of transcripts from RNA-Seq data, or more generally of target sequences using high-throughput sequencing reads (PR #152). + + ## MINOR CHANGES * `busco` components: update BUSCO to `5.7.1` (PR #72). @@ -161,13 +171,7 @@ - `bedtools_getfasta`: extract sequences from a FASTA file for each of the intervals defined in a BED/GFF/VCF file (PR #59). -* `sortmerna`: Local sequence alignment tool for mapping, clustering, and filtering rRNA from metatranscriptomic - data. (PR #146) - -* `fq_subsample`: Sample a subset of records from single or paired FASTQ files (PR #147). -* `kallisto`: - - `kallisto_index`: Create a kallisto index (PR #149). ## MINOR CHANGES diff --git a/src/kallisto/kallisto_quant/config.vsh.yaml b/src/kallisto/kallisto_quant/config.vsh.yaml new file mode 100644 index 00000000..e92ac6b3 --- /dev/null +++ b/src/kallisto/kallisto_quant/config.vsh.yaml @@ -0,0 +1,105 @@ +name: kallisto_quant +namespace: kallisto +description: | + Quantifying abundances of transcripts from RNA-Seq data, or more generally of target sequences using high-throughput sequencing reads. +keywords: [kallisto, quant, pseudoalignment] +links: + homepage: https://pachterlab.github.io/kallisto/about + documentation: https://pachterlab.github.io/kallisto/manual + repository: https://github.com/pachterlab/kallisto + issue_tracker: https://github.com/pachterlab/kallisto/issues +references: + doi: 10.1038/nbt.3519 +license: BSD 2-Clause License + +argument_groups: +- name: "Input" + arguments: + - name: "--input" + type: file + description: List of input FastQ files of size 1 and 2 for single-end and paired-end data, respectively. + direction: "input" + multiple: true + required: true + - name: "--index" + alternatives: ["-i"] + type: file + description: Kallisto genome index. + must_exist: true + required: true + +- name: "Output" + arguments: + - name: "--output_dir" + alternatives: ["-o"] + type: string + description: Directory to write output to. + required: true + +- name: "Options" + arguments: + - name: "--single" + type: boolean_true + description: Single end mode. + - name: "--single_overhang" + type: boolean_true + description: Include reads where unobserved rest of fragment is predicted to lie outside a transcript. + - name: "--fr_stranded" + type: boolean_true + description: Strand specific reads, first read forward. + - name: "--rf_stranded" + type: boolean_true + description: Strand specific reads, first read reverse. + - name: "--fragment_length" + alternatives: ["-l"] + type: double + description: The estimated average fragment length. + - name: "--sd" + alternatives: ["-s"] + type: double + description: | + The estimated standard deviation of the fragment length (default: -l, -s values are estimated + from paired end data, but are required when using --single). + - name: "--plaintext" + type: boolean_true + description: Output plaintext instead of HDF5. + - name: "--bootstrap_samples" + alternatives: ["-b"] + type: integer + description: | + Number of bootstrap samples to draw. Default: '0' + example: 0 + - name: "--seed" + type: integer + description: | + Random seed for bootstrap. Default: '42' + example: 42 + + +resources: +- type: bash_script + path: script.sh + +test_resources: +- type: bash_script + path: test.sh +- type: file + path: test_data + +engines: + - type: docker + image: ubuntu:22.04 + setup: + - type: docker + run: | + apt-get update && \ + apt-get install -y --no-install-recommends wget && \ + wget --no-check-certificate https://github.com/pachterlab/kallisto/releases/download/v0.50.1/kallisto_linux-v0.50.1.tar.gz && \ + tar -xzf kallisto_linux-v0.50.1.tar.gz && \ + mv kallisto/kallisto /usr/local/bin/ + - type: docker + run: | + echo "kallisto: $(kallisto version | sed 's/kallisto, version //')" > /var/software_versions.txt +runners: + - type: executable + - type: nextflow diff --git a/src/kallisto/kallisto_quant/help.txt b/src/kallisto/kallisto_quant/help.txt new file mode 100644 index 00000000..7022571b --- /dev/null +++ b/src/kallisto/kallisto_quant/help.txt @@ -0,0 +1,33 @@ +``` +kallisto quant +``` + +kallisto 0.50.1 +Computes equivalence classes for reads and quantifies abundances + +Usage: kallisto quant [arguments] FASTQ-files + +Required arguments: +-i, --index=STRING Filename for the kallisto index to be used for + quantification +-o, --output-dir=STRING Directory to write output to + +Optional arguments: +-b, --bootstrap-samples=INT Number of bootstrap samples (default: 0) + --seed=INT Seed for the bootstrap sampling (default: 42) + --plaintext Output plaintext instead of HDF5 + --single Quantify single-end reads + --single-overhang Include reads where unobserved rest of fragment is + predicted to lie outside a transcript + --fr-stranded Strand specific reads, first read forward + --rf-stranded Strand specific reads, first read reverse +-l, --fragment-length=DOUBLE Estimated average fragment length +-s, --sd=DOUBLE Estimated standard deviation of fragment length + (default: -l, -s values are estimated from paired + end data, but are required when using --single) +-p, --priors Priors for the EM algorithm, either as raw counts or as + probabilities. Pseudocounts are added to raw reads to + prevent zero valued priors. Supplied in the same order + as the transcripts in the transcriptome +-t, --threads=INT Number of threads to use (default: 1) + --verbose Print out progress information every 1M proccessed reads \ No newline at end of file diff --git a/src/kallisto/kallisto_quant/script.sh b/src/kallisto/kallisto_quant/script.sh new file mode 100644 index 00000000..a7105cd1 --- /dev/null +++ b/src/kallisto/kallisto_quant/script.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -eo pipefail + +unset_if_false=( par_single par_single_overhang par_rf_stranded par_fr_stranded par_plaintext ) + +for var in "${unset_if_false[@]}"; do + temp_var="${!var}" + [[ "$temp_var" == "false" ]] && unset $var +done + +IFS=";" read -ra input <<< $par_input + +# Check if par_single is not set and ensure even number of input files +if [ -z "$par_single" ]; then + if [ $((${#input[@]} % 2)) -ne 0 ]; then + echo "Error: When running in paired-end mode, the number of input files must be even." + echo "Number of input files provided: ${#input[@]}" + exit 1 + fi +fi + + +mkdir -p $par_output_dir + + +kallisto quant \ + ${meta_cpus:+--threads $meta_cpus} \ + -i $par_index \ + ${par_gtf:+--gtf "${par_gtf}"} \ + ${par_single:+--single} \ + ${par_single_overhang:+--single-overhang} \ + ${par_fr_stranded:+--fr-stranded} \ + ${par_rf_stranded:+--rf-stranded} \ + ${par_plaintext:+--plaintext} \ + ${par_bootstrap_samples:+--bootstrap-samples "${par_bootstrap_samples}"} \ + ${par_fragment_length:+--fragment-length "${par_fragment_length}"} \ + ${par_sd:+--sd "${par_sd}"} \ + ${par_seed:+--seed "${par_seed}"} \ + -o $par_output_dir \ + ${input[*]} + + diff --git a/src/kallisto/kallisto_quant/test.sh b/src/kallisto/kallisto_quant/test.sh new file mode 100644 index 00000000..28e2e3ad --- /dev/null +++ b/src/kallisto/kallisto_quant/test.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +echo ">>> Testing $meta_functionality_name" + +echo ">>> Test 1: Testing for paired-end reads" +"$meta_executable" \ + --index "$meta_resources_dir/test_data/index/transcriptome.idx" \ + --rf_stranded \ + --output_dir . \ + --input "$meta_resources_dir/test_data/reads/A_R1.fastq;$meta_resources_dir/test_data/reads/A_R2.fastq" + +echo ">>> Checking whether output exists" +[ ! -f "run_info.json" ] && echo "run_info.json does not exist!" && exit 1 +[ ! -s "run_info.json" ] && echo "run_info.json is empty!" && exit 1 +[ ! -f "abundance.tsv" ] && echo "abundance.tsv does not exist!" && exit 1 +[ ! -s "abundance.tsv" ] && echo "abundance.tsv is empty!" && exit 1 +[ ! -f "abundance.h5" ] && echo "abundance.h5 does not exist!" && exit 1 +[ ! -s "abundance.h5" ] && echo "abundance.h5 is empty!" && exit 1 + +echo ">>> Checking if output is correct" +diff "abundance.tsv" "$meta_resources_dir/test_data/abundance_1.tsv" || { echo "abundance.tsv is not correct"; exit 1; } + +rm -rf abundance.tsv abundance.h5 run_info.json + +################################################################################ + +echo ">>> Test 2: Testing for single-end reads" +"$meta_executable" \ + --index "$meta_resources_dir/test_data/index/transcriptome.idx" \ + --rf_stranded \ + --output_dir . \ + --single \ + --input "$meta_resources_dir/test_data/reads/A_R1.fastq" \ + --fragment_length 101 \ + --sd 50 + +echo ">>> Checking whether output exists" +[ ! -f "run_info.json" ] && echo "run_info.json does not exist!" && exit 1 +[ ! -s "run_info.json" ] && echo "run_info.json is empty!" && exit 1 +[ ! -f "abundance.tsv" ] && echo "abundance.tsv does not exist!" && exit 1 +[ ! -s "abundance.tsv" ] && echo "abundance.tsv is empty!" && exit 1 +[ ! -f "abundance.h5" ] && echo "abundance.h5 does not exist!" && exit 1 +[ ! -s "abundance.h5" ] && echo "abundance.h5 is empty!" && exit 1 + +echo ">>> Checking if output is correct" +diff "abundance.tsv" "$meta_resources_dir/test_data/abundance_2.tsv" || { echo "abundance.tsv is not correct"; exit 1; } + +rm -rf abundance.tsv abundance.h5 run_info.json + +################################################################################ + +echo "All tests succeeded!" +exit 0 diff --git a/src/kallisto/kallisto_quant/test_data/abundance_1.tsv b/src/kallisto/kallisto_quant/test_data/abundance_1.tsv new file mode 100644 index 00000000..1de99e54 --- /dev/null +++ b/src/kallisto/kallisto_quant/test_data/abundance_1.tsv @@ -0,0 +1,2 @@ +target_id length eff_length est_counts tpm +Sheila 35 36 0 -nan diff --git a/src/kallisto/kallisto_quant/test_data/abundance_2.tsv b/src/kallisto/kallisto_quant/test_data/abundance_2.tsv new file mode 100644 index 00000000..6b3e9055 --- /dev/null +++ b/src/kallisto/kallisto_quant/test_data/abundance_2.tsv @@ -0,0 +1,2 @@ +target_id length eff_length est_counts tpm +Sheila 35 15.0373 0 -nan diff --git a/src/kallisto/kallisto_quant/test_data/index/transcriptome.idx b/src/kallisto/kallisto_quant/test_data/index/transcriptome.idx new file mode 100644 index 0000000000000000000000000000000000000000..194fec14b9b858e324345e535e00764a36100db7 GIT binary patch literal 1583 zcmd;OfPjTinh{9b$1B#!18H#}2Jt~a8A36bm2ogIJfAt+63T~DAce8EHEN$uybqZC zfg=W{5v~BrV208#c|}Et0E`c#VftWv7=48WCIhA&B!LvnOc?C|Rl)?N*?=^%HkesZ zX$A)<1EwA(4x?e}ahVTO2ct*TLqcLSJR#vQnjS{e1FUQS(dg*`Sq@nqrq10t#1V*{ z9o-$_AjH`nh=7E