diff --git a/CHANGELOG.md b/CHANGELOG.md index 9bfb5606..5d380e54 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -65,6 +65,16 @@ * `fastqc`: High throughput sequence quality control analysis tool (PR #92). +* `sortmerna`: Local sequence alignment tool for mapping, clustering, and filtering rRNA from + metatranscriptomic data (PR #146). + +* `fq_subsample`: Sample a subset of records from single or paired FASTQ files (PR #147). + +* `kallisto`: + - `kallisto_index`: Create a kallisto index (PR #149). + - `kallisto_quant`: Quantifying abundances of transcripts from RNA-Seq data, or more generally of target sequences using high-throughput sequencing reads (PR #152). + + ## MINOR CHANGES * `busco` components: update BUSCO to `5.7.1` (PR #72). @@ -161,13 +171,7 @@ - `bedtools_getfasta`: extract sequences from a FASTA file for each of the intervals defined in a BED/GFF/VCF file (PR #59). -* `sortmerna`: Local sequence alignment tool for mapping, clustering, and filtering rRNA from metatranscriptomic - data. (PR #146) - -* `fq_subsample`: Sample a subset of records from single or paired FASTQ files (PR #147). -* `kallisto`: - - `kallisto_index`: Create a kallisto index (PR #149). ## MINOR CHANGES diff --git a/src/kallisto/kallisto_quant/config.vsh.yaml b/src/kallisto/kallisto_quant/config.vsh.yaml new file mode 100644 index 00000000..e92ac6b3 --- /dev/null +++ b/src/kallisto/kallisto_quant/config.vsh.yaml @@ -0,0 +1,105 @@ +name: kallisto_quant +namespace: kallisto +description: | + Quantifying abundances of transcripts from RNA-Seq data, or more generally of target sequences using high-throughput sequencing reads. +keywords: [kallisto, quant, pseudoalignment] +links: + homepage: https://pachterlab.github.io/kallisto/about + documentation: https://pachterlab.github.io/kallisto/manual + repository: https://github.com/pachterlab/kallisto + issue_tracker: https://github.com/pachterlab/kallisto/issues +references: + doi: 10.1038/nbt.3519 +license: BSD 2-Clause License + +argument_groups: +- name: "Input" + arguments: + - name: "--input" + type: file + description: List of input FastQ files of size 1 and 2 for single-end and paired-end data, respectively. + direction: "input" + multiple: true + required: true + - name: "--index" + alternatives: ["-i"] + type: file + description: Kallisto genome index. + must_exist: true + required: true + +- name: "Output" + arguments: + - name: "--output_dir" + alternatives: ["-o"] + type: string + description: Directory to write output to. + required: true + +- name: "Options" + arguments: + - name: "--single" + type: boolean_true + description: Single end mode. + - name: "--single_overhang" + type: boolean_true + description: Include reads where unobserved rest of fragment is predicted to lie outside a transcript. + - name: "--fr_stranded" + type: boolean_true + description: Strand specific reads, first read forward. + - name: "--rf_stranded" + type: boolean_true + description: Strand specific reads, first read reverse. + - name: "--fragment_length" + alternatives: ["-l"] + type: double + description: The estimated average fragment length. + - name: "--sd" + alternatives: ["-s"] + type: double + description: | + The estimated standard deviation of the fragment length (default: -l, -s values are estimated + from paired end data, but are required when using --single). + - name: "--plaintext" + type: boolean_true + description: Output plaintext instead of HDF5. + - name: "--bootstrap_samples" + alternatives: ["-b"] + type: integer + description: | + Number of bootstrap samples to draw. Default: '0' + example: 0 + - name: "--seed" + type: integer + description: | + Random seed for bootstrap. Default: '42' + example: 42 + + +resources: +- type: bash_script + path: script.sh + +test_resources: +- type: bash_script + path: test.sh +- type: file + path: test_data + +engines: + - type: docker + image: ubuntu:22.04 + setup: + - type: docker + run: | + apt-get update && \ + apt-get install -y --no-install-recommends wget && \ + wget --no-check-certificate https://github.com/pachterlab/kallisto/releases/download/v0.50.1/kallisto_linux-v0.50.1.tar.gz && \ + tar -xzf kallisto_linux-v0.50.1.tar.gz && \ + mv kallisto/kallisto /usr/local/bin/ + - type: docker + run: | + echo "kallisto: $(kallisto version | sed 's/kallisto, version //')" > /var/software_versions.txt +runners: + - type: executable + - type: nextflow diff --git a/src/kallisto/kallisto_quant/help.txt b/src/kallisto/kallisto_quant/help.txt new file mode 100644 index 00000000..7022571b --- /dev/null +++ b/src/kallisto/kallisto_quant/help.txt @@ -0,0 +1,33 @@ +``` +kallisto quant +``` + +kallisto 0.50.1 +Computes equivalence classes for reads and quantifies abundances + +Usage: kallisto quant [arguments] FASTQ-files + +Required arguments: +-i, --index=STRING Filename for the kallisto index to be used for + quantification +-o, --output-dir=STRING Directory to write output to + +Optional arguments: +-b, --bootstrap-samples=INT Number of bootstrap samples (default: 0) + --seed=INT Seed for the bootstrap sampling (default: 42) + --plaintext Output plaintext instead of HDF5 + --single Quantify single-end reads + --single-overhang Include reads where unobserved rest of fragment is + predicted to lie outside a transcript + --fr-stranded Strand specific reads, first read forward + --rf-stranded Strand specific reads, first read reverse +-l, --fragment-length=DOUBLE Estimated average fragment length +-s, --sd=DOUBLE Estimated standard deviation of fragment length + (default: -l, -s values are estimated from paired + end data, but are required when using --single) +-p, --priors Priors for the EM algorithm, either as raw counts or as + probabilities. Pseudocounts are added to raw reads to + prevent zero valued priors. Supplied in the same order + as the transcripts in the transcriptome +-t, --threads=INT Number of threads to use (default: 1) + --verbose Print out progress information every 1M proccessed reads \ No newline at end of file diff --git a/src/kallisto/kallisto_quant/script.sh b/src/kallisto/kallisto_quant/script.sh new file mode 100644 index 00000000..a7105cd1 --- /dev/null +++ b/src/kallisto/kallisto_quant/script.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -eo pipefail + +unset_if_false=( par_single par_single_overhang par_rf_stranded par_fr_stranded par_plaintext ) + +for var in "${unset_if_false[@]}"; do + temp_var="${!var}" + [[ "$temp_var" == "false" ]] && unset $var +done + +IFS=";" read -ra input <<< $par_input + +# Check if par_single is not set and ensure even number of input files +if [ -z "$par_single" ]; then + if [ $((${#input[@]} % 2)) -ne 0 ]; then + echo "Error: When running in paired-end mode, the number of input files must be even." + echo "Number of input files provided: ${#input[@]}" + exit 1 + fi +fi + + +mkdir -p $par_output_dir + + +kallisto quant \ + ${meta_cpus:+--threads $meta_cpus} \ + -i $par_index \ + ${par_gtf:+--gtf "${par_gtf}"} \ + ${par_single:+--single} \ + ${par_single_overhang:+--single-overhang} \ + ${par_fr_stranded:+--fr-stranded} \ + ${par_rf_stranded:+--rf-stranded} \ + ${par_plaintext:+--plaintext} \ + ${par_bootstrap_samples:+--bootstrap-samples "${par_bootstrap_samples}"} \ + ${par_fragment_length:+--fragment-length "${par_fragment_length}"} \ + ${par_sd:+--sd "${par_sd}"} \ + ${par_seed:+--seed "${par_seed}"} \ + -o $par_output_dir \ + ${input[*]} + + diff --git a/src/kallisto/kallisto_quant/test.sh b/src/kallisto/kallisto_quant/test.sh new file mode 100644 index 00000000..28e2e3ad --- /dev/null +++ b/src/kallisto/kallisto_quant/test.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +echo ">>> Testing $meta_functionality_name" + +echo ">>> Test 1: Testing for paired-end reads" +"$meta_executable" \ + --index "$meta_resources_dir/test_data/index/transcriptome.idx" \ + --rf_stranded \ + --output_dir . \ + --input "$meta_resources_dir/test_data/reads/A_R1.fastq;$meta_resources_dir/test_data/reads/A_R2.fastq" + +echo ">>> Checking whether output exists" +[ ! -f "run_info.json" ] && echo "run_info.json does not exist!" && exit 1 +[ ! -s "run_info.json" ] && echo "run_info.json is empty!" && exit 1 +[ ! -f "abundance.tsv" ] && echo "abundance.tsv does not exist!" && exit 1 +[ ! -s "abundance.tsv" ] && echo "abundance.tsv is empty!" && exit 1 +[ ! -f "abundance.h5" ] && echo "abundance.h5 does not exist!" && exit 1 +[ ! -s "abundance.h5" ] && echo "abundance.h5 is empty!" && exit 1 + +echo ">>> Checking if output is correct" +diff "abundance.tsv" "$meta_resources_dir/test_data/abundance_1.tsv" || { echo "abundance.tsv is not correct"; exit 1; } + +rm -rf abundance.tsv abundance.h5 run_info.json + +################################################################################ + +echo ">>> Test 2: Testing for single-end reads" +"$meta_executable" \ + --index "$meta_resources_dir/test_data/index/transcriptome.idx" \ + --rf_stranded \ + --output_dir . \ + --single \ + --input "$meta_resources_dir/test_data/reads/A_R1.fastq" \ + --fragment_length 101 \ + --sd 50 + +echo ">>> Checking whether output exists" +[ ! -f "run_info.json" ] && echo "run_info.json does not exist!" && exit 1 +[ ! -s "run_info.json" ] && echo "run_info.json is empty!" && exit 1 +[ ! -f "abundance.tsv" ] && echo "abundance.tsv does not exist!" && exit 1 +[ ! -s "abundance.tsv" ] && echo "abundance.tsv is empty!" && exit 1 +[ ! -f "abundance.h5" ] && echo "abundance.h5 does not exist!" && exit 1 +[ ! -s "abundance.h5" ] && echo "abundance.h5 is empty!" && exit 1 + +echo ">>> Checking if output is correct" +diff "abundance.tsv" "$meta_resources_dir/test_data/abundance_2.tsv" || { echo "abundance.tsv is not correct"; exit 1; } + +rm -rf abundance.tsv abundance.h5 run_info.json + +################################################################################ + +echo "All tests succeeded!" +exit 0 diff --git a/src/kallisto/kallisto_quant/test_data/abundance_1.tsv b/src/kallisto/kallisto_quant/test_data/abundance_1.tsv new file mode 100644 index 00000000..1de99e54 --- /dev/null +++ b/src/kallisto/kallisto_quant/test_data/abundance_1.tsv @@ -0,0 +1,2 @@ +target_id length eff_length est_counts tpm +Sheila 35 36 0 -nan diff --git a/src/kallisto/kallisto_quant/test_data/abundance_2.tsv b/src/kallisto/kallisto_quant/test_data/abundance_2.tsv new file mode 100644 index 00000000..6b3e9055 --- /dev/null +++ b/src/kallisto/kallisto_quant/test_data/abundance_2.tsv @@ -0,0 +1,2 @@ +target_id length eff_length est_counts tpm +Sheila 35 15.0373 0 -nan diff --git a/src/kallisto/kallisto_quant/test_data/index/transcriptome.idx b/src/kallisto/kallisto_quant/test_data/index/transcriptome.idx new file mode 100644 index 00000000..194fec14 Binary files /dev/null and b/src/kallisto/kallisto_quant/test_data/index/transcriptome.idx differ diff --git a/src/kallisto/kallisto_quant/test_data/reads/A_R1.fastq b/src/kallisto/kallisto_quant/test_data/reads/A_R1.fastq new file mode 100644 index 00000000..999ed649 --- /dev/null +++ b/src/kallisto/kallisto_quant/test_data/reads/A_R1.fastq @@ -0,0 +1,4 @@ +@1 +GCTAGCTCAGAAAAAAAAAATCGTCGCGTGCGCGT ++ +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! diff --git a/src/kallisto/kallisto_quant/test_data/reads/A_R2.fastq b/src/kallisto/kallisto_quant/test_data/reads/A_R2.fastq new file mode 100644 index 00000000..999ed649 --- /dev/null +++ b/src/kallisto/kallisto_quant/test_data/reads/A_R2.fastq @@ -0,0 +1,4 @@ +@1 +GCTAGCTCAGAAAAAAAAAATCGTCGCGTGCGCGT ++ +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! diff --git a/src/kallisto/kallisto_quant/test_data/script.sh b/src/kallisto/kallisto_quant/test_data/script.sh new file mode 100755 index 00000000..6d684b29 --- /dev/null +++ b/src/kallisto/kallisto_quant/test_data/script.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# clone repo +if [ ! -d /tmp/snakemake-wrappers ]; then + git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers /tmp/snakemake-wrappers +fi + +# copy test data +cp -r /tmp/snakemake-wrappers/bio/kallisto/quant/test/* src/kallisto/kallisto_quant/test_data + +rm src/kallisto/kallisto_quant/test_data/Snakefile \ No newline at end of file