Merge branch 'viash-hub:main' into dupradar

viash-hub · Sep 23, 2024 · 332e1a9 · 332e1a9
2 parents 5362b55 + bc9cc0a
commit 332e1a9
Show file tree

Hide file tree

Showing 11 changed files with 270 additions and 6 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -65,6 +65,16 @@
 
 * `fastqc`: High throughput sequence quality control analysis tool (PR #92).
 
+* `sortmerna`: Local sequence alignment tool for mapping, clustering, and filtering rRNA from
+  metatranscriptomic data (PR #146).
+
+* `fq_subsample`: Sample a subset of records from single or paired FASTQ files (PR #147).
+
+* `kallisto`:
+    - `kallisto_index`: Create a kallisto index (PR #149).
+    - `kallisto_quant`: Quantifying abundances of transcripts from RNA-Seq data, or more generally of target sequences using high-throughput sequencing reads (PR #152).
+
+
 ## MINOR CHANGES
 
 * `busco` components: update BUSCO to `5.7.1` (PR #72).
@@ -161,13 +171,7 @@
     - `bedtools_getfasta`: extract sequences from a FASTA file for each of the
                            intervals defined in a BED/GFF/VCF file (PR #59).
 
-* `sortmerna`: Local sequence alignment tool for mapping, clustering, and filtering rRNA from metatranscriptomic 
-               data. (PR #146)
-
-* `fq_subsample`: Sample a subset of records from single or paired FASTQ files (PR #147).
 
-* `kallisto`:
-    - `kallisto_index`: Create a kallisto index (PR #149).
 
 
 ## MINOR CHANGES

diff --git a/src/kallisto/kallisto_quant/config.vsh.yaml b/src/kallisto/kallisto_quant/config.vsh.yaml
@@ -0,0 +1,105 @@
+name: kallisto_quant
+namespace: kallisto
+description: |
+  Quantifying abundances of transcripts from RNA-Seq data, or more generally of target sequences using high-throughput sequencing reads.
+keywords: [kallisto, quant, pseudoalignment]
+links:
+  homepage: https://pachterlab.github.io/kallisto/about
+  documentation: https://pachterlab.github.io/kallisto/manual
+  repository: https://github.com/pachterlab/kallisto
+  issue_tracker: https://github.com/pachterlab/kallisto/issues
+references: 
+  doi: 10.1038/nbt.3519
+license: BSD 2-Clause License
+
+argument_groups:
+- name: "Input"
+  arguments:
+  - name: "--input"
+    type: file
+    description: List of input FastQ files of size 1 and 2 for single-end and paired-end data, respectively.
+    direction: "input"
+    multiple: true
+    required: true
+  - name: "--index"
+    alternatives: ["-i"]
+    type: file
+    description: Kallisto genome index.
+    must_exist: true
+    required: true
+
+- name: "Output"
+  arguments:
+  - name: "--output_dir"
+    alternatives: ["-o"]
+    type: string
+    description: Directory to write output to.
+    required: true
+
+- name: "Options"
+  arguments:
+  - name: "--single"
+    type: boolean_true
+    description: Single end mode.
+  - name: "--single_overhang"
+    type: boolean_true
+    description: Include reads where unobserved rest of fragment is predicted to lie outside a transcript.
+  - name: "--fr_stranded"
+    type: boolean_true
+    description: Strand specific reads, first read forward.
+  - name: "--rf_stranded"
+    type: boolean_true
+    description: Strand specific reads, first read reverse.
+  - name: "--fragment_length"
+    alternatives: ["-l"]
+    type: double
+    description: The estimated average fragment length.
+  - name: "--sd"
+    alternatives: ["-s"]
+    type: double
+    description: |
+      The estimated standard deviation of the fragment length (default: -l, -s values are estimated 
+      from paired end data, but are required when using --single).
+  - name: "--plaintext"
+    type: boolean_true
+    description: Output plaintext instead of HDF5.
+  - name: "--bootstrap_samples"
+    alternatives: ["-b"]
+    type: integer
+    description: |
+      Number of bootstrap samples to draw. Default: '0'
+    example: 0
+  - name: "--seed"
+    type: integer
+    description: |
+      Random seed for bootstrap. Default: '42'
+    example: 42
+
+
+resources:
+- type: bash_script
+  path: script.sh
+
+test_resources:
+- type: bash_script
+  path: test.sh
+- type: file
+  path: test_data
+
+engines:
+  - type: docker
+    image: ubuntu:22.04
+    setup:
+      - type: docker
+        run: |
+          apt-get update && \
+          apt-get install -y --no-install-recommends wget && \
+          wget --no-check-certificate https://github.com/pachterlab/kallisto/releases/download/v0.50.1/kallisto_linux-v0.50.1.tar.gz && \
+          tar -xzf kallisto_linux-v0.50.1.tar.gz && \
+          mv kallisto/kallisto /usr/local/bin/
+      - type: docker
+        run: |
+          echo "kallisto: $(kallisto version | sed 's/kallisto, version //')" > /var/software_versions.txt
+runners:
+  - type: executable
+  - type: nextflow  
diff --git a/src/kallisto/kallisto_quant/help.txt b/src/kallisto/kallisto_quant/help.txt
@@ -0,0 +1,33 @@
+```
+kallisto quant
+```
+
+kallisto 0.50.1
+Computes equivalence classes for reads and quantifies abundances
+
+Usage: kallisto quant [arguments] FASTQ-files
+
+Required arguments:
+-i, --index=STRING            Filename for the kallisto index to be used for
+                              quantification
+-o, --output-dir=STRING       Directory to write output to
+
+Optional arguments:
+-b, --bootstrap-samples=INT   Number of bootstrap samples (default: 0)
+    --seed=INT                Seed for the bootstrap sampling (default: 42)
+    --plaintext               Output plaintext instead of HDF5
+    --single                  Quantify single-end reads
+    --single-overhang         Include reads where unobserved rest of fragment is
+                              predicted to lie outside a transcript
+    --fr-stranded             Strand specific reads, first read forward
+    --rf-stranded             Strand specific reads, first read reverse
+-l, --fragment-length=DOUBLE  Estimated average fragment length
+-s, --sd=DOUBLE               Estimated standard deviation of fragment length
+                              (default: -l, -s values are estimated from paired
+                               end data, but are required when using --single)
+-p, --priors                  Priors for the EM algorithm, either as raw counts or as
+                              probabilities. Pseudocounts are added to raw reads to
+                              prevent zero valued priors. Supplied in the same order
+                              as the transcripts in the transcriptome
+-t, --threads=INT             Number of threads to use (default: 1)
+    --verbose                 Print out progress information every 1M proccessed reads
diff --git a/src/kallisto/kallisto_quant/script.sh b/src/kallisto/kallisto_quant/script.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+
+set -eo pipefail
+
+unset_if_false=( par_single par_single_overhang par_rf_stranded par_fr_stranded par_plaintext )
+
+for var in "${unset_if_false[@]}"; do
+    temp_var="${!var}"
+    [[ "$temp_var" == "false" ]] && unset $var
+done
+
+IFS=";" read -ra input <<< $par_input
+
+# Check if par_single is not set and ensure even number of input files
+if [ -z "$par_single" ]; then
+    if [ $((${#input[@]} % 2)) -ne 0 ]; then
+        echo "Error: When running in paired-end mode, the number of input files must be even."
+        echo "Number of input files provided: ${#input[@]}"
+        exit 1
+    fi
+fi
+
+
+mkdir -p $par_output_dir
+
+
+kallisto quant \
+    ${meta_cpus:+--threads $meta_cpus} \
+    -i $par_index \
+    ${par_gtf:+--gtf "${par_gtf}"} \
+    ${par_single:+--single} \
+    ${par_single_overhang:+--single-overhang} \
+    ${par_fr_stranded:+--fr-stranded} \
+    ${par_rf_stranded:+--rf-stranded} \
+    ${par_plaintext:+--plaintext} \
+    ${par_bootstrap_samples:+--bootstrap-samples "${par_bootstrap_samples}"} \
+    ${par_fragment_length:+--fragment-length "${par_fragment_length}"} \
+    ${par_sd:+--sd "${par_sd}"} \
+    ${par_seed:+--seed "${par_seed}"} \
+    -o $par_output_dir \
+    ${input[*]}
+
+
diff --git a/src/kallisto/kallisto_quant/test.sh b/src/kallisto/kallisto_quant/test.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+echo ">>> Testing $meta_functionality_name"
+
+echo ">>> Test 1: Testing for paired-end reads"
+"$meta_executable" \
+  --index "$meta_resources_dir/test_data/index/transcriptome.idx" \
+  --rf_stranded \
+  --output_dir . \
+  --input "$meta_resources_dir/test_data/reads/A_R1.fastq;$meta_resources_dir/test_data/reads/A_R2.fastq"
+
+echo ">>> Checking whether output exists"
+[ ! -f "run_info.json" ] && echo "run_info.json does not exist!" && exit 1
+[ ! -s "run_info.json" ] && echo "run_info.json is empty!" && exit 1
+[ ! -f "abundance.tsv" ] && echo "abundance.tsv does not exist!" && exit 1
+[ ! -s "abundance.tsv" ] && echo "abundance.tsv is empty!" && exit 1
+[ ! -f "abundance.h5" ] && echo "abundance.h5 does not exist!" && exit 1
+[ ! -s "abundance.h5" ] && echo "abundance.h5 is empty!" && exit 1
+
+echo ">>> Checking if output is correct"
+diff "abundance.tsv" "$meta_resources_dir/test_data/abundance_1.tsv" || { echo "abundance.tsv is not correct"; exit 1; }
+
+rm -rf abundance.tsv abundance.h5 run_info.json
+
+################################################################################
+
+echo ">>> Test 2: Testing for single-end reads"
+"$meta_executable" \
+  --index "$meta_resources_dir/test_data/index/transcriptome.idx" \
+  --rf_stranded \
+  --output_dir . \
+  --single \
+  --input "$meta_resources_dir/test_data/reads/A_R1.fastq" \
+  --fragment_length 101 \
+  --sd 50
+
+echo ">>> Checking whether output exists"
+[ ! -f "run_info.json" ] && echo "run_info.json does not exist!" && exit 1
+[ ! -s "run_info.json" ] && echo "run_info.json is empty!" && exit 1
+[ ! -f "abundance.tsv" ] && echo "abundance.tsv does not exist!" && exit 1
+[ ! -s "abundance.tsv" ] && echo "abundance.tsv is empty!" && exit 1
+[ ! -f "abundance.h5" ] && echo "abundance.h5 does not exist!" && exit 1
+[ ! -s "abundance.h5" ] && echo "abundance.h5 is empty!" && exit 1
+
+echo ">>> Checking if output is correct"
+diff "abundance.tsv" "$meta_resources_dir/test_data/abundance_2.tsv" || { echo "abundance.tsv is not correct"; exit 1; }
+
+rm -rf abundance.tsv abundance.h5 run_info.json
+
+################################################################################
+
+echo "All tests succeeded!"
+exit 0
diff --git a/src/kallisto/kallisto_quant/test_data/abundance_1.tsv b/src/kallisto/kallisto_quant/test_data/abundance_1.tsv
@@ -0,0 +1,2 @@
+target_id	length	eff_length	est_counts	tpm
+Sheila	35	36	0	-nan
diff --git a/src/kallisto/kallisto_quant/test_data/abundance_2.tsv b/src/kallisto/kallisto_quant/test_data/abundance_2.tsv
@@ -0,0 +1,2 @@
+target_id	length	eff_length	est_counts	tpm
+Sheila	35	15.0373	0	-nan
diff --git a/src/kallisto/kallisto_quant/test_data/index/transcriptome.idx b/src/kallisto/kallisto_quant/test_data/index/transcriptome.idx
diff --git a/src/kallisto/kallisto_quant/test_data/reads/A_R1.fastq b/src/kallisto/kallisto_quant/test_data/reads/A_R1.fastq
@@ -0,0 +1,4 @@
+@1
+GCTAGCTCAGAAAAAAAAAATCGTCGCGTGCGCGT
++
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
diff --git a/src/kallisto/kallisto_quant/test_data/reads/A_R2.fastq b/src/kallisto/kallisto_quant/test_data/reads/A_R2.fastq
@@ -0,0 +1,4 @@
+@1
+GCTAGCTCAGAAAAAAAAAATCGTCGCGTGCGCGT
++
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
diff --git a/src/kallisto/kallisto_quant/test_data/script.sh b/src/kallisto/kallisto_quant/test_data/script.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+# clone repo
+if [ ! -d /tmp/snakemake-wrappers ]; then
+  git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers /tmp/snakemake-wrappers
+fi
+
+# copy test data
+cp -r /tmp/snakemake-wrappers/bio/kallisto/quant/test/* src/kallisto/kallisto_quant/test_data
+
+rm src/kallisto/kallisto_quant/test_data/Snakefile
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		target_id length eff_length est_counts tpm
		Sheila 35 36 0 -nan
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		target_id length eff_length est_counts tpm
		Sheila 35 15.0373 0 -nan