complete config file, adjusted script and tests, not functional

viash-hub · Aug 25, 2024 · 53e4dee · 53e4dee
1 parent 924eeee
commit 53e4dee
Show file tree

Hide file tree

Showing 5 changed files with 116 additions and 29 deletions.
diff --git a/src/bbmap_bbsplit/config.vsh.yaml b/src/bbmap_bbsplit/config.vsh.yaml
@@ -1,7 +1,6 @@
 namespace: "bbmap"
 name: "bbmap_bbsplit"
-description: |
-  Split sequencing reads by mapping them to multiple references simultaneously.
+description: Split sequencing reads by mapping them to multiple references simultaneously.
 links:
   homepage: https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/
   documentation: https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbmap-guide/
@@ -16,23 +15,87 @@ argument_groups:
     type: string
     description: Sample ID
   - name: "--paired"
-    type: boolean 
-    default: false
+    type: boolean_true
     description: Paired fastq files or not?
   - name: "--input"
     type: file
     multiple: true
     description: Input fastq files, either one or two (paired), separated by ";".
-    example: sample.fastq
+    example: reads.fastq
   - name: "--primary_ref"
     type: file
     description: Primary reference FASTA
   - name: "--other_ref_names"
     type: file
-    description: Path to comma-separated file containing a list of reference genomes to filter reads against with BBSplit.
+    description: |
+      Path to comma-separated file containing a list of reference genomes to filter reads
+      against with BBSplit.
   - name: "--only_build_index"
-    type: boolean
-    description: true = only build index; false = mapping
+    type: boolean_true
+    description: If set, only builds the index. Otherwise, mapping is performed.
+  - name: "--index"
+    type: string
+    description: |
+      Designate index to use. Corresponds to the number specified when building the index.
+      If building the index, this will be the build's id. If multiple references are indexed
+      in the same directory, each needs a unique build ID. Default: 1.
+    example: 1
+  - name: "--qin"
+    type: string
+    description: |
+      Set to 33 or 64 to specify input quality value ASCII offset. Automatically detected if
+      not specified.
+  - name: "--interleaved"
+    type: boolean_true
+    description: |
+      True forces paired/interleaved input; false forces single-ended mapping.
+      If not specified, interleaved status will be autodetected from read names.
+  - name: "--maxindel"
+    type: integer
+    description: |
+      Don't look for indels longer than this. Lower is faster. Set to >=100k for RNA-seq.
+    example: 20
+  - name: "--minratio"
+    type: double
+    description: |
+      Fraction of max alignment score required to keep a site. Higher is faster.
+    example: 0.56
+  - name: "--minhits"
+    type: integer
+    description: |
+      Minimum number of seed hits required for candidate sites. Higher is faster.
+    example: 1
+  - name: "--ambiguous"
+    type: string
+    description: |
+      Set behavior on ambiguously-mapped reads (with multiple top-scoring mapping locations).
+        * best    Use the first best site (Default)
+        * toss    Consider unmapped
+        * random  Select one top-scoring site randomly
+        * all     Retain all top-scoring sites.  Does not work yet with SAM output
+    choices: [best, toss, random, all]
+    example: best
+  - name: "--ambiguous2"
+    type: string
+    description: |
+      Set behavior only for reads that map ambiguously to multiple different references.
+      Normal 'ambiguous=' controls behavior on all ambiguous reads;
+      Ambiguous2 excludes reads that map ambiguously within a single reference.
+        * best    Use the first best site (Default)
+        * toss    Consider unmapped
+        * all     Write a copy to the output for each reference to which it maps
+        * split   Write a copy to the AMBIGUOUS_ output for each reference to which it maps
+    choices: [best, toss, all, split]
+    example: best
+  - name: "--qtrim"
+    type: string
+    description: |
+      Quality-trim ends to Q5 before mapping. Options are 'l' (left), 'r' (right), and 'lr' (both).
+    choices: [l, r, lr]
+  - name: "--untrim"
+    type: boolean_true
+    description: Undo trimming after mapping. Untrimmed bases will be soft-clipped in cigar strings.
+
 
 - name: "Output"
   arguments:
@@ -41,13 +104,13 @@ argument_groups:
     description: |
       Output file for read 1.
     direction: output
-    example: read_1.fastq
+    example: read_out1.fastq
   - name: "--fastq_2"
     type: file
     description: |
       Output file for read 2.
     direction: output
-    example: read_2.fastq
+    example: read_out2.fastq
   - name: "--primary_fastq"
     type: file
     description: |
@@ -60,18 +123,35 @@ argument_groups:
       Output reads that map to the primary reference.
     direction: output
     example: all.fastq.gz
-  - name: "--index"
+  - name: "--ref_fasta_list"
     type: file
     description: |
       Directory with index files.
     direction: output
     example: bbsplit
-  - name : "--stats"
+  - name: "--sam2bam"
+    alternatives: ["--bs"]
+    type: file
+    description: |
+      Write a shell script to 'file' that will turn the sam output into a sorted, indexed bam file.
+    direction: output
+    example: script.sh
+  - name: "--scafstats"
+    type: file
+    description: |
+      Write statistics on how many reads mapped to which scaffold to this file.
+    direction: output
+    example: scaffold_stats.txt
+  - name: "--refstats"
     type: file
     description: |
-      Tab-delimited text file containing mapping statistics.
+      Write statistics on how many reads were assigned to which reference to this file.
+      Unmapped reads whose mate mapped to a reference are considered assigned and will be counted.
     direction: output
-    example: stats.txt
+    example: reference_stats.txt
+  - name: "--nzo"
+    type: boolean_true
+    description: Only print lines with nonzero coverage.
 
 resources:
   - type: bash_script

diff --git a/src/bbmap_bbsplit/help.txt b/src/bbmap_bbsplit/help.txt
diff --git a/src/bbmap_bbsplit/script.sh b/src/bbmap_bbsplit/script.sh
@@ -7,19 +7,27 @@ function clean_up {
 }
 trap clean_up EXIT 
 
-if [ ! -d "$par_built_bbsplit_index" ]; then
+unset_if_false=( par_paired par_only_build_index par_interleaved par_untrim par_nzo)
+
+for var in "${unset_if_false[@]}"; do
+    if [ -z "${!var}" ]; then
+        unset $var
+    fi
+done
+
+if [ ! -d "$par_built_index" ]; then
     other_refs=()
     while IFS="," read -r name path 
     do
         other_refs+=("ref_$name=$path")
-    done < "$par_bbsplit_fasta_list"
+    done < "$par_ref_fasta_list"
 fi
 
 if $par_only_build_index; then
     if [ -f "$par_primary_ref" ] && [ ${#other_refs[@]} -gt 0 ]; then
         bbsplit.sh \
             ref_primary="$par_primary_ref" "${other_refs[@]}" \
-            path=$par_bbsplit_index \
+            path=$par_index \
             threads=${meta_cpus:-1}
     else
         echo "ERROR: Please specify as input a primary fasta file along with names and paths to non-primary fasta files."
@@ -28,8 +36,8 @@ else
     IFS="," read -ra input <<< "$par_input"
     tmpdir=$(mktemp -d "$meta_temp_dir/$meta_functionality_name-XXXXXXXX")
     index_files=''
-    if [ -d "$par_built_bbsplit_index" ]; then
-        index_files="path=$par_built_bbsplit_index"
+    if [ -d "$par_built_index" ]; then
+        index_files="path=$par_built_index"
     elif [ -f "$par_primary_ref" ] && [ ${#other_refs[@]} -gt 0 ]; then
         index_files="ref_primary=$par_primary_ref ${other_refs[@]}"
     else

diff --git a/src/bbmap_bbsplit/test.sh b/src/bbmap_bbsplit/test.sh
@@ -10,9 +10,9 @@ HERE
 echo ">>> Building BBSplit index"
 "${meta_executable}" \
   --primary_ref "${meta_resources_dir}/test_data/genome.fasta" \
-  --bbsplit_fasta_list "bbsplit_fasta_list.txt" \
+  --ref_fasta_list "bbsplit_fasta_list.txt" \
   --only_build_index true \
-  --bbsplit_index "BBSplit_index" 
+  --index "BBSplit_index" 
 
 echo ">>> Check whether output exists"
 [ ! -d "BBSplit_index" ] && echo "BBSplit index does not exist!" && exit 1
@@ -22,11 +22,10 @@ echo ">>> Filtering ribosomal RNA reads"
 
 echo ">>> Testing with single-end reads and primary/non-primary FASTA files"
 "${meta_executable}" \
-  --paired false \
   --input "${meta_resources_dir}/test_data/SRR6357070_1.fastq.gz" \
   --only_build_index false \
   --primary_ref "${meta_resources_dir}/test_data/genome.fasta" \
-  --bbsplit_fasta_list "bbsplit_fasta_list.txt" \
+  --ref_fasta_list "bbsplit_fasta_list.txt" \
   --fastq_1 "filtered_SRR6357070_1.fastq.gz"
 
 echo ">>> Check whether output exists"
@@ -37,11 +36,11 @@ rm filtered_SRR6357070_1.fastq.gz
 
 echo ">>> Testing with paired-end reads and primary/non-primary FASTA files"
 "${meta_executable}" \
-  --paired true \
+  --paired \
   --input "${meta_resources_dir}/test_data/SRR6357070_1.fastq.gz,${meta_resources_dir}/test_data/SRR6357070_2.fastq.gz" \
   --only_build_index false \
   --primary_ref "${meta_resources_dir}/test_data/genome.fasta" \
-  --bbsplit_fasta_list "bbsplit_fasta_list.txt" \
+  --ref_fasta_list "bbsplit_fasta_list.txt" \
   --fastq_1 "filtered_SRR6357070_1.fastq.gz" \
   --fastq_2 "filtered_SRR6357070_2.fastq.gz"
 
@@ -55,10 +54,9 @@ rm filtered_SRR6357070_1.fastq.gz filtered_SRR6357070_2.fastq.gz
 
 echo ">>> Testing with single-end reads and BBSplit index"
 "${meta_executable}" \
-  --paired false \
   --input "${meta_resources_dir}/test_data/SRR6357070_1.fastq.gz" \
   --only_build_index false \
-  --built_bbsplit_index "BBSplit_index" \
+  --built_index "BBSplit_index" \
   --fastq_1 "filtered_SRR6357070_1.fastq.gz"
 
 echo ">>> Check whether output exists"
@@ -67,10 +65,10 @@ echo ">>> Check whether output exists"
 
 echo ">>> Testing with paired-end reads and BBSplit index"
 "${meta_executable}" \
-  --paired true \
+  --paired \
   --input "${meta_resources_dir}/test_data/SRR6357070_1.fastq.gz,${meta_resources_dir}/test_data/SRR6357070_2.fastq.gz" \
   --only_build_index false \
-  --built_bbsplit_index "BBSplit_index" \
+  --built_index "BBSplit_index" \
   --fastq_1 "filtered_SRR6357070_1.fastq.gz" \
   --fastq_2 "filtered_SRR6357070_2.fastq.gz"
 

diff --git a/src/bbmap_bbsplit/test_data/script.sh b/src/bbmap_bbsplit/test_data/script.sh
@@ -0,0 +1 @@
+#!/bin/bash