Skip to content

Commit

Permalink
complete config file, adjusted script and tests, not functional
Browse files Browse the repository at this point in the history
  • Loading branch information
emmarousseau committed Aug 25, 2024
1 parent 924eeee commit 53e4dee
Show file tree
Hide file tree
Showing 5 changed files with 116 additions and 29 deletions.
108 changes: 94 additions & 14 deletions src/bbmap_bbsplit/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
namespace: "bbmap"
name: "bbmap_bbsplit"
description: |
Split sequencing reads by mapping them to multiple references simultaneously.
description: Split sequencing reads by mapping them to multiple references simultaneously.
links:
homepage: https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/
documentation: https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbmap-guide/
Expand All @@ -16,23 +15,87 @@ argument_groups:
type: string
description: Sample ID
- name: "--paired"
type: boolean
default: false
type: boolean_true
description: Set this flag when the input consists of paired-end FASTQ files.
- name: "--input"
type: file
multiple: true
description: Input fastq files, either one or two (paired), separated by ";".
example: sample.fastq
example: reads.fastq
- name: "--primary_ref"
type: file
description: Primary reference FASTA
- name: "--other_ref_names"
type: file
description: Path to comma-separated file containing a list of reference genomes to filter reads against with BBSplit.
description: |
Path to comma-separated file containing a list of reference genomes to filter reads
against with BBSplit.
- name: "--only_build_index"
type: boolean
description: true = only build index; false = mapping
type: boolean_true
description: If set, only builds the index. Otherwise, mapping is performed.
- name: "--index"
type: string
description: |
Designate index to use. Corresponds to the number specified when building the index.
If building the index, this will be the build's id. If multiple references are indexed
in the same directory, each needs a unique build ID. Default: 1.
example: 1
- name: "--qin"
type: string
description: |
Set to 33 or 64 to specify input quality value ASCII offset. Automatically detected if
not specified.
- name: "--interleaved"
type: boolean_true
description: |
True forces paired/interleaved input; false forces single-ended mapping.
If not specified, interleaved status will be autodetected from read names.
- name: "--maxindel"
type: integer
description: |
Don't look for indels longer than this. Lower is faster. Set to >=100k for RNA-seq.
example: 20
- name: "--minratio"
type: double
description: |
Fraction of max alignment score required to keep a site. Higher is faster.
example: 0.56
- name: "--minhits"
type: integer
description: |
Minimum number of seed hits required for candidate sites. Higher is faster.
example: 1
- name: "--ambiguous"
type: string
description: |
Set behavior on ambiguously-mapped reads (with multiple top-scoring mapping locations).
* best Use the first best site (Default)
* toss Consider unmapped
* random Select one top-scoring site randomly
* all Retain all top-scoring sites. Does not work yet with SAM output
choices: [best, toss, random, all]
example: best
- name: "--ambiguous2"
type: string
description: |
Set behavior only for reads that map ambiguously to multiple different references.
Normal 'ambiguous=' controls behavior on all ambiguous reads;
Ambiguous2 excludes reads that map ambiguously within a single reference.
* best Use the first best site (Default)
* toss Consider unmapped
* all Write a copy to the output for each reference to which it maps
* split Write a copy to the AMBIGUOUS_ output for each reference to which it maps
choices: [best, toss, all, split]
example: best
- name: "--qtrim"
type: string
description: |
Quality-trim ends to Q5 before mapping. Options are 'l' (left), 'r' (right), and 'lr' (both).
choices: [l, r, lr]
- name: "--untrim"
type: boolean_true
description: Undo trimming after mapping. Untrimmed bases will be soft-clipped in cigar strings.


- name: "Output"
arguments:
Expand All @@ -41,13 +104,13 @@ argument_groups:
description: |
Output file for read 1.
direction: output
example: read_1.fastq
example: read_out1.fastq
- name: "--fastq_2"
type: file
description: |
Output file for read 2.
direction: output
example: read_2.fastq
example: read_out2.fastq
- name: "--primary_fastq"
type: file
description: |
Expand All @@ -60,18 +123,35 @@ argument_groups:
Output reads that map to the primary reference.
direction: output
example: all.fastq.gz
- name: "--index"
- name: "--ref_fasta_list"
type: file
description: |
Directory with index files.
direction: output
example: bbsplit
- name : "--stats"
- name: "--sam2bam"
alternatives: ["--bs"]
type: file
description: |
Write a shell script to 'file' that will turn the sam output into a sorted, indexed bam file.
direction: output
example: script.sh
- name: "--scafstats"
type: file
description: |
Write statistics on how many reads mapped to which scaffold to this file.
direction: output
example: scaffold_stats.txt
- name: "--refstats"
type: file
description: |
Tab-delimited text file containing mapping statistics.
Write statistics on how many reads were assigned to which reference to this file.
Unmapped reads whose mate mapped to a reference are considered assigned and will be counted.
direction: output
example: stats.txt
example: reference_stats.txt
- name: "--nzo"
type: boolean_true
description: Only print lines with nonzero coverage.

resources:
- type: bash_script
Expand Down
Empty file added src/bbmap_bbsplit/help.txt
Empty file.
18 changes: 13 additions & 5 deletions src/bbmap_bbsplit/script.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,27 @@ function clean_up {
}
trap clean_up EXIT

if [ ! -d "$par_built_bbsplit_index" ]; then
unset_if_false=( par_paired par_only_build_index par_interleaved par_untrim par_nzo)

for var in "${unset_if_false[@]}"; do
if [ -z "${!var}" ]; then
unset $var
fi
done

if [ ! -d "$par_built_index" ]; then
other_refs=()
while IFS="," read -r name path
do
other_refs+=("ref_$name=$path")
done < "$par_bbsplit_fasta_list"
done < "$par_ref_fasta_list"
fi

if $par_only_build_index; then
if [ -f "$par_primary_ref" ] && [ ${#other_refs[@]} -gt 0 ]; then
bbsplit.sh \
ref_primary="$par_primary_ref" "${other_refs[@]}" \
path=$par_bbsplit_index \
path=$par_index \
threads=${meta_cpus:-1}
else
echo "ERROR: Please specify as input a primary fasta file along with names and paths to non-primary fasta files."
Expand All @@ -28,8 +36,8 @@ else
IFS="," read -ra input <<< "$par_input"
tmpdir=$(mktemp -d "$meta_temp_dir/$meta_functionality_name-XXXXXXXX")
index_files=''
if [ -d "$par_built_bbsplit_index" ]; then
index_files="path=$par_built_bbsplit_index"
if [ -d "$par_built_index" ]; then
index_files="path=$par_built_index"
elif [ -f "$par_primary_ref" ] && [ ${#other_refs[@]} -gt 0 ]; then
index_files="ref_primary=$par_primary_ref ${other_refs[@]}"
else
Expand Down
18 changes: 8 additions & 10 deletions src/bbmap_bbsplit/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ HERE
echo ">>> Building BBSplit index"
"${meta_executable}" \
--primary_ref "${meta_resources_dir}/test_data/genome.fasta" \
--bbsplit_fasta_list "bbsplit_fasta_list.txt" \
--ref_fasta_list "bbsplit_fasta_list.txt" \
--only_build_index true \
--bbsplit_index "BBSplit_index"
--index "BBSplit_index"

echo ">>> Check whether output exists"
[ ! -d "BBSplit_index" ] && echo "BBSplit index does not exist!" && exit 1
Expand All @@ -22,11 +22,10 @@ echo ">>> Filtering ribosomal RNA reads"

echo ">>> Testing with single-end reads and primary/non-primary FASTA files"
"${meta_executable}" \
--paired false \
--input "${meta_resources_dir}/test_data/SRR6357070_1.fastq.gz" \
--only_build_index false \
--primary_ref "${meta_resources_dir}/test_data/genome.fasta" \
--bbsplit_fasta_list "bbsplit_fasta_list.txt" \
--ref_fasta_list "bbsplit_fasta_list.txt" \
--fastq_1 "filtered_SRR6357070_1.fastq.gz"

echo ">>> Check whether output exists"
Expand All @@ -37,11 +36,11 @@ rm filtered_SRR6357070_1.fastq.gz

echo ">>> Testing with paired-end reads and primary/non-primary FASTA files"
"${meta_executable}" \
--paired true \
--paired \
--input "${meta_resources_dir}/test_data/SRR6357070_1.fastq.gz,${meta_resources_dir}/test_data/SRR6357070_2.fastq.gz" \
--only_build_index false \
--primary_ref "${meta_resources_dir}/test_data/genome.fasta" \
--bbsplit_fasta_list "bbsplit_fasta_list.txt" \
--ref_fasta_list "bbsplit_fasta_list.txt" \
--fastq_1 "filtered_SRR6357070_1.fastq.gz" \
--fastq_2 "filtered_SRR6357070_2.fastq.gz"

Expand All @@ -55,10 +54,9 @@ rm filtered_SRR6357070_1.fastq.gz filtered_SRR6357070_2.fastq.gz

echo ">>> Testing with single-end reads and BBSplit index"
"${meta_executable}" \
--paired false \
--input "${meta_resources_dir}/test_data/SRR6357070_1.fastq.gz" \
--only_build_index false \
--built_bbsplit_index "BBSplit_index" \
--built_index "BBSplit_index" \
--fastq_1 "filtered_SRR6357070_1.fastq.gz"

echo ">>> Check whether output exists"
Expand All @@ -67,10 +65,10 @@ echo ">>> Check whether output exists"

echo ">>> Testing with paired-end reads and BBSplit index"
"${meta_executable}" \
--paired true \
--paired \
--input "${meta_resources_dir}/test_data/SRR6357070_1.fastq.gz,${meta_resources_dir}/test_data/SRR6357070_2.fastq.gz" \
--only_build_index false \
--built_bbsplit_index "BBSplit_index" \
--built_index "BBSplit_index" \
--fastq_1 "filtered_SRR6357070_1.fastq.gz" \
--fastq_2 "filtered_SRR6357070_2.fastq.gz"

Expand Down
1 change: 1 addition & 0 deletions src/bbmap_bbsplit/test_data/script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#!/bin/bash

0 comments on commit 53e4dee

Please sign in to comment.