From 53e4dee07670cc9ee651ee10921fd2045097b4f1 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Sun, 25 Aug 2024 18:55:18 +0100 Subject: [PATCH] complete config file, adjusted script and tests, not functional --- src/bbmap_bbsplit/config.vsh.yaml | 108 ++++++++++++++++++++++---- src/bbmap_bbsplit/help.txt | 0 src/bbmap_bbsplit/script.sh | 18 +++-- src/bbmap_bbsplit/test.sh | 18 ++--- src/bbmap_bbsplit/test_data/script.sh | 1 + 5 files changed, 116 insertions(+), 29 deletions(-) create mode 100644 src/bbmap_bbsplit/help.txt create mode 100644 src/bbmap_bbsplit/test_data/script.sh diff --git a/src/bbmap_bbsplit/config.vsh.yaml b/src/bbmap_bbsplit/config.vsh.yaml index 3df68038..0ac1dcd5 100644 --- a/src/bbmap_bbsplit/config.vsh.yaml +++ b/src/bbmap_bbsplit/config.vsh.yaml @@ -1,7 +1,6 @@ namespace: "bbmap" name: "bbmap_bbsplit" -description: | - Split sequencing reads by mapping them to multiple references simultaneously. +description: Split sequencing reads by mapping them to multiple references simultaneously. links: homepage: https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/ documentation: https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbmap-guide/ @@ -16,23 +15,87 @@ argument_groups: type: string description: Sample ID - name: "--paired" - type: boolean - default: false + type: boolean_true description: Paired fastq files or not? - name: "--input" type: file multiple: true description: Input fastq files, either one or two (paired), separated by ";". - example: sample.fastq + example: reads.fastq - name: "--primary_ref" type: file description: Primary reference FASTA - name: "--other_ref_names" type: file - description: Path to comma-separated file containing a list of reference genomes to filter reads against with BBSplit. + description: | + Path to comma-separated file containing a list of reference genomes to filter reads + against with BBSplit. 
- name: "--only_build_index" - type: boolean - description: true = only build index; false = mapping + type: boolean_true + description: If set, only builds the index. Otherwise, mapping is performed. + - name: "--index" + type: string + description: | + Designate index to use. Corresponds to the number specified when building the index. + If building the index, this will be the build's id. If multiple references are indexed + in the same directory, each needs a unique build ID. Default: 1. + example: 1 + - name: "--qin" + type: string + description: | + Set to 33 or 64 to specify input quality value ASCII offset. Automatically detected if + not specified. + - name: "--interleaved" + type: boolean_true + description: | + True forces paired/interleaved input; false forces single-ended mapping. + If not specified, interleaved status will be autodetected from read names. + - name: "--maxindel" + type: integer + description: | + Don't look for indels longer than this. Lower is faster. Set to >=100k for RNA-seq. + example: 20 + - name: "--minratio" + type: double + description: | + Fraction of max alignment score required to keep a site. Higher is faster. + example: 0.56 + - name: "--minhits" + type: integer + description: | + Minimum number of seed hits required for candidate sites. Higher is faster. + example: 1 + - name: "--ambiguous" + type: string + description: | + Set behavior on ambiguously-mapped reads (with multiple top-scoring mapping locations). + * best Use the first best site (Default) + * toss Consider unmapped + * random Select one top-scoring site randomly + * all Retain all top-scoring sites. Does not work yet with SAM output + choices: [best, toss, random, all] + example: best + - name: "--ambiguous2" + type: string + description: | + Set behavior only for reads that map ambiguously to multiple different references. 
+ Normal 'ambiguous=' controls behavior on all ambiguous reads; + Ambiguous2 excludes reads that map ambiguously within a single reference. + * best Use the first best site (Default) + * toss Consider unmapped + * all Write a copy to the output for each reference to which it maps + * split Write a copy to the AMBIGUOUS_ output for each reference to which it maps + choices: [best, toss, all, split] + example: best + - name: "--qtrim" + type: string + description: | + Quality-trim ends to Q5 before mapping. Options are 'l' (left), 'r' (right), and 'lr' (both). + choices: [l, r, lr] + - name: "--untrim" + type: boolean_true + description: Undo trimming after mapping. Untrimmed bases will be soft-clipped in cigar strings. + - name: "Output" arguments: @@ -41,13 +104,13 @@ argument_groups: description: | Output file for read 1. direction: output - example: read_1.fastq + example: read_out1.fastq - name: "--fastq_2" type: file description: | Output file for read 2. direction: output - example: read_2.fastq + example: read_out2.fastq - name: "--primary_fastq" type: file description: | @@ -60,18 +123,35 @@ argument_groups: Output reads that map to the primary reference. direction: output example: all.fastq.gz - - name: "--index" + - name: "--ref_fasta_list" type: file description: | Directory with index files. direction: output example: bbsplit - - name : "--stats" + - name: "--sam2bam" + alternatives: ["--bs"] + type: file + description: | + Write a shell script to 'file' that will turn the sam output into a sorted, indexed bam file. + direction: output + example: script.sh + - name: "--scafstats" + type: file + description: | + Write statistics on how many reads mapped to which scaffold to this file. + direction: output + example: scaffold_stats.txt + - name: "--refstats" type: file description: | - Tab-delimited text file containing mapping statistics. + Write statistics on how many reads were assigned to which reference to this file. 
+ Unmapped reads whose mate mapped to a reference are considered assigned and will be counted. direction: output - example: stats.txt + example: reference_stats.txt + - name: "--nzo" + type: boolean_true + description: Only print lines with nonzero coverage. resources: - type: bash_script diff --git a/src/bbmap_bbsplit/help.txt b/src/bbmap_bbsplit/help.txt new file mode 100644 index 00000000..e69de29b diff --git a/src/bbmap_bbsplit/script.sh b/src/bbmap_bbsplit/script.sh index 564cff93..e5de89ed 100755 --- a/src/bbmap_bbsplit/script.sh +++ b/src/bbmap_bbsplit/script.sh @@ -7,19 +7,27 @@ function clean_up { } trap clean_up EXIT -if [ ! -d "$par_built_bbsplit_index" ]; then +unset_if_false=( par_paired par_only_build_index par_interleaved par_untrim par_nzo) + +for var in "${unset_if_false[@]}"; do + if [ -z "${!var}" ]; then + unset $var + fi +done + +if [ ! -d "$par_built_index" ]; then other_refs=() while IFS="," read -r name path do other_refs+=("ref_$name=$path") - done < "$par_bbsplit_fasta_list" + done < "$par_ref_fasta_list" fi if $par_only_build_index; then if [ -f "$par_primary_ref" ] && [ ${#other_refs[@]} -gt 0 ]; then bbsplit.sh \ ref_primary="$par_primary_ref" "${other_refs[@]}" \ - path=$par_bbsplit_index \ + path=$par_index \ threads=${meta_cpus:-1} else echo "ERROR: Please specify as input a primary fasta file along with names and paths to non-primary fasta files." 
@@ -28,8 +36,8 @@ else IFS="," read -ra input <<< "$par_input" tmpdir=$(mktemp -d "$meta_temp_dir/$meta_functionality_name-XXXXXXXX") index_files='' - if [ -d "$par_built_bbsplit_index" ]; then - index_files="path=$par_built_bbsplit_index" + if [ -d "$par_built_index" ]; then + index_files="path=$par_built_index" elif [ -f "$par_primary_ref" ] && [ ${#other_refs[@]} -gt 0 ]; then index_files="ref_primary=$par_primary_ref ${other_refs[@]}" else diff --git a/src/bbmap_bbsplit/test.sh b/src/bbmap_bbsplit/test.sh index a58325f4..62d9393d 100644 --- a/src/bbmap_bbsplit/test.sh +++ b/src/bbmap_bbsplit/test.sh @@ -10,9 +10,9 @@ HERE echo ">>> Building BBSplit index" "${meta_executable}" \ --primary_ref "${meta_resources_dir}/test_data/genome.fasta" \ - --bbsplit_fasta_list "bbsplit_fasta_list.txt" \ + --ref_fasta_list "bbsplit_fasta_list.txt" \ --only_build_index true \ - --bbsplit_index "BBSplit_index" + --index "BBSplit_index" echo ">>> Check whether output exists" [ ! -d "BBSplit_index" ] && echo "BBSplit index does not exist!" 
&& exit 1 @@ -22,11 +22,10 @@ echo ">>> Filtering ribosomal RNA reads" echo ">>> Testing with single-end reads and primary/non-primary FASTA files" "${meta_executable}" \ - --paired false \ --input "${meta_resources_dir}/test_data/SRR6357070_1.fastq.gz" \ --only_build_index false \ --primary_ref "${meta_resources_dir}/test_data/genome.fasta" \ - --bbsplit_fasta_list "bbsplit_fasta_list.txt" \ + --ref_fasta_list "bbsplit_fasta_list.txt" \ --fastq_1 "filtered_SRR6357070_1.fastq.gz" echo ">>> Check whether output exists" @@ -37,11 +36,11 @@ rm filtered_SRR6357070_1.fastq.gz echo ">>> Testing with paired-end reads and primary/non-primary FASTA files" "${meta_executable}" \ - --paired true \ + --paired \ --input "${meta_resources_dir}/test_data/SRR6357070_1.fastq.gz,${meta_resources_dir}/test_data/SRR6357070_2.fastq.gz" \ --only_build_index false \ --primary_ref "${meta_resources_dir}/test_data/genome.fasta" \ - --bbsplit_fasta_list "bbsplit_fasta_list.txt" \ + --ref_fasta_list "bbsplit_fasta_list.txt" \ --fastq_1 "filtered_SRR6357070_1.fastq.gz" \ --fastq_2 "filtered_SRR6357070_2.fastq.gz" @@ -55,10 +54,9 @@ rm filtered_SRR6357070_1.fastq.gz filtered_SRR6357070_2.fastq.gz echo ">>> Testing with single-end reads and BBSplit index" "${meta_executable}" \ - --paired false \ --input "${meta_resources_dir}/test_data/SRR6357070_1.fastq.gz" \ --only_build_index false \ - --built_bbsplit_index "BBSplit_index" \ + --built_index "BBSplit_index" \ --fastq_1 "filtered_SRR6357070_1.fastq.gz" echo ">>> Check whether output exists" @@ -67,10 +65,10 @@ echo ">>> Check whether output exists" echo ">>> Testing with paired-end reads and BBSplit index" "${meta_executable}" \ - --paired true \ + --paired \ --input "${meta_resources_dir}/test_data/SRR6357070_1.fastq.gz,${meta_resources_dir}/test_data/SRR6357070_2.fastq.gz" \ --only_build_index false \ - --built_bbsplit_index "BBSplit_index" \ + --built_index "BBSplit_index" \ --fastq_1 "filtered_SRR6357070_1.fastq.gz" \ --fastq_2 
"filtered_SRR6357070_2.fastq.gz" diff --git a/src/bbmap_bbsplit/test_data/script.sh b/src/bbmap_bbsplit/test_data/script.sh new file mode 100644 index 00000000..a9bf588e --- /dev/null +++ b/src/bbmap_bbsplit/test_data/script.sh @@ -0,0 +1 @@ +#!/bin/bash