From 7fb67a98539868b9af788338fb5f46d34ab742f7 Mon Sep 17 00:00:00 2001 From: Emma Rousseau Date: Fri, 18 Oct 2024 11:15:20 +0200 Subject: [PATCH] Add bbmap_bbsplit (#138) * initial commit dedup * Revert "initial commit dedup" This reverts commit 38f586bec0ac9e4312b016e29c3aa0bd53f292b2. * initial commit, complete config file, add test data * complete config file, adjusted script and tests, not functional * update changelog, hep.txt, functional test, large test data * smaller test data * remove test resource from config * modify paths in test script * Arguments closer to original tool's * Extra arg to allow use of bbmap args --- CHANGELOG.md | 3 + src/bbmap_bbsplit/config.vsh.yaml | 162 ++++++++++++++++++++++++++++++ src/bbmap_bbsplit/help.txt | 83 +++++++++++++++ src/bbmap_bbsplit/script.sh | 91 +++++++++++++++++ src/bbmap_bbsplit/test.sh | 145 ++++++++++++++++++++++++++ 5 files changed, 484 insertions(+) create mode 100644 src/bbmap_bbsplit/config.vsh.yaml create mode 100644 src/bbmap_bbsplit/help.txt create mode 100755 src/bbmap_bbsplit/script.sh create mode 100644 src/bbmap_bbsplit/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 47c786c6..16e79693 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -184,6 +184,9 @@ * `bedtools`: - `bedtools_getfasta`: extract sequences from a FASTA file for each of the intervals defined in a BED/GFF/VCF file (PR #59). + +* `bbmap`: + - `bbmap_bbsplit`: Split sequencing reads by mapping them to multiple references simultaneously (PR #138). diff --git a/src/bbmap_bbsplit/config.vsh.yaml b/src/bbmap_bbsplit/config.vsh.yaml new file mode 100644 index 00000000..61336b35 --- /dev/null +++ b/src/bbmap_bbsplit/config.vsh.yaml @@ -0,0 +1,162 @@ +namespace: "bbmap" +name: "bbmap_bbsplit" +description: Split sequencing reads by mapping them to multiple references simultaneously. +links: + homepage: https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/ + documentation: https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbmap-guide/ + repository: https://github.com/BioInfoTools/BBMap/blob/master/sh/bbsplit.sh + +license: BBTools Copyright (c) 2014 + +argument_groups: +- name: "Input" + arguments: + - name: "--id" + type: string + description: Sample ID + - name: "--paired" + type: boolean_true + description: Paired fastq files or not? + - name: "--input" + type: file + multiple: true + description: Input fastq files, either one or two (paired), separated by ";". + example: reads.fastq + - name: "--ref" + type: file + multiple: true + description: Reference FASTA files, separated by ";". The primary reference should be specified first. + - name: "--only_build_index" + type: boolean_true + description: If set, only builds the index. Otherwise, mapping is performed. + - name: "--build" + type: string + description: | + Designate index to use. Corresponds to the number specified when building the index. + If building the index, this will be the build's id. If multiple references are indexed + in the same directory, each needs a unique build ID. Default: 1. + example: "1" + - name: "--qin" + type: string + description: | + Set to 33 or 64 to specify input quality value ASCII offset. Automatically detected if + not specified. + - name: "--interleaved" + type: boolean_true + description: | + True forces paired/interleaved input; false forces single-ended mapping. + If not specified, interleaved status will be autodetected from read names. + - name: "--maxindel" + type: integer + description: | + Don't look for indels longer than this. Lower is faster. Set to >=100k for RNA-seq. + example: 20 + - name: "--minratio" + type: double + description: | + Fraction of max alignment score required to keep a site. Higher is faster. + example: 0.56 + - name: "--minhits" + type: integer + description: | + Minimum number of seed hits required for candidate sites. Higher is faster. + example: 1 + - name: "--ambiguous" + type: string + description: | + Set behavior on ambiguously-mapped reads (with multiple top-scoring mapping locations). + * best Use the first best site (Default) + * toss Consider unmapped + * random Select one top-scoring site randomly + * all Retain all top-scoring sites. Does not work yet with SAM output + choices: [best, toss, random, all] + example: best + - name: "--ambiguous2" + type: string + description: | + Set behavior only for reads that map ambiguously to multiple different references. + Normal 'ambiguous=' controls behavior on all ambiguous reads; + Ambiguous2 excludes reads that map ambiguously within a single reference. + * best Use the first best site (Default) + * toss Consider unmapped + * all Write a copy to the output for each reference to which it maps + * split Write a copy to the AMBIGUOUS_ output for each reference to which it maps + choices: [best, toss, all, split] + example: best + - name: "--qtrim" + type: string + description: | + Quality-trim ends to Q5 before mapping. Options are 'l' (left), 'r' (right), and 'lr' (both). + choices: [l, r, lr] + - name: "--untrim" + type: boolean_true + description: Undo trimming after mapping. Untrimmed bases will be soft-clipped in cigar strings. + + +- name: "Output" + arguments: + - name: "--fastq_1" + type: file + description: | + Output file for read 1. + direction: output + example: read_out1.fastq + - name: "--fastq_2" + type: file + description: | + Output file for read 2. + direction: output + example: read_out2.fastq + - name: "--sam2bam" + alternatives: ["--bs"] + type: file + description: | + Write a shell script to 'file' that will turn the sam output into a sorted, indexed bam file. + direction: output + example: script.sh + - name: "--scafstats" + type: file + description: | + Write statistics on how many reads mapped to which scaffold to this file. + direction: output + example: scaffold_stats.txt + - name: "--refstats" + type: file + description: | + Write statistics on how many reads were assigned to which reference to this file. + Unmapped reads whose mate mapped to a reference are considered assigned and will be counted. + direction: output + example: reference_stats.txt + - name: "--nzo" + type: boolean_true + description: Only print lines with nonzero coverage. + - name: "--bbmap_args" + type: string + description: | + Additional arguments from BBMap to pass to BBSplit. + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: +- type: docker + image: ubuntu:22.04 + setup: + - type: docker + run: | + apt-get update && \ + apt-get install -y build-essential openjdk-17-jdk wget tar && \ + wget --no-check-certificate https://sourceforge.net/projects/bbmap/files/BBMap_39.01.tar.gz && \ + tar xzf BBMap_39.01.tar.gz && \ + cp -r bbmap/* /usr/local/bin + - type: docker + run: | + bbsplit.sh --version 2>&1 | awk '/BBMap version/{print "BBMAP:", $NF}' > /var/software_versions.txt +runners: + - type: executable + - type: nextflow diff --git a/src/bbmap_bbsplit/help.txt b/src/bbmap_bbsplit/help.txt new file mode 100644 index 00000000..56544a34 --- /dev/null +++ b/src/bbmap_bbsplit/help.txt @@ -0,0 +1,83 @@ +``` +bbsplit.sh +``` + +BBSplit +Written by Brian Bushnell, from Dec. 2010 - present +Last modified June 11, 2018 + +Description: Maps reads to multiple references simultaneously. +Outputs reads to a file for the reference they best match, with multiple options for dealing with ambiguous mappings. + +To index: bbsplit.sh build=<1> ref_x= ref_y= +To map: bbsplit.sh build=<1> in= out_x= out_y= + +To be concise, and do everything in one command: +bbsplit.sh ref=x.fa,y.fa in=reads.fq basename=o%.fq + +that is equivalent to +bbsplit.sh build=1 in=reads.fq ref_x=x.fa ref_y=y.fa out_x=ox.fq out_y=oy.fq + +By default paired reads will yield interleaved output, but you can use the # symbol to produce twin output files. +For example, basename=o%_#.fq will produce ox_1.fq, ox_2.fq, oy_1.fq, and oy_2.fq. + + +Indexing Parameters (required when building the index): +ref= A list of references, or directories containing fasta files. +ref_= Alternate, longer way to specify references. e.g., ref_ecoli=ecoli.fa + These can also be comma-delimited lists of files; e.g., ref_a=a1.fa,a2.fa,a3.fa +build=<1> If multiple references are indexed in the same directory, each needs a unique build ID. +path=<.> Specify the location to write the index, if you don't want it in the current working directory. + +Input Parameters: +build=<1> Designate index to use. Corresponds to the number specified when building the index. +in= Primary reads input; required parameter. +in2= For paired reads in two files. +qin= Set to 33 or 64 to specify input quality value ASCII offset. +interleaved= True forces paired/interleaved input; false forces single-ended mapping. + If not specified, interleaved status will be autodetected from read names. + +Mapping Parameters: +maxindel=<20> Don't look for indels longer than this. Lower is faster. Set to >=100k for RNA-seq. +minratio=<0.56> Fraction of max alignment score required to keep a site. Higher is faster. +minhits=<1> Minimum number of seed hits required for candidate sites. Higher is faster. +ambiguous= Set behavior on ambiguously-mapped reads (with multiple top-scoring mapping locations). + best (use the first best site) + toss (consider unmapped) + random (select one top-scoring site randomly) + all (retain all top-scoring sites. Does not work yet with SAM output) +ambiguous2= Set behavior only for reads that map ambiguously to multiple different references. + Normal 'ambiguous=' controls behavior on all ambiguous reads; + Ambiguous2 excludes reads that map ambiguously within a single reference. + best (use the first best site) + toss (consider unmapped) + all (write a copy to the output for each reference to which it maps) + split (write a copy to the AMBIGUOUS_ output for each reference to which it maps) +qtrim= Quality-trim ends to Q5 before mapping. Options are 'l' (left), 'r' (right), and 'lr' (both). +untrim= Undo trimming after mapping. Untrimmed bases will be soft-clipped in cigar strings. + +Output Parameters: +out_= Output reads that map to the reference to . +basename=prefix%suffix Equivalent to multiple out_%=prefix%suffix expressions, in which each % is replaced by the name of a reference file. +bs= Write a shell script to 'file' that will turn the sam output into a sorted, indexed bam file. +scafstats= Write statistics on how many reads mapped to which scaffold to this file. +refstats= Write statistics on how many reads were assigned to which reference to this file. + Unmapped reads whose mate mapped to a reference are considered assigned and will be counted. +nzo=t Only print lines with nonzero coverage. + +***** Notes ***** +Almost all BBMap parameters can be used; run bbmap.sh for more details. +Exceptions include the 'nodisk' flag, which BBSplit does not support. +BBSplit is recommended for fastq and fasta output, not for sam/bam output. +When the reference sequences are shorter than read length, use Seal instead of BBSplit. + +Java Parameters: +-Xmx This will set Java's memory usage, overriding autodetection. + -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. + The max is typically 85% of physical memory. +-eoom This flag will cause the process to exit if an + out-of-memory exception occurs. Requires Java 8u92+. +-da Disable assertions. + +This list is not complete. For more information, please consult /readme.txt +Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems. \ No newline at end of file diff --git a/src/bbmap_bbsplit/script.sh b/src/bbmap_bbsplit/script.sh new file mode 100755 index 00000000..ac8542c9 --- /dev/null +++ b/src/bbmap_bbsplit/script.sh @@ -0,0 +1,91 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -eo pipefail + +function clean_up { + rm -rf "$tmpdir" +} +trap clean_up EXIT + +unset_if_false=( par_paired par_only_build_index par_interleaved par_untrim par_nzo) + +for var in "${unset_if_false[@]}"; do + if [ -z "${!var}" ]; then + unset $var + fi +done + +if [ ! -d "$par_build" ]; then + IFS=";" read -ra ref_files <<< "$par_ref" + primary_ref="${ref_files[0]}" + refs=() + for file in "${ref_files[@]:1}" + do + name=$(basename "$file" | sed 's/\.[^.]*$//') + refs+=("ref_$name=$file") + done +fi + +if $par_only_build_index; then + if [ ${#refs[@]} -gt 1 ]; then + bbsplit.sh \ + --ref_primary="$primary_ref" \ + "${refs[@]}" \ + path=$par_build + else + echo "ERROR: Please specify at least two reference fasta files." + fi +else + IFS=";" read -ra input <<< "$par_input" + tmpdir=$(mktemp -d "$meta_temp_dir/$meta_functionality_name-XXXXXXXX") + index_files='' + if [ -d "$par_build" ]; then + index_files="path=$par_build" + elif [ ${#refs[@]} -gt 0 ]; then + index_files="--ref_primary=$primary_ref ${refs[*]}" + else + echo "ERROR: Please either specify a BBSplit index as input or at least two reference fasta files." + fi + + extra_args="" + if [ -n "$par_refstats" ]; then extra_args+=" --refstats $par_refstats"; fi + if [ -n "$par_ambiguous" ]; then extra_args+=" --ambiguous $par_ambiguous"; fi + if [ -n "$par_ambiguous2" ]; then extra_args+=" --ambiguous2 $par_ambiguous2"; fi + if [ -n "$par_minratio" ]; then extra_args+=" --minratio $par_minratio"; fi + if [ -n "$par_minhits" ]; then extra_args+=" --minhits $par_minhits"; fi + if [ -n "$par_maxindel" ]; then extra_args+=" --maxindel $par_maxindel"; fi + if [ -n "$par_qin" ]; then extra_args+=" --qin $par_qin"; fi + if [ -n "$par_qtrim" ]; then extra_args+=" --qtrim $par_qtrim"; fi + if [ "$par_interleaved" = true ]; then extra_args+=" --interleaved"; fi + if [ "$par_untrim" = true ]; then extra_args+=" --untrim"; fi + if [ "$par_nzo" = true ]; then extra_args+=" --nzo"; fi + + if [ -n "$par_bbmap_args" ]; then extra_args+=" $par_bbmap_args"; fi + + + if $par_paired; then + bbsplit.sh \ + $index_files \ + in=${input[0]} \ + in2=${input[1]} \ + basename=${tmpdir}/%_#.fastq \ + $extra_args + read1=$(find $tmpdir/ -iname primary_1*) + read2=$(find $tmpdir/ -iname primary_2*) + cp $read1 $par_fastq_1 + cp $read2 $par_fastq_2 + else + bbsplit.sh \ + $index_files \ + in=${input[0]} \ + basename=${tmpdir}/%.fastq \ + $extra_args + read1=$(find $tmpdir/ -iname primary*) + cp $read1 $par_fastq_1 + fi +fi + +exit 0 diff --git a/src/bbmap_bbsplit/test.sh b/src/bbmap_bbsplit/test.sh new file mode 100644 index 00000000..1ad7aac2 --- /dev/null +++ b/src/bbmap_bbsplit/test.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +echo ">>> Test $meta_functionality_name" + +echo "> Prepare test data" + +cat > reads_R1.fastq <<'EOF' +@SEQ_ID1 +ACAGGGTTTCACCATGTTGGCCAGG ++ +IIIIIIIIIIIIIIIIIIIIIIIII +@SEQ_ID2 +TCCCAGGTAACAAACCAACCAACTT ++ +!!!!!!!!!!!!!!!!!!!!!!!!! +EOF + +cat > reads_R2.fastq <<'EOF' +@SEQ_ID1 +TACCATTACCCTACCATCCACCATG ++ +IIIIIIIIIIIIIIIIIIIIIIIII +@SEQ_ID2 +CACTCGGCTGCATGCTTAGTGCACT ++ +!!!!!!!!!!!!!!!!!!!!!!!!! +EOF + +cat > genome.fasta <<'EOF' +>I +AGTATTTTTAGTAGAGACAGGGTTTCACCATGTTGGCCAGGCTGGTCTTGATCTCCTGACCTCAGGTGATCCATCCGCCT +TGGCCTCCCAAAGTGCTGGGATTACAGGCGTGAGCCACCGCACCTGGCCTGGTTTCGAACTCTTGACCTCAGGTGGTCTG +CCCATCTTGACCTTCCAAAGTGCTGGAGCTACAGGCATGAGCCACTGCACCTGGTGCTTTTGGTAAAAGCAACCTGGAAT +CAAATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTT +TAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGAC +EOF + +cat > human.fa <<'EOF' +>human +AGTATTTTTAGTAGAGACAGGGTTTCACCATGTTGGCCAGGCTGGTCTTGATCTCCTGACCTCAGGTGATCCATCCGCCT +TGGCCTCCCAAAGTGCTGGGATTACAGGCGTGAGCCACCGCACCTGGCCTGGTTTCGAACTCTTGACCTCAGGTGGTCTG +CCCATCTTGACCTTCCAAAGTGCTGGAGCTACAGGCATGAGCCACTGCACCTGGTGCTTTTGGTAAAAGCAACCTGGAAT +EOF + +cat > sarscov2.fa <<'EOF' +>sarscov2 +ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAA +AATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGG +ACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTT +EOF + +#################################################################################################### + +echo ">>> Building BBSplit index" +"${meta_executable}" \ + --ref "genome.fasta;human.fa;sarscov2.fa" \ + --only_build_index \ + --build "BBSplit_index" + +echo ">>> Check whether output exists" +[ ! -d "BBSplit_index" ] && echo "BBSplit index does not exist!" && exit 1 +[ -z "$(ls -A 'BBSplit_index')" ] && echo "BBSplit index is empty!" && exit 1 + +#################################################################################################### + + +echo ">>> Testing with single-end reads and primary/non-primary FASTA files" +"${meta_executable}" \ + --input "reads_R1.fastq" \ + --ref "genome.fasta;human.fa;sarscov2.fa" \ + --fastq_1 "filtered_reads_R1.fastq" + +echo ">>> Check whether output exists" +[ ! -f "filtered_reads_R1.fastq" ] && echo "Filtered reads file does not exist!" && exit 1 +[ ! -s "filtered_reads_R1.fastq" ] && echo "Filtered reads file is empty!" && exit 1 + +echo ">>> Check whether output is correct" +grep -q "ACAGGGTTTCACCATGTTGGCCAGG" filtered_reads_R1.fastq || { echo "Filtered reads file does not contain expected sequence!"; exit 1; } + +rm filtered_reads_R1.fastq + +#################################################################################################### + +echo ">>> Testing with paired-end reads and primary/non-primary FASTA files" +"${meta_executable}" \ + --paired \ + --input "reads_R1.fastq;reads_R2.fastq" \ + --ref "genome.fasta;human.fa;sarscov2.fa" \ + --fastq_1 "filtered_reads_R1.fastq" \ + --fastq_2 "filtered_reads_R2.fastq" + +echo ">>> Check whether output exists" +[ ! -f "filtered_reads_R1.fastq" ] && echo "Filtered read 1 file does not exist!" && exit 1 +[ ! -s "filtered_reads_R1.fastq" ] && echo "Filtered read 1 file is empty!" && exit 1 +[ ! -f "filtered_reads_R2.fastq" ] && echo "Filtered read 2 file does not exist!" && exit 1 +[ ! -s "filtered_reads_R2.fastq" ] && echo "Filtered read 2 file is empty!" && exit 1 + +echo ">>> Check whether output is correct" +grep -q "ACAGGGTTTCACCATGTTGGCCAGG" filtered_reads_R1.fastq || { echo "Filtered read 1 file does not contain expected sequence!"; exit 1; } +grep -q "TACCATTACCCTACCATCCACCATG" filtered_reads_R2.fastq || { echo "Filtered read 2 file does not contain expected sequence!"; exit 1; } + +rm filtered_reads_R1.fastq filtered_reads_R2.fastq + +#################################################################################################### + +echo ">>> Testing with single-end reads and BBSplit index" +"${meta_executable}" \ + --input "reads_R1.fastq" \ + --build "BBSplit_index" \ + --fastq_1 "filtered_reads_R1.fastq" + +echo ">>> Check whether output exists" +[ ! -f "filtered_reads_R1.fastq" ] && echo "Filtered reads file does not exist!" && exit 1 +[ ! -s "filtered_reads_R1.fastq" ] && echo "Filtered reads file is empty!" && exit 1 + +echo ">>> Check whether output is correct" +grep -q "ACAGGGTTTCACCATGTTGGCCAGG" filtered_reads_R1.fastq || { echo "Filtered reads file does not contain expected sequence!"; exit 1; } + +rm filtered_reads_R1.fastq + +#################################################################################################### + +echo ">>> Testing with paired-end reads and BBSplit index" +"${meta_executable}" \ + --paired \ + --input "reads_R1.fastq;reads_R2.fastq" \ + --build "BBSplit_index" \ + --fastq_1 "filtered_reads_R1.fastq" \ + --fastq_2 "filtered_reads_R2.fastq" + +echo ">>> Check whether output exists" +[ ! -f "filtered_reads_R1.fastq" ] && echo "Filtered read 1 file does not exist!" && exit 1 +[ ! -s "filtered_reads_R1.fastq" ] && echo "Filtered read 1 file is empty!" && exit 1 +[ ! -f "filtered_reads_R2.fastq" ] && echo "Filtered read 2 file does not exist!" && exit 1 +[ ! -s "filtered_reads_R2.fastq" ] && echo "Filtered read 2 file is empty!" && exit 1 + + +echo ">>> Check whether output is correct" +grep -q "ACAGGGTTTCACCATGTTGGCCAGG" filtered_reads_R1.fastq || { echo "Filtered read 1 file does not contain expected sequence!"; exit 1; } +grep -q "TACCATTACCCTACCATCCACCATG" filtered_reads_R2.fastq || { echo "Filtered read 2 file does not contain expected sequence!"; exit 1; } + +rm filtered_reads_R1.fastq filtered_reads_R2.fastq + +echo "All tests succeeded!" +exit 0 \ No newline at end of file