diff --git a/src/sortmerna/config.vsh.yaml b/src/sortmerna/config.vsh.yaml
new file mode 100644
index 00000000..23925132
--- /dev/null
+++ b/src/sortmerna/config.vsh.yaml
@@ -0,0 +1,312 @@
+name: sortmerna
+description: |
+  Local sequence alignment tool for filtering, mapping and clustering. The main
+  application of SortMeRNA is filtering rRNA from metatranscriptomic data. SortMeRNA
+  takes as input files of reads (fasta, fastq, fasta.gz, fastq.gz) and one or multiple
+  rRNA database file(s), and sorts apart aligned and rejected reads into two files.
+keywords: [sort, mRNA, rRNA, alignment, filtering, mapping, clustering]
+links:
+  homepage: https://sortmerna.readthedocs.io/en/latest/
+  documentation: https://sortmerna.readthedocs.io/en/latest/manual4.0.html
+  repository: https://github.com/sortmerna/sortmerna
+references:
+  doi: 10.1093/bioinformatics/bts611
+# NOTE(review): the banner printed by `sortmerna -h` cites the GNU Lesser General
+# Public License; confirm this identifier against the upstream LICENSE file.
+license: GPL-3.0
+
+argument_groups:
+  - name: "Input"
+    arguments:
+      - name: "--paired"
+        type: boolean_true
+        description: |
+          Reads are paired-end. If a single reads file is provided, use this option
+          to indicate the file contains interleaved paired reads when neither
+          'paired_in' | 'paired_out' | 'out2' | 'sout' are specified.
+      - name: "--input"
+        type: file
+        multiple: true
+        description: |
+          Input reads file(s) (fasta, fastq, fasta.gz, fastq.gz). Provide one file for
+          single-end or interleaved reads, or two files for paired-end reads.
+      - name: "--ref"
+        type: file
+        multiple: true
+        description: |
+          Reference fasta file(s) for rRNA database. Either this option or
+          --ribo_database_manifest must be provided.
+      - name: "--ribo_database_manifest"
+        type: file
+        description: |
+          Text file containing paths to fasta files (one per line) that will be used
+          to create the database for SortMeRNA.
+
+  - name: "Output"
+    arguments:
+      - name: "--log"
+        type: file
+        direction: output
+        must_exist: false
+        example: $id.sortmerna.log
+        description: Sortmerna log file.
+      - name: "--output"
+        alternatives: ["--aligned"]
+        type: string
+        description: |
+          Directory and file prefix for aligned output. The appropriate extension:
+          (fasta|fastq|blast|sam|etc) is automatically added.
+          If 'dir' is not specified, the output is created in the WORKDIR/out/.
+          If 'pfx' is not specified, the prefix 'aligned' is used.
+      - name: "--other"
+        type: string
+        description: Create Non-aligned reads output file with this path/prefix. Must be used with fastx.
+
+  - name: "Options"
+    arguments:
+      - name: "--kvdb"
+        type: string
+        description: Path to directory of the key-value database file, used for storing the alignment results.
+      - name: "--idx_dir"
+        type: string
+        description: Path to the directory for storing the reference index files.
+      - name: "--readb"
+        type: string
+        description: Path to the directory for storing pre-processed reads.
+      - name: "--fastx"
+        type: boolean_true
+        description: Output aligned reads into FASTA/FASTQ file
+      - name: "--sam"
+        type: boolean_true
+        description: Output SAM alignment for aligned reads.
+      # The sortmerna 4.3.6 help output (help.txt) spells this option '--SQ'.
+      - name: "--sq"
+        type: boolean_true
+        description: Add SQ tags to the SAM file
+      - name: "--blast"
+        type: string
+        description: |
+          Blast options:
+          * '0' - pairwise
+          * '1' - tabular(Blast - m 8 format)
+          * '1 cigar' - tabular + column for CIGAR
+          * '1 cigar qcov' - tabular + columns for CIGAR and query coverage
+          * '1 cigar qcov qstrand' - tabular + columns for CIGAR, query coverage and strand
+        choices: ['0', '1', '1 cigar', '1 cigar qcov', '1 cigar qcov qstrand']
+      - name: "--num_alignments"
+        type: integer
+        description: |
+          Report first INT alignments per read reaching E-value. If Int = 0, all alignments will be output. Default: '0'
+        example: 0
+      - name: "--min_lis"
+        type: integer
+        description: |
+          Search all alignments having the first INT longest LIS. LIS stands for Longest Increasing Subsequence, it is
+          computed using seeds’ positions to expand hits into longer matches prior to Smith-Waterman alignment. Default: '2'.
+        example: 2
+      - name: "--print_all_reads"
+        type: boolean_true
+        description: Output null alignment strings for non-aligned reads to SAM and/or BLAST tabular files.
+      - name: "--paired_in"
+        type: boolean_true
+        description: |
+          In the case where a pair of reads is aligned with a score above the threshold, the output of the reads is controlled
+          by the following options:
+          * --paired_in and --paired_out are both false: Only one read per pair is output to the aligned fasta file.
+          * --paired_in is true and --paired_out is false: Both reads of the pair are output to the aligned fasta file.
+          * --paired_in is false and --paired_out is true: Both reads are output to the other fasta file (if it is specified).
+      - name: "--paired_out"
+        type: boolean_true
+        description: See description of --paired_in.
+      - name: "--out2"
+        type: boolean_true
+        description: |
+          Output paired reads into separate files. Must be used with '--fastx'. If a single reads file is provided, this option
+          implies interleaved paired reads. When used with 'sout', four (4) output files for aligned reads will be generated:
+          'aligned-paired-fwd, aligned-paired-rev, aligned-singleton-fwd, aligned-singleton-rev'. If 'other' option is also used,
+          eight (8) output files will be generated.
+      - name: "--sout"
+        type: boolean_true
+        description: |
+          Separate paired and singleton aligned reads. Must be used with '--fastx'. If a single reads file is provided,
+          this option implies interleaved paired reads. Cannot be used with '--paired_in' or '--paired_out'.
+      - name: "--zip_out"
+        type: string
+        description: |
+          Compress the output files. The possible values are:
+          * '1/true/t/yes/y'
+          * '0/false/f/no/n'
+          * '-1' (the same format as input - default)
+          The values are Not case sensitive.
+        choices: ['1', 'true', 't', 'yes', 'y', '0', 'false', 'f', 'no', 'n', '-1']
+        example: "-1"
+      - name: "--match"
+        type: integer
+        description: |
+          Smith-Waterman score for a match (positive integer). Default: '2'.
+        example: 2
+      - name: "--mismatch"
+        type: integer
+        description: |
+          Smith-Waterman penalty for a mismatch (negative integer). Default: '-3'.
+        example: -3
+      - name: "--gap_open"
+        type: integer
+        description: |
+          Smith-Waterman penalty for introducing a gap (positive integer). Default: '5'.
+        example: 5
+      - name: "--gap_ext"
+        type: integer
+        description: |
+          Smith-Waterman penalty for extending a gap (positive integer). Default: '2'.
+        example: 2
+      # NOTE(review): the sortmerna 4.3.6 help output (help.txt) lists '-N' as BOOL and
+      # shows no default; confirm the integer type and the '-1' default stated here.
+      - name: "--N"
+        type: integer
+        description: |
+          Smith-Waterman penalty for ambiguous letters (N’s) scored as --mismatch. Default: '-1'.
+        example: -1
+      # The sortmerna 4.3.6 help output (help.txt) marks '-a' as deprecated in favour of '--threads'.
+      - name: "--a"
+        type: integer
+        description: |
+          Number of threads to use. Default: '1'.
+        example: 1
+      - name: "--e"
+        type: double
+        description: |
+          E-value threshold. Default: '1'.
+        example: 1
+      - name: "--F"
+        type: boolean_true
+        description: Search only the forward strand.
+      - name: "--R"
+        type: boolean_true
+        description: Search only the reverse-complementary strand.
+      # NOTE(review): '--num_alignment' (singular) is not listed in the sortmerna 4.3.6 help
+      # output (help.txt) and repeats '--num_alignments' above; confirm it is still accepted.
+      - name: "--num_alignment"
+        type: integer
+        description: |
+          Report first INT alignments per read reaching E-value (--num_alignments 0 signifies all alignments will be output).
+          Default: '-1'
+        example: -1
+      # NOTE(review): the sortmerna 4.3.6 help output (help.txt) lists '--no-best' but no
+      # '--best INT'; confirm this option is still accepted.
+      - name: "--best"
+        type: integer
+        description: |
+          Report INT best alignments per read reaching E-value by searching --min_lis INT candidate alignments (--best 0
+          signifies all candidate alignments will be searched) Default: '1'.
+        example: 1
+      # The sortmerna 4.3.6 help output (help.txt) lists '-v' only; '--verbose' does not appear there.
+      - name: "--verbose"
+        alternatives: ["-v"]
+        type: boolean_true
+        description: Verbose output.
+
+  - name: "OTU picking options"
+    arguments:
+      - name: "--id"
+        type: double
+        description: |
+          %id similarity threshold (the alignment must still pass the E-value threshold). Default: '0.97'.
+        example: 0.97
+      - name: "--coverage"
+        type: double
+        description: |
+          %query coverage threshold (the alignment must still pass the E-value threshold). Default: '0.97'.
+        example: 0.97
+      # The sortmerna 4.3.6 help output (help.txt) spells this option '--de_novo_otu'.
+      - name: "--de_novo"
+        type: boolean_true
+        description: |
+          FASTA/FASTQ file for reads matching database < %id off (set using --id) and < %cov (set using --coverage)
+          (alignment must still pass the E-value threshold).
+      - name: "--otu_map"
+        type: boolean_true
+        description: |
+          Output OTU map (input to QIIME’s make_otu_table.py).
+
+  - name: "Advanced options"
+    arguments:
+      # The sortmerna 4.3.6 help output (help.txt) spells this option '--num_seeds'.
+      - name: "--num_seed"
+        type: integer
+        description: |
+          Number of seeds matched before searching for candidate LIS. Default: '2'.
+        example: 2
+      - name: "--passes"
+        type: integer
+        multiple: true
+        description: |
+          Three intervals at which to place the seed on the read L,L/2,3 (L is the seed length set in ./indexdb_rna).
+      # The sortmerna 4.3.6 help output (help.txt) spells this option '--edges'.
+      - name: "--edge"
+        type: string
+        description: |
+          The number (or percentage if followed by %) of nucleotides to add to each edge of the alignment region on the
+          reference sequence before performing Smith-Waterman alignment. Default: '4'.
+        example: "4"
+      - name: "--full_search"
+        type: boolean_true
+        description: |
+          Search for all 0-error and 1-error seed matches in the index rather than stopping after finding a 0-error match
+          (<1% gain in sensitivity with up to four-fold decrease in speed).
+
+  - name: "Indexing Options"
+    arguments:
+      - name: "--index"
+        type: integer
+        description: |
+          Create index files for the reference database. By default when this option is not used, the program checks the
+          reference index and builds it if not already existing.
+          This can be changed by using '-index' as follows:
+          * '-index 0' - skip indexing. If the index does not exist, the program will terminate
+            and warn to build the index prior performing the alignment
+          * '-index 1' - only perform the indexing and terminate
+          * '-index 2' - the default behaviour, the same as when not using this option at all
+        example: 2
+        choices: [0, 1, 2]
+      - name: "-L"
+        type: double
+        description: |
+          Indexing seed length. Default: '18'
+        example: 18
+      - name: "--interval"
+        type: integer
+        description: |
+          Index every Nth L-mer in the reference database. Default: '1'
+        example: 1
+      - name: "--max_pos"
+        type: integer
+        description: |
+          Maximum number of positions to store for each unique L-mer. Set to 0 to store all positions. Default: '1000'
+        example: 1000
+
+resources:
+  - type: bash_script
+    path: script.sh
+
+test_resources:
+  - type: bash_script
+    path: test.sh
+  - path: test_data
+
+engines:
+  - type: docker
+    image: ubuntu:22.04
+    setup:
+      - type: docker
+        run: |
+          apt-get update && \
+          apt-get install -y --no-install-recommends ca-certificates gzip cmake g++ wget && \
+          apt-get clean && \
+          rm -rf /var/lib/apt/lists/* && \
+          wget https://github.com/sortmerna/sortmerna/releases/download/v4.3.6/sortmerna-4.3.6-Linux.sh && \
+          bash sortmerna-4.3.6-Linux.sh --skip-license && \
+          rm sortmerna-4.3.6-Linux.sh
+runners:
+  - type: executable
+  - type: nextflow
diff --git a/src/sortmerna/help.txt b/src/sortmerna/help.txt new file mode 100644 index 00000000..f0842707 --- /dev/null +++ b/src/sortmerna/help.txt @@ -0,0 +1,319 @@ +``` +sortmerna -h +``` + + + Program: SortMeRNA version 4.3.6 + Copyright: 2016-2020 Clarity Genomics BVBA: + Turnhoutseweg 30, 2340 Beerse, Belgium + 2014-2016 Knight Lab: + Department of Pediatrics, UCSD, La Jolla + 2012-2014 Bonsai Bioinformatics Research Group: + LIFL, University Lille 1, CNRS UMR 8022, INRIA Nord-Europe + Disclaimer: SortMeRNA comes with ABSOLUTELY NO WARRANTY; without even the + implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU Lesser General Public License for more details. 
+ Contributors: Jenya Kopylova jenya.kopylov@gmail.com + Laurent Noé laurent.noe@lifl.fr + Pierre Pericard pierre.pericard@lifl.fr + Daniel McDonald wasade@gmail.com + Mikaël Salson mikael.salson@lifl.fr + Hélène Touzet helene.touzet@lifl.fr + Rob Knight robknight@ucsd.edu + + Usage: sortmerna -ref FILE [-ref FILE] -reads FWD_READS [-reads REV_READS] [OPTIONS]: + ------------------------------------------------------------------------------------------------------------- + | option type-format description default | + ------------------------------------------------------------------------------------------------------------- + + [REQUIRED] + --ref PATH Required Reference file (FASTA) absolute or relative path. + + Use mutliple times, once per a reference file + + + --reads PATH Required Raw reads file (FASTA/FASTQ/FASTA.GZ/FASTQ.GZ). + + Use twice for files with paired reads. + The file extensions are Not important. The program automatically + recognizes the file format as flat/compressed, fasta/fastq + + + + [COMMON] + --workdir PATH Optional Workspace directory USRDIR/sortmerna/run/ + + Default structure: WORKDIR/ + idx/ (References index) + kvdb/ (Key-value storage for alignments) + out/ (processing output) + readb/ (pre-processed reads/index) + + + --kvdb PATH Optional Directory for Key-value database WORKDIR/kvdb + + KVDB is used for storing the alignment results. + + + --idx-dir PATH Optional Directory for storing Reference index. WORKDIR/idx + + + --readb PATH Optional Storage for pre-processed reads WORKDIR/readb/ + + Directory storing the split reads, or the random access index of compressed reads + + + --fastx BOOL Optional Output aligned reads into FASTA/FASTQ file + --sam BOOL Optional Output SAM alignment for aligned reads. 
+ + + --SQ BOOL Optional Add SQ tags to the SAM file + + + --blast STR Optional output alignments in various Blast-like formats + + Sample values: '0' - pairwise + '1' - tabular (Blast - m 8 format) + '1 cigar' - tabular + column for CIGAR + '1 cigar qcov' - tabular + columns for CIGAR and query coverage + '1 cigar qcov qstrand' - tabular + columns for CIGAR, query coverage, + and strand + + + --aligned STR/BOOL Optional Aligned reads file prefix [dir/][pfx] WORKDIR/out/aligned + + Directory and file prefix for aligned output i.e. each + output file goes into the specified directory with the given prefix. + The appropriate extension: (fasta|fastq|blast|sam|etc) is automatically added. + Both 'dir' and 'pfx' are optional. + The 'dir' can be a relative or an absolute path. + If 'dir' is not specified, the output is created in the WORKDIR/out/ + If 'pfx' is not specified, the prefix 'aligned' is used + Examples: + '-aligned $MYDIR/dir_1/dir_2/1' -> $MYDIR/dir_1/dir_2/1.fasta + '-aligned dir_1/apfx' -> $PWD/dir_1/apfx.fasta + '-aligned dir_1/' -> $PWD/aligned.fasta + '-aligned apfx' -> $PWD/apfx.fasta + '-aligned (no argument)' -> WORKDIR/out/aligned.fasta + + + --other STR/BOOL Optional Non-aligned reads file prefix [dir/][pfx] WORKDIR/out/other + + Directory and file prefix for non-aligned output i.e. each + output file goes into the specified directory with the given prefix. + The appropriate extension: (fasta|fastq|blast|sam|etc) is automatically added. + Must be used with 'fastx'. + Both 'dir' and 'pfx' are optional. + The 'dir' can be a relative or an absolute path. + If 'dir' is not specified, the output is created in the WORKDIR/out/ + If 'pfx' is not specified, the prefix 'other' is used + Examples: + '-other $MYDIR/dir_1/dir_2/1' -> $MYDIR/dir_1/dir_2/1.fasta + '-other dir_1/apfx' -> $PWD/dir_1/apfx.fasta + '-other dir_1/' -> $PWD/dir_1/other.fasta + '-other apfx' -> $PWD/apfx.fasta + '-other (no argument)' -> aligned_out/other.fasta + i.e. 
the same output directory + as used for aligned output + + + --num_alignments INT Optional Positive integer (INT >=0). + + If used with '-no-best' reports first INT alignments per read reaching + E-value threshold, which allows to lower the CPU time and memory use. + Otherwise outputs INT best alignments. + If INT = 0, all alignments are output + + + --no-best BOOL Optional Disable best alignments search False + + The 'best' alignment is the highest scoring alignment out of All alignments of a read, + and the read can potentially be aligned (reaching E-value threshold) to multiple reference + sequences. + By default the program searches for best alignments i.e. performs an exhaustive search + over all references. Using '-no-best' will make the program to search just + the first N alignments, where N is set using '-num_alignments' i.e. 1 by default. + + + --min_lis INT Optional Search only alignments that have the LIS 2 + of at least N seeds long + + LIS stands for Longest Increasing Subsequence. It is computed using seeds, which + are k-mers common to the read and the reference sequence. Sorted sequences of such seeds + are used to filter the candidate references prior performing the Smith-Waterman alignment. + + + --print_all_reads BOOL Optional Output null alignment strings for non-aligned reads False + to SAM and/or BLAST tabular files + + --paired BOOL Optional Flags paired reads False + + If a single reads file is provided, use this option to indicate + the file contains interleaved paired reads when neither + 'paired_in' | 'paired_out' | 'out2' | 'sout' are specified. + + + --paired_in BOOL Optional Flags the paired-end reads as Aligned, False + when either of them is Aligned. + + With this option both reads are output into Aligned FASTA/Q file + Must be used with 'fastx'. + Mutually exclusive with 'paired_out'. + + + --paired_out BOOL Optional Flags the paired-end reads as Non-aligned, False + when either of them is non-aligned. 
+ + With this option both reads are output into Non-Aligned FASTA/Q file + Must be used with 'fastx'. + Mutually exclusive with 'paired_in'. + + + --out2 BOOL Optional Output paired reads into separate files. False + + Must be used with 'fastx'. + If a single reads file is provided, this options implies interleaved paired reads + When used with 'sout', four (4) output files for aligned reads will be generated: + 'aligned-paired-fwd, aligned-paired-rev, aligned-singleton-fwd, aligned-singleton-rev'. + If 'other' option is also used, eight (8) output files will be generated. + + + --sout BOOL Optional Separate paired and singleton aligned reads. False + + To be used with 'fastx'. + If a single reads file is provided, this options implies interleaved paired reads + Cannot be used with 'paired_in' | 'paired_out' + + + --zip-out STR/BOOL Optional Controls the output compression '-1' + + By default the report files are produced in the same format as the input i.e. + if the reads files are compressed (gz), the output is also compressed. + The default behaviour can be overriden by using '-zip-out'. + The possible values: '1/true/t/yes/y' + '0/false/f/no/n' + '-1' (the same format as input - default) + The values are Not case sensitive i.e. 'Yes, YES, yEs, Y, y' are all OK + Examples: + '-reads freads.gz -zip-out n' : generate flat output when the input is compressed + '-reads freads.flat -zip-out' : compress the output when the input files are flat + + + --match INT Optional SW score (positive integer) for a match. 2 + + --mismatch INT Optional SW penalty (negative integer) for a mismatch. -3 + + --gap_open INT Optional SW penalty (positive integer) for introducing a gap. 5 + + --gap_ext INT Optional SW penalty (positive integer) for extending a gap. 2 + + -e DOUBLE Optional E-value threshold. 1 + + Defines the 'statistical significance' of a local alignment. + Exponentially correllates with the Minimal Alignment score. + Higher E-values (100, 1000, ...) 
cause More reads to Pass the alignment threshold + + + -F BOOL Optional Search only the forward strand. False + + -N BOOL Optional SW penalty for ambiguous letters (N's) scored + as --mismatch + + -R BOOL Optional Search only the reverse-complementary strand. False + + + [OTU_PICKING] + --id INT Optional %%id similarity threshold (the alignment 0.97 + must still pass the E-value threshold). + + --coverage INT Optional %%query coverage threshold (the alignment must 0.97 + still pass the E-value threshold) + + --de_novo_otu BOOL Optional Output FASTA file with 'de novo' reads False + + Read is 'de novo' if its alignment score passes E-value threshold, but both the identity + '-id', and the '-coverage' are below their corresponding thresholds + i.e. ID < %%id and COV < %%cov + + + --otu_map BOOL Optional Output OTU map (input to QIIME's make_otu_table.py). False + Cannot be used with 'no-best because + the grouping is done around the best alignment' + + + [ADVANCED] + --passes INT,INT,INT Optional Three intervals at which to place the seed on L,L/2,3 + the read (L is the seed length) + + --edges INT Optional Number (or percent if INT followed by %% sign) of 4 + nucleotides to add to each edge of the read + prior to SW local alignment + + --num_seeds BOOL Optional Number of seeds matched before searching 2 + for candidate LIS + + --full_search INT Optional Search for all 0-error and 1-error seed False + matches in the index rather than stopping + after finding a 0-error match (<1%% gain in + sensitivity with up four-fold decrease in speed) + + --pid BOOL Optional Add pid to output file names. False + + -a INT Optional DEPRECATED in favour of '-threads'. Number of numCores + processing threads to use. 
+ Automatically redirects to '-threads' + + --threads INT Optional Number of Processing threads to use 2 + + + [INDEXING] + --index INT Optional Build reference database index 2 + + By default when this option is not used, the program checks the reference index and + builds it if not already existing. + This can be changed by using '-index' as follows: + '-index 0' - skip indexing. If the index does not exist, the program will terminate + and warn to build the index prior performing the alignment + '-index 1' - only perform the indexing and terminate + '-index 2' - the default behaviour, the same as when not using this option at all + + + -L DOUBLE Optional Indexing: seed length. 18 + + -m DOUBLE Optional Indexing: the amount of memory (in Mbytes) for 3072 + building the index. + + -v BOOL Optional Produce verbose output when building the index True + + --interval INT Optional Indexing: Positive integer: index every Nth L-mer in 1 + the reference database e.g. '-interval 2'. + + --max_pos INT Optional Indexing: maximum (integer) number of positions to 1000 + store for each unique L-mer. + If 0 - all positions are stored. + + + [HELP] + -h BOOL Optional Print help information + + --version BOOL Optional Print SortMeRNA version number + + + [DEVELOPER] + --dbg_put_db BOOL Optional + --cmd BOOL Optional Launch an interactive session (command prompt) False + + --task INT Optional Processing Task 4 + + Possible values: 0 - align. Only perform alignment + 1 - post-processing (log writing) + 2 - generate reports + 3 - align and post-process + 4 - all + + + --dbg-level INT Optional Debug level 0 + + Controls verbosity of the execution trace. Default value of 0 corresponds to + the least verbose output. + The highest value currently is 2. 
diff --git a/src/sortmerna/script.sh b/src/sortmerna/script.sh new file mode 100755 index 00000000..8dda3d60 --- /dev/null +++ b/src/sortmerna/script.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -eo pipefail + +unset_if_false=( par_fastx par_sq par_fastx par_print_all_reads par_paired_in par_paired_out + par_F par_R par_verbose par_de_novo par_otu_map par_full_search par_out2 + par_sout par_sam par_paired ) + + +for var in "${unset_if_false[@]}"; do + if [ "${!var}" == "false" ]; then + unset $var + fi +done + +reads=() +IFS=";" read -ra input <<< "$par_input" +if [ "${#input[@]}" -eq 2 ]; then + reads="--reads ${input[0]} --reads ${input[1]}" + # set paired to true in case it's not + par_paired=true +else + reads="--reads ${input[0]}" + par_paired=false +fi + +refs=() + +# check if references are input normally or through a manifest file +if [[ ! -z "$par_ribo_database_manifest" ]]; then + while IFS= read -r path || [[ -n $path ]]; do + refs=$refs" --ref $path" + done < $par_ribo_database_manifest + +elif [[ ! -z "$par_ref" ]]; then + IFS=";" read -ra ref <<< "$par_ref" + # check if length is 2 and par_paired is set to true + if [[ "${#ref[@]}" -eq 2 && "$par_paired" == "true" ]]; then + refs="--ref ${ref[0]} --ref ${ref[1]}" + # check if length is 1 and par_paired is set to false + elif [[ "${#ref[@]}" -eq 1 && "$par_paired" == "false" ]]; then + refs="--ref $par_ref" + else # if one reference provided but paired is set to true: + echo "Two reference fasta files are required for paired-end reads" + exit 1 + fi +else + echo "No reference fasta file(s) provided" + exit 1 +fi + + +sortmerna \ + $refs \ + $reads \ + --workdir . 
\ + ${par_output:+--aligned "${par_output}"} \ + ${par_fastx:+--fastx} \ + ${par_other:+--other "${par_other}"} \ + ${par_kvdb:+--kvdb "${par_kvdb}"} \ + ${par_idx_dir:+--idx-dir "${par_idx_dir}"} \ + ${par_readb:+--readb "${par_readb}"} \ + ${par_sam:+--sam} \ + ${par_sq:+--sq} \ + ${par_blast:+--blast "${par_blast}"} \ + ${par_num_alignments:+--num_alignments "${par_num_alignments}"} \ + ${par_min_lis:+--min_lis "${par_min_lis}"} \ + ${par_print_all_reads:+--print_all_reads} \ + ${par_paired_in:+--paired_in} \ + ${par_paired_out:+--paired_out} \ + ${par_out2:+--out2} \ + ${par_sout:+--sout} \ + ${par_zip_out:+--zip-out "${par_zip_out}"} \ + ${par_match:+--match "${par_match}"} \ + ${par_mismatch:+--mismatch "${par_mismatch}"} \ + ${par_gap_open:+--gap_open "${par_gap_open}"} \ + ${par_gap_ext:+--gap_ext "${par_gap_ext}"} \ + ${par_N:+-N "${par_N}"} \ + ${par_a:+-a "${par_a}"} \ + ${par_e:+-e "${par_e}"} \ + ${par_F:+-F} \ + ${par_R:+-R} \ + ${par_num_alignment:+--num_alignment "${par_num_alignment}"} \ + ${par_best:+--best "${par_best}"} \ + ${par_verbose:+--verbose} \ + ${par_id:+--id "${par_id}"} \ + ${par_coverage:+--coverage "${par_coverage}"} \ + ${par_de_novo:+--de_novo} \ + ${par_otu_map:+--otu_map} \ + ${par_num_seed:+--num_seed "${par_num_seed}"} \ + ${par_passes:+--passes "${par_passes}"} \ + ${par_edge:+--edge "${par_edge}"} \ + ${par_full_search:+--full_search} \ + ${par_index:+--index "${par_index}"} \ + ${par_L:+-L $par_L} \ + ${par_interval:+--interval "${par_interval}"} \ + ${par_max_pos:+--max_pos "${par_max_pos}"} + + +if [ ! 
-z $par_log ]; then + mv "${par_output}.log" $par_log +fi + +exit 0 + diff --git a/src/sortmerna/test.sh b/src/sortmerna/test.sh new file mode 100644 index 00000000..4d49c5ed --- /dev/null +++ b/src/sortmerna/test.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +echo ">>> Testing $meta_functionality_name" + +find $meta_resources_dir/test_data/rRNA -type f > test_data/rrna-db.txt + +echo ">>> Testing for paired-end reads" +# out2 separates the read pairs into two files (one fwd and one rev) +# paired_in outputs both reads of a pair +# other is the output file for non-rRNA reads +"$meta_executable" \ + --output "rRNA_reads" \ + --other "non_rRNA_reads" \ + --input "$meta_resources_dir/test_data/reads_1.fq.gz;$meta_resources_dir/test_data/reads_2.fq.gz" \ + --ribo_database_manifest test_data/rrna-db.txt \ + --log test_log.log \ + --paired_in \ + --fastx \ + --out2 + + +echo ">> Checking if the correct files are present" +[[ -f "rRNA_reads_fwd.fq.gz" ]] || [[ -f "rRNA_reads_rev.fq.gz" ]] || { echo "rRNA output fastq file is missing!"; exit 1; } +[[ -s "rRNA_reads_fwd.fq.gz" ]] && [[ -s "rRNA_reads_rev.fq.gz" ]] || { echo "rRNA output fastq file is empty!"; exit 1; } +[[ -f "non_rRNA_reads_fwd.fq.gz" ]] || [[ -f "non_rRNA_reads_rev.fq.gz" ]] || { echo "Non-rRNA output fastq file is missing!"; exit 1;} +gzip -dk non_rRNA_reads_fwd.fq.gz +gzip -dk non_rRNA_reads_rev.fq.gz +[[ ! -s "non_rRNA_reads_fwd.fq" ]] && [[ ! -s "non_rRNA_reads_rev.fq" ]] || { echo "Non-rRNA output fastq file is not empty!"; exit 1;} + +rm -f rRNA_reads_fwd.fq.gz rRNA_reads_rev.fq.gz non_rRNA_reads_fwd.fq.gz non_rRNA_reads_rev.fq.gz test_log.log +rm -rf kvdb/ + + +echo ">>> Testing for single-end reads" +"$meta_executable" \ + --aligned "rRNA_reads" \ + --other "non_rRNA_reads" \ + --input $meta_resources_dir/test_data/reads_1.fq.gz \ + --ref $meta_resources_dir/test_data/rRNA/database1.fa \ + --log test_log.log \ + --fastx + +echo ">> Checking if the correct files are present" +[[ ! 
-f "rRNA_reads.fq.gz" ]] && echo "rRNA output fastq file is missing!" && exit 1 +gzip -dk rRNA_reads.fq.gz +[[ -s "rRNA_reads.fq" ]] && echo "rRNA output fastq file is not empty!" && exit 1 +[[ ! -f "non_rRNA_reads.fq.gz" ]] && echo "Non-rRNA output fastq file is missing!" && exit 1 +[[ ! -s "non_rRNA_reads.fq.gz" ]] && echo "Non-rRNA output fastq file is empty!" && exit 1 + + +echo ">>> All tests passed" +exit 0 \ No newline at end of file diff --git a/src/sortmerna/test_data/rRNA/database1.fa b/src/sortmerna/test_data/rRNA/database1.fa new file mode 100644 index 00000000..bae23aba --- /dev/null +++ b/src/sortmerna/test_data/rRNA/database1.fa @@ -0,0 +1,24 @@ +>AY846379.1.1791 Eukaryota;Archaeplastida;Chloroplastida;Chlorophyta;Chlorophyceae;Sphaeropleales;Monoraphidium;Monoraphidium sp. Itas 9/21 14-6w +CCUGGUUGAUCCUGCCAGUAGUCAUAUGCUUGUCUCAAAGAUUAAGCCAUGCAUGUCUAAGUAUAAACUGCUUAUACUGU +GAAACUGCGAAUGGCUCAUUAAAUCAGUUAUAGUUUAUUUGAUGGUACCUCUACACGGAUAACCGUAGUAAUUCUAGAGC +UAAUACGUGCGUAAAUCCCGACUUCUGGAAGGGACGUAUUUAUUAGAUAAAAGGCCGACCGAGCUUUGCUCGACCCGCGG +UGAAUCAUGAUAACUUCACGAAUCGCAUAGCCUUGUGCUGGCGAUGUUUCAUUCAAAUUUCUGCCCUAUCAACUUUCGAU +GGUAGGAUAGAGGCCUACCAUGGUGGUAACGGGUGACGGAGGAUUAGGGUUCGAUUCCGGAGAGGGAGCCUGAGAAACGG +CUACCACAUCCAAGGAAGGCAGCAGGCGCGCAAAUUACCCAAUCCUGAUACGGGGAGGUAGUGACAAUAAAUAACAAUGC +CGGGCAUUUCAUGUCUGGCAAUUGGAAUGAGUACAAUCUAAAUCCCUUAACGAGGAUCAAUUGGAGGGCAAGUCUGGUGC +CAGCAGCCGCGGUAAUUCCAGCUCCAAUAGCGUAUAUUUAAGUUGUUGCAGUUAAAAAGCUCGUAGUUGGAUUUCGGGUG +GGUUCCAGCGGUCCGCCUAUGGUGAGUACUGCUGUGGCCCUCCUUUUUGUCGGGGACGGGCUCCUGGGCUUCAUUGUCCG +GGACUCGGAGUCGACGAUGAUACUUUGAGUAAAUUAGAGUGUUCAAAGCAAGCCUACGCUCUGAAUACUUUAGCAUGGAA +UAUCGCGAUAGGACUCUGGCCUAUCUCGUUGGUCUGUAGGACCGGAGUAAUGAUUAAGAGGGACAGUCGGGGGCAUUCGU +AUUUCAUUGUCAGAGGUGAAAUUCUUGGAUUUAUGAAAGACGAACUACUGCGAAAGCAUUUGCCAAGGAUGUUUUCAUUA +AUCAAGAACGAAAGUUGGGGGCUCGAAGACGAUUAGAUACCGUCGUAGUCUCAACCAUAAACGAUGCCGACUAGGGAUUG +GAGGAUGUUCUUUUGAUGACUUCUCCAGCACCUUAUGAGAAAUCAAAGUUUUUGGGUUCCGGGGGGAGUAUGGUCGCAAG 
+GCUGAAACUUAAAGGAAUUGACGGAAGGGCACCACCAGGCGUGGAGCCUGCGGCUUAAUUUGACUCAACACGGGAAAACU +UACCAGGUCCAGACAUAGUGAGGAUUGACAGAUUGAGAGCUCUUUCUUGAUUCUAUGGGUGGUGGUGCAUGGCCGUUCUU +AGUUGGUGGGUUGCCUUGUCAGGUUGAUUCCGGUAACGAACGAGACCUCAGCCUGCUAAAUAUGUCACAUUCGCUUUUUG +CGGAUGGCCGACUUCUUAGAGGGACUAUUGGCGUUUAGUCAAUGGAAGUAUGAGGCAAUAACAGGUCUGUGAUGCCCUUA +GAUGUUCUGGGCCGCACGCGCGCUACACUGACGCAUUCAGCAAGCCUAUCCUUGACCGAGAGGUCUGGGUAAUCUUUGAA +ACUGCGUCGUGAUGGGGAUAGAUUAUUGCAAUUAUUAGUCUUCAACGAGGAAUGCCUAGUAAGCGCAAGUCAUCAGCUUG +CGUUGAUUACGUCCCUGCCCUUUGUACACACCGCCCGUCGCUCCUACCGAUUGGGUGUGCUGGUGAAGUGUUCGGAUUGG +CAGAGCGGGUGGCAACACUUGCUUUUGCCGAGAAGUUCAUUAAACCCUCCCACCUAGAGGAAGGAGAAGUCGUAACAAGG +UUUCCGUAGGUGAACCUGCAGAAG \ No newline at end of file diff --git a/src/sortmerna/test_data/rRNA/database2.fa b/src/sortmerna/test_data/rRNA/database2.fa new file mode 100644 index 00000000..87b5bc99 --- /dev/null +++ b/src/sortmerna/test_data/rRNA/database2.fa @@ -0,0 +1,16 @@ +>AB001445.1.1538 Bacteria;Proteobacteria;Gammaproteobacteria;Pseudomonadales;Pseudomonadaceae;Pseudomonas;Pseudomonas amygdali pv. 
morsprunorum +AGAGUUUGAUCAUGGCUCAGAUUGAACGCUGGCGGCAGGCCUAACACAUGCAAGUCGAGCGGCAGCACGGGUACUUGUAC +CUGGUGGCGAGCGGCGGACGGGUGAGUAAUGCCUAGGAAUCUGCCUGGUAGUGGGGGAUAACGCUCGGAAACGGACGCUA +AUACCGCAUACGUCCUACGGGAGAAAGCAGGGGACCUUCGGGCCUUGCGCUAUCAGAUGAGCCUAGGUCGGAUUAGCUAG +UUGGUGAGGUAAUGGCUCACCAAGGCGACGAUCCGUAACUGGUCUGAGAGGAUGAUCAGUCACACUGGAACUGAGACACG +GUCCAGACUCCUACGGGAGGCAGCAGUGGGGAAUAUUGGACAAUGGGCGAAAGCCUGAUCCAGCCAUGCCGCGUGUGUGA +AGAAGGUCUUCGGAUUGUAAAGCACUUUAAGUUGGGAGGAAGGGCAGUUACCUAAUACGUAUCUGUUUUGACGUUACCGA +CAGAAUAAGCACCGGCUAACUCUGUGCCAGCAGCCGCGGUAAUACAGAGGGUGCAAGCGUUAAUCGGAAUUACUGGGCGU +AAAGCGCGCGUAGGUGGUUUGUUAAGUUGAAUGUGAAAUCCCCGGGCUCAACCUGGGAACUGCAUCCAAAACUGGCAAGC +UAGAGUAUGGUAGAGGGUGGUGGAAUUUCCUGUGUAGCGGUGAAAUGCGUAGAUAUAGGAAGGAACACCAGUGGCGAAGG +CGACCACCUGGACUGAUACUGACACUGAGGUGCGAAAGCGUGGGGAGCAAACAGGAUUAGAUACCCUGGUAGUCCACGCC +GUAAACGAUGUCAACUAGCCGUUGGGAGCCUUGAGCUCUUAGUGGCGCAGCUAACGCAUUAAGUUGACCGCCUGGGGAGU +ACGGCCGCAAGGUUAAAACUCAAAUGAAUUGACGGGGGCCCGCACAAGCGGUGGAGCAUGUGGUUUAAUUCGAAGCAACG +CGAAGAACCUUACCAGGCCUUGACAUCCAAUGAAUCCUUUAGAGAUAGAGGAGUGCCUUCGGGAGCAUUGAGACAGGUGC +UGCAUGGCUGUCGUCAGCUCGUGUCGUGAGAUGUUGGGUUAAGUCCCGUAACGAGCGCAACCCUUGUCCUUAGUUACCAG +CACGUCAUGGUGGGCACUCUAAGGAGACUGCCGGUGACAAACCGGAGGAAGGUGGGGAUGACGUCAAGUCAUCAUGGCCC diff --git a/src/sortmerna/test_data/reads_1.fq.gz b/src/sortmerna/test_data/reads_1.fq.gz new file mode 100644 index 00000000..41c02a22 Binary files /dev/null and b/src/sortmerna/test_data/reads_1.fq.gz differ diff --git a/src/sortmerna/test_data/reads_2.fq.gz b/src/sortmerna/test_data/reads_2.fq.gz new file mode 100644 index 00000000..9d0f8d3f Binary files /dev/null and b/src/sortmerna/test_data/reads_2.fq.gz differ diff --git a/src/sortmerna/test_data/script.sh b/src/sortmerna/test_data/script.sh new file mode 100755 index 00000000..b2531248 --- /dev/null +++ b/src/sortmerna/test_data/script.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +if [ ! 
-d /tmp/sortmerna_source ]; then + git clone --depth 2 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers.git /tmp/sortmerna_source +fi + +# copy test data +cp -r /tmp/sortmerna_source/bio/sortmerna/test/* .