From 38f586bec0ac9e4312b016e29c3aa0bd53f292b2 Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Thu, 11 Apr 2024 11:04:14 +0100
Subject: [PATCH 1/5] initial commit dedup

---
 CHANGELOG.md                                  |   3 +
 src/umi_tools/umi_tools_dedup/config.vsh.yaml | 279 ++++++++++++++++++
 src/umi_tools/umi_tools_dedup/help.txt        |  13 +
 src/umi_tools/umi_tools_dedup/script.sh       |  65 ++++
 src/umi_tools/umi_tools_dedup/test.sh         |  49 +++
 5 files changed, 409 insertions(+)
 create mode 100644 src/umi_tools/umi_tools_dedup/config.vsh.yaml
 create mode 100644 src/umi_tools/umi_tools_dedup/help.txt
 create mode 100644 src/umi_tools/umi_tools_dedup/script.sh
 create mode 100644 src/umi_tools/umi_tools_dedup/test.sh

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4fd7f001..1bef9345 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -39,6 +39,9 @@
     - `samtools/flagstat`: Counts the number of alignments in SAM/BAM/CRAM files for each FLAG type (PR #31).
     - `samtools/idxstats`: Reports alignment summary statistics for a SAM/BAM/CRAM file (PR #32).
 
+* `umi_tools`:
+    - `umi_tools/umi_tools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #).
+    
 ## MAJOR CHANGES
 
 ## MINOR CHANGES
diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
new file mode 100644
index 00000000..75306541
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
@@ -0,0 +1,279 @@
+name: umi_tools_dedup  # same name as the component directory and the CHANGELOG entry
+namespace: umi_tools
+description: |
+  Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read.
+  Options shared by all umi_tools commands: https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options
+keywords: [umi_tools, deduplication, dedup]
+links:
+  homepage: https://umi-tools.readthedocs.io/en/latest/
+  documentation: https://umi-tools.readthedocs.io/en/latest/reference/dedup.html  # one URL (scalar); NOTE(review): confirm against the Viash schema
+  repository: https://github.com/CGATOxford/UMI-tools
+references:
+  doi: 10.1101/gr.209601.116
+license: MIT
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: --input
+        alternatives: -I
+        type: file
+        description: Input BAM or SAM file. Use --in_sam to specify SAM format.
+        required: true
+      - name: --in_sam
+        type: boolean_true
+        description: |
+          By default, inputs are assumed to be in BAM format. Use this option
+          to specify the use of SAM format for input.
+      - name: --bai
+        type: file
+        description: BAM index
+      - name: --get_output_stats
+        type: boolean
+        description: Whether or not to generate output stats. 
+      - name: --random_seed
+        type: integer
+        description: |
+          Random seed to initialize number generator with.
+        # No default: `none` is not a valid integer; leave the option unset instead.
+
+  - name: Outputs
+    arguments:
+      - name: --output
+        alternatives: -S
+        type: file
+        description: Deduplicated BAM file
+        required: true
+        direction: output
+      - name: --out_sam
+        type: boolean_true
+        description: |
+          By default, outputs are written in BAM format. Use this option to
+          specify the use of SAM format for output.
+      - name: --paired
+        type: boolean_true
+        description: |
+          BAM is paired end - output both read pairs. This will also force the
+          use of the template length to determine reads with the same mapping
+          coordinates.
+      - name: --output_stats
+        type: file
+        description: Directory containing UMI based deduplication statistics files
+        direction: output
+      - name: --extract_umi_method
+        type: string
+        description: |
+          Specify the method by which the barcodes were encoded in the read.
+          The options are: [read_id, tag, umis].
+        default: read_id
+      - name: --umi_tag
+        type: string
+        description: |
+          The tag containing the UMI sequence. 
+          This is only required if the extract_umi_method is set to tag.
+      - name: --umi_separator
+        type: string
+        description: |
+          The separator used to separate the UMI from the read sequence. 
+          This is only required if the extract_umi_method is set to read_id.
+        default: '_'
+      - name: --umi_tag_split
+        type: string
+        description: |
+          Separate the UMI in tag by <SPLIT> and take the first element.
+      - name: --umi_tag_delimiter
+        type: string
+        description: |
+          Separate the UMI in tag by <DELIMITER> and concatenate the elements.
+      - name: --cell_tag
+        type: string
+        description: |
+          The tag containing the cell barcode sequence. 
+          This is only required if the extract_umi_method is set to tag.
+      - name: --cell_tag_split
+        type: string
+        description: |
+          Separate the cell barcode in tag by <SPLIT> and take the first element.
+      - name: --cell_tag_delimiter
+        type: string
+        description: |
+          Separate the cell barcode in tag by <DELIMITER> and concatenate the elements.
+  
+  - name: Grouping Options
+    arguments:    
+      - name: --method
+        type: string
+        description: |
+          The method to use for grouping reads. The options are: 
+          [unique, percentile, cluster, adjacency, directional].
+        default: directional
+      - name: --edit_distance_threshold
+        type: integer
+        description: |
+          For the adjacency and cluster methods the threshold for the edit 
+          distance to connect two UMIs in the network can be increased. The 
+          default value of 1 works best unless the UMI is very long (>14bp).
+        default: 1
+      - name: --spliced_is_unique
+        type: boolean_true
+        description: |
+          Causes two reads that start in the same position on the same strand
+          and having the same UMI to be considered unique if one is spliced 
+          and the other is not. (Uses the ‘N’ cigar operation to test for splicing).
+      - name: --soft_clip_threshold
+        type: integer
+        description: |
+          Mappers that soft clip will sometimes do so rather than mapping a
+          spliced read if there is only a small overhang over the exon junction.
+          By setting this option, you can treat reads with at least this many
+          bases soft-clipped at the 3’ end as spliced.
+        default: 4
+      - name: --multimapping_detection_method
+        type: string
+        description: |
+          If the sam/bam contains tags to identify multimapping reads, you can
+          specify which tag to use when selecting the best read at a given locus.
+          Supported tags are “NH”, “X0” and “XT”. If not specified, the read with
+          the highest mapping quality will be selected.
+      - name: --read_length
+        type: integer
+        description: |
+          Use the read length as a criteria when deduping, for e.g sRNA-Seq.
+  
+  - name: Single-cell RNA-Seq Options
+    arguments:
+      - name: --per_gene
+        type: boolean_true
+        description: |
+          Reads will be grouped together if they have the same gene. This is useful
+          if your library prep generates PCR duplicates with non identical alignment
+          positions such as CEL-Seq. Note this option is hardcoded to be on with the
+          count command. I.e counting is always performed per-gene. Must be combined
+          with either --gene_tag or --per_contig option.
+      - name: --gene_tag
+        type: string
+        description: |
+          Deduplicate per gene. The gene information is encoded in the bam read tag
+          specified.
+      - name: --assigned_status_tag
+        type: string
+        description: |
+          BAM tag which describes whether a read is assigned to a gene. Defaults to
+          the same value as given for --gene_tag.
+      - name: --skip_tags_regex
+        type: string
+        description: |
+          Use in conjunction with the --assigned_status_tag option to skip any reads
+          where the tag matches this regex. Default ("^[__|Unassigned]") matches
+          anything which starts with “__” or “Unassigned”.
+      - name: --per_contig
+        type: boolean_true
+        description: |
+          Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same
+          contig will be considered to have the same alignment position. This is
+          useful if you have aligned to a reference transcriptome with one
+          transcript per gene. If you have aligned to a transcriptome with more
+          than one transcript per gene, you can supply a map between transcripts
+          and gene using the --gene_transcript_map option.
+      - name: --gene_transcript_map
+        type: file
+        description: |
+          A file containing a mapping between gene names and transcript names.
+          The file should be tab separated with the gene name in the first column
+          and the transcript name in the second column.
+      - name: --per_cell
+        type: boolean_true
+        description: |
+          Reads will only be grouped together if they have the same cell barcode.
+          Can be combined with --per_gene.
+  
+  - name: SAM/BAM Options
+    arguments:
+      - name: --mapping_quality
+        type: integer
+        description: |
+          Minimum mapping quality (MAPQ) for a read to be retained.
+        default: 0
+      - name: --unmapped_reads
+        type: string
+        description: |
+          How unmapped reads should be handled. 
+          The options are:
+          "discard": Discard all unmapped reads.
+          "use":     If read2 is unmapped, deduplicate using read1 only. 
+                     Requires --paired.
+          "output":  Output unmapped reads/read pairs without UMI 
+                     grouping/deduplication. Only available in umi_tools group.
+        default: discard
+      - name: --chimeric_pairs
+        type: string
+        description: |
+          How chimeric pairs should be handled. 
+          The options are:
+          "discard": Discard all chimeric read pairs.
+          "use":     Deduplicate using read1 only.
+          "output":  Output chimeric pairs without UMI grouping/deduplication. 
+                     Only available in umi_tools group.
+        default: use
+      - name: --unapired_reads  # NOTE(review): likely a typo for --unpaired_reads; script.sh reads par_unapired_reads, so rename both together
+        type: string
+        description: |
+          How unpaired reads should be handled. 
+          The options are:
+          "discard": Discard all unpaired reads.
+          "use":     Deduplicate using read1 only.
+          "output":  Output unpaired reads without UMI grouping/deduplication. 
+                     Only available in umi_tools group.
+        default: use
+      - name: --ignore_umi
+        type: boolean_true
+        description: |
+          Ignore the UMI and group reads using mapping coordinates only.
+      - name: --subset
+        type: double  # upstream --subset takes the fraction of reads to keep, not a bare flag
+        description: |
+          Only consider a fraction of the reads, chosen at random. This is useful
+          for doing saturation analyses.
+      - name: --chrom
+        type: string
+        description: |
+          Only consider a single chromosome. This is useful for debugging/testing 
+          purposes.
+  
+  - name: Group/Dedup Options
+    arguments:
+      - name: --no_sort_output
+        type: boolean_true
+        description: |
+          By default, output is sorted. This involves the use of a temporary unsorted
+          file (saved in --temp-dir). Use this option to turn off sorting.
+      - name: --buffer_whole_contig
+        type: boolean_true
+        description: |
+          Forces dedup to parse an entire contig before yielding any reads for
+          deduplication. This is the only way to absolutely guarantee that all reads
+          with the same start position are grouped together for deduplication since
+          dedup uses the start position of the read, not the alignment coordinate on
+          which the reads are sorted. However, by default, dedup reads for another
+          1000bp before outputting read groups which will avoid any reads being missed
+          with short read sequencing (<1000bp).
+
+
+resources:
+  - type: bash_script
+    path: script.sh
+test_resources:
+  - type: bash_script
+    path: test.sh
+  - type: file
+    path: test_data
+engines:
+  - type: docker
+    image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1
+    setup:
+      - type: docker
+        run: |
+            umi_tools -v | sed 's/ version//g' > /var/software_versions.txt
+runners:
+- type: executable
+- type: nextflow
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt
new file mode 100644
index 00000000..d3c8fa44
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/help.txt
@@ -0,0 +1,13 @@
+```
+umi_tools dedup
+```
+
+dedup - Deduplicate reads using UMI and mapping coordinates
+
+Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM]
+
+       note: If --stdout is ommited, standard out is output. To
+             generate a valid BAM file on standard out, please
+             redirect log with --log=LOGFILE or --log2stderr 
+
+For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh
new file mode 100644
index 00000000..57c01258
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/script.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+
+set -e
+
+test_dir="${metal_executable}/test_data"
+
+[[ "$par_paired" == "false" ]] && unset par_paired
+[[ "$par_in_sam" == "false" ]] && unset par_in_sam
+[[ "$par_out_sam" == "false" ]] && unset par_out_sam
+[[ "$par_spliced_is_unique" == "false" ]] && unset par_spliced_is_unique
+[[ "$par_per_gene" == "false" ]] && unset par_per_gene
+[[ "$par_per_contig" == "false" ]] && unset par_per_contig
+[[ "$par_per_cell" == "false" ]] && unset par_per_cell
+[[ "$par_no_sort_output" == "false" ]] && unset par_no_sort_output
+[[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig
+[[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi
+[[ "$par_subset" == "false" ]] && unset par_subset
+
+
+$(which umi_tools) dedup \
+    -I "$par_input" \
+    ${par_in_sam:+--in-sam} \
+    ${par_bai:+--bai "$par_bai"} \
+    ${par_get_output_stats:+--get-output-stats} \
+    ${par_random_seed:+--random-seed "$par_random_seed"} \
+    -S "$par_output" \
+    ${par_out_sam:+--out-sam} \
+    ${par_paired:+--paired} \
+    ${par_output_stats:+--output-stats "$par_output_stats"} \
+    ${par_extract_umi_method:+--extract-umi-method "$par_extract_umi_method"} \
+    ${par_umi_tag:+--umi-tag "$par_umi_tag"} \
+    ${par_umi_separator:+--umi-separator "$par_umi_separator"} \
+    ${par_umi_tag_split:+--umi-tag-split "$par_umi_tag_split"} \
+    ${par_umi_tag_delimiter:+--umi-tag-delimiter "$par_umi_tag_delimiter"} \
+    ${par_cell_tag:+--cell-tag "$par_cell_tag"} \
+    ${par_cell_tag_split:+--cell-tag-split "$par_cell_tag_split"} \
+    ${par_cell_tag_delimiter:+--cell-tag-delimiter "$par_cell_tag_delimiter"} \
+    ${par_method:+--method "$par_method"} \
+    ${par_edit_distance_threshold:+--edit-distance-threshold "$par_edit_distance_threshold"} \
+    ${par_spliced_is_unique:+--spliced-is-unique} \
+    ${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \
+    ${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \
+    ${par_read_length:+--read-length "$par_read_length"} \
+    ${par_per_gene:+--per-gene} \
+    ${par_gene_tag:+--gene-tag "$par_gene_tag"} \
+    ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \
+    ${par_skip_tags_regex:+--skip-tags-regex "$par_skip_tags_regex"} \
+    ${par_per_contig:+--per-contig}
+    ${par_gene_transcript_map:+--gene-transcript-map "$par_gene_transcript_map"} \
+    ${par_per_cell:+--per-cell} \
+    ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \
+    ${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \
+    ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \
+    ${par_unapired_reads:+--unapired-reads "$par_unapired_reads"} \
+    ${par_ignore_umi:+--ignore-umi} \
+    ${par_subset:+--subset} \
+    ${par_chrom:+--chrom "$par_chrom"} \
+    ${par_no_sort_output:+--no-sort-output} \
+    ${par_buffer_whole_contig:+--buffer-whole-contig}
+
+
+exit 0
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh
new file mode 100644
index 00000000..1459ec08
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/test.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+test_dir="${meta_resources_dir}/test_data"
+echo ">>> Testing $meta_functionality_name"
+
+"$meta_executable" \
+  --bam "$test_dir/a.sorted.bam" \
+  --bai "$test_dir/a.sorted.bam.bai" \
+  --output "$test_dir/a.sorted.idxstats"
+
+echo ">>> Checking whether output exists"
+[ ! -f "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' does not exist!" && exit 1
+
+echo ">>> Checking whether output is non-empty"
+[ ! -s "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' is empty!" && exit 1
+
+echo ">>> Checking whether output is correct"
+diff "$test_dir/a.sorted.idxstats" "$test_dir/a_ref.sorted.idxstats" || \
+    (echo "Output file a.sorted.idxstats does not match expected output" && exit 1)
+
+rm "$test_dir/a.sorted.idxstats"
+
+############################################################################################
+
+echo ">>> Testing $meta_functionality_name with singletons in the input"
+
+"$meta_executable" \
+  --bam "$test_dir/test.paired_end.sorted.bam" \
+  --bai "$test_dir/test.paired_end.sorted.bam.bai" \
+  --output "$test_dir/test.paired_end.sorted.idxstats"
+
+echo ">>> Checking whether output exists"
+[ ! -f "$test_dir/test.paired_end.sorted.idxstats" ] && \
+    echo "File 'test.paired_end.sorted.idxstats' does not exist!" && exit 1
+
+echo ">>> Checking whether output is non-empty"
+[ ! -s "$test_dir/test.paired_end.sorted.idxstats" ] && \
+    echo "File 'test.paired_end.sorted.idxstats' is empty!" && exit 1
+
+echo ">>> Checking whether output is correct"
+diff "$test_dir/test.paired_end.sorted.idxstats" "$test_dir/test_ref.paired_end.sorted.idxstats" || \
+    (echo "Output file test.paired_end.sorted.idxstats does not match expected output" && exit 1)
+
+rm "$test_dir/test.paired_end.sorted.idxstats"
+
+############################################################################################
+
+echo "All tests succeeded!"
+exit 0
\ No newline at end of file

From 2c269682620a407803e528652198646435ef2c03 Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Thu, 11 Apr 2024 11:38:57 +0100
Subject: [PATCH 2/5] Revert "initial commit dedup"

This reverts commit 38f586bec0ac9e4312b016e29c3aa0bd53f292b2.
---
 CHANGELOG.md                                  |   3 -
 src/umi_tools/umi_tools_dedup/config.vsh.yaml | 279 ------------------
 src/umi_tools/umi_tools_dedup/help.txt        |  13 -
 src/umi_tools/umi_tools_dedup/script.sh       |  65 ----
 src/umi_tools/umi_tools_dedup/test.sh         |  49 ---
 5 files changed, 409 deletions(-)
 delete mode 100644 src/umi_tools/umi_tools_dedup/config.vsh.yaml
 delete mode 100644 src/umi_tools/umi_tools_dedup/help.txt
 delete mode 100644 src/umi_tools/umi_tools_dedup/script.sh
 delete mode 100644 src/umi_tools/umi_tools_dedup/test.sh

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1bef9345..4fd7f001 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -39,9 +39,6 @@
     - `samtools/flagstat`: Counts the number of alignments in SAM/BAM/CRAM files for each FLAG type (PR #31).
     - `samtools/idxstats`: Reports alignment summary statistics for a SAM/BAM/CRAM file (PR #32).
 
-* `umi_tools`:
-    - `umi_tools/umi_tools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #).
-    
 ## MAJOR CHANGES
 
 ## MINOR CHANGES
diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
deleted file mode 100644
index 75306541..00000000
--- a/src/umi_tools/umi_tools_dedup/config.vsh.yaml
+++ /dev/null
@@ -1,279 +0,0 @@
-name: umi_tool_dedup
-namespace: umi_tools
-description: |
-  Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read.
-keywords: [umi_tools, deduplication, dedup]
-links:
-  homepage: https://umi-tools.readthedocs.io/en/latest/
-  documentation: [ https://umi-tools.readthedocs.io/en/latest/reference/dedup.html,
-                   https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options ]
-  repository: https://github.com/CGATOxford/UMI-tools
-references: 
-  doi: 10.1101/gr.209601.116
-license: MIT
-
-argument_groups:
-  - name: Inputs
-    arguments:
-      - name: --input
-        alternatives: -I
-        type: file
-        description: Input BAM or SAM file. Use --in_sam to specify SAM format.
-        required: true
-      - name: --in_sam
-        type: boolean_true
-        description: |
-          By default, inputs are assumed to be in BAM format. Use this options 
-          to specify the use of SAM format for input.
-      - name: --bai
-        type: file
-        description: BAM index
-      - name: --get_output_stats
-        type: boolean
-        description: Whether or not to generate output stats. 
-      - name: --random_seed
-        type: integer
-        description: |
-          Random seed to initialize number generator with.
-        default: none
-
-  - name: Outputs
-    arguments:
-      - name: --output
-        alternatives: -S
-        type: file
-        description: Deduplicated BAM file
-        required: true
-        direction: output
-      - name: --out_sam
-        type: boolean_true
-        description: |
-          By default, outputa are written in BAM format. Use this options to 
-          specify the use of SAM format for output.
-      - name: --paired
-        type: boolean_true
-        description: |
-          BAM is paired end - output both read pairs. This will also force the
-          use of the template length to determine reads with the same mapping
-          coordinates.
-      - name: --output_stats
-        type: file
-        description: Directory containing UMI based deduplication statistics files
-        direction: output
-      - name: --extract_umi_method
-        type: string
-        description: |
-          Specify the method by which the barcodes were encoded in the read.
-          The options are: [read_id, tag, umis].
-        default: read_id
-      - name: --umi_tag
-        type: string
-        description: |
-          The tag containing the UMI sequence. 
-          This is only required if the extract_umi_method is set to tag.
-      - name: --umi_separator
-        type: string
-        description: |
-          The separator used to separate the UMI from the read sequence. 
-          This is only required if the extract_umi_method is set to id_read.
-        default: '_'
-      - name: --umi_tag_split
-        type: string
-        description: |
-          Separate the UMI in tag by <SPLIT> and take the first element.
-      - name: --umi_tag_delimiter
-        type: string
-        description: |
-          Separate the UMI in by <DELIMITER> and concatenate the elements
-      - name: --cell_tag
-        type: string
-        description: |
-          The tag containing the cell barcode sequence. 
-          This is only required if the extract_umi_method is set to tag.
-      - name: --cell_tag_split
-        type: string
-        description: |
-          Separate the cell barcode in tag by <SPLIT> and take the first element.
-      - name: --cell_tag_delimiter
-        type: string
-        description: |
-          Separate the cell barcode in by <DELIMITER> and concatenate the elements
-  
-  - name: Grouping Options
-    arguments:    
-      - name: --method
-        type: string
-        description: |
-          The method to use for grouping reads. The options are: 
-          [unique, percentile, cluster, adjacency, directional].
-        default: directional
-      - name: --edit_distance_threshold
-        type: integer
-        description: |
-          For the adjacency and cluster methods the threshold for the edit 
-          distance to connect two UMIs in the network can be increased. The 
-          default value of 1 works best unless the UMI is very long (>14bp).
-        default: 1
-      - name: --spliced_is_unique
-        type: boolean_true
-        description: |
-          Causes two reads that start in the same position on the same strand
-          and having the same UMI to be considered unique if one is spliced 
-          and the other is not. (Uses the ‘N’ cigar operation to test for splicing).
-      - name: --soft_clip_threshold
-        type: integer
-        description: |
-          Mappers that soft clip will sometimes do so rather than mapping a
-          spliced read if there is only a small overhang over the exon junction.
-          By setting this option, you can treat reads with at least this many
-          bases soft-clipped at the 3’ end as spliced.
-        default: 4
-      - name: --multimapping_detection_method
-        type: string
-        description: |
-          If the sam/bam contains tags to identify multimapping reads, you can
-          specify for use when selecting the best read at a given loci. Supported
-          tags are “NH”, “X0” and “XT”. If not specified, the read with the highest
-          mapping quality will be selected.
-      - name: --read_length
-        type: integer
-        description: |
-          Use the read length as a criteria when deduping, for e.g sRNA-Seq.
-  
-  - name: Single-cell RNA-Seq Options
-    arguments:
-      - name: --per_gene
-        type: boolean_true
-        description: |
-          Reads will be grouped together if they have the same gene. This is useful
-          if your library prep generates PCR duplicates with non identical alignment
-          positions such as CEL-Seq. Note this option is hardcoded to be on with the
-          count command. I.e counting is always performed per-gene. Must be combined
-          with either --gene_tag or --per_contig option.
-      - name: --gene_tag
-        type: string
-        description: |
-          Deduplicate per gene. The gene information is encoded in the bam read tag
-          specified.
-      - name: --assigned_status_tag
-        type: string
-        description: |
-          BAM tag which describes whether a read is assigned to a gene. Defaults to
-          the same value as given for --gene_tag.
-      - name: --skip_tags_regex
-        type: string
-        description: |
-          Use in conjunction with the --assigned_status_tag option to skip any reads
-          where the tag matches this regex. Default ("^[__|Unassigned]") matches
-          anything which starts with “__” or “Unassigned”.
-      - name: --per_contig
-        type: boolean_true
-        description: |
-          Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same
-          contig will be considered to have the same alignment position. This is
-          useful if you have aligned to a reference transcriptome with one
-          transcript per gene. If you have aligned to a transcriptome with more
-          than one transcript per gene, you can supply a map between transcripts
-          and gene using the --gene_transcript_map option.
-      - name: --gene_transcript_map
-        type: file
-        description: |
-          A file containing a mapping between gene names and transcript names.
-          The file should be tab separated with the gene name in the first column
-          and the transcript name in the second column.
-      - name: --per_cell
-        type: boolean_true
-        description: |
-          Reads will only be grouped together if they have the same cell barcode.
-          Can be combined with --per_gene.
-  
-  - name: SAM/BAM Options
-    arguments:
-      - name: --mapping_quality
-        type: integer
-        description: |
-          Minimium mapping quality (MAPQ) for a read to be retained.
-        default: 0
-      - name: --unmapped_reads
-        type: string
-        description: |
-          How unmapped reads should be handled. 
-          The options are:
-          "discard": Discard all unmapped reads.
-          "use":     If read2 is unmapped, deduplicate using read1 only. 
-                     Requires --paired.
-          "output":  Output unmapped reads/read pairs without UMI 
-                     grouping/deduplication. Only available in umi_tools group.
-        default: discard
-      - name: --chimeric_pairs
-        type: string
-        description: |
-          How chimeric pairs should be handled. 
-          The options are:
-          "discard": Discard all chimeric read pairs.
-          "use":     Deduplicate using read1 only.
-          "output":  Output chimeric pairs without UMI grouping/deduplication. 
-                     Only available in umi_tools group.
-        default: use
-      - name: --unapired_reads
-        type: string
-        description: |
-          How unpaired reads should be handled. 
-          The options are:
-          "discard": Discard all unpaired reads.
-          "use":     Deduplicate using read1 only.
-          "output":  Output unpaired reads without UMI grouping/deduplication. 
-                     Only available in umi_tools group.
-        default: use
-      - name: --ignore_umi
-        type: boolean_true
-        description: |
-          Ignore the UMI and group reads using mapping coordinates only.
-      - name: --subset
-        type: boolean_true
-        description: |
-          Only consider a fraction of the reads, chosen at random. This is useful
-          for doing saturation analyses.
-      - name: --chrom
-        type: string
-        description: |
-          Only consider a single chromosome. This is useful for debugging/testing 
-          purposes.
-  
-  - name: Group/Dedup Options
-    arguments:
-      - name: --no_sort_output
-        type: boolean_true
-        description: |
-          By default, output is sorted. This involves the use of a temporary unsorted
-          file (saved in --temp-dir). Use this option to turn off sorting.
-      - name: --buffer_whole_contig
-        type: boolean_true
-        description: |
-          Forces dedup to parse an entire contig before yielding any reads for
-          deduplication. This is the only way to absolutely guarantee that all reads
-          with the same start position are grouped together for deduplication since
-          dedup uses the start position of the read, not the alignment coordinate on
-          which the reads are sorted. However, by default, dedup reads for another
-          1000bp before outputting read groups which will avoid any reads being missed
-          with short read sequencing (<1000bp).
-
-
-resources:
-  - type: bash_script
-    path: script.sh
-test_resources:
-  - type: bash_script
-    path: test.sh
-  - type: file
-    path: test_data
-engines:
-  - type: docker
-    image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1
-    setup:
-      - type: docker
-        run: |
-            umi_tools -v | sed 's/ version//g' > /var/software_versions.txt
-runners:
-- type: executable
-- type: nextflow
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt
deleted file mode 100644
index d3c8fa44..00000000
--- a/src/umi_tools/umi_tools_dedup/help.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-```
-umi_tools dedup
-```
-
-dedup - Deduplicate reads using UMI and mapping coordinates
-
-Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM]
-
-       note: If --stdout is ommited, standard out is output. To
-             generate a valid BAM file on standard out, please
-             redirect log with --log=LOGFILE or --log2stderr 
-
-For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh
deleted file mode 100644
index 57c01258..00000000
--- a/src/umi_tools/umi_tools_dedup/script.sh
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/bin/bash
-
-## VIASH START
-## VIASH END
-
-set -e
-
-test_dir="${metal_executable}/test_data"
-
-[[ "$par_paired" == "false" ]] && unset par_paired
-[[ "$par_in_sam" == "false" ]] && unset par_in_sam
-[[ "$par_out_sam" == "false" ]] && unset par_out_sam
-[[ "$par_spliced_is_unique" == "false" ]] && unset par_spliced_is_unique
-[[ "$par_per_gene" == "false" ]] && unset par_per_gene
-[[ "$par_per_contig" == "false" ]] && unset par_per_contig
-[[ "$par_per_cell" == "false" ]] && unset par_per_cell
-[[ "$par_no_sort_output" == "false" ]] && unset par_no_sort_output
-[[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig
-[[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi
-[[ "$par_subset" == "false" ]] && unset par_subset
-
-
-$(which umi_tools) dedup \
-    -I "$par_input" \
-    ${par_in_sam:+--in-sam} \
-    ${par_bai:+--bai "$par_bai"} \
-    ${par_get_output_stats:+--get-output-stats} \
-    ${par_random_seed:+--random-seed "$par_random_seed"} \
-    -S "$par_output" \
-    ${par_out_sam:+--out-sam} \
-    ${par_paired:+--paired} \
-    ${par_output_stats:+--output-stats "$par_output_stats"} \
-    ${par_extract_umi_method:+--extract-umi-method "$par_extract_umi_method"} \
-    ${par_umi_tag:+--umi-tag "$par_umi_tag"} \
-    ${par_umi_separator:+--umi-separator "$par_umi_separator"} \
-    ${par_umi_tag_split:+--umi-tag-split "$par_umi_tag_split"} \
-    ${par_umi_tag_delimiter:+--umi-tag-delimiter "$par_umi_tag_delimiter"} \
-    ${par_cell_tag:+--cell-tag "$par_cell_tag"} \
-    ${par_cell_tag_split:+--cell-tag-split "$par_cell_tag_split"} \
-    ${par_cell_tag_delimiter:+--cell-tag-delimiter "$par_cell_tag_delimiter"} \
-    ${par_method:+--method "$par_method"} \
-    ${par_edit_distance_threshold:+--edit-distance-threshold "$par_edit_distance_threshold"} \
-    ${par_spliced_is_unique:+--spliced-is-unique} \
-    ${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \
-    ${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \
-    ${par_read_length:+--read-length "$par_read_length"} \
-    ${par_per_gene:+--per-gene} \
-    ${par_gene_tag:+--gene-tag "$par_gene_tag"} \
-    ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \
-    ${par_skip_tags_regex:+--skip-tags-regex "$par_skip_tags_regex"} \
-    ${par_per_contig:+--per-contig}
-    ${par_gene_transcript_map:+--gene-transcript-map "$par_gene_transcript_map"} \
-    ${par_per_cell:+--per-cell} \
-    ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \
-    ${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \
-    ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \
-    ${par_unapired_reads:+--unapired-reads "$par_unapired_reads"} \
-    ${par_ignore_umi:+--ignore-umi} \
-    ${par_subset:+--subset} \
-    ${par_chrom:+--chrom "$par_chrom"} \
-    ${par_no_sort_output:+--no-sort-output} \
-    ${par_buffer_whole_contig:+--buffer-whole-contig}
-
-
-exit 0
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh
deleted file mode 100644
index 1459ec08..00000000
--- a/src/umi_tools/umi_tools_dedup/test.sh
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/bin/bash
-
-test_dir="${meta_resources_dir}/test_data"
-echo ">>> Testing $meta_functionality_name"
-
-"$meta_executable" \
-  --bam "$test_dir/a.sorted.bam" \
-  --bai "$test_dir/a.sorted.bam.bai" \
-  --output "$test_dir/a.sorted.idxstats"
-
-echo ">>> Checking whether output exists"
-[ ! -f "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' does not exist!" && exit 1
-
-echo ">>> Checking whether output is non-empty"
-[ ! -s "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' is empty!" && exit 1
-
-echo ">>> Checking whether output is correct"
-diff "$test_dir/a.sorted.idxstats" "$test_dir/a_ref.sorted.idxstats" || \
-    (echo "Output file a.sorted.idxstats does not match expected output" && exit 1)
-
-rm "$test_dir/a.sorted.idxstats"
-
-############################################################################################
-
-echo ">>> Testing $meta_functionality_name with singletons in the input"
-
-"$meta_executable" \
-  --bam "$test_dir/test.paired_end.sorted.bam" \
-  --bai "$test_dir/test.paired_end.sorted.bam.bai" \
-  --output "$test_dir/test.paired_end.sorted.idxstats"
-
-echo ">>> Checking whether output exists"
-[ ! -f "$test_dir/test.paired_end.sorted.idxstats" ] && \
-    echo "File 'test.paired_end.sorted.idxstats' does not exist!" && exit 1
-
-echo ">>> Checking whether output is non-empty"
-[ ! -s "$test_dir/test.paired_end.sorted.idxstats" ] && \
-    echo "File 'test.paired_end.sorted.idxstats' is empty!" && exit 1
-
-echo ">>> Checking whether output is correct"
-diff "$test_dir/test.paired_end.sorted.idxstats" "$test_dir/test_ref.paired_end.sorted.idxstats" || \
-    (echo "Output file test.paired_end.sorted.idxstats does not match expected output" && exit 1)
-
-rm "$test_dir/test.paired_end.sorted.idxstats"
-
-############################################################################################
-
-echo "All tests succeeded!"
-exit 0
\ No newline at end of file

From 4b11f7f2332517695c6c3d247e25e585c3dd8522 Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Sun, 1 Sep 2024 20:08:59 +0200
Subject: [PATCH 3/5] Functional component with tests

---
 src/sortmerna/config.vsh.yaml             | 292 ++++++++++++++++++++
 src/sortmerna/help.txt                    | 319 ++++++++++++++++++++++
 src/sortmerna/script.sh                   | 108 ++++++++
 src/sortmerna/test.sh                     |  52 ++++
 src/sortmerna/test_data/rRNA/database1.fa |  24 ++
 src/sortmerna/test_data/rRNA/database2.fa |  16 ++
 src/sortmerna/test_data/reads_1.fq.gz     | Bin 0 -> 189 bytes
 src/sortmerna/test_data/reads_2.fq.gz     | Bin 0 -> 147 bytes
 src/sortmerna/test_data/script.sh         |   8 +
 9 files changed, 819 insertions(+)
 create mode 100644 src/sortmerna/config.vsh.yaml
 create mode 100644 src/sortmerna/help.txt
 create mode 100755 src/sortmerna/script.sh
 create mode 100644 src/sortmerna/test.sh
 create mode 100644 src/sortmerna/test_data/rRNA/database1.fa
 create mode 100644 src/sortmerna/test_data/rRNA/database2.fa
 create mode 100644 src/sortmerna/test_data/reads_1.fq.gz
 create mode 100644 src/sortmerna/test_data/reads_2.fq.gz
 create mode 100755 src/sortmerna/test_data/script.sh

diff --git a/src/sortmerna/config.vsh.yaml b/src/sortmerna/config.vsh.yaml
new file mode 100644
index 00000000..23925132
--- /dev/null
+++ b/src/sortmerna/config.vsh.yaml
@@ -0,0 +1,292 @@
+name: sortmerna
+description: | 
+  Local sequence alignment tool for filtering, mapping and clustering. The main 
+  application of SortMeRNA is filtering rRNA from metatranscriptomic data. SortMeRNA
+  takes as input files of reads (fasta, fastq, fasta.gz, fastq.gz) and one or multiple
+  rRNA database file(s), and sorts apart aligned and rejected reads into two files.
+keywords: [sort, mRNA, rRNA, alignment, filtering, mapping, clustering]
+links:
+  homepage: https://sortmerna.readthedocs.io/en/latest/
+  documentation: https://sortmerna.readthedocs.io/en/latest/manual4.0.html
+  repository: https://github.com/sortmerna/sortmerna
+references: 
+  doi: 10.1093/bioinformatics/bts611
+license: GPL-3.0
+
+argument_groups:
+- name: "Input"
+  arguments: 
+  - name: "--paired"
+    type: boolean_true
+    description: |
+      Reads are paired-end. If a single reads file is provided, use this option 
+      to indicate the file contains interleaved paired reads when neither
+      'paired_in' | 'paired_out' | 'out2' | 'sout' are specified.
+  - name: "--input"
+    type: file
+    multiple: true
+    description: Input fastq
+  - name: "--ref"
+    type: file
+    multiple: true
+    description: Reference fasta file(s) for rRNA database.
+  - name: "--ribo_database_manifest"
+    type: file
+    description: Text file containing paths to fasta files (one per line) that will be used to create the database for SortMeRNA.
+
+- name: "Output"
+  arguments:     
+  - name: "--log"
+    type: file
+    direction: output
+    must_exist: false
+    example: $id.sortmerna.log
+    description: Sortmerna log file.
+  - name: "--output"
+    alternatives: ["--aligned"]
+    type: string
+    description: |
+      Directory and file prefix for aligned output. The appropriate extension: 
+      (fasta|fastq|blast|sam|etc) is automatically added.
+      If 'dir' is not specified, the output is created in the WORKDIR/out/.
+      If 'pfx' is not specified, the prefix 'aligned' is used.
+  - name: "--other"
+    type: string
+    description: Create Non-aligned reads output file with this path/prefix. Must be used with fastx. 
+
+- name: "Options"
+  arguments:
+  - name: "--kvdb"
+    type: string
+    description: Path to directory of the key-value database file, used for storing the alignment results.
+  - name: "--idx_dir"
+    type: string
+    description: Path to the directory for storing the reference index files.
+  - name: "--readb"
+    type: string
+    description: Path to the directory for storing pre-processed reads.
+  - name: "--fastx"
+    type: boolean_true
+    description: Output aligned reads into FASTA/FASTQ file
+  - name: "--sam"
+    type: boolean_true
+    description: Output SAM alignment for aligned reads.
+  - name: "--sq"
+    type: boolean_true
+    description: Add SQ tags to the SAM file
+  - name: "--blast"
+    type: string
+    description: | 
+      Blast options:
+      * '0'                    - pairwise
+      * '1'                    - tabular(Blast - m 8 format)
+      * '1 cigar'              - tabular + column for CIGAR
+      * '1 cigar qcov'         - tabular + columns for CIGAR and query coverage
+      * '1 cigar qcov qstrand' - tabular + columns for CIGAR, query coverage and strand
+    choices: ['0', '1', '1 cigar', '1 cigar qcov', '1 cigar qcov qstrand']
+  - name: "--num_alignments"
+    type: integer
+    description: |
+      Report first INT alignments per read reaching E-value. If Int = 0, all alignments will be output. Default: '0'
+    example: 0
+  - name: "--min_lis"
+    type: integer
+    description: |
+      search all alignments having the first INT longest LIS. LIS stands for Longest Increasing Subsequence, it is
+      computed using seeds’ positions to expand hits into longer matches prior to Smith-Waterman alignment. Default: '2'.
+    example: 2
+  - name: "--print_all_reads"
+    type: boolean_true
+    description: output null alignment strings for non-aligned reads to SAM and/or BLAST tabular files.
+  - name: "--paired_in"
+    type: boolean_true
+    description: |
+      In the case where a pair of reads is aligned with a score above the threshold, the output of the reads is controlled
+      by the following options:
+      * --paired_in and --paired_out are both false: Only one read per pair is output to the aligned fasta file.
+      * --paired_in is true and --paired_out is false: Both reads of the pair are output to the aligned fasta file.
+      * --paired_in is false and --paired_out is true: Both reads are output to the other fasta file (if it is specified).
+  - name: "--paired_out"
+    type: boolean_true
+    description: See description of --paired_in.
+  - name: "--out2"
+    type: boolean_true
+    description: |
+      Output paired reads into separate files. Must be used with '--fastx'. If a single reads file is provided, this options
+      implies interleaved paired reads. When used with 'sout', four (4) output files for aligned reads will be generated:
+      'aligned-paired-fwd, aligned-paired-rev, aligned-singleton-fwd, aligned-singleton-rev'. If 'other' option is also used,
+      eight (8) output files will be generated.
+  - name: "--sout"
+    type: boolean_true
+    description: |
+      Separate paired and singleton aligned reads. Must be used with '--fastx'. If a single reads file is provided,
+      this options implies interleaved paired reads. Cannot be used with '--paired_in' or '--paired_out'.
+  - name: "--zip_out"
+    type: string
+    description: |
+      Compress the output files. The possible values are: 
+      * '1/true/t/yes/y'
+      * '0/false/f/no/n'
+      * '-1' (the same format as input - default)
+      The values are Not case sensitive.
+    choices: ['1', 'true', 't', 'yes', 'y', '0', 'false', 'f', 'no', 'n', '-1']
+    example: "-1"
+  - name: "--match"
+    type: integer
+    description: |
+      Smith-Waterman score for a match (positive integer). Default: '2'.
+    example: 2
+  - name: "--mismatch"
+    type: integer
+    description: |
+      Smith-Waterman penalty for a mismatch (negative integer). Default: '-3'.
+    example: -3
+  - name: "--gap_open"
+    type: integer
+    description: |
+      Smith-Waterman penalty for introducing a gap (positive integer). Default: '5'.
+    example: 5
+  - name: "--gap_ext"
+    type: integer
+    description: |
+      Smith-Waterman penalty for extending a gap (positive integer). Default: '2'.
+    example: 2
+  - name: "--N"
+    type: integer
+    description: |
+      Smith-Waterman penalty for ambiguous letters (N’s) scored as --mismatch. Default: '-1'.
+    example: -1
+  - name: "--a"
+    type: integer
+    description: |
+      Number of threads to use. Default: '1'.
+    example: 1
+  - name: "--e"
+    type: double
+    description: |
+      E-value threshold. Default: '1'.
+    example: 1
+  - name: "--F"
+    type: boolean_true
+    description: Search only the forward strand.
+  - name: "--R"
+    type: boolean_true
+    description: Search only the reverse-complementary strand.
+  - name: "--num_alignment"
+    type: integer
+    description: |
+       Report first INT alignments per read reaching E-value (--num_alignments 0 signifies all alignments will be output).
+       Default: '-1'
+    example: -1
+  - name: "--best"
+    type: integer
+    description: |
+      Report INT best alignments per read reaching E-value by searching --min_lis INT candidate alignments (--best 0
+      signifies all candidate alignments will be searched) Default: '1'.
+    example: 1
+  - name: "--verbose"
+    alternatives: ["-v"]
+    type: boolean_true
+    description: Verbose output.
+
+- name: "OTU picking options"
+  arguments:
+  - name: "--id"
+    type: double
+    description: |
+      %id similarity threshold (the alignment must still pass the E-value threshold). Default: '0.97'.
+    example: 0.97
+  - name: "--coverage"
+    type: double
+    description: |
+      %query coverage threshold (the alignment must still pass the E-value threshold). Default: '0.97'.
+    example: 0.97
+  - name: "--de_novo"
+    type: boolean_true
+    description: |
+      FASTA/FASTQ file for reads matching database < %id off (set using --id) and < %cov (set using --coverage)
+      (alignment must still pass the E-value threshold).
+  - name: "--otu_map"
+    type: boolean_true
+    description: |
+      Output OTU map (input to QIIME’s make_otu_table.py).
+
+- name: "Advanced options"
+  arguments:
+  - name: "--num_seed"
+    type: integer
+    description: |
+      Number of seeds matched before searching for candidate LIS. Default: '2'.
+    example: 2
+  - name: "--passes"
+    type: integer
+    multiple: true
+    description: |
+      Three intervals at which to place the seed on the read L,L/2,3 (L is the seed length set in ./indexdb_rna).
+  - name: "--edge"
+    type: string
+    description: |
+      The number (or percentage if followed by %) of nucleotides to add to each edge of the alignment region on the
+      reference sequence before performing Smith-Waterman alignment. Default: '4'.
+    example: "4"  # quoted so the example is a string, matching the declared type
+  - name: "--full_search"
+    type: boolean_true
+    description: |
+      Search for all 0-error and 1-error seed off matches in the index rather than stopping after finding a 0-error match
+      (<1% gain in sensitivity with up to a four-fold decrease in speed).
+
+- name: "Indexing Options"
+  arguments:
+  - name: "--index"
+    type: integer
+    description: |
+      Create index files for the reference database. By default when this option is not used, the program checks the
+      reference index and builds it if not already existing.
+      This can be changed by using '-index' as follows:
+      * '-index 0' - skip indexing. If the index does not exist, the program will terminate
+                              and warn to build the index prior performing the alignment
+      * '-index 1' - only perform the indexing and terminate
+      * '-index 2' - the default behaviour, the same as when not using this option at all
+    example: 2
+    choices: [0, 1, 2]
+  - name: "-L"
+    type: double
+    description: |
+      Indexing seed length. Default: '18'
+    example: 18
+  - name: "--interval"
+    type: integer
+    description: |
+      Index every Nth L-mer in the reference database. Default: '1'
+    example: 1
+  - name: "--max_pos"
+    type: integer
+    description: |
+      Maximum number of positions to store for each unique L-mer. Set to 0 to store all positions. Default: '1000'
+    example: 1000
+  
+  
+
+resources:
+  - type: bash_script
+    path: script.sh
+
+test_resources:
+  - type: bash_script
+    path: test.sh
+  - path: test_data
+  
+engines:
+- type: docker
+  image: ubuntu:22.04
+  setup:
+    - type: docker  # ca-certificates is installed so wget can verify the TLS certificate of the download
+      run: |
+        apt-get update && \
+        apt-get install -y --no-install-recommends gzip cmake g++ wget ca-certificates && \
+        apt-get clean && \
+        wget https://github.com/sortmerna/sortmerna/releases/download/v4.3.6/sortmerna-4.3.6-Linux.sh && \
+        bash sortmerna-4.3.6-Linux.sh --skip-license
+runners: 
+- type: executable
+- type: nextflow 
\ No newline at end of file
diff --git a/src/sortmerna/help.txt b/src/sortmerna/help.txt
new file mode 100644
index 00000000..f0842707
--- /dev/null
+++ b/src/sortmerna/help.txt
@@ -0,0 +1,319 @@
+```
+sortmerna -h
+```
+
+
+  Program:      SortMeRNA version 4.3.6
+  Copyright:    2016-2020 Clarity Genomics BVBA:
+                Turnhoutseweg 30, 2340 Beerse, Belgium
+                2014-2016 Knight Lab:
+                Department of Pediatrics, UCSD, La Jolla
+                2012-2014 Bonsai Bioinformatics Research Group:
+                LIFL, University Lille 1, CNRS UMR 8022, INRIA Nord-Europe
+  Disclaimer:   SortMeRNA comes with ABSOLUTELY NO WARRANTY; without even the
+                implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+                See the GNU Lesser General Public License for more details.
+  Contributors: Jenya Kopylova   jenya.kopylov@gmail.com
+                Laurent Noé      laurent.noe@lifl.fr
+                Pierre Pericard  pierre.pericard@lifl.fr
+                Daniel McDonald  wasade@gmail.com
+                Mikaël Salson    mikael.salson@lifl.fr
+                Hélène Touzet    helene.touzet@lifl.fr
+                Rob Knight       robknight@ucsd.edu
+
+  Usage:   sortmerna -ref FILE [-ref FILE] -reads FWD_READS [-reads REV_READS] [OPTIONS]:
+  -------------------------------------------------------------------------------------------------------------
+  | option            type-format           description                                          default      |
+  -------------------------------------------------------------------------------------------------------------
+
+    [REQUIRED]
+    --ref             PATH        Required  Reference file (FASTA) absolute or relative path.
+
+       Use mutliple times, once per a reference file
+
+
+    --reads           PATH        Required  Raw reads file (FASTA/FASTQ/FASTA.GZ/FASTQ.GZ).
+
+       Use twice for files with paired reads.
+       The file extensions are Not important. The program automatically
+       recognizes the file format as flat/compressed, fasta/fastq
+
+
+
+    [COMMON]
+    --workdir         PATH        Optional  Workspace directory                         USRDIR/sortmerna/run/
+
+       Default structure: WORKDIR/
+                              idx/   (References index)
+                              kvdb/  (Key-value storage for alignments)
+                              out/   (processing output)
+                              readb/ (pre-processed reads/index)
+
+
+    --kvdb            PATH        Optional  Directory for Key-value database            WORKDIR/kvdb
+
+       KVDB is used for storing the alignment results.
+
+
+    --idx-dir         PATH        Optional  Directory for storing Reference index.      WORKDIR/idx
+
+
+    --readb           PATH        Optional  Storage for pre-processed reads             WORKDIR/readb/
+
+       Directory storing the split reads, or the random access index of compressed reads
+
+
+    --fastx           BOOL        Optional  Output aligned reads into FASTA/FASTQ file
+    --sam             BOOL        Optional  Output SAM alignment for aligned reads.
+
+
+    --SQ              BOOL        Optional  Add SQ tags to the SAM file
+
+
+    --blast           STR         Optional  output alignments in various Blast-like formats
+
+       Sample values: '0'                    - pairwise
+                      '1'                    - tabular (Blast - m 8 format)
+                      '1 cigar'              - tabular + column for CIGAR
+                      '1 cigar qcov'         - tabular + columns for CIGAR and query coverage
+                      '1 cigar qcov qstrand' - tabular + columns for CIGAR, query coverage,
+                                               and strand
+
+
+    --aligned         STR/BOOL    Optional  Aligned reads file prefix [dir/][pfx]       WORKDIR/out/aligned
+
+       Directory and file prefix for aligned output i.e. each
+       output file goes into the specified directory with the given prefix.
+       The appropriate extension: (fasta|fastq|blast|sam|etc) is automatically added.
+       Both 'dir' and 'pfx' are optional.
+       The 'dir' can be a relative or an absolute path.
+       If 'dir' is not specified, the output is created in the WORKDIR/out/
+       If 'pfx' is not specified, the prefix 'aligned' is used
+       Examples:
+       '-aligned $MYDIR/dir_1/dir_2/1' -> $MYDIR/dir_1/dir_2/1.fasta
+       '-aligned dir_1/apfx'           -> $PWD/dir_1/apfx.fasta
+       '-aligned dir_1/'               -> $PWD/aligned.fasta
+       '-aligned apfx'                 -> $PWD/apfx.fasta
+       '-aligned  (no argument)'       -> WORKDIR/out/aligned.fasta
+
+
+    --other           STR/BOOL    Optional  Non-aligned reads file prefix [dir/][pfx]   WORKDIR/out/other
+
+       Directory and file prefix for non-aligned output i.e. each
+       output file goes into the specified directory with the given prefix.
+       The appropriate extension: (fasta|fastq|blast|sam|etc) is automatically added.
+       Must be used with 'fastx'.
+       Both 'dir' and 'pfx' are optional.
+       The 'dir' can be a relative or an absolute path.
+       If 'dir' is not specified, the output is created in the WORKDIR/out/
+       If 'pfx' is not specified, the prefix 'other' is used
+       Examples:
+       '-other $MYDIR/dir_1/dir_2/1' -> $MYDIR/dir_1/dir_2/1.fasta
+       '-other dir_1/apfx'           -> $PWD/dir_1/apfx.fasta
+       '-other dir_1/'               -> $PWD/dir_1/other.fasta
+       '-other apfx'                 -> $PWD/apfx.fasta
+       '-other  (no argument)'       -> aligned_out/other.fasta
+                                        i.e. the same output directory
+                                        as used for aligned output
+
+
+    --num_alignments  INT         Optional  Positive integer (INT >=0).
+
+       If used with '-no-best' reports first INT alignments per read reaching
+       E-value threshold, which allows to lower the CPU time and memory use.
+       Otherwise outputs INT best alignments.
+       If INT = 0, all alignments are output
+
+
+    --no-best         BOOL        Optional  Disable best alignments search                          False
+
+       The 'best' alignment is the highest scoring alignment out of All alignments of a read,
+       and the read can potentially be aligned (reaching E-value threshold) to multiple reference
+       sequences.
+       By default the program searches for best alignments i.e. performs an exhaustive search
+       over all references. Using '-no-best' will make the program to search just
+       the first N alignments, where N is set using '-num_alignments' i.e. 1 by default.
+
+
+    --min_lis         INT         Optional  Search only alignments that have the LIS                2
+                                            of at least N seeds long
+
+       LIS stands for Longest Increasing Subsequence. It is computed using seeds, which
+       are k-mers common to the read and the reference sequence. Sorted sequences of such seeds
+       are used to filter the candidate references prior performing the Smith-Waterman alignment.
+
+
+    --print_all_reads BOOL        Optional  Output null alignment strings for non-aligned reads     False
+                                            to SAM and/or BLAST tabular files
+
+    --paired          BOOL        Optional  Flags paired reads                                      False
+
+        If a single reads file is provided, use this option to indicate
+        the file contains interleaved paired reads when neither
+        'paired_in' | 'paired_out' | 'out2' | 'sout' are specified.
+
+
+    --paired_in       BOOL        Optional  Flags the paired-end reads as Aligned,                  False
+                                            when either of them is Aligned.
+
+        With this option both reads are output into Aligned FASTA/Q file
+        Must be used with 'fastx'.
+        Mutually exclusive with 'paired_out'.
+
+
+    --paired_out      BOOL        Optional  Flags the paired-end reads as Non-aligned,              False
+                                            when either of them is non-aligned.
+
+        With this option both reads are output into Non-Aligned FASTA/Q file
+        Must be used with 'fastx'.
+        Mutually exclusive with 'paired_in'.
+
+
+    --out2            BOOL        Optional  Output paired reads into separate files.                False
+
+       Must be used with 'fastx'.
+       If a single reads file is provided, this options implies interleaved paired reads
+       When used with 'sout', four (4) output files for aligned reads will be generated:
+       'aligned-paired-fwd, aligned-paired-rev, aligned-singleton-fwd, aligned-singleton-rev'.
+       If 'other' option is also used, eight (8) output files will be generated.
+
+
+    --sout            BOOL        Optional  Separate paired and singleton aligned reads.            False
+
+       To be used with 'fastx'.
+       If a single reads file is provided, this options implies interleaved paired reads
+       Cannot be used with 'paired_in' | 'paired_out'
+
+
+    --zip-out         STR/BOOL    Optional  Controls the output compression                        '-1'
+
+       By default the report files are produced in the same format as the input i.e.
+       if the reads files are compressed (gz), the output is also compressed.
+       The default behaviour can be overriden by using '-zip-out'.
+       The possible values: '1/true/t/yes/y'
+                            '0/false/f/no/n'
+                            '-1' (the same format as input - default)
+       The values are Not case sensitive i.e. 'Yes, YES, yEs, Y, y' are all OK
+       Examples:
+       '-reads freads.gz -zip-out n' : generate flat output when the input is compressed
+       '-reads freads.flat -zip-out' : compress the output when the input files are flat
+
+
+    --match           INT         Optional  SW score (positive integer) for a match.                2
+
+    --mismatch        INT         Optional  SW penalty (negative integer) for a mismatch.          -3
+
+    --gap_open        INT         Optional  SW penalty (positive integer) for introducing a gap.    5
+
+    --gap_ext         INT         Optional  SW penalty (positive integer) for extending a gap.      2
+
+    -e                DOUBLE      Optional  E-value threshold.                                      1
+
+       Defines the 'statistical significance' of a local alignment.
+       Exponentially correllates with the Minimal Alignment score.
+       Higher E-values (100, 1000, ...) cause More reads to Pass the alignment threshold
+
+
+    -F                BOOL        Optional  Search only the forward strand.                         False
+
+    -N                BOOL        Optional  SW penalty for ambiguous letters (N's) scored
+                                            as --mismatch
+
+    -R                BOOL        Optional  Search only the reverse-complementary strand.           False
+
+
+    [OTU_PICKING]
+    --id              INT         Optional  %%id similarity threshold (the alignment                0.97
+                                            must still pass the E-value threshold).
+
+    --coverage        INT         Optional  %%query coverage threshold (the alignment must          0.97
+                                            still pass the E-value threshold)
+
+    --de_novo_otu     BOOL        Optional  Output FASTA file with 'de novo' reads                  False
+
+       Read is 'de novo' if its alignment score passes E-value threshold, but both the identity
+       '-id', and the '-coverage' are below their corresponding thresholds
+       i.e. ID < %%id and COV < %%cov
+
+
+    --otu_map         BOOL        Optional  Output OTU map (input to QIIME's make_otu_table.py).    False
+                                            Cannot be used with 'no-best because
+                                            the grouping is done around the best alignment'
+
+
+    [ADVANCED]
+    --passes          INT,INT,INT Optional  Three intervals at which to place the seed on           L,L/2,3
+                                             the read (L is the seed length)
+
+    --edges           INT         Optional  Number (or percent if INT followed by %% sign) of       4
+                                            nucleotides to add to each edge of the read
+                                            prior to SW local alignment
+
+    --num_seeds       BOOL        Optional  Number of seeds matched before searching                2
+                                            for candidate LIS
+
+    --full_search     INT         Optional  Search for all 0-error and 1-error seed                 False
+                                            matches in the index rather than stopping
+                                            after finding a 0-error match (<1%% gain in
+                                            sensitivity with up four-fold decrease in speed)
+
+    --pid             BOOL        Optional  Add pid to output file names.                           False
+
+    -a                INT         Optional  DEPRECATED in favour of '-threads'. Number of           numCores
+                                            processing threads to use.
+                                            Automatically redirects to '-threads'
+
+    --threads         INT         Optional  Number of Processing threads to use                     2
+
+
+    [INDEXING]
+    --index           INT         Optional  Build reference database index                          2
+
+       By default when this option is not used, the program checks the reference index and
+       builds it if not already existing.
+       This can be changed by using '-index' as follows:
+       '-index 0' - skip indexing. If the index does not exist, the program will terminate
+                                and warn to build the index prior performing the alignment
+       '-index 1' - only perform the indexing and terminate
+       '-index 2' - the default behaviour, the same as when not using this option at all
+
+
+    -L                DOUBLE      Optional  Indexing: seed length.                                  18
+
+    -m                DOUBLE      Optional  Indexing: the amount of memory (in Mbytes) for          3072
+                                            building the index.
+
+    -v                BOOL        Optional  Produce verbose output when building the index          True
+
+    --interval        INT         Optional  Indexing: Positive integer: index every Nth L-mer in    1
+                                            the reference database e.g. '-interval 2'.
+
+    --max_pos         INT         Optional  Indexing: maximum (integer) number of positions to      1000
+                                            store for each unique L-mer.
+                                            If 0 - all positions are stored.
+
+
+    [HELP]
+    -h                BOOL        Optional  Print help information
+
+    --version         BOOL        Optional  Print SortMeRNA version number
+
+
+    [DEVELOPER]
+    --dbg_put_db      BOOL        Optional  
+    --cmd             BOOL        Optional  Launch an interactive session (command prompt)          False
+
+    --task            INT         Optional  Processing Task                                         4
+
+       Possible values: 0 - align. Only perform alignment
+                        1 - post-processing (log writing)
+                        2 - generate reports
+                        3 - align and post-process
+                        4 - all
+
+
+    --dbg-level       INT         Optional  Debug level                                             0
+
+      Controls verbosity of the execution trace. Default value of 0 corresponds to
+      the least verbose output.
+      The highest value currently is 2.
diff --git a/src/sortmerna/script.sh b/src/sortmerna/script.sh
new file mode 100755
index 00000000..8dda3d60
--- /dev/null
+++ b/src/sortmerna/script.sh
@@ -0,0 +1,108 @@
+#!/bin/bash
+# Wrapper that maps the par_* variables onto a sortmerna command line.
+## VIASH START
+## VIASH END
+
+set -eo pipefail
+
+unset_if_false=( par_fastx par_sq par_print_all_reads par_paired_in par_paired_out
+                 par_F par_R par_verbose par_de_novo par_otu_map par_full_search par_out2
+                 par_sout par_sam par_paired )
+
+# Unset flags equal to the string "false" so that ${par_x:+--x} expands to nothing.
+for var in "${unset_if_false[@]}"; do
+    if [ "${!var}" == "false" ]; then
+        unset "$var"
+    fi
+done
+
+reads=()
+IFS=";" read -ra input <<< "$par_input"
+if [ "${#input[@]}" -eq 2 ]; then
+    reads=( --reads "${input[0]}" --reads "${input[1]}" )
+    # exactly two input files: force paired mode regardless of what was passed
+    par_paired=true
+else
+    reads=( --reads "${input[0]}" )
+    par_paired=false
+fi
+
+refs=()
+
+# check if references are input normally or through a manifest file
+if [[ -n "$par_ribo_database_manifest" ]]; then
+    while IFS= read -r path || [[ -n $path ]]; do
+        [[ -z "$path" ]] || refs+=( --ref "$path" )   # skip blank manifest lines
+    done < "$par_ribo_database_manifest"
+
+elif [[ -n "$par_ref" ]]; then
+    IFS=";" read -ra ref <<< "$par_ref"
+    # check if length is 2 and par_paired is set to true
+    if [[ "${#ref[@]}" -eq 2 && "$par_paired" == "true" ]]; then
+        refs=( --ref "${ref[0]}" --ref "${ref[1]}" )
+    # check if length is 1 and par_paired is set to false
+    elif [[ "${#ref[@]}" -eq 1 && "$par_paired" == "false" ]]; then
+        refs=( --ref "${ref[0]}" )
+    else # every other combination of reference count and pairing is rejected
+        echo "Two reference fasta files are required for paired-end reads"
+        exit 1
+    fi
+else
+    echo "No reference fasta file(s) provided"
+    exit 1
+fi
+
+# refs and reads are arrays so that paths containing whitespace stay single arguments.
+sortmerna \
+    "${refs[@]}" \
+    "${reads[@]}" \
+    --workdir . \
+    ${par_output:+--aligned "${par_output}"} \
+    ${par_fastx:+--fastx} \
+    ${par_other:+--other "${par_other}"} \
+    ${par_kvdb:+--kvdb "${par_kvdb}"} \
+    ${par_idx_dir:+--idx-dir "${par_idx_dir}"} \
+    ${par_readb:+--readb "${par_readb}"} \
+    ${par_sam:+--sam} \
+    ${par_sq:+--sq} \
+    ${par_blast:+--blast "${par_blast}"} \
+    ${par_num_alignments:+--num_alignments "${par_num_alignments}"} \
+    ${par_min_lis:+--min_lis "${par_min_lis}"} \
+    ${par_print_all_reads:+--print_all_reads} \
+    ${par_paired_in:+--paired_in} \
+    ${par_paired_out:+--paired_out} \
+    ${par_out2:+--out2} \
+    ${par_sout:+--sout} \
+    ${par_zip_out:+--zip-out "${par_zip_out}"} \
+    ${par_match:+--match "${par_match}"} \
+    ${par_mismatch:+--mismatch "${par_mismatch}"} \
+    ${par_gap_open:+--gap_open "${par_gap_open}"} \
+    ${par_gap_ext:+--gap_ext "${par_gap_ext}"} \
+    ${par_N:+-N "${par_N}"} \
+    ${par_a:+-a "${par_a}"} \
+    ${par_e:+-e "${par_e}"} \
+    ${par_F:+-F} \
+    ${par_R:+-R} \
+    ${par_num_alignment:+--num_alignment "${par_num_alignment}"} \
+    ${par_best:+--best "${par_best}"} \
+    ${par_verbose:+--verbose} \
+    ${par_id:+--id "${par_id}"} \
+    ${par_coverage:+--coverage "${par_coverage}"} \
+    ${par_de_novo:+--de_novo} \
+    ${par_otu_map:+--otu_map} \
+    ${par_num_seed:+--num_seed "${par_num_seed}"} \
+    ${par_passes:+--passes "${par_passes}"} \
+    ${par_edge:+--edge "${par_edge}"} \
+    ${par_full_search:+--full_search} \
+    ${par_index:+--index "${par_index}"} \
+    ${par_L:+-L "${par_L}"} \
+    ${par_interval:+--interval "${par_interval}"} \
+    ${par_max_pos:+--max_pos "${par_max_pos}"}
+
+# Move the log, expected at "<par_output>.log", to the path given in par_log.
+if [ -n "$par_log" ]; then
+    mv "${par_output}.log" "$par_log"
+fi
+
+exit 0
+
diff --git a/src/sortmerna/test.sh b/src/sortmerna/test.sh
new file mode 100644
index 00000000..4d49c5ed
--- /dev/null
+++ b/src/sortmerna/test.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+echo ">>> Testing $meta_functionality_name"
+
+find $meta_resources_dir/test_data/rRNA -type f > test_data/rrna-db.txt
+
+echo ">>> Testing for paired-end reads"
+# out2 separates the read pairs into two files (one fwd and one rev)
+# paired_in outputs both reads of a pair
+# other is the output file for non-rRNA reads
+"$meta_executable" \
+    --output "rRNA_reads" \
+    --other "non_rRNA_reads" \
+    --input "$meta_resources_dir/test_data/reads_1.fq.gz;$meta_resources_dir/test_data/reads_2.fq.gz" \
+    --ribo_database_manifest test_data/rrna-db.txt \
+    --log test_log.log \
+    --paired_in \
+    --fastx \
+    --out2
+    
+# Both mates of every output must exist, so the -f tests are joined with &&.
+echo ">> Checking if the correct files are present"
+[[ -f "rRNA_reads_fwd.fq.gz" ]] && [[ -f "rRNA_reads_rev.fq.gz" ]] || { echo "rRNA output fastq file is missing!"; exit 1; }
+[[ -s "rRNA_reads_fwd.fq.gz" ]] && [[ -s "rRNA_reads_rev.fq.gz" ]] || { echo "rRNA output fastq file is empty!"; exit 1; }
+[[ -f "non_rRNA_reads_fwd.fq.gz" ]] && [[ -f "non_rRNA_reads_rev.fq.gz" ]] || { echo "Non-rRNA output fastq file is missing!"; exit 1; }
+gzip -dk non_rRNA_reads_fwd.fq.gz
+gzip -dk non_rRNA_reads_rev.fq.gz
+[[ ! -s "non_rRNA_reads_fwd.fq" ]] && [[ ! -s "non_rRNA_reads_rev.fq" ]] || { echo "Non-rRNA output fastq file is not empty!"; exit 1;}
+
+rm -f rRNA_reads_fwd.fq.gz rRNA_reads_rev.fq.gz non_rRNA_reads_fwd.fq.gz non_rRNA_reads_rev.fq.gz test_log.log
+rm -rf kvdb/
+
+
+echo ">>> Testing for single-end reads"
+"$meta_executable" \
+    --aligned "rRNA_reads" \
+    --other "non_rRNA_reads" \
+    --input "$meta_resources_dir/test_data/reads_1.fq.gz" \
+    --ref "$meta_resources_dir/test_data/rRNA/database1.fa" \
+    --log test_log.log \
+    --fastx
+# The aligned (rRNA) output must exist but is expected to decompress to an empty file.
+echo ">> Checking if the correct files are present"
+[[ ! -f "rRNA_reads.fq.gz" ]] && echo "rRNA output fastq file is missing!" && exit 1
+gzip -dk rRNA_reads.fq.gz
+[[ -s "rRNA_reads.fq" ]] && echo "rRNA output fastq file is not empty!" && exit 1
+[[ ! -f "non_rRNA_reads.fq.gz" ]] && echo "Non-rRNA output fastq file is missing!" && exit 1
+[[ ! -s "non_rRNA_reads.fq.gz" ]] && echo "Non-rRNA output fastq file is empty!" && exit 1
+
+
+echo ">>> All tests passed"
+exit 0
\ No newline at end of file
diff --git a/src/sortmerna/test_data/rRNA/database1.fa b/src/sortmerna/test_data/rRNA/database1.fa
new file mode 100644
index 00000000..bae23aba
--- /dev/null
+++ b/src/sortmerna/test_data/rRNA/database1.fa
@@ -0,0 +1,24 @@
+>AY846379.1.1791 Eukaryota;Archaeplastida;Chloroplastida;Chlorophyta;Chlorophyceae;Sphaeropleales;Monoraphidium;Monoraphidium sp. Itas 9/21 14-6w
+CCUGGUUGAUCCUGCCAGUAGUCAUAUGCUUGUCUCAAAGAUUAAGCCAUGCAUGUCUAAGUAUAAACUGCUUAUACUGU
+GAAACUGCGAAUGGCUCAUUAAAUCAGUUAUAGUUUAUUUGAUGGUACCUCUACACGGAUAACCGUAGUAAUUCUAGAGC
+UAAUACGUGCGUAAAUCCCGACUUCUGGAAGGGACGUAUUUAUUAGAUAAAAGGCCGACCGAGCUUUGCUCGACCCGCGG
+UGAAUCAUGAUAACUUCACGAAUCGCAUAGCCUUGUGCUGGCGAUGUUUCAUUCAAAUUUCUGCCCUAUCAACUUUCGAU
+GGUAGGAUAGAGGCCUACCAUGGUGGUAACGGGUGACGGAGGAUUAGGGUUCGAUUCCGGAGAGGGAGCCUGAGAAACGG
+CUACCACAUCCAAGGAAGGCAGCAGGCGCGCAAAUUACCCAAUCCUGAUACGGGGAGGUAGUGACAAUAAAUAACAAUGC
+CGGGCAUUUCAUGUCUGGCAAUUGGAAUGAGUACAAUCUAAAUCCCUUAACGAGGAUCAAUUGGAGGGCAAGUCUGGUGC
+CAGCAGCCGCGGUAAUUCCAGCUCCAAUAGCGUAUAUUUAAGUUGUUGCAGUUAAAAAGCUCGUAGUUGGAUUUCGGGUG
+GGUUCCAGCGGUCCGCCUAUGGUGAGUACUGCUGUGGCCCUCCUUUUUGUCGGGGACGGGCUCCUGGGCUUCAUUGUCCG
+GGACUCGGAGUCGACGAUGAUACUUUGAGUAAAUUAGAGUGUUCAAAGCAAGCCUACGCUCUGAAUACUUUAGCAUGGAA
+UAUCGCGAUAGGACUCUGGCCUAUCUCGUUGGUCUGUAGGACCGGAGUAAUGAUUAAGAGGGACAGUCGGGGGCAUUCGU
+AUUUCAUUGUCAGAGGUGAAAUUCUUGGAUUUAUGAAAGACGAACUACUGCGAAAGCAUUUGCCAAGGAUGUUUUCAUUA
+AUCAAGAACGAAAGUUGGGGGCUCGAAGACGAUUAGAUACCGUCGUAGUCUCAACCAUAAACGAUGCCGACUAGGGAUUG
+GAGGAUGUUCUUUUGAUGACUUCUCCAGCACCUUAUGAGAAAUCAAAGUUUUUGGGUUCCGGGGGGAGUAUGGUCGCAAG
+GCUGAAACUUAAAGGAAUUGACGGAAGGGCACCACCAGGCGUGGAGCCUGCGGCUUAAUUUGACUCAACACGGGAAAACU
+UACCAGGUCCAGACAUAGUGAGGAUUGACAGAUUGAGAGCUCUUUCUUGAUUCUAUGGGUGGUGGUGCAUGGCCGUUCUU
+AGUUGGUGGGUUGCCUUGUCAGGUUGAUUCCGGUAACGAACGAGACCUCAGCCUGCUAAAUAUGUCACAUUCGCUUUUUG
+CGGAUGGCCGACUUCUUAGAGGGACUAUUGGCGUUUAGUCAAUGGAAGUAUGAGGCAAUAACAGGUCUGUGAUGCCCUUA
+GAUGUUCUGGGCCGCACGCGCGCUACACUGACGCAUUCAGCAAGCCUAUCCUUGACCGAGAGGUCUGGGUAAUCUUUGAA
+ACUGCGUCGUGAUGGGGAUAGAUUAUUGCAAUUAUUAGUCUUCAACGAGGAAUGCCUAGUAAGCGCAAGUCAUCAGCUUG
+CGUUGAUUACGUCCCUGCCCUUUGUACACACCGCCCGUCGCUCCUACCGAUUGGGUGUGCUGGUGAAGUGUUCGGAUUGG
+CAGAGCGGGUGGCAACACUUGCUUUUGCCGAGAAGUUCAUUAAACCCUCCCACCUAGAGGAAGGAGAAGUCGUAACAAGG
+UUUCCGUAGGUGAACCUGCAGAAG
\ No newline at end of file
diff --git a/src/sortmerna/test_data/rRNA/database2.fa b/src/sortmerna/test_data/rRNA/database2.fa
new file mode 100644
index 00000000..87b5bc99
--- /dev/null
+++ b/src/sortmerna/test_data/rRNA/database2.fa
@@ -0,0 +1,16 @@
+>AB001445.1.1538 Bacteria;Proteobacteria;Gammaproteobacteria;Pseudomonadales;Pseudomonadaceae;Pseudomonas;Pseudomonas amygdali pv. morsprunorum
+AGAGUUUGAUCAUGGCUCAGAUUGAACGCUGGCGGCAGGCCUAACACAUGCAAGUCGAGCGGCAGCACGGGUACUUGUAC
+CUGGUGGCGAGCGGCGGACGGGUGAGUAAUGCCUAGGAAUCUGCCUGGUAGUGGGGGAUAACGCUCGGAAACGGACGCUA
+AUACCGCAUACGUCCUACGGGAGAAAGCAGGGGACCUUCGGGCCUUGCGCUAUCAGAUGAGCCUAGGUCGGAUUAGCUAG
+UUGGUGAGGUAAUGGCUCACCAAGGCGACGAUCCGUAACUGGUCUGAGAGGAUGAUCAGUCACACUGGAACUGAGACACG
+GUCCAGACUCCUACGGGAGGCAGCAGUGGGGAAUAUUGGACAAUGGGCGAAAGCCUGAUCCAGCCAUGCCGCGUGUGUGA
+AGAAGGUCUUCGGAUUGUAAAGCACUUUAAGUUGGGAGGAAGGGCAGUUACCUAAUACGUAUCUGUUUUGACGUUACCGA
+CAGAAUAAGCACCGGCUAACUCUGUGCCAGCAGCCGCGGUAAUACAGAGGGUGCAAGCGUUAAUCGGAAUUACUGGGCGU
+AAAGCGCGCGUAGGUGGUUUGUUAAGUUGAAUGUGAAAUCCCCGGGCUCAACCUGGGAACUGCAUCCAAAACUGGCAAGC
+UAGAGUAUGGUAGAGGGUGGUGGAAUUUCCUGUGUAGCGGUGAAAUGCGUAGAUAUAGGAAGGAACACCAGUGGCGAAGG
+CGACCACCUGGACUGAUACUGACACUGAGGUGCGAAAGCGUGGGGAGCAAACAGGAUUAGAUACCCUGGUAGUCCACGCC
+GUAAACGAUGUCAACUAGCCGUUGGGAGCCUUGAGCUCUUAGUGGCGCAGCUAACGCAUUAAGUUGACCGCCUGGGGAGU
+ACGGCCGCAAGGUUAAAACUCAAAUGAAUUGACGGGGGCCCGCACAAGCGGUGGAGCAUGUGGUUUAAUUCGAAGCAACG
+CGAAGAACCUUACCAGGCCUUGACAUCCAAUGAAUCCUUUAGAGAUAGAGGAGUGCCUUCGGGAGCAUUGAGACAGGUGC
+UGCAUGGCUGUCGUCAGCUCGUGUCGUGAGAUGUUGGGUUAAGUCCCGUAACGAGCGCAACCCUUGUCCUUAGUUACCAG
+CACGUCAUGGUGGGCACUCUAAGGAGACUGCCGGUGACAAACCGGAGGAAGGUGGGGAUGACGUCAAGUCAUCAUGGCCC
diff --git a/src/sortmerna/test_data/reads_1.fq.gz b/src/sortmerna/test_data/reads_1.fq.gz
new file mode 100644
index 0000000000000000000000000000000000000000..41c02a22dbbae13db84acf1e79bc4fc3fa8589e6
GIT binary patch
literal 189
zcmV;u07CyCiwFo$iqvKR19D|yWOH9JE@p86wU0dx!Y~Yl_nZQWu>(o}P^}JqwIX+b
zPO-%OPl6LsC<}stmpJjW<4E7M&Ycg<xuv0{WvQ>1S#Apj3L$tq`=RbA_@+MuTFFyl
z78alq=EMofi84dLb}3jyAYujED%nAymVqrZpNFl>Dky^%D&<ox3$Aj=I>v_(cRIcb
rrwC*NyuH|rwZ}M?)J<PfHX9|Ll<5=Yj_XIzKTzHQmEYsV%K-oY|JGAn

literal 0
HcmV?d00001

diff --git a/src/sortmerna/test_data/reads_2.fq.gz b/src/sortmerna/test_data/reads_2.fq.gz
new file mode 100644
index 0000000000000000000000000000000000000000..9d0f8d3f82dc114add66bde14727742aa60d87ee
GIT binary patch
literal 147
zcmV;E0BrvsiwFqp`$S~`19D|yWOH9KE@p86Rf{_g!!Qg(cb}p_#tgOcE2624VAw;N
z$wTjdl2T59whz#U6!ko|Im-B$be*)6;k9r1T~t&=BKxuqvq~J7o9LlYt68=T^x3Rh
zMGeS=Cf&2pc5v_jP&ehk104RrBv1Ym`T(a(7f3&JU*nzt7r<Yli4T(P4++Tt002W4
BLoEOR

literal 0
HcmV?d00001

diff --git a/src/sortmerna/test_data/script.sh b/src/sortmerna/test_data/script.sh
new file mode 100755
index 00000000..b2531248
--- /dev/null
+++ b/src/sortmerna/test_data/script.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Clone snakemake-wrappers into /tmp unless a checkout is already there.
+if [ ! -d /tmp/sortmerna_source ]; then
+  git clone --depth 2 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers.git /tmp/sortmerna_source
+fi
+
+# copy the contents of the wrapper's bio/sortmerna/test directory into the current directory
+cp -r /tmp/sortmerna_source/bio/sortmerna/test/* .

From 592a080451e32647a7e4ec41261a55d9cda0aea0 Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Sun, 1 Sep 2024 21:21:55 +0200
Subject: [PATCH 4/5] Update changelog, shorten description

---
 CHANGELOG.md                  | 3 +++
 src/sortmerna/config.vsh.yaml | 4 +---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3e9f40fc..16d9120f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -131,6 +131,9 @@
     - `bedtools_getfasta`: extract sequences from a FASTA file for each of the
                            intervals defined in a BED/GFF/VCF file (PR #59).
 
+* `sortmerna`: Local sequence alignment tool for mapping, clustering, and filtering rRNA from metatranscriptomic 
+               data. (PR #146)
+
 ## MINOR CHANGES
 
 * Uniformize component metadata (PR #23).
diff --git a/src/sortmerna/config.vsh.yaml b/src/sortmerna/config.vsh.yaml
index 23925132..6477660f 100644
--- a/src/sortmerna/config.vsh.yaml
+++ b/src/sortmerna/config.vsh.yaml
@@ -1,9 +1,7 @@
 name: sortmerna
 description: | 
   Local sequence alignment tool for filtering, mapping and clustering. The main 
-  application of SortMeRNA is filtering rRNA from metatranscriptomic data. SortMeRNA
-  takes as input files of reads (fasta, fastq, fasta.gz, fastq.gz) and one or multiple
-  rRNA database file(s), and sorts apart aligned and rejected reads into two files.
+  application of SortMeRNA is filtering rRNA from metatranscriptomic data.
 keywords: [sort, mRNA, rRNA, alignment, filtering, mapping, clustering]
 links:
   homepage: https://sortmerna.readthedocs.io/en/latest/

From c6a747f16dbe23819e76a3b32ceb23fb7e932288 Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Sun, 8 Sep 2024 17:28:14 +0200
Subject: [PATCH 5/5] Add more test scenarios

---
 src/sortmerna/test.sh | 53 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 51 insertions(+), 2 deletions(-)

diff --git a/src/sortmerna/test.sh b/src/sortmerna/test.sh
index 4d49c5ed..390b9307 100644
--- a/src/sortmerna/test.sh
+++ b/src/sortmerna/test.sh
@@ -4,7 +4,7 @@ echo ">>> Testing $meta_functionality_name"
 
 find $meta_resources_dir/test_data/rRNA -type f > test_data/rrna-db.txt
 
-echo ">>> Testing for paired-end reads"
+echo ">>> Testing for paired-end reads and database manifest"
 # out2 separates the read pairs into two files (one fwd and one rev)
 # paired_in outputs both reads of a pair
 # other is the output file for non-rRNA reads
@@ -30,8 +30,33 @@ gzip -dk non_rRNA_reads_rev.fq.gz
 rm -f rRNA_reads_fwd.fq.gz rRNA_reads_rev.fq.gz non_rRNA_reads_fwd.fq.gz non_rRNA_reads_rev.fq.gz test_log.log
 rm -rf kvdb/
 
+################################################################################
+echo ">>> Testing for paired-end reads and --ref and --paired_out arguments"
+"$meta_executable" \
+    --output "rRNA_reads" \
+    --other "non_rRNA_reads" \
+    --input "$meta_resources_dir/test_data/reads_1.fq.gz;$meta_resources_dir/test_data/reads_2.fq.gz" \
+    --ref "$meta_resources_dir/test_data/rRNA/database1.fa;$meta_resources_dir/test_data/rRNA/database2.fa" \
+    --log test_log.log \
+    --paired_out \
+    --fastx \
+    --out2
+
+echo ">> Checking if the correct files are present"
+# Require both mates and decompress both, so no size test runs against a missing file.
+[[ -f "rRNA_reads_fwd.fq.gz" ]] && [[ -f "rRNA_reads_rev.fq.gz" ]] || { echo "rRNA output fastq file is missing!"; exit 1; }
+gzip -dkf rRNA_reads_fwd.fq.gz rRNA_reads_rev.fq.gz
+[[ ! -s "rRNA_reads_fwd.fq" ]] && [[ ! -s "rRNA_reads_rev.fq" ]] || { echo "rRNA output fastq file is not empty!"; exit 1; }
+[[ -f "non_rRNA_reads_fwd.fq.gz" ]] && [[ -f "non_rRNA_reads_rev.fq.gz" ]] || { echo "Non-rRNA output fastq file is missing!"; exit 1; }
+gzip -dkf non_rRNA_reads_fwd.fq.gz non_rRNA_reads_rev.fq.gz
+[[ -s "non_rRNA_reads_fwd.fq" ]] && [[ -s "non_rRNA_reads_rev.fq" ]] || { echo "Non-rRNA output fastq file is empty!"; exit 1; }
+
+rm -f rRNA_reads_fwd.fq.gz rRNA_reads_rev.fq.gz non_rRNA_reads_fwd.fq.gz non_rRNA_reads_rev.fq.gz test_log.log
+rm -rf kvdb/
+
+################################################################################
 
-echo ">>> Testing for single-end reads"
+echo ">>> Testing for single-end reads and --ref argument"
 "$meta_executable" \
     --aligned "rRNA_reads" \
     --other "non_rRNA_reads" \
@@ -47,6 +72,30 @@ gzip -dk rRNA_reads.fq.gz
 [[ ! -f "non_rRNA_reads.fq.gz" ]] && echo "Non-rRNA output fastq file is missing!" && exit 1
 [[ ! -s "non_rRNA_reads.fq.gz" ]] && echo "Non-rRNA output fastq file is empty!" && exit 1
 
+rm -f rRNA_reads.fq.gz non_rRNA_reads.fq.gz test_log.log
+rm -rf kvdb/
+
+################################################################################
+# Two input files are passed, so this is a paired-end run; the checks expect *_paired and *_singleton outputs.
+echo ">>> Testing for paired-end reads with singleton output files"
+"$meta_executable" \
+    --aligned "rRNA_reads" \
+    --other "non_rRNA_reads" \
+    --input "$meta_resources_dir/test_data/reads_1.fq.gz;$meta_resources_dir/test_data/reads_2.fq.gz" \
+    --ribo_database_manifest test_data/rrna-db.txt \
+    --log test_log.log \
+    --fastx \
+    --sout
+
+echo ">> Checking if the correct files are present"
+[[ ! -f "rRNA_reads_paired.fq.gz" ]] && echo "Aligned paired fwd output fastq file is missing!" && exit 1
+[[ ! -f "rRNA_reads_singleton.fq.gz" ]] && echo "Aligned singleton fwd output fastq file is missing!" && exit 1
+# NOTE(review): the next two checks look for uncompressed .fq files. Earlier sections create such
+# files with gzip -dk/-dkf and never delete them, so these may pass on stale files - confirm intent.
+[[ ! -f "non_rRNA_reads_fwd.fq" ]] && echo "Non-rRNA fwd output fastq file is missing!" && exit 1
+[[ ! -f "non_rRNA_reads_rev.fq" ]] && echo "Non-rRNA rev output fastq file is missing!" && exit 1
+[[ ! -f "non_rRNA_reads_singleton.fq.gz" ]] && echo "Non-rRNA singleton output fastq file is missing!" && exit 1
+[[ ! -f "non_rRNA_reads_paired.fq.gz" ]] && echo "Non-rRNA paired output fastq file is missing!" && exit 1
 
 echo ">>> All tests passed"
 exit 0
\ No newline at end of file