From 2f9c7f7b38c45e856eda9e35204c8f6befc32f84 Mon Sep 17 00:00:00 2001
From: Theodoro Gasperin Terra Camargo
 <98555209+tgaspe@users.noreply.github.com>
Date: Thu, 18 Jul 2024 17:19:44 -0300
Subject: [PATCH 1/9] Seqtk subseq (#85)

* Config file and help.txt file

Created:
- config.vsh.yaml
- help.txt

* Added script.sh

File added:
- script.sh

* Created test.sh

updates:
- changes to config.vsh.yaml
- created test.sh
- created some test files

Problems:
- there is some error in config file that is preventing me from running the component and testing

* Update on test.sh

* update

* Bug fixes

- config required: false bug

* Update test

* Update CHANGELOG.md

* Improvement on test.sh

* Added more test

I tried out different option of the command with different fasta and fastq files and different list, but the output does not seem to change.

* Update on tests

- got unstuck
- I need to create a docker image with the lastest version of seqtk
-

* Bug fixed

- removed some test files
- fixed bug with the help of Toni
- added correct software_versions.txt to config

Still Needs:
- add one more test to strand aware
- fix tab test

* Update CHANGELOG.md

* Fixed Tabular test bug

* Strand Aware Test

- implementation of strand aware test
- change of format for reg.bed file

* Input validation for list file

- input validation for name_list parameter

* Sugested Changes

- removed test_data dir
- removed input validation

* Added author info

* Update CHANGELOG.md

* Update theodoro_gasperin.yaml

* add newline

* add newline

* Update src/seqtk/seqtk_subseq/config.vsh.yaml

Co-authored-by: Robrecht Cannoodt <rcannood@gmail.com>

* Version Fix

* Update on config

* Helper bed.sh

* Deleted _helpers

* don't forget exit when a test fails

---------

Co-authored-by: Robrecht Cannoodt <rcannood@gmail.com>
---
 CHANGELOG.md                           |  30 ++--
 src/_authors/theodoro_gasperin.yaml    |  10 ++
 src/seqtk/seqtk_subseq/config.vsh.yaml |  78 +++++++++++
 src/seqtk/seqtk_subseq/help.txt        |   9 ++
 src/seqtk/seqtk_subseq/script.sh       |  15 ++
 src/seqtk/seqtk_subseq/test.sh         | 182 +++++++++++++++++++++++++
 6 files changed, 305 insertions(+), 19 deletions(-)
 create mode 100644 src/_authors/theodoro_gasperin.yaml
 create mode 100644 src/seqtk/seqtk_subseq/config.vsh.yaml
 create mode 100644 src/seqtk/seqtk_subseq/help.txt
 create mode 100644 src/seqtk/seqtk_subseq/script.sh
 create mode 100644 src/seqtk/seqtk_subseq/test.sh

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4e6a0369..d6256e72 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,9 +2,16 @@
 
 ## NEW FEATURES
 
-* `bd_rhapsody`:
+* `bd_rhapsody/bd_rhapsody_make_reference`: Create a reference for the BD Rhapsody pipeline (PR #75).
 
-  - `bd_rhapsody/bd_rhapsody_make_reference`: Create a reference for the BD Rhapsody pipeline (PR #75).
+* `umitools/umitools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #54).
+
+* `seqtk`:
+  - `seqtk/seqtk_sample`: Subsamples sequences from FASTA/Q files (PR #68).
+  - `seqtk/seqtk_subseq`: Extract the sequences (complete or subsequence) from the FASTA/FASTQ files
+                based on a provided sequence IDs or region coordinates file (PR #85).
+
+* `agat/agat_convert_sp_gff2gtf`: convert any GTF/GFF file into a proper GTF file (PR #76).
 
 ## MINOR CHANGES
 
@@ -38,14 +45,8 @@
 
 * `multiqc`: update multiple separator to `;` (PR #81).
 
-# biobox 0.1.0
-
-## BREAKING CHANGES
-
-* Change default `multiple_sep` to `;` (PR #25). This aligns with an upcoming breaking change in
-  Viash 0.9.0 in order to avoid issues with the current default separator `:` unintentionally
-  splitting up certain file paths.
 
+# biobox 0.1.0
 
 ## NEW FEATURES
 
@@ -94,21 +95,12 @@
     - `samtools/samtools_fastq`: Converts a SAM/BAM/CRAM file to FASTQ (PR #52).
     - `samtools/samtools_fastq`: Converts a SAM/BAM/CRAM file to FASTA (PR #53).
 
-
 * `falco`: A C++ drop-in replacement of FastQC to assess the quality of sequence read data (PR #43).
 
-* `seqtk/seqtk_sample`: Sample sequences from FASTA/Q(.gz) files to FASTA/Q (PR #68).
-
-* `umitools`:
-    - `umitools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #54).
-
 * `bedtools`:
     - `bedtools_getfasta`: extract sequences from a FASTA file for each of the
                            intervals defined in a BED/GFF/VCF file (PR #59).
 
-* `agat`:
-    - `agat_convert_sp_gff2gtf`: convert any GTF/GFF file into a proper GTF file (PR #76).
-
 ## MINOR CHANGES
 
 * Uniformize component metadata (PR #23).
@@ -129,4 +121,4 @@
 
 * Add escaping character before leading hashtag in the description field of the config file (PR #50).
 
-* Format URL in biobase/bcl_convert description (PR #55).
\ No newline at end of file
+* Format URL in biobase/bcl_convert description (PR #55).
diff --git a/src/_authors/theodoro_gasperin.yaml b/src/_authors/theodoro_gasperin.yaml
new file mode 100644
index 00000000..47af96a9
--- /dev/null
+++ b/src/_authors/theodoro_gasperin.yaml
@@ -0,0 +1,10 @@
+name: Theodoro Gasperin Terra Camargo
+info:
+  links:
+    email: theodorogtc@gmail.com
+    github: tgaspe
+    linkedin: theodoro-gasperin-terra-camargo
+  organizations:
+    - name: Data Intuitive
+      href: https://www.data-intuitive.com
+      role: Bioinformatician
diff --git a/src/seqtk/seqtk_subseq/config.vsh.yaml b/src/seqtk/seqtk_subseq/config.vsh.yaml
new file mode 100644
index 00000000..1c2e8c08
--- /dev/null
+++ b/src/seqtk/seqtk_subseq/config.vsh.yaml
@@ -0,0 +1,78 @@
+name: seqtk_subseq
+namespace: seqtk
+description: | 
+  Extract subsequences from FASTA/Q files. Takes as input a FASTA/Q file and a name.lst (sequence ids file) or a reg.bed (genomic regions file).
+keywords: [subseq, FASTA, FASTQ]
+links:
+  repository: https://github.com/lh3/seqtk/tree/v1.4
+license: MIT
+authors:
+  - __merge__: /src/_authors/theodoro_gasperin.yaml
+    roles: [ author, maintainer ]
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: "--input"
+        type: file
+        direction: input
+        description: The input FASTA/Q file.
+        required: true
+        example: input.fa
+        
+      - name: "--name_list"
+        type: file
+        direction: input
+        description: | 
+          List of sequence names (name.lst) or genomic regions (reg.bed) to extract.
+        required: true
+        example: list.lst
+
+  - name: Outputs
+    arguments:
+      - name: "--output"
+        alternatives: -o
+        type: file
+        direction: output
+        description: The output FASTA/Q file.
+        required: true
+        default: output.fa
+
+  - name: Options
+    arguments:
+      - name: "--tab"
+        alternatives: -t
+        type: boolean_true
+        description: TAB delimited output.
+        
+      - name: "--strand_aware"
+        alternatives: -s
+        type: boolean_true
+        description: Strand aware.
+        
+      - name: "--sequence_line_length"
+        alternatives: -l
+        type: integer
+        description: | 
+          Sequence line length of input fasta file. Default: 0.
+        example: 0
+        
+
+resources:
+  - type: bash_script
+    path: script.sh
+test_resources:
+  - type: bash_script
+    path: test.sh
+
+engines:
+  - type: docker
+    image: quay.io/biocontainers/seqtk:1.4--he4a0461_2
+    setup:
+      - type: docker
+        run: |
+          echo $(echo $(seqtk 2>&1) | sed -n 's/.*\(Version: [^ ]*\).*/\1/p') > /var/software_versions.txt
+
+runners:
+  - type: executable
+  - type: nextflow
diff --git a/src/seqtk/seqtk_subseq/help.txt b/src/seqtk/seqtk_subseq/help.txt
new file mode 100644
index 00000000..5768e4ff
--- /dev/null
+++ b/src/seqtk/seqtk_subseq/help.txt
@@ -0,0 +1,9 @@
+```bash
+seqtk subseq
+```
+Usage:   seqtk subseq [options] <in.fa> <in.bed>|<name.list>
+Options:
+  -t       TAB delimited output
+  -s       strand aware
+  -l INT   sequence line length [0]
+Note: Use 'samtools faidx' if only a few regions are intended.
\ No newline at end of file
diff --git a/src/seqtk/seqtk_subseq/script.sh b/src/seqtk/seqtk_subseq/script.sh
new file mode 100644
index 00000000..0aceaf29
--- /dev/null
+++ b/src/seqtk/seqtk_subseq/script.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+
+[[ "$par_tab" == "false" ]] && unset par_tab
+[[ "$par_strand_aware" == "false" ]] && unset par_strand_aware
+
+seqtk subseq \
+    ${par_tab:+-t} \
+    ${par_strand_aware:+-s} \
+    ${par_sequence_line_length:+-l "$par_sequence_line_length"} \
+    "$par_input" \
+    "$par_name_list" \
+    > "$par_output"
diff --git a/src/seqtk/seqtk_subseq/test.sh b/src/seqtk/seqtk_subseq/test.sh
new file mode 100644
index 00000000..f19cfa4a
--- /dev/null
+++ b/src/seqtk/seqtk_subseq/test.sh
@@ -0,0 +1,182 @@
+#!/bin/bash
+
+# exit on error
+set -e
+
+## VIASH START
+meta_executable="target/executable/seqtk/seqtk_subseq"
+meta_resources_dir="src/seqtk"
+## VIASH END
+
+# Create directories for tests
+echo "Creating Test Data..."
+mkdir test_data
+
+# Create and populate input.fasta
+cat > "test_data/input.fasta" <<EOL
+>KU562861.1
+GGAGCAGGAGAGTGTTCGAGTTCAGAGATGTCCATGGCGCCGTACGAGAAGGTGATGGATGACCTGGCCA
+AGGGGCAGCAGTTCGCGACGCAGCTGCAGGGCCTCCTCCGGGACTCCCCCAAGGCCGGCCACATCATGGA
+>GU056837.1
+CTAATTTTATTTTTTTATAATAATTATTGGAGGAACTAAAACATTAATGAAATAATAATTATCATAATTA
+TTAATTACATATTTATTAGGTATAATATTTAAGGAAAAATATATTTTATGTTAATTGTAATAATTAGAAC
+>CP097510.1
+CGATTTAGATCGGTGTAGTCAACACACATCCTCCACTTCCATTAGGCTTCTTGACGAGGACTACATTGAC
+AGCCACCGAGGGAACCGACCTCCTCAATGAAGTCAGACGCCAAGAGCCTATCAACTTCCTTCTGCACAGC
+>JAMFTS010000002.1
+CCTAAACCCTAAACCCTAAACCCCCTACAAACCTTACCCTAAACCCTAAACCCTAAACCCTAAACCCTAA
+ACCCGAAACCCTATACCCTAAACCCTAAACCCTAAACCCTAAACCCTAACCCAAACCTAATCCCTAAACC
+>MH150936.1
+TAGAAGCTAATGAAAACTTTTCCTTTACTAAAAACCGTCAAACACGGTAAGAAACGCTTTTAATCATTTC
+AAAAGCAATCCCAATAGTGGTTACATCCAAACAAAACCCATTTCTTATATTTTCTCAAAAACAGTGAGAG
+EOL
+
+# Update id.list with new entries
+cat > "test_data/id.list" <<EOL
+KU562861.1
+MH150936.1
+EOL
+
+# Create and populate reg.bed
+cat > "test_data/reg.bed" <<EOL
+KU562861.1$(echo -e "\t")10$(echo -e "\t")20$(echo -e "\t")region$(echo -e "\t")0$(echo -e "\t")+$(echo -e "\n")
+MH150936.1$(echo -e "\t")10$(echo -e "\t")20$(echo -e "\t")region$(echo -e "\t")0$(echo -e "\t")-
+EOL
+
+#########################################################################################
+# Run basic test
+mkdir test1
+cd test1
+
+echo "> Run seqtk_subseq on FASTA/Q file"
+"$meta_executable" \
+  --input "../test_data/input.fasta" \
+  --name_list "../test_data/id.list" \
+  --output "sub_sample.fq"
+
+expected_output_basic=">KU562861.1
+GGAGCAGGAGAGTGTTCGAGTTCAGAGATGTCCATGGCGCCGTACGAGAAGGTGATGGATGACCTGGCCAAGGGGCAGCAGTTCGCGACGCAGCTGCAGGGCCTCCTCCGGGACTCCCCCAAGGCCGGCCACATCATGGA
+>MH150936.1
+TAGAAGCTAATGAAAACTTTTCCTTTACTAAAAACCGTCAAACACGGTAAGAAACGCTTTTAATCATTTCAAAAGCAATCCCAATAGTGGTTACATCCAAACAAAACCCATTTCTTATATTTTCTCAAAAACAGTGAGAG"
+output_basic=$(cat sub_sample.fq)
+
+if [ "$output_basic" != "$expected_output_basic" ]; then
+  echo "Test failed"
+  echo "Expected:"
+  echo "$expected_output_basic"
+  echo "Got:"
+  echo "$output_basic"
+  exit 1
+fi
+
+#########################################################################################
+# Run reg.bed as name list input test
+cd ..
+mkdir test2
+cd test2
+
+echo "> Run seqtk_subseq on FASTA/Q file with BED file as name list"
+"$meta_executable" \
+  --input "../test_data/input.fasta" \
+  --name_list "../test_data/reg.bed" \
+  --output "sub_sample.fq"
+
+expected_output_basic=">KU562861.1:11-20
+AGTGTTCGAG
+>MH150936.1:11-20
+TGAAAACTTT"
+output_basic=$(cat sub_sample.fq)
+
+if [ "$output_basic" != "$expected_output_basic" ]; then
+  echo "Test failed"
+  echo "Expected:"
+  echo "$expected_output_basic"
+  echo "Got:"
+  echo "$output_basic"
+  exit 1
+fi
+
+#########################################################################################
+# Run tab option output test
+cd ..
+mkdir test3
+cd test3
+
+echo "> Run seqtk_subseq with TAB option"
+"$meta_executable" \
+  --tab \
+  --input "../test_data/input.fasta" \
+  --name_list "../test_data/reg.bed" \
+  --output "sub_sample.fq"
+
+expected_output_tabular=$'KU562861.1\t11\tAGTGTTCGAG\nMH150936.1\t11\tTGAAAACTTT'
+output_tabular=$(cat sub_sample.fq)
+
+if [ "$output_tabular" != "$expected_output_tabular" ]; then
+  echo "Test failed"
+  echo "Expected:"
+  echo "$expected_output_tabular"
+  echo "Got:"
+  echo "$output_tabular"
+  exit 1
+fi
+
+#########################################################################################
+# Run line option output test
+cd ..
+mkdir test4
+cd test4
+
+echo "> Run seqtk_subseq with line length option"
+"$meta_executable" \
+  --sequence_line_length 5 \
+  --input "../test_data/input.fasta" \
+  --name_list "../test_data/reg.bed" \
+  --output "sub_sample.fq"
+
+expected_output_wrapped=">KU562861.1:11-20
+AGTGT
+TCGAG
+>MH150936.1:11-20
+TGAAA
+ACTTT"
+output_wrapped=$(cat sub_sample.fq)
+
+if [ "$output_wrapped" != "$expected_output_wrapped" ]; then
+  echo "Test failed"
+  echo "Expected:"
+  echo "$expected_output_wrapped"
+  echo "Got:"
+  echo "$output_wrapped"
+  exit 1
+fi
+
+#########################################################################################
+# Run Strand Aware option output test
+cd ..
+mkdir test5
+cd test5
+
+echo "> Run seqtk_subseq with strand aware option"
+"$meta_executable" \
+  --strand_aware \
+  --input "../test_data/input.fasta" \
+  --name_list "../test_data/reg.bed" \
+  --output "sub_sample.fq"
+
+expected_output_wrapped=">KU562861.1:11-20
+AGTGTTCGAG
+>MH150936.1:11-20
+AAAGTTTTCA"
+output_wrapped=$(cat sub_sample.fq)
+
+if [ "$output_wrapped" != "$expected_output_wrapped" ]; then
+  echo "Test failed"
+  echo "Expected:"
+  echo "$expected_output_wrapped"
+  echo "Got:"
+  echo "$output_wrapped"
+  exit 1
+fi
+
+echo "All tests succeeded!"

From 3566232d39446d6ac19db154e6046e8b000f51af Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Wed, 24 Jul 2024 17:46:02 +0200
Subject: [PATCH 2/9] UMI-tools extract (#71)

* initial commit dedup

* Revert "initial commit dedup"

This reverts commit 38f586bec0ac9e4312b016e29c3aa0bd53f292b2.

* config and help files

* tests template

* script, preliminary version of tests

* new test data

* include arguments and script from the rnaseq.vsh version

* new version of the script

* Fixed bugs, component functional

* Small formatting changes, update chagelog

* Argument formatting in config

* md formatting and ordering changes

* remove second link to doc

* Update src/umi_tools/umi_tools_extract/config.vsh.yaml

Co-authored-by: Robrecht Cannoodt <rcannood@gmail.com>

* Reduce the size of the test data and add back missing arguments to config

* simplify argument names

* Update src/umi_tools/umi_tools_extract/config.vsh.yaml

Co-authored-by: Robrecht Cannoodt <rcannood@gmail.com>

* more consistent arg names, remove "paired" argument and adapt script

---------

Co-authored-by: Robrecht Cannoodt <rcannood@gmail.com>
---
 CHANGELOG.md                                  |   3 +
 .../umi_tools_extract/config.vsh.yaml         | 197 ++++++++++++++++++
 src/umi_tools/umi_tools_extract/help.txt      | 106 ++++++++++
 src/umi_tools/umi_tools_extract/script.sh     |  88 ++++++++
 src/umi_tools/umi_tools_extract/test.sh       |  86 ++++++++
 .../test_data/scrb_seq_fastq.1_30             | 120 +++++++++++
 .../test_data/scrb_seq_fastq.1_30.extract     | 120 +++++++++++
 .../test_data/scrb_seq_fastq.2_30             | 120 +++++++++++
 .../test_data/scrb_seq_fastq.2_30.extract     | 120 +++++++++++
 .../umi_tools_extract/test_data/script.sh     |  34 +++
 .../test_data/slim_30.extract                 | 120 +++++++++++
 .../umi_tools_extract/test_data/slim_30.fastq | 120 +++++++++++
 12 files changed, 1234 insertions(+)
 create mode 100644 src/umi_tools/umi_tools_extract/config.vsh.yaml
 create mode 100644 src/umi_tools/umi_tools_extract/help.txt
 create mode 100644 src/umi_tools/umi_tools_extract/script.sh
 create mode 100644 src/umi_tools/umi_tools_extract/test.sh
 create mode 100644 src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.1_30
 create mode 100644 src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.1_30.extract
 create mode 100644 src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.2_30
 create mode 100644 src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.2_30.extract
 create mode 100755 src/umi_tools/umi_tools_extract/test_data/script.sh
 create mode 100644 src/umi_tools/umi_tools_extract/test_data/slim_30.extract
 create mode 100644 src/umi_tools/umi_tools_extract/test_data/slim_30.fastq

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d6256e72..665b587d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -95,6 +95,9 @@
     - `samtools/samtools_fastq`: Converts a SAM/BAM/CRAM file to FASTQ (PR #52).
     - `samtools/samtools_fastq`: Converts a SAM/BAM/CRAM file to FASTA (PR #53).
 
+* `umi_tools`:
+    -`umi_tools/umi_tools_extract`: Flexible removal of UMI sequences from fastq reads (PR #71).
+
 * `falco`: A C++ drop-in replacement of FastQC to assess the quality of sequence read data (PR #43).
 
 * `bedtools`:
diff --git a/src/umi_tools/umi_tools_extract/config.vsh.yaml b/src/umi_tools/umi_tools_extract/config.vsh.yaml
new file mode 100644
index 00000000..b93c8cb9
--- /dev/null
+++ b/src/umi_tools/umi_tools_extract/config.vsh.yaml
@@ -0,0 +1,197 @@
+name: umi_tools_extract
+namespace: umi_tools
+description: |
+  Flexible removal of UMI sequences from fastq reads.
+  UMIs are removed and appended to the read name. Any other barcode, for example a library barcode,
+  is left on the read. Can also filter reads by quality or against a whitelist.
+keywords: [ extract, umi-tools, umi, fastq ]
+links:
+  homepage: https://umi-tools.readthedocs.io/en/latest/
+  documentation: https://umi-tools.readthedocs.io/en/latest/reference/extract.html
+  repository: https://github.com/CGATOxford/UMI-tools
+references: 
+  doi: 10.1101/gr.209601.116
+license: MIT
+
+argument_groups:
+
+  - name: Input
+    arguments: 
+    - name: --input
+      type: file
+      required: true
+      description: File containing the input data.
+      example: sample.fastq
+    - name: --read2_in
+      type: file
+      required: false
+      description: File containing the input data for the R2 reads (if paired). If provided, a <list of other required arguments> need to be provided.
+      example: sample_R2.fastq
+    - name: --bc_pattern
+      alternatives: -p
+      type: string
+      description: |
+        The UMI barcode pattern to use e.g. 'NNNNNN' indicates that the first 6 nucleotides 
+        of the read are from the UMI.
+    - name: --bc_pattern2
+      type: string
+      description: The UMI barcode pattern to use for read 2.
+    
+  - name: "Output"
+    arguments:  
+    - name: --output
+      type: file
+      required: true
+      description: Output file for read 1.
+      direction: output
+    - name: --read2_out
+      type: file
+      description: Output file for read 2.
+      direction: output
+    - name: --filtered_out
+      type: file
+      description: |
+        Write out reads not matching regex pattern or cell barcode whitelist to this file.
+    - name: --filtered_out2
+      type: file
+      description: |
+        Write out read pairs not matching regex pattern or cell barcode whitelist to this file.
+
+  - name: Extract Options
+    arguments:
+    - name: --extract_method
+      type: string
+      choices: [string, regex]
+      description: |
+        UMI pattern to use. Default: `string`.
+      example: "string"
+    - name: --error_correct_cell
+      type: boolean_true
+      description: Error correct cell barcodes to the whitelist.
+    - name: --whitelist
+      type: file
+      description: |
+        Whitelist of accepted cell barcodes tab-separated format, where column 1 is the whitelisted
+        cell barcodes and column 2 is the list (comma-separated) of other cell barcodes which should 
+        be corrected to the barcode in column 1. If the --error_correct_cell option is not used, this
+        column will be ignored.
+    - name: --blacklist
+      type: file
+      description: BlackWhitelist of cell barcodes to discard.
+    - name: --subset_reads
+      type: integer
+      description: Only parse the first N reads.
+    - name: --quality_filter_threshold
+      type: integer
+      description: Remove reads where any UMI base quality score falls below this threshold.
+    - name: --quality_filter_mask
+      type: string
+      description: |
+        If a UMI base has a quality below this threshold, replace the base with 'N'.
+    - name: --quality_encoding
+      type: string
+      choices: [phred33, phred64, solexa]
+      description: |
+        Quality score encoding. Choose from:
+          * phred33 [33-77]
+          * phred64 [64-106]
+          * solexa [59-106]
+    - name: --reconcile_pairs
+      type: boolean_true
+      description: |
+        Allow read 2 infile to contain reads not in read 1 infile. This enables support for upstream protocols
+        where read one contains cell barcodes, and the read pairs have been filtered and corrected without regard
+        to the read2.
+    - name: --three_prime
+      alternatives: --3prime
+      type: boolean_true
+      description: |
+        By default the barcode is assumed to be on the 5' end of the read, but use this option to sepecify that it is
+        on the 3' end instead. This option only works with --extract_method=string since 3' encoding can be specified
+        explicitly with a regex, e.g `.*(?P<umi_1>.{5})$`.
+    - name: --ignore_read_pair_suffixes
+      type: boolean_true
+      description: |
+        Ignore "/1" and "/2" read name suffixes. Note that this options is required if the suffixes are not whitespace
+        separated from the rest of the read name.
+        arguments:
+    - name: --umi_separator
+      type: string
+      description: |
+        The character that separates the UMI in the read name. Most likely a colon if you skipped the extraction with
+        UMI-tools and used other software. Default: `_`
+      example: "_"
+    - name: --grouping_method
+      type: string
+      choices: [unique, percentile, cluster, adjacency, directional]
+      description: |
+        Method to use to determine read groups by subsuming those with similar UMIs. All methods start by identifying
+        the reads with the same mapping position, but treat similar yet nonidentical UMIs differently. Default: `directional`
+      example: "directional"
+    - name: --umi_discard_read
+      type: integer
+      choices: [0, 1, 2]
+      description: |
+        After UMI barcode extraction discard either R1 or R2 by setting this parameter to 1 or 2, respectively. Default: `0`
+      example: 0
+
+  - name: Common Options
+    arguments:
+    - name: --log
+      type: file
+      description: File with logging information.
+      direction: output
+    - name: --log2stderr
+      type: boolean_true
+      description: Send logging information to stderr.
+      direction: output
+    - name: --verbose
+      type: integer
+      description: Log level. The higher, the more output.
+    - name: --error
+      type: file
+      description: File with error information.
+      direction: output
+    - name: --temp_dir
+      type: string
+      description: |
+        Directory for temporary files. If not set, the bash environmental variable TMPDIR is used.
+    - name: --compresslevel
+      type: integer
+      description: |
+        Level of Gzip compression to use. Default=6 matches GNU gzip rather than python gzip default (which is 9).
+        Default `6`.
+      example: 6
+    - name: --timeit
+      type: file
+      description: Store timing information in file.
+      direction: output
+    - name: --timeit_name
+      type: string
+      description: Name in timing file for this class of jobs.
+      default: all
+    - name: --timeit_header
+      type: boolean_true
+      description: Add header for timing information.
+    - name: --random_seed
+      type: integer
+      description: Random seed to initialize number generator with.
+  
+resources:
+  - type: bash_script
+    path: script.sh
+test_resources:
+  - type: bash_script
+    path: test.sh
+  - type: file
+    path: test_data
+engines:
+  - type: docker
+    image: quay.io/biocontainers/umi_tools:1.1.4--py310h4b81fae_2
+    setup:
+      - type: docker
+        run: |
+            umi_tools -v | sed 's/ version//g' > /var/software_versions.txt
+runners:
+- type: executable
+- type: nextflow
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_extract/help.txt b/src/umi_tools/umi_tools_extract/help.txt
new file mode 100644
index 00000000..46c77ed0
--- /dev/null
+++ b/src/umi_tools/umi_tools_extract/help.txt
@@ -0,0 +1,106 @@
+'''
+Generated from the following UMI-tools documentation:
+      https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options
+      https://umi-tools.readthedocs.io/en/latest/reference/extract.html
+'''
+
+extract - Extract UMI from fastq
+
+Usage:
+
+   Single-end:
+      umi_tools extract [OPTIONS] -p PATTERN [-I IN_FASTQ[.gz]] [-S OUT_FASTQ[.gz]]
+
+   Paired end:
+      umi_tools extract [OPTIONS] -p PATTERN [-I IN_FASTQ[.gz]] [-S OUT_FASTQ[.gz]] --read2-in=IN2_FASTQ[.gz] --read2-out=OUT2_FASTQ[.gz]
+
+   note: If -I/-S are ommited standard in and standard out are used
+         for input and output.  To generate a valid BAM file on
+         standard out, please redirect log with --log=LOGFILE or
+         --log2stderr. Input/Output will be (de)compressed if a
+         filename provided to -S/-I/--read2-in/read2-out ends in .gz
+
+Common UMI-tools Options:
+
+      -S, --stdout                  File where output is to go [default = stdout].
+      -L, --log                     File with logging information [default = stdout].
+      --log2stderr                  Send logging information to stderr [default = False].
+      -v, --verbose                 Log level. The higher, the more output [default = 1].
+      -E, --error                   File with error information [default = stderr].
+      --temp-dir                    Directory for temporary files. If not set, the bash environmental variable TMPDIR is used[default = None].
+      --compresslevel               Level of Gzip compression to use. Default=6 matches GNU gzip rather than python gzip default (which is 9)
+
+      profiling and debugging options:
+      --timeit                      Store timing information in file [default=none].
+      --timeit-name                 Name in timing file for this class of jobs [default=all].
+      --timeit-header               Add header for timing information [default=none].
+      --random-seed                 Random seed to initialize number generator with [default=none].
+
+Extract Options:
+      -I, --stdin                   File containing the input data [default = stdin].
+      --error-correct-cell          Error correct cell barcodes to the whitelist (see --whitelist)
+      --whitelist                   Whitelist of accepted cell barcodes. The whitelist should be in the following format (tab-separated):
+                                          AAAAAA    AGAAAA
+                                          AAAATC
+                                          AAACAT
+                                          AAACTA    AAACTN,GAACTA
+                                          AAATAC
+                                          AAATCA    GAATCA
+                                          AAATGT    AAAGGT,CAATGT
+                                    Where column 1 is the whitelisted cell barcodes and column 2 is the list (comma-separated) of other cell 
+                                    barcodes which should be corrected to the barcode in column 1. If the --error-correct-cell option is not 
+                                    used, this column will be ignored. Any additional columns in the whitelist input, such as the counts columns 
+                                    from the output of umi_tools whitelist, will be ignored.
+      --blacklist                   BlackWhitelist of cell barcodes to discard
+      --subset-reads=[N]            Only parse the first N reads
+      --quality-filter-threshold    Remove reads where any UMI base quality score falls below this threshold
+      --quality-filter-mask         If a UMI base has a quality below this threshold, replace the base with 'N'
+      --quality-encoding            Quality score encoding. Choose from:
+                                          'phred33' [33-77]
+                                          'phred64' [64-106]
+                                          'solexa' [59-106]
+      --reconcile-pairs             Allow read 2 infile to contain reads not in read 1 infile. This enables support for upstream protocols
+                                    where read one contains cell barcodes, and the read pairs have been filtered and corrected without regard
+                                    to the read2s.
+
+Experimental options:
+      Note: These options have not been extensively testing to ensure behaviour is as expected. If you have some suitable input files which
+            we can use for testing, please contact us.
+            If you have a library preparation method where the UMI may be in either read, you can use the following options to search for the
+            UMI in either read:
+
+                  --either-read --extract-method --bc-pattern=[PATTERN1] --bc-pattern2=[PATTERN2]
+
+            Where both patterns match, the default behaviour is to discard both reads. If you want to select the read with the UMI with highest
+            sequence quality, provide --either-read-resolve=quality.
+
+
+      --bc-pattern                  Pattern for barcode(s) on read 1. See --extract-method
+      --bc-pattern2                 Pattern for barcode(s) on read 2. See --extract-method
+      --extract-method              There are two methods enabled to extract the umi barcode (+/- cell barcode). For both methods, the patterns
+                                    should be provided using the --bc-pattern and --bc-pattern2 options.x   
+                                    string: 
+                                    This should be used where the barcodes are always in the same place in the read.
+                                          N = UMI position (required)
+                                          C = cell barcode position (optional)
+                                          X = sample position (optional)
+                                    Bases with Ns and Cs will be extracted and added to the read name. The corresponding sequence qualities will
+                                    be removed from the read. Bases with an X will be reattached to the read.
+                                    regex:
+                                    This method allows for more flexible barcode extraction and should be used where the cell barcodes are variable
+                                    in length. Alternatively, the regex option can also be used to filter out reads which do not contain an expected
+                                    adapter sequence. The regex must contain groups to define how the barcodes are encoded in the read. 
+                                    The expected groups in the regex are:
+                                          umi_n = UMI positions, where n can be any value (required) 
+                                          cell_n = cell barcode positions, where n can be any value (optional) 
+                                          discard_n = positions to discard, where n can be any value (optional)
+      --3prime                      By default the barcode is assumed to be on the 5' end of the read, but use this option to sepecify that it is
+                                    on the 3' end instead. This option only works with --extract-method=string since 3' encoding can be specified
+                                    explicitly with a regex, e.g .*(?P<umi_1>.{5})$
+      --read2-in                    Filename for read pairs
+      --filtered-out                Write out reads not matching regex pattern or cell barcode whitelist to this file
+      --filtered-out2               Write out read pairs not matching regex pattern or cell barcode whitelist to this file
+      --ignore-read-pair-suffixes   Ignore SOH and STX read name suffixes. Note that this options is required if the suffixes are not whitespace
+                                    separated from the rest of the read name
+
+For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_extract/script.sh b/src/umi_tools/umi_tools_extract/script.sh
new file mode 100644
index 00000000..5e41865d
--- /dev/null
+++ b/src/umi_tools/umi_tools_extract/script.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+
+set -exo pipefail
+
+test_dir="${metal_executable}/test_data"
+
+[[ "$par_error_correct_cell" == "false" ]] && unset par_error_correct_cell
+[[ "$par_reconcile_pairs" == "false" ]] && unset par_reconcile_pairs
+[[ "$par_three_prime" == "false" ]] && unset par_three_prime
+[[ "$par_ignore_read_pair_suffixes" == "false" ]] && unset par_ignore_read_pair_suffixes
+[[ "$par_timeit_header" == "false" ]] && unset par_timeit_header
+[[ "$par_log2stderr" == "false" ]] && unset par_log2stderr
+
+
+# Check if we have the correct number of input files and patterns for paired-end or single-end reads
+
+# For paired-end rends, check that we have two read files, two patterns
+# Check for paired-end inputs
+if [ -n "$par_input" ] && [ -n "$par_read2_in" ]; then
+    # Paired-end checks: Ensure both UMI patterns are provided
+    if [ -z "$par_bc_pattern" ] || [ -z "$par_bc_pattern2" ]; then
+        echo "Paired end input requires two UMI patterns."
+        exit 1
+    fi
+elif [ -n "$par_input" ]; then
+    # Single-end checks: Ensure no second read or UMI pattern for the second read is provided
+    if [ -n "$par_bc_pattern2" ]; then
+        echo "Single end input requires only one read file and one UMI pattern."
+        exit 1
+    fi
+    # Check that discard_read is not set or set to 0 for single-end reads
+    if [ -n "$par_umi_discard_read" ] && [ "$par_umi_discard_read" != 0 ]; then
+        echo "umi_discard_read is only valid when processing paired end reads."
+        exit 1
+    fi
+else
+    # No inputs provided
+    echo "No input files provided."
+    exit 1
+fi
+
+
+
+
+umi_tools extract \
+    -I "$par_input" \
+    ${par_read2_in:+ --read2-in "$par_read2_in"} \
+    -S "$par_output" \
+    ${par_read2_out:+--read2-out "$par_read2_out"} \
+    ${par_extract_method:+--extract-method "$par_extract_method"} \
+    --bc-pattern "$par_bc_pattern" \
+    ${par_bc_pattern2:+ --bc-pattern2 "$par_bc_pattern2"} \
+    ${par_umi_separator:+--umi-separator "$par_umi_separator"} \
+    ${par_output_stats:+--output-stats "$par_output_stats"} \
+    ${par_error_correct_cell:+--error-correct-cell} \
+    ${par_whitelist:+--whitelist "$par_whitelist"} \
+    ${par_blacklist:+--blacklist "$par_blacklist"} \
+    ${par_subset_reads:+--subset-reads "$par_subset_reads"} \
+    ${par_quality_filter_threshold:+--quality-filter-threshold "$par_quality_filter_threshold"} \
+    ${par_quality_filter_mask:+--quality-filter-mask "$par_quality_filter_mask"} \
+    ${par_quality_encoding:+--quality-encoding "$par_quality_encoding"} \
+    ${par_reconcile_pairs:+--reconcile-pairs} \
+    ${par_three_prime:+--3prime} \
+    ${par_filtered_out:+--filtered-out "$par_filtered_out"} \
+    ${par_filtered_out2:+--filtered-out2 "$par_filtered_out2"} \
+    ${par_ignore_read_pair_suffixes:+--ignore-read-pair-suffixes} \
+    ${par_random_seed:+--random-seed "$par_random_seed"} \
+    ${par_temp_dir:+--temp-dir "$par_temp_dir"} \
+    ${par_compresslevel:+--compresslevel "$par_compresslevel"} \
+    ${par_timeit:+--timeit "$par_timeit"} \
+    ${par_timeit_name:+--timeit-name "$par_timeit_name"} \
+    ${par_timeit_header:+--timeit-header} \
+    ${par_log:+--log "$par_log"} \
+    ${par_log2stderr:+--log2stderr} \
+    ${par_verbose:+--verbose "$par_verbose"} \
+    ${par_error:+--error "$par_error"}
+
+
+if [ "$par_umi_discard_read" == 1 ]; then
+    # discard read 1
+    rm "$par_read1_out"
+elif [ "$par_umi_discard_read" == 2 ]; then
+    # discard read 2 (-f to bypass file existence check)
+    rm -f "$par_read2_out"
+fi
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_extract/test.sh b/src/umi_tools/umi_tools_extract/test.sh
new file mode 100644
index 00000000..0de5d5b3
--- /dev/null
+++ b/src/umi_tools/umi_tools_extract/test.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+test_dir="${meta_resources_dir}/test_data"
+
+echo ">>> Testing $meta_functionality_name"
+
+############################################################################################################
+
+echo ">>> Test 1: Testing for paired-end reads"
+"$meta_executable" \
+    --input "$test_dir/scrb_seq_fastq.1_30"\
+    --read2_in "$test_dir/scrb_seq_fastq.2_30" \
+    --bc_pattern "CCCCCCNNNNNNNNNN"\
+    --bc_pattern2 "CCCCCCNNNNNNNNNN" \
+    --extract_method string \
+    --umi_separator '_' \
+    --grouping_method directional \
+    --umi_discard_read 0 \
+    --output scrb_seq_fastq.1_30.extract \
+    --read2_out scrb_seq_fastq.2_30.extract \
+    --random_seed 1
+
+echo ">> Checking if the correct files are present"
+[[ ! -f "scrb_seq_fastq.1_30.extract" ]] || [[ ! -f "scrb_seq_fastq.2_30.extract" ]] && echo "Reads file missing" && exit 1
+[ ! -s "scrb_seq_fastq.1_30.extract" ] && echo "Read 1 file is empty" && exit 1
+[ ! -s "scrb_seq_fastq.2_30.extract" ] && echo "Read 2 file is empty" && exit 1
+
+
+echo ">> Checking if the files are correct"
+diff -q "${meta_resources_dir}/scrb_seq_fastq.1_30.extract" "$test_dir/scrb_seq_fastq.1_30.extract" || \
+    (echo "Read 1 file is not correct" && exit 1)
+diff -q "${meta_resources_dir}/scrb_seq_fastq.2_30.extract" "$test_dir/scrb_seq_fastq.2_30.extract" || \
+    (echo "Read 2 file is not correct" && exit 1)
+
+rm scrb_seq_fastq.1_30.extract scrb_seq_fastq.2_30.extract
+
+############################################################################################################
+
+echo ">>> Test 2: Testing for paired-end reads with umi_discard_reads option"
+"$meta_executable" \
+    --input "$test_dir/scrb_seq_fastq.1_30" \
+    --read2_in "$test_dir/scrb_seq_fastq.2_30" \
+    --bc_pattern CCCCCCNNNNNNNNNN \
+    --bc_pattern2 CCCCCCNNNNNNNNNN \
+    --extract_method string \
+    --umi_separator '_' \
+    --grouping_method directional \
+    --umi_discard_read 2 \
+    --output scrb_seq_fastq.1_30.extract \
+    --random_seed 1
+
+echo ">> Checking if the correct files are present"
+[ ! -f "scrb_seq_fastq.1_30.extract" ] && echo "Read 1 file is missing" && exit 1
+[ ! -s "scrb_seq_fastq.1_30.extract" ] && echo "Read 1 file is empty" && exit 1
+[ -f "scrb_seq_fastq.2_30.extract" ] && echo "Read 2 is not discarded" && exit 1
+
+echo ">> Checking if the files are correct"
+diff -q "${meta_resources_dir}/scrb_seq_fastq.1_30.extract" "$test_dir/scrb_seq_fastq.1_30.extract" || \
+    (echo "Read 1 file is not correct" && exit 1)
+
+rm scrb_seq_fastq.1_30.extract
+
+############################################################################################################
+
+echo ">>> Test 3: Testing for single-end reads"
+"$meta_executable" \
+    --input "$test_dir/slim_30.fastq" \
+    --bc_pattern "^(?P<umi_1>.{3}).{4}(?P<umi_2>.{2})" \
+    --extract_method regex \
+    --umi_separator '_' \
+    --grouping_method directional \
+    --output slim_30.extract \
+    --random_seed 1 
+
+echo ">> Checking if the correct files are present"
+[ ! -f "slim_30.extract" ] && echo "Trimmed reads file missing" && exit 1
+[ ! -s "slim_30.extract" ] && echo "Trimmed reads file is empty" && exit 1
+
+echo ">> Checking if the files are correct"
+diff -q "${meta_resources_dir}/slim_30.extract" "$test_dir/slim_30.extract" || \
+    (echo "Trimmed reads file is not correct" && exit 1)
+
+rm slim_30.extract
+
+echo ">>> Test finished successfully"
+exit 0
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.1_30 b/src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.1_30
new file mode 100644
index 00000000..639f6243
--- /dev/null
+++ b/src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.1_30
@@ -0,0 +1,120 @@
+@SRR1058032.1 HISEQ:653:H12WDADXX:1:1101:1210:2217 length=17
+AATAACTTCCCGCGTCG
++SRR1058032.1 HISEQ:653:H12WDADXX:1:1101:1210:2217 length=17
+@@@DDDBDDF>FFHGIB
+@SRR1058032.2 HISEQ:653:H12WDADXX:1:1101:1191:2236 length=17
+AGCGGGGTGCTCGTCGT
++SRR1058032.2 HISEQ:653:H12WDADXX:1:1101:1191:2236 length=17
+CCCFFFFFHHHHHJJJJ
+@SRR1058032.3 HISEQ:653:H12WDADXX:1:1101:1715:2245 length=17
+CTTTAGTACCAGTCCTT
++SRR1058032.3 HISEQ:653:H12WDADXX:1:1101:1715:2245 length=17
+BBCFFDADHHHHHHIJJ
+@SRR1058032.4 HISEQ:653:H12WDADXX:1:1101:1905:2212 length=17
+AGGCGTTGTTTTTTTTT
++SRR1058032.4 HISEQ:653:H12WDADXX:1:1101:1905:2212 length=17
+CCCFFFFFHHHHHJJJJ
+@SRR1058032.5 HISEQ:653:H12WDADXX:1:1101:1927:2237 length=17
+ATCGAGACATAATTGAT
++SRR1058032.5 HISEQ:653:H12WDADXX:1:1101:1927:2237 length=17
+@B@FFFFFHHHHHJJJJ
+@SRR1058032.6 HISEQ:653:H12WDADXX:1:1101:1876:2243 length=17
+TGGGGGCGGTACATGAT
++SRR1058032.6 HISEQ:653:H12WDADXX:1:1101:1876:2243 length=17
+BBBFFFFFHHHHHJJJJ
+@SRR1058032.7 HISEQ:653:H12WDADXX:1:1101:2491:2207 length=17
+CTATATGTTTGCGCTGT
++SRR1058032.7 HISEQ:653:H12WDADXX:1:1101:2491:2207 length=17
+1=BDFFFFHHHHHJJJJ
+@SRR1058032.8 HISEQ:653:H12WDADXX:1:1101:2513:2219 length=17
+CTCCCGCATGCTGCTGT
++SRR1058032.8 HISEQ:653:H12WDADXX:1:1101:2513:2219 length=17
+?BBFFFFFHHHHHJJJJ
+@SRR1058032.9 HISEQ:653:H12WDADXX:1:1101:2604:2231 length=17
+GAGCCCTGAGGGGATCT
++SRR1058032.9 HISEQ:653:H12WDADXX:1:1101:2604:2231 length=17
+1??DDDFD>DFDGFGHG
+@SRR1058032.10 HISEQ:653:H12WDADXX:1:1101:2936:2218 length=17
+AGCGGGGTTCGCGGTTT
++SRR1058032.10 HISEQ:653:H12WDADXX:1:1101:2936:2218 length=17
+CCCFFFFFHHHHHJIJI
+@SRR1058032.11 HISEQ:653:H12WDADXX:1:1101:3447:2241 length=17
+AGAATTGCCTGGATTTT
++SRR1058032.11 HISEQ:653:H12WDADXX:1:1101:3447:2241 length=17
+@CCFFFFAFHHHGJJJJ
+@SRR1058032.12 HISEQ:653:H12WDADXX:1:1101:3620:2196 length=17
+AGGCGGGGCAACGGGTT
++SRR1058032.12 HISEQ:653:H12WDADXX:1:1101:3620:2196 length=17
+CCCFFFFFHHGHHJJHH
+@SRR1058032.13 HISEQ:653:H12WDADXX:1:1101:3875:2206 length=17
+GTCCCCGCGTCGTGTAG
++SRR1058032.13 HISEQ:653:H12WDADXX:1:1101:3875:2206 length=17
+@C@FFFFFHFFGHJJJJ
+@SRR1058032.14 HISEQ:653:H12WDADXX:1:1101:4131:2215 length=17
+CCACGCATTCACTCGGT
++SRR1058032.14 HISEQ:653:H12WDADXX:1:1101:4131:2215 length=17
+BBBDFFFFHHHHHJJJJ
+@SRR1058032.15 HISEQ:653:H12WDADXX:1:1101:4284:2241 length=17
+TGCGCAATAAGCGCTAT
++SRR1058032.15 HISEQ:653:H12WDADXX:1:1101:4284:2241 length=17
++:=DDDDDBHHGDIBEH
+@SRR1058032.16 HISEQ:653:H12WDADXX:1:1101:4599:2232 length=17
+CGCTGGCAGAGCCCGGT
++SRR1058032.16 HISEQ:653:H12WDADXX:1:1101:4599:2232 length=17
+@BCFFFFFHHHHHJJJJ
+@SRR1058032.17 HISEQ:653:H12WDADXX:1:1101:5428:2200 length=17
+AGGCGGTGCATAGTCTT
++SRR1058032.17 HISEQ:653:H12WDADXX:1:1101:5428:2200 length=17
+CCCFFFFFHHHHHIJIH
+@SRR1058032.18 HISEQ:653:H12WDADXX:1:1101:5336:2218 length=17
+GTCCCCCGCGTGTGACT
++SRR1058032.18 HISEQ:653:H12WDADXX:1:1101:5336:2218 length=17
+<BBFFFFFHHHHHIIJJ
+@SRR1058032.19 HISEQ:653:H12WDADXX:1:1101:5397:2220 length=17
+TATAGAAAAAACTTTTT
++SRR1058032.19 HISEQ:653:H12WDADXX:1:1101:5397:2220 length=17
+B@BFDDFFGHHFHIJIJ
+@SRR1058032.20 HISEQ:653:H12WDADXX:1:1101:5605:2194 length=17
+CATTATGGGCTTATTTT
++SRR1058032.20 HISEQ:653:H12WDADXX:1:1101:5605:2194 length=17
+BBBFFFFFHHHHHJJJJ
+@SRR1058032.21 HISEQ:653:H12WDADXX:1:1101:5519:2196 length=17
+AAATGTGCAGTTCAGAT
++SRR1058032.21 HISEQ:653:H12WDADXX:1:1101:5519:2196 length=17
+BCCFFFFFHHHHHJJJJ
+@SRR1058032.22 HISEQ:653:H12WDADXX:1:1101:5705:2220 length=17
+TGGGGGCTAAAGGGACT
++SRR1058032.22 HISEQ:653:H12WDADXX:1:1101:5705:2220 length=17
+BBBDFFFFHHHHHJIJI
+@SRR1058032.23 HISEQ:653:H12WDADXX:1:1101:5558:2236 length=17
+GATAATACTTACGGTGT
++SRR1058032.23 HISEQ:653:H12WDADXX:1:1101:5558:2236 length=17
+CCCFFFFFHHHHHJFHI
+@SRR1058032.24 HISEQ:653:H12WDADXX:1:1101:5649:2244 length=17
+CGTTAATAATTGTGGTT
++SRR1058032.24 HISEQ:653:H12WDADXX:1:1101:5649:2244 length=17
+BBBFFFFFHHHHHIIHG
+@SRR1058032.25 HISEQ:653:H12WDADXX:1:1101:5910:2207 length=17
+AAAAAAAAAAAAAAAAA
++SRR1058032.25 HISEQ:653:H12WDADXX:1:1101:5910:2207 length=17
+@CCFFFFFGHAA<:46'
+@SRR1058032.26 HISEQ:653:H12WDADXX:1:1101:5757:2217 length=17
+GCCGACCAACGATTTTT
++SRR1058032.26 HISEQ:653:H12WDADXX:1:1101:5757:2217 length=17
+:=?DD@?DH;AFBFDFF
+@SRR1058032.27 HISEQ:653:H12WDADXX:1:1101:5790:2248 length=17
+AATCAAGACCACTGAAT
++SRR1058032.27 HISEQ:653:H12WDADXX:1:1101:5790:2248 length=17
+@CCFFFFFHHHHHJJJI
+@SRR1058032.28 HISEQ:653:H12WDADXX:1:1101:6079:2195 length=17
+CGCGCTTTTGTTTTTTT
++SRR1058032.28 HISEQ:653:H12WDADXX:1:1101:6079:2195 length=17
+BB@FFFFFHHHHHJJJJ
+@SRR1058032.29 HISEQ:653:H12WDADXX:1:1101:6133:2213 length=17
+AAATACTTTGAGGGAAT
++SRR1058032.29 HISEQ:653:H12WDADXX:1:1101:6133:2213 length=17
+@CCFFEFFHHFHGJJII
+@SRR1058032.30 HISEQ:653:H12WDADXX:1:1101:6651:2198 length=17
+AGCGGGGTTTTATCGGT
++SRR1058032.30 HISEQ:653:H12WDADXX:1:1101:6651:2198 length=17
+CCCFFFFDHHHHHHJJJ
diff --git a/src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.1_30.extract b/src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.1_30.extract
new file mode 100644
index 00000000..eda8b17e
--- /dev/null
+++ b/src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.1_30.extract
@@ -0,0 +1,120 @@
+@SRR1058032.1_AATAACCCTACA_TTCCCGCGTCCTCTTTCCCT HISEQ:653:H12WDADXX:1:1101:1210:2217 length=17
+G
++
+B
+@SRR1058032.2_AGCGGGACGCTA_GTGCTCGTCGTACTCTTTCC HISEQ:653:H12WDADXX:1:1101:1191:2236 length=17
+T
++
+J
+@SRR1058032.3_CTTTAGACGCTA_TACCAGTCCTCACTCTTTCC HISEQ:653:H12WDADXX:1:1101:1715:2245 length=17
+T
++
+J
+@SRR1058032.4_AGGCGTACTTTA_TGTTTTTTTTCACTCTCTCC HISEQ:653:H12WDADXX:1:1101:1905:2212 length=17
+T
++
+J
+@SRR1058032.5_ATCGAGGTGTAG_ACATAATTGAGGAAAGAGTG HISEQ:653:H12WDADXX:1:1101:1927:2237 length=17
+T
++
+J
+@SRR1058032.6_TGGGGGCCTATA_CGGTACATGATAGTATAGCT HISEQ:653:H12WDADXX:1:1101:1876:2243 length=17
+T
++
+J
+@SRR1058032.7_CTATATATTAAA_GTTTGCGCTGGACAAACTAC HISEQ:653:H12WDADXX:1:1101:2491:2207 length=17
+T
++
+J
+@SRR1058032.8_CTCCCGGCCTAG_CATGCTGCTGTTGTGAACCA HISEQ:653:H12WDADXX:1:1101:2513:2219 length=17
+T
++
+J
+@SRR1058032.9_GAGCCCCCCTTC_TGAGGGGATCACGACGCTAC HISEQ:653:H12WDADXX:1:1101:2604:2231 length=17
+T
++
+G
+@SRR1058032.10_AGCGGGGGGAAA_GTTCGCGGTTGAGTGTGTCG HISEQ:653:H12WDADXX:1:1101:2936:2218 length=17
+T
++
+I
+@SRR1058032.11_AGAATTCCCACA_GCCTGGATTTCTCTTTCCCT HISEQ:653:H12WDADXX:1:1101:3447:2241 length=17
+T
++
+J
+@SRR1058032.12_AGGCGGGTGTAT_GGCAACGGGTGGAAAGAGTG HISEQ:653:H12WDADXX:1:1101:3620:2196 length=17
+T
++
+H
+@SRR1058032.13_GTCCCCCTCTTT_GCGTCGTGTACCCTACACTC HISEQ:653:H12WDADXX:1:1101:3875:2206 length=17
+G
++
+J
+@SRR1058032.14_CCACGCGTGTAG_ATTCACTCGGCGTCGTGTAG HISEQ:653:H12WDADXX:1:1101:4131:2215 length=17
+T
++
+J
+@SRR1058032.15_TGCGCAGTGTAT_ATAAGCGCTAGGAAAGAGTG HISEQ:653:H12WDADXX:1:1101:4284:2241 length=17
+T
++
+H
+@SRR1058032.16_CGCTGGACTCTT_CAGAGCCCGGTCCCTACACT HISEQ:653:H12WDADXX:1:1101:4599:2232 length=17
+T
++
+J
+@SRR1058032.17_AGGCGGGATTCT_TGCATAGTCTTCAAATGAGG HISEQ:653:H12WDADXX:1:1101:5428:2200 length=17
+T
++
+H
+@SRR1058032.18_GTCCCCGCGTCG_CGCGTGTGACTGTAGGGAAA HISEQ:653:H12WDADXX:1:1101:5336:2218 length=17
+T
++
+J
+@SRR1058032.19_TATAGACCATCA_AAAAACTTTTCGCCTGCCCT HISEQ:653:H12WDADXX:1:1101:5397:2220 length=17
+T
++
+J
+@SRR1058032.20_CATTATTTAATG_GGGCTTATTTGACTGTTTCA HISEQ:653:H12WDADXX:1:1101:5605:2194 length=17
+T
++
+J
+@SRR1058032.21_AAATGTTATCTA_GCAGTTCAGAGACTGCTCGT HISEQ:653:H12WDADXX:1:1101:5519:2196 length=17
+T
++
+J
+@SRR1058032.22_TGGGGGACTGTT_CTAAAGGGACCTTTAACCAA HISEQ:653:H12WDADXX:1:1101:5705:2220 length=17
+T
++
+I
+@SRR1058032.23_GATAATTTCCAT_ACTTACGGTGACACTCTTTC HISEQ:653:H12WDADXX:1:1101:5558:2236 length=17
+T
++
+I
+@SRR1058032.24_CGTTAAAGACGG_TAATTGTGGTACCAGAGCGA HISEQ:653:H12WDADXX:1:1101:5649:2244 length=17
+T
++
+G
+@SRR1058032.25_AAAAAAGAGTAT_AAAAAAAAAAAGGGAAAGAG HISEQ:653:H12WDADXX:1:1101:5910:2207 length=17
+A
++
+'
+@SRR1058032.26_GCCGACCCTTTT_CAACGATTTTATACAATACA HISEQ:653:H12WDADXX:1:1101:5757:2217 length=17
+T
++
+F
+@SRR1058032.27_AATCAAATCACA_GACCACTGAAGCTGGAGAGA HISEQ:653:H12WDADXX:1:1101:5790:2248 length=17
+T
++
+I
+@SRR1058032.28_CGCGCTGTACTA_TTTGTTTTTTGGCATCGTCA HISEQ:653:H12WDADXX:1:1101:6079:2195 length=17
+T
++
+J
+@SRR1058032.29_AAATACCCAATA_TTTGAGGGAAACTTGACCAA HISEQ:653:H12WDADXX:1:1101:6133:2213 length=17
+T
++
+I
+@SRR1058032.30_AGCGGGGAGTGT_GTTTTATCGGACACTCTTTC HISEQ:653:H12WDADXX:1:1101:6651:2198 length=17
+T
++
+J
diff --git a/src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.2_30 b/src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.2_30
new file mode 100644
index 00000000..8dbf0725
--- /dev/null
+++ b/src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.2_30
@@ -0,0 +1,120 @@
+@SRR1058032.1 HISEQ:653:H12WDADXX:1:1101:1210:2217 length=34
+CCTACACTCTTTCCCTACACGACGCTACACTCTN
++SRR1058032.1 HISEQ:653:H12WDADXX:1:1101:1210:2217 length=34
+@@@DFEDABD?A?ABGHGGGIGGEGIIIJJJFI#
+@SRR1058032.2 HISEQ:653:H12WDADXX:1:1101:1191:2236 length=34
+ACGCTATACTCTTTCCCTACACGACGCTACACTN
++SRR1058032.2 HISEQ:653:H12WDADXX:1:1101:1191:2236 length=34
+CCCDFFFFHHHHHJJJJGGICGE6FDH<?F<F<#
+@SRR1058032.3 HISEQ:653:H12WDADXX:1:1101:1715:2245 length=34
+ACGCTACACTCTTTCCCTACACGACGCTACACTN
++SRR1058032.3 HISEQ:653:H12WDADXX:1:1101:1715:2245 length=34
+C@CFFFFFGHHGAEHIIEIGIIAGFHIFG@FBE#
+@SRR1058032.4 HISEQ:653:H12WDADXX:1:1101:1905:2212 length=34
+ACTTTACACTCTCTCCCTACACGACGCTACACTN
++SRR1058032.4 HISEQ:653:H12WDADXX:1:1101:1905:2212 length=34
+??;==DBDD?F:D<EGH<HGHIF>GEGCDG9FD#
+@SRR1058032.5 HISEQ:653:H12WDADXX:1:1101:1927:2237 length=34
+GTGTAGGGAAAGAGTGTAAGGAAAGAGTGTAGCN
++SRR1058032.5 HISEQ:653:H12WDADXX:1:1101:1927:2237 length=34
+?=??B?DB2ACCAEAEFHHIHHHIHFHCEHHIG#
+@SRR1058032.6 HISEQ:653:H12WDADXX:1:1101:1876:2243 length=34
+CCTATATAGTATAGCTTCCCATCTTCTTTGAGAN
++SRR1058032.6 HISEQ:653:H12WDADXX:1:1101:1876:2243 length=34
+CCCFFFFFHDHBHEIIJJJJIIIJJJGGGIGIE#
+@SRR1058032.7 HISEQ:653:H12WDADXX:1:1101:2491:2207 length=34
+ATTAAAGACAAACTACAACTCATATGAGGCATTN
++SRR1058032.7 HISEQ:653:H12WDADXX:1:1101:2491:2207 length=34
+@@@DDDADDHHHFBFAHIGBHH<H<BHDFGIIG#
+@SRR1058032.8 HISEQ:653:H12WDADXX:1:1101:2513:2219 length=34
+GCCTAGTTGTGAACCAAATGTGAAAAAACCTCCN
++SRR1058032.8 HISEQ:653:H12WDADXX:1:1101:2513:2219 length=34
+@@@FFDDDFFFFFIIGHIFI<HHEHCEBEFEED#
+@SRR1058032.9 HISEQ:653:H12WDADXX:1:1101:2604:2231 length=34
+CCCTTCACGACGCTACACTCTTTCCCTACACGAN
++SRR1058032.9 HISEQ:653:H12WDADXX:1:1101:2604:2231 length=34
+C@CFFFFFHHHHHJJJIJJJJIJJJIGBHBFG:#
+@SRR1058032.10 HISEQ:653:H12WDADXX:1:1101:2936:2218 length=34
+GGGAAAGAGTGTGTCGTGTATGGAAAGAGTGTAN
++SRR1058032.10 HISEQ:653:H12WDADXX:1:1101:2936:2218 length=34
+CCCFFFFDD>FAH;E@@?AB>F@BF3;3?1C?<#
+@SRR1058032.11 HISEQ:653:H12WDADXX:1:1101:3447:2241 length=34
+CCCACACTCTTTCCCTACACGACGCTACACTCTN
++SRR1058032.11 HISEQ:653:H12WDADXX:1:1101:3447:2241 length=34
+@@@DDFDDBHBFHGI<F@GFBFEE>)C:D@@@B#
+@SRR1058032.12 HISEQ:653:H12WDADXX:1:1101:3620:2196 length=34
+GTGTATGGAAAGAGTGTAGGGAAAGAGTGTAGGN
++SRR1058032.12 HISEQ:653:H12WDADXX:1:1101:3620:2196 length=34
+@@@DDDDAHHHFHIABEEEAB??CFBF?C@BFF#
+@SRR1058032.13 HISEQ:653:H12WDADXX:1:1101:3875:2206 length=34
+CTCTTTCCCTACACTCTTTCCCTACACGACGCTN
++SRR1058032.13 HISEQ:653:H12WDADXX:1:1101:3875:2206 length=34
+@@@DDDAAADHDHDGDGIIIIIJJJJJJIJIIJ#
+@SRR1058032.14 HISEQ:653:H12WDADXX:1:1101:4131:2215 length=34
+GTGTAGCGTCGTGTAGGGAAAGAGTGTGTGGAAN
++SRR1058032.14 HISEQ:653:H12WDADXX:1:1101:4131:2215 length=34
+@@@DDDDD?DFDCAEFHIGGFHEH:D1C:CG@F#
+@SRR1058032.15 HISEQ:653:H12WDADXX:1:1101:4284:2241 length=34
+GTGTATGGAAAGAGTGTGCGTCGTACGTGTAGAN
++SRR1058032.15 HISEQ:653:H12WDADXX:1:1101:4284:2241 length=34
+@?@DDFFFHHHHGDAC:CHGGIIGIIIFHFGHB#
+@SRR1058032.16 HISEQ:653:H12WDADXX:1:1101:4599:2232 length=34
+ACTCTTTCCCTACACTCTTTCCCTACACGACGCN
++SRR1058032.16 HISEQ:653:H12WDADXX:1:1101:4599:2232 length=34
+@@<DAAAA?>BCBE@9;EGGGGGIHJJIJHIGG#
+@SRR1058032.17 HISEQ:653:H12WDADXX:1:1101:5428:2200 length=34
+GATTCTTCAAATGAGGACTATGCGGGACATGAAN
++SRR1058032.17 HISEQ:653:H12WDADXX:1:1101:5428:2200 length=34
+@@@DDDDDFHHFAHB;FHIIIIIIIIFHEHIHI#
+@SRR1058032.18 HISEQ:653:H12WDADXX:1:1101:5336:2218 length=34
+GCGTCGTGTAGGGAAAGAGTGTAGCGTCGTGTAN
++SRR1058032.18 HISEQ:653:H12WDADXX:1:1101:5336:2218 length=34
+@@@DDDDD<FFD?GIIDGF+<<CBAFCGE@FB@#
+@SRR1058032.19 HISEQ:653:H12WDADXX:1:1101:5397:2220 length=34
+CCATCACGCCTGCCCTTCCTTGAAATTACACCTN
++SRR1058032.19 HISEQ:653:H12WDADXX:1:1101:5397:2220 length=34
+;===AAA<@A72??A+22<+,+<+@+++*:***#
+@SRR1058032.20 HISEQ:653:H12WDADXX:1:1101:5605:2194 length=34
+TTAATGGACTGTTTCAGGTAAAAGAGAATGAATN
++SRR1058032.20 HISEQ:653:H12WDADXX:1:1101:5605:2194 length=34
+CCCFFDAEHHHHDEHIGCEIIJJIGIJGIGGHE#
+@SRR1058032.21 HISEQ:653:H12WDADXX:1:1101:5519:2196 length=34
+TATCTAGACTGCTCGTCATTTAGAAGACACGTCN
++SRR1058032.21 HISEQ:653:H12WDADXX:1:1101:5519:2196 length=34
+@B@FDDFFHFHBHEIIGIIJJGHGHIIIGIGII#
+@SRR1058032.22 HISEQ:653:H12WDADXX:1:1101:5705:2220 length=34
+ACTGTTCTTTAACCAAACATCCGTGCGATTCGTN
++SRR1058032.22 HISEQ:653:H12WDADXX:1:1101:5705:2220 length=34
+CCCFFFFFHHHHHJJJJGHIJJIGIIIBEFG?G#
+@SRR1058032.23 HISEQ:653:H12WDADXX:1:1101:5558:2236 length=34
+TTCCATACACTCTTTCCCTACACGACGCACACTN
++SRR1058032.23 HISEQ:653:H12WDADXX:1:1101:5558:2236 length=34
+@@@DFBEFHFFD<A<CD>BHEGGFGHGGIEGII#
+@SRR1058032.24 HISEQ:653:H12WDADXX:1:1101:5649:2244 length=34
+AGACGGACCAGAGCGAAAGCATTTGCCAAGAATN
++SRR1058032.24 HISEQ:653:H12WDADXX:1:1101:5649:2244 length=34
+CCCFFFDFGHHHGJIIJJIJHEDD919CGGHJ@#
+@SRR1058032.25 HISEQ:653:H12WDADXX:1:1101:5910:2207 length=34
+GAGTATAGGGAAAGAGTTTTTTTTTTTTTTTTTN
++SRR1058032.25 HISEQ:653:H12WDADXX:1:1101:5910:2207 length=34
+?=?DDDD>AB:ACEEGHIJJIJJJJIIJJHFDD#
+@SRR1058032.26 HISEQ:653:H12WDADXX:1:1101:5757:2217 length=34
+CCTTTTATACAATACAAAGCTTTGCTTTTTTTTN
++SRR1058032.26 HISEQ:653:H12WDADXX:1:1101:5757:2217 length=34
+???DDDDDDDDD4EEEII@A<:33<33,22110#
+@SRR1058032.27 HISEQ:653:H12WDADXX:1:1101:5790:2248 length=34
+ATCACAGCTGGAGAGATCTTGATCTTCATGGTGN
++SRR1058032.27 HISEQ:653:H12WDADXX:1:1101:5790:2248 length=34
+CCCFFFFFHHFHGGIIIIJIEAHCEHHEFECGD#
+@SRR1058032.28 HISEQ:653:H12WDADXX:1:1101:6079:2195 length=34
+GTACTAGGCATCGTCATCCAATGCGACGAGTCCN
++SRR1058032.28 HISEQ:653:H12WDADXX:1:1101:6079:2195 length=34
+@@CFFDDFHHGHHIJJJIJJJIGGHIDG<GFHG#
+@SRR1058032.29 HISEQ:653:H12WDADXX:1:1101:6133:2213 length=34
+CCAATAACTTGACCAACGGAACAAGTTACCCTAN
++SRR1058032.29 HISEQ:653:H12WDADXX:1:1101:6133:2213 length=34
+@CCFFFFFHHGHHIJJJJIJIIIIIIIIIJIJI#
+@SRR1058032.30 HISEQ:653:H12WDADXX:1:1101:6651:2198 length=34
+GAGTGTACACTCTTTCCCTACACGACGTTACACN
++SRR1058032.30 HISEQ:653:H12WDADXX:1:1101:6651:2198 length=34
+???A:2ABDBDDDBEEIIA:F:CC8F<))1:??#
diff --git a/src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.2_30.extract b/src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.2_30.extract
new file mode 100644
index 00000000..572c8094
--- /dev/null
+++ b/src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.2_30.extract
@@ -0,0 +1,120 @@
+@SRR1058032.1_AATAACCCTACA_TTCCCGCGTCCTCTTTCCCT HISEQ:653:H12WDADXX:1:1101:1210:2217 length=34
+ACACGACGCTACACTCTN
++
+HGGGIGGEGIIIJJJFI#
+@SRR1058032.2_AGCGGGACGCTA_GTGCTCGTCGTACTCTTTCC HISEQ:653:H12WDADXX:1:1101:1191:2236 length=34
+CTACACGACGCTACACTN
++
+JGGICGE6FDH<?F<F<#
+@SRR1058032.3_CTTTAGACGCTA_TACCAGTCCTCACTCTTTCC HISEQ:653:H12WDADXX:1:1101:1715:2245 length=34
+CTACACGACGCTACACTN
++
+IEIGIIAGFHIFG@FBE#
+@SRR1058032.4_AGGCGTACTTTA_TGTTTTTTTTCACTCTCTCC HISEQ:653:H12WDADXX:1:1101:1905:2212 length=34
+CTACACGACGCTACACTN
++
+H<HGHIF>GEGCDG9FD#
+@SRR1058032.5_ATCGAGGTGTAG_ACATAATTGAGGAAAGAGTG HISEQ:653:H12WDADXX:1:1101:1927:2237 length=34
+TAAGGAAAGAGTGTAGCN
++
+FHHIHHHIHFHCEHHIG#
+@SRR1058032.6_TGGGGGCCTATA_CGGTACATGATAGTATAGCT HISEQ:653:H12WDADXX:1:1101:1876:2243 length=34
+TCCCATCTTCTTTGAGAN
++
+JJJJIIIJJJGGGIGIE#
+@SRR1058032.7_CTATATATTAAA_GTTTGCGCTGGACAAACTAC HISEQ:653:H12WDADXX:1:1101:2491:2207 length=34
+AACTCATATGAGGCATTN
++
+HIGBHH<H<BHDFGIIG#
+@SRR1058032.8_CTCCCGGCCTAG_CATGCTGCTGTTGTGAACCA HISEQ:653:H12WDADXX:1:1101:2513:2219 length=34
+AATGTGAAAAAACCTCCN
++
+HIFI<HHEHCEBEFEED#
+@SRR1058032.9_GAGCCCCCCTTC_TGAGGGGATCACGACGCTAC HISEQ:653:H12WDADXX:1:1101:2604:2231 length=34
+ACTCTTTCCCTACACGAN
++
+IJJJJIJJJIGBHBFG:#
+@SRR1058032.10_AGCGGGGGGAAA_GTTCGCGGTTGAGTGTGTCG HISEQ:653:H12WDADXX:1:1101:2936:2218 length=34
+TGTATGGAAAGAGTGTAN
++
+@?AB>F@BF3;3?1C?<#
+@SRR1058032.11_AGAATTCCCACA_GCCTGGATTTCTCTTTCCCT HISEQ:653:H12WDADXX:1:1101:3447:2241 length=34
+ACACGACGCTACACTCTN
++
+F@GFBFEE>)C:D@@@B#
+@SRR1058032.12_AGGCGGGTGTAT_GGCAACGGGTGGAAAGAGTG HISEQ:653:H12WDADXX:1:1101:3620:2196 length=34
+TAGGGAAAGAGTGTAGGN
++
+EEEAB??CFBF?C@BFF#
+@SRR1058032.13_GTCCCCCTCTTT_GCGTCGTGTACCCTACACTC HISEQ:653:H12WDADXX:1:1101:3875:2206 length=34
+TTTCCCTACACGACGCTN
++
+GIIIIIJJJJJJIJIIJ#
+@SRR1058032.14_CCACGCGTGTAG_ATTCACTCGGCGTCGTGTAG HISEQ:653:H12WDADXX:1:1101:4131:2215 length=34
+GGAAAGAGTGTGTGGAAN
++
+HIGGFHEH:D1C:CG@F#
+@SRR1058032.15_TGCGCAGTGTAT_ATAAGCGCTAGGAAAGAGTG HISEQ:653:H12WDADXX:1:1101:4284:2241 length=34
+TGCGTCGTACGTGTAGAN
++
+:CHGGIIGIIIFHFGHB#
+@SRR1058032.16_CGCTGGACTCTT_CAGAGCCCGGTCCCTACACT HISEQ:653:H12WDADXX:1:1101:4599:2232 length=34
+CTTTCCCTACACGACGCN
++
+;EGGGGGIHJJIJHIGG#
+@SRR1058032.17_AGGCGGGATTCT_TGCATAGTCTTCAAATGAGG HISEQ:653:H12WDADXX:1:1101:5428:2200 length=34
+ACTATGCGGGACATGAAN
++
+FHIIIIIIIIFHEHIHI#
+@SRR1058032.18_GTCCCCGCGTCG_CGCGTGTGACTGTAGGGAAA HISEQ:653:H12WDADXX:1:1101:5336:2218 length=34
+GAGTGTAGCGTCGTGTAN
++
+DGF+<<CBAFCGE@FB@#
+@SRR1058032.19_TATAGACCATCA_AAAAACTTTTCGCCTGCCCT HISEQ:653:H12WDADXX:1:1101:5397:2220 length=34
+TCCTTGAAATTACACCTN
++
+22<+,+<+@+++*:***#
+@SRR1058032.20_CATTATTTAATG_GGGCTTATTTGACTGTTTCA HISEQ:653:H12WDADXX:1:1101:5605:2194 length=34
+GGTAAAAGAGAATGAATN
++
+GCEIIJJIGIJGIGGHE#
+@SRR1058032.21_AAATGTTATCTA_GCAGTTCAGAGACTGCTCGT HISEQ:653:H12WDADXX:1:1101:5519:2196 length=34
+CATTTAGAAGACACGTCN
++
+GIIJJGHGHIIIGIGII#
+@SRR1058032.22_TGGGGGACTGTT_CTAAAGGGACCTTTAACCAA HISEQ:653:H12WDADXX:1:1101:5705:2220 length=34
+ACATCCGTGCGATTCGTN
++
+JGHIJJIGIIIBEFG?G#
+@SRR1058032.23_GATAATTTCCAT_ACTTACGGTGACACTCTTTC HISEQ:653:H12WDADXX:1:1101:5558:2236 length=34
+CCTACACGACGCACACTN
++
+D>BHEGGFGHGGIEGII#
+@SRR1058032.24_CGTTAAAGACGG_TAATTGTGGTACCAGAGCGA HISEQ:653:H12WDADXX:1:1101:5649:2244 length=34
+AAGCATTTGCCAAGAATN
++
+JJIJHEDD919CGGHJ@#
+@SRR1058032.25_AAAAAAGAGTAT_AAAAAAAAAAAGGGAAAGAG HISEQ:653:H12WDADXX:1:1101:5910:2207 length=34
+TTTTTTTTTTTTTTTTTN
++
+HIJJIJJJJIIJJHFDD#
+@SRR1058032.26_GCCGACCCTTTT_CAACGATTTTATACAATACA HISEQ:653:H12WDADXX:1:1101:5757:2217 length=34
+AAGCTTTGCTTTTTTTTN
++
+II@A<:33<33,22110#
+@SRR1058032.27_AATCAAATCACA_GACCACTGAAGCTGGAGAGA HISEQ:653:H12WDADXX:1:1101:5790:2248 length=34
+TCTTGATCTTCATGGTGN
++
+IIJIEAHCEHHEFECGD#
+@SRR1058032.28_CGCGCTGTACTA_TTTGTTTTTTGGCATCGTCA HISEQ:653:H12WDADXX:1:1101:6079:2195 length=34
+TCCAATGCGACGAGTCCN
++
+JIJJJIGGHIDG<GFHG#
+@SRR1058032.29_AAATACCCAATA_TTTGAGGGAAACTTGACCAA HISEQ:653:H12WDADXX:1:1101:6133:2213 length=34
+CGGAACAAGTTACCCTAN
++
+JJIJIIIIIIIIIJIJI#
+@SRR1058032.30_AGCGGGGAGTGT_GTTTTATCGGACACTCTTTC HISEQ:653:H12WDADXX:1:1101:6651:2198 length=34
+CCTACACGACGTTACACN
++
+IIA:F:CC8F<))1:??#
diff --git a/src/umi_tools/umi_tools_extract/test_data/script.sh b/src/umi_tools/umi_tools_extract/test_data/script.sh
new file mode 100755
index 00000000..9463d177
--- /dev/null
+++ b/src/umi_tools/umi_tools_extract/test_data/script.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# Download test data
+wget https://github.com/CGATOxford/UMI-tools/raw/master/tests/slim.fastq.gz
+wget https://github.com/CGATOxford/UMI-tools/raw/master/tests/scrb_seq_fastq.1.gz
+wget https://github.com/CGATOxford/UMI-tools/raw/master/tests/scrb_seq_fastq.2.gz
+
+gunzip -f slim.fastq.gz scrb_seq_fastq.1.gz scrb_seq_fastq.2.gz
+
+# smaller datasets
+head -n 120 slim.fastq > slim_30.fastq
+head -n 120 scrb_seq_fastq.1 > scrb_seq_fastq.1_30
+head -n 120 scrb_seq_fastq.2 > scrb_seq_fastq.2_30
+rm slim.fastq scrb_seq_fastq.1 scrb_seq_fastq.2
+
+# Generate expected output
+# Test 1 and 2
+umi_tools extract \
+    --stdin "scrb_seq_fastq.1_30" \
+    --read2-in "scrb_seq_fastq.2_30" \
+    --bc-pattern "CCCCCCNNNNNNNNNN" \
+    --bc-pattern2 "CCCCCCNNNNNNNNNN" \
+    --extract-method string \
+    --stdout scrb_seq_fastq.1_30.extract \
+    --read2-out scrb_seq_fastq.2_30.extract \
+    --random-seed 1
+
+# Test 3
+umi_tools extract \
+    --stdin "slim_30.fastq" \
+    --bc-pattern "^(?P<umi_1>.{3}).{4}(?P<umi_2>.{2})" \
+    --extract-method regex \
+    --stdout slim_30.extract \
+    --random-seed 1
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_extract/test_data/slim_30.extract b/src/umi_tools/umi_tools_extract/test_data/slim_30.extract
new file mode 100644
index 00000000..1c20f782
--- /dev/null
+++ b/src/umi_tools/umi_tools_extract/test_data/slim_30.extract
@@ -0,0 +1,120 @@
+@SRR2057595.7_CAGAA
+GTTCTCTCGGTGGGACCTC
++
+FFFFHHHJJJFGIJIJJIJ
+@SRR2057595.9_TTGAA
+GTTCTCTGATGCCCTCTTCTGGTGCATCTGAAGACAGCTACAGTGTACTTAGATATAATAAATAAATCTT
++
+FDBDFHHIGGEHJGGIHGHGGCAFCHGIGEHIJJJJIJJJIHIIIIIIJIIIIIGHIIGGIJGIIJIIJ@
+@SRR2057595.14_TGGAT
+GTTAGCGGCCCCGGGTTCCTCCCGGGGCTACGCCTGTCTGAGCGTCGCT
++
+FFFFHHHJJIJJJJIGHJJIIJJJJJIJHFHHFFEDEEEEDDDDBDDDD
+@SRR2057595.22_ACGAT
+GTTAGCGGCCCCGGGTTCCTCCCGGGGCTACGCCTGTCTGAGCGTCGC
++
+FFFFHHHJJJJJJJJIJJJJJJJJJJJJHHHFFFEDEEEEDDDDBDDD
+@SRR2057595.23_GCGTT
+GTTACCTAAGGCGAGCTCAGGGAGGACAGAAACCTCCCGTGGAGCAGAAGGGCAAAAGCTCGCTTGATCT
++
+FFFFHHHJJJJJJJJJJJJJJJIJJIIJJJJJJJJJJJJIJJHHHHHFFFFDDDDDDDDDDDDDDDDDDA
+@SRR2057595.29_ACGTT
+GTTCGCGGCCCCGGGTTCCTCCCGGGGCTACGCCTGTCTGAGCGTCGCTT
++
+FFFFHHHJJJJJJJJHIJJJJJJIJJJJHHHFFDEDEEDDCDDDBDDDDD
+@SRR2057595.30_GAGAA
+GTTGAATCCGTGCTAAGAAGAA
++
+DFFFHHHJJJJIJJJJJJJJJJ
+@SRR2057595.33_TCGAT
+GTTTCTCGTCTGATCTCGGAAGCTAAGCAGGGTCGGGCCTGGTTAGTACTTGGATGGGAGACCGCCTGGG
++
+FFFFHHHJJJJJJJJJJJJJJJJJJJJJJJJJDHIJJJJIJJJHGGEEHFFFFFFEDDEDDDDDDDDDDB
+@SRR2057595.35_ACGCT
+GTTACCCGGGGCTACGCCTGTCTGAGCGTCGCT
++
+DFFFHHHJJJJJJIJJJJJJIJJJJJJJHIIJJ
+@SRR2057595.38_GGGCC
+GTTATGCATGTTTATAGTTTCTAGTTTTGGCATTTTGTGTGGTCTCTTTTTTGTT
++
+DFFFHHHJJJJJJJJJJHJJIJJJIJJJJJJJJJJJJGIGHJHIJJIJJJJJJJJ
+@SRR2057595.42_TAGGA
+GTTGTAAGTTATACACTGACTAAGTCATCTGTTACTGCCTTCACTGAGTTTTTATTTCCTTT
++
+DFFFHHHJJJJJJJJJJJJJJJJJIIJJJJGJJJJJJJJJJJJJJJIIHIJJJJJJJIJJJI
+@SRR2057595.45_CTGGC
+GTTTTGCGGAAGGATCATTA
++
+DDDDFFDFFAGFE<EB8?BF
+@SRR2057595.46_CAGTT
+GTTTTGGCTTTTTTTTAAAACCATTTTGTGAAAGGTTTCTGAAACTTGATAATAAAAAGCAGTTGGTGTA
++
+DDDDHHFIGIJJJJJJJIIIJIIIJJJICHHIGIJFHHGHIEIHGHFHEDFFEFEFEEDEDD@CDD<@B:
+@SRR2057595.56_GGGCG
+GTTTATGAAGAACGCAGCTAGCTGCGAGAATTAATGTGAATTGCAGGACACATTGATCATCGACACTTCG
++
+FFFFHHHJJJJJJJJJJJJJJJJJJJIJJJJJIJJJJJJJJJJJJJJJJJJJHHHHHHFFFFFDDDDDDD
+@SRR2057595.59_GCGCC
+GTTATCCTGTCTTATCATTGTCTTTTGAGCCTGGGCCTTGCCAGGTAGCTCTAGACTGGCCTAGAACTCA
++
+FFFFHHHJJJJJJJJJJJJ4CHHJJJJJJJJJJJJJJJJJJJJIJDHIJJJIIJJJJJIJJJJHHHHHHB
+@SRR2057595.60_ATGCA
+GTTTTCTCGTCTGATCTCGGAAGCTAAGCAGGGCCGGGCCTGGTTAGTACTTGGATGGGAGACCGCC
++
+DDFDBBBFECFE@HHIBCBG<2CGEC49?1CBD)86:;AB=7C.=;=)77;A3;?C@;96=?@B8;?
+@SRR2057595.61_GAGAG
+GTTTCAGGACACATTGATCATCGACACTTCGAACGCACTTGCGGCCCCGGGTTCCTCCCGGGGCTACGCC
++
+DFFFHHHGGHGHIIJGEFEGGFH9GGIIFGGGGIFGDHBG@FGGGHEFCCB?@@CDCCD?B7>B@ACB9<
+@SRR2057595.65_GCGCG
+GTTTGAGCTTGCTCCGTCCACTCAACGCATCGACCTGGTATTGCAGTACCTCCAGGAACGGTGCACCAAG
++
+FFFFHHHJJJJJJHJIHHIIIIIIIJHJBHIHBFHHJI@EHJJHHHHHHHFFFBDE?AEBD=AB@CDBD?
+@SRR2057595.67_AAGGT
+GTTGTTTTGAGGTCCTGCTCGTGCAGGGT
++
+DDDFHHHHGFHGFGGIIDGHHIGIJJJJ9
+@SRR2057595.69_ATTAT
+GGTTTTTGTTTTTCCTCCTTCTCTTTCTAAA
++
+FFFFHHHHJJJJJJJJJJJJJJJJJJIJIJJ
+@SRR2057595.70_TTAAA
+GGTTTTGTAATTTTATGAGGTCCCATTTGTCAATTCTT
++
+DDDD2CDFA@FBGHCCHFHGBFHGHIGGDHGHIIFCFF
+@SRR2057595.71_TGCCA
+GGTTTATTAGCATGGCCCCTGCGCAAGGATGACACGCAAATTCGTGAAGCGTTCCATATTT
++
+FFFFHGHHJJJJJJJJJJIIJJIJIJJIFHJIIIJJJIJJJJJJHIIHHHHFFFDEECEEE
+@SRR2057595.73_TGACA
+GGTTGCGAGTGCCTAGTGGGCCACTTTTGGTAAGCAGAACTGGCGCTGCGGGA
++
+FFFFGFFHC@EBHGHGAEGIIHIIIIJJJJGHIIIJIJIIGHIJIJJIGGEFD
+@SRR2057595.74_AATTC
+GGTTTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
++
+FFFFDFFHFIJJJGGGGJJGDDDDDDDDDDDDDBDDDDDDBBDDDDDDDDDDDDDDDDDDDBBBDDDDBD>
+@SRR2057595.77_GCGGA
+GTTCTCCCACTTCTGAC
++
+FFFFDHHHIJJJIJJJJ
+@SRR2057595.82_GAGAC
+GGTTTTCCTCCCGGGGCTACGCCTGTCTGAGCGTCGCT
++
+FFFFHHHHJJJJJJJJJJJJJJJJIJJJJJIJIIJJJH
+@SRR2057595.83_TGGAT
+GTTGCCCGGGGCTACGCCTGTCTGAGCGTCGCT
++
+DFFFHHHJJIJJJJJJIJJJIJJIGGHIFHGEH
+@SRR2057595.86_ACCAC
+GGTTTTTTTTTAAATGTAAAGCATAAATAAAAAGCCTTTGTGGACTGTGAAAAAAAAAAAAAAAAAAAAAA
++
+FFFFHHHHJJJJJJJJIIJJJJJJJJJIJJJJJJJJJJJJGIJIIJJIJJJJJJHFDDDDDDDDDDDDDB>
+@SRR2057595.88_TCAGC
+GGTTCTAAGCATAGATAACCATATATCAGGGGGAGCTCCATGTTCTAGTCCTGCAAGCGCCTGGGCAATAA
++
+FFFFHHHHJJJJJJIJJJJJIJJJJJJIJJIJJIJJJJJJJJJHIJJJJJJIIIHJIHHHFFDDDDEDDD@
+@SRR2057595.99_TGACA
+GGTTTCGCTGCGATCTATTGAAAGTCAGCCCTCGACACAAGGGTTTGT
++
+FFFFDHHHIHIIIJJIJJJJJIGEHGFHIJJGHIHADHIIJIJJJIJG
diff --git a/src/umi_tools/umi_tools_extract/test_data/slim_30.fastq b/src/umi_tools/umi_tools_extract/test_data/slim_30.fastq
new file mode 100644
index 00000000..444a7a7a
--- /dev/null
+++ b/src/umi_tools/umi_tools_extract/test_data/slim_30.fastq
@@ -0,0 +1,120 @@
+@SRR2057595.7
+CAGGTTCAATCTCGGTGGGACCTC
++SRR2057595.7
+1=DFFFFHHHHHJJJFGIJIJJIJ
+@SRR2057595.9
+TTGGTTCAATCTGATGCCCTCTTCTGGTGCATCTGAAGACAGCTACAGTGTACTTAGATATAATAAATAAATCTT
++SRR2057595.9
+4=DFDBDHHFHHIGGEHJGGIHGHGGCAFCHGIGEHIJJJJIJJJIHIIIIIIJIIIIIGHIIGGIJGIIJIIJ@
+@SRR2057595.14
+TGGGTTAATGCGGCCCCGGGTTCCTCCCGGGGCTACGCCTGTCTGAGCGTCGCT
++SRR2057595.14
+1=DFFFFHHHHHJJIJJJJIGHJJIIJJJJJIJHFHHFFEDEEEEDDDDBDDDD
+@SRR2057595.22
+ACGGTTAATGCGGCCCCGGGTTCCTCCCGGGGCTACGCCTGTCTGAGCGTCGC
++SRR2057595.22
+1=DFFFFHHHHHJJJJJJJJIJJJJJJJJJJJJHHHFFFEDEEEEDDDDBDDD
+@SRR2057595.23
+GCGGTTATTCCTAAGGCGAGCTCAGGGAGGACAGAAACCTCCCGTGGAGCAGAAGGGCAAAAGCTCGCTTGATCT
++SRR2057595.23
+1=DFFFFHHHHHJJJJJJJJJJJJJJJIJJIIJJJJJJJJJJJJIJJHHHHHFFFFDDDDDDDDDDDDDDDDDDA
+@SRR2057595.29
+ACGGTTCTTGCGGCCCCGGGTTCCTCCCGGGGCTACGCCTGTCTGAGCGTCGCTT
++SRR2057595.29
+1=DFFFFHHHHHJJJJJJJJHIJJJJJJIJJJJHHHFFDEDEEDDCDDDBDDDDD
+@SRR2057595.30
+GAGGTTGAAAATCCGTGCTAAGAAGAA
++SRR2057595.30
+4=DDFFFHHHHHJJJJIJJJJJJJJJJ
+@SRR2057595.33
+TCGGTTTATCTCGTCTGATCTCGGAAGCTAAGCAGGGTCGGGCCTGGTTAGTACTTGGATGGGAGACCGCCTGGG
++SRR2057595.33
+1=DFFFFHHHHHJJJJJJJJJJJJJJJJJJJJJJJJJDHIJJJJIJJJHGGEEHFFFFFFEDDEDDDDDDDDDDB
+@SRR2057595.35
+ACGGTTACTCCCGGGGCTACGCCTGTCTGAGCGTCGCT
++SRR2057595.35
+1=DDFFFHHHHHJJJJJJIJJJJJJIJJJJJJJHIIJJ
+@SRR2057595.38
+GGGGTTACCTGCATGTTTATAGTTTCTAGTTTTGGCATTTTGTGTGGTCTCTTTTTTGTT
++SRR2057595.38
+1=DDFFFHHHHHJJJJJJJJJJHJJIJJJIJJJJJJJJJJJJGIGHJHIJJIJJJJJJJJ
+@SRR2057595.42
+TAGGTTGGATAAGTTATACACTGACTAAGTCATCTGTTACTGCCTTCACTGAGTTTTTATTTCCTTT
++SRR2057595.42
+1=DDFFFHHHHHJJJJJJJJJJJJJJJJJIIJJJJGJJJJJJJJJJJJJJJIIHIJJJJJJJIJJJI
+@SRR2057595.45
+CTGGTTTGCTGCGGAAGGATCATTA
++SRR2057595.45
+1:DDDDDDDFFDFFAGFE<EB8?BF
+@SRR2057595.46
+CAGGTTTTTTGGCTTTTTTTTAAAACCATTTTGTGAAAGGTTTCTGAAACTTGATAATAAAAAGCAGTTGGTGTA
++SRR2057595.46
+4=DDDDDHHHHFIGIJJJJJJJIIIJIIIJJJICHHIGIJFHHGHIEIHGHFHEDFFEFEFEEDEDD@CDD<@B:
+@SRR2057595.56
+GGGGTTTCGATGAAGAACGCAGCTAGCTGCGAGAATTAATGTGAATTGCAGGACACATTGATCATCGACACTTCG
++SRR2057595.56
+4=DFFFFHHHHHJJJJJJJJJJJJJJJJJJJIJJJJJIJJJJJJJJJJJJJJJJJJJHHHHHHFFFFFDDDDDDD
+@SRR2057595.59
+GCGGTTACCTCCTGTCTTATCATTGTCTTTTGAGCCTGGGCCTTGCCAGGTAGCTCTAGACTGGCCTAGAACTCA
++SRR2057595.59
+1=DFFFFHHHHHJJJJJJJJJJJJ4CHHJJJJJJJJJJJJJJJJJJJJIJDHIJJJIIJJJJJIJJJJHHHHHHB
+@SRR2057595.60
+ATGGTTTCATCTCGTCTGATCTCGGAAGCTAAGCAGGGCCGGGCCTGGTTAGTACTTGGATGGGAGACCGCC
++SRR2057595.60
+11BDDFDFFBBBFECFE@HHIBCBG<2CGEC49?1CBD)86:;AB=7C.=;=)77;A3;?C@;96=?@B8;?
+@SRR2057595.61
+GAGGTTTAGCAGGACACATTGATCATCGACACTTCGAACGCACTTGCGGCCCCGGGTTCCTCCCGGGGCTACGCC
++SRR2057595.61
+1=DDFFFGHHHHGGHGHIIJGEFEGGFH9GGIIFGGGGIFGDHBG@FGGGHEFCCB?@@CDCCD?B7>B@ACB9<
+@SRR2057595.65
+GCGGTTTCGGAGCTTGCTCCGTCCACTCAACGCATCGACCTGGTATTGCAGTACCTCCAGGAACGGTGCACCAAG
++SRR2057595.65
+1=DFFFFHHHHHJJJJJJHJIHHIIIIIIIJHJBHIHBFHHJI@EHJJHHHHHHHFFFBDE?AEBD=AB@CDBD?
+@SRR2057595.67
+AAGGTTGGTTTTTGAGGTCCTGCTCGTGCAGGGT
++SRR2057595.67
+1:BDDDFHFHHHHGFHGFGGIIDGHHIGIJJJJ9
+@SRR2057595.69
+ATTGGTTATTTTGTTTTTCCTCCTTCTCTTTCTAAA
++SRR2057595.69
+CCCFFFFFHHHHHJJJJJJJJJJJJJJJJJJIJIJJ
+@SRR2057595.70
+TTAGGTTAATTGTAATTTTATGAGGTCCCATTTGTCAATTCTT
++SRR2057595.70
+@@@DDDDD+2CDFA@FBGHCCHFHGBFHGHIGGDHGHIIFCFF
+@SRR2057595.71
+TGCGGTTCATATTAGCATGGCCCCTGCGCAAGGATGACACGCAAATTCGTGAAGCGTTCCATATTT
++SRR2057595.71
+CCCFFFFFHHGHHJJJJJJJJJJIIJJIJIJJIFHJIIIJJJIJJJJJJHIIHHHHFFFDEECEEE
+@SRR2057595.73
+TGAGGTTCAGCGAGTGCCTAGTGGGCCACTTTTGGTAAGCAGAACTGGCGCTGCGGGA
++SRR2057595.73
+@@@FFFFFHGFFHC@EBHGHGAEGIIHIIIIJJJJGHIIIJIJIIGHIJIJJIGGEFD
+@SRR2057595.74
+AATGGTTTCTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
++SRR2057595.74
+@CCFFFFFGDFFHFIJJJGGGGJJGDDDDDDDDDDDDDBDDDDDDBBDDDDDDDDDDDDDDDDDDDBBBDDDDBD>
+@SRR2057595.77
+GCGGTTCGATCCCACTTCTGAC
++SRR2057595.77
+1=DFFFFHGDHHHIJJJIJJJJ
+@SRR2057595.82
+GAGGGTTACTTCCTCCCGGGGCTACGCCTGTCTGAGCGTCGCT
++SRR2057595.82
+CBCFFFFFHHHHHJJJJJJJJJJJJJJJJIJJJJJIJIIJJJH
+@SRR2057595.83
+TGGGTTGATCCCGGGGCTACGCCTGTCTGAGCGTCGCT
++SRR2057595.83
+1=DDFFFHHHHHJJIJJJJJJIJJJIJJIGGHIFHGEH
+@SRR2057595.86
+ACCGGTTACTTTTTTTAAATGTAAAGCATAAATAAAAAGCCTTTGTGGACTGTGAAAAAAAAAAAAAAAAAAAAAA
++SRR2057595.86
+BCCFFFFFHHHHHJJJJJJJJIIJJJJJJJJJIJJJJJJJJJJJJGIJIIJJIJJJJJJHFDDDDDDDDDDDDDB>
+@SRR2057595.88
+TCAGGTTGCCTAAGCATAGATAACCATATATCAGGGGGAGCTCCATGTTCTAGTCCTGCAAGCGCCTGGGCAATAA
++SRR2057595.88
+CCCFFFFFHHHHHJJJJJJIJJJJJIJJJJJJIJJIJJIJJJJJJJJJHIJJJJJJIIIHJIHHHFFDDDDEDDD@
+@SRR2057595.99
+TGAGGTTCATCGCTGCGATCTATTGAAAGTCAGCCCTCGACACAAGGGTTTGT
++SRR2057595.99
+B@CFFFFFFDHHHIHIIIJJIJJJJJIGEHGFHIJJGHIHADHIIJIJJJIJG

From da414e72c60758895b16818309d6c147c288dd84 Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt <rcannood@gmail.com>
Date: Mon, 29 Jul 2024 09:55:17 +0200
Subject: [PATCH 3/9] Add star solo component (#62)

* add star solo component

* change arguments from camelCase to snake_case

* get rid of multiple_sep

* drop star_solo component and just add arguments to star_align_reads

* Update src/star/star_align_reads/script.py

Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com>

---------

Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com>
---
 CHANGELOG.md                                  |   10 +-
 .../star_align_reads/argument_groups.yaml     | 1034 +++++++++++++----
 src/star/star_align_reads/config.vsh.yaml     |    2 +
 src/star/star_align_reads/script.py           |   20 +-
 src/star/star_align_reads/test.sh             |   12 +-
 .../star_align_reads/utils/process_params.R   |   54 +-
 src/star/star_genome_generate/config.vsh.yaml |   43 +-
 src/star/star_genome_generate/script.sh       |   30 +-
 src/star/star_genome_generate/test.sh         |    6 +-
 9 files changed, 903 insertions(+), 308 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 665b587d..c4575cb9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,14 @@
 # biobox x.x.x
 
-## NEW FEATURES
+## BREAKING CHANGES
+
+* `star/star_align_reads`: Change all arguments from `--camelCase` to `--snake_case` (PR #62).
+
+* `star/star_genome_generate`: Change all arguments from `--camelCase` to `--snake_case` (PR #62).
+
+## NEW FUNCTIONALITY
+
+* `star/star_align_reads`: Add star solo related arguments (PR #62).
 
 * `bd_rhapsody/bd_rhapsody_make_reference`: Create a reference for the BD Rhapsody pipeline (PR #75).
 
diff --git a/src/star/star_align_reads/argument_groups.yaml b/src/star/star_align_reads/argument_groups.yaml
index e6a1c874..7c804dd3 100644
--- a/src/star/star_align_reads/argument_groups.yaml
+++ b/src/star/star_align_reads/argument_groups.yaml
@@ -1,19 +1,23 @@
 argument_groups:
 - name: Run Parameters
   arguments:
-  - name: --runRNGseed
+  - name: --run_rng_seed
     type: integer
     description: random number generator seed.
+    info:
+      orig_name: --runRNGseed
     example: 777
 - name: Genome Parameters
   arguments:
-  - name: --genomeDir
+  - name: --genome_dir
     type: file
     description: path to the directory where genome files are stored (for --runMode
       alignReads) or will be generated (for --runMode generateGenome)
+    info:
+      orig_name: --genomeDir
     example: ./GenomeDir/
-    required: yes
-  - name: --genomeLoad
+    required: true
+  - name: --genome_load
     type: string
     description: |-
       mode of shared memory usage for the genome files. Only used with --runMode alignReads.
@@ -23,132 +27,162 @@ argument_groups:
       - LoadAndExit     ... load genome into shared memory and exit, keeping the genome in memory for future runs
       - Remove          ... do not map anything, just remove loaded genome from memory
       - NoSharedMemory  ... do not use shared memory, each job will have its own private copy of the genome
+    info:
+      orig_name: --genomeLoad
     example: NoSharedMemory
-  - name: --genomeFastaFiles
+  - name: --genome_fasta_files
     type: file
     description: |-
       path(s) to the fasta files with the genome sequences, separated by spaces. These files should be plain text FASTA files, they *cannot* be zipped.
 
       Required for the genome generation (--runMode genomeGenerate). Can also be used in the mapping (--runMode alignReads) to add extra (new) sequences to the genome (e.g. spike-ins).
-    multiple: yes
-    multiple_sep: ;
-  - name: --genomeFileSizes
+    info:
+      orig_name: --genomeFastaFiles
+    multiple: true
+  - name: --genome_file_sizes
     type: integer
     description: genome files exact sizes in bytes. Typically, this should not be
       defined by the user.
+    info:
+      orig_name: --genomeFileSizes
     example: 0
-    multiple: yes
-    multiple_sep: ;
-  - name: --genomeTransformOutput
+    multiple: true
+  - name: --genome_transform_output
     type: string
     description: |-
       which output to transform back to original genome
 
       - SAM     ... SAM/BAM alignments
       - SJ      ... splice junctions (SJ.out.tab)
-      - Quant   ... quantifications (from --quantMode option)
+      - Quant   ... quantifications (from --quant_mode option)
       - None    ... no transformation of the output
-    multiple: yes
-    multiple_sep: ;
-  - name: --genomeChrSetMitochondrial
+    info:
+      orig_name: --genomeTransformOutput
+    multiple: true
+  - name: --genome_chr_set_mitochondrial
     type: string
     description: names of the mitochondrial chromosomes. Presently only used for STARsolo
       statistics output/
+    info:
+      orig_name: --genomeChrSetMitochondrial
     example:
     - chrM
     - M
     - MT
-    multiple: yes
-    multiple_sep: ;
+    multiple: true
 - name: Splice Junctions Database
   arguments:
-  - name: --sjdbFileChrStartEnd
+  - name: --sjdb_file_chr_start_end
     type: string
     description: path to the files with genomic coordinates (chr <tab> start <tab>
       end <tab> strand) for the splice junction introns. Multiple files can be supplied
       and will be concatenated.
-    multiple: yes
-    multiple_sep: ;
-  - name: --sjdbGTFfile
+    info:
+      orig_name: --sjdbFileChrStartEnd
+    multiple: true
+  - name: --sjdb_gtf_file
     type: file
     description: path to the GTF file with annotations
-  - name: --sjdbGTFchrPrefix
+    info:
+      orig_name: --sjdbGTFfile
+  - name: --sjdb_gtf_chr_prefix
     type: string
     description: prefix for chromosome names in a GTF file (e.g. 'chr' for using ENSMEBL
       annotations with UCSC genomes)
-  - name: --sjdbGTFfeatureExon
+    info:
+      orig_name: --sjdbGTFchrPrefix
+  - name: --sjdb_gtf_feature_exon
     type: string
     description: feature type in GTF file to be used as exons for building transcripts
+    info:
+      orig_name: --sjdbGTFfeatureExon
     example: exon
-  - name: --sjdbGTFtagExonParentTranscript
+  - name: --sjdb_gtf_tag_exon_parent_transcript
     type: string
     description: GTF attribute name for parent transcript ID (default "transcript_id"
       works for GTF files)
+    info:
+      orig_name: --sjdbGTFtagExonParentTranscript
     example: transcript_id
-  - name: --sjdbGTFtagExonParentGene
+  - name: --sjdb_gtf_tag_exon_parent_gene
     type: string
     description: GTF attribute name for parent gene ID (default "gene_id" works for
       GTF files)
+    info:
+      orig_name: --sjdbGTFtagExonParentGene
     example: gene_id
-  - name: --sjdbGTFtagExonParentGeneName
+  - name: --sjdb_gtf_tag_exon_parent_gene_name
     type: string
     description: GTF attribute name for parent gene name
+    info:
+      orig_name: --sjdbGTFtagExonParentGeneName
     example: gene_name
-    multiple: yes
-    multiple_sep: ;
-  - name: --sjdbGTFtagExonParentGeneType
+    multiple: true
+  - name: --sjdb_gtf_tag_exon_parent_gene_type
     type: string
     description: GTF attribute name for parent gene type
+    info:
+      orig_name: --sjdbGTFtagExonParentGeneType
     example:
     - gene_type
     - gene_biotype
-    multiple: yes
-    multiple_sep: ;
-  - name: --sjdbOverhang
+    multiple: true
+  - name: --sjdb_overhang
     type: integer
     description: length of the donor/acceptor sequence on each side of the junctions,
       ideally = (mate_length - 1)
+    info:
+      orig_name: --sjdbOverhang
     example: 100
-  - name: --sjdbScore
+  - name: --sjdb_score
     type: integer
     description: extra alignment score for alignments that cross database junctions
+    info:
+      orig_name: --sjdbScore
     example: 2
-  - name: --sjdbInsertSave
+  - name: --sjdb_insert_save
     type: string
     description: |-
       which files to save when sjdb junctions are inserted on the fly at the mapping step
 
       - Basic ... only small junction / transcript files
       - All   ... all files including big Genome, SA and SAindex - this will create a complete genome directory
+    info:
+      orig_name: --sjdbInsertSave
     example: Basic
 - name: Variation parameters
   arguments:
-  - name: --varVCFfile
+  - name: --var_vcf_file
     type: string
     description: path to the VCF file that contains variation data. The 10th column
       should contain the genotype information, e.g. 0/1
+    info:
+      orig_name: --varVCFfile
 - name: Read Parameters
   arguments:
-  - name: --readFilesType
+  - name: --read_files_type
     type: string
     description: |-
       format of input read files
 
       - Fastx       ... FASTA or FASTQ
-      - SAM SE      ... SAM or BAM single-end reads; for BAM use --readFilesCommand samtools view
-      - SAM PE      ... SAM or BAM paired-end reads; for BAM use --readFilesCommand samtools view
+      - SAM SE      ... SAM or BAM single-end reads; for BAM use --read_files_command samtools view
+      - SAM PE      ... SAM or BAM paired-end reads; for BAM use --read_files_command samtools view
+    info:
+      orig_name: --readFilesType
     example: Fastx
-  - name: --readFilesSAMattrKeep
+  - name: --read_files_sam_attr_keep
     type: string
     description: |-
-      for --readFilesType SAM SE/PE, which SAM tags to keep in the output BAM, e.g.: --readFilesSAMtagsKeep RG PL
+      for --read_files_type SAM SE/PE, which SAM tags to keep in the output BAM, e.g.: --readFilesSAMtagsKeep RG PL
 
       - All     ... keep all tags
       - None    ... do not keep any tags
+    info:
+      orig_name: --readFilesSAMattrKeep
     example: All
-    multiple: yes
-    multiple_sep: ;
-  - name: --readFilesManifest
+    multiple: true
+  - name: --read_files_manifest
     type: file
     description: |-
       path to the "manifest" file with the names of read files. The manifest file should contain 3 tab-separated columns:
@@ -158,45 +192,57 @@ argument_groups:
       Spaces, but not tabs are allowed in file names.
       If read_group_line does not start with ID:, it can only contain one ID field, and ID: will be added to it.
       If read_group_line starts with ID:, it can contain several fields separated by $tab$, and all fields will be be copied verbatim into SAM @RG header line.
-  - name: --readFilesPrefix
+    info:
+      orig_name: --readFilesManifest
+  - name: --read_files_prefix
     type: string
     description: prefix for the read files names, i.e. it will be added in front of
       the strings in --readFilesIn
-  - name: --readFilesCommand
+    info:
+      orig_name: --readFilesPrefix
+  - name: --read_files_command
     type: string
     description: |-
       command line to execute for each of the input file. This command should generate FASTA or FASTQ text and send it to stdout
 
       For example: zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc.
-    multiple: yes
-    multiple_sep: ;
-  - name: --readMapNumber
+    info:
+      orig_name: --readFilesCommand
+    multiple: true
+  - name: --read_map_number
     type: integer
     description: |-
       number of reads to map from the beginning of the file
 
       -1: map all reads
+    info:
+      orig_name: --readMapNumber
     example: -1
-  - name: --readMatesLengthsIn
+  - name: --read_mates_lengths_in
     type: string
     description: Equal/NotEqual - lengths of names,sequences,qualities for both mates
       are the same  / not the same. NotEqual is safe in all situations.
+    info:
+      orig_name: --readMatesLengthsIn
     example: NotEqual
-  - name: --readNameSeparator
+  - name: --read_name_separator
     type: string
     description: character(s) separating the part of the read names that will be trimmed
       in output (read name after space is always trimmed)
+    info:
+      orig_name: --readNameSeparator
     example: /
-    multiple: yes
-    multiple_sep: ;
-  - name: --readQualityScoreBase
+    multiple: true
+  - name: --read_quality_score_base
     type: integer
     description: number to be subtracted from the ASCII code to get Phred quality
       score
+    info:
+      orig_name: --readQualityScoreBase
     example: 33
 - name: Read Clipping
   arguments:
-  - name: --clipAdapterType
+  - name: --clip_adapter_type
     type: string
     description: |-
       adapter clipping type
@@ -204,129 +250,161 @@ argument_groups:
       - Hamming ... adapter clipping based on Hamming distance, with the number of mismatches controlled by --clip5pAdapterMMp
       - CellRanger4 ... 5p and 3p adapter clipping similar to CellRanger4. Utilizes Opal package by Martin Sosic: https://github.com/Martinsos/opal
       - None ... no adapter clipping, all other clip* parameters are disregarded
+    info:
+      orig_name: --clipAdapterType
     example: Hamming
-  - name: --clip3pNbases
+  - name: --clip3p_nbases
     type: integer
     description: number(s) of bases to clip from 3p of each mate. If one value is
       given, it will be assumed the same for both mates.
+    info:
+      orig_name: --clip3pNbases
     example: 0
-    multiple: yes
-    multiple_sep: ;
-  - name: --clip3pAdapterSeq
+    multiple: true
+  - name: --clip3p_adapter_seq
     type: string
     description: |-
       adapter sequences to clip from 3p of each mate.  If one value is given, it will be assumed the same for both mates.
 
       - polyA ... polyA sequence with the length equal to read length
-    multiple: yes
-    multiple_sep: ;
-  - name: --clip3pAdapterMMp
+    info:
+      orig_name: --clip3pAdapterSeq
+    multiple: true
+  - name: --clip3p_adapter_mm_p
     type: double
     description: max proportion of mismatches for 3p adapter clipping for each mate.  If
       one value is given, it will be assumed the same for both mates.
+    info:
+      orig_name: --clip3pAdapterMMp
     example: 0.1
-    multiple: yes
-    multiple_sep: ;
-  - name: --clip3pAfterAdapterNbases
+    multiple: true
+  - name: --clip3p_after_adapter_nbases
     type: integer
     description: number of bases to clip from 3p of each mate after the adapter clipping.
       If one value is given, it will be assumed the same for both mates.
+    info:
+      orig_name: --clip3pAfterAdapterNbases
     example: 0
-    multiple: yes
-    multiple_sep: ;
-  - name: --clip5pNbases
+    multiple: true
+  - name: --clip5p_nbases
     type: integer
     description: number(s) of bases to clip from 5p of each mate. If one value is
       given, it will be assumed the same for both mates.
+    info:
+      orig_name: --clip5pNbases
     example: 0
-    multiple: yes
-    multiple_sep: ;
+    multiple: true
 - name: Limits
   arguments:
-  - name: --limitGenomeGenerateRAM
+  - name: --limit_genome_generate_ram
     type: long
     description: maximum available RAM (bytes) for genome generation
+    info:
+      orig_name: --limitGenomeGenerateRAM
     example: '31000000000'
-  - name: --limitIObufferSize
+  - name: --limit_io_buffer_size
     type: long
     description: max available buffers size (bytes) for input/output, per thread
+    info:
+      orig_name: --limitIObufferSize
     example:
     - 30000000
     - 50000000
-    multiple: yes
-    multiple_sep: ;
-  - name: --limitOutSAMoneReadBytes
+    multiple: true
+  - name: --limit_out_sam_one_read_bytes
     type: long
     description: 'max size of the SAM record (bytes) for one read. Recommended value:
       >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax'
+    info:
+      orig_name: --limitOutSAMoneReadBytes
     example: 100000
-  - name: --limitOutSJoneRead
+  - name: --limit_out_sj_one_read
     type: integer
     description: max number of junctions for one read (including all multi-mappers)
+    info:
+      orig_name: --limitOutSJoneRead
     example: 1000
-  - name: --limitOutSJcollapsed
+  - name: --limit_out_sj_collapsed
     type: integer
     description: max number of collapsed junctions
+    info:
+      orig_name: --limitOutSJcollapsed
     example: 1000000
-  - name: --limitBAMsortRAM
+  - name: --limit_bam_sort_ram
     type: long
     description: maximum available RAM (bytes) for sorting BAM. If =0, it will be
-      set to the genome index size. 0 value can only be used with --genomeLoad NoSharedMemory
+      set to the genome index size. 0 value can only be used with --genome_load NoSharedMemory
       option.
+    info:
+      orig_name: --limitBAMsortRAM
     example: 0
-  - name: --limitSjdbInsertNsj
+  - name: --limit_sjdb_insert_nsj
     type: integer
     description: maximum number of junctions to be inserted to the genome on the fly
       at the mapping stage, including those from annotations and those detected in
       the 1st step of the 2-pass run
+    info:
+      orig_name: --limitSjdbInsertNsj
     example: 1000000
-  - name: --limitNreadsSoft
+  - name: --limit_nreads_soft
     type: integer
     description: soft limit on the number of reads
+    info:
+      orig_name: --limitNreadsSoft
     example: -1
 - name: 'Output: general'
   arguments:
-  - name: --outTmpKeep
+  - name: --out_tmp_keep
     type: string
     description: |-
       whether to keep the temporary files after STAR runs is finished
 
       - None ... remove all temporary files
       - All ... keep all files
-  - name: --outStd
+    info:
+      orig_name: --outTmpKeep
+  - name: --out_std
     type: string
     description: |-
       which output will be directed to stdout (standard out)
 
       - Log                    ... log messages
       - SAM                    ... alignments in SAM format (which normally are output to Aligned.out.sam file), normal standard output will go into Log.std.out
-      - BAM_Unsorted           ... alignments in BAM format, unsorted. Requires --outSAMtype BAM Unsorted
-      - BAM_SortedByCoordinate ... alignments in BAM format, sorted by coordinate. Requires --outSAMtype BAM SortedByCoordinate
-      - BAM_Quant              ... alignments to transcriptome in BAM format, unsorted. Requires --quantMode TranscriptomeSAM
+      - BAM_Unsorted           ... alignments in BAM format, unsorted. Requires --out_sam_type BAM Unsorted
+      - BAM_SortedByCoordinate ... alignments in BAM format, sorted by coordinate. Requires --out_sam_type BAM SortedByCoordinate
+      - BAM_Quant              ... alignments to transcriptome in BAM format, unsorted. Requires --quant_mode TranscriptomeSAM
+    info:
+      orig_name: --outStd
     example: Log
-  - name: --outReadsUnmapped
+  - name: --out_reads_unmapped
     type: string
     description: |-
       output of unmapped and partially mapped (i.e. mapped only one mate of a paired end read) reads in separate file(s).
 
       - None    ... no output
       - Fastx   ... output in separate fasta/fastq files, Unmapped.out.mate1/2
-  - name: --outQSconversionAdd
+    info:
+      orig_name: --outReadsUnmapped
+  - name: --out_qs_conversion_add
     type: integer
     description: add this number to the quality score (e.g. to convert from Illumina
       to Sanger, use -31)
+    info:
+      orig_name: --outQSconversionAdd
     example: 0
-  - name: --outMultimapperOrder
+  - name: --out_multimapper_order
     type: string
     description: |-
       order of multimapping alignments in the output files
 
       - Old_2.4             ... quasi-random order used before 2.5.0
       - Random              ... random order of alignments for each multi-mapper. Read mates (pairs) are always adjacent, all alignment for each read stay together. This option will become default in the future releases.
+    info:
+      orig_name: --outMultimapperOrder
     example: Old_2.4
 - name: 'Output: SAM and BAM'
   arguments:
-  - name: --outSAMtype
+  - name: --out_sam_type
     type: string
     description: |-
       type of SAM/BAM output
@@ -337,11 +415,12 @@ argument_groups:
       - None ... no SAM/BAM output
       2nd, 3rd:
       - Unsorted           ... standard unsorted
-      - SortedByCoordinate ... sorted by coordinate. This option will allocate extra memory for sorting which can be specified by --limitBAMsortRAM.
+      - SortedByCoordinate ... sorted by coordinate. This option will allocate extra memory for sorting which can be specified by --limit_bam_sort_ram.
+    info:
+      orig_name: --outSAMtype
     example: SAM
-    multiple: yes
-    multiple_sep: ;
-  - name: --outSAMmode
+    multiple: true
+  - name: --out_sam_mode
     type: string
     description: |-
       mode of SAM output
@@ -349,15 +428,19 @@ argument_groups:
       - None ... no SAM output
       - Full ... full SAM output
       - NoQS ... full SAM but without quality scores
+    info:
+      orig_name: --outSAMmode
     example: Full
-  - name: --outSAMstrandField
+  - name: --out_sam_strand_field
     type: string
     description: |-
       Cufflinks-like strand field flag
 
       - None        ... not used
       - intronMotif ... strand derived from the intron motif. This option changes the output alignments: reads with inconsistent and/or non-canonical introns are filtered out.
-  - name: --outSAMattributes
+    info:
+      orig_name: --outSAMstrandField
+  - name: --out_sam_attributes
     type: string
     description: |-
       a string of desired SAM attributes, in the order desired for the output SAM. Tags can be listed in any combination/order.
@@ -368,27 +451,27 @@ argument_groups:
       - All         ... NH HI AS nM NM MD jM jI MC ch
       ***Alignment:
       - NH          ... number of loci the reads maps to: =1 for unique mappers, >1 for multimappers. Standard SAM tag.
-      - HI          ... multiple alignment index, starts with --outSAMattrIHstart (=1 by default). Standard SAM tag.
+      - HI          ... multiple alignment index, starts with --out_sam_attr_ih_start (=1 by default). Standard SAM tag.
       - AS          ... local alignment score, +1/-1 for matches/mismateches, score* penalties for indels and gaps. For PE reads, total score for two mates. Stadnard SAM tag.
       - nM          ... number of mismatches. For PE reads, sum over two mates.
       - NM          ... edit distance to the reference (number of mismatched + inserted + deleted bases) for each mate. Standard SAM tag.
       - MD          ... string encoding mismatched and deleted reference bases (see standard SAM specifications). Standard SAM tag.
       - jM          ... intron motifs for all junctions (i.e. N in CIGAR): 0: non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT. If splice junctions database is used, and a junction is annotated, 20 is added to its motif value.
       - jI          ... start and end of introns for all junctions (1-based).
-      - XS          ... alignment strand according to --outSAMstrandField.
+      - XS          ... alignment strand according to --out_sam_strand_field.
       - MC          ... mate's CIGAR string. Standard SAM tag.
-      - ch          ... marks all segment of all chimeric alingments for --chimOutType WithinBAM output.
+      - ch          ... marks all segment of all chimeric alingments for --chim_out_type WithinBAM output.
       - cN          ... number of bases clipped from the read ends: 5' and 3'
       ***Variation:
       - vA          ... variant allele
       - vG          ... genomic coordinate of the variant overlapped by the read.
-      - vW          ... 1 - alignment passes WASP filtering; 2,3,4,5,6,7 - alignment does not pass WASP filtering. Requires --waspOutputMode SAMtag.
+      - vW          ... 1 - alignment passes WASP filtering; 2,3,4,5,6,7 - alignment does not pass WASP filtering. Requires --wasp_output_mode SAMtag.
       - ha          ... haplotype (1/2) when mapping to the diploid genome. Requires genome generated with --genomeTransformType Diploid .
       ***STARsolo:
       - CR CY UR UY ... sequences and quality scores of cell barcodes and UMIs for the solo* demultiplexing.
       - GX GN       ... gene ID and gene name for unique-gene reads.
       - gx gn       ... gene IDs and gene names for unique- and multi-gene reads.
-      - CB UB       ... error-corrected cell barcodes and UMIs for solo* demultiplexing. Requires --outSAMtype BAM SortedByCoordinate.
+      - CB UB       ... error-corrected cell barcodes and UMIs for solo* demultiplexing. Requires --out_sam_type BAM SortedByCoordinate.
       - sM          ... assessment of CB and UMI.
       - sS          ... sequence of the entire barcode (CB,UMI,adapter).
       - sQ          ... quality of the entire barcode.
@@ -396,15 +479,18 @@ argument_groups:
       ***Unsupported/undocumented:
       - rB          ... alignment block read/genomic coordinates.
       - vR          ... read coordinate of the variant.
+    info:
+      orig_name: --outSAMattributes
     example: Standard
-    multiple: yes
-    multiple_sep: ;
-  - name: --outSAMattrIHstart
+    multiple: true
+  - name: --out_sam_attr_ih_start
     type: integer
     description: start value for the IH attribute. 0 may be required by some downstream
       software, such as Cufflinks or StringTie.
+    info:
+      orig_name: --outSAMattrIHstart
     example: 1
-  - name: --outSAMunmapped
+  - name: --out_sam_unmapped
     type: string
     description: |-
       output of unmapped reads in the SAM format
@@ -414,112 +500,141 @@ argument_groups:
       - Within ... output unmapped reads within the main SAM file (i.e. Aligned.out.sam)
       2nd word:
       - KeepPairs ... record unmapped mate for each alignment, and, in case of unsorted output, keep it adjacent to its mapped mate. Only affects multi-mapping reads.
-    multiple: yes
-    multiple_sep: ;
-  - name: --outSAMorder
+    info:
+      orig_name: --outSAMunmapped
+    multiple: true
+  - name: --out_sam_order
     type: string
     description: |-
       type of sorting for the SAM output
 
       Paired: one mate after the other for all paired alignments
       PairedKeepInputOrder: one mate after the other for all paired alignments, the order is kept the same as in the input FASTQ files
+    info:
+      orig_name: --outSAMorder
     example: Paired
-  - name: --outSAMprimaryFlag
+  - name: --out_sam_primary_flag
     type: string
     description: |-
       which alignments are considered primary - all others will be marked with 0x100 bit in the FLAG
 
       - OneBestScore ... only one alignment with the best score is primary
       - AllBestScore ... all alignments with the best score are primary
+    info:
+      orig_name: --outSAMprimaryFlag
     example: OneBestScore
-  - name: --outSAMreadID
+  - name: --out_sam_read_id
     type: string
     description: |-
       read ID record type
 
       - Standard ... first word (until space) from the FASTx read ID line, removing /1,/2 from the end
       - Number   ... read number (index) in the FASTx file
+    info:
+      orig_name: --outSAMreadID
     example: Standard
-  - name: --outSAMmapqUnique
+  - name: --out_sam_mapq_unique
     type: integer
     description: '0 to 255: the MAPQ value for unique mappers'
+    info:
+      orig_name: --outSAMmapqUnique
     example: 255
-  - name: --outSAMflagOR
+  - name: --out_sam_flag_or
     type: integer
     description: '0 to 65535: sam FLAG will be bitwise OR''d with this value, i.e.
       FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set by STAR,
       and after outSAMflagAND. Can be used to set specific bits that are not set otherwise.'
+    info:
+      orig_name: --outSAMflagOR
     example: 0
-  - name: --outSAMflagAND
+  - name: --out_sam_flag_and
     type: integer
     description: '0 to 65535: sam FLAG will be bitwise AND''d with this value, i.e.
       FLAG=FLAG & outSAMflagOR. This is applied after all flags have been set by STAR,
       but before outSAMflagOR. Can be used to unset specific bits that are not set
       otherwise.'
+    info:
+      orig_name: --outSAMflagAND
     example: 65535
-  - name: --outSAMattrRGline
+  - name: --out_sam_attr_rg_line
     type: string
     description: |-
-      SAM/BAM read group line. The first word contains the read group identifier and must start with "ID:", e.g. --outSAMattrRGline ID:xxx CN:yy "DS:z z z".
+      SAM/BAM read group line. The first word contains the read group identifier and must start with "ID:", e.g. --out_sam_attr_rg_line ID:xxx CN:yy "DS:z z z".
 
       xxx will be added as RG tag to each output alignment. Any spaces in the tag values have to be double quoted.
       Comma separated RG lines correspons to different (comma separated) input files in --readFilesIn. Commas have to be surrounded by spaces, e.g.
-      --outSAMattrRGline ID:xxx , ID:zzz "DS:z z" , ID:yyy DS:yyyy
-    multiple: yes
-    multiple_sep: ;
-  - name: --outSAMheaderHD
+      --out_sam_attr_rg_line ID:xxx , ID:zzz "DS:z z" , ID:yyy DS:yyyy
+    info:
+      orig_name: --outSAMattrRGline
+    multiple: true
+  - name: --out_sam_header_hd
     type: string
     description: '@HD (header) line of the SAM header'
-    multiple: yes
-    multiple_sep: ;
-  - name: --outSAMheaderPG
+    info:
+      orig_name: --outSAMheaderHD
+    multiple: true
+  - name: --out_sam_header_pg
     type: string
     description: extra @PG (software) line of the SAM header (in addition to STAR)
-    multiple: yes
-    multiple_sep: ;
-  - name: --outSAMheaderCommentFile
+    info:
+      orig_name: --outSAMheaderPG
+    multiple: true
+  - name: --out_sam_header_comment_file
     type: string
     description: path to the file with @CO (comment) lines of the SAM header
-  - name: --outSAMfilter
+    info:
+      orig_name: --outSAMheaderCommentFile
+  - name: --out_sam_filter
     type: string
     description: |-
       filter the output into main SAM/BAM files
 
-      - KeepOnlyAddedReferences ... only keep the reads for which all alignments are to the extra reference sequences added with --genomeFastaFiles at the mapping stage.
-      - KeepAllAddedReferences ...  keep all alignments to the extra reference sequences added with --genomeFastaFiles at the mapping stage.
-    multiple: yes
-    multiple_sep: ;
-  - name: --outSAMmultNmax
+      - KeepOnlyAddedReferences ... only keep the reads for which all alignments are to the extra reference sequences added with --genome_fasta_files at the mapping stage.
+      - KeepAllAddedReferences ...  keep all alignments to the extra reference sequences added with --genome_fasta_files at the mapping stage.
+    info:
+      orig_name: --outSAMfilter
+    multiple: true
+  - name: --out_sam_mult_nmax
     type: integer
     description: |-
       max number of multiple alignments for a read that will be output to the SAM/BAM files. Note that if this value is not equal to -1, the top scoring alignment will be output first
 
-      - -1 ... all alignments (up to --outFilterMultimapNmax) will be output
+      - -1 ... all alignments (up to --out_filter_multimap_nmax) will be output
+    info:
+      orig_name: --outSAMmultNmax
     example: -1
-  - name: --outSAMtlen
+  - name: --out_sam_tlen
     type: integer
     description: |-
       calculation method for the TLEN field in the SAM/BAM files
 
       - 1 ... leftmost base of the (+)strand mate to rightmost base of the (-)mate. (+)sign for the (+)strand mate
       - 2 ... leftmost base of any mate to rightmost base of any mate. (+)sign for the mate with the leftmost base. This is different from 1 for overlapping mates with protruding ends
+    info:
+      orig_name: --outSAMtlen
     example: 1
-  - name: --outBAMcompression
+  - name: --out_bam_compression
     type: integer
     description: -1 to 10  BAM compression level, -1=default compression (6?), 0=no
       compression, 10=maximum compression
+    info:
+      orig_name: --outBAMcompression
     example: 1
-  - name: --outBAMsortingThreadN
+  - name: --out_bam_sorting_thread_n
     type: integer
     description: '>=0: number of threads for BAM sorting. 0 will default to min(6,--runThreadN).'
+    info:
+      orig_name: --outBAMsortingThreadN
     example: 0
-  - name: --outBAMsortingBinsN
+  - name: --out_bam_sorting_bins_n
     type: integer
     description: '>0:  number of genome bins for coordinate-sorting'
+    info:
+      orig_name: --outBAMsortingBinsN
     example: 50
 - name: BAM processing
   arguments:
-  - name: --bamRemoveDuplicatesType
+  - name: --bam_remove_duplicates_type
     type: string
     description: |-
       mark duplicates in the BAM file, for now only works with (i) sorted BAM fed with inputBAMfile, and (ii) for paired-end alignments only
@@ -527,17 +642,21 @@ argument_groups:
       - -                       ... no duplicate removal/marking
       - UniqueIdentical         ... mark all multimappers, and duplicate unique mappers. The coordinates, FLAG, CIGAR must be identical
       - UniqueIdenticalNotMulti  ... mark duplicate unique mappers but not multimappers.
-  - name: --bamRemoveDuplicatesMate2basesN
+    info:
+      orig_name: --bamRemoveDuplicatesType
+  - name: --bam_remove_duplicates_mate2bases_n
     type: integer
     description: number of bases from the 5' of mate 2 to use in collapsing (e.g.
       for RAMPAGE)
+    info:
+      orig_name: --bamRemoveDuplicatesMate2basesN
     example: 0
 - name: Output Wiggle
   arguments:
-  - name: --outWigType
+  - name: --out_wig_type
     type: string
     description: |-
-      type of signal output, e.g. "bedGraph" OR "bedGraph read1_5p". Requires sorted BAM: --outSAMtype BAM SortedByCoordinate .
+      type of signal output, e.g. "bedGraph" OR "bedGraph read1_5p". Requires sorted BAM: --out_sam_type BAM SortedByCoordinate .
 
       1st word:
       - None       ... no signal output
@@ -546,85 +665,112 @@ argument_groups:
       2nd word:
       - read1_5p   ... signal from only 5' of the 1st read, useful for CAGE/RAMPAGE etc
       - read2      ... signal from only 2nd read
-    multiple: yes
-    multiple_sep: ;
-  - name: --outWigStrand
+    info:
+      orig_name: --outWigType
+    multiple: true
+  - name: --out_wig_strand
     type: string
     description: |-
       strandedness of wiggle/bedGraph output
 
       - Stranded   ...  separate strands, str1 and str2
       - Unstranded ...  collapsed strands
+    info:
+      orig_name: --outWigStrand
     example: Stranded
-  - name: --outWigReferencesPrefix
+  - name: --out_wig_references_prefix
     type: string
     description: prefix matching reference names to include in the output wiggle file,
       e.g. "chr", default "-" - include all references
-  - name: --outWigNorm
+    info:
+      orig_name: --outWigReferencesPrefix
+  - name: --out_wig_norm
     type: string
     description: |-
       type of normalization for the signal
 
       - RPM    ... reads per million of mapped reads
       - None   ... no normalization, "raw" counts
+    info:
+      orig_name: --outWigNorm
     example: RPM
 - name: Output Filtering
   arguments:
-  - name: --outFilterType
+  - name: --out_filter_type
     type: string
     description: |-
       type of filtering
 
       - Normal  ... standard filtering using only current alignment
       - BySJout ... keep only those reads that contain junctions that passed filtering into SJ.out.tab
+    info:
+      orig_name: --outFilterType
     example: Normal
-  - name: --outFilterMultimapScoreRange
+  - name: --out_filter_multimap_score_range
     type: integer
     description: the score range below the maximum score for multimapping alignments
+    info:
+      orig_name: --outFilterMultimapScoreRange
     example: 1
-  - name: --outFilterMultimapNmax
+  - name: --out_filter_multimap_nmax
     type: integer
     description: |-
       maximum number of loci the read is allowed to map to. Alignments (all of them) will be output only if the read maps to no more loci than this value.
 
       Otherwise no alignments will be output, and the read will be counted as "mapped to too many loci" in the Log.final.out .
+    info:
+      orig_name: --outFilterMultimapNmax
     example: 10
-  - name: --outFilterMismatchNmax
+  - name: --out_filter_mismatch_nmax
     type: integer
     description: alignment will be output only if it has no more mismatches than this
       value.
+    info:
+      orig_name: --outFilterMismatchNmax
     example: 10
-  - name: --outFilterMismatchNoverLmax
+  - name: --out_filter_mismatch_nover_lmax
     type: double
     description: alignment will be output only if its ratio of mismatches to *mapped*
       length is less than or equal to this value.
+    info:
+      orig_name: --outFilterMismatchNoverLmax
     example: 0.3
-  - name: --outFilterMismatchNoverReadLmax
+  - name: --out_filter_mismatch_nover_read_lmax
     type: double
     description: alignment will be output only if its ratio of mismatches to *read*
       length is less than or equal to this value.
+    info:
+      orig_name: --outFilterMismatchNoverReadLmax
     example: 1.0
-  - name: --outFilterScoreMin
+  - name: --out_filter_score_min
     type: integer
     description: alignment will be output only if its score is higher than or equal
       to this value.
+    info:
+      orig_name: --outFilterScoreMin
     example: 0
-  - name: --outFilterScoreMinOverLread
+  - name: --out_filter_score_min_over_lread
     type: double
     description: same as outFilterScoreMin, but normalized to read length (sum of
       mates' lengths for paired-end reads)
+    info:
+      orig_name: --outFilterScoreMinOverLread
     example: 0.66
-  - name: --outFilterMatchNmin
+  - name: --out_filter_match_nmin
     type: integer
     description: alignment will be output only if the number of matched bases is higher
       than or equal to this value.
+    info:
+      orig_name: --outFilterMatchNmin
     example: 0
-  - name: --outFilterMatchNminOverLread
+  - name: --out_filter_match_nmin_over_lread
     type: double
     description: sam as outFilterMatchNmin, but normalized to the read length (sum
       of mates' lengths for paired-end reads).
+    info:
+      orig_name: --outFilterMatchNminOverLread
     example: 0.66
-  - name: --outFilterIntronMotifs
+  - name: --out_filter_intron_motifs
     type: string
     description: |-
       filter alignment using their motifs
@@ -632,244 +778,316 @@ argument_groups:
       - None                           ... no filtering
       - RemoveNoncanonical             ... filter out alignments that contain non-canonical junctions
       - RemoveNoncanonicalUnannotated  ... filter out alignments that contain non-canonical unannotated junctions when using annotated splice junctions database. The annotated non-canonical junctions will be kept.
-  - name: --outFilterIntronStrands
+    info:
+      orig_name: --outFilterIntronMotifs
+  - name: --out_filter_intron_strands
     type: string
     description: |-
       filter alignments
 
       - RemoveInconsistentStrands      ... remove alignments that have junctions with inconsistent strands
       - None                           ... no filtering
+    info:
+      orig_name: --outFilterIntronStrands
     example: RemoveInconsistentStrands
 - name: Output splice junctions (SJ.out.tab)
   arguments:
-  - name: --outSJtype
+  - name: --out_sj_type
     type: string
     description: |-
       type of splice junction output
 
       - Standard    ... standard SJ.out.tab output
       - None        ... no splice junction output
+    info:
+      orig_name: --outSJtype
     example: Standard
 - name: 'Output Filtering: Splice Junctions'
   arguments:
-  - name: --outSJfilterReads
+  - name: --out_sj_filter_reads
     type: string
     description: |-
       which reads to consider for collapsed splice junctions output
 
       - All     ... all reads, unique- and multi-mappers
       - Unique  ... uniquely mapping reads only
+    info:
+      orig_name: --outSJfilterReads
     example: All
-  - name: --outSJfilterOverhangMin
+  - name: --out_sj_filter_overhang_min
     type: integer
     description: |-
       minimum overhang length for splice junctions on both sides for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif
 
       does not apply to annotated junctions
+    info:
+      orig_name: --outSJfilterOverhangMin
     example:
     - 30
     - 12
     - 12
     - 12
-    multiple: yes
-    multiple_sep: ;
-  - name: --outSJfilterCountUniqueMin
+    multiple: true
+  - name: --out_sj_filter_count_unique_min
     type: integer
     description: |-
       minimum uniquely mapping read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif
 
       Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied
       does not apply to annotated junctions
+    info:
+      orig_name: --outSJfilterCountUniqueMin
     example:
     - 3
     - 1
     - 1
     - 1
-    multiple: yes
-    multiple_sep: ;
-  - name: --outSJfilterCountTotalMin
+    multiple: true
+  - name: --out_sj_filter_count_total_min
     type: integer
     description: |-
       minimum total (multi-mapping+unique) read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif
 
       Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied
       does not apply to annotated junctions
+    info:
+      orig_name: --outSJfilterCountTotalMin
     example:
     - 3
     - 1
     - 1
     - 1
-    multiple: yes
-    multiple_sep: ;
-  - name: --outSJfilterDistToOtherSJmin
+    multiple: true
+  - name: --out_sj_filter_dist_to_other_sj_min
     type: integer
     description: |-
       minimum allowed distance to other junctions' donor/acceptor
 
       does not apply to annotated junctions
+    info:
+      orig_name: --outSJfilterDistToOtherSJmin
     example:
     - 10
     - 0
     - 5
     - 10
-    multiple: yes
-    multiple_sep: ;
-  - name: --outSJfilterIntronMaxVsReadN
+    multiple: true
+  - name: --out_sj_filter_intron_max_vs_read_n
     type: integer
     description: |-
       maximum gap allowed for junctions supported by 1,2,3,,,N reads
 
       i.e. by default junctions supported by 1 read can have gaps <=50000b, by 2 reads: <=100000b, by 3 reads: <=200000. by >=4 reads any gap <=alignIntronMax
       does not apply to annotated junctions
+    info:
+      orig_name: --outSJfilterIntronMaxVsReadN
     example:
     - 50000
     - 100000
     - 200000
-    multiple: yes
-    multiple_sep: ;
+    multiple: true
 - name: Scoring
   arguments:
-  - name: --scoreGap
+  - name: --score_gap
     type: integer
     description: splice junction penalty (independent on intron motif)
+    info:
+      orig_name: --scoreGap
     example: 0
-  - name: --scoreGapNoncan
+  - name: --score_gap_noncan
     type: integer
     description: non-canonical junction penalty (in addition to scoreGap)
+    info:
+      orig_name: --scoreGapNoncan
     example: -8
-  - name: --scoreGapGCAG
+  - name: --score_gap_gcag
     type: integer
     description: GC/AG and CT/GC junction penalty (in addition to scoreGap)
+    info:
+      orig_name: --scoreGapGCAG
     example: -4
-  - name: --scoreGapATAC
+  - name: --score_gap_atac
     type: integer
     description: AT/AC  and GT/AT junction penalty  (in addition to scoreGap)
+    info:
+      orig_name: --scoreGapATAC
     example: -8
-  - name: --scoreGenomicLengthLog2scale
+  - name: --score_genomic_length_log2scale
     type: integer
     description: 'extra score logarithmically scaled with genomic length of the alignment:
       scoreGenomicLengthLog2scale*log2(genomicLength)'
+    info:
+      orig_name: --scoreGenomicLengthLog2scale
     example: 0
-  - name: --scoreDelOpen
+  - name: --score_del_open
     type: integer
     description: deletion open penalty
+    info:
+      orig_name: --scoreDelOpen
     example: -2
-  - name: --scoreDelBase
+  - name: --score_del_base
     type: integer
     description: deletion extension penalty per base (in addition to scoreDelOpen)
+    info:
+      orig_name: --scoreDelBase
     example: -2
-  - name: --scoreInsOpen
+  - name: --score_ins_open
     type: integer
     description: insertion open penalty
+    info:
+      orig_name: --scoreInsOpen
     example: -2
-  - name: --scoreInsBase
+  - name: --score_ins_base
     type: integer
     description: insertion extension penalty per base (in addition to scoreInsOpen)
+    info:
+      orig_name: --scoreInsBase
     example: -2
-  - name: --scoreStitchSJshift
+  - name: --score_stitch_sj_shift
     type: integer
     description: maximum score reduction while searching for SJ boundaries in the
       stitching step
+    info:
+      orig_name: --scoreStitchSJshift
     example: 1
 - name: Alignments and Seeding
   arguments:
-  - name: --seedSearchStartLmax
+  - name: --seed_search_start_lmax
     type: integer
     description: defines the search start point through the read - the read is split
       into pieces no longer than this value
+    info:
+      orig_name: --seedSearchStartLmax
     example: 50
-  - name: --seedSearchStartLmaxOverLread
+  - name: --seed_search_start_lmax_over_lread
     type: double
     description: seedSearchStartLmax normalized to read length (sum of mates' lengths
       for paired-end reads)
+    info:
+      orig_name: --seedSearchStartLmaxOverLread
     example: 1.0
-  - name: --seedSearchLmax
+  - name: --seed_search_lmax
     type: integer
     description: defines the maximum length of the seeds, if =0 seed length is not
       limited
+    info:
+      orig_name: --seedSearchLmax
     example: 0
-  - name: --seedMultimapNmax
+  - name: --seed_multimap_nmax
     type: integer
     description: only pieces that map fewer than this value are utilized in the stitching
       procedure
+    info:
+      orig_name: --seedMultimapNmax
     example: 10000
-  - name: --seedPerReadNmax
+  - name: --seed_per_read_nmax
     type: integer
     description: max number of seeds per read
+    info:
+      orig_name: --seedPerReadNmax
     example: 1000
-  - name: --seedPerWindowNmax
+  - name: --seed_per_window_nmax
     type: integer
     description: max number of seeds per window
+    info:
+      orig_name: --seedPerWindowNmax
     example: 50
-  - name: --seedNoneLociPerWindow
+  - name: --seed_none_loci_per_window
     type: integer
     description: max number of one seed loci per window
+    info:
+      orig_name: --seedNoneLociPerWindow
     example: 10
-  - name: --seedSplitMin
+  - name: --seed_split_min
     type: integer
     description: min length of the seed sequences split by Ns or mate gap
+    info:
+      orig_name: --seedSplitMin
     example: 12
-  - name: --seedMapMin
+  - name: --seed_map_min
     type: integer
     description: min length of seeds to be mapped
+    info:
+      orig_name: --seedMapMin
     example: 5
-  - name: --alignIntronMin
+  - name: --align_intron_min
     type: integer
     description: minimum intron size, genomic gap is considered intron if its length>=alignIntronMin,
       otherwise it is considered Deletion
+    info:
+      orig_name: --alignIntronMin
     example: 21
-  - name: --alignIntronMax
+  - name: --align_intron_max
     type: integer
     description: maximum intron size, if 0, max intron size will be determined by
       (2^winBinNbits)*winAnchorDistNbins
+    info:
+      orig_name: --alignIntronMax
     example: 0
-  - name: --alignMatesGapMax
+  - name: --align_mates_gap_max
     type: integer
     description: maximum gap between two mates, if 0, max intron gap will be determined
       by (2^winBinNbits)*winAnchorDistNbins
+    info:
+      orig_name: --alignMatesGapMax
     example: 0
-  - name: --alignSJoverhangMin
+  - name: --align_sj_overhang_min
     type: integer
     description: minimum overhang (i.e. block size) for spliced alignments
+    info:
+      orig_name: --alignSJoverhangMin
     example: 5
-  - name: --alignSJstitchMismatchNmax
+  - name: --align_sj_stitch_mismatch_nmax
     type: integer
     description: |-
       maximum number of mismatches for stitching of the splice junctions (-1: no limit).
 
       (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif.
+    info:
+      orig_name: --alignSJstitchMismatchNmax
     example:
     - 0
     - -1
     - 0
     - 0
-    multiple: yes
-    multiple_sep: ;
-  - name: --alignSJDBoverhangMin
+    multiple: true
+  - name: --align_sjdb_overhang_min
     type: integer
     description: minimum overhang (i.e. block size) for annotated (sjdb) spliced alignments
+    info:
+      orig_name: --alignSJDBoverhangMin
     example: 3
-  - name: --alignSplicedMateMapLmin
+  - name: --align_spliced_mate_map_lmin
     type: integer
     description: minimum mapped length for a read mate that is spliced
+    info:
+      orig_name: --alignSplicedMateMapLmin
     example: 0
-  - name: --alignSplicedMateMapLminOverLmate
+  - name: --align_spliced_mate_map_lmin_over_lmate
     type: double
     description: alignSplicedMateMapLmin normalized to mate length
+    info:
+      orig_name: --alignSplicedMateMapLminOverLmate
     example: 0.66
-  - name: --alignWindowsPerReadNmax
+  - name: --align_windows_per_read_nmax
     type: integer
     description: max number of windows per read
+    info:
+      orig_name: --alignWindowsPerReadNmax
     example: 10000
-  - name: --alignTranscriptsPerWindowNmax
+  - name: --align_transcripts_per_window_nmax
     type: integer
     description: max number of transcripts per window
+    info:
+      orig_name: --alignTranscriptsPerWindowNmax
     example: 100
-  - name: --alignTranscriptsPerReadNmax
+  - name: --align_transcripts_per_read_nmax
     type: integer
     description: max number of different alignments per read to consider
+    info:
+      orig_name: --alignTranscriptsPerReadNmax
     example: 10000
-  - name: --alignEndsType
+  - name: --align_ends_type
     type: string
     description: |-
       type of read ends alignment
@@ -878,8 +1096,10 @@ argument_groups:
       - EndToEnd          ... force end-to-end read alignment, do not soft-clip
       - Extend5pOfRead1   ... fully extend only the 5p of the read1, all other ends: local alignment
       - Extend5pOfReads12 ... fully extend only the 5p of the both read1 and read2, all other ends: local alignment
+    info:
+      orig_name: --alignEndsType
     example: Local
-  - name: --alignEndsProtrude
+  - name: --align_ends_protrude
     type: string
     description: |-
       allow protrusion of alignment ends, i.e. start (end) of the +strand mate downstream of the start (end) of the -strand mate
@@ -888,68 +1108,90 @@ argument_groups:
       2nd word: string:
       -                     ConcordantPair ... report alignments with non-zero protrusion as concordant pairs
       -                     DiscordantPair ... report alignments with non-zero protrusion as discordant pairs
+    info:
+      orig_name: --alignEndsProtrude
     example: 0    ConcordantPair
-  - name: --alignSoftClipAtReferenceEnds
+  - name: --align_soft_clip_at_reference_ends
     type: string
     description: |-
       allow the soft-clipping of the alignments past the end of the chromosomes
 
       - Yes ... allow
       - No  ... prohibit, useful for compatibility with Cufflinks
+    info:
+      orig_name: --alignSoftClipAtReferenceEnds
     example: 'Yes'
-  - name: --alignInsertionFlush
+  - name: --align_insertion_flush
     type: string
     description: |-
       how to flush ambiguous insertion positions
 
       - None    ... insertions are not flushed
       - Right   ... insertions are flushed to the right
+    info:
+      orig_name: --alignInsertionFlush
 - name: Paired-End reads
   arguments:
-  - name: --peOverlapNbasesMin
+  - name: --pe_overlap_nbases_min
     type: integer
     description: minimum number of overlapping bases to trigger mates merging and
       realignment. Specify >0 value to switch on the "merginf of overlapping mates"
       algorithm.
+    info:
+      orig_name: --peOverlapNbasesMin
     example: 0
-  - name: --peOverlapMMp
+  - name: --pe_overlap_mm_p
     type: double
     description: maximum proportion of mismatched bases in the overlap area
+    info:
+      orig_name: --peOverlapMMp
     example: 0.01
 - name: Windows, Anchors, Binning
   arguments:
-  - name: --winAnchorMultimapNmax
+  - name: --win_anchor_multimap_nmax
     type: integer
     description: max number of loci anchors are allowed to map to
+    info:
+      orig_name: --winAnchorMultimapNmax
     example: 50
-  - name: --winBinNbits
+  - name: --win_bin_nbits
     type: integer
     description: =log2(winBin), where winBin is the size of the bin for the windows/clustering,
       each window will occupy an integer number of bins.
+    info:
+      orig_name: --winBinNbits
     example: 16
-  - name: --winAnchorDistNbins
+  - name: --win_anchor_dist_nbins
     type: integer
     description: max number of bins between two anchors that allows aggregation of
       anchors into one window
+    info:
+      orig_name: --winAnchorDistNbins
     example: 9
-  - name: --winFlankNbins
+  - name: --win_flank_nbins
     type: integer
     description: log2(winFlank), where win Flank is the size of the left and right
       flanking regions for each window
+    info:
+      orig_name: --winFlankNbins
     example: 4
-  - name: --winReadCoverageRelativeMin
+  - name: --win_read_coverage_relative_min
     type: double
     description: minimum relative coverage of the read sequence by the seeds in a
       window, for STARlong algorithm only.
+    info:
+      orig_name: --winReadCoverageRelativeMin
     example: 0.5
-  - name: --winReadCoverageBasesMin
+  - name: --win_read_coverage_bases_min
     type: integer
     description: minimum number of bases covered by the seeds in a window , for STARlong
       algorithm only.
+    info:
+      orig_name: --winReadCoverageBasesMin
     example: 0
 - name: Chimeric Alignments
   arguments:
-  - name: --chimOutType
+  - name: --chim_out_type
     type: string
     description: |-
       type of chimeric output
@@ -959,83 +1201,109 @@ argument_groups:
       - WithinBAM       ... output into main aligned BAM files (Aligned.*.bam)
       - WithinBAM HardClip  ... (default) hard-clipping in the CIGAR for supplemental chimeric alignments (default if no 2nd word is present)
       - WithinBAM SoftClip  ... soft-clipping in the CIGAR for supplemental chimeric alignments
+    info:
+      orig_name: --chimOutType
     example: Junctions
-    multiple: yes
-    multiple_sep: ;
-  - name: --chimSegmentMin
+    multiple: true
+  - name: --chim_segment_min
     type: integer
     description: minimum length of chimeric segment length, if ==0, no chimeric output
+    info:
+      orig_name: --chimSegmentMin
     example: 0
-  - name: --chimScoreMin
+  - name: --chim_score_min
     type: integer
     description: minimum total (summed) score of the chimeric segments
+    info:
+      orig_name: --chimScoreMin
     example: 0
-  - name: --chimScoreDropMax
+  - name: --chim_score_drop_max
     type: integer
     description: max drop (difference) of chimeric score (the sum of scores of all
       chimeric segments) from the read length
+    info:
+      orig_name: --chimScoreDropMax
     example: 20
-  - name: --chimScoreSeparation
+  - name: --chim_score_separation
     type: integer
     description: minimum difference (separation) between the best chimeric score and
       the next one
+    info:
+      orig_name: --chimScoreSeparation
     example: 10
-  - name: --chimScoreJunctionNonGTAG
+  - name: --chim_score_junction_non_gtag
     type: integer
     description: penalty for a non-GT/AG chimeric junction
+    info:
+      orig_name: --chimScoreJunctionNonGTAG
     example: -1
-  - name: --chimJunctionOverhangMin
+  - name: --chim_junction_overhang_min
     type: integer
     description: minimum overhang for a chimeric junction
+    info:
+      orig_name: --chimJunctionOverhangMin
     example: 20
-  - name: --chimSegmentReadGapMax
+  - name: --chim_segment_read_gap_max
     type: integer
     description: maximum gap in the read sequence between chimeric segments
+    info:
+      orig_name: --chimSegmentReadGapMax
     example: 0
-  - name: --chimFilter
+  - name: --chim_filter
     type: string
     description: |-
       different filters for chimeric alignments
 
       - None ... no filtering
       - banGenomicN ... Ns are not allowed in the genome sequence around the chimeric junction
+    info:
+      orig_name: --chimFilter
     example: banGenomicN
-    multiple: yes
-    multiple_sep: ;
-  - name: --chimMainSegmentMultNmax
+    multiple: true
+  - name: --chim_main_segment_mult_nmax
     type: integer
     description: maximum number of multi-alignments for the main chimeric segment.
       =1 will prohibit multimapping main segments.
+    info:
+      orig_name: --chimMainSegmentMultNmax
     example: 10
-  - name: --chimMultimapNmax
+  - name: --chim_multimap_nmax
     type: integer
     description: |-
       maximum number of chimeric multi-alignments
 
       - 0 ... use the old scheme for chimeric detection which only considered unique alignments
+    info:
+      orig_name: --chimMultimapNmax
     example: 0
-  - name: --chimMultimapScoreRange
+  - name: --chim_multimap_score_range
     type: integer
     description: the score range for multi-mapping chimeras below the best chimeric
-      score. Only works with --chimMultimapNmax > 1
+      score. Only works with --chim_multimap_nmax > 1
+    info:
+      orig_name: --chimMultimapScoreRange
     example: 1
-  - name: --chimNonchimScoreDropMin
+  - name: --chim_nonchim_score_drop_min
     type: integer
     description: to trigger chimeric detection, the drop in the best non-chimeric
       alignment score with respect to the read length has to be greater than this
       value
+    info:
+      orig_name: --chimNonchimScoreDropMin
     example: 20
-  - name: --chimOutJunctionFormat
+  - name: --chim_out_junction_format
     type: integer
     description: |-
       formatting type for the Chimeric.out.junction file
 
       - 0 ... no comment lines/headers
       - 1 ... comment lines at the end of the file: command line and Nreads: total, unique/multi-mapping
+    info:
+      orig_name: --chimOutJunctionFormat
     example: 0
 - name: Quantification of Annotations
   arguments:
-  - name: --quantMode
+  - name: --quant_mode
     type: string
     description: |-
       types of quantification requested
@@ -1043,9 +1311,10 @@ argument_groups:
       - -                ... none
       - TranscriptomeSAM ... output SAM/BAM alignments to transcriptome into a separate file
       - GeneCounts       ... count reads per gene
-    multiple: yes
-    multiple_sep: ;
-  - name: --quantTranscriptomeBAMcompression
+    info:
+      orig_name: --quantMode
+    multiple: true
+  - name: --quant_transcriptome_bam_compression
     type: integer
     description: |-
       -2 to 10  transcriptome BAM compression level
@@ -1054,8 +1323,10 @@ argument_groups:
       - -1  ... default compression (6?)
       -  0  ... no compression
       -  10 ... maximum compression
+    info:
+      orig_name: --quantTranscriptomeBAMcompression
     example: 1
-  - name: --quantTranscriptomeSAMoutput
+  - name: --quant_transcriptome_sam_output
     type: string
     description: |-
       alignment filtering for TranscriptomeSAM output
@@ -1063,26 +1334,301 @@ argument_groups:
       - BanSingleEnd_BanIndels_ExtendSoftclip ... prohibit indels and single-end alignments, extend softclips - compatible with RSEM
       - BanSingleEnd               ... prohibit single-end alignments, allow indels and softclips
       - BanSingleEnd_ExtendSoftclip ... prohibit single-end alignments, extend softclips, allow indels
+    info:
+      orig_name: --quantTranscriptomeSAMoutput
     example: BanSingleEnd_BanIndels_ExtendSoftclip
 - name: 2-pass Mapping
   arguments:
-  - name: --twopassMode
+  - name: --twopass_mode
     type: string
     description: |-
       2-pass mapping mode.
 
       - None        ... 1-pass mapping
       - Basic       ... basic 2-pass mapping, with all 1st pass junctions inserted into the genome indices on the fly
-  - name: --twopass1readsN
+    info:
+      orig_name: --twopassMode
+  - name: --twopass1reads_n
     type: integer
     description: number of reads to process for the 1st step. Use very large number
       (or default -1) to map all reads in the first step.
+    info:
+      orig_name: --twopass1readsN
     example: -1
 - name: WASP parameters
   arguments:
-  - name: --waspOutputMode
+  - name: --wasp_output_mode
     type: string
     description: |-
       WASP allele-specific output type. This is re-implementation of the original WASP mappability filtering by Bryce van de Geijn, Graham McVicker, Yoav Gilad & Jonathan K Pritchard. Please cite the original WASP paper: Nature Methods 12, 1061-1063 (2015), https://www.nature.com/articles/nmeth.3582 .
 
       - SAMtag      ... add WASP tags to the alignments that pass WASP filtering
+    info:
+      orig_name: --waspOutputMode
+- name: STARsolo (single cell RNA-seq) parameters
+  arguments:
+  - name: --solo_type
+    type: string
+    description: |-
+      type of single-cell RNA-seq
+
+      - CB_UMI_Simple   ... (a.k.a. Droplet) one UMI and one Cell Barcode of fixed length in read2, e.g. Drop-seq and 10X Chromium.
+      - CB_UMI_Complex  ... multiple Cell Barcodes of varying length, one UMI of fixed length and one adapter sequence of fixed length are allowed in read2 only (e.g. inDrop, ddSeq).
+      - CB_samTagOut    ... output Cell Barcode as CR and/or CB SAm tag. No UMI counting. --readFilesIn cDNA_read1 [cDNA_read2 if paired-end] CellBarcode_read . Requires --out_sam_type BAM Unsorted [and/or SortedByCoordinate]
+      - SmartSeq        ... Smart-seq: each cell in a separate FASTQ (paired- or single-end), barcodes are corresponding read-groups, no UMI sequences, alignments deduplicated according to alignment start and end (after extending soft-clipped bases)
+    info:
+      orig_name: --soloType
+    multiple: true
+  - name: --solo_cb_type
+    type: string
+    description: |-
+      cell barcode type
+
+      Sequence: cell barcode is a sequence (standard option)
+      String: cell barcode is an arbitrary string
+    info:
+      orig_name: --soloCBtype
+    example: Sequence
+  - name: --solo_cb_whitelist
+    type: string
+    description: |-
+      file(s) with whitelist(s) of cell barcodes. Only --solo_type CB_UMI_Complex allows more than one whitelist file.
+
+      - None            ... no whitelist: all cell barcodes are allowed
+    info:
+      orig_name: --soloCBwhitelist
+    multiple: true
+  - name: --solo_cb_start
+    type: integer
+    description: cell barcode start base
+    info:
+      orig_name: --soloCBstart
+    example: 1
+  - name: --solo_cb_len
+    type: integer
+    description: cell barcode length
+    info:
+      orig_name: --soloCBlen
+    example: 16
+  - name: --solo_umi_start
+    type: integer
+    description: UMI start base
+    info:
+      orig_name: --soloUMIstart
+    example: 17
+  - name: --solo_umi_len
+    type: integer
+    description: UMI length
+    info:
+      orig_name: --soloUMIlen
+    example: 10
+  - name: --solo_barcode_read_length
+    type: integer
+    description: |-
+      length of the barcode read
+
+      - 1   ... equal to sum of soloCBlen+soloUMIlen
+      - 0   ... not defined, do not check
+    info:
+      orig_name: --soloBarcodeReadLength
+    example: 1
+  - name: --solo_barcode_mate
+    type: integer
+    description: |-
+      identifies which read mate contains the barcode (CB+UMI) sequence
+
+      - 0   ... barcode sequence is on separate read, which should always be the last file in the --readFilesIn listed
+      - 1   ... barcode sequence is a part of mate 1
+      - 2   ... barcode sequence is a part of mate 2
+    info:
+      orig_name: --soloBarcodeMate
+    example: 0
+  - name: --solo_cb_position
+    type: string
+    description: |-
+      position of Cell Barcode(s) on the barcode read.
+
+      Presently only works with --solo_type CB_UMI_Complex, and barcodes are assumed to be on Read2.
+      Format for each barcode: startAnchor_startPosition_endAnchor_endPosition
+      start(end)Anchor defines the Anchor Base for the CB: 0: read start; 1: read end; 2: adapter start; 3: adapter end
+      start(end)Position is the 0-based position with of the CB start(end) with respect to the Anchor Base
+      String for different barcodes are separated by space.
+      Example: inDrop (Zilionis et al, Nat. Protocols, 2017):
+      --solo_cb_position  0_0_2_-1  3_1_3_8
+    info:
+      orig_name: --soloCBposition
+    multiple: true
+  - name: --solo_umi_position
+    type: string
+    description: |-
+      position of the UMI on the barcode read, same as soloCBposition
+
+      Example: inDrop (Zilionis et al, Nat. Protocols, 2017):
+      --solo_cb_position  3_9_3_14
+    info:
+      orig_name: --soloUMIposition
+  - name: --solo_adapter_sequence
+    type: string
+    description: adapter sequence to anchor barcodes. Only one adapter sequence is
+      allowed.
+    info:
+      orig_name: --soloAdapterSequence
+  - name: --solo_adapter_mismatches_nmax
+    type: integer
+    description: maximum number of mismatches allowed in adapter sequence.
+    info:
+      orig_name: --soloAdapterMismatchesNmax
+    example: 1
+  - name: --solo_cb_match_wl_type
+    type: string
+    description: |-
+      matching the Cell Barcodes to the WhiteList
+
+      - Exact                           ... only exact matches allowed
+      - 1MM                             ... only one match in whitelist with 1 mismatched base allowed. Allowed CBs have to have at least one read with exact match.
+      - 1MM_multi                       ... multiple matches in whitelist with 1 mismatched base allowed, posterior probability calculation is used choose one of the matches.
+      Allowed CBs have to have at least one read with exact match. This option matches best with CellRanger 2.2.0
+      - 1MM_multi_pseudocounts          ... same as 1MM_Multi, but pseudocounts of 1 are added to all whitelist barcodes.
+      - 1MM_multi_Nbase_pseudocounts    ... same as 1MM_multi_pseudocounts, multimatching to WL is allowed for CBs with N-bases. This option matches best with CellRanger >= 3.0.0
+      - EditDist_2                    ... allow up to edit distance of 3 fpr each of the barcodes. May include one deletion + one insertion. Only works with --solo_type CB_UMI_Complex. Matches to multiple passlist barcdoes are not allowed. Similar to ParseBio Split-seq pipeline.
+    info:
+      orig_name: --soloCBmatchWLtype
+    example: 1MM_multi
+  - name: --solo_input_sam_attr_barcode_seq
+    type: string
+    description: |-
+      when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode sequence (in proper order).
+
+      For instance, for 10X CellRanger or STARsolo BAMs, use --solo_input_sam_attr_barcode_seq CR UR .
+      This parameter is required when running STARsolo with input from SAM.
+    info:
+      orig_name: --soloInputSAMattrBarcodeSeq
+    multiple: true
+  - name: --solo_input_sam_attr_barcode_qual
+    type: string
+    description: |-
+      when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode qualities (in proper order).
+
+      For instance, for 10X CellRanger or STARsolo BAMs, use --solo_input_sam_attr_barcode_qual CY UY .
+      If this parameter is '-' (default), the quality 'H' will be assigned to all bases.
+    info:
+      orig_name: --soloInputSAMattrBarcodeQual
+    multiple: true
+  - name: --solo_strand
+    type: string
+    description: |-
+      strandedness of the solo libraries:
+
+      - Unstranded  ... no strand information
+      - Forward     ... read strand same as the original RNA molecule
+      - Reverse     ... read strand opposite to the original RNA molecule
+    info:
+      orig_name: --soloStrand
+    example: Forward
+  - name: --solo_features
+    type: string
+    description: |-
+      genomic features for which the UMI counts per Cell Barcode are collected
+
+      - Gene            ... genes: reads match the gene transcript
+      - SJ              ... splice junctions: reported in SJ.out.tab
+      - GeneFull        ... full gene (pre-mRNA): count all reads overlapping genes' exons and introns
+      - GeneFull_ExonOverIntron ... full gene (pre-mRNA): count all reads overlapping genes' exons and introns: prioritize 100% overlap with exons
+      - GeneFull_Ex50pAS        ... full gene (pre-RNA): count all reads overlapping genes' exons and introns: prioritize >50% overlap with exons. Do not count reads with 100% exonic overlap in the antisense direction.
+    info:
+      orig_name: --soloFeatures
+    example: Gene
+    multiple: true
+  - name: --solo_multi_mappers
+    type: string
+    description: |-
+      counting method for reads mapping to multiple genes
+
+      - Unique     ... count only reads that map to unique genes
+      - Uniform    ... uniformly distribute multi-genic UMIs to all genes
+      - Rescue     ... distribute UMIs proportionally to unique+uniform counts (~ first iteration of EM)
+      - PropUnique ... distribute UMIs proportionally to unique mappers, if present, and uniformly if not.
+      - EM         ... multi-gene UMIs are distributed using Expectation Maximization algorithm
+    info:
+      orig_name: --soloMultiMappers
+    example: Unique
+    multiple: true
+  - name: --solo_umi_dedup
+    type: string
+    description: |-
+      type of UMI deduplication (collapsing) algorithm
+
+      - 1MM_All                     ... all UMIs with 1 mismatch distance to each other are collapsed (i.e. counted once).
+      - 1MM_Directional_UMItools    ... follows the "directional" method from the UMI-tools by Smith, Heger and Sudbery (Genome Research 2017).
+      - 1MM_Directional             ... same as 1MM_Directional_UMItools, but with more stringent criteria for duplicate UMIs
+      - Exact                       ... only exactly matching UMIs are collapsed.
+      - NoDedup                     ... no deduplication of UMIs, count all reads.
+      - 1MM_CR                      ... CellRanger2-4 algorithm for 1MM UMI collapsing.
+    info:
+      orig_name: --soloUMIdedup
+    example: 1MM_All
+    multiple: true
+  - name: --solo_umi_filtering
+    type: string
+    description: |-
+      type of UMI filtering (for reads uniquely mapping to genes)
+
+      - -                  ... basic filtering: remove UMIs with N and homopolymers (similar to CellRanger 2.2.0).
+      - MultiGeneUMI       ... basic + remove lower-count UMIs that map to more than one gene.
+      - MultiGeneUMI_All   ... basic + remove all UMIs that map to more than one gene.
+      - MultiGeneUMI_CR    ... basic + remove lower-count UMIs that map to more than one gene, matching CellRanger > 3.0.0 .
+      Only works with --solo_umi_dedup 1MM_CR
+    info:
+      orig_name: --soloUMIfiltering
+    multiple: true
+  - name: --solo_out_file_names
+    type: string
+    description: |-
+      file names for STARsolo output:
+
+      file_name_prefix   gene_names   barcode_sequences   cell_feature_count_matrix
+    info:
+      orig_name: --soloOutFileNames
+    example:
+    - Solo.out/
+    - features.tsv
+    - barcodes.tsv
+    - matrix.mtx
+    multiple: true
+  - name: --solo_cell_filter
+    type: string
+    description: |-
+      cell filtering type and parameters
+
+      - None            ... do not output filtered cells
+      - TopCells        ... only report top cells by UMI count, followed by the exact number of cells
+      - CellRanger2.2   ... simple filtering of CellRanger 2.2.
+      Can be followed by numbers: number of expected cells, robust maximum percentile for UMI count, maximum to minimum ratio for UMI count
+      The harcoded values are from CellRanger: nExpectedCells=3000;  maxPercentile=0.99;  maxMinRatio=10
+      - EmptyDrops_CR   ... EmptyDrops filtering in CellRanger flavor. Please cite the original EmptyDrops paper: A.T.L Lun et al, Genome Biology, 20, 63 (2019): https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1662-y
+      Can be followed by 10 numeric parameters:  nExpectedCells   maxPercentile   maxMinRatio   indMin   indMax   umiMin   umiMinFracMedian   candMaxN   FDR   simN
+      The harcoded values are from CellRanger:             3000            0.99            10    45000    90000      500               0.01      20000  0.01  10000
+    info:
+      orig_name: --soloCellFilter
+    example:
+    - CellRanger2.2
+    - '3000'
+    - '0.99'
+    - '10'
+    multiple: true
+  - name: --solo_out_format_features_gene_field3
+    type: string
+    description: field 3 in the Gene features.tsv file. If "-", then no 3rd field
+      is output.
+    info:
+      orig_name: --soloOutFormatFeaturesGeneField3
+    example: Gene Expression
+    multiple: true
+  - name: --solo_cell_read_stats
+    type: string
+    description: |-
+      Output reads statistics for each CB
+
+      - Standard    ... standard output
+    info:
+      orig_name: --soloCellReadStats
diff --git a/src/star/star_align_reads/config.vsh.yaml b/src/star/star_align_reads/config.vsh.yaml
index bdc956d3..a9a845a1 100644
--- a/src/star/star_align_reads/config.vsh.yaml
+++ b/src/star/star_align_reads/config.vsh.yaml
@@ -118,6 +118,8 @@ engines:
             rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip && \
             apt-get --purge autoremove -y ${PACKAGES} && \
             apt-get clean
+      - type: python
+        packages: [ pyyaml ]
       - type: docker
         run: |
           STAR --version | sed 's#\(.*\)#star: "\1"#' > /var/software_versions.txt
diff --git a/src/star/star_align_reads/script.py b/src/star/star_align_reads/script.py
index f3d64a57..4d9d046f 100644
--- a/src/star/star_align_reads/script.py
+++ b/src/star/star_align_reads/script.py
@@ -2,6 +2,7 @@
 import subprocess
 import shutil
 from pathlib import Path
+import yaml
 
 ## VIASH START
 par = {
@@ -18,10 +19,20 @@
 }
 meta = {
     "cpus": 8,
-    "temp_dir": "/tmp"
+    "temp_dir": "/tmp",
+    "config": "target/executable/star/star_align_reads/.config.vsh.yaml",
 }
 ## VIASH END
 
+# read config
+with open(meta["config"], 'r') as stream:
+    config = yaml.safe_load(stream)
+all_arguments = {
+    arg["name"].lstrip('-'): arg
+    for argument_group in config["argument_groups"]
+    for arg in argument_group["arguments"]
+}
+
 ##################################################
 # check and process SE / PE R1 input files
 input_r1 = par["input"]
@@ -87,8 +98,13 @@
     cmd_args = [ "STAR" ]
     for name, value in par.items():
         if value is not None:
+            if name in all_arguments:
+                arg_info = all_arguments[name].get("info", {})
+                cli_name = arg_info.get("orig_name", f"--{name}")
+            else:
+                cli_name = f"--{name}"
             val_to_add = value if isinstance(value, list) else [value]
-            cmd_args.extend([f"--{name}"] + [str(x) for x in val_to_add])
+            cmd_args.extend([cli_name] + [str(x) for x in val_to_add])
     print("", flush=True)
 
     # run command
diff --git a/src/star/star_align_reads/test.sh b/src/star/star_align_reads/test.sh
index bd78094d..46566ec0 100644
--- a/src/star/star_align_reads/test.sh
+++ b/src/star/star_align_reads/test.sh
@@ -88,14 +88,14 @@ cd star_align_reads_se
 echo "> Run star_align_reads on SE"
 "$meta_executable" \
   --input "../reads_R1.fastq" \
-  --genomeDir "../index/" \
+  --genome_dir "../index/" \
   --aligned_reads "output.sam" \
   --log "log.txt" \
-  --outReadsUnmapped "Fastx" \
+  --out_reads_unmapped "Fastx" \
   --unmapped "unmapped.sam" \
-  --quantMode "TranscriptomeSAM;GeneCounts" \
+  --quant_mode "TranscriptomeSAM;GeneCounts" \
   --reads_per_gene "reads_per_gene.tsv" \
-  --outSJtype Standard \
+  --out_sj_type Standard \
   --splice_junctions "splice_junctions.tsv" \
   --reads_aligned_to_transcriptome "transcriptome_aligned.bam" \
   ${meta_cpus:+---cpus $meta_cpus}
@@ -143,10 +143,10 @@ echo ">> Run star_align_reads on PE"
 "$meta_executable" \
   --input ../reads_R1.fastq \
   --input_r2 ../reads_R2.fastq \
-  --genomeDir ../index/ \
+  --genome_dir ../index/ \
   --aligned_reads output.bam \
   --log log.txt \
-  --outReadsUnmapped Fastx \
+  --out_reads_unmapped Fastx \
   --unmapped unmapped_r1.bam \
   --unmapped_r2 unmapped_r2.bam \
   ${meta_cpus:+---cpus $meta_cpus}
diff --git a/src/star/star_align_reads/utils/process_params.R b/src/star/star_align_reads/utils/process_params.R
index ccdc50b3..eee1db65 100644
--- a/src/star/star_align_reads/utils/process_params.R
+++ b/src/star/star_align_reads/utils/process_params.R
@@ -14,6 +14,14 @@ param_txt <- iconv(param_txt, "UTF-8", "ASCII//TRANSLIT")
 dev_begin <- grep("#####UnderDevelopment_begin", param_txt)
 dev_end <- grep("#####UnderDevelopment_end", param_txt)
 
+camel_case_to_snake_case <- function(x) {
+  x %>%
+    str_replace_all("([A-Z][A-Z][A-Z]*)", "_\\1_") %>%
+    str_replace_all("([a-z])([A-Z])", "\\1_\\2") %>%
+    str_to_lower() %>%
+    str_replace_all("_$", "")
+}
+
 # strip development sections
 nondev_ix <- unlist(map2(c(1, dev_end + 1), c(dev_begin - 1, length(param_txt)), function(i, j) {
   if (i >= 1 && i < j) {
@@ -128,9 +136,8 @@ out2 <- out %>%
   # remove arguments that are related to a different runmode
   filter(!grepl("--runMode", description) | grepl("--runMode alignReads", description)) %>%
   filter(!grepl("--runMode", group_name) | grepl("--runMode alignReads", group_name)) %>%
-  filter(!grepl("STARsolo", group_name)) %>%
   mutate(
-    viash_arg = paste0("--", name),
+    viash_arg = paste0("--", camel_case_to_snake_case(name)),
     type_step1 = type %>%
       str_replace_all(".*(int, string|string|int|real|double)\\(?(s?).*", "\\1\\2"),
     viash_type = type_map[gsub("(int, string|string|int|real|double).*", "\\1", type_step1)],
@@ -155,28 +162,41 @@ out2 <- out %>%
     group_name = gsub(" - .*", "", group_name),
     required = ifelse(name %in% required_args, TRUE, NA)
   )
-print(out2, n = 200)
-out2 %>% mutate(i = row_number()) %>% 
-  # filter(is.na(default_step1) != is.na(viash_default)) %>%
-  select(-group_name, -description)
 
-out2 %>% filter(!grepl("--runMode", description) | grepl("--runMode alignReads", description))
+# change references to argument names
+out3 <- out2
+for (i in seq_len(nrow(out2))) {
+  orig_name <- paste0("--", out2$name[[i]])
+  new_name <- out2$viash_arg[[i]]
+  out3$description <- str_replace_all(out3$description, orig_name, new_name)
+}
+
+# sanity checks
+out3 %>% select(name, viash_arg) %>% as.data.frame()
+print(out3, n = 200)
+out3 %>%
+  mutate(i = row_number()) %>%
+  select(-group_name, -description)
+out3 %>% filter(!grepl("--runMode", description) | grepl("--runMode alignReads", description))
 
-argument_groups <- map(unique(out2$group_name), function(group_name) {
-  args <- out2 %>%
+# create argument groups
+argument_groups <- map(unique(out3$group_name), function(group_name) {
+  args <- out3 %>%
     filter(group_name == !!group_name) %>%
-    pmap(function(viash_arg, viash_type, multiple, viash_default, description, required, ...) {
-      li <- lst(
+    pmap(function(viash_arg, viash_type, multiple, viash_default, description, required, name, ...) {
+      li <- list(
         name = viash_arg,
         type = viash_type,
-        description = description
+        description = description,
+        info = list(
+          orig_name = paste0("--", name)
+        )
       )
       if (all(!is.na(viash_default))) {
         li$example <- viash_default
       }
       if (!is.na(multiple) && multiple) {
         li$multiple <- multiple
-        li$multiple_sep <- ";"
       }
       if (!is.na(required) && required) {
         li$required <- required
@@ -186,4 +206,10 @@ argument_groups <- map(unique(out2$group_name), function(group_name) {
   list(name = group_name, arguments = args)
 })
 
-yaml::write_yaml(list(argument_groups = argument_groups), yaml_file)
+yaml::write_yaml(
+  list(argument_groups = argument_groups),
+  yaml_file,
+  handlers = list(
+    logical = yaml::verbatim_logical
+  )
+)
diff --git a/src/star/star_genome_generate/config.vsh.yaml b/src/star/star_genome_generate/config.vsh.yaml
index 60fa3839..71c58826 100644
--- a/src/star/star_genome_generate/config.vsh.yaml
+++ b/src/star/star_genome_generate/config.vsh.yaml
@@ -17,71 +17,68 @@ authors:
 argument_groups:
 - name: "Input"
   arguments: 
-  - name: "--genomeFastaFiles"
+  - name: "--genome_fasta_files"
     type: file
     description: |
       Path(s) to the fasta files with the genome sequences, separated by spaces. These files should be plain text FASTA files, they *cannot* be zipped.
     required: true
-    multiple: yes
-    multiple_sep: ;  
-  - name: "--sjdbGTFfile"
+    multiple: true
+  - name: "--sjdb_gtf_file"
     type: file
     description: Path to the GTF file with annotations
-  - name: --sjdbOverhang
+  - name: --sjdb_overhang
     type: integer
     description: Length of the donor/acceptor sequence on each side of the junctions, ideally = (mate_length - 1)
     example: 100
-  - name: --sjdbGTFchrPrefix
+  - name: --sjdb_gtf_chr_prefix
     type: string
     description: Prefix for chromosome names in a GTF file (e.g. 'chr' for using ENSMEBL annotations with UCSC genomes)
-  - name: --sjdbGTFfeatureExon
+  - name: --sjdb_gtf_feature_exon
     type: string
     description: Feature type in GTF file to be used as exons for building transcripts
     example: exon
-  - name: --sjdbGTFtagExonParentTranscript
+  - name: --sjdb_gtf_tag_exon_parent_transcript
     type: string
     description: GTF attribute name for parent transcript ID (default "transcript_id" works for GTF files)
     example: transcript_id
-  - name: --sjdbGTFtagExonParentGene
+  - name: --sjdb_gtf_tag_exon_parent_gene
     type: string
     description: GTF attribute name for parent gene ID (default "gene_id" works for GTF files)
     example: gene_id
-  - name: --sjdbGTFtagExonParentGeneName
+  - name: --sjdb_gtf_tag_exon_parent_gene_name
     type: string
     description: GTF attribute name for parent gene name
     example: gene_name
-    multiple: yes
-    multiple_sep: ;
-  - name: --sjdbGTFtagExonParentGeneType
+    multiple: true
+  - name: --sjdb_gtf_tag_exon_parent_gene_type
     type: string
     description: GTF attribute name for parent gene type
     example:
     - gene_type
     - gene_biotype
-    multiple: yes
-    multiple_sep: ;
-  - name: --limitGenomeGenerateRAM
+    multiple: true
+  - name: --limit_genome_generate_ram
     type: long
     description: Maximum available RAM (bytes) for genome generation
-    example: '31000000000'
-  - name: --genomeSAindexNbases
+    example: 31000000000
+  - name: --genome_sa_index_nbases
     type: integer
     description: Length (bases) of the SA pre-indexing string. Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches. For small genomes, this parameter must be scaled down to min(14, log2(GenomeLength)/2 - 1).
     example: 14
-  - name: --genomeChrBinNbits
+  - name: --genome_chr_bin_nbits
     type: integer
     description: Defined as log2(chrBin), where chrBin is the size of the bins for genome storage. Each chromosome will occupy an integer number of bins. For a genome with large number of contigs, it is recommended to scale this parameter as min(18, log2[max(GenomeLength/NumberOfReferences,ReadLength)]).
     example: 18
-  - name: --genomeSAsparseD
+  - name: --genome_sa_sparse_d
     type: integer
     min: 0
     example: 1
     description: Suffux array sparsity, i.e. distance between indices. Use bigger numbers to decrease needed RAM at the cost of mapping speed reduction.
-  - name: --genomeSuffixLengthMax
+  - name: --genome_suffix_length_max
     type: integer
     description: Maximum length of the suffixes, has to be longer than read length. Use -1 for infinite length.
     example: -1
-  - name: --genomeTransformType   
+  - name: --genome_transform_type   
     type: string
     description: |
       Type of genome transformation
@@ -89,7 +86,7 @@ argument_groups:
         Haploid    ... replace reference alleles with alternative alleles from VCF file (e.g. consensus allele)
         Diploid    ... create two haplotypes for each chromosome listed in VCF file, for genotypes 1|2, assumes perfect phasing (e.g. personal genome)
     example: None
-  - name: --genomeTransformVCF
+  - name: --genome_transform_vcf
     type: file
     description: path to VCF file for genome transformation
   
diff --git a/src/star/star_genome_generate/script.sh b/src/star/star_genome_generate/script.sh
index cb3b906c..fc232672 100644
--- a/src/star/star_genome_generate/script.sh
+++ b/src/star/star_genome_generate/script.sh
@@ -10,20 +10,20 @@ mkdir -p $par_index
 STAR \
     --runMode genomeGenerate \
     --genomeDir $par_index \
-    --genomeFastaFiles $par_genomeFastaFiles \
+    --genomeFastaFiles $par_genome_fasta_files \
     ${meta_cpus:+--runThreadN "${meta_cpus}"} \
-    ${par_sjdbGTFfile:+--sjdbGTFfile "${par_sjdbGTFfile}"} \
+    ${par_sjdb_gtf_file:+--sjdbGTFfile "${par_sjdb_gtf_file}"} \
     ${par_sjdbOverhang:+--sjdbOverhang "${par_sjdbOverhang}"} \
-    ${par_genomeSAindexNbases:+--genomeSAindexNbases "${par_genomeSAindexNbases}"} \
-    ${par_sjdbGTFchrPrefix:+--sjdbGTFchrPrefix "${par_sjdbGTFchrPrefix}"} \
-    ${par_sjdbGTFfeatureExon:+--sjdbGTFfeatureExon "${par_sjdbGTFfeatureExon}"} \
-    ${par_sjdbGTFtagExonParentTranscript:+--sjdbGTFtagExonParentTranscript "${par_sjdbGTFtagExonParentTranscript}"} \
-    ${par_sjdbGTFtagExonParentGene:+--sjdbGTFtagExonParentGene "${par_sjdbGTFtagExonParentGene}"} \
-    ${par_sjdbGTFtagExonParentGeneName:+--sjdbGTFtagExonParentGeneName "${par_sjdbGTFtagExonParentGeneName}"} \
-    ${par_sjdbGTFtagExonParentGeneType:+--sjdbGTFtagExonParentGeneType "${sjdbGTFtagExonParentGeneType}"} \
-    ${par_limitGenomeGenerateRAM:+--limitGenomeGenerateRAM "${par_limitGenomeGenerateRAM}"} \
-    ${par_genomeChrBinNbits:+--genomeChrBinNbits "${par_genomeChrBinNbits}"} \
-    ${par_genomeSAsparseD:+--genomeSAsparseD "${par_genomeSAsparseD}"} \
-    ${par_genomeSuffixLengthMax:+--genomeSuffixLengthMax "${par_genomeSuffixLengthMax}"} \
-    ${par_genomeTransformType:+--genomeTransformType "${par_genomeTransformType}"} \
-    ${par_genomeTransformVCF:+--genomeTransformVCF "${par_genomeTransformVCF}"} \
+    ${par_genome_sa_index_nbases:+--genomeSAindexNbases "${par_genome_sa_index_nbases}"} \
+    ${par_sjdb_gtf_chr_prefix:+--sjdbGTFchrPrefix "${par_sjdb_gtf_chr_prefix}"} \
+    ${par_sjdb_gtf_feature_exon:+--sjdbGTFfeatureExon "${par_sjdb_gtf_feature_exon}"} \
+    ${par_sjdb_gtf_tag_exon_parent_transcript:+--sjdbGTFtag_exon_parent_transcript "${par_sjdb_gtf_tag_exon_parent_transcript}"} \
+    ${par_sjdb_gtf_tag_exon_parent_gene:+--sjdbGTFtag_exon_parent_gene "${par_sjdb_gtf_tag_exon_parent_gene}"} \
+    ${par_sjdb_gtf_tag_exon_parent_geneName:+--sjdbGTFtag_exon_parent_geneName "${par_sjdb_gtf_tag_exon_parent_geneName}"} \
+    ${par_sjdb_gtf_tag_exon_parent_geneType:+--sjdbGTFtag_exon_parent_geneType "${sjdbGTFtag_exon_parent_geneType}"} \
+    ${par_limit_genome_generate_ram:+--limitGenomeGenerateRAM "${par_limit_genome_generate_ram}"} \
+    ${par_genome_chr_bin_nbits:+--genomeChrBinNbits "${par_genome_chr_bin_nbits}"} \
+    ${par_genome_sa_sparse_d:+--genomeSAsparseD "${par_genome_sa_sparse_d}"} \
+    ${par_genome_suffix_length_max:+--genomeSuffixLengthMax "${par_genome_suffix_length_max}"} \
+    ${par_genome_transform_type:+--genomeTransformType "${par_genome_transform_type}"} \
+    ${par_genome_transform_vcf:+--genomeTransformVCF "${par_genome_transform_vCF}"} \
diff --git a/src/star/star_genome_generate/test.sh b/src/star/star_genome_generate/test.sh
index fd0e4775..681f3494 100644
--- a/src/star/star_genome_generate/test.sh
+++ b/src/star/star_genome_generate/test.sh
@@ -27,9 +27,9 @@ echo "> Generate index"
 "$meta_executable" \
   ${meta_cpus:+---cpus $meta_cpus} \
   --index "star_index/" \
-  --genomeFastaFiles "genome.fasta" \
-  --sjdbGTFfile "genes.gtf" \
-  --genomeSAindexNbases 2
+  --genome_fasta_files "genome.fasta" \
+  --sjdb_gtf_file "genes.gtf" \
+  --genome_sa_index_nbases 4 
 
 files=("Genome" "Log.out" "SA" "SAindex" "chrLength.txt" "chrName.txt" "chrNameLength.txt" "chrStart.txt" "exonGeTrInfo.tab" "exonInfo.tab" "geneInfo.tab" "genomeParameters.txt" "sjdbInfo.txt" "sjdbList.fromGTF.out.tab" "sjdbList.out.tab" "transcriptInfo.tab")
 

From 8f525f5e40301ad51bc1cd9587c0febbef84bd7d Mon Sep 17 00:00:00 2001
From: Theodoro Gasperin Terra Camargo
 <98555209+tgaspe@users.noreply.github.com>
Date: Wed, 31 Jul 2024 16:20:11 -0300
Subject: [PATCH 4/9] Bedtools_Intersect (#94)

* Initial Commit

* Update config.vsh.yaml

* creating templates

* Update config.vsh.yaml

* Update script.sh

* Added output

* Update config.vsh.yaml

* Update test.sh

* Update test.sh

* More tests

* small changes

* update

- change some var names
- debugged
- added more test

* Update CHANGELOG.md

* Update

* Update help.txt
---
 CHANGELOG.md                                  |   3 +
 .../bedtools_intersect/config.vsh.yaml        | 255 +++++++++++++
 src/bedtools/bedtools_intersect/help.txt      | 119 ++++++
 src/bedtools/bedtools_intersect/script.sh     |  61 ++++
 src/bedtools/bedtools_intersect/test.sh       | 340 ++++++++++++++++++
 5 files changed, 778 insertions(+)
 create mode 100644 src/bedtools/bedtools_intersect/config.vsh.yaml
 create mode 100644 src/bedtools/bedtools_intersect/help.txt
 create mode 100644 src/bedtools/bedtools_intersect/script.sh
 create mode 100644 src/bedtools/bedtools_intersect/test.sh

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c4575cb9..36681056 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,6 +21,9 @@
 
 * `agat/agat_convert_sp_gff2gtf`: convert any GTF/GFF file into a proper GTF file (PR #76).
 
+* `bedtools`:
+    - `bedtools/bedtools_intersect`: Allows one to screen for overlaps between two sets of genomic features (PR #94).
+
 ## MINOR CHANGES
 
 * `busco` components: update BUSCO to `5.7.1` (PR #72).
diff --git a/src/bedtools/bedtools_intersect/config.vsh.yaml b/src/bedtools/bedtools_intersect/config.vsh.yaml
new file mode 100644
index 00000000..73dc0047
--- /dev/null
+++ b/src/bedtools/bedtools_intersect/config.vsh.yaml
@@ -0,0 +1,255 @@
+name: bedtools_intersect
+namespace: bedtools
+description: |
+  bedtools intersect allows one to screen for overlaps between two sets of genomic features. 
+  Moreover, it allows one to have fine control as to how the intersections are reported. 
+  bedtools intersect works with both BED/GFF/VCF and BAM files as input.
+keywords: [feature intersection, BAM, BED, GFF, VCF]
+links:
+  documentation: https://bedtools.readthedocs.io/en/latest/content/tools/intersect.html
+  repository: https://github.com/arq5x/bedtools2
+references:
+  doi: 10.1093/bioinformatics/btq033
+license: GPL-2.0, MIT
+requirements:
+  commands: [bedtools]
+authors:
+  - __merge__: /src/_authors/theodoro_gasperin.yaml
+    roles: [ author, maintainer ]
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: --input_a
+        alternatives: -a
+        type: file
+        direction: input
+        description: |
+          The input file (BED/GFF/VCF/BAM) to be used as the -a file.
+        required: true
+        example: input_a.bed
+        
+      - name: --input_b
+        alternatives: -b
+        type: file
+        direction: input
+        multiple: true
+        description: | 
+          The input file(s) (BED/GFF/VCF/BAM) to be used as the -b file(s).
+        required: true
+        example: input_b.bed
+    
+  - name: Outputs
+    arguments:
+      - name: --output
+        type: file
+        direction: output
+        description: | 
+          The output BED file. 
+        required: true
+        example: output.bed
+  
+  - name: Options
+    arguments:
+      - name: --write_a
+        alternatives: -wa
+        type: boolean_true
+        description: Write the original A entry for each overlap.      
+      
+      - name: --write_b
+        alternatives: -wb
+        type: boolean_true
+        description: | 
+          Write the original B entry for each overlap. 
+          Useful for knowing _what_ A overlaps. Restricted by -f and -r.
+      
+      - name: --left_outer_join
+        alternatives: -loj
+        type: boolean_true
+        description: |
+          Perform a "left outer join". That is, for each feature in A report each overlap with B. 
+          If no overlaps are found, report a NULL feature for B.
+      
+      - name: --write_overlap
+        alternatives: -wo
+        type: boolean_true
+        description: | 
+          Write the original A and B entries plus the number of base pairs of overlap between the two features.
+          - Overlaps restricted by -f and -r. 
+            Only A features with overlap are reported.
+      
+      - name: --write_overlap_plus
+        alternatives: -wao
+        type: boolean_true
+        description: | 
+          Write the original A and B entries plus the number of base pairs of overlap between the two features.
+          - Overlaps restricted by -f and -r. 
+            However, A features w/o overlap are also reported with a NULL B feature and overlap = 0.
+
+      - name: --report_A_if_no_overlap
+        alternatives: -u
+        type: boolean_true
+        description: | 
+          Write the original A entry _if_ no overlap is found. 
+          - In other words, just report the fact >=1 hit was found.
+          - Overlaps restricted by -f and -r. 
+      
+      - name: --number_of_overlaps_A
+        alternatives: -c
+        type: boolean_true
+        description: | 
+          For each entry in A, report the number of overlaps with B.
+          - Reports 0 for A entries that have no overlap with B.
+          - Overlaps restricted by -f and -r.
+
+      - name: --report_no_overlaps_A
+        alternatives: -v
+        type: boolean_true
+        description: | 
+          Only report those entries in A that have _no overlaps_ with B.
+          - Similar to "grep -v" (an homage).
+      
+      - name: --uncompressed_bam
+        alternatives: -ubam
+        type: boolean_true
+        description: Write uncompressed BAM output. Default writes compressed BAM.
+
+      - name: --same_strand
+        alternatives: -s
+        type: boolean_true
+        description: | 
+          Require same strandedness.  That is, only report hits in B.
+          that overlap A on the _same_ strand.
+          - By default, overlaps are reported without respect to strand.
+
+      - name: --opposite_strand
+        alternatives: -S
+        type: boolean_true
+        description: | 
+          Require different strandedness.  That is, only report hits in B
+          that overlap A on the _opposite_ strand.
+          - By default, overlaps are reported without respect to strand.
+
+      - name: --min_overlap_A
+        alternatives: -f
+        type: double
+        description: | 
+          Minimum overlap required as a fraction of A.
+          - Default is 1E-9 (i.e., 1bp).
+          - FLOAT (e.g. 0.50)
+        example: 0.50
+
+      - name: --min_overlap_B
+        alternatives: -F
+        type: double
+        description: | 
+          Minimum overlap required as a fraction of B.
+          - Default is 1E-9 (i.e., 1bp).
+          - FLOAT (e.g. 0.50)
+        example: 0.50
+
+      - name: --reciprocal_overlap
+        alternatives: -r
+        type: boolean_true
+        description: | 
+          Require that the fraction overlap be reciprocal for A AND B.
+          - In other words, if -f is 0.90 and -r is used, this requires
+          that B overlap 90% of A and A _also_ overlaps 90% of B.
+      
+      - name: --either_overlap
+        alternatives: -e
+        type: boolean_true
+        description: | 
+          Require that the minimum fraction be satisfied for A OR B.
+          - In other words, if -e is used with -f 0.90 and -F 0.10 this requires
+          that either 90% of A is covered OR 10% of B is covered.
+          Without -e, both fractions would have to be satisfied.
+
+      - name: --split
+        type: boolean_true
+        description: Treat "split" BAM or BED12 entries as distinct BED intervals.
+
+      - name: --genome
+        alternatives: -g
+        type: file
+        description: | 
+          Provide a genome file to enforce consistent chromosome 
+          sort order across input files. Only applies when used 
+          with -sorted option.
+        example: genome.txt
+
+      - name: --nonamecheck
+        type: boolean_true
+        description: | 
+          For sorted data, don't throw an error if the file 
+          has different naming conventions for the same chromosome 
+          (e.g., "chr1" vs "chr01").
+
+      - name: --sorted
+        type: boolean_true
+        description: | 
+          Use the "chromsweep" algorithm for sorted (-k1,1 -k2,2n) input.
+
+      - name: --names
+        type: string
+        description: | 
+          When using multiple databases, provide an alias 
+          for each that will appear instead of a fileId when 
+          also printing the DB record.
+
+      - name: --filenames
+        type: boolean_true
+        description: When using multiple databases, show each complete filename instead of a fileId when also printing the DB record.
+
+      - name: --sortout
+        type: boolean_true
+        description: When using multiple databases, sort the output DB hits for each record.
+
+      - name: --bed
+        type: boolean_true
+        description: If using BAM input, write output as BED.
+
+      - name: --header
+        type: boolean_true
+        description: Print the header from the A file prior to results.
+
+      - name: --no_buffer_output
+        alternatives: --nobuf
+        type: boolean_true
+        description: | 
+          Disable buffered output. Using this option will cause each line
+          of output to be printed as it is generated, rather than saved
+          in a buffer. This will make printing large output files 
+          noticeably slower, but can be useful in conjunction with
+          other software tools and scripts that need to process one
+          line of bedtools output at a time.
+
+      - name: --io_buffer_size
+        alternatives: --iobuf
+        type: integer
+        description: | 
+          Specify amount of memory to use for input buffer.
+          Takes an integer argument. Optional suffixes K/M/G supported.
+          Note: currently has no effect with compressed files.       
+
+resources:
+  - type: bash_script
+    path: script.sh
+
+test_resources:
+  - type: bash_script
+    path: test.sh
+
+engines:
+  - type: docker
+    image: debian:stable-slim
+    setup:
+      - type: apt
+        packages: [bedtools, procps]
+      - type: docker
+        run: |
+          echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt
+
+runners:
+  - type: executable
+  - type: nextflow
diff --git a/src/bedtools/bedtools_intersect/help.txt b/src/bedtools/bedtools_intersect/help.txt
new file mode 100644
index 00000000..d1bbdc20
--- /dev/null
+++ b/src/bedtools/bedtools_intersect/help.txt
@@ -0,0 +1,119 @@
+```bash
+bedtools intersect
+```
+
+Tool:    bedtools intersect (aka intersectBed)
+Version: v2.30.0
+Summary: Report overlaps between two feature files.
+
+Usage:   bedtools intersect [OPTIONS] -a <bed/gff/vcf/bam> -b <bed/gff/vcf/bam>
+
+	Note: -b may be followed with multiple databases and/or 
+	wildcard (*) character(s). 
+Options: 
+	-wa	Write the original entry in A for each overlap.
+
+	-wb	Write the original entry in B for each overlap.
+		- Useful for knowing _what_ A overlaps. Restricted by -f and -r.
+
+	-loj	Perform a "left outer join". That is, for each feature in A
+		report each overlap with B.  If no overlaps are found, 
+		report a NULL feature for B.
+
+	-wo	Write the original A and B entries plus the number of base
+		pairs of overlap between the two features.
+		- Overlaps restricted by -f and -r.
+		  Only A features with overlap are reported.
+
+	-wao	Write the original A and B entries plus the number of base
+		pairs of overlap between the two features.
+		- Overlapping features restricted by -f and -r.
+		  However, A features w/o overlap are also reported
+		  with a NULL B feature and overlap = 0.
+
+	-u	Write the original A entry _once_ if _any_ overlaps found in B.
+		- In other words, just report the fact >=1 hit was found.
+		- Overlaps restricted by -f and -r.
+
+	-c	For each entry in A, report the number of overlaps with B.
+		- Reports 0 for A entries that have no overlap with B.
+		- Overlaps restricted by -f, -F, -r, and -s.
+
+	-C	For each entry in A, separately report the number of
+		- overlaps with each B file on a distinct line.
+		- Reports 0 for A entries that have no overlap with B.
+		- Overlaps restricted by -f, -F, -r, and -s.
+
+	-v	Only report those entries in A that have _no overlaps_ with B.
+		- Similar to "grep -v" (an homage).
+
+	-ubam	Write uncompressed BAM output. Default writes compressed BAM.
+
+	-s	Require same strandedness.  That is, only report hits in B
+		that overlap A on the _same_ strand.
+		- By default, overlaps are reported without respect to strand.
+
+	-S	Require different strandedness.  That is, only report hits in B
+		that overlap A on the _opposite_ strand.
+		- By default, overlaps are reported without respect to strand.
+
+	-f	Minimum overlap required as a fraction of A.
+		- Default is 1E-9 (i.e., 1bp).
+		- FLOAT (e.g. 0.50)
+
+	-F	Minimum overlap required as a fraction of B.
+		- Default is 1E-9 (i.e., 1bp).
+		- FLOAT (e.g. 0.50)
+
+	-r	Require that the fraction overlap be reciprocal for A AND B.
+		- In other words, if -f is 0.90 and -r is used, this requires
+		  that B overlap 90% of A and A _also_ overlaps 90% of B.
+
+	-e	Require that the minimum fraction be satisfied for A OR B.
+		- In other words, if -e is used with -f 0.90 and -F 0.10 this requires
+		  that either 90% of A is covered OR 10% of  B is covered.
+		  Without -e, both fractions would have to be satisfied.
+
+	-split	Treat "split" BAM or BED12 entries as distinct BED intervals.
+
+	-g	Provide a genome file to enforce consistent chromosome sort order
+		across input files. Only applies when used with -sorted option.
+
+	-nonamecheck	For sorted data, don't throw an error if the file has different naming conventions
+			for the same chromosome. ex. "chr1" vs "chr01".
+
+	-sorted	Use the "chromsweep" algorithm for sorted (-k1,1 -k2,2n) input.
+
+	-names	When using multiple databases, provide an alias for each that
+		will appear instead of a fileId when also printing the DB record.
+
+	-filenames	When using multiple databases, show each complete filename
+			instead of a fileId when also printing the DB record.
+
+	-sortout	When using multiple databases, sort the output DB hits
+			for each record.
+
+	-bed	If using BAM input, write output as BED.
+
+	-header	Print the header from the A file prior to results.
+
+	-nobuf	Disable buffered output. Using this option will cause each line
+		of output to be printed as it is generated, rather than saved
+		in a buffer. This will make printing large output files 
+		noticeably slower, but can be useful in conjunction with
+		other software tools and scripts that need to process one
+		line of bedtools output at a time.
+
+	-iobuf	Specify amount of memory to use for input buffer.
+		Takes an integer argument. Optional suffixes K/M/G supported.
+		Note: currently has no effect with compressed files.
+
+Notes: 
+	(1) When a BAM file is used for the A file, the alignment is retained if overlaps exist,
+	and excluded if an overlap cannot be found.  If multiple overlaps exist, they are not
+	reported, as we are only testing for one or more overlaps.
+
+
+
+
+***** ERROR: No input file given. Exiting. *****
diff --git a/src/bedtools/bedtools_intersect/script.sh b/src/bedtools/bedtools_intersect/script.sh
new file mode 100644
index 00000000..2141859d
--- /dev/null
+++ b/src/bedtools/bedtools_intersect/script.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+
+[[ "$par_write_a" == "false" ]] && unset par_write_a
+[[ "$par_write_b" == "false" ]] && unset par_write_b
+[[ "$par_left_outer_join" == "false" ]] && unset par_left_outer_join
+[[ "$par_write_overlap" == "false" ]] && unset par_write_overlap
+[[ "$par_write_overlap_plus" == "false" ]] && unset par_write_overlap_plus
+[[ "$par_report_A_if_no_overlap" == "false" ]] && unset par_report_A_if_no_overlap
+[[ "$par_number_of_overlaps_A" == "false" ]] && unset par_number_of_overlaps_A
+[[ "$par_report_no_overlaps_A" == "false" ]] && unset par_report_no_overlaps_A
+[[ "$par_uncompressed_bam" == "false" ]] && unset par_uncompressed_bam
+[[ "$par_same_strand" == "false" ]] && unset par_same_strand
+[[ "$par_opposite_strand" == "false" ]] && unset par_opposite_strand
+[[ "$par_reciprocal_overlap" == "false" ]] && unset par_reciprocal_overlap
+[[ "$par_either_overlap" == "false" ]] && unset par_either_overlap
+[[ "$par_split" == "false" ]] && unset par_split
+[[ "$par_nonamecheck" == "false" ]] && unset par_nonamecheck
+[[ "$par_sorted" == "false" ]] && unset par_sorted
+[[ "$par_filenames" == "false" ]] && unset par_filenames
+[[ "$par_sortout" == "false" ]] && unset par_sortout
+[[ "$par_bed" == "false" ]] && unset par_bed
+[[ "$par_header" == "false" ]] && unset par_header
+[[ "$par_no_buffer_output" == "false" ]] && unset par_no_buffer_output
+
+# Create input array 
+IFS=";" read -ra input <<< $par_input_b
+
+bedtools intersect \
+    ${par_write_a:+-wa} \
+    ${par_write_b:+-wb} \
+    ${par_left_outer_join:+-loj} \
+    ${par_write_overlap:+-wo} \
+    ${par_write_overlap_plus:+-wao} \
+    ${par_report_A_if_no_overlap:+-u} \
+    ${par_number_of_overlaps_A:+-c} \
+    ${par_report_no_overlaps_A:+-v} \
+    ${par_uncompressed_bam:+-ubam} \
+    ${par_same_strand:+-s} \
+    ${par_opposite_strand:+-S} \
+    ${par_min_overlap_A:+-f "$par_min_overlap_A"} \
+    ${par_min_overlap_B:+-F "$par_min_overlap_B"} \
+    ${par_reciprocal_overlap:+-r} \
+    ${par_either_overlap:+-e} \
+    ${par_split:+-split} \
+    ${par_genome:+-g "$par_genome"} \
+    ${par_nonamecheck:+-nonamecheck} \
+    ${par_sorted:+-sorted} \
+    ${par_names:+-names "$par_names"} \
+    ${par_filenames:+-filenames} \
+    ${par_sortout:+-sortout} \
+    ${par_bed:+-bed} \
+    ${par_header:+-header} \
+    ${par_no_buffer_output:+-nobuf} \
+    ${par_io_buffer_size:+-iobuf "$par_io_buffer_size"} \
+    -a "$par_input_a" \
+    ${par_input_b:+ -b ${input[*]}} \
+    > "$par_output"
+    
\ No newline at end of file
diff --git a/src/bedtools/bedtools_intersect/test.sh b/src/bedtools/bedtools_intersect/test.sh
new file mode 100644
index 00000000..b9405a59
--- /dev/null
+++ b/src/bedtools/bedtools_intersect/test.sh
@@ -0,0 +1,340 @@
+#!/bin/bash
+
+# exit on error
+set -e
+
+## VIASH START
+meta_executable="target/executable/bedtools/bedtools_intersect/bedtools_intersect"
+meta_resources_dir="src/bedtools/bedtools_intersect"
+## VIASH END
+
+#############################################
+# helper functions
+assert_file_exists() {
+  [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; }
+}
+assert_file_not_empty() {
+  [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; }
+}
+assert_file_contains() {
+  grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; }
+}
+assert_identical_content() {
+  diff -a "$2" "$1" \
+    || (echo "Files are not identical!" && exit 1)
+}
+#############################################
+
+# Create directories for tests
+echo "Creating Test Data..."
+mkdir -p test_data
+
+# Create and populate featuresA.bed
+printf "chr1\t100\t200\nchr1\t150\t250\nchr1\t300\t400\n" > "test_data/featuresA.bed"
+
+# Create and populate featuresB.bed
+printf "chr1\t180\t280\nchr1\t290\t390\nchr1\t500\t600\n" > "test_data/featuresB.bed"
+
+# Create and populate featuresC.bed
+printf "chr1\t120\t220\nchr1\t250\t350\nchr1\t500\t580\n" > "test_data/featuresC.bed"
+
+# Create and populate examples gff files
+# example1.gff
+printf "##gff-version 3\n" > "test_data/example1.gff"
+printf "chr1\t.\tgene\t1000\t2000\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "test_data/example1.gff"
+printf "chr1\t.\tmRNA\t1000\t2000\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "test_data/example1.gff"
+printf "chr1\t.\texon\t1000\t1200\t.\t+\t.\tID=exon1;Parent=transcript1\n" >> "test_data/example1.gff"
+printf "chr1\t.\texon\t1500\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "test_data/example1.gff"
+printf "chr1\t.\tCDS\t1000\t1200\t.\t+\t0\tID=cds1;Parent=transcript1\n" >> "test_data/example1.gff"
+printf "chr1\t.\tCDS\t1500\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "test_data/example1.gff"
+# example2.gff
+printf "##gff-version 3\n" > "test_data/example2.gff"
+printf "chr1\t.\tgene\t1200\t1800\t.\t-\t.\tID=gene2;Name=Gene2\n" >> "test_data/example2.gff"
+printf "chr1\t.\tmRNA\t1400\t2000\t.\t-\t.\tID=transcript2;Parent=gene2\n" >> "test_data/example2.gff"
+printf "chr1\t.\texon\t1400\t2000\t.\t-\t.\tID=exon3;Parent=transcript2\n" >> "test_data/example2.gff"
+printf "chr1\t.\texon\t1600\t2000\t.\t-\t.\tID=exon4;Parent=transcript2\n" >> "test_data/example2.gff"
+printf "chr1\t.\tCDS\t3000\t3200\t.\t-\t1\tID=cds3;Parent=transcript2\n" >> "test_data/example2.gff"
+printf "chr1\t.\tCDS\t3500\t3700\t.\t-\t0\tID=cds4;Parent=transcript2\n" >> "test_data/example2.gff"
+
+# Create and populate expected output files for different tests
+printf "chr1\t180\t200\nchr1\t180\t250\nchr1\t300\t390\n" > "test_data/expected_default.bed"
+printf "chr1\t100\t200\nchr1\t150\t250\nchr1\t300\t400\n" > "test_data/expected_wa.bed"
+printf "chr1\t180\t200\tchr1\t180\t280\nchr1\t180\t250\tchr1\t180\t280\nchr1\t300\t390\tchr1\t290\t390\n" > "test_data/expected_wb.bed"
+printf "chr1\t100\t200\tchr1\t180\t280\nchr1\t150\t250\tchr1\t180\t280\nchr1\t300\t400\tchr1\t290\t390\n" > "test_data/expected_loj.bed"
+printf "chr1\t100\t200\tchr1\t180\t280\t20\nchr1\t150\t250\tchr1\t180\t280\t70\nchr1\t300\t400\tchr1\t290\t390\t90\n" > "test_data/expected_wo.bed"
+printf "chr1\t100\t200\nchr1\t150\t250\nchr1\t300\t400\n" > "test_data/expected_u.bed"
+printf "chr1\t100\t200\t1\nchr1\t150\t250\t1\nchr1\t300\t400\t1\n" > "test_data/expected_c.bed"
+printf "chr1\t180\t250\nchr1\t300\t390\n" > "test_data/expected_f50.bed"
+printf "chr1\t180\t250\nchr1\t300\t390\n" > "test_data/expected_f30.bed"
+printf "chr1\t180\t200\nchr1\t180\t250\nchr1\t300\t390\n" > "test_data/expected_f10.bed"
+printf "chr1\t180\t200\nchr1\t180\t250\nchr1\t300\t390\n" > "test_data/expected_r.bed"
+printf "chr1\t180\t200\nchr1\t120\t200\nchr1\t180\t250\nchr1\t150\t220\nchr1\t300\t390\nchr1\t300\t350\n" > "test_data/expected_multiple.bed"
+# expected gff file
+printf "chr1\t.\tgene\t1200\t1800\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "test_data/expected.gff"
+printf "chr1\t.\tgene\t1400\t2000\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "test_data/expected.gff"
+printf "chr1\t.\tgene\t1400\t2000\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "test_data/expected.gff"
+printf "chr1\t.\tgene\t1600\t2000\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "test_data/expected.gff"
+printf "chr1\t.\tmRNA\t1200\t1800\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "test_data/expected.gff"
+printf "chr1\t.\tmRNA\t1400\t2000\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "test_data/expected.gff"
+printf "chr1\t.\tmRNA\t1400\t2000\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "test_data/expected.gff"
+printf "chr1\t.\tmRNA\t1600\t2000\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "test_data/expected.gff"
+printf "chr1\t.\texon\t1200\t1200\t.\t+\t.\tID=exon1;Parent=transcript1\n" >> "test_data/expected.gff"
+printf "chr1\t.\texon\t1500\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "test_data/expected.gff"
+printf "chr1\t.\texon\t1500\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "test_data/expected.gff"
+printf "chr1\t.\texon\t1500\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "test_data/expected.gff"
+printf "chr1\t.\texon\t1600\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "test_data/expected.gff"
+printf "chr1\t.\tCDS\t1200\t1200\t.\t+\t0\tID=cds1;Parent=transcript1\n" >> "test_data/expected.gff"
+printf "chr1\t.\tCDS\t1500\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "test_data/expected.gff"
+printf "chr1\t.\tCDS\t1500\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "test_data/expected.gff"
+printf "chr1\t.\tCDS\t1500\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "test_data/expected.gff"
+printf "chr1\t.\tCDS\t1600\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "test_data/expected.gff"
+
+# Test 1: Default intersect
+mkdir test1
+cd test1
+
+echo "> Run bedtools_intersect on BED files with default intersect"
+"$meta_executable" \
+  --input_a "../test_data/featuresA.bed" \
+  --input_b "../test_data/featuresB.bed" \
+  --output "output.bed"
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_default.bed"
+echo "- test1 succeeded -"
+
+cd ..
+
+# Test 2: Write A option
+mkdir test2
+cd test2
+
+echo "> Run bedtools_intersect on BED files with -wa option"
+"$meta_executable" \
+  --input_a "../test_data/featuresA.bed" \
+  --input_b "../test_data/featuresB.bed" \
+  --output "output.bed" \
+  --write_a
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_wa.bed"
+echo "- test2 succeeded -"
+
+cd ..
+
+# Test 3: -wb option
+mkdir test3
+cd test3
+
+echo "> Run bedtools_intersect on BED files with -wb option"
+"$meta_executable" \
+  --input_a "../test_data/featuresA.bed" \
+  --input_b "../test_data/featuresB.bed" \
+  --output "output.bed" \
+  --write_b 
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_wb.bed"
+echo "- test3 succeeded -"
+
+cd ..
+
+# Test 4: -loj option
+mkdir test4
+cd test4
+
+echo "> Run bedtools_intersect on BED files with -loj option"
+"$meta_executable" \
+  --input_a "../test_data/featuresA.bed" \
+  --input_b "../test_data/featuresB.bed" \
+  --output "output.bed" \
+  --left_outer_join
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_loj.bed"
+echo "- test4 succeeded -"
+
+cd ..
+
+# Test 5: -wo option
+mkdir test5
+cd test5
+
+echo "> Run bedtools_intersect on BED files with -wo option"
+"$meta_executable" \
+  --input_a "../test_data/featuresA.bed" \
+  --input_b "../test_data/featuresB.bed" \
+  --output "output.bed" \
+  --write_overlap 
+  
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_wo.bed"
+echo "- test5 succeeded -"
+
+cd ..
+
+# Test 6: -u option
+mkdir test6
+cd test6
+
+echo "> Run bedtools_intersect on BED files with -u option"
+"$meta_executable" \
+  --input_a "../test_data/featuresA.bed" \
+  --input_b "../test_data/featuresB.bed" \
+  --output "output.bed" \
+  --report_A_if_no_overlap true
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_u.bed"
+echo "- test6 succeeded -"
+
+cd ..
+
+# Test 7: -c option
+mkdir test7
+cd test7
+
+echo "> Run bedtools_intersect on BED files with -c option"
+"$meta_executable" \
+  --input_a "../test_data/featuresA.bed" \
+  --input_b "../test_data/featuresB.bed" \
+  --output "output.bed" \
+  --number_of_overlaps_A true
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_c.bed"
+echo "- test7 succeeded -"
+
+cd ..
+
+# Test 8: -f 0.50 option
+mkdir test8
+cd test8
+
+echo "> Run bedtools_intersect on BED files with -f 0.50 option"
+"$meta_executable" \
+  --input_a "../test_data/featuresA.bed" \
+  --input_b "../test_data/featuresB.bed" \
+  --output "output.bed" \
+  --min_overlap_A 0.50
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_f50.bed"
+echo "- test8 succeeded -"
+
+cd ..
+
+# Test 9: -f 0.30 option
+mkdir test9
+cd test9
+
+echo "> Run bedtools_intersect on BED files with -f 0.30 option"
+"$meta_executable" \
+  --input_a "../test_data/featuresA.bed" \
+  --input_b "../test_data/featuresB.bed" \
+  --output "output.bed" \
+  --min_overlap_A 0.30
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_f30.bed"
+echo "- test9 succeeded -"
+
+cd ..
+
+# Test 10: -f 0.10 option
+mkdir test10
+cd test10
+
+echo "> Run bedtools_intersect on BED files with -f 0.10 option"
+"$meta_executable" \
+  --input_a "../test_data/featuresA.bed" \
+  --input_b "../test_data/featuresB.bed" \
+  --output "output.bed" \
+  --min_overlap_A 0.10
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_f10.bed"
+echo "- test10 succeeded -"
+
+cd ..
+
+# Test 11: -r option
+mkdir test11
+cd test11
+
+echo "> Run bedtools_intersect on BED files with -r option"
+"$meta_executable" \
+  --input_a "../test_data/featuresA.bed" \
+  --input_b "../test_data/featuresB.bed" \
+  --output "output.bed" \
+  --reciprocal_overlap true
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_r.bed"
+echo "- test11 succeeded -"
+
+cd ..
+
+
+# Test 12: Multiple files
+mkdir test12
+cd test12
+
+echo "> Run bedtools_intersect on multiple BED files"
+"$meta_executable" \
+  --input_a "../test_data/featuresA.bed" \
+  --input_b "../test_data/featuresB.bed" \
+  --input_b "../test_data/featuresC.bed" \
+  --output "output.bed"
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_multiple.bed"
+echo "- test12 succeeded -"
+
+cd ..
+
+# Test 13: VCF file format
+mkdir test13
+cd test13
+
+echo "> Run bedtools_intersect on GFF files"
+"$meta_executable" \
+  --input_a "../test_data/example1.gff" \
+  --input_b "../test_data/example2.gff" \
+  --output "output.bed"
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected.gff"
+echo "- test13 succeeded -"
+
+cd ..
+
+echo "---- All tests succeeded! ----"
+exit 0

From de8b4248b64e0d2e04a6f20c35212403c57a1058 Mon Sep 17 00:00:00 2001
From: Theodoro Gasperin Terra Camargo
 <98555209+tgaspe@users.noreply.github.com>
Date: Wed, 31 Jul 2024 16:21:15 -0300
Subject: [PATCH 5/9] Bedtools sort (#98)

* Initial Commmit

* config file

* Update config.vsh.yaml

* Update script.sh

* Update test.sh

* test files

* Update test.sh

* adding tests

* two more test

* more tests

* more tests

* Update CHANGELOG.md

* removing some files

* Update help.txt

---------

Co-authored-by: Robrecht Cannoodt <rcannood@gmail.com>
---
 CHANGELOG.md                               |   1 +
 src/bedtools/bedtools_sort/config.vsh.yaml |  93 ++++++++
 src/bedtools/bedtools_sort/help.txt        |  21 ++
 src/bedtools/bedtools_sort/script.sh       |  27 +++
 src/bedtools/bedtools_sort/test.sh         | 264 +++++++++++++++++++++
 5 files changed, 406 insertions(+)
 create mode 100644 src/bedtools/bedtools_sort/config.vsh.yaml
 create mode 100644 src/bedtools/bedtools_sort/help.txt
 create mode 100644 src/bedtools/bedtools_sort/script.sh
 create mode 100644 src/bedtools/bedtools_sort/test.sh

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 36681056..1debf12b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,6 +23,7 @@
 
 * `bedtools`:
     - `bedtools/bedtools_intersect`: Allows one to screen for overlaps between two sets of genomic features (PR #94).
+    - `bedtools/bedtools_sort`: Sorts a feature file (bed/gff/vcf) by chromosome and other criteria (PR #98).
 
 ## MINOR CHANGES
 
diff --git a/src/bedtools/bedtools_sort/config.vsh.yaml b/src/bedtools/bedtools_sort/config.vsh.yaml
new file mode 100644
index 00000000..5024bd39
--- /dev/null
+++ b/src/bedtools/bedtools_sort/config.vsh.yaml
@@ -0,0 +1,93 @@
+name: bedtools_sort
+namespace: bedtools
+description: Sorts a feature file (bed/gff/vcf) by chromosome and other criteria.
+keywords: [sort, BED, GFF, VCF]
+links:
+  documentation: https://bedtools.readthedocs.io/en/latest/content/tools/sort.html
+  repository: https://github.com/arq5x/bedtools2
+references:
+  doi: 10.1093/bioinformatics/btq033
+license: GPL-2.0, MIT
+requirements:
+  commands: [bedtools]
+authors:
+  - __merge__: /src/_authors/theodoro_gasperin.yaml
+    roles: [ author, maintainer ]
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: --input
+        alternatives: -i
+        type: file
+        description: Input file (bed/gff/vcf) to be sorted.
+        required: true
+    
+  - name: Outputs
+    arguments:
+      - name: --output
+        alternatives: -o
+        type: file
+        direction: output
+        description: Output sorted file (bed/gff/vcf) to be written.
+
+  - name: Options
+    arguments:
+      - name: --sizeA
+        type: boolean_true
+        description: Sort by feature size in ascending order.
+      
+      - name: --sizeD
+        type: boolean_true
+        description: Sort by feature size in descending order.
+
+      - name: --chrThenSizeA 
+        type: boolean_true
+        description: Sort by chrom (asc), then feature size (asc).
+      
+      - name: --chrThenSizeD
+        type: boolean_true
+        description: Sort by chrom (asc), then feature size (desc).
+
+      - name: --chrThenScoreA
+        type: boolean_true
+        description: Sort by chrom (asc), then score (asc).
+
+      - name: --chrThenScoreD
+        type: boolean_true
+        description: Sort by chrom (asc), then score (desc).
+
+      - name: --genome
+        alternatives: -g
+        type: file
+        description: Sort according to the chromosomes declared in "genome.txt"
+
+      - name: --faidx
+        type: file
+        description: Sort according to the chromosomes declared in "names.txt"
+
+      - name: --header
+        type: boolean_true
+        description: Print the header from the A file prior to results.
+      
+resources:
+  - type: bash_script
+    path: script.sh
+
+test_resources:
+  - type: bash_script
+    path: test.sh
+
+engines:
+  - type: docker
+    image: debian:stable-slim
+    setup:
+      - type: apt
+        packages: [bedtools, procps]
+      - type: docker
+        run: |
+          echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt
+
+runners:
+  - type: executable
+  - type: nextflow
\ No newline at end of file
diff --git a/src/bedtools/bedtools_sort/help.txt b/src/bedtools/bedtools_sort/help.txt
new file mode 100644
index 00000000..09266c69
--- /dev/null
+++ b/src/bedtools/bedtools_sort/help.txt
@@ -0,0 +1,21 @@
+```bash
+bedtools sort
+```
+
+Tool:    bedtools sort (aka sortBed)
+Version: v2.30.0
+Summary: Sorts a feature file in various and useful ways.
+
+Usage:   bedtools sort [OPTIONS] -i <bed/gff/vcf>
+
+Options: 
+	-sizeA			Sort by feature size in ascending order.
+	-sizeD			Sort by feature size in descending order.
+	-chrThenSizeA		Sort by chrom (asc), then feature size (asc).
+	-chrThenSizeD		Sort by chrom (asc), then feature size (desc).
+	-chrThenScoreA		Sort by chrom (asc), then score (asc).
+	-chrThenScoreD		Sort by chrom (asc), then score (desc).
+	-g (names.txt)	Sort according to the chromosomes declared in "genome.txt"
+	-faidx (names.txt)	Sort according to the chromosomes declared in "names.txt"
+	-header	Print the header from the A file prior to results.
+
diff --git a/src/bedtools/bedtools_sort/script.sh b/src/bedtools/bedtools_sort/script.sh
new file mode 100644
index 00000000..e7f712d7
--- /dev/null
+++ b/src/bedtools/bedtools_sort/script.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+
+# Unset parameters
+[[ "$par_sizeA" == "false" ]] && unset par_sizeA
+[[ "$par_sizeD" == "false" ]] && unset par_sizeD
+[[ "$par_chrThenSizeA" == "false" ]] && unset par_chrThenSizeA
+[[ "$par_chrThenSizeD" == "false" ]] && unset par_chrThenSizeD
+[[ "$par_chrThenScoreA" == "false" ]] && unset par_chrThenScoreA
+[[ "$par_chrThenScoreD" == "false" ]] && unset par_chrThenScoreD
+[[ "$par_header" == "false" ]] && unset par_header
+
+# Execute bedtools sort with the provided arguments
+bedtools sort \
+    ${par_sizeA:+-sizeA} \
+    ${par_sizeD:+-sizeD} \
+    ${par_chrThenSizeA:+-chrThenSizeA} \
+    ${par_chrThenSizeD:+-chrThenSizeD} \
+    ${par_chrThenScoreA:+-chrThenScoreA} \
+    ${par_chrThenScoreD:+-chrThenScoreD} \
+    ${par_genome:+-g "$par_genome"} \
+    ${par_faidx:+-faidx "$par_faidx"} \
+    ${par_header:+-header} \
+    -i "$par_input" \
+    > "$par_output"
diff --git a/src/bedtools/bedtools_sort/test.sh b/src/bedtools/bedtools_sort/test.sh
new file mode 100644
index 00000000..bf402c35
--- /dev/null
+++ b/src/bedtools/bedtools_sort/test.sh
@@ -0,0 +1,264 @@
+#!/bin/bash
+
+# exit on error
+set -e
+
+## VIASH START
+meta_executable="target/executable/bedtools/bedtools_sort/bedtools_sort"
+meta_resources_dir="src/bedtools/bedtools_sort"
+## VIASH END
+
+#############################################
+# helper functions
+assert_file_exists() {
+  [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; }
+}
+assert_file_not_empty() {
+  [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; }
+}
+assert_file_contains() {
+  grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; }
+}
+assert_identical_content() {
+  diff -a "$2" "$1" \
+    || (echo "Files are not identical!" && exit 1)
+}
+#############################################
+
+# Create directories for tests
+echo "Creating Test Data..."
+mkdir -p test_data
+
+# Create and populate example files
+printf "#Header\nchr1\t300\t400\nchr1\t150\t250\nchr1\t100\t200" > "test_data/featureA.bed"
+printf "chr2\t290\t400\nchr2\t180\t220\nchr1\t500\t600" > "test_data/featureB.bed"
+printf "chr1\t100\t200\tfeature1\t960\nchr1\t150\t250\tfeature2\t850\nchr1\t300\t400\tfeature3\t740\nchr2\t290\t390\tfeature4\t630\nchr2\t180\t280\tfeature5\t920\nchr3\t120\t220\tfeature6\t410\n" > "test_data/featureC.bed"
+printf "chr1\nchr3\nchr2\n" > "test_data/genome.txt"
+printf "chr1\t248956422\nchr3\t242193529\nchr2\t198295559\n" > "test_data/genome.fai"
+
+# Create and populate example.gff file
+printf "##gff-version 3\n" > "test_data/example.gff"
+printf "chr1\t.\tgene\t1000\t2000\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "test_data/example.gff"
+printf "chr3\t.\tmRNA\t1000\t2000\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "test_data/example.gff"
+printf "chr1\t.\texon\t1000\t1200\t.\t+\t.\tID=exon1;Parent=transcript1\n" >> "test_data/example.gff"
+printf "chr2\t.\texon\t1500\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "test_data/example.gff"
+printf "chr1\t.\tCDS\t1000\t1200\t.\t+\t0\tID=cds1;Parent=transcript1\n" >> "test_data/example.gff"
+printf "chr1\t.\tCDS\t1500\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "test_data/example.gff"
+
+# Create expected output files
+printf "chr1\t100\t200\nchr1\t150\t250\nchr1\t300\t400\n" > "test_data/expected_sorted_A.bed"
+printf "chr2\t180\t220\nchr1\t500\t600\nchr2\t290\t400\n" > "test_data/expected_sizeA.bed"
+printf "chr2\t290\t400\nchr1\t500\t600\nchr2\t180\t220\n" > "test_data/expected_sizeD.bed"
+printf "chr1\t500\t600\nchr2\t180\t220\nchr2\t290\t400\n" > "test_data/expected_chrThenSizeA.bed"
+printf "chr1\t500\t600\nchr2\t290\t400\nchr2\t180\t220\n" > "test_data/expected_chrThenSizeD.bed"
+printf "chr1\t300\t400\tfeature3\t740\nchr1\t150\t250\tfeature2\t850\nchr1\t100\t200\tfeature1\t960\nchr2\t290\t390\tfeature4\t630\nchr2\t180\t280\tfeature5\t920\nchr3\t120\t220\tfeature6\t410\n" > "test_data/expected_chrThenScoreA.bed"
+printf "chr1\t100\t200\tfeature1\t960\nchr1\t150\t250\tfeature2\t850\nchr1\t300\t400\tfeature3\t740\nchr2\t180\t280\tfeature5\t920\nchr2\t290\t390\tfeature4\t630\nchr3\t120\t220\tfeature6\t410\n" > "test_data/expected_chrThenScoreD.bed"
+printf "chr1\t100\t200\tfeature1\t960\nchr1\t150\t250\tfeature2\t850\nchr1\t300\t400\tfeature3\t740\nchr3\t120\t220\tfeature6\t410\nchr2\t180\t280\tfeature5\t920\nchr2\t290\t390\tfeature4\t630\n" > "test_data/expected_genome.bed"
+printf "#Header\nchr1\t100\t200\nchr1\t150\t250\nchr1\t300\t400\n" > "test_data/expected_header.bed"
+
+# expected_sorted.gff
+printf "chr1\t.\tgene\t1000\t2000\t.\t+\t.\tID=gene1;Name=Gene1\n" >> "test_data/expected_sorted.gff"
+printf "chr1\t.\texon\t1000\t1200\t.\t+\t.\tID=exon1;Parent=transcript1\n" >> "test_data/expected_sorted.gff"
+printf "chr1\t.\tCDS\t1000\t1200\t.\t+\t0\tID=cds1;Parent=transcript1\n" >> "test_data/expected_sorted.gff"
+printf "chr1\t.\tCDS\t1500\t1700\t.\t+\t2\tID=cds2;Parent=transcript1\n" >> "test_data/expected_sorted.gff"
+printf "chr2\t.\texon\t1500\t1700\t.\t+\t.\tID=exon2;Parent=transcript1\n" >> "test_data/expected_sorted.gff"
+printf "chr3\t.\tmRNA\t1000\t2000\t.\t+\t.\tID=transcript1;Parent=gene1\n" >> "test_data/expected_sorted.gff"
+
+# Test 1: Default sort on BED file
+mkdir test1
+cd test1
+
+echo "> Run bedtools_sort on BED file"
+"$meta_executable" \
+  --input "../test_data/featureA.bed" \
+  --output "output.bed"
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_sorted_A.bed"
+echo "- test1 succeeded -"
+
+cd ..
+
+# Test 2: Default sort on GFF file
+mkdir test2
+cd test2
+
+echo "> Run bedtools_sort on GFF file"
+"$meta_executable" \
+  --input "../test_data/example.gff" \
+  --output "output.gff"
+
+# checks
+assert_file_exists "output.gff"
+assert_file_not_empty "output.gff"
+assert_identical_content "output.gff" "../test_data/expected_sorted.gff"
+echo "- test2 succeeded -"
+
+cd ..
+
+# Test 3: Sort on sizeA
+mkdir test3
+cd test3
+
+echo "> Run bedtools_sort on BED file with sizeA"
+"$meta_executable" \
+  --input "../test_data/featureB.bed" \
+  --output "output.bed" \
+  --sizeA
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_sizeA.bed"
+echo "- test3 succeeded -"
+
+cd ..
+
+# Test 4: Sort on sizeD
+mkdir test4
+cd test4
+
+echo "> Run bedtools_sort on BED file with sizeD"
+"$meta_executable" \
+  --input "../test_data/featureB.bed" \
+  --output "output.bed" \
+  --sizeD
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_sizeD.bed"
+echo "- test4 succeeded -"
+
+cd ..
+
+# Test 5: Sort on chrThenSizeA
+mkdir test5
+cd test5
+
+echo "> Run bedtools_sort on BED file with chrThenSizeA"
+"$meta_executable" \
+    --input "../test_data/featureB.bed" \
+    --output "output.bed" \
+    --chrThenSizeA
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_chrThenSizeA.bed"
+echo "- test5 succeeded -"
+
+cd ..
+
+# Test 6: Sort on chrThenSizeD
+mkdir test6
+cd test6
+
+echo "> Run bedtools_sort on BED file with chrThenSizeD"
+"$meta_executable" \
+    --input "../test_data/featureB.bed" \
+    --output "output.bed" \
+    --chrThenSizeD
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_chrThenSizeD.bed"
+echo "- test6 succeeded -"
+
+cd ..
+
+# Test 7: Sort on chrThenScoreA
+mkdir test7
+cd test7
+
+echo "> Run bedtools_sort on BED file with chrThenScoreA"
+"$meta_executable" \
+    --input "../test_data/featureC.bed" \
+    --output "output.bed" \
+    --chrThenScoreA
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_chrThenScoreA.bed"
+echo "- test7 succeeded -"
+
+cd ..
+
+# Test 8: Sort on chrThenScoreD
+mkdir test8
+cd test8
+
+echo "> Run bedtools_sort on BED file with chrThenScoreD"
+"$meta_executable" \
+    --input "../test_data/featureC.bed" \
+    --output "output.bed" \
+    --chrThenScoreD
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_chrThenScoreD.bed"
+echo "- test8 succeeded -"
+
+cd ..
+
+# Test 9: Sort according to genome file
+mkdir test9
+cd test9
+
+echo "> Run bedtools_sort on BED file according to genome file"
+"$meta_executable" \
+    --input "../test_data/featureC.bed" \
+    --output "output.bed" \
+    --genome "../test_data/genome.txt"
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_genome.bed"
+echo "- test9 succeeded -"
+
+cd ..
+
+# Test 10: Sort according to faidx file
+mkdir test10
+cd test10
+
+echo "> Run bedtools_sort on BED file according to faidx file"
+"$meta_executable" \
+    --input "../test_data/featureC.bed" \
+    --output "output.bed" \
+    --faidx "../test_data/genome.fai"
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_genome.bed"
+echo "- test10 succeeded -"
+
+cd ..
+
+# Test 11: Sort with header
+mkdir test11
+cd test11
+
+echo "> Run bedtools_sort on BED file with header"
+"$meta_executable" \
+    --input "../test_data/featureA.bed" \
+    --output "output.bed" \
+    --header
+
+# checks
+assert_file_exists "output.bed"
+assert_file_not_empty "output.bed"
+assert_identical_content "output.bed" "../test_data/expected_header.bed"
+echo "- test11 succeeded -"
+
+cd ..
+
+echo "---- All tests succeeded! ----"
+exit 0

From 4aa0a893d2f8be5f0d03797afc15a04c53664367 Mon Sep 17 00:00:00 2001
From: Leila011 <leilapaquay@gmail.com>
Date: Wed, 31 Jul 2024 21:23:22 +0200
Subject: [PATCH 6/9] Add agat convert bed2gff (#97)

* add config

* add help

* add script

* add test data and expected output file

* add script to get test data

* add tests

* update changelog

* fix path to test data

---------

Co-authored-by: Robrecht Cannoodt <rcannood@gmail.com>
---
 CHANGELOG.md                                  |  9 +-
 src/agat/agat_convert_bed2gff/config.vsh.yaml | 86 ++++++++++++++++++
 src/agat/agat_convert_bed2gff/help.txt        | 89 +++++++++++++++++++
 src/agat/agat_convert_bed2gff/script.sh       | 19 ++++
 src/agat/agat_convert_bed2gff/test.sh         | 27 ++++++
 .../test_data/agat_convert_bed2gff_1.gff      | 12 +++
 .../agat_convert_bed2gff/test_data/script.sh  | 10 +++
 .../agat_convert_bed2gff/test_data/test.bed   |  1 +
 8 files changed, 250 insertions(+), 3 deletions(-)
 create mode 100644 src/agat/agat_convert_bed2gff/config.vsh.yaml
 create mode 100644 src/agat/agat_convert_bed2gff/help.txt
 create mode 100644 src/agat/agat_convert_bed2gff/script.sh
 create mode 100644 src/agat/agat_convert_bed2gff/test.sh
 create mode 100644 src/agat/agat_convert_bed2gff/test_data/agat_convert_bed2gff_1.gff
 create mode 100755 src/agat/agat_convert_bed2gff/test_data/script.sh
 create mode 100644 src/agat/agat_convert_bed2gff/test_data/test.bed

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1debf12b..9dd2389c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,11 +19,14 @@
   - `seqtk/seqtk_subseq`: Extract the sequences (complete or subsequence) from the FASTA/FASTQ files
                 based on a provided sequence IDs or region coordinates file (PR #85).
 
-* `agat/agat_convert_sp_gff2gtf`: convert any GTF/GFF file into a proper GTF file (PR #76).
+* `agat`:
+  - `agat_convert_sp_gff2gtf`: convert any GTF/GFF file into a proper GTF file (PR #76).
+  - `/agat_convert_bed2gff`: convert bed file to gff format (PR #97).
 
 * `bedtools`:
-    - `bedtools/bedtools_intersect`: Allows one to screen for overlaps between two sets of genomic features (PR #94).
-    - `bedtools/bedtools_sort`: Sorts a feature file (bed/gff/vcf) by chromosome and other criteria (PR #98).
+  - `bedtools/bedtools_intersect`: Allows one to screen for overlaps between two sets of genomic features (PR #94).
+  - `bedtools/bedtools_sort`: Sorts a feature file (bed/gff/vcf) by chromosome and other criteria (PR #98).
+
 
 ## MINOR CHANGES
 
diff --git a/src/agat/agat_convert_bed2gff/config.vsh.yaml b/src/agat/agat_convert_bed2gff/config.vsh.yaml
new file mode 100644
index 00000000..a0fafc44
--- /dev/null
+++ b/src/agat/agat_convert_bed2gff/config.vsh.yaml
@@ -0,0 +1,86 @@
+name: agat_convert_bed2gff
+namespace: agat
+description: |
+  The script takes a bed file as input, and will translate it in gff format. The BED format is described here The script converts 0-based, half-open [start-1, end) bed file to 1-based, closed [start, end] General Feature Format v3 (GFF3).
+keywords: [gene annotations, GFF conversion]
+links:
+  homepage: https://github.com/NBISweden/AGAT
+  documentation: https://agat.readthedocs.io/en/latest/tools/agat_convert_bed2gff.html
+  issue_tracker: https://github.com/NBISweden/AGAT/issues
+  repository: https://github.com/NBISweden/AGAT
+references: 
+  doi: 10.5281/zenodo.3552717
+license: GPL-3.0
+authors:
+  - __merge__: /src/_authors/leila_paquay.yaml
+    roles: [ author, maintainer ]
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: --bed
+        description: Input bed file that will be converted.
+        type: file
+        required: true
+        direction: input
+        example: input.bed
+  - name: Outputs
+    arguments:       
+      - name: --output
+        alternatives: [-o, --out, --outfile, --gff]
+        description: Output GFF file. If no output file is specified, the output will be written to STDOUT.
+        type: file
+        direction: output
+        required: true
+        example: output.gff
+  - name: Arguments
+    arguments:
+      - name: --source
+        description: |
+          The source informs about the tool used to produce the data and is stored in 2nd field of a gff file. Example: Stringtie, Maker, Augustus, etc. [default: data]
+        type: string
+        required: false
+        example: Stringtie
+      - name: --primary_tag
+        description: |
+          The primary_tag corresponds to the data type and is stored in 3rd field of a gff file. Example: gene, mRNA, CDS, etc. [default: gene]
+        type: string
+        required: false
+        example: gene
+      - name: --inflate_off
+        description: |
+          By default we inflate the block fields (blockCount, blockSizes, blockStarts) to create subfeatures of the main feature (primary_tag). The type of subfeature created is based on the inflate_type parameter. If you do not want this inflating behaviour you can deactivate it by using the --inflate_off option.
+        type: boolean_false
+      - name: --inflate_type
+        description: |
+          Feature type (3rd column in gff) created when inflate parameter activated [default: exon].
+        type: string
+        required: false
+        example: exon
+      - name: --verbose
+        description: add verbosity
+        type: boolean_true
+      - name: --config
+        alternatives: [-c]
+        description: |
+          Input agat config file. By default AGAT takes as input agat_config.yaml file from the working directory if any, otherwise it takes the orignal agat_config.yaml shipped with AGAT. To get the agat_config.yaml locally type: "agat config --expose". The --config option gives you the possibility to use your own AGAT config file (located elsewhere or named differently).
+        type: file
+        required: false
+        example: custom_agat_config.yaml
+resources:
+  - type: bash_script
+    path: script.sh
+test_resources:
+  - type: bash_script
+    path: test.sh
+  - type: file
+    path: test_data
+engines:
+  - type: docker
+    image: quay.io/biocontainers/agat:1.4.0--pl5321hdfd78af_0
+    setup:
+      - type: docker
+        run: |
+          agat --version | sed 's/AGAT\s\(.*\)/agat: "\1"/' > /var/software_versions.txt
+runners:
+  - type: executable
+  - type: nextflow
\ No newline at end of file
diff --git a/src/agat/agat_convert_bed2gff/help.txt b/src/agat/agat_convert_bed2gff/help.txt
new file mode 100644
index 00000000..56e953d7
--- /dev/null
+++ b/src/agat/agat_convert_bed2gff/help.txt
@@ -0,0 +1,89 @@
+```sh
+agat_convert_bed2gff.pl --help
+```
+ ------------------------------------------------------------------------------
+|   Another GFF Analysis Toolkit (AGAT) - Version: v1.4.0                      |
+|   https://github.com/NBISweden/AGAT                                          |
+|   National Bioinformatics Infrastructure Sweden (NBIS) - www.nbis.se         |
+ ------------------------------------------------------------------------------
+
+
+Name:
+    agat_convert_bed2gff.pl
+
+Description:
+    The script takes a bed file as input, and will translate it in gff
+    format. The BED format is described here:
+    https://genome.ucsc.edu/FAQ/FAQformat.html#format1 The script converts
+    0-based, half-open [start-1, end) bed file to 1-based, closed [start,
+    end] General Feature Format v3 (GFF3).
+
+Usage:
+        agat_convert_bed2gff.pl --bed infile.bed [ -o outfile ]
+        agat_convert_bed2gff.pl -h
+
+Options:
+    --bed   Input bed file that will be converted.
+
+    --source
+            The source informs about the tool used to produce the data and
+            is stored in 2nd field of a gff file. Example:
+            Stringtie,Maker,Augustus,etc. [default: data]
+
+    --primary_tag
+            The primary_tag corresponds to the data type and is stored in
+            3rd field of a gff file. Example: gene,mRNA,CDS,etc. [default:
+            gene]
+
+    --inflate_off
+            By default we inflate the block fields (blockCount, blockSizes,
+            blockStarts) to create subfeatures of the main feature
+            (primary_tag). The type of subfeature created is based on the
+            inflate_type parameter. If you do not want this inflating
+            behaviour you can deactivate it by using the --inflate_off
+            option.
+
+    --inflate_type
+            Feature type (3rd column in gff) created when inflate parameter
+            activated [default: exon].
+
+    --verbose
+            add verbosity
+
+    -o , --output , --out , --outfile or --gff
+            Output GFF file. If no output file is specified, the output will
+            be written to STDOUT.
+
+    -c or --config
+            String - Input agat config file. By default AGAT takes as input
+            agat_config.yaml file from the working directory if any,
+            otherwise it takes the orignal agat_config.yaml shipped with
+            AGAT. To get the agat_config.yaml locally type: "agat config
+            --expose". The --config option gives you the possibility to use
+            your own AGAT config file (located elsewhere or named
+            differently).
+
+    -h or --help
+            Display this helpful text.
+
+Feedback:
+  Did you find a bug?:
+    Do not hesitate to report bugs to help us keep track of the bugs and
+    their resolution. Please use the GitHub issue tracking system available
+    at this address:
+
+                https://github.com/NBISweden/AGAT/issues
+
+     Ensure that the bug was not already reported by searching under Issues.
+     If you're unable to find an (open) issue addressing the problem, open a new one.
+     Try as much as possible to include in the issue when relevant:
+     - a clear description,
+     - as much relevant information as possible,
+     - the command used,
+     - a data sample,
+     - an explanation of the expected behaviour that is not occurring.
+
+  Do you want to contribute?:
+    You are very welcome, visit this address for the Contributing
+    guidelines:
+    https://github.com/NBISweden/AGAT/blob/master/CONTRIBUTING.md
diff --git a/src/agat/agat_convert_bed2gff/script.sh b/src/agat/agat_convert_bed2gff/script.sh
new file mode 100644
index 00000000..fbeb9206
--- /dev/null
+++ b/src/agat/agat_convert_bed2gff/script.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+
+# unset flags
+[[ "$par_inflate_off" == "true" ]] && unset par_inflate_off
+[[ "$par_verbose" == "false" ]] && unset par_verbose
+
+# run agat_convert_sp_bed2gff.pl
+agat_convert_bed2gff.pl \
+  --bed "$par_bed" \
+  -o "$par_output" \
+  ${par_source:+--source "${par_source}"} \
+  ${par_primary_tag:+--primary_tag "${par_primary_tag}"} \
+  ${par_inflate_off:+--inflate_off} \
+  ${par_inflate_type:+--inflate_type "${par_inflate_type}"} \
+  ${par_verbose:+--verbose}
+  ${par_config:+--config "${par_config}"} \
diff --git a/src/agat/agat_convert_bed2gff/test.sh b/src/agat/agat_convert_bed2gff/test.sh
new file mode 100644
index 00000000..6e7d43f3
--- /dev/null
+++ b/src/agat/agat_convert_bed2gff/test.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+
+test_dir="${meta_resources_dir}/test_data"
+out_dir="${meta_resources_dir}/out_data"
+
+echo "> Run $meta_name with test data"
+"$meta_executable" \
+  --bed "$test_dir/test.bed" \
+  --output "$out_dir/output.gff" 
+
+echo ">> Checking output"
+[ ! -f "$out_dir/output.gff" ] && echo "Output file output.gff does not exist" && exit 1
+
+echo ">> Check if output is empty"
+[ ! -s "$out_dir/output.gff" ] && echo "Output file output.gff is empty" && exit 1
+
+echo ">> Check if output matches expected output"
+diff "$out_dir/output.gff" "$test_dir/agat_convert_bed2gff_1.gff"
+if [ $? -ne 0 ]; then
+  echo "Output file output.gff does not match expected output"
+  exit 1
+fi
+
+echo "> Test successful"
\ No newline at end of file
diff --git a/src/agat/agat_convert_bed2gff/test_data/agat_convert_bed2gff_1.gff b/src/agat/agat_convert_bed2gff/test_data/agat_convert_bed2gff_1.gff
new file mode 100644
index 00000000..587e3d09
--- /dev/null
+++ b/src/agat/agat_convert_bed2gff/test_data/agat_convert_bed2gff_1.gff
@@ -0,0 +1,12 @@
+##gff-version 3
+scaffold625	data	gene	337818	343277	.	+	.	ID=1;Name=CLUHART00000008717;blockCount=4;blockSizes=154%2C109%2C111%2C1314;blockStarts=0%2C2915%2C3700%2C4146;itemRgb=255%2C0%2C0;thickEnd=343033;thickStart=337914
+scaffold625	data	exon	337818	337971	.	+	.	ID=exon1;Parent=1
+scaffold625	data	exon	340733	340841	.	+	.	ID=exon2;Parent=1
+scaffold625	data	exon	341518	341628	.	+	.	ID=exon3;Parent=1
+scaffold625	data	exon	341964	343277	.	+	.	ID=exon4;Parent=1
+scaffold625	data	CDS	337915	337971	.	+	0	ID=CDS1;Parent=1
+scaffold625	data	CDS	340733	340841	.	+	0	ID=CDS2;Parent=1
+scaffold625	data	CDS	341518	341628	.	+	2	ID=CDS3;Parent=1
+scaffold625	data	CDS	341964	343033	.	+	2	ID=CDS4;Parent=1
+scaffold625	data	five_prime_UTR	337818	337914	.	+	.	ID=five_prime_UTR1;Parent=1
+scaffold625	data	three_prime_UTR	343034	343277	.	+	.	ID=three_prime_UTR1;Parent=1
diff --git a/src/agat/agat_convert_bed2gff/test_data/script.sh b/src/agat/agat_convert_bed2gff/test_data/script.sh
new file mode 100755
index 00000000..d1206a42
--- /dev/null
+++ b/src/agat/agat_convert_bed2gff/test_data/script.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# clone repo
+if [ ! -d /tmp/agat_source ]; then
+  git clone --depth 1 --single-branch --branch master https://github.com/NBISweden/AGAT /tmp/agat_source
+fi
+
+# copy test data
+cp -r /tmp/agat_source/t/scripts_output/in/test.bed src/agat/agat_convert_bed2gff/test_data/test.bed
+cp -r /tmp/agat_source/t/scripts_output/out/agat_convert_bed2gff_1.gff src/agat/agat_convert_bed2gff/test_data/agat_convert_bed2gff_1.gff
\ No newline at end of file
diff --git a/src/agat/agat_convert_bed2gff/test_data/test.bed b/src/agat/agat_convert_bed2gff/test_data/test.bed
new file mode 100644
index 00000000..bfeba3bb
--- /dev/null
+++ b/src/agat/agat_convert_bed2gff/test_data/test.bed
@@ -0,0 +1 @@
+scaffold625	337817	343277	CLUHART00000008717	0	+	337914	343033	255,0,0	4	154,109,111,1314	0,2915,3700,4146

From ede5850f577cbfe8ca5edf8525703535b12b4a36 Mon Sep 17 00:00:00 2001
From: Leila011 <leilapaquay@gmail.com>
Date: Sat, 10 Aug 2024 08:51:39 +0200
Subject: [PATCH 7/9] Add agat convert embl2gff (#99)

* add config

* add help

* add test data and expected output

* add script to get test data

* add running script

* add test script

* update description

* update changelog

* cleanup

* fix path to copy test data

* pull the test data again

* fix typo GTF => GFF

* fix tests

* fix output file: replace by generated output

* fix test data: add   --emblmygff3

* cleanup

* config: add longer name to `-k` and `-d`
---
 CHANGELOG.md                                  |  2 +
 .../agat_convert_embl2gff/config.vsh.yaml     | 84 +++++++++++++++++++
 src/agat/agat_convert_embl2gff/help.txt       | 78 +++++++++++++++++
 src/agat/agat_convert_embl2gff/script.sh      | 23 +++++
 src/agat/agat_convert_embl2gff/test.sh        | 28 +++++++
 .../test_data/agat_convert_embl2gff_1.embl    | 51 +++++++++++
 .../test_data/agat_convert_embl2gff_1.gff     | 10 +++
 .../agat_convert_embl2gff/test_data/script.sh | 10 +++
 8 files changed, 286 insertions(+)
 create mode 100644 src/agat/agat_convert_embl2gff/config.vsh.yaml
 create mode 100644 src/agat/agat_convert_embl2gff/help.txt
 create mode 100644 src/agat/agat_convert_embl2gff/script.sh
 create mode 100644 src/agat/agat_convert_embl2gff/test.sh
 create mode 100644 src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.embl
 create mode 100644 src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.gff
 create mode 100755 src/agat/agat_convert_embl2gff/test_data/script.sh

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9dd2389c..3c2f347a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -28,6 +28,8 @@
   - `bedtools/bedtools_sort`: Sorts a feature file (bed/gff/vcf) by chromosome and other criteria (PR #98).
 
 
+* `agat/agat_convert_embl2gff`: convert an EMBL file into GFF format (PR #99).
+
 ## MINOR CHANGES
 
 * `busco` components: update BUSCO to `5.7.1` (PR #72).
diff --git a/src/agat/agat_convert_embl2gff/config.vsh.yaml b/src/agat/agat_convert_embl2gff/config.vsh.yaml
new file mode 100644
index 00000000..99ceec46
--- /dev/null
+++ b/src/agat/agat_convert_embl2gff/config.vsh.yaml
@@ -0,0 +1,84 @@
+name: agat_convert_embl2gff
+namespace: agat
+description: |
+  The script takes an EMBL file as input, and will translate it in gff format.
+keywords: [gene annotations, GFF conversion]
+links:
+  homepage: https://github.com/NBISweden/AGAT
+  documentation: https://agat.readthedocs.io/en/latest/tools/agat_convert_embl2gff.html
+  issue_tracker: https://github.com/NBISweden/AGAT/issues
+  repository: https://github.com/NBISweden/AGAT
+references: 
+  doi: 10.5281/zenodo.3552717
+license: GPL-3.0
+authors:
+  - __merge__: /src/_authors/leila_paquay.yaml
+    roles: [ author, maintainer ]
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: --embl
+        description: Input EMBL file that will be read.
+        type: file
+        required: true
+        direction: input
+        example: input.embl
+  - name: Outputs
+    arguments:
+      - name: --output
+        alternatives: [-o, --out, --outfile, --gff]
+        description: Output GFF file. If no output file is specified, the output will be written to STDOUT.
+        type: file
+        direction: output
+        required: false
+        example: output.gff
+  - name: Arguments
+    arguments:
+      - name: --emblmygff3
+        description: |
+          Means that the EMBL flat file comes from the EMBLmyGFF3 software. This is an EMBL format dedicated for submission and contains particularity to deal with. This parameter is needed to get a proper sequence id in the GFF3 from an embl made with EMBLmyGFF3.
+        type: boolean_true
+      - name: --primary_tag
+        alternatives: [--pt, -t]
+        description: |
+          List of "primary tag". Useful to discard or keep specific features. Multiple tags must be comma-separated.
+        type: string
+        multiple: true
+        required: false
+        example: [tag1, tag2]
+      - name: --discard
+        alternatives: [-d]
+        description: |
+          Means that primary tags provided by the option "primary_tag" will be discarded.
+        type: boolean_true
+      - name: --keep
+        alternatives: [-k]
+        description: |
+          Means that only primary tags provided by the option "primary_tag" will be kept.
+        type: boolean_true
+      - name: --config
+        alternatives: [-c]
+        description: |
+          Input agat config file. By default AGAT takes as input agat_config.yaml file from the working directory if any, otherwise it takes the original agat_config.yaml shipped with AGAT. To get the agat_config.yaml locally type: "agat config --expose". The --config option gives you the possibility to use your own AGAT config file (located elsewhere or named differently).
+        type: file
+        required: false
+        example: custom_agat_config.yaml
+resources:
+  - type: bash_script
+    path: script.sh
+test_resources:
+  - type: bash_script
+    path: test.sh
+  - type: file
+    path: test_data
+engines:
+  - type: docker
+    image: quay.io/biocontainers/agat:1.4.0--pl5321hdfd78af_0
+    setup:
+      - type: docker
+        run: |
+          agat --version | sed 's/AGAT\s\(.*\)/agat: "\1"/' > /var/software_versions.txt
+runners:
+  - type: executable
+  - type: nextflow
\ No newline at end of file
diff --git a/src/agat/agat_convert_embl2gff/help.txt b/src/agat/agat_convert_embl2gff/help.txt
new file mode 100644
index 00000000..5fce4939
--- /dev/null
+++ b/src/agat/agat_convert_embl2gff/help.txt
@@ -0,0 +1,78 @@
+ ```sh
+agat_convert_embl2gff.pl --help
+```
+ 
+ ------------------------------------------------------------------------------
+|   Another GFF Analysis Toolkit (AGAT) - Version: v1.4.0                      |
+|   https://github.com/NBISweden/AGAT                                          |
+|   National Bioinformatics Infrastructure Sweden (NBIS) - www.nbis.se         |
+ ------------------------------------------------------------------------------
+
+
+Name:
+    agat_converter_embl2gff.pl
+
+Description:
+    The script takes an EMBL file as input, and will translate it in gff
+    format.
+
+Usage:
+        agat_converter_embl2gff.pl --embl infile.embl [ -o outfile ]
+
+Options:
+    --embl  Input EMBL file that will be read
+
+    --emblmygff3
+            Bolean - Means that the EMBL flat file comes from the EMBLmyGFF3
+            software. This is an EMBL format dedicated for submission and
+            contains particularity to deal with. This parameter is needed to
+            get a proper sequence id in the GFF3 from an embl made with
+            EMBLmyGFF3.
+
+    --primary_tag, --pt, -t
+            List of "primary tag". Useful to discard or keep specific
+            features. Multiple tags must be coma-separated.
+
+    -d      Bolean - Means that primary tags provided by the option
+            "primary_tag" will be discarded.
+
+    -k      Bolean - Means that only primary tags provided by the option
+            "primary_tag" will be kept.
+
+    -o, --output, --out, --outfile or --gff
+            Output GFF file. If no output file is specified, the output will
+            be written to STDOUT.
+
+    -c or --config
+            String - Input agat config file. By default AGAT takes as input
+            agat_config.yaml file from the working directory if any,
+            otherwise it takes the orignal agat_config.yaml shipped with
+            AGAT. To get the agat_config.yaml locally type: "agat config
+            --expose". The --config option gives you the possibility to use
+            your own AGAT config file (located elsewhere or named
+            differently).
+
+    -h or --help
+            Display this helpful text.
+
+Feedback:
+  Did you find a bug?:
+    Do not hesitate to report bugs to help us keep track of the bugs and
+    their resolution. Please use the GitHub issue tracking system available
+    at this address:
+
+                https://github.com/NBISweden/AGAT/issues
+
+     Ensure that the bug was not already reported by searching under Issues.
+     If you're unable to find an (open) issue addressing the problem, open a new one.
+     Try as much as possible to include in the issue when relevant:
+     - a clear description,
+     - as much relevant information as possible,
+     - the command used,
+     - a data sample,
+     - an explanation of the expected behaviour that is not occurring.
+
+  Do you want to contribute?:
+    You are very welcome, visit this address for the Contributing
+    guidelines:
+    https://github.com/NBISweden/AGAT/blob/master/CONTRIBUTING.md
diff --git a/src/agat/agat_convert_embl2gff/script.sh b/src/agat/agat_convert_embl2gff/script.sh
new file mode 100644
index 00000000..63ab8df0
--- /dev/null
+++ b/src/agat/agat_convert_embl2gff/script.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+
+
+# unset flags
+[[ "$par_emblmygff3" == "false" ]] && unset par_emblmygff3
+[[ "$par_discard" == "false" ]] && unset par_discard
+[[ "$par_keep" == "false" ]] && unset par_keep
+
+# replace ';' with ','
+par_primary_tag=$(echo $par_primary_tag | tr ';' ',')
+
+# run agat_convert_embl2gff
+agat_convert_embl2gff.pl \
+  --embl "$par_embl" \
+  -o "$par_output" \
+  ${par_emblmygff3:+--emblmygff3} \
+  ${par_primary_tag:+--primary_tag "${par_primary_tag}"} \
+  ${par_discard:+-d} \
+  ${par_keep:+-k} \
+  ${par_config:+--config "${par_config}"}
diff --git a/src/agat/agat_convert_embl2gff/test.sh b/src/agat/agat_convert_embl2gff/test.sh
new file mode 100644
index 00000000..81d24aaa
--- /dev/null
+++ b/src/agat/agat_convert_embl2gff/test.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+
+test_dir="${meta_resources_dir}/test_data"
+out_dir="${meta_resources_dir}/out_data"
+
+echo "> Run $meta_name with test data and --emblmygff3"
+"$meta_executable" \
+  --embl "$test_dir/agat_convert_embl2gff_1.embl" \
+  --output "$out_dir/output.gff" \
+  --emblmygff3
+
+echo ">> Checking output"
+[ ! -f "$out_dir/output.gff" ] && echo "Output file output.gff does not exist" && exit 1
+
+echo ">> Check if output is empty"
+[ ! -s "$out_dir/output.gff" ] && echo "Output file output.gff is empty" && exit 1
+
+echo ">> Check if output matches expected output"
+diff "$out_dir/output.gff" "$test_dir/agat_convert_embl2gff_1.gff"
+if [ $? -ne 0 ]; then
+  echo "Output file output.gff does not match expected output"
+  exit 1
+fi
+
+echo "> Test successful"
\ No newline at end of file
diff --git a/src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.embl b/src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.embl
new file mode 100644
index 00000000..aa4f50aa
--- /dev/null
+++ b/src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.embl
@@ -0,0 +1,51 @@
+ID   patatrac; SV 1; circular; genomic DNA; XXX; PRO; 317941 BP.
+XX
+AC   XXX; 
+XX
+AC * _ERS324955|SC|contig000001
+XX
+PR   Project:PRJEBNNNN;
+XX
+DE   XXX
+XX
+RN   [1]
+RP   1-2149
+RA   XXX;
+RT   ;
+RL   Submitted {(DD-MMM-YYYY)} to the INSDC.
+XX
+FH   Key           Location/Qualifiers
+FH
+FT   source        1..588788
+FT                 /organism={"scientific organism name"}
+FT                 /mol_type={"in vivo molecule type of sequence"}
+XX
+SQ   Sequence 588788 BP; 101836 A; 193561 C; 192752 G; 100639 T; 0 other;
+     tgcgtactcg aagagacgcg cccagattat ataagggcgt cgtctcgagg ccgacggcgc        60
+     gccggcgagt acgcgtgatc cacaacccga agcgaccgtc gggagaccga gggtcgtcga       120
+     gggtggatac gttcctgcct tcgtgccggg aaacggccga agggaacgtg gcgacctgcg       180
+//
+ID   fdssf; SV 1; circular; genomic DNA; XXX; PRO; 317941 BP.
+XX
+AC   XXX; 
+XX
+AC * _ERS344554
+XX
+PR   Project:PRJEBNNNN;
+XX
+DE   XXX
+XX
+RN   [1]
+RP   1-2149
+RA   XXX;
+RT   ;
+RL   Submitted {(DD-MMM-YYYY)} to the INSDC.
+XX
+FH   Key           Location/Qualifiers
+FH
+FT   source        1..588788
+FT                 /organism={"scientific organism name"}
+FT                 /mol_type={"in vivo molecule type of sequence"}
+XX
+SQ   Sequence 588788 BP; 101836 A; 193561 C; 192752 G; 100639 T; 0 other;
+     TTTTTTTTTT aagagacgcg cccagattat ataagggcgt cgtctcgagg ccgacggcgc        60
diff --git a/src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.gff b/src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.gff
new file mode 100644
index 00000000..f6893022
--- /dev/null
+++ b/src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.gff
@@ -0,0 +1,10 @@
+##gff-version 3
+ERS324955|SC|contig000001	EMBL/GenBank/SwissProt	source	1	588788	.	+	1	mol_type={"in vivo molecule type of sequence"};organism={"scientific organism name"}
+ERS344554	EMBL/GenBank/SwissProt	source	1	588788	.	+	1	mol_type={"in vivo molecule type of sequence"};organism={"scientific organism name"}
+##FASTA
+>ERS324955|SC|contig000001 XXX
+TGCGTACTCGAAGAGACGCGCCCAGATTATATAAGGGCGTCGTCTCGAGGCCGACGGCGCGCCGGCGAGTACGCGTGATC
+CACAACCCGAAGCGACCGTCGGGAGACCGAGGGTCGTCGAGGGTGGATACGTTCCTGCCTTCGTGCCGGGAAACGGCCGA
+AGGGAACGTGGCGACCTGCG
+>ERS344554 XXX
+TTTTTTTTTTAAGAGACGCGCCCAGATTATATAAGGGCGTCGTCTCGAGGCCGACGGCGC
diff --git a/src/agat/agat_convert_embl2gff/test_data/script.sh b/src/agat/agat_convert_embl2gff/test_data/script.sh
new file mode 100755
index 00000000..7ddbce5b
--- /dev/null
+++ b/src/agat/agat_convert_embl2gff/test_data/script.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# clone repo
+if [ ! -d /tmp/agat_source ]; then
+  git clone --depth 1 --single-branch --branch master https://github.com/NBISweden/AGAT /tmp/agat_source
+fi
+
+# copy test data
+cp -r /tmp/agat_source/t/scripts_output/in/agat_convert_embl2gff_1.embl src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.embl
+cp -r /tmp/agat_source/t/scripts_output/out/agat_convert_embl2gff_1.gff src/agat/agat_convert_embl2gff/test_data/agat_convert_embl2gff_1.gff
\ No newline at end of file

From d5fc46b1d9ef8313c06e369bc881f6de75c53dd4 Mon Sep 17 00:00:00 2001
From: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com>
Date: Tue, 13 Aug 2024 10:14:59 +0200
Subject: [PATCH 8/9] Avoid duplicate code when unsetting multiple boolean
 arguments (#133)

* Avoid duplicate code when unsetting multiple boolean arguments

* Add CHANGELOG entry [ci skip]

* Update CONTRIBUTING guide
---
 CHANGELOG.md                              |  2 +
 CONTRIBUTING.md                           | 25 ++++++
 src/bedtools/bedtools_intersect/script.sh | 49 +++++++-----
 src/bedtools/bedtools_sort/script.sh      | 21 +++--
 src/busco/busco_run/script.sh             | 28 ++++---
 src/fastp/script.sh                       | 45 ++++++-----
 src/featurecounts/script.sh               | 43 +++++-----
 src/gffread/script.sh                     | 97 ++++++++++++-----------
 src/lofreq/call/script.sh                 | 37 +++++----
 src/multiqc/script.sh                     | 44 +++++-----
 src/salmon/salmon_index/script.sh         | 19 +++--
 src/salmon/salmon_quant/script.sh         | 90 +++++++++++----------
 src/samtools/samtools_fastq/script.sh     | 17 ++--
 src/samtools/samtools_sort/script.sh      | 25 +++---
 src/samtools/samtools_view/script.sh      | 38 +++++----
 src/umi_tools/umi_tools_dedup/script.sh   | 33 +++++---
 src/umi_tools/umi_tools_extract/script.sh | 19 +++--
 17 files changed, 380 insertions(+), 252 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3c2f347a..5030894c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -36,6 +36,8 @@
 
 * Update CI to reusable workflow in `viash-io/viash-actions` (PR #86).
 
+* Update several components in order to avoid duplicate code when using `unset` on boolean arguments (PR #133).
+
 ## DOCUMENTATION
 
 * Extend the contributing guidelines (PR #82):
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index cee4249a..a32b680c 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -320,6 +320,31 @@ Notes:
 
 * If your tool allows for multiple inputs using a separator other than `;` (which is the default Viash multiple separator), you can substitute these values with a command like: `par_disable_filters=$(echo $par_disable_filters | tr ';' ',')`.
 
+* If you have a lot of boolean variables that you would like to unset when the value is `false`, you can avoid duplicate code by using the following syntax:
+
+```bash
+unset_if_false=(
+    par_argument_1
+    par_argument_2
+    par_argument_3
+    par_argument_4
+)
+
+for par in ${unset_if_false[@]}; do
+    test_val="${!par}"
+    [[ "$test_val" == "false" ]] && unset $par
+done
+```
+
+this code is equivalent to
+
+```bash
+[[ "$par_argument_1" == "false" ]] && unset par_argument_1
+[[ "$par_argument_2" == "false" ]] && unset par_argument_2
+[[ "$par_argument_3" == "false" ]] && unset par_argument_3
+[[ "$par_argument_4" == "false" ]] && unset par_argument_4
+```
+
 
 ### Step 12: Create test script
 
diff --git a/src/bedtools/bedtools_intersect/script.sh b/src/bedtools/bedtools_intersect/script.sh
index 2141859d..04a8d854 100644
--- a/src/bedtools/bedtools_intersect/script.sh
+++ b/src/bedtools/bedtools_intersect/script.sh
@@ -3,27 +3,34 @@
 ## VIASH START
 ## VIASH END
 
-[[ "$par_write_a" == "false" ]] && unset par_write_a
-[[ "$par_write_b" == "false" ]] && unset par_write_b
-[[ "$par_left_outer_join" == "false" ]] && unset par_left_outer_join
-[[ "$par_write_overlap" == "false" ]] && unset par_write_overlap
-[[ "$par_write_overlap_plus" == "false" ]] && unset par_write_overlap_plus
-[[ "$par_report_A_if_no_overlap" == "false" ]] && unset par_report_A_if_no_overlap
-[[ "$par_number_of_overlaps_A" == "false" ]] && unset par_number_of_overlaps_A
-[[ "$par_report_no_overlaps_A" == "false" ]] && unset par_report_no_overlaps_A
-[[ "$par_uncompressed_bam" == "false" ]] && unset par_uncompressed_bam
-[[ "$par_same_strand" == "false" ]] && unset par_same_strand
-[[ "$par_opposite_strand" == "false" ]] && unset par_opposite_strand
-[[ "$par_reciprocal_overlap" == "false" ]] && unset par_reciprocal_overlap
-[[ "$par_either_overlap" == "false" ]] && unset par_either_overlap
-[[ "$par_split" == "false" ]] && unset par_split
-[[ "$par_nonamecheck" == "false" ]] && unset par_nonamecheck
-[[ "$par_sorted" == "false" ]] && unset par_sorted
-[[ "$par_filenames" == "false" ]] && unset par_filenames
-[[ "$par_sortout" == "false" ]] && unset par_sortout
-[[ "$par_bed" == "false" ]] && unset par_bed
-[[ "$par_header" == "false" ]] && unset par_header
-[[ "$par_no_buffer_output" == "false" ]] && unset par_no_buffer_output
+unset_if_false=(
+    par_write_a
+    par_write_b
+    par_left_outer_join
+    par_write_overlap
+    par_write_overlap_plus
+    par_report_A_if_no_overlap
+    par_number_of_overlaps_A
+    par_report_no_overlaps_A
+    par_uncompressed_bam
+    par_same_strand
+    par_opposite_strand
+    par_reciprocal_overlap
+    par_either_overlap
+    par_split
+    par_nonamecheck
+    par_sorted
+    par_filenames
+    par_sortout
+    par_bed
+    par_no_buffer_output
+)
+
+for par in ${unset_if_false[@]}; do
+    test_val="${!par}"
+    [[ "$test_val" == "false" ]] && unset $par
+done
+
 
 # Create input array 
 IFS=";" read -ra input <<< $par_input_b
diff --git a/src/bedtools/bedtools_sort/script.sh b/src/bedtools/bedtools_sort/script.sh
index e7f712d7..0d0b9b54 100644
--- a/src/bedtools/bedtools_sort/script.sh
+++ b/src/bedtools/bedtools_sort/script.sh
@@ -4,13 +4,20 @@
 ## VIASH END
 
 # Unset parameters
-[[ "$par_sizeA" == "false" ]] && unset par_sizeA
-[[ "$par_sizeD" == "false" ]] && unset par_sizeD
-[[ "$par_chrThenSizeA" == "false" ]] && unset par_chrThenSizeA
-[[ "$par_chrThenSizeD" == "false" ]] && unset par_chrThenSizeD
-[[ "$par_chrThenScoreA" == "false" ]] && unset par_chrThenScoreA
-[[ "$par_chrThenScoreD" == "false" ]] && unset par_chrThenScoreD
-[[ "$par_header" == "false" ]] && unset par_header
+unset_if_false=(
+    par_sizeA
+    par_sizeD
+    par_chrThenSizeA
+    par_chrThenSizeD
+    par_chrThenScoreA
+    par_chrThenScoreD
+    par_header
+)
+
+for par in ${unset_if_false[@]}; do
+    test_val="${!par}"
+    [[ "$test_val" == "false" ]] && unset $par
+done
 
 # Execute bedtools sort with the provided arguments
 bedtools sort \
diff --git a/src/busco/busco_run/script.sh b/src/busco/busco_run/script.sh
index a0ef24de..673ccd0b 100644
--- a/src/busco/busco_run/script.sh
+++ b/src/busco/busco_run/script.sh
@@ -3,18 +3,24 @@
 ## VIASH START
 ## VIASH END
 
+unset_if_false=(
+    par_tar
+    par_force
+    par_quiet
+    par_restart
+    par_auto_lineage
+    par_auto_lineage_euk
+    par_auto_lineage_prok
+    par_augustus
+    par_long
+    par_scaffold_composition
+    par_miniprot
+)
 
-[[ "$par_tar" == "false" ]] && unset par_tar
-[[ "$par_force" == "false" ]] && unset par_force
-[[ "$par_quiet" == "false" ]] && unset par_quiet
-[[ "$par_restart" == "false" ]] && unset par_restart
-[[ "$par_auto_lineage" == "false" ]] && unset par_auto_lineage
-[[ "$par_auto_lineage_euk" == "false" ]] && unset par_auto_lineage_euk
-[[ "$par_auto_lineage_prok" == "false" ]] && unset par_auto_lineage_prok
-[[ "$par_augustus" == "false" ]] && unset par_augustus
-[[ "$par_long" == "false" ]] && unset par_long
-[[ "$par_scaffold_composition" == "false" ]] && unset par_scaffold_composition
-[[ "$par_miniprot" == "false" ]] && unset par_miniprot
+for par in ${unset_if_false[@]}; do
+    test_val="${!par}"
+    [[ "$test_val" == "false" ]] && unset $par
+done
 
 tmp_dir=$(mktemp -d -p "$meta_temp_dir" busco_XXXXXXXXX)
 prefix=$(openssl rand -hex 8)
diff --git a/src/fastp/script.sh b/src/fastp/script.sh
index 4bb37c87..557f7ac3 100644
--- a/src/fastp/script.sh
+++ b/src/fastp/script.sh
@@ -4,25 +4,32 @@
 ## VIASH END
 
 # disable flags
-[[ "$par_disable_adapter_trimming" == "false" ]] && unset par_disable_adapter_trimming
-[[ "$par_detect_adapter_for_pe" == "false" ]] && unset par_detect_adapter_for_pe
-[[ "$par_merge" == "false" ]] && unset par_merge
-[[ "$par_include_unmerged" == "false" ]] && unset par_include_unmerged
-[[ "$par_interleaved_in" == "false" ]] && unset par_interleaved_in
-[[ "$par_fix_mgi_id" == "false" ]] && unset par_fix_mgi_id
-[[ "$par_phred64" == "false" ]] && unset par_phred64
-[[ "$par_dont_overwrite" == "false" ]] && unset par_dont_overwrite
-[[ "$par_verbose" == "false" ]] && unset par_verbose
-[[ "$par_dedup" == "false" ]] && unset par_dedup
-[[ "$par_dont_eval_duplication" == "false" ]] && unset par_dont_eval_duplication
-[[ "$par_trim_poly_g" == "false" ]] && unset par_trim_poly_g
-[[ "$par_disable_trim_poly_g" == "false" ]] && unset par_disable_trim_poly_g
-[[ "$par_trim_poly_x" == "false" ]] && unset par_trim_poly_x
-[[ "$par_disable_quality_filtering" == "false" ]] && unset par_disable_quality_filtering
-[[ "$par_disable_length_filtering" == "false" ]] && unset par_disable_length_filtering
-[[ "$par_low_complexity_filter" == "false" ]] && unset par_low_complexity_filter
-[[ "$par_umi" == "false" ]] && unset par_umi
-[[ "$par_overrepresentation_analysis" == "false" ]] && unset par_overrepresentation_analysis
+unset_if_false=(
+    par_disable_adapter_trimming
+    par_detect_adapter_for_pe
+    par_merge
+    par_include_unmerged
+    par_interleaved_in
+    par_fix_mgi_id
+    par_phred64
+    par_dont_overwrite
+    par_verbose
+    par_dedup
+    par_dont_eval_duplication
+    par_trim_poly_g
+    par_disable_trim_poly_g
+    par_trim_poly_x
+    par_disable_quality_filtering
+    par_disable_length_filtering
+    par_low_complexity_filter
+    par_umi
+    par_overrepresentation_analysis
+)
+
+for par in ${unset_if_false[@]}; do
+    test_val="${!par}"
+    [[ "$test_val" == "false" ]] && unset $par
+done
 
 # run command
 fastp \
diff --git a/src/featurecounts/script.sh b/src/featurecounts/script.sh
index 2e54feb3..53f8c63f 100644
--- a/src/featurecounts/script.sh
+++ b/src/featurecounts/script.sh
@@ -19,24 +19,31 @@ par_feature_type=$(echo $par_feature_type | tr ',' ';')
 par_extra_attributes=$(echo $par_extra_attributes | tr ',' ';')
 
 # unset flag variables
-[[ "$par_feature_level" == "false" ]] && unset par_feature_level
-[[ "$par_overlapping" == "false" ]] && unset par_overlapping
-[[ "$par_largest_overlap" == "false" ]] && unset par_largest_overlap
-[[ "$par_multi_mapping" == "false" ]] && unset par_multi_mapping
-[[ "$par_fraction" == "false" ]] && unset par_fraction
-[[ "$par_split_only" == "false" ]] && unset par_split_only
-[[ "$par_non_split_only" == "false" ]] && unset par_non_split_only
-[[ "$par_primary" == "false" ]] && unset par_primary
-[[ "$par_ignore_dup" == "false" ]] && unset par_ignore_dup
-[[ "$par_paired" == "false" ]] && unset par_paired
-[[ "$par_count_read_pairs" == "false" ]] && unset par_count_read_pairs
-[[ "$par_both_aligned" == "false" ]] && unset par_both_aligned
-[[ "$par_check_pe_dist" == "false" ]] && unset par_check_pe_dist
-[[ "$par_same_strand" == "false" ]] && unset par_same_strand
-[[ "$par_donotsort" == "false" ]] && unset par_donotsort
-[[ "$par_by_read_group" == "false" ]] && unset par_by_read_group
-[[ "$par_long_reads" == "false" ]] && unset par_long_reads
-[[ "$par_verbose" == "false" ]] && unset par_verbose
+unset_if_false=(
+    par_feature_level
+    par_overlapping
+    par_largest_overlap
+    par_multi_mapping
+    par_fraction
+    par_split_only
+    par_non_split_only
+    par_primary
+    par_ignore_dup
+    par_paired
+    par_count_read_pairs
+    par_both_aligned
+    par_check_pe_dist
+    par_same_strand
+    par_donotsort
+    par_by_read_group
+    par_long_reads
+    par_verbose
+)
+
+for par in ${unset_if_false[@]}; do
+    test_val="${!par}"
+    [[ "$test_val" == "false" ]] && unset $par
+done
 
 IFS=";" read -ra input <<< $par_input
 
diff --git a/src/gffread/script.sh b/src/gffread/script.sh
index cd4abf14..fab9e521 100644
--- a/src/gffread/script.sh
+++ b/src/gffread/script.sh
@@ -4,51 +4,58 @@
 ## VIASH END
 
 # unset flags
-[[ "$par_coding" == "false" ]] && unset par_coding
-[[ "$par_strict_range" == "false" ]] && unset par_strict_range
-[[ "$par_no_single_exon" == "false" ]] && unset par_no_single_exon
-[[ "$par_no_exon_attrs" == "false" ]] && unset par_no_exon_attrs
-[[ "$par_nc" == "false" ]] && unset par_nc
-[[ "$par_ignore_locus" == "false" ]] && unset par_ignore_locus
-[[ "$par_description" == "false" ]] && unset par_description
-[[ "$par_sort_alpha" == "false" ]] && unset par_sort_alpha
-[[ "$par_keep_genes" == "false" ]] && unset par_keep_genes
-[[ "$par_keep_attrs" == "false" ]] && unset par_keep_attrs
-[[ "$par_keep_exon_attrs" == "false" ]] && unset par_keep_exon_attrs
-[[ "$par_keep_comments" == "false" ]] && unset par_keep_comments
-[[ "$par_process_other" == "false" ]] && unset par_process_other
-[[ "$par_rm_stop_codons" == "false" ]] && unset par_rm_stop_codons
-[[ "$par_adj_cds_start" == "false" ]] && unset par_adj_cds_start
-[[ "$par_opposite_strand" == "false" ]] && unset par_opposite_strand
-[[ "$par_coding_status" == "false" ]] && unset par_coding_status
-[[ "$par_add_hasCDS" == "false" ]] && unset par_add_hasCDS
-[[ "$par_adj_stop" == "false" ]] && unset par_adj_stop
-[[ "$par_rm_noncanon" == "false" ]] && unset par_rm_noncanon
-[[ "$par_complete_cds" == "false" ]] && unset par_complete_cds
-[[ "$par_no_pseudo" == "false" ]] && unset par_no_pseudo
-[[ "$par_in_bed" == "false" ]] && unset par_in_bed
-[[ "$par_in_tlf" == "false" ]] && unset par_in_tlf
-[[ "$par_stream" == "false" ]] && unset par_stream
-[[ "$par_merge" == "false" ]] && unset par_merge
-[[ "$par_rm_redundant" == "false" ]] && unset par_rm_redundant
-[[ "$par_no_boundary" == "false" ]] && unset par_no_boundary
-[[ "$par_no_overlap" == "false" ]] && unset par_no_overlap
-[[ "$par_force_exons" == "false" ]] && unset par_force_exons
-[[ "$par_gene2exon" == "false" ]] && unset par_gene2exon
-[[ "$par_t_adopt" == "false" ]] && unset par_t_adopt
-[[ "$par_decode" == "false" ]] && unset par_decode
-[[ "$par_merge_exons" == "false" ]] && unset par_merge_exons
-[[ "$par_junctions" == "false" ]] && unset par_junctions
-[[ "$par_w_nocds" == "false" ]] && unset par_w_nocds
-[[ "$par_tr_cds" == "false" ]] && unset par_tr_cds
-[[ "$par_w_coords" == "false" ]] && unset par_w_coords
-[[ "$par_stop_dot" == "false" ]] && unset par_stop_dot
-[[ "$par_id_version" == "false" ]] && unset par_id_version
-[[ "$par_gtf_output" == "false" ]] && unset par_gtf_output
-[[ "$par_bed" == "false" ]] && unset par_bed
-[[ "$par_tlf" == "false" ]] && unset par_tlf
-[[ "$par_expose_dups" == "false" ]] && unset par_expose_dups
-[[ "$par_cluster_only" == "false" ]] && unset par_cluster_only
+unset_if_false=(
+    par_coding
+    par_strict_range
+    par_no_single_exon
+    par_no_exon_attrs
+    par_nc
+    par_ignore_locus
+    par_description
+    par_sort_alpha
+    par_keep_genes
+    par_keep_attrs
+    par_keep_exon_attrs
+    par_keep_comments
+    par_process_other
+    par_rm_stop_codons
+    par_adj_cds_start
+    par_opposite_strand
+    par_coding_status
+    par_add_hasCDS
+    par_adj_stop
+    par_rm_noncanon
+    par_complete_cds
+    par_no_pseudo
+    par_in_bed
+    par_in_tlf
+    par_stream
+    par_merge
+    par_rm_redundant
+    par_no_boundary
+    par_no_overlap
+    par_force_exons
+    par_gene2exon
+    par_t_adopt
+    par_decode
+    par_merge_exons
+    par_junctions
+    par_w_nocds
+    par_tr_cds
+    par_w_coords
+    par_stop_dot
+    par_id_version
+    par_gtf_output
+    par_bed
+    par_tlf
+    par_expose_dups
+    par_cluster_only
+)
+
+for par in ${unset_if_false[@]}; do
+    test_val="${!par}"
+    [[ "$test_val" == "false" ]] && unset $par
+done
 
 # if par_table is not empty, replace ";" with ","
 par_table=$(echo "$par_table" | tr ';' ',')
diff --git a/src/lofreq/call/script.sh b/src/lofreq/call/script.sh
index 863fe986..ca229194 100644
--- a/src/lofreq/call/script.sh
+++ b/src/lofreq/call/script.sh
@@ -4,21 +4,28 @@
 ## VIASH END
 
 # Unset all parameters that are set to "false"
-[[ "$par_no_baq" == "false" ]] && unset par_no_baq
-[[ "$par_no_idaq" == "false" ]] && unset par_no_idaq
-[[ "$par_del_baq" == "false" ]] && unset par_del_baq
-[[ "$par_no_ext_baq" == "false" ]] && unset par_no_ext_baq
-[[ "$par_no_mq" == "false" ]] && unset par_no_mq
-[[ "$par_call_indels" == "false" ]] && unset par_call_indels
-[[ "$par_only_indels" == "false" ]] && unset par_only_indels
-[[ "$par_src_qual" == "false" ]] && unset par_src_qual
-[[ "$par_illumina_13" == "false" ]] && unset par_illumina_13
-[[ "$par_use_orphan" == "false" ]] && unset par_use_orphan
-[[ "$par_plp_summary_only" == "false" ]] && unset par_plp_summary_only
-[[ "$par_no_default_filter" == "false" ]] && unset par_no_default_filter
-[[ "$par_force_overwrite" == "false" ]] && unset par_force_overwrite
-[[ "$par_verbose" == "false" ]] && unset par_verbose
-[[ "$par_debug" == "false" ]] && unset par_debug
+unset_if_false=(
+    par_no_baq
+    par_no_idaq
+    par_del_baq
+    par_no_ext_baq
+    par_no_mq
+    par_call_indels
+    par_only_indels
+    par_src_qual
+    par_illumina_13
+    par_use_orphan
+    par_plp_summary_only
+    par_no_default_filter
+    par_force_overwrite
+    par_verbose
+    par_debug
+)
+
+for par in ${unset_if_false[@]}; do
+    test_val="${!par}"
+    [[ "$test_val" == "false" ]] && unset $par
+done
 
 # Run lofreq call
 lofreq call \
diff --git a/src/multiqc/script.sh b/src/multiqc/script.sh
index ad8c1c0c..5806fa1d 100755
--- a/src/multiqc/script.sh
+++ b/src/multiqc/script.sh
@@ -1,26 +1,32 @@
 #!/bin/bash
 
 # disable flags
-[[ "$par_ignore_symlinks" == "false" ]] && unset par_ignore_symlinks
-[[ "$par_dirs" == "false" ]] && unset par_dirs
-[[ "$par_full_names" == "false" ]] && unset par_full_names
-[[ "$par_fn_as_s_name" == "false" ]] && unset par_fn_as_s_name
-[[ "$par_profile_runtime" == "false" ]] && unset par_profile_runtime
-[[ "$par_verbose" == "false" ]] && unset par_verbose
-[[ "$par_quiet" == "false" ]] && unset par_quiet
-[[ "$par_strict" == "false" ]] && unset par_strict
-[[ "$par_development" == "false" ]] && unset par_development
-[[ "$par_require_logs" == "false" ]] && unset par_require_logs
-[[ "$par_no_megaqc_upload" == "false" ]] && unset par_no_megaqc_upload
-[[ "$par_no_ansi" == "false" ]] && unset par_no_ansi
-[[ "$par_flat" == "false" ]] && unset par_flat
-[[ "$par_interactive" == "false" ]] && unset par_interactive
-[[ "$par_static_plot_export" == "false" ]] && unset par_static_plot_export
-[[ "$par_data_dir" == "false" ]] && unset par_data_dir
-[[ "$par_no_data_dir" == "false" ]] && unset par_no_data_dir
-[[ "$par_zip_data_dir" == "false" ]] && unset par_zip_data_dir
-[[ "$par_pdf" == "false" ]] && unset par_pdf
+unset_if_false=(
+    par_ignore_symlinks
+    par_dirs
+    par_full_names
+    par_fn_as_s_name
+    par_profile_runtime
+    par_verbose
+    par_quiet
+    par_strict
+    par_development
+    par_require_logs
+    par_no_megaqc_upload
+    par_no_ansi
+    par_flat
+    par_interactive
+    par_static_plot_export
+    par_data_dir
+    par_no_data_dir
+    par_zip_data_dir
+    par_pdf
+)
 
+for par in ${unset_if_false[@]}; do
+    test_val="${!par}"
+    [[ "$test_val" == "false" ]] && unset $par
+done
 
 # handle inputs
 out_dir=$(dirname "$par_output_report")
diff --git a/src/salmon/salmon_index/script.sh b/src/salmon/salmon_index/script.sh
index c2b9e7a0..5b1c4d76 100644
--- a/src/salmon/salmon_index/script.sh
+++ b/src/salmon/salmon_index/script.sh
@@ -5,12 +5,19 @@ set -e
 ## VIASH START
 ## VIASH END
 
-[[ "$par_gencode" == "false" ]] && unset par_gencode
-[[ "$par_features" == "false" ]] && unset par_features
-[[ "$par_keep_duplicates" == "false" ]] && unset par_keep_duplicates
-[[ "$par_keep_fixed_fasta" == "false" ]] && unset par_keep_fixed_fasta
-[[ "$par_sparse" == "false" ]] && unset par_sparse
-[[ "$par_no_clip" == "false" ]] && unset par_no_clip
+unset_if_false=(
+    par_gencode
+    par_features
+    par_keep_duplicates
+    par_keep_fixed_fasta
+    par_sparse
+    par_no_clip
+)
+
+for par in ${unset_if_false[@]}; do
+    test_val="${!par}"
+    [[ "$test_val" == "false" ]] && unset $par
+done
 
 tmp_dir=$(mktemp -d -p "$meta_temp_dir" "${meta_functionality_name}_XXXXXX")
 mkdir -p "$tmp_dir/temp"
diff --git a/src/salmon/salmon_quant/script.sh b/src/salmon/salmon_quant/script.sh
index 4c9f69d5..47cba1b9 100644
--- a/src/salmon/salmon_quant/script.sh
+++ b/src/salmon/salmon_quant/script.sh
@@ -4,49 +4,55 @@ set -e
 
 ## VIASH START
 ## VIASH END
+unset_if_false=(
+    par_discard_orphans
+    par_ont
+    par_seq_bias
+    par_gc_bias
+    par_pos_bias
+    par_meta
+    par_discard_orphans_quasi
+    par_disable_chaining_heuristic
+    par_allow_dovetail
+    par_recover_orphans
+    par_mimicBT2
+    par_mimic_strictBT2
+    par_softclip
+    par_softclip_overhangs
+    par_full_length_alignment
+    par_hard_filter
+    par_write_mappings
+    par_write_qualities
+    par_alternative_init_mode
+    par_skip_quant
+    par_dump_eq
+    par_dump_eq_weights
+    par_reduce_GC_memory
+    par_init_uniform
+    par_no_length_correction
+    par_no_effective_length_correction
+    par_no_single_frag_prob
+    par_no_frag_length_dist
+    par_no_bias_length_threshold
+    par_useEM
+    par_useVBOpt
+    par_no_Gamma_draw
+    par_bootstrap_reproject
+    par_quiet
+    par_per_transcript_prior
+    par_per_nucleotide_prior
+    par_write_orphan_links
+    par_write_unmapped_names
+    par_no_error_model
+    par_sample_out
+    par_sample_unaligned
+    par_gencode
+)
 
-[[ "$par_discard_orphans" == "false" ]] && unset par_discard_orphans
-[[ "$par_ont" == "false" ]] && unset par_ont
-[[ "$par_seq_bias" == "false" ]] && unset par_seq_bias
-[[ "$par_gc_bias" == "false" ]] && unset par_gc_bias
-[[ "$par_pos_bias" == "false" ]] && unset par_pos_bias
-[[ "$par_meta" == "false" ]] && unset par_meta
-[[ "$par_discard_orphans_quasi" == "false" ]] && unset par_discard_orphans_quasi
-[[ "$par_disable_chaining_heuristic" == "false" ]] && unset par_disable_chaining_heuristic
-[[ "$par_allow_dovetail" == "false" ]] && unset par_allow_dovetail
-[[ "$par_recover_orphans" == "false" ]] && unset par_recover_orphans
-[[ "$par_mimicBT2" == "false" ]] && unset par_mimicBT2
-[[ "$par_mimic_strictBT2" == "false" ]] && unset par_mimic_strictBT2
-[[ "$par_softclip" == "false" ]] && unset par_softclip
-[[ "$par_softclip_overhangs" == "false" ]] && unset par_softclip_overhangs
-[[ "$par_full_length_alignment" == "false" ]] && unset par_full_length_alignment
-[[ "$par_hard_filter" == "false" ]] && unset par_hard_filter
-[[ "$par_write_mappings" == "false" ]] && unset par_write_mappings
-[[ "$par_write_qualities" == "false" ]] && unset par_write_qualities
-[[ "$par_alternative_init_mode" == "false" ]] && unset par_alternative_init_mode
-[[ "$par_skip_quant" == "false" ]] && unset par_skip_quant
-[[ "$par_dump_eq" == "false" ]] && unset par_dump_eq
-[[ "$par_dump_eq_weights" == "false" ]] && unset par_dump_eq_weights
-[[ "$par_reduce_GC_memory" == "false" ]] && unset par_reduce_GC_memory
-[[ "$par_init_uniform" == "false" ]] && unset par_init_uniform
-[[ "$par_no_length_correction" == "false" ]] && unset par_no_length_correction
-[[ "$par_no_effective_length_correction" == "false" ]] && unset par_no_effective_length_correction
-[[ "$par_no_single_frag_prob" == "false" ]] && unset par_no_single_frag_prob
-[[ "$par_no_frag_length_dist" == "false" ]] && unset par_no_frag_length_dist
-[[ "$par_no_bias_length_threshold" == "false" ]] && unset par_no_bias_length_threshold
-[[ "$par_useEM" == "false" ]] && unset par_useEM
-[[ "$par_useVBOpt" == "false" ]] && unset par_useVBOpt
-[[ "$par_no_Gamma_draw" == "false" ]] && unset par_no_Gamma_draw
-[[ "$par_bootstrap_reproject" == "false" ]] && unset par_bootstrap_reproject
-[[ "$par_quiet" == "false" ]] && unset par_quiet
-[[ "$par_per_transcript_prior" == "false" ]] && unset par_per_transcript_prior
-[[ "$par_per_nucleotide_prior" == "false" ]] && unset par_per_nucleotide_prior
-[[ "$par_write_orphan_links" == "false" ]] && unset par_write_orphan_links
-[[ "$par_write_unmapped_names" == "false" ]] && unset par_write_unmapped_names
-[[ "$par_no_error_model" == "false" ]] && unset par_no_error_model
-[[ "$par_sample_out" == "false" ]] && unset par_sample_out
-[[ "$par_sample_unaligned" == "false" ]] && unset par_sample_unaligned
-[[ "$par_gencode" == "false" ]] && unset par_gencode
+for par in ${unset_if_false[@]}; do
+    test_val="${!par}"
+    [[ "$test_val" == "false" ]] && unset $par
+done
 
 IFS=";" read -ra unmated_reads <<< $par_unmated_reads
 IFS=";" read -ra mates1 <<< $par_mates1
diff --git a/src/samtools/samtools_fastq/script.sh b/src/samtools/samtools_fastq/script.sh
index 0cad9cfe..e05da9b0 100644
--- a/src/samtools/samtools_fastq/script.sh
+++ b/src/samtools/samtools_fastq/script.sh
@@ -5,11 +5,18 @@
 
 set -e
 
-[[ "$par_no_suffix" == "false" ]] && unset par_no_suffix
-[[ "$par_suffix" == "false" ]] && unset par_suffix
-[[ "$par_use_oq" == "false" ]] && unset par_use_oq
-[[ "$par_copy_tags" == "false" ]] && unset par_copy_tags
-[[ "$par_casava" == "false" ]] && unset par_casava
+unset_if_false=(
+  par_no_suffix
+  par_suffix
+  par_use_oq
+  par_copy_tags
+  par_casava
+)
+
+for par in ${unset_if_false[@]}; do
+    test_val="${!par}"
+    [[ "$test_val" == "false" ]] && unset $par
+done
 
 if [[ "$meta_name" == "samtools_fasta" ]]; then
   subcommand=fasta
diff --git a/src/samtools/samtools_sort/script.sh b/src/samtools/samtools_sort/script.sh
index 94836c18..a8b3ce0f 100644
--- a/src/samtools/samtools_sort/script.sh
+++ b/src/samtools/samtools_sort/script.sh
@@ -5,15 +5,22 @@
 
 set -e
 
-[[ "$par_uncompressed" == "false" ]] && unset par_uncompressed
-[[ "$par_minimiser" == "false" ]] && unset par_minimiser
-[[ "$par_not_reverse" == "false" ]] && unset par_not_reverse
-[[ "$par_homopolymers" == "false" ]] && unset par_homopolymers
-[[ "$par_natural_sort" == "false" ]] && unset par_natural_sort
-[[ "$par_ascii_sort" == "false" ]] && unset par_ascii_sort
-[[ "$par_template_coordinate" == "false" ]] && unset par_template_coordinate
-[[ "$par_write_index" == "false" ]] && unset par_write_index
-[[ "$par_no_PG" == "false" ]] && unset par_no_PG
+unset_if_false=(
+   par_uncompressed
+   par_minimiser
+   par_not_reverse
+   par_homopolymers
+   par_natural_sort
+   par_ascii_sort
+   par_template_coordinate
+   par_write_index
+   par_no_PG
+)
+
+for par in ${unset_if_false[@]}; do
+    test_val="${!par}"
+    [[ "$test_val" == "false" ]] && unset $par
+done
 
 
 samtools sort \
diff --git a/src/samtools/samtools_view/script.sh b/src/samtools/samtools_view/script.sh
index c3911b48..7608844b 100644
--- a/src/samtools/samtools_view/script.sh
+++ b/src/samtools/samtools_view/script.sh
@@ -5,21 +5,29 @@
 
 set -e
 
-[[ "$par_bam" == "false" ]] && unset par_bam
-[[ "$par_cram" == "false" ]] && unset par_cram
-[[ "$par_fast" == "false" ]] && unset par_fast
-[[ "$par_uncompressed" == "false" ]] && unset par_uncompressed
-[[ "$par_with_header" == "false" ]] && unset par_with_header
-[[ "$par_header_only" == "false" ]] && unset par_header_only
-[[ "$par_no_header" == "false" ]] && unset par_no_header
-[[ "$par_count" == "false" ]] && unset par_count
-[[ "$par_unmap" == "false" ]] && unset par_unmap
-[[ "$par_use_index" == "false" ]] && unset par_use_index
-[[ "$par_fetch_pairs" == "false" ]] && unset par_fetch_pairs
-[[ "$par_customized_index" == "false" ]] && unset par_customized_index
-[[ "$par_no_PG" == "false" ]] && unset par_no_PG
-[[ "$par_write_index" == "false" ]] && unset par_write_index
-[[ "$par_remove_B" == "false" ]] && unset par_remove_B
+unset_if_false=(
+   par_bam
+   par_cram
+   par_fast
+   par_uncompressed
+   par_with_header
+   par_header_only
+   par_no_header
+   par_count
+   par_unmap
+   par_use_index
+   par_fetch_pairs
+   par_customized_index
+   par_no_PG
+   par_write_index
+   par_remove_B
+)
+
+for par in ${unset_if_false[@]}; do
+    test_val="${!par}"
+    [[ "$test_val" == "false" ]] && unset $par
+done
+
 
 samtools view \
     ${par_bam:+-b} \
diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh
index d57a5e76..3f3bdc89 100644
--- a/src/umi_tools/umi_tools_dedup/script.sh
+++ b/src/umi_tools/umi_tools_dedup/script.sh
@@ -7,19 +7,26 @@ set -e
 
 test_dir="${metal_executable}/test_data"
 
-[[ "$par_paired" == "false" ]] && unset par_paired
-[[ "$par_in_sam" == "false" ]] && unset par_in_sam
-[[ "$par_out_sam" == "false" ]] && unset par_out_sam
-[[ "$par_spliced_is_unique" == "false" ]] && unset par_spliced_is_unique
-[[ "$par_per_gene" == "false" ]] && unset par_per_gene
-[[ "$par_per_contig" == "false" ]] && unset par_per_contig
-[[ "$par_per_cell" == "false" ]] && unset par_per_cell
-[[ "$par_no_sort_output" == "false" ]] && unset par_no_sort_output
-[[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig
-[[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi
-[[ "$par_subset" == "false" ]] && unset par_subset
-[[ "$par_log2stderr" == "false" ]] && unset par_log2stderr
-[[ "$par_read_length" == "false" ]] && unset par_read_length
+unset_if_false=(
+   par_paired
+   par_in_sam
+   par_out_sam
+   par_spliced_is_unique
+   par_per_gene
+   par_per_contig
+   par_per_cell
+   par_no_sort_output
+   par_buffer_whole_contig
+   par_ignore_umi
+   par_subset
+   par_log2stderr
+   par_read_length
+)
+
+for par in ${unset_if_false[@]}; do
+    test_val="${!par}"
+    [[ "$test_val" == "false" ]] && unset $par
+done
 
 umi_tools dedup \
     --stdin "$par_input" \
diff --git a/src/umi_tools/umi_tools_extract/script.sh b/src/umi_tools/umi_tools_extract/script.sh
index 5e41865d..4514860e 100644
--- a/src/umi_tools/umi_tools_extract/script.sh
+++ b/src/umi_tools/umi_tools_extract/script.sh
@@ -5,14 +5,19 @@
 
 set -exo pipefail
 
-test_dir="${metal_executable}/test_data"
+unset_if_false=(
+   par_error_correct_cell
+   par_reconcile_pairs
+   par_three_prime
+   par_ignore_read_pair_suffixes
+   par_timeit_header
+   par_log2stderr
+)
 
-[[ "$par_error_correct_cell" == "false" ]] && unset par_error_correct_cell
-[[ "$par_reconcile_pairs" == "false" ]] && unset par_reconcile_pairs
-[[ "$par_three_prime" == "false" ]] && unset par_three_prime
-[[ "$par_ignore_read_pair_suffixes" == "false" ]] && unset par_ignore_read_pair_suffixes
-[[ "$par_timeit_header" == "false" ]] && unset par_timeit_header
-[[ "$par_log2stderr" == "false" ]] && unset par_log2stderr
+for par in ${unset_if_false[@]}; do
+    test_val="${!par}"
+    [[ "$test_val" == "false" ]] && unset $par
+done
 
 
 # Check if we have the correct number of input files and patterns for paired-end or single-end reads

From 9fc07f6c05879f8efff441767ec489bb24fdce7d Mon Sep 17 00:00:00 2001
From: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com>
Date: Tue, 13 Aug 2024 17:19:06 +0200
Subject: [PATCH 9/9] Bump Viash to 0.9.0-RC7 (#134)

* Bump viash to 0.9.0-RC7

* Update CHANGELOG
---
 CHANGELOG.md | 2 ++
 _viash.yaml  | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5030894c..d51fcf12 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -38,6 +38,8 @@
 
 * Update several components in order to avoid duplicate code when using `unset` on boolean arguments (PR #133).
 
+* Bump viash to `0.9.0-RC7` (PR #134)
+
 ## DOCUMENTATION
 
 * Extend the contributing guidelines (PR #82):
diff --git a/_viash.yaml b/_viash.yaml
index 9a240c24..ab4f3828 100644
--- a/_viash.yaml
+++ b/_viash.yaml
@@ -7,7 +7,7 @@ links:
   issue_tracker: https://github.com/viash-hub/biobox/issues
   repository: https://github.com/viash-hub/biobox
 
-viash_version: 0.9.0-RC6
+viash_version: 0.9.0-RC7
 
 config_mods: |
   .requirements.commands := ['ps']