viash-hub · rcannood · Jul 24, 2024 · Apr 11, 2024 · Apr 11, 2024 · Apr 11, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -58,6 +58,9 @@
     - `samtools/samtools_view`: Views and converts SAM/BAM/CRAM files (PR #48).
     - `samtools/samtools_fastq`: Converts a SAM/BAM/CRAM file to FASTQ (PR #52).
 
+* `umi_tools`:
+    -`umi_tools/umi_tools_extract`: Flexible removal of UMI sequences from fastq reads (PR #71).
+
 * `falco`: A C++ drop-in replacement of FastQC to assess the quality of sequence read data (PR #43).
 
 * `bedtools`:

diff --git a/src/umi_tools/umi_tools_extract/config.vsh.yaml b/src/umi_tools/umi_tools_extract/config.vsh.yaml
@@ -0,0 +1,197 @@
+name: umi_tools_extract
+namespace: umi_tools
+description: |
+  Flexible removal of UMI sequences from fastq reads.
+  UMIs are removed and appended to the read name. Any other barcode, for example a library barcode,
+  is left on the read. Can also filter reads by quality or against a whitelist.
+keywords: [ extract, umi-tools, umi, fastq ]
+links:
+  homepage: https://umi-tools.readthedocs.io/en/latest/
+  documentation: https://umi-tools.readthedocs.io/en/latest/reference/extract.html
+  repository: https://github.com/CGATOxford/UMI-tools
+references: 
+  doi: 10.1101/gr.209601.116
+license: MIT
+
+argument_groups:
+
+  - name: Input
+    arguments: 
+    - name: --paired
+      type: boolean_true
+      description: Paired fastq files. If option is set, two input files are expected.
+    - name: --input
+      type: file
+      required: true
+      description: Input fastq files, either one or two (paired).
+      example: sample.fastq
+    - name: --read2_in
+      type: file
+      description: Filename for read pairs.
+    - name: --bc_pattern
+      type: string
+      description: |
+        The UMI barcode pattern to use e.g. 'NNNNNN' indicates that the first 6 nucleotides 
+        of the read are from the UMI.
+    - name: --bc_pattern2
+      type: string
+      description: The UMI barcode pattern to use for read 2.
+
+  - name: "Output"
+    arguments:  
+    - name: --read1_out
+      type: file
+      required: true
+      description: Output file for read 1.
+      direction: output
+    - name: --read2_out
+      type: file
+      description: Output file for read 2.
+      direction: output
+    - name: --filtered_out
+      type: file
+      description: |
+        Write out reads not matching regex pattern or cell barcode whitelist to this file.
+    - name: --filtered_out2
+      type: file
+      description: |
+        Write out read pairs not matching regex pattern or cell barcode whitelist to this file.
+
+  - name: Extract Options
+    arguments:
+    - name: --extract_method
+      type: string
+      choices: [string, regex]
+      description: |
+        UMI pattern to use. Default: `string`.
+      example: "string"
+    - name: --error_correct_cell
+      type: boolean_true
+      description: Error correct cell barcodes to the whitelist.
+    - name: --whitelist
+      type: file
+      description: |
+        Whitelist of accepted cell barcodes tab-separated format, where column 1 is the whitelisted
+        cell barcodes and column 2 is the list (comma-separated) of other cell barcodes which should 
+        be corrected to the barcode in column 1. If the --error_correct_cell option is not used, this
+        column will be ignored.
+    - name: --blacklist
+      type: file
+      description: BlackWhitelist of cell barcodes to discard.
+    - name: --subset_reads
+      type: integer
+      description: Only parse the first N reads.
+    - name: --quality_filter_threshold
+      type: integer
+      description: Remove reads where any UMI base quality score falls below this threshold.
+    - name: --quality_filter_mask
+      type: string
+      description: |
+        If a UMI base has a quality below this threshold, replace the base with 'N'.
+    - name: --quality_encoding
+      type: string
+      choices: [phred33, phred64, solexa]
+      description: |
+        Quality score encoding. Choose from:
+          * phred33 [33-77]
+          * phred64 [64-106]
+          * solexa [59-106]
+    - name: --reconcile_pairs
+      type: boolean_true
+      description: |
+        Allow read 2 infile to contain reads not in read 1 infile. This enables support for upstream protocols
+        where read one contains cell barcodes, and the read pairs have been filtered and corrected without regard
+        to the read2.
+    - name: --three_prime
+      alternatives: --3prime
+      type: boolean_true
+      description: |
+        By default the barcode is assumed to be on the 5' end of the read, but use this option to sepecify that it is
+        on the 3' end instead. This option only works with --extract_method=string since 3' encoding can be specified
+        explicitly with a regex, e.g `.*(?P<umi_1>.{5})$`.
+    - name: --ignore_read_pair_suffixes
+      type: boolean_true
+      description: |
+        Ignore "/1" and "/2" read name suffixes. Note that this options is required if the suffixes are not whitespace
+        separated from the rest of the read name.
+        arguments:
+    - name: --umi_separator
+      type: string
+      description: |
+        The character that separates the UMI in the read name. Most likely a colon if you skipped the extraction with
+        UMI-tools and used other software. Default: `_`
+      example: "_"
+    - name: --grouping_method
+      type: string
+      choices: [unique, percentile, cluster, adjacency, directional]
+      description: |
+        Method to use to determine read groups by subsuming those with similar UMIs. All methods start by identifying
+        the reads with the same mapping position, but treat similar yet nonidentical UMIs differently. Default: `directional`
+      example: "directional"
+    - name: --umi_discard_read
+      type: integer
+      choices: [0, 1, 2]
+      description: |
+        After UMI barcode extraction discard either R1 or R2 by setting this parameter to 1 or 2, respectively. Default: `0`
+      example: 0
+
+  - name: Common Options
+    arguments:
+    - name: --log
+      type: file
+      description: File with logging information.
+      direction: output
+    - name: --log2stderr
+      type: boolean_true
+      description: Send logging information to stderr.
+      direction: output
+    - name: --verbose
+      type: integer
+      description: Log level. The higher, the more output.
+    - name: --error
+      type: file
+      description: File with error information.
+      direction: output
+    - name: --temp_dir
+      type: string
+      description: |
+        Directory for temporary files. If not set, the bash environmental variable TMPDIR is used.
+    - name: --compresslevel
+      type: integer
+      description: |
+        Level of Gzip compression to use. Default=6 matches GNU gzip rather than python gzip default (which is 9).
+        Default `6`.
+      example: 6
+    - name: --timeit
+      type: file
+      description: Store timing information in file.
+      direction: output
+    - name: --timeit_name
+      type: string
+      description: Name in timing file for this class of jobs.
+      default: all
+    - name: --timeit_header
+      type: boolean_true
+      description: Add header for timing information.
+    - name: --random_seed
+      type: integer
+      description: Random seed to initialize number generator with.
+
+resources:
+  - type: bash_script
+    path: script.sh
+test_resources:
+  - type: bash_script
+    path: test.sh
+  - type: file
+    path: test_data
+engines:
+  - type: docker
+    image: quay.io/biocontainers/umi_tools:1.1.4--py310h4b81fae_2
+    setup:
+      - type: docker
+        run: |
+            umi_tools -v | sed 's/ version//g' > /var/software_versions.txt
+runners:
+- type: executable
+- type: nextflow
diff --git a/src/umi_tools/umi_tools_extract/help.txt b/src/umi_tools/umi_tools_extract/help.txt
@@ -0,0 +1,106 @@
+'''
+Generated from the following UMI-tools documentation:
+      https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options
+      https://umi-tools.readthedocs.io/en/latest/reference/extract.html
+'''
+
+extract - Extract UMI from fastq
+
+Usage:
+
+   Single-end:
+      umi_tools extract [OPTIONS] -p PATTERN [-I IN_FASTQ[.gz]] [-S OUT_FASTQ[.gz]]
+
+   Paired end:
+      umi_tools extract [OPTIONS] -p PATTERN [-I IN_FASTQ[.gz]] [-S OUT_FASTQ[.gz]] --read2-in=IN2_FASTQ[.gz] --read2-out=OUT2_FASTQ[.gz]
+
+   note: If -I/-S are ommited standard in and standard out are used
+         for input and output.  To generate a valid BAM file on
+         standard out, please redirect log with --log=LOGFILE or
+         --log2stderr. Input/Output will be (de)compressed if a
+         filename provided to -S/-I/--read2-in/read2-out ends in .gz
+
+Common UMI-tools Options:
+
+      -S, --stdout                  File where output is to go [default = stdout].
+      -L, --log                     File with logging information [default = stdout].
+      --log2stderr                  Send logging information to stderr [default = False].
+      -v, --verbose                 Log level. The higher, the more output [default = 1].
+      -E, --error                   File with error information [default = stderr].
+      --temp-dir                    Directory for temporary files. If not set, the bash environmental variable TMPDIR is used[default = None].
+      --compresslevel               Level of Gzip compression to use. Default=6 matches GNU gzip rather than python gzip default (which is 9)
+
+      profiling and debugging options:
+      --timeit                      Store timing information in file [default=none].
+      --timeit-name                 Name in timing file for this class of jobs [default=all].
+      --timeit-header               Add header for timing information [default=none].
+      --random-seed                 Random seed to initialize number generator with [default=none].
+
+Extract Options:
+      -I, --stdin                   File containing the input data [default = stdin].
+      --error-correct-cell          Error correct cell barcodes to the whitelist (see --whitelist)
+      --whitelist                   Whitelist of accepted cell barcodes. The whitelist should be in the following format (tab-separated):
+                                          AAAAAA    AGAAAA
+                                          AAAATC
+                                          AAACAT
+                                          AAACTA    AAACTN,GAACTA
+                                          AAATAC
+                                          AAATCA    GAATCA
+                                          AAATGT    AAAGGT,CAATGT
+                                    Where column 1 is the whitelisted cell barcodes and column 2 is the list (comma-separated) of other cell 
+                                    barcodes which should be corrected to the barcode in column 1. If the --error-correct-cell option is not 
+                                    used, this column will be ignored. Any additional columns in the whitelist input, such as the counts columns 
+                                    from the output of umi_tools whitelist, will be ignored.
+      --blacklist                   BlackWhitelist of cell barcodes to discard
+      --subset-reads=[N]            Only parse the first N reads
+      --quality-filter-threshold    Remove reads where any UMI base quality score falls below this threshold
+      --quality-filter-mask         If a UMI base has a quality below this threshold, replace the base with 'N'
+      --quality-encoding            Quality score encoding. Choose from:
+                                          'phred33' [33-77]
+                                          'phred64' [64-106]
+                                          'solexa' [59-106]
+      --reconcile-pairs             Allow read 2 infile to contain reads not in read 1 infile. This enables support for upstream protocols
+                                    where read one contains cell barcodes, and the read pairs have been filtered and corrected without regard
+                                    to the read2s.
+
+Experimental options:
+      Note: These options have not been extensively testing to ensure behaviour is as expected. If you have some suitable input files which
+            we can use for testing, please contact us.
+            If you have a library preparation method where the UMI may be in either read, you can use the following options to search for the
+            UMI in either read:
+
+                  --either-read --extract-method --bc-pattern=[PATTERN1] --bc-pattern2=[PATTERN2]
+
+            Where both patterns match, the default behaviour is to discard both reads. If you want to select the read with the UMI with highest
+            sequence quality, provide --either-read-resolve=quality.
+
+
+      --bc-pattern                  Pattern for barcode(s) on read 1. See --extract-method
+      --bc-pattern2                 Pattern for barcode(s) on read 2. See --extract-method
+      --extract-method              There are two methods enabled to extract the umi barcode (+/- cell barcode). For both methods, the patterns
+                                    should be provided using the --bc-pattern and --bc-pattern2 options.x   
+                                    string: 
+                                    This should be used where the barcodes are always in the same place in the read.
+                                          N = UMI position (required)
+                                          C = cell barcode position (optional)
+                                          X = sample position (optional)
+                                    Bases with Ns and Cs will be extracted and added to the read name. The corresponding sequence qualities will
+                                    be removed from the read. Bases with an X will be reattached to the read.
+                                    regex:
+                                    This method allows for more flexible barcode extraction and should be used where the cell barcodes are variable
+                                    in length. Alternatively, the regex option can also be used to filter out reads which do not contain an expected
+                                    adapter sequence. The regex must contain groups to define how the barcodes are encoded in the read. 
+                                    The expected groups in the regex are:
+                                          umi_n = UMI positions, where n can be any value (required) 
+                                          cell_n = cell barcode positions, where n can be any value (optional) 
+                                          discard_n = positions to discard, where n can be any value (optional)
+      --3prime                      By default the barcode is assumed to be on the 5' end of the read, but use this option to sepecify that it is
+                                    on the 3' end instead. This option only works with --extract-method=string since 3' encoding can be specified
+                                    explicitly with a regex, e.g .*(?P<umi_1>.{5})$
+      --read2-in                    Filename for read pairs
+      --filtered-out                Write out reads not matching regex pattern or cell barcode whitelist to this file
+      --filtered-out2               Write out read pairs not matching regex pattern or cell barcode whitelist to this file
+      --ignore-read-pair-suffixes   Ignore SOH and STX read name suffixes. Note that this options is required if the suffixes are not whitespace
+                                    separated from the rest of the read name
+
+For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/
diff --git a/src/umi_tools/umi_tools_extract/script.sh b/src/umi_tools/umi_tools_extract/script.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+
+set -exo pipefail
+
+test_dir="${metal_executable}/test_data"
+
+[[ "$par_paired" == "false" ]] && unset par_paired
+[[ "$par_error_correct_cell" == "false" ]] && unset par_error_correct_cell
+[[ "$par_reconcile_pairs" == "false" ]] && unset par_reconcile_pairs
+[[ "$par_three_prime" == "false" ]] && unset par_three_prime
+[[ "$par_ignore_read_pair_suffixes" == "false" ]] && unset par_ignore_read_pair_suffixes
+[[ "$par_timeit_header" == "false" ]] && unset par_timeit_header
+[[ "$par_log2stderr" == "false" ]] && unset par_log2stderr
+
+
+# Check if we have the correct number of input files and patterns for paired-end or single-end reads
+
+# For paired-end rends, check that we have two read files, two patterns
+if [ -n "$par_paired" ]; then
+    if [ -z "$par_input" ] || [ -z "$par_read2_in" ] ||
+       [ -z "$par_bc_pattern" ] || [ -z "$par_bc_pattern2" ]; 
+        then
+        echo "Paired end input requires two read files, two UMI patterns, and two output files"
+        exit 1
+    fi
+else # For single-end reads, check that we have only one read file, one pattern
+    if [ -n "$par_read2_in" ] || [ -n "$par_bc_pattern2" ]; then
+        echo "Single end input requires only one read file and one UMI pattern"
+        exit 1
+    # if par_umi_discard_read is not empty or not 0:
+    elif [ -n "$par_umi_discard_read" ] && [ "$par_umi_discard_read" != 0 ]; then
+        echo "umi_discard_read is only valid when processing paired end reads."
+        exit 1
+    fi
+fi
+
+
+umi_tools extract \
+    -I "$par_input" \
+    ${par_read2_in:+ --read2-in "$par_read2_in"} \
+    -S "$par_read1_out" \
+    ${par_read2_out:+--read2-out "$par_read2_out"} \
+    ${par_extract_method:+--extract-method "$par_extract_method"} \
+    --bc-pattern "$par_bc_pattern" \
+    ${par_bc_pattern2:+ --bc-pattern2 "$par_bc_pattern2"} \
+    ${par_umi_separator:+--umi-separator "$par_umi_separator"} \
+    ${par_output_stats:+--output-stats "$par_output_stats"} \
+    ${par_error_correct_cell:+--error-correct-cell} \
+    ${par_whitelist:+--whitelist "$par_whitelist"} \
+    ${par_blacklist:+--blacklist "$par_blacklist"} \
+    ${par_subset_reads:+--subset-reads "$par_subset_reads"} \
+    ${par_quality_filter_threshold:+--quality-filter-threshold "$par_quality_filter_threshold"} \
+    ${par_quality_filter_mask:+--quality-filter-mask "$par_quality_filter_mask"} \
+    ${par_quality_encoding:+--quality-encoding "$par_quality_encoding"} \
+    ${par_reconcile_pairs:+--reconcile-pairs} \
+    ${par_three_prime:+--3prime} \
+    ${par_filtered_out:+--filtered-out "$par_filtered_out"} \
+    ${par_filtered_out2:+--filtered-out2 "$par_filtered_out2"} \
+    ${par_ignore_read_pair_suffixes:+--ignore-read-pair-suffixes} \
+    ${par_random_seed:+--random-seed "$par_random_seed"} \
+    ${par_temp_dir:+--temp-dir "$par_temp_dir"} \
+    ${par_compresslevel:+--compresslevel "$par_compresslevel"} \
+    ${par_timeit:+--timeit "$par_timeit"} \
+    ${par_timeit_name:+--timeit-name "$par_timeit_name"} \
+    ${par_timeit_header:+--timeit-header} \
+    ${par_log:+--log "$par_log"} \
+    ${par_log2stderr:+--log2stderr} \
+    ${par_verbose:+--verbose "$par_verbose"} \
+    ${par_error:+--error "$par_error"}
+
+
+if [ "$par_umi_discard_read" == 1 ]; then
+    # discard read 1
+    rm "$par_read1_out"
+elif [ "$par_umi_discard_read" == 2 ]; then
+    # discard read 2 (-f to bypass file existence check)
+    rm -f "$par_read2_out"
+fi