From bea0fe725709ec72b5233a89856fcceaf9f24251 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Fri, 24 May 2024 22:07:46 +0200 Subject: [PATCH] config and help files --- .../umi_tools_extract/config.vsh.yaml | 164 ++++++++++++++++++ src/umi_tools/umi_tools_extract/help.txt | 106 +++++++++++ 2 files changed, 270 insertions(+) create mode 100644 src/umi_tools/umi_tools_extract/config.vsh.yaml create mode 100644 src/umi_tools/umi_tools_extract/help.txt diff --git a/src/umi_tools/umi_tools_extract/config.vsh.yaml b/src/umi_tools/umi_tools_extract/config.vsh.yaml new file mode 100644 index 00000000..d859ef94 --- /dev/null +++ b/src/umi_tools/umi_tools_extract/config.vsh.yaml @@ -0,0 +1,164 @@ + +name: umi_tools_extract +namespace: umi_tools +description: | + Flexible removal of UMI sequences from fastq reads. + UMIs are removed and appended to the read name. Any other barcode, for example a library barcode, + is left on the read. Can also filter reads by quality or against a whitelist. +keywords: [ extract, umi, fastq ] +links: + homepage: https://umi-tools.readthedocs.io/en/stable/# + documentation: https://umi-tools.readthedocs.io/en/stable/reference/extract.html + repository: https://github.com/CGATOxford/UMI-tools +references: + doi: 10.1101/gr.209601.116 +license: MIT + +argument_groups: + - name: Common Options + arguments: + - name: --stdout + type: file + description: | + File where output is to go. + direction: output + - name: --log + type: file + description: | + File with logging information. + - name: --log2stderr + type: boolean_true + description: | + Send logging information to stderr. + - name: --verbose + type: integer + description: | + Log level. The higher, the more output. + - name: --error + type: file + description: | + File with error information. + - name: --temp-dir + type: directory + description: | + Directory for temporary files. If not set, the bash environmental variable TMPDIR is used. 
+ - name: --compresslevel + type: integer + description: | + Level of Gzip compression to use. Default=6 matches GNU gzip rather than python gzip default (which is 9) + default: 6 + - name: --timeit + type: file + description: | + Store timing information in file. + - name: --timeit_name + type: string + description: | + Name in timing file for this class of jobs. + default: all + - name: --timeit_header + type: boolean_true + description: | + Add header for timing information. + - name: --random_seed + type: string + description: | + Random seed to initialize number generator with. + + - name: Extract Options + arguments: + - name: --stdin + type: file + description: | + File containing the input data. + - name: --error_correct_cell + type: boolean_true + description: | + Error correct cell barcodes to the whitelist. + - name: --whitelist + type: file + description: | + Whitelist of accepted cell barcodes in tab-separated format, where column 1 is the whitelisted + cell barcodes and column 2 is the list (comma-separated) of other cell barcodes which should + be corrected to the barcode in column 1. If the --error_correct_cell option is not used, this + column will be ignored. + - name: --blacklist + type: file + description: | + Blacklist of cell barcodes to discard + - name: --subset_reads + type: integer + description: | + Only parse the first N reads + - name: --quality_filter_threshold + type: integer + description: | + Remove reads where any UMI base quality score falls below this threshold + - name: --quality_filter_mask + type: string + description: | + If a UMI base has a quality below this threshold, replace the base with 'N' + - name: --quality_encoding + type: string + description: | + Quality score encoding. Choose from: + - phred33 [33-77] + - phred64 [64-106] + - solexa [59-106] + - name: --reconcile_pairs + type: boolean_true + description: | + Allow read 2 infile to contain reads not in read 1 infile. 
This enables support for upstream protocols + where read one contains cell barcodes, and the read pairs have been filtered and corrected without regard + to the read2. + + - name: Experimental options + arguments: + - name: --bc_pattern + type: string + description: | + Pattern for barcode(s) on read 1. See --extract_method + - name: --bc_pattern2 + type: string + description: | + Pattern for barcode(s) on read 2. See --extract_method + - name: --extract_method + type: string + description: | + There are two methods enabled to extract the umi barcode (+/- cell barcode). For both methods, + the patterns should be provided using the --bc_pattern and --bc_pattern2 options. + - string: + This should be used where the barcodes are always in the same place in the read. + N = UMI position (required) + C = cell barcode position (optional) + X = sample position (optional) + - regex: + Allows for more flexible barcode extraction and should be used where the cell barcodes are variable in + length. Can also be used to filter out reads which do not contain an expected adapter sequence. + The expected groups in the regex are: + - umi_n = UMI positions, where n can be any value (required) + - cell_n = cell barcode positions, where n can be any value (optional) + - discard_n = positions to discard, where n can be any value (optional) + - name: --3prime + type: boolean_true + description: | + By default the barcode is assumed to be on the 5' end of the read, but use this option to specify that it is + on the 3' end instead. 
This option only works with --extract_method=string since 3' encoding can be specified + explicitly with a regex, e.g. .*(?P<umi_1>.{5})$ + - name: --read2_in + type: file + description: | + Filename for read pairs + - name: --filtered_out + type: file + description: | + Write out reads not matching regex pattern or cell barcode whitelist to this file + - name: --filtered_out2 + type: file + description: | + Write out read pairs not matching regex pattern or cell barcode whitelist to this file + - name: --ignore_read_pair_suffixes + type: boolean_true + description: | + Ignore SOH and STX read name suffixes. Note that this option is required if the suffixes are not whitespace + separated from the rest of the read name diff --git a/src/umi_tools/umi_tools_extract/help.txt b/src/umi_tools/umi_tools_extract/help.txt new file mode 100644 index 00000000..46c77ed0 --- /dev/null +++ b/src/umi_tools/umi_tools_extract/help.txt @@ -0,0 +1,106 @@ +''' +Generated from the following UMI-tools documentation: + https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options + https://umi-tools.readthedocs.io/en/latest/reference/extract.html +''' + +extract - Extract UMI from fastq + +Usage: + + Single-end: + umi_tools extract [OPTIONS] -p PATTERN [-I IN_FASTQ[.gz]] [-S OUT_FASTQ[.gz]] + + Paired end: + umi_tools extract [OPTIONS] -p PATTERN [-I IN_FASTQ[.gz]] [-S OUT_FASTQ[.gz]] --read2-in=IN2_FASTQ[.gz] --read2-out=OUT2_FASTQ[.gz] + + note: If -I/-S are omitted, standard in and standard out are used + for input and output. To generate a valid BAM file on + standard out, please redirect log with --log=LOGFILE or + --log2stderr. Input/Output will be (de)compressed if a + filename provided to -S/-I/--read2-in/--read2-out ends in .gz + +Common UMI-tools Options: + + -S, --stdout File where output is to go [default = stdout]. + -L, --log File with logging information [default = stdout]. + --log2stderr Send logging information to stderr [default = False]. 
+ -v, --verbose Log level. The higher, the more output [default = 1]. + -E, --error File with error information [default = stderr]. + --temp-dir Directory for temporary files. If not set, the bash environmental variable TMPDIR is used [default = None]. + --compresslevel Level of Gzip compression to use. Default=6 matches GNU gzip rather than python gzip default (which is 9) + + profiling and debugging options: + --timeit Store timing information in file [default=none]. + --timeit-name Name in timing file for this class of jobs [default=all]. + --timeit-header Add header for timing information [default=none]. + --random-seed Random seed to initialize number generator with [default=none]. + +Extract Options: + -I, --stdin File containing the input data [default = stdin]. + --error-correct-cell Error correct cell barcodes to the whitelist (see --whitelist) + --whitelist Whitelist of accepted cell barcodes. The whitelist should be in the following format (tab-separated): + AAAAAA AGAAAA + AAAATC + AAACAT + AAACTA AAACTN,GAACTA + AAATAC + AAATCA GAATCA + AAATGT AAAGGT,CAATGT + Where column 1 is the whitelisted cell barcodes and column 2 is the list (comma-separated) of other cell + barcodes which should be corrected to the barcode in column 1. If the --error-correct-cell option is not + used, this column will be ignored. Any additional columns in the whitelist input, such as the counts columns + from the output of umi_tools whitelist, will be ignored. + --blacklist Blacklist of cell barcodes to discard + --subset-reads=[N] Only parse the first N reads + --quality-filter-threshold Remove reads where any UMI base quality score falls below this threshold + --quality-filter-mask If a UMI base has a quality below this threshold, replace the base with 'N' + --quality-encoding Quality score encoding. Choose from: + 'phred33' [33-77] + 'phred64' [64-106] + 'solexa' [59-106] + --reconcile-pairs Allow read 2 infile to contain reads not in read 1 infile. 
This enables support for upstream protocols + where read one contains cell barcodes, and the read pairs have been filtered and corrected without regard + to the read2s. + +Experimental options: + Note: These options have not been extensively tested to ensure behaviour is as expected. If you have some suitable input files which + we can use for testing, please contact us. + If you have a library preparation method where the UMI may be in either read, you can use the following options to search for the + UMI in either read: + + --either-read --extract-method --bc-pattern=[PATTERN1] --bc-pattern2=[PATTERN2] + + Where both patterns match, the default behaviour is to discard both reads. If you want to select the read with the UMI with highest + sequence quality, provide --either-read-resolve=quality. + + + --bc-pattern Pattern for barcode(s) on read 1. See --extract-method + --bc-pattern2 Pattern for barcode(s) on read 2. See --extract-method + --extract-method There are two methods enabled to extract the umi barcode (+/- cell barcode). For both methods, the patterns + should be provided using the --bc-pattern and --bc-pattern2 options. + string: + This should be used where the barcodes are always in the same place in the read. + N = UMI position (required) + C = cell barcode position (optional) + X = sample position (optional) + Bases with Ns and Cs will be extracted and added to the read name. The corresponding sequence qualities will + be removed from the read. Bases with an X will be reattached to the read. + regex: + This method allows for more flexible barcode extraction and should be used where the cell barcodes are variable + in length. Alternatively, the regex option can also be used to filter out reads which do not contain an expected + adapter sequence. The regex must contain groups to define how the barcodes are encoded in the read. 
+ The expected groups in the regex are: + umi_n = UMI positions, where n can be any value (required) + cell_n = cell barcode positions, where n can be any value (optional) + discard_n = positions to discard, where n can be any value (optional) + --3prime By default the barcode is assumed to be on the 5' end of the read, but use this option to specify that it is + on the 3' end instead. This option only works with --extract-method=string since 3' encoding can be specified + explicitly with a regex, e.g. .*(?P<umi_1>.{5})$ + --read2-in Filename for read pairs + --filtered-out Write out reads not matching regex pattern or cell barcode whitelist to this file + --filtered-out2 Write out read pairs not matching regex pattern or cell barcode whitelist to this file + --ignore-read-pair-suffixes Ignore SOH and STX read name suffixes. Note that this option is required if the suffixes are not whitespace + separated from the rest of the read name + +For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/ \ No newline at end of file