Skip to content

Commit

Permalink
include arguments and script from the rnaseq.vsh version
Browse files Browse the repository at this point in the history
  • Loading branch information
emmarousseau committed Jun 2, 2024
1 parent 26c2437 commit 5072b3e
Show file tree
Hide file tree
Showing 11 changed files with 120,422 additions and 97 deletions.
97 changes: 62 additions & 35 deletions src/umi_tools/umi_tools_extract/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,6 @@ license: MIT
argument_groups:
- name: Extract Options
arguments:
- name: --input
alternatives: --stdin
type: file
description: File containing the input data.
required: true
- name: --output
alternatives: --stdout
type: file
description: File where output is to go.
direction: output
- name: --error_correct_cell
type: boolean_true
description: Error correct cell barcodes to the whitelist.
Expand Down Expand Up @@ -65,32 +55,9 @@ argument_groups:
Allow read 2 infile to contain reads not in read 1 infile. This enables support for upstream protocols
where read one contains cell barcodes, and the read pairs have been filtered and corrected without regard
to the read2.
- name: Experimental options
arguments:
- name: --bc_pattern
type: string
description: Pattern for barcode(s) on read 1. See --extract_method
- name: --bc_pattern2
type: string
description: Pattern for barcode(s) on read 2. See --extract_method
- name: --extract_method
type: string
description: |
There are two methods enabled to extract the umi barcode (+/- cell barcode). For both methods,
the patterns should be provided using the --bc_pattern and --bc_pattern2 options.
- string:
This should be used where the barcodes are always in the same place in the read.
N = UMI position (required)
C = cell barcode position (optional)
X = sample position (optional)
- regex:
Allows for more flexible barcode extraction and should be used where the cell barcodes are variable in
length. Can also be used to filter out reads which do not contain an expected adapter sequence.
The expected groups in the regex are:
- umi_n = UMI positions, where n can be any value (required)
- cell_n = cell barcode positions, where n can be any value (optional)
- discard_n = positions to discard, where n can be any value (optional)
- name: --three_prime
alternatives: "--3prime"
type: boolean_true
Expand Down Expand Up @@ -120,15 +87,18 @@ argument_groups:
- name: --log
type: file
description: File with logging information.
direction: output
# Boolean switch that redirects logging to stderr. It is a flag, not a file
# the component writes, so it carries no `direction: output`.
- name: --log2stderr
  type: boolean_true
  description: Send logging information to stderr.
- name: --verbose
type: integer
description: Log level. The higher, the more output.
- name: --error
type: file
description: File with error information.
direction: output
- name: --temp_dir
type: string
description: |
Expand All @@ -141,6 +111,7 @@ argument_groups:
- name: --timeit
type: file
description: Store timing information in file.
direction: output
- name: --timeit_name
type: string
description: Name in timing file for this class of jobs.
Expand All @@ -151,6 +122,62 @@ argument_groups:
- name: --random_seed
type: integer
description: Random seed to initialize number generator with.

# Input group: the paired-end switch, the FASTQ file(s) to read and the
# barcode pattern(s) to apply to them.
- name: Input
  arguments:
    - name: --paired
      type: boolean_true
      description: 'Paired fastq files or not? If option is set, two input files are expected.'
    # Comma-separated list of input files.
    - name: --input
      type: file
      description: Input fastq files, either one or two (paired)
      example: sample.fastq
      required: true
      multiple: true
      multiple_sep: ','
    # Comma-separated list of barcode patterns.
    - name: --bc_pattern
      type: string
      description: >-
        The UMI barcode pattern to use e.g. 'NNNNNN' indicates that the first 6
        nucleotides of the read are from the UMI.
      multiple: true
      multiple_sep: ','

# Output group: the FASTQ files written by the component, one per read.
- name: Output
  arguments:
    # Read 1 output is always required.
    - name: --fastq_1
      type: file
      direction: output
      required: true
      description: Output file for read 1.
    # Read 2 output is optional.
    - name: --fastq_2
      type: file
      direction: output
      required: false
      must_exist: false
      description: Output file for read 2.

# Optional tuning parameters. Names, alternatives, types, defaults and choices
# are unchanged; only descriptions were corrected or completed.
- name: "Optional arguments"
  arguments:
    # Selects HOW barcodes are located in the read; the pattern itself is
    # given separately through --bc_pattern.
    - name: "--umitools_extract_method"
      alternatives: "--extract_method"
      type: string
      description: |
        Method used to extract the UMI (and optional cell) barcode described by --bc_pattern.
        - string: the barcodes are always in the same place in the read.
        - regex: the barcodes are matched with a regular expression, which allows cell barcodes of variable length.
      default: "string"
      choices: ["string", "regex"]
    - name: "--umitools_umi_separator"
      alternatives: "--umi_separator"
      type: string
      default: "_"
      description: The character that separates the UMI in the read name. Most likely a colon if you skipped the extraction with UMI-tools and used other software.
    # NOTE(review): read grouping is a concept of `umi_tools dedup`/`group`;
    # the `extract` runs recorded for this component pass no such option.
    # Confirm the script actually consumes this argument.
    - name: "--umitools_grouping_method"
      type: string
      description: Method to use to determine read groups by subsuming those with similar UMIs. All methods start by identifying the reads with the same mapping position, but treat similar yet nonidentical UMIs differently.
      default: "directional"
      choices: ["unique", "percentile", "cluster", "adjacency", "directional"]
    # NOTE(review): 0 is assumed to mean "discard neither read" — confirm
    # against the script before relying on the wording below.
    - name: "--umi_discard_read"
      type: integer
      description: After UMI barcode extraction discard either R1 or R2 by setting this parameter to 1 or 2, respectively. The default, 0, keeps both reads.
      choices: [0, 1, 2]
      default: 0

resources:
- type: bash_script
Expand All @@ -162,7 +189,7 @@ test_resources:
path: test_data
engines:
- type: docker
image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1
image: quay.io/biocontainers/umi_tools:1.1.4--py310h4b81fae_2
setup:
- type: docker
run: |
Expand Down
200 changes: 200 additions & 0 deletions src/umi_tools/umi_tools_extract/log.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
# UMI-tools version: 1.1.4
# output generated by extract -I test_data/scrb_seq_fastq.1.gz --read2-in test_data/scrb_seq_fastq.2.gz --bc-pattern CCCCCCNNNNNNNNNN --bc-pattern2 CCCCCCNNNNNNNNNN --extract-method string --umi-separator _ -S scrb_seq_fastq.1.umi_extract.fastq.gz --read2-out scrb_seq_fastq.2.umi_extract.fastq.gz --log log.out --random-seed 1
# job started at Sun Jun 2 18:15:07 2024 on Aleks-MacBook-Air.local -- 73a6bd2a-5dd9-4473-8c32-cf7a2dd72117
# pid: 66002, system: Darwin 23.4.0 Darwin Kernel Version 23.4.0: Fri Mar 15 00:11:05 PDT 2024; root:xnu-10063.101.17~1/RELEASE_X86_64 x86_64
# blacklist : None
# compresslevel : 6
# correct_umi_threshold : 0
# either_read : False
# either_read_resolve : discard
# error_correct_cell : False
# extract_method : string
# filter_cell_barcode : None
# filter_cell_barcodes : False
# filter_umi : None
# filtered_out : None
# filtered_out2 : None
# ignore_suffix : False
# log2stderr : False
# loglevel : 1
# pattern : CCCCCCNNNNNNNNNN
# pattern2 : CCCCCCNNNNNNNNNN
# prime3 : None
# quality_encoding : None
# quality_filter_mask : None
# quality_filter_threshold : None
# random_seed : 1
# read2_in : test_data/scrb_seq_fastq.2.gz
# read2_out : scrb_seq_fastq.2.umi_extract.fastq.gz
# read2_stdout : False
# reads_subset : None
# reconcile : False
# retain_umi : None
# short_help : None
# stderr : <_io.TextIOWrapper name='<stderr>' mode='w' encoding='utf-8'>
# stdin : <_io.TextIOWrapper name='test_data/scrb_seq_fastq.1.gz' encoding='ascii'>
# stdlog : <_io.TextIOWrapper name='log.out' mode='a' encoding='UTF-8'>
# stdout : <_io.TextIOWrapper name='scrb_seq_fastq.1.umi_extract.fastq.gz' encoding='ascii'>
# timeit_file : None
# timeit_header : None
# timeit_name : all
# tmpdir : None
# umi_correct_log : None
# umi_separator : _
# umi_whitelist : None
# umi_whitelist_paired : None
# whitelist : None
2024-06-02 18:15:07,695 INFO Starting barcode extraction
2024-06-02 18:15:08,221 INFO Input Reads: 10000
2024-06-02 18:15:08,222 INFO Reads output: 10000
# job finished in 0 seconds at Sun Jun 2 18:15:08 2024 -- 2.98 0.98 0.00 0.00 -- 73a6bd2a-5dd9-4473-8c32-cf7a2dd72117
# UMI-tools version: 1.1.4
# output generated by extract -I test_data/scrb_seq_fastq.1.gz --read2-in test_data/scrb_seq_fastq.2.gz --bc-pattern CCCCCCNNNNNNNNNN --bc-pattern2 CCCCCCNNNNNNNNNN --extract-method string -S scrb_seq_fastq.1.umi_extract.fastq.gz --read2-out scrb_seq_fastq.2.umi_extract.fastq.gz --log log.out --random-seed 1
# job started at Sun Jun 2 18:50:33 2024 on Aleks-MacBook-Air.local -- d91646df-6947-449d-9885-d33b60a81dcd
# pid: 66898, system: Darwin 23.4.0 Darwin Kernel Version 23.4.0: Fri Mar 15 00:11:05 PDT 2024; root:xnu-10063.101.17~1/RELEASE_X86_64 x86_64
# blacklist : None
# compresslevel : 6
# correct_umi_threshold : 0
# either_read : False
# either_read_resolve : discard
# error_correct_cell : False
# extract_method : string
# filter_cell_barcode : None
# filter_cell_barcodes : False
# filter_umi : None
# filtered_out : None
# filtered_out2 : None
# ignore_suffix : False
# log2stderr : False
# loglevel : 1
# pattern : CCCCCCNNNNNNNNNN
# pattern2 : CCCCCCNNNNNNNNNN
# prime3 : None
# quality_encoding : None
# quality_filter_mask : None
# quality_filter_threshold : None
# random_seed : 1
# read2_in : test_data/scrb_seq_fastq.2.gz
# read2_out : scrb_seq_fastq.2.umi_extract.fastq.gz
# read2_stdout : False
# reads_subset : None
# reconcile : False
# retain_umi : None
# short_help : None
# stderr : <_io.TextIOWrapper name='<stderr>' mode='w' encoding='utf-8'>
# stdin : <_io.TextIOWrapper name='test_data/scrb_seq_fastq.1.gz' encoding='ascii'>
# stdlog : <_io.TextIOWrapper name='log.out' mode='a' encoding='UTF-8'>
# stdout : <_io.TextIOWrapper name='scrb_seq_fastq.1.umi_extract.fastq.gz' encoding='ascii'>
# timeit_file : None
# timeit_header : None
# timeit_name : all
# tmpdir : None
# umi_correct_log : None
# umi_separator : _
# umi_whitelist : None
# umi_whitelist_paired : None
# whitelist : None
2024-06-02 18:50:33,203 INFO Starting barcode extraction
2024-06-02 18:50:33,698 INFO Input Reads: 10000
2024-06-02 18:50:33,698 INFO Reads output: 10000
# job finished in 0 seconds at Sun Jun 2 18:50:33 2024 -- 3.08 0.94 0.00 0.00 -- d91646df-6947-449d-9885-d33b60a81dcd
# UMI-tools version: 1.1.4
# output generated by extract -I test_data/scrb_seq_fastq.1.gz --read2-in test_data/scrb_seq_fastq.2.gz --bc-pattern CCCCCCNNNNNNNNNN --bc-pattern2 CCCCCCNNNNNNNNNN --extract-method string --umi-separator _ -S test_data/scrb_seq_fastq.1.umi_extract.fastq.gz --read2-out test_data/scrb_seq_fastq.2.umi_extract.fastq.gz --log log.out --random-seed 1
# job started at Sun Jun 2 19:07:36 2024 on Aleks-MacBook-Air.local -- 361a8e5b-157b-4b9f-a291-b9c107594f66
# pid: 68495, system: Darwin 23.4.0 Darwin Kernel Version 23.4.0: Fri Mar 15 00:11:05 PDT 2024; root:xnu-10063.101.17~1/RELEASE_X86_64 x86_64
# blacklist : None
# compresslevel : 6
# correct_umi_threshold : 0
# either_read : False
# either_read_resolve : discard
# error_correct_cell : False
# extract_method : string
# filter_cell_barcode : None
# filter_cell_barcodes : False
# filter_umi : None
# filtered_out : None
# filtered_out2 : None
# ignore_suffix : False
# log2stderr : False
# loglevel : 1
# pattern : CCCCCCNNNNNNNNNN
# pattern2 : CCCCCCNNNNNNNNNN
# prime3 : None
# quality_encoding : None
# quality_filter_mask : None
# quality_filter_threshold : None
# random_seed : 1
# read2_in : test_data/scrb_seq_fastq.2.gz
# read2_out : test_data/scrb_seq_fastq.2.umi_extract.fastq.gz
# read2_stdout : False
# reads_subset : None
# reconcile : False
# retain_umi : None
# short_help : None
# stderr : <_io.TextIOWrapper name='<stderr>' mode='w' encoding='utf-8'>
# stdin : <_io.TextIOWrapper name='test_data/scrb_seq_fastq.1.gz' encoding='ascii'>
# stdlog : <_io.TextIOWrapper name='log.out' mode='a' encoding='UTF-8'>
# stdout : <_io.TextIOWrapper name='test_data/scrb_seq_fastq.1.umi_extract.fastq.gz' encoding='ascii'>
# timeit_file : None
# timeit_header : None
# timeit_name : all
# tmpdir : None
# umi_correct_log : None
# umi_separator : _
# umi_whitelist : None
# umi_whitelist_paired : None
# whitelist : None
2024-06-02 19:07:36,801 INFO Starting barcode extraction
2024-06-02 19:07:37,319 INFO Input Reads: 10000
2024-06-02 19:07:37,319 INFO Reads output: 10000
# job finished in 0 seconds at Sun Jun 2 19:07:37 2024 -- 3.33 0.76 0.00 0.00 -- 361a8e5b-157b-4b9f-a291-b9c107594f66
# UMI-tools version: 1.1.4
# output generated by extract -I test_data/scrb_seq_fastq.1.gz --read2-in test_data/scrb_seq_fastq.2.gz --bc-pattern CCCCCCNNNNNNNNNN --bc-pattern2 CCCCCCNNNNNNNNNN --extract-method string --umi-separator _ -S scrb_seq_fastq.1.umi_extract.fastq.gz --read2-out scrb_seq_fastq.2.umi_extract.fastq.gz --log log.out --random-seed 1
# job started at Sun Jun 2 19:28:44 2024 on Aleks-MacBook-Air.local -- 15f7521c-d72a-48a9-bc5e-178e1290e3d0
# pid: 69211, system: Darwin 23.4.0 Darwin Kernel Version 23.4.0: Fri Mar 15 00:11:05 PDT 2024; root:xnu-10063.101.17~1/RELEASE_X86_64 x86_64
# blacklist : None
# compresslevel : 6
# correct_umi_threshold : 0
# either_read : False
# either_read_resolve : discard
# error_correct_cell : False
# extract_method : string
# filter_cell_barcode : None
# filter_cell_barcodes : False
# filter_umi : None
# filtered_out : None
# filtered_out2 : None
# ignore_suffix : False
# log2stderr : False
# loglevel : 1
# pattern : CCCCCCNNNNNNNNNN
# pattern2 : CCCCCCNNNNNNNNNN
# prime3 : None
# quality_encoding : None
# quality_filter_mask : None
# quality_filter_threshold : None
# random_seed : 1
# read2_in : test_data/scrb_seq_fastq.2.gz
# read2_out : scrb_seq_fastq.2.umi_extract.fastq.gz
# read2_stdout : False
# reads_subset : None
# reconcile : False
# retain_umi : None
# short_help : None
# stderr : <_io.TextIOWrapper name='<stderr>' mode='w' encoding='utf-8'>
# stdin : <_io.TextIOWrapper name='test_data/scrb_seq_fastq.1.gz' encoding='ascii'>
# stdlog : <_io.TextIOWrapper name='log.out' mode='a' encoding='UTF-8'>
# stdout : <_io.TextIOWrapper name='scrb_seq_fastq.1.umi_extract.fastq.gz' encoding='ascii'>
# timeit_file : None
# timeit_header : None
# timeit_name : all
# tmpdir : None
# umi_correct_log : None
# umi_separator : _
# umi_whitelist : None
# umi_whitelist_paired : None
# whitelist : None
2024-06-02 19:28:44,260 INFO Starting barcode extraction
2024-06-02 19:28:44,778 INFO Input Reads: 10000
2024-06-02 19:28:44,778 INFO Reads output: 10000
# job finished in 0 seconds at Sun Jun 2 19:28:44 2024 -- 3.26 0.74 0.00 0.00 -- 15f7521c-d72a-48a9-bc5e-178e1290e3d0
Loading

0 comments on commit 5072b3e

Please sign in to comment.