From 945047ddfb8af8b9eabf069ff67ca02044ed9b7e Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Mon, 1 Jul 2024 12:10:08 +0100 Subject: [PATCH] Small formatting changes, update changelog --- CHANGELOG.md | 3 + .../umi_tools_extract/config.vsh.yaml | 6 +- src/umi_tools/umi_tools_extract/log.out | 200 ------------------ src/umi_tools/umi_tools_extract/script.sh | 8 +- src/umi_tools/umi_tools_extract/test.sh | 3 - 5 files changed, 11 insertions(+), 209 deletions(-) delete mode 100644 src/umi_tools/umi_tools_extract/log.out diff --git a/CHANGELOG.md b/CHANGELOG.md index 151b7a41..ee0bef95 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -48,6 +48,9 @@ - `samtools/samtools_view`: Views and converts SAM/BAM/CRAM files (PR #48). - `samtools/samtools_fastq`: Converts a SAM/BAM/CRAM file to FASTQ (PR #52). +* `umi_tools`: + - `umi_tools/umi_tools_extract`: Flexible removal of UMI sequences from fastq reads (PR #71). + * `falco`: A C++ drop-in replacement of FastQC to assess the quality of sequence read data (PR #43). diff --git a/src/umi_tools/umi_tools_extract/config.vsh.yaml b/src/umi_tools/umi_tools_extract/config.vsh.yaml index a0fd4a15..6586b557 100644 --- a/src/umi_tools/umi_tools_extract/config.vsh.yaml +++ b/src/umi_tools/umi_tools_extract/config.vsh.yaml @@ -32,7 +32,7 @@ argument_groups: description: | Whitelist of accepted cell barcodes tab-separated format, where column 1 is the whitelisted cell barcodes and column 2 is the list (comma-separated) of other cell barcodes which should - be corrected to the barcode in column 1. If the --error-correct-cell option is not used, this + be corrected to the barcode in column 1. If the --error_correct_cell option is not used, this column will be ignored. - name: --blacklist type: file @@ -71,9 +71,9 @@ argument_groups: - name: --ignore_read_pair_suffixes type: boolean_true description: | - Ignore SOH and STX read name suffixes. 
Note that this options is required if the suffixes are not whitespace + Ignore "/1" and "/2" read name suffixes. Note that this option is required if the suffixes are not whitespace separated from the rest of the read name - + - name: Input arguments: - name: --paired diff --git a/src/umi_tools/umi_tools_extract/log.out b/src/umi_tools/umi_tools_extract/log.out deleted file mode 100644 index d2950223..00000000 --- a/src/umi_tools/umi_tools_extract/log.out +++ /dev/null @@ -1,200 +0,0 @@ -# UMI-tools version: 1.1.4 -# output generated by extract -I test_data/scrb_seq_fastq.1.gz --read2-in test_data/scrb_seq_fastq.2.gz --bc-pattern CCCCCCNNNNNNNNNN --bc-pattern2 CCCCCCNNNNNNNNNN --extract-method string --umi-separator _ -S scrb_seq_fastq.1.umi_extract.fastq.gz --read2-out scrb_seq_fastq.2.umi_extract.fastq.gz --log log.out --random-seed 1 -# job started at Sun Jun 2 18:15:07 2024 on Aleks-MacBook-Air.local -- 73a6bd2a-5dd9-4473-8c32-cf7a2dd72117 -# pid: 66002, system: Darwin 23.4.0 Darwin Kernel Version 23.4.0: Fri Mar 15 00:11:05 PDT 2024; root:xnu-10063.101.17~1/RELEASE_X86_64 x86_64 -# blacklist : None -# compresslevel : 6 -# correct_umi_threshold : 0 -# either_read : False -# either_read_resolve : discard -# error_correct_cell : False -# extract_method : string -# filter_cell_barcode : None -# filter_cell_barcodes : False -# filter_umi : None -# filtered_out : None -# filtered_out2 : None -# ignore_suffix : False -# log2stderr : False -# loglevel : 1 -# pattern : CCCCCCNNNNNNNNNN -# pattern2 : CCCCCCNNNNNNNNNN -# prime3 : None -# quality_encoding : None -# quality_filter_mask : None -# quality_filter_threshold : None -# random_seed : 1 -# read2_in : test_data/scrb_seq_fastq.2.gz -# read2_out : scrb_seq_fastq.2.umi_extract.fastq.gz -# read2_stdout : False -# reads_subset : None -# reconcile : False -# retain_umi : None -# short_help : None -# stderr : <_io.TextIOWrapper name='' mode='w' encoding='utf-8'> -# stdin : <_io.TextIOWrapper 
name='test_data/scrb_seq_fastq.1.gz' encoding='ascii'> -# stdlog : <_io.TextIOWrapper name='log.out' mode='a' encoding='UTF-8'> -# stdout : <_io.TextIOWrapper name='scrb_seq_fastq.1.umi_extract.fastq.gz' encoding='ascii'> -# timeit_file : None -# timeit_header : None -# timeit_name : all -# tmpdir : None -# umi_correct_log : None -# umi_separator : _ -# umi_whitelist : None -# umi_whitelist_paired : None -# whitelist : None -2024-06-02 18:15:07,695 INFO Starting barcode extraction -2024-06-02 18:15:08,221 INFO Input Reads: 10000 -2024-06-02 18:15:08,222 INFO Reads output: 10000 -# job finished in 0 seconds at Sun Jun 2 18:15:08 2024 -- 2.98 0.98 0.00 0.00 -- 73a6bd2a-5dd9-4473-8c32-cf7a2dd72117 -# UMI-tools version: 1.1.4 -# output generated by extract -I test_data/scrb_seq_fastq.1.gz --read2-in test_data/scrb_seq_fastq.2.gz --bc-pattern CCCCCCNNNNNNNNNN --bc-pattern2 CCCCCCNNNNNNNNNN --extract-method string -S scrb_seq_fastq.1.umi_extract.fastq.gz --read2-out scrb_seq_fastq.2.umi_extract.fastq.gz --log log.out --random-seed 1 -# job started at Sun Jun 2 18:50:33 2024 on Aleks-MacBook-Air.local -- d91646df-6947-449d-9885-d33b60a81dcd -# pid: 66898, system: Darwin 23.4.0 Darwin Kernel Version 23.4.0: Fri Mar 15 00:11:05 PDT 2024; root:xnu-10063.101.17~1/RELEASE_X86_64 x86_64 -# blacklist : None -# compresslevel : 6 -# correct_umi_threshold : 0 -# either_read : False -# either_read_resolve : discard -# error_correct_cell : False -# extract_method : string -# filter_cell_barcode : None -# filter_cell_barcodes : False -# filter_umi : None -# filtered_out : None -# filtered_out2 : None -# ignore_suffix : False -# log2stderr : False -# loglevel : 1 -# pattern : CCCCCCNNNNNNNNNN -# pattern2 : CCCCCCNNNNNNNNNN -# prime3 : None -# quality_encoding : None -# quality_filter_mask : None -# quality_filter_threshold : None -# random_seed : 1 -# read2_in : test_data/scrb_seq_fastq.2.gz -# read2_out : scrb_seq_fastq.2.umi_extract.fastq.gz -# read2_stdout : False -# reads_subset : 
None -# reconcile : False -# retain_umi : None -# short_help : None -# stderr : <_io.TextIOWrapper name='' mode='w' encoding='utf-8'> -# stdin : <_io.TextIOWrapper name='test_data/scrb_seq_fastq.1.gz' encoding='ascii'> -# stdlog : <_io.TextIOWrapper name='log.out' mode='a' encoding='UTF-8'> -# stdout : <_io.TextIOWrapper name='scrb_seq_fastq.1.umi_extract.fastq.gz' encoding='ascii'> -# timeit_file : None -# timeit_header : None -# timeit_name : all -# tmpdir : None -# umi_correct_log : None -# umi_separator : _ -# umi_whitelist : None -# umi_whitelist_paired : None -# whitelist : None -2024-06-02 18:50:33,203 INFO Starting barcode extraction -2024-06-02 18:50:33,698 INFO Input Reads: 10000 -2024-06-02 18:50:33,698 INFO Reads output: 10000 -# job finished in 0 seconds at Sun Jun 2 18:50:33 2024 -- 3.08 0.94 0.00 0.00 -- d91646df-6947-449d-9885-d33b60a81dcd -# UMI-tools version: 1.1.4 -# output generated by extract -I test_data/scrb_seq_fastq.1.gz --read2-in test_data/scrb_seq_fastq.2.gz --bc-pattern CCCCCCNNNNNNNNNN --bc-pattern2 CCCCCCNNNNNNNNNN --extract-method string --umi-separator _ -S test_data/scrb_seq_fastq.1.umi_extract.fastq.gz --read2-out test_data/scrb_seq_fastq.2.umi_extract.fastq.gz --log log.out --random-seed 1 -# job started at Sun Jun 2 19:07:36 2024 on Aleks-MacBook-Air.local -- 361a8e5b-157b-4b9f-a291-b9c107594f66 -# pid: 68495, system: Darwin 23.4.0 Darwin Kernel Version 23.4.0: Fri Mar 15 00:11:05 PDT 2024; root:xnu-10063.101.17~1/RELEASE_X86_64 x86_64 -# blacklist : None -# compresslevel : 6 -# correct_umi_threshold : 0 -# either_read : False -# either_read_resolve : discard -# error_correct_cell : False -# extract_method : string -# filter_cell_barcode : None -# filter_cell_barcodes : False -# filter_umi : None -# filtered_out : None -# filtered_out2 : None -# ignore_suffix : False -# log2stderr : False -# loglevel : 1 -# pattern : CCCCCCNNNNNNNNNN -# pattern2 : CCCCCCNNNNNNNNNN -# prime3 : None -# quality_encoding : None -# 
quality_filter_mask : None -# quality_filter_threshold : None -# random_seed : 1 -# read2_in : test_data/scrb_seq_fastq.2.gz -# read2_out : test_data/scrb_seq_fastq.2.umi_extract.fastq.gz -# read2_stdout : False -# reads_subset : None -# reconcile : False -# retain_umi : None -# short_help : None -# stderr : <_io.TextIOWrapper name='' mode='w' encoding='utf-8'> -# stdin : <_io.TextIOWrapper name='test_data/scrb_seq_fastq.1.gz' encoding='ascii'> -# stdlog : <_io.TextIOWrapper name='log.out' mode='a' encoding='UTF-8'> -# stdout : <_io.TextIOWrapper name='test_data/scrb_seq_fastq.1.umi_extract.fastq.gz' encoding='ascii'> -# timeit_file : None -# timeit_header : None -# timeit_name : all -# tmpdir : None -# umi_correct_log : None -# umi_separator : _ -# umi_whitelist : None -# umi_whitelist_paired : None -# whitelist : None -2024-06-02 19:07:36,801 INFO Starting barcode extraction -2024-06-02 19:07:37,319 INFO Input Reads: 10000 -2024-06-02 19:07:37,319 INFO Reads output: 10000 -# job finished in 0 seconds at Sun Jun 2 19:07:37 2024 -- 3.33 0.76 0.00 0.00 -- 361a8e5b-157b-4b9f-a291-b9c107594f66 -# UMI-tools version: 1.1.4 -# output generated by extract -I test_data/scrb_seq_fastq.1.gz --read2-in test_data/scrb_seq_fastq.2.gz --bc-pattern CCCCCCNNNNNNNNNN --bc-pattern2 CCCCCCNNNNNNNNNN --extract-method string --umi-separator _ -S scrb_seq_fastq.1.umi_extract.fastq.gz --read2-out scrb_seq_fastq.2.umi_extract.fastq.gz --log log.out --random-seed 1 -# job started at Sun Jun 2 19:28:44 2024 on Aleks-MacBook-Air.local -- 15f7521c-d72a-48a9-bc5e-178e1290e3d0 -# pid: 69211, system: Darwin 23.4.0 Darwin Kernel Version 23.4.0: Fri Mar 15 00:11:05 PDT 2024; root:xnu-10063.101.17~1/RELEASE_X86_64 x86_64 -# blacklist : None -# compresslevel : 6 -# correct_umi_threshold : 0 -# either_read : False -# either_read_resolve : discard -# error_correct_cell : False -# extract_method : string -# filter_cell_barcode : None -# filter_cell_barcodes : False -# filter_umi : None -# filtered_out 
: None -# filtered_out2 : None -# ignore_suffix : False -# log2stderr : False -# loglevel : 1 -# pattern : CCCCCCNNNNNNNNNN -# pattern2 : CCCCCCNNNNNNNNNN -# prime3 : None -# quality_encoding : None -# quality_filter_mask : None -# quality_filter_threshold : None -# random_seed : 1 -# read2_in : test_data/scrb_seq_fastq.2.gz -# read2_out : scrb_seq_fastq.2.umi_extract.fastq.gz -# read2_stdout : False -# reads_subset : None -# reconcile : False -# retain_umi : None -# short_help : None -# stderr : <_io.TextIOWrapper name='' mode='w' encoding='utf-8'> -# stdin : <_io.TextIOWrapper name='test_data/scrb_seq_fastq.1.gz' encoding='ascii'> -# stdlog : <_io.TextIOWrapper name='log.out' mode='a' encoding='UTF-8'> -# stdout : <_io.TextIOWrapper name='scrb_seq_fastq.1.umi_extract.fastq.gz' encoding='ascii'> -# timeit_file : None -# timeit_header : None -# timeit_name : all -# tmpdir : None -# umi_correct_log : None -# umi_separator : _ -# umi_whitelist : None -# umi_whitelist_paired : None -# whitelist : None -2024-06-02 19:28:44,260 INFO Starting barcode extraction -2024-06-02 19:28:44,778 INFO Input Reads: 10000 -2024-06-02 19:28:44,778 INFO Reads output: 10000 -# job finished in 0 seconds at Sun Jun 2 19:28:44 2024 -- 3.26 0.74 0.00 0.00 -- 15f7521c-d72a-48a9-bc5e-178e1290e3d0 diff --git a/src/umi_tools/umi_tools_extract/script.sh b/src/umi_tools/umi_tools_extract/script.sh index c12b7d43..4f515b10 100644 --- a/src/umi_tools/umi_tools_extract/script.sh +++ b/src/umi_tools/umi_tools_extract/script.sh @@ -16,16 +16,17 @@ test_dir="${metal_executable}/test_data" [[ "$par_log2stderr" == "false" ]] && unset par_log2stderr +# Check if we have the correct number of input files and patterns for paired-end or single-end reads + +# For paired-end reads, check that we have two read files, two patterns if [ -n "$par_paired" ]; then - # For paired-end rendscheck that we have two read files, two patterns if [ -z "$par_input" ] || [ -z "$par_read2_in" ] || [ -z "$par_bc_pattern" ] || [ 
-z "$par_bc_pattern2" ]; then echo "Paired end input requires two read files, two UMI patterns, and two output files" exit 1 fi -else - # For single-end reads, check that we have only one read file, one pattern +else # For single-end reads, check that we have only one read file, one pattern if [ -n "$par_read2_in" ] || [ -n "$par_bc_pattern2" ]; then echo "Single end input requires one read file and one UMI pattern" exit 1 @@ -35,6 +36,7 @@ else fi fi + umi_tools extract \ -I "$par_input" \ ${par_read2_in:+ --read2-in "$par_read2_in"} \ diff --git a/src/umi_tools/umi_tools_extract/test.sh b/src/umi_tools/umi_tools_extract/test.sh index 7bc98e93..c7d4985d 100644 --- a/src/umi_tools/umi_tools_extract/test.sh +++ b/src/umi_tools/umi_tools_extract/test.sh @@ -19,7 +19,6 @@ echo ">>> Test 1: Testing for paired-end reads" --umi_discard_read 0 \ --read1_out scrb_seq_fastq.1.umi_extract.fastq.gz \ --read2_out scrb_seq_fastq.2.umi_extract.fastq.gz \ - --log log.out \ --random_seed 1 echo ">> Checking if the correct files are present" @@ -50,7 +49,6 @@ echo ">>> Test 2: Testing for paired-end reads with umi_discard_reads option" --umitools_grouping_method directional \ --umi_discard_read 2 \ --read1_out scrb_seq_fastq.1.umi_extract.fastq.gz \ - --log log.out \ --random_seed 1 echo ">> Checking if the correct files are present" @@ -74,7 +72,6 @@ echo ">>> Test 3: Testing for single-end reads" --umitools_umi_separator '_' \ --umitools_grouping_method directional \ --read1_out slim.umi_extract.fastq.gz \ - --log log.out \ --random_seed 1 echo ">> Checking if the correct files are present"