diff --git a/src/umi_tools/umi_tools_extract/config.vsh.yaml b/src/umi_tools/umi_tools_extract/config.vsh.yaml index 9fee19f9..2972820c 100644 --- a/src/umi_tools/umi_tools_extract/config.vsh.yaml +++ b/src/umi_tools/umi_tools_extract/config.vsh.yaml @@ -57,7 +57,7 @@ argument_groups: description: | Write out read pairs not matching regex pattern or cell barcode whitelist to this file. - - name: Extract Options + - name: Extract Options arguments: - name: --umitools_extract_method alternatives: --extract_method @@ -115,6 +115,27 @@ argument_groups: description: | Ignore "/1" and "/2" read name suffixes. Note that this options is required if the suffixes are not whitespace separated from the rest of the read name. + arguments: + - name: --umitools_umi_separator + alternatives: --umi_separator + type: string + description: | + The character that separates the UMI in the read name. Most likely a colon if you skipped the extraction with + UMI-tools and used other software. Default: `_` + example: "_" + - name: --umitools_grouping_method + type: string + choices: [unique, percentile, cluster, adjacency, directional] + description: | + Method to use to determine read groups by subsuming those with similar UMIs. All methods start by identifying + the reads with the same mapping position, but treat similar yet nonidentical UMIs differently. Default: `directional` + example: "directional" + - name: --umi_discard_read + type: integer + choices: [0, 1, 2] + description: | + After UMI barcode extraction discard either R1 or R2 by setting this parameter to 1 or 2, respectively. Default: `0` + example: 0 - name: Common Options arguments: diff --git a/src/umi_tools/umi_tools_extract/script.sh b/src/umi_tools/umi_tools_extract/script.sh index 4f515b10..dfa95bca 100644 --- a/src/umi_tools/umi_tools_extract/script.sh +++ b/src/umi_tools/umi_tools_extract/script.sh @@ -3,7 +3,7 @@ ## VIASH START ## VIASH END -set -eo pipefail +set -exo pipefail test_dir="${metal_executable}/test_data" @@ -28,9 +28,10 @@ if [ -n "$par_paired" ]; then fi else # For single-end reads, check that we have only one read file, one pattern if [ -n "$par_read2_in" ] || [ -n "$par_bc_pattern2" ]; then - echo "Single end input requires one read file and one UMI pattern" + echo "Single end input requires only one read file and one UMI pattern" exit 1 - elif [ "$par_umi_discard_read" != 0 ]; then + # if par_umi_discard_read is not empty or not 0: + elif [ -n "$par_umi_discard_read" ] && [ "$par_umi_discard_read" != 0 ]; then echo "umi_discard_read is only valid when processing paired end reads." exit 1 fi @@ -71,10 +72,10 @@ umi_tools extract \ ${par_error:+--error "$par_error"} -if [ $par_umi_discard_read == 1 ]; then +if [ "$par_umi_discard_read" == 1 ]; then # discard read 1 rm "$par_read1_out" -elif [ $par_umi_discard_read == 2 ]; then +elif [ "$par_umi_discard_read" == 2 ]; then # discard read 2 (-f to bypass file existence check) rm -f "$par_read2_out" fi \ No newline at end of file diff --git a/src/umi_tools/umi_tools_extract/test.sh b/src/umi_tools/umi_tools_extract/test.sh index c7d4985d..e9c13e27 100644 --- a/src/umi_tools/umi_tools_extract/test.sh +++ b/src/umi_tools/umi_tools_extract/test.sh @@ -9,80 +9,80 @@ echo ">>> Testing $meta_functionality_name" echo ">>> Test 1: Testing for paired-end reads" "$meta_executable" \ --paired \ - --input "$test_dir/scrb_seq_fastq.1.gz"\ - --read2_in "$test_dir/scrb_seq_fastq.2.gz" \ + --input "$test_dir/scrb_seq_fastq.1_30"\ + --read2_in "$test_dir/scrb_seq_fastq.2_30" \ --bc_pattern "CCCCCCNNNNNNNNNN"\ --bc_pattern2 "CCCCCCNNNNNNNNNN" \ --umitools_extract_method string \ --umitools_umi_separator '_' \ --umitools_grouping_method directional \ --umi_discard_read 0 \ - --read1_out scrb_seq_fastq.1.umi_extract.fastq.gz \ - --read2_out scrb_seq_fastq.2.umi_extract.fastq.gz \ + --read1_out scrb_seq_fastq.1_30.extract \ + --read2_out scrb_seq_fastq.2_30.extract \ --random_seed 1 echo ">> Checking if the correct files are present" -[[ ! -f "scrb_seq_fastq.1.umi_extract.fastq.gz" ]] || [[ ! -f "scrb_seq_fastq.2.umi_extract.fastq.gz" ]] && echo "Reads file missing" && exit 1 -[ ! -s "scrb_seq_fastq.1.umi_extract.fastq.gz" ] && echo "Read 1 file is empty" && exit 1 -[ ! -s "scrb_seq_fastq.2.umi_extract.fastq.gz" ] && echo "Read 2 file is empty" && exit 1 +[[ ! -f "scrb_seq_fastq.1_30.extract" ]] || [[ ! -f "scrb_seq_fastq.2_30.extract" ]] && echo "Reads file missing" && exit 1 +[ ! -s "scrb_seq_fastq.1_30.extract" ] && echo "Read 1 file is empty" && exit 1 +[ ! -s "scrb_seq_fastq.2_30.extract" ] && echo "Read 2 file is empty" && exit 1 echo ">> Checking if the files are correct" -diff -q <(gunzip -c "${meta_resources_dir}/scrb_seq_fastq.1.umi_extract.fastq.gz") <(gunzip -c "$test_dir/scrb_seq_fastq.1.umi_extract.fastq.gz") || \ +diff -q "${meta_resources_dir}/scrb_seq_fastq.1_30.extract" "$test_dir/scrb_seq_fastq.1_30.extract" || \ (echo "Read 1 file is not correct" && exit 1) -diff -q <(gunzip -c "${meta_resources_dir}/scrb_seq_fastq.2.umi_extract.fastq.gz") <(gunzip -c "$test_dir/scrb_seq_fastq.2.umi_extract.fastq.gz") || \ +diff -q "${meta_resources_dir}/scrb_seq_fastq.2_30.extract" "$test_dir/scrb_seq_fastq.2_30.extract" || \ (echo "Read 2 file is not correct" && exit 1) -rm scrb_seq_fastq.1.umi_extract.fastq.gz scrb_seq_fastq.2.umi_extract.fastq.gz +rm scrb_seq_fastq.1_30.extract scrb_seq_fastq.2_30.extract ############################################################################################################ echo ">>> Test 2: Testing for paired-end reads with umi_discard_reads option" "$meta_executable" \ --paired \ - --input "$test_dir/scrb_seq_fastq.1.gz" \ - --read2_in "$test_dir/scrb_seq_fastq.2.gz" \ + --input "$test_dir/scrb_seq_fastq.1_30" \ + --read2_in "$test_dir/scrb_seq_fastq.2_30" \ --bc_pattern CCCCCCNNNNNNNNNN \ --bc_pattern2 CCCCCCNNNNNNNNNN \ --umitools_extract_method string \ --umitools_umi_separator '_' \ --umitools_grouping_method directional \ --umi_discard_read 2 \ - --read1_out scrb_seq_fastq.1.umi_extract.fastq.gz \ + --read1_out scrb_seq_fastq.1_30.extract \ --random_seed 1 echo ">> Checking if the correct files are present" -[ ! -f "scrb_seq_fastq.1.umi_extract.fastq.gz" ] && echo "Read 1 file is missing" && exit 1 -[ ! -s "scrb_seq_fastq.1.umi_extract.fastq.gz" ] && echo "Read 1 file is empty" && exit 1 -[ -f "scrb_seq_fastq.2.umi_extract.fastq.gz" ] && echo "Read 2 is not discarded" && exit 1 +[ ! -f "scrb_seq_fastq.1_30.extract" ] && echo "Read 1 file is missing" && exit 1 +[ ! -s "scrb_seq_fastq.1_30.extract" ] && echo "Read 1 file is empty" && exit 1 +[ -f "scrb_seq_fastq.2_30.extract" ] && echo "Read 2 is not discarded" && exit 1 echo ">> Checking if the files are correct" -diff -q <(gunzip -c "${meta_resources_dir}/scrb_seq_fastq.1.umi_extract.fastq.gz") <(gunzip -c "$test_dir/scrb_seq_fastq.1.umi_extract.fastq.gz") || \ +diff -q "${meta_resources_dir}/scrb_seq_fastq.1_30.extract" "$test_dir/scrb_seq_fastq.1_30.extract" || \ (echo "Read 1 file is not correct" && exit 1) -rm scrb_seq_fastq.1.umi_extract.fastq.gz +rm scrb_seq_fastq.1_30.extract ############################################################################################################ echo ">>> Test 3: Testing for single-end reads" "$meta_executable" \ - --input "$test_dir/slim.fastq.gz" \ + --input "$test_dir/slim_30.fastq" \ --bc_pattern "^(?P.{3}).{4}(?P.{2})" \ --umitools_extract_method regex \ --umitools_umi_separator '_' \ --umitools_grouping_method directional \ - --read1_out slim.umi_extract.fastq.gz \ + --read1_out slim_30.extract \ --random_seed 1 echo ">> Checking if the correct files are present" -[ ! -f "slim.umi_extract.fastq.gz" ] && echo "Trimmed reads file missing" && exit 1 -[ ! -s "slim.umi_extract.fastq.gz" ] && echo "Trimmed reads file is empty" && exit 1 +[ ! -f "slim_30.extract" ] && echo "Trimmed reads file missing" && exit 1 +[ ! -s "slim_30.extract" ] && echo "Trimmed reads file is empty" && exit 1 echo ">> Checking if the files are correct" -diff -q <(gunzip -c "${meta_resources_dir}/slim.umi_extract.fastq.gz") <(gunzip -c "$test_dir/slim.umi_extract.fastq.gz") || \ +diff -q "${meta_resources_dir}/slim_30.extract" "$test_dir/slim_30.extract" || \ (echo "Trimmed reads file is not correct" && exit 1) -rm slim.umi_extract.fastq.gz +rm slim_30.extract echo ">>> Test finished successfully" exit 0 \ No newline at end of file diff --git a/src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.1.gz b/src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.1.gz deleted file mode 100644 index 715dffb6..00000000 Binary files a/src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.1.gz and /dev/null differ diff --git a/src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.1.umi_extract.fastq.gz b/src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.1.umi_extract.fastq.gz deleted file mode 100644 index 247cd9fc..00000000 Binary files a/src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.1.umi_extract.fastq.gz and /dev/null differ diff --git a/src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.1_30 b/src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.1_30 new file mode 100644 index 00000000..639f6243 --- /dev/null +++ b/src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.1_30 @@ -0,0 +1,120 @@ +@SRR1058032.1 HISEQ:653:H12WDADXX:1:1101:1210:2217 length=17 +AATAACTTCCCGCGTCG ++SRR1058032.1 HISEQ:653:H12WDADXX:1:1101:1210:2217 length=17 +@@@DDDBDDF>FFHGIB +@SRR1058032.2 HISEQ:653:H12WDADXX:1:1101:1191:2236 length=17 +AGCGGGGTGCTCGTCGT ++SRR1058032.2 HISEQ:653:H12WDADXX:1:1101:1191:2236 length=17 +CCCFFFFFHHHHHJJJJ +@SRR1058032.3 HISEQ:653:H12WDADXX:1:1101:1715:2245 length=17 +CTTTAGTACCAGTCCTT ++SRR1058032.3 HISEQ:653:H12WDADXX:1:1101:1715:2245 length=17 +BBCFFDADHHHHHHIJJ +@SRR1058032.4 HISEQ:653:H12WDADXX:1:1101:1905:2212 length=17 +AGGCGTTGTTTTTTTTT ++SRR1058032.4 HISEQ:653:H12WDADXX:1:1101:1905:2212 length=17 +CCCFFFFFHHHHHJJJJ +@SRR1058032.5 HISEQ:653:H12WDADXX:1:1101:1927:2237 length=17 +ATCGAGACATAATTGAT ++SRR1058032.5 HISEQ:653:H12WDADXX:1:1101:1927:2237 length=17 +@B@FFFFFHHHHHJJJJ +@SRR1058032.6 HISEQ:653:H12WDADXX:1:1101:1876:2243 length=17 +TGGGGGCGGTACATGAT ++SRR1058032.6 HISEQ:653:H12WDADXX:1:1101:1876:2243 length=17 +BBBFFFFFHHHHHJJJJ +@SRR1058032.7 HISEQ:653:H12WDADXX:1:1101:2491:2207 length=17 +CTATATGTTTGCGCTGT ++SRR1058032.7 HISEQ:653:H12WDADXX:1:1101:2491:2207 length=17 +1=BDFFFFHHHHHJJJJ +@SRR1058032.8 HISEQ:653:H12WDADXX:1:1101:2513:2219 length=17 +CTCCCGCATGCTGCTGT ++SRR1058032.8 HISEQ:653:H12WDADXX:1:1101:2513:2219 length=17 +?BBFFFFFHHHHHJJJJ +@SRR1058032.9 HISEQ:653:H12WDADXX:1:1101:2604:2231 length=17 +GAGCCCTGAGGGGATCT ++SRR1058032.9 HISEQ:653:H12WDADXX:1:1101:2604:2231 length=17 +1??DDDFD>DFDGFGHG +@SRR1058032.10 HISEQ:653:H12WDADXX:1:1101:2936:2218 length=17 +AGCGGGGTTCGCGGTTT ++SRR1058032.10 HISEQ:653:H12WDADXX:1:1101:2936:2218 length=17 +CCCFFFFFHHHHHJIJI +@SRR1058032.11 HISEQ:653:H12WDADXX:1:1101:3447:2241 length=17 +AGAATTGCCTGGATTTT ++SRR1058032.11 HISEQ:653:H12WDADXX:1:1101:3447:2241 length=17 +@CCFFFFAFHHHGJJJJ +@SRR1058032.12 HISEQ:653:H12WDADXX:1:1101:3620:2196 length=17 +AGGCGGGGCAACGGGTT ++SRR1058032.12 HISEQ:653:H12WDADXX:1:1101:3620:2196 length=17 +CCCFFFFFHHGHHJJHH +@SRR1058032.13 HISEQ:653:H12WDADXX:1:1101:3875:2206 length=17 +GTCCCCGCGTCGTGTAG ++SRR1058032.13 HISEQ:653:H12WDADXX:1:1101:3875:2206 length=17 +@C@FFFFFHFFGHJJJJ +@SRR1058032.14 HISEQ:653:H12WDADXX:1:1101:4131:2215 length=17 +CCACGCATTCACTCGGT ++SRR1058032.14 HISEQ:653:H12WDADXX:1:1101:4131:2215 length=17 +BBBDFFFFHHHHHJJJJ +@SRR1058032.15 HISEQ:653:H12WDADXX:1:1101:4284:2241 length=17 +TGCGCAATAAGCGCTAT ++SRR1058032.15 HISEQ:653:H12WDADXX:1:1101:4284:2241 length=17 ++:=DDDDDBHHGDIBEH +@SRR1058032.16 HISEQ:653:H12WDADXX:1:1101:4599:2232 length=17 +CGCTGGCAGAGCCCGGT ++SRR1058032.16 HISEQ:653:H12WDADXX:1:1101:4599:2232 length=17 +@BCFFFFFHHHHHJJJJ +@SRR1058032.17 HISEQ:653:H12WDADXX:1:1101:5428:2200 length=17 +AGGCGGTGCATAGTCTT ++SRR1058032.17 HISEQ:653:H12WDADXX:1:1101:5428:2200 length=17 +CCCFFFFFHHHHHIJIH +@SRR1058032.18 HISEQ:653:H12WDADXX:1:1101:5336:2218 length=17 +GTCCCCCGCGTGTGACT ++SRR1058032.18 HISEQ:653:H12WDADXX:1:1101:5336:2218 length=17 +GEGCDG9FD# +@SRR1058032.5 HISEQ:653:H12WDADXX:1:1101:1927:2237 length=34 +GTGTAGGGAAAGAGTGTAAGGAAAGAGTGTAGCN ++SRR1058032.5 HISEQ:653:H12WDADXX:1:1101:1927:2237 length=34 +?=??B?DB2ACCAEAEFHHIHHHIHFHCEHHIG# +@SRR1058032.6 HISEQ:653:H12WDADXX:1:1101:1876:2243 length=34 +CCTATATAGTATAGCTTCCCATCTTCTTTGAGAN ++SRR1058032.6 HISEQ:653:H12WDADXX:1:1101:1876:2243 length=34 +CCCFFFFFHDHBHEIIJJJJIIIJJJGGGIGIE# +@SRR1058032.7 HISEQ:653:H12WDADXX:1:1101:2491:2207 length=34 +ATTAAAGACAAACTACAACTCATATGAGGCATTN ++SRR1058032.7 HISEQ:653:H12WDADXX:1:1101:2491:2207 length=34 +@@@DDDADDHHHFBFAHIGBHHFAH;E@@?AB>F@BF3;3?1C?<# +@SRR1058032.11 HISEQ:653:H12WDADXX:1:1101:3447:2241 length=34 +CCCACACTCTTTCCCTACACGACGCTACACTCTN ++SRR1058032.11 HISEQ:653:H12WDADXX:1:1101:3447:2241 length=34 +@@@DDFDDBHBFHGI)C:D@@@B# +@SRR1058032.12 HISEQ:653:H12WDADXX:1:1101:3620:2196 length=34 +GTGTATGGAAAGAGTGTAGGGAAAGAGTGTAGGN ++SRR1058032.12 HISEQ:653:H12WDADXX:1:1101:3620:2196 length=34 +@@@DDDDAHHHFHIABEEEAB??CFBF?C@BFF# +@SRR1058032.13 HISEQ:653:H12WDADXX:1:1101:3875:2206 length=34 +CTCTTTCCCTACACTCTTTCCCTACACGACGCTN ++SRR1058032.13 HISEQ:653:H12WDADXX:1:1101:3875:2206 length=34 +@@@DDDAAADHDHDGDGIIIIIJJJJJJIJIIJ# +@SRR1058032.14 HISEQ:653:H12WDADXX:1:1101:4131:2215 length=34 +GTGTAGCGTCGTGTAGGGAAAGAGTGTGTGGAAN ++SRR1058032.14 HISEQ:653:H12WDADXX:1:1101:4131:2215 length=34 +@@@DDDDD?DFDCAEFHIGGFHEH:D1C:CG@F# +@SRR1058032.15 HISEQ:653:H12WDADXX:1:1101:4284:2241 length=34 +GTGTATGGAAAGAGTGTGCGTCGTACGTGTAGAN ++SRR1058032.15 HISEQ:653:H12WDADXX:1:1101:4284:2241 length=34 +@?@DDFFFHHHHGDAC:CHGGIIGIIIFHFGHB# +@SRR1058032.16 HISEQ:653:H12WDADXX:1:1101:4599:2232 length=34 +ACTCTTTCCCTACACTCTTTCCCTACACGACGCN ++SRR1058032.16 HISEQ:653:H12WDADXX:1:1101:4599:2232 length=34 +@@BCBE@9;EGGGGGIHJJIJHIGG# +@SRR1058032.17 HISEQ:653:H12WDADXX:1:1101:5428:2200 length=34 +GATTCTTCAAATGAGGACTATGCGGGACATGAAN ++SRR1058032.17 HISEQ:653:H12WDADXX:1:1101:5428:2200 length=34 +@@@DDDDDFHHFAHB;FHIIIIIIIIFHEHIHI# +@SRR1058032.18 HISEQ:653:H12WDADXX:1:1101:5336:2218 length=34 +GCGTCGTGTAGGGAAAGAGTGTAGCGTCGTGTAN ++SRR1058032.18 HISEQ:653:H12WDADXX:1:1101:5336:2218 length=34 +@@@DDDDDBHEGGFGHGGIEGII# +@SRR1058032.24 HISEQ:653:H12WDADXX:1:1101:5649:2244 length=34 +AGACGGACCAGAGCGAAAGCATTTGCCAAGAATN ++SRR1058032.24 HISEQ:653:H12WDADXX:1:1101:5649:2244 length=34 +CCCFFFDFGHHHGJIIJJIJHEDD919CGGHJ@# +@SRR1058032.25 HISEQ:653:H12WDADXX:1:1101:5910:2207 length=34 +GAGTATAGGGAAAGAGTTTTTTTTTTTTTTTTTN ++SRR1058032.25 HISEQ:653:H12WDADXX:1:1101:5910:2207 length=34 +?=?DDDD>AB:ACEEGHIJJIJJJJIIJJHFDD# +@SRR1058032.26 HISEQ:653:H12WDADXX:1:1101:5757:2217 length=34 +CCTTTTATACAATACAAAGCTTTGCTTTTTTTTN ++SRR1058032.26 HISEQ:653:H12WDADXX:1:1101:5757:2217 length=34 +???DDDDDDDDD4EEEII@A<:33<33,22110# +@SRR1058032.27 HISEQ:653:H12WDADXX:1:1101:5790:2248 length=34 +ATCACAGCTGGAGAGATCTTGATCTTCATGGTGN ++SRR1058032.27 HISEQ:653:H12WDADXX:1:1101:5790:2248 length=34 +CCCFFFFFHHFHGGIIIIJIEAHCEHHEFECGD# +@SRR1058032.28 HISEQ:653:H12WDADXX:1:1101:6079:2195 length=34 +GTACTAGGCATCGTCATCCAATGCGACGAGTCCN ++SRR1058032.28 HISEQ:653:H12WDADXX:1:1101:6079:2195 length=34 +@@CFFDDFHHGHHIJJJIJJJIGGHIDGGEGCDG9FD# +@SRR1058032.5_ATCGAGGTGTAG_ACATAATTGAGGAAAGAGTG HISEQ:653:H12WDADXX:1:1101:1927:2237 length=34 +TAAGGAAAGAGTGTAGCN ++ +FHHIHHHIHFHCEHHIG# +@SRR1058032.6_TGGGGGCCTATA_CGGTACATGATAGTATAGCT HISEQ:653:H12WDADXX:1:1101:1876:2243 length=34 +TCCCATCTTCTTTGAGAN ++ +JJJJIIIJJJGGGIGIE# +@SRR1058032.7_CTATATATTAAA_GTTTGCGCTGGACAAACTAC HISEQ:653:H12WDADXX:1:1101:2491:2207 length=34 +AACTCATATGAGGCATTN ++ +HIGBHHF@BF3;3?1C?<# +@SRR1058032.11_AGAATTCCCACA_GCCTGGATTTCTCTTTCCCT HISEQ:653:H12WDADXX:1:1101:3447:2241 length=34 +ACACGACGCTACACTCTN ++ +F@GFBFEE>)C:D@@@B# +@SRR1058032.12_AGGCGGGTGTAT_GGCAACGGGTGGAAAGAGTG HISEQ:653:H12WDADXX:1:1101:3620:2196 length=34 +TAGGGAAAGAGTGTAGGN ++ +EEEAB??CFBF?C@BFF# +@SRR1058032.13_GTCCCCCTCTTT_GCGTCGTGTACCCTACACTC HISEQ:653:H12WDADXX:1:1101:3875:2206 length=34 +TTTCCCTACACGACGCTN ++ +GIIIIIJJJJJJIJIIJ# +@SRR1058032.14_CCACGCGTGTAG_ATTCACTCGGCGTCGTGTAG HISEQ:653:H12WDADXX:1:1101:4131:2215 length=34 +GGAAAGAGTGTGTGGAAN ++ +HIGGFHEH:D1C:CG@F# +@SRR1058032.15_TGCGCAGTGTAT_ATAAGCGCTAGGAAAGAGTG HISEQ:653:H12WDADXX:1:1101:4284:2241 length=34 +TGCGTCGTACGTGTAGAN ++ +:CHGGIIGIIIFHFGHB# +@SRR1058032.16_CGCTGGACTCTT_CAGAGCCCGGTCCCTACACT HISEQ:653:H12WDADXX:1:1101:4599:2232 length=34 +CTTTCCCTACACGACGCN ++ +;EGGGGGIHJJIJHIGG# +@SRR1058032.17_AGGCGGGATTCT_TGCATAGTCTTCAAATGAGG HISEQ:653:H12WDADXX:1:1101:5428:2200 length=34 +ACTATGCGGGACATGAAN ++ +FHIIIIIIIIFHEHIHI# +@SRR1058032.18_GTCCCCGCGTCG_CGCGTGTGACTGTAGGGAAA HISEQ:653:H12WDADXX:1:1101:5336:2218 length=34 +GAGTGTAGCGTCGTGTAN ++ +DGF+<BHEGGFGHGGIEGII# +@SRR1058032.24_CGTTAAAGACGG_TAATTGTGGTACCAGAGCGA HISEQ:653:H12WDADXX:1:1101:5649:2244 length=34 +AAGCATTTGCCAAGAATN ++ +JJIJHEDD919CGGHJ@# +@SRR1058032.25_AAAAAAGAGTAT_AAAAAAAAAAAGGGAAAGAG HISEQ:653:H12WDADXX:1:1101:5910:2207 length=34 +TTTTTTTTTTTTTTTTTN ++ +HIJJIJJJJIIJJHFDD# +@SRR1058032.26_GCCGACCCTTTT_CAACGATTTTATACAATACA HISEQ:653:H12WDADXX:1:1101:5757:2217 length=34 +AAGCTTTGCTTTTTTTTN ++ +II@A<:33<33,22110# +@SRR1058032.27_AATCAAATCACA_GACCACTGAAGCTGGAGAGA HISEQ:653:H12WDADXX:1:1101:5790:2248 length=34 +TCTTGATCTTCATGGTGN ++ +IIJIEAHCEHHEFECGD# +@SRR1058032.28_CGCGCTGTACTA_TTTGTTTTTTGGCATCGTCA HISEQ:653:H12WDADXX:1:1101:6079:2195 length=34 +TCCAATGCGACGAGTCCN ++ +JIJJJIGGHIDG slim_30.fastq +head -n 120 scrb_seq_fastq.1 > scrb_seq_fastq.1_30 +head -n 120 scrb_seq_fastq.2 > scrb_seq_fastq.2_30 +rm slim.fastq scrb_seq_fastq.1 scrb_seq_fastq.2 + +# Generate expected output +# Test 1 and 2 +umi_tools extract \ + --stdin "scrb_seq_fastq.1_30" \ + --read2-in "scrb_seq_fastq.2_30" \ + --bc-pattern "CCCCCCNNNNNNNNNN" \ + --bc-pattern2 "CCCCCCNNNNNNNNNN" \ + --extract-method string \ + --stdout scrb_seq_fastq.1_30.extract \ + --read2-out scrb_seq_fastq.2_30.extract \ + --random-seed 1 + +# Test 3 +umi_tools extract \ + --stdin "slim_30.fastq" \ + --bc-pattern "^(?P.{3}).{4}(?P.{2})" \ + --extract-method regex \ + --stdout slim_30.extract \ + --random-seed 1 \ No newline at end of file diff --git a/src/umi_tools/umi_tools_extract/test_data/slim.fastq.gz b/src/umi_tools/umi_tools_extract/test_data/slim.fastq.gz deleted file mode 100644 index 04cd72ea..00000000 Binary files a/src/umi_tools/umi_tools_extract/test_data/slim.fastq.gz and /dev/null differ diff --git a/src/umi_tools/umi_tools_extract/test_data/slim.umi_extract.fastq.gz b/src/umi_tools/umi_tools_extract/test_data/slim.umi_extract.fastq.gz deleted file mode 100644 index 1d4f25c0..00000000 Binary files a/src/umi_tools/umi_tools_extract/test_data/slim.umi_extract.fastq.gz and /dev/null differ diff --git a/src/umi_tools/umi_tools_extract/test_data/slim_30.extract b/src/umi_tools/umi_tools_extract/test_data/slim_30.extract new file mode 100644 index 00000000..1c20f782 --- /dev/null +++ b/src/umi_tools/umi_tools_extract/test_data/slim_30.extract @@ -0,0 +1,120 @@ +@SRR2057595.7_CAGAA +GTTCTCTCGGTGGGACCTC ++ +FFFFHHHJJJFGIJIJJIJ +@SRR2057595.9_TTGAA +GTTCTCTGATGCCCTCTTCTGGTGCATCTGAAGACAGCTACAGTGTACTTAGATATAATAAATAAATCTT ++ +FDBDFHHIGGEHJGGIHGHGGCAFCHGIGEHIJJJJIJJJIHIIIIIIJIIIIIGHIIGGIJGIIJIIJ@ +@SRR2057595.14_TGGAT +GTTAGCGGCCCCGGGTTCCTCCCGGGGCTACGCCTGTCTGAGCGTCGCT ++ +FFFFHHHJJIJJJJIGHJJIIJJJJJIJHFHHFFEDEEEEDDDDBDDDD +@SRR2057595.22_ACGAT +GTTAGCGGCCCCGGGTTCCTCCCGGGGCTACGCCTGTCTGAGCGTCGC ++ +FFFFHHHJJJJJJJJIJJJJJJJJJJJJHHHFFFEDEEEEDDDDBDDD +@SRR2057595.23_GCGTT +GTTACCTAAGGCGAGCTCAGGGAGGACAGAAACCTCCCGTGGAGCAGAAGGGCAAAAGCTCGCTTGATCT ++ +FFFFHHHJJJJJJJJJJJJJJJIJJIIJJJJJJJJJJJJIJJHHHHHFFFFDDDDDDDDDDDDDDDDDDA +@SRR2057595.29_ACGTT +GTTCGCGGCCCCGGGTTCCTCCCGGGGCTACGCCTGTCTGAGCGTCGCTT ++ +FFFFHHHJJJJJJJJHIJJJJJJIJJJJHHHFFDEDEEDDCDDDBDDDDD +@SRR2057595.30_GAGAA +GTTGAATCCGTGCTAAGAAGAA ++ +DFFFHHHJJJJIJJJJJJJJJJ +@SRR2057595.33_TCGAT +GTTTCTCGTCTGATCTCGGAAGCTAAGCAGGGTCGGGCCTGGTTAGTACTTGGATGGGAGACCGCCTGGG ++ +FFFFHHHJJJJJJJJJJJJJJJJJJJJJJJJJDHIJJJJIJJJHGGEEHFFFFFFEDDEDDDDDDDDDDB +@SRR2057595.35_ACGCT +GTTACCCGGGGCTACGCCTGTCTGAGCGTCGCT ++ +DFFFHHHJJJJJJIJJJJJJIJJJJJJJHIIJJ +@SRR2057595.38_GGGCC +GTTATGCATGTTTATAGTTTCTAGTTTTGGCATTTTGTGTGGTCTCTTTTTTGTT ++ +DFFFHHHJJJJJJJJJJHJJIJJJIJJJJJJJJJJJJGIGHJHIJJIJJJJJJJJ +@SRR2057595.42_TAGGA +GTTGTAAGTTATACACTGACTAAGTCATCTGTTACTGCCTTCACTGAGTTTTTATTTCCTTT ++ +DFFFHHHJJJJJJJJJJJJJJJJJIIJJJJGJJJJJJJJJJJJJJJIIHIJJJJJJJIJJJI +@SRR2057595.45_CTGGC +GTTTTGCGGAAGGATCATTA ++ +DDDDFFDFFAGFEB@ACB9< +@SRR2057595.65_GCGCG +GTTTGAGCTTGCTCCGTCCACTCAACGCATCGACCTGGTATTGCAGTACCTCCAGGAACGGTGCACCAAG ++ +FFFFHHHJJJJJJHJIHHIIIIIIIJHJBHIHBFHHJI@EHJJHHHHHHHFFFBDE?AEBD=AB@CDBD? +@SRR2057595.67_AAGGT +GTTGTTTTGAGGTCCTGCTCGTGCAGGGT ++ +DDDFHHHHGFHGFGGIIDGHHIGIJJJJ9 +@SRR2057595.69_ATTAT +GGTTTTTGTTTTTCCTCCTTCTCTTTCTAAA ++ +FFFFHHHHJJJJJJJJJJJJJJJJJJIJIJJ +@SRR2057595.70_TTAAA +GGTTTTGTAATTTTATGAGGTCCCATTTGTCAATTCTT ++ +DDDD2CDFA@FBGHCCHFHGBFHGHIGGDHGHIIFCFF +@SRR2057595.71_TGCCA +GGTTTATTAGCATGGCCCCTGCGCAAGGATGACACGCAAATTCGTGAAGCGTTCCATATTT ++ +FFFFHGHHJJJJJJJJJJIIJJIJIJJIFHJIIIJJJIJJJJJJHIIHHHHFFFDEECEEE +@SRR2057595.73_TGACA +GGTTGCGAGTGCCTAGTGGGCCACTTTTGGTAAGCAGAACTGGCGCTGCGGGA ++ +FFFFGFFHC@EBHGHGAEGIIHIIIIJJJJGHIIIJIJIIGHIJIJJIGGEFD +@SRR2057595.74_AATTC +GGTTTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA ++ +FFFFDFFHFIJJJGGGGJJGDDDDDDDDDDDDDBDDDDDDBBDDDDDDDDDDDDDDDDDDDBBBDDDDBD> +@SRR2057595.77_GCGGA +GTTCTCCCACTTCTGAC ++ +FFFFDHHHIJJJIJJJJ +@SRR2057595.82_GAGAC +GGTTTTCCTCCCGGGGCTACGCCTGTCTGAGCGTCGCT ++ +FFFFHHHHJJJJJJJJJJJJJJJJIJJJJJIJIIJJJH +@SRR2057595.83_TGGAT +GTTGCCCGGGGCTACGCCTGTCTGAGCGTCGCT ++ +DFFFHHHJJIJJJJJJIJJJIJJIGGHIFHGEH +@SRR2057595.86_ACCAC +GGTTTTTTTTTAAATGTAAAGCATAAATAAAAAGCCTTTGTGGACTGTGAAAAAAAAAAAAAAAAAAAAAA ++ +FFFFHHHHJJJJJJJJIIJJJJJJJJJIJJJJJJJJJJJJGIJIIJJIJJJJJJHFDDDDDDDDDDDDDB> +@SRR2057595.88_TCAGC +GGTTCTAAGCATAGATAACCATATATCAGGGGGAGCTCCATGTTCTAGTCCTGCAAGCGCCTGGGCAATAA ++ +FFFFHHHHJJJJJJIJJJJJIJJJJJJIJJIJJIJJJJJJJJJHIJJJJJJIIIHJIHHHFFDDDDEDDD@ +@SRR2057595.99_TGACA +GGTTTCGCTGCGATCTATTGAAAGTCAGCCCTCGACACAAGGGTTTGT ++ +FFFFDHHHIHIIIJJIJJJJJIGEHGFHIJJGHIHADHIIJIJJJIJG diff --git a/src/umi_tools/umi_tools_extract/test_data/slim_30.fastq b/src/umi_tools/umi_tools_extract/test_data/slim_30.fastq new file mode 100644 index 00000000..444a7a7a --- /dev/null +++ b/src/umi_tools/umi_tools_extract/test_data/slim_30.fastq @@ -0,0 +1,120 @@ +@SRR2057595.7 +CAGGTTCAATCTCGGTGGGACCTC ++SRR2057595.7 +1=DFFFFHHHHHJJJFGIJIJJIJ +@SRR2057595.9 +TTGGTTCAATCTGATGCCCTCTTCTGGTGCATCTGAAGACAGCTACAGTGTACTTAGATATAATAAATAAATCTT ++SRR2057595.9 +4=DFDBDHHFHHIGGEHJGGIHGHGGCAFCHGIGEHIJJJJIJJJIHIIIIIIJIIIIIGHIIGGIJGIIJIIJ@ +@SRR2057595.14 +TGGGTTAATGCGGCCCCGGGTTCCTCCCGGGGCTACGCCTGTCTGAGCGTCGCT ++SRR2057595.14 +1=DFFFFHHHHHJJIJJJJIGHJJIIJJJJJIJHFHHFFEDEEEEDDDDBDDDD +@SRR2057595.22 +ACGGTTAATGCGGCCCCGGGTTCCTCCCGGGGCTACGCCTGTCTGAGCGTCGC ++SRR2057595.22 +1=DFFFFHHHHHJJJJJJJJIJJJJJJJJJJJJHHHFFFEDEEEEDDDDBDDD +@SRR2057595.23 +GCGGTTATTCCTAAGGCGAGCTCAGGGAGGACAGAAACCTCCCGTGGAGCAGAAGGGCAAAAGCTCGCTTGATCT ++SRR2057595.23 +1=DFFFFHHHHHJJJJJJJJJJJJJJJIJJIIJJJJJJJJJJJJIJJHHHHHFFFFDDDDDDDDDDDDDDDDDDA +@SRR2057595.29 +ACGGTTCTTGCGGCCCCGGGTTCCTCCCGGGGCTACGCCTGTCTGAGCGTCGCTT ++SRR2057595.29 +1=DFFFFHHHHHJJJJJJJJHIJJJJJJIJJJJHHHFFDEDEEDDCDDDBDDDDD +@SRR2057595.30 +GAGGTTGAAAATCCGTGCTAAGAAGAA ++SRR2057595.30 +4=DDFFFHHHHHJJJJIJJJJJJJJJJ +@SRR2057595.33 +TCGGTTTATCTCGTCTGATCTCGGAAGCTAAGCAGGGTCGGGCCTGGTTAGTACTTGGATGGGAGACCGCCTGGG ++SRR2057595.33 +1=DFFFFHHHHHJJJJJJJJJJJJJJJJJJJJJJJJJDHIJJJJIJJJHGGEEHFFFFFFEDDEDDDDDDDDDDB +@SRR2057595.35 +ACGGTTACTCCCGGGGCTACGCCTGTCTGAGCGTCGCT ++SRR2057595.35 +1=DDFFFHHHHHJJJJJJIJJJJJJIJJJJJJJHIIJJ +@SRR2057595.38 +GGGGTTACCTGCATGTTTATAGTTTCTAGTTTTGGCATTTTGTGTGGTCTCTTTTTTGTT ++SRR2057595.38 +1=DDFFFHHHHHJJJJJJJJJJHJJIJJJIJJJJJJJJJJJJGIGHJHIJJIJJJJJJJJ +@SRR2057595.42 +TAGGTTGGATAAGTTATACACTGACTAAGTCATCTGTTACTGCCTTCACTGAGTTTTTATTTCCTTT ++SRR2057595.42 +1=DDFFFHHHHHJJJJJJJJJJJJJJJJJIIJJJJGJJJJJJJJJJJJJJJIIHIJJJJJJJIJJJI +@SRR2057595.45 +CTGGTTTGCTGCGGAAGGATCATTA ++SRR2057595.45 +1:DDDDDDDFFDFFAGFEB@ACB9< +@SRR2057595.65 +GCGGTTTCGGAGCTTGCTCCGTCCACTCAACGCATCGACCTGGTATTGCAGTACCTCCAGGAACGGTGCACCAAG ++SRR2057595.65 +1=DFFFFHHHHHJJJJJJHJIHHIIIIIIIJHJBHIHBFHHJI@EHJJHHHHHHHFFFBDE?AEBD=AB@CDBD? +@SRR2057595.67 +AAGGTTGGTTTTTGAGGTCCTGCTCGTGCAGGGT ++SRR2057595.67 +1:BDDDFHFHHHHGFHGFGGIIDGHHIGIJJJJ9 +@SRR2057595.69 +ATTGGTTATTTTGTTTTTCCTCCTTCTCTTTCTAAA ++SRR2057595.69 +CCCFFFFFHHHHHJJJJJJJJJJJJJJJJJJIJIJJ +@SRR2057595.70 +TTAGGTTAATTGTAATTTTATGAGGTCCCATTTGTCAATTCTT ++SRR2057595.70 +@@@DDDDD+2CDFA@FBGHCCHFHGBFHGHIGGDHGHIIFCFF +@SRR2057595.71 +TGCGGTTCATATTAGCATGGCCCCTGCGCAAGGATGACACGCAAATTCGTGAAGCGTTCCATATTT ++SRR2057595.71 +CCCFFFFFHHGHHJJJJJJJJJJIIJJIJIJJIFHJIIIJJJIJJJJJJHIIHHHHFFFDEECEEE +@SRR2057595.73 +TGAGGTTCAGCGAGTGCCTAGTGGGCCACTTTTGGTAAGCAGAACTGGCGCTGCGGGA ++SRR2057595.73 +@@@FFFFFHGFFHC@EBHGHGAEGIIHIIIIJJJJGHIIIJIJIIGHIJIJJIGGEFD +@SRR2057595.74 +AATGGTTTCTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA ++SRR2057595.74 +@CCFFFFFGDFFHFIJJJGGGGJJGDDDDDDDDDDDDDBDDDDDDBBDDDDDDDDDDDDDDDDDDDBBBDDDDBD> +@SRR2057595.77 +GCGGTTCGATCCCACTTCTGAC ++SRR2057595.77 +1=DFFFFHGDHHHIJJJIJJJJ +@SRR2057595.82 +GAGGGTTACTTCCTCCCGGGGCTACGCCTGTCTGAGCGTCGCT ++SRR2057595.82 +CBCFFFFFHHHHHJJJJJJJJJJJJJJJJIJJJJJIJIIJJJH +@SRR2057595.83 +TGGGTTGATCCCGGGGCTACGCCTGTCTGAGCGTCGCT ++SRR2057595.83 +1=DDFFFHHHHHJJIJJJJJJIJJJIJJIGGHIFHGEH +@SRR2057595.86 +ACCGGTTACTTTTTTTAAATGTAAAGCATAAATAAAAAGCCTTTGTGGACTGTGAAAAAAAAAAAAAAAAAAAAAA ++SRR2057595.86 +BCCFFFFFHHHHHJJJJJJJJIIJJJJJJJJJIJJJJJJJJJJJJGIJIIJJIJJJJJJHFDDDDDDDDDDDDDB> +@SRR2057595.88 +TCAGGTTGCCTAAGCATAGATAACCATATATCAGGGGGAGCTCCATGTTCTAGTCCTGCAAGCGCCTGGGCAATAA ++SRR2057595.88 +CCCFFFFFHHHHHJJJJJJIJJJJJIJJJJJJIJJIJJIJJJJJJJJJHIJJJJJJIIIHJIHHHFFDDDDEDDD@ +@SRR2057595.99 +TGAGGTTCATCGCTGCGATCTATTGAAAGTCAGCCCTCGACACAAGGGTTTGT ++SRR2057595.99 +B@CFFFFFFDHHHIHIIIJJIJJJJJIGEHGFHIJJGHIHADHIIJIJJJIJG