Skip to content

Commit

Permalink
Reduce the size of the test data and add back missing arguments to co…
Browse files Browse the repository at this point in the history
…nfig
  • Loading branch information
emmarousseau committed Jul 11, 2024
1 parent fe2bbe3 commit 48e5fb4
Show file tree
Hide file tree
Showing 16 changed files with 801 additions and 31 deletions.
23 changes: 22 additions & 1 deletion src/umi_tools/umi_tools_extract/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ argument_groups:
description: |
Write out read pairs not matching regex pattern or cell barcode whitelist to this file.
- name: Extract Options
- name: Extract Options
arguments:
- name: --umitools_extract_method
alternatives: --extract_method
Expand Down Expand Up @@ -115,6 +115,27 @@ argument_groups:
description: |
Ignore "/1" and "/2" read name suffixes. Note that this option is required if the suffixes are not whitespace
separated from the rest of the read name.
arguments:
- name: --umitools_umi_separator
alternatives: --umi_separator
type: string
description: |
The character that separates the UMI in the read name. Most likely a colon if you skipped the extraction with
UMI-tools and used other software. Default: `_`
example: "_"
- name: --umitools_grouping_method
type: string
choices: [unique, percentile, cluster, adjacency, directional]
description: |
Method to use to determine read groups by subsuming those with similar UMIs. All methods start by identifying
the reads with the same mapping position, but treat similar yet nonidentical UMIs differently. Default: `directional`
example: "directional"
- name: --umi_discard_read
type: integer
choices: [0, 1, 2]
description: |
After UMI barcode extraction discard either R1 or R2 by setting this parameter to 1 or 2, respectively. Default: `0`
example: 0

- name: Common Options
arguments:
Expand Down
11 changes: 6 additions & 5 deletions src/umi_tools/umi_tools_extract/script.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
## VIASH START
## VIASH END

set -eo pipefail
set -exo pipefail

test_dir="${metal_executable}/test_data"

Expand All @@ -28,9 +28,10 @@ if [ -n "$par_paired" ]; then
fi
else # For single-end reads, check that we have only one read file, one pattern
if [ -n "$par_read2_in" ] || [ -n "$par_bc_pattern2" ]; then
echo "Single end input requires one read file and one UMI pattern"
echo "Single end input requires only one read file and one UMI pattern"
exit 1
elif [ "$par_umi_discard_read" != 0 ]; then
# if par_umi_discard_read is set (non-empty) and not 0:
elif [ -n "$par_umi_discard_read" ] && [ "$par_umi_discard_read" != 0 ]; then
echo "umi_discard_read is only valid when processing paired end reads."
exit 1
fi
Expand Down Expand Up @@ -71,10 +72,10 @@ umi_tools extract \
${par_error:+--error "$par_error"}


if [ $par_umi_discard_read == 1 ]; then
if [ "$par_umi_discard_read" == 1 ]; then
# discard read 1
rm "$par_read1_out"
elif [ $par_umi_discard_read == 2 ]; then
elif [ "$par_umi_discard_read" == 2 ]; then
# discard read 2 (-f to bypass file existence check)
rm -f "$par_read2_out"
fi
48 changes: 24 additions & 24 deletions src/umi_tools/umi_tools_extract/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,80 +9,80 @@ echo ">>> Testing $meta_functionality_name"
echo ">>> Test 1: Testing for paired-end reads"
"$meta_executable" \
--paired \
--input "$test_dir/scrb_seq_fastq.1.gz"\
--read2_in "$test_dir/scrb_seq_fastq.2.gz" \
--input "$test_dir/scrb_seq_fastq.1_30"\
--read2_in "$test_dir/scrb_seq_fastq.2_30" \
--bc_pattern "CCCCCCNNNNNNNNNN"\
--bc_pattern2 "CCCCCCNNNNNNNNNN" \
--umitools_extract_method string \
--umitools_umi_separator '_' \
--umitools_grouping_method directional \
--umi_discard_read 0 \
--read1_out scrb_seq_fastq.1.umi_extract.fastq.gz \
--read2_out scrb_seq_fastq.2.umi_extract.fastq.gz \
--read1_out scrb_seq_fastq.1_30.extract \
--read2_out scrb_seq_fastq.2_30.extract \
--random_seed 1

echo ">> Checking if the correct files are present"
[[ ! -f "scrb_seq_fastq.1.umi_extract.fastq.gz" ]] || [[ ! -f "scrb_seq_fastq.2.umi_extract.fastq.gz" ]] && echo "Reads file missing" && exit 1
[ ! -s "scrb_seq_fastq.1.umi_extract.fastq.gz" ] && echo "Read 1 file is empty" && exit 1
[ ! -s "scrb_seq_fastq.2.umi_extract.fastq.gz" ] && echo "Read 2 file is empty" && exit 1
[[ ! -f "scrb_seq_fastq.1_30.extract" ]] || [[ ! -f "scrb_seq_fastq.2_30.extract" ]] && echo "Reads file missing" && exit 1
[ ! -s "scrb_seq_fastq.1_30.extract" ] && echo "Read 1 file is empty" && exit 1
[ ! -s "scrb_seq_fastq.2_30.extract" ] && echo "Read 2 file is empty" && exit 1


echo ">> Checking if the files are correct"
diff -q <(gunzip -c "${meta_resources_dir}/scrb_seq_fastq.1.umi_extract.fastq.gz") <(gunzip -c "$test_dir/scrb_seq_fastq.1.umi_extract.fastq.gz") || \
diff -q "${meta_resources_dir}/scrb_seq_fastq.1_30.extract" "$test_dir/scrb_seq_fastq.1_30.extract" || \
(echo "Read 1 file is not correct" && exit 1)
diff -q <(gunzip -c "${meta_resources_dir}/scrb_seq_fastq.2.umi_extract.fastq.gz") <(gunzip -c "$test_dir/scrb_seq_fastq.2.umi_extract.fastq.gz") || \
diff -q "${meta_resources_dir}/scrb_seq_fastq.2_30.extract" "$test_dir/scrb_seq_fastq.2_30.extract" || \
(echo "Read 2 file is not correct" && exit 1)

rm scrb_seq_fastq.1.umi_extract.fastq.gz scrb_seq_fastq.2.umi_extract.fastq.gz
rm scrb_seq_fastq.1_30.extract scrb_seq_fastq.2_30.extract

############################################################################################################

echo ">>> Test 2: Testing for paired-end reads with umi_discard_reads option"
"$meta_executable" \
--paired \
--input "$test_dir/scrb_seq_fastq.1.gz" \
--read2_in "$test_dir/scrb_seq_fastq.2.gz" \
--input "$test_dir/scrb_seq_fastq.1_30" \
--read2_in "$test_dir/scrb_seq_fastq.2_30" \
--bc_pattern CCCCCCNNNNNNNNNN \
--bc_pattern2 CCCCCCNNNNNNNNNN \
--umitools_extract_method string \
--umitools_umi_separator '_' \
--umitools_grouping_method directional \
--umi_discard_read 2 \
--read1_out scrb_seq_fastq.1.umi_extract.fastq.gz \
--read1_out scrb_seq_fastq.1_30.extract \
--random_seed 1

echo ">> Checking if the correct files are present"
[ ! -f "scrb_seq_fastq.1.umi_extract.fastq.gz" ] && echo "Read 1 file is missing" && exit 1
[ ! -s "scrb_seq_fastq.1.umi_extract.fastq.gz" ] && echo "Read 1 file is empty" && exit 1
[ -f "scrb_seq_fastq.2.umi_extract.fastq.gz" ] && echo "Read 2 is not discarded" && exit 1
[ ! -f "scrb_seq_fastq.1_30.extract" ] && echo "Read 1 file is missing" && exit 1
[ ! -s "scrb_seq_fastq.1_30.extract" ] && echo "Read 1 file is empty" && exit 1
[ -f "scrb_seq_fastq.2_30.extract" ] && echo "Read 2 is not discarded" && exit 1

echo ">> Checking if the files are correct"
diff -q <(gunzip -c "${meta_resources_dir}/scrb_seq_fastq.1.umi_extract.fastq.gz") <(gunzip -c "$test_dir/scrb_seq_fastq.1.umi_extract.fastq.gz") || \
diff -q "${meta_resources_dir}/scrb_seq_fastq.1_30.extract" "$test_dir/scrb_seq_fastq.1_30.extract" || \
(echo "Read 1 file is not correct" && exit 1)

rm scrb_seq_fastq.1.umi_extract.fastq.gz
rm scrb_seq_fastq.1_30.extract

############################################################################################################

echo ">>> Test 3: Testing for single-end reads"
"$meta_executable" \
--input "$test_dir/slim.fastq.gz" \
--input "$test_dir/slim_30.fastq" \
--bc_pattern "^(?P<umi_1>.{3}).{4}(?P<umi_2>.{2})" \
--umitools_extract_method regex \
--umitools_umi_separator '_' \
--umitools_grouping_method directional \
--read1_out slim.umi_extract.fastq.gz \
--read1_out slim_30.extract \
--random_seed 1

echo ">> Checking if the correct files are present"
[ ! -f "slim.umi_extract.fastq.gz" ] && echo "Trimmed reads file missing" && exit 1
[ ! -s "slim.umi_extract.fastq.gz" ] && echo "Trimmed reads file is empty" && exit 1
[ ! -f "slim_30.extract" ] && echo "Trimmed reads file missing" && exit 1
[ ! -s "slim_30.extract" ] && echo "Trimmed reads file is empty" && exit 1

echo ">> Checking if the files are correct"
diff -q <(gunzip -c "${meta_resources_dir}/slim.umi_extract.fastq.gz") <(gunzip -c "$test_dir/slim.umi_extract.fastq.gz") || \
diff -q "${meta_resources_dir}/slim_30.extract" "$test_dir/slim_30.extract" || \
(echo "Trimmed reads file is not correct" && exit 1)

rm slim.umi_extract.fastq.gz
rm slim_30.extract

echo ">>> Test finished successfully"
exit 0
Binary file not shown.
Binary file not shown.
120 changes: 120 additions & 0 deletions src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.1_30
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
@SRR1058032.1 HISEQ:653:H12WDADXX:1:1101:1210:2217 length=17
AATAACTTCCCGCGTCG
+SRR1058032.1 HISEQ:653:H12WDADXX:1:1101:1210:2217 length=17
@@@DDDBDDF>FFHGIB
@SRR1058032.2 HISEQ:653:H12WDADXX:1:1101:1191:2236 length=17
AGCGGGGTGCTCGTCGT
+SRR1058032.2 HISEQ:653:H12WDADXX:1:1101:1191:2236 length=17
CCCFFFFFHHHHHJJJJ
@SRR1058032.3 HISEQ:653:H12WDADXX:1:1101:1715:2245 length=17
CTTTAGTACCAGTCCTT
+SRR1058032.3 HISEQ:653:H12WDADXX:1:1101:1715:2245 length=17
BBCFFDADHHHHHHIJJ
@SRR1058032.4 HISEQ:653:H12WDADXX:1:1101:1905:2212 length=17
AGGCGTTGTTTTTTTTT
+SRR1058032.4 HISEQ:653:H12WDADXX:1:1101:1905:2212 length=17
CCCFFFFFHHHHHJJJJ
@SRR1058032.5 HISEQ:653:H12WDADXX:1:1101:1927:2237 length=17
ATCGAGACATAATTGAT
+SRR1058032.5 HISEQ:653:H12WDADXX:1:1101:1927:2237 length=17
@B@FFFFFHHHHHJJJJ
@SRR1058032.6 HISEQ:653:H12WDADXX:1:1101:1876:2243 length=17
TGGGGGCGGTACATGAT
+SRR1058032.6 HISEQ:653:H12WDADXX:1:1101:1876:2243 length=17
BBBFFFFFHHHHHJJJJ
@SRR1058032.7 HISEQ:653:H12WDADXX:1:1101:2491:2207 length=17
CTATATGTTTGCGCTGT
+SRR1058032.7 HISEQ:653:H12WDADXX:1:1101:2491:2207 length=17
1=BDFFFFHHHHHJJJJ
@SRR1058032.8 HISEQ:653:H12WDADXX:1:1101:2513:2219 length=17
CTCCCGCATGCTGCTGT
+SRR1058032.8 HISEQ:653:H12WDADXX:1:1101:2513:2219 length=17
?BBFFFFFHHHHHJJJJ
@SRR1058032.9 HISEQ:653:H12WDADXX:1:1101:2604:2231 length=17
GAGCCCTGAGGGGATCT
+SRR1058032.9 HISEQ:653:H12WDADXX:1:1101:2604:2231 length=17
1??DDDFD>DFDGFGHG
@SRR1058032.10 HISEQ:653:H12WDADXX:1:1101:2936:2218 length=17
AGCGGGGTTCGCGGTTT
+SRR1058032.10 HISEQ:653:H12WDADXX:1:1101:2936:2218 length=17
CCCFFFFFHHHHHJIJI
@SRR1058032.11 HISEQ:653:H12WDADXX:1:1101:3447:2241 length=17
AGAATTGCCTGGATTTT
+SRR1058032.11 HISEQ:653:H12WDADXX:1:1101:3447:2241 length=17
@CCFFFFAFHHHGJJJJ
@SRR1058032.12 HISEQ:653:H12WDADXX:1:1101:3620:2196 length=17
AGGCGGGGCAACGGGTT
+SRR1058032.12 HISEQ:653:H12WDADXX:1:1101:3620:2196 length=17
CCCFFFFFHHGHHJJHH
@SRR1058032.13 HISEQ:653:H12WDADXX:1:1101:3875:2206 length=17
GTCCCCGCGTCGTGTAG
+SRR1058032.13 HISEQ:653:H12WDADXX:1:1101:3875:2206 length=17
@C@FFFFFHFFGHJJJJ
@SRR1058032.14 HISEQ:653:H12WDADXX:1:1101:4131:2215 length=17
CCACGCATTCACTCGGT
+SRR1058032.14 HISEQ:653:H12WDADXX:1:1101:4131:2215 length=17
BBBDFFFFHHHHHJJJJ
@SRR1058032.15 HISEQ:653:H12WDADXX:1:1101:4284:2241 length=17
TGCGCAATAAGCGCTAT
+SRR1058032.15 HISEQ:653:H12WDADXX:1:1101:4284:2241 length=17
+:=DDDDDBHHGDIBEH
@SRR1058032.16 HISEQ:653:H12WDADXX:1:1101:4599:2232 length=17
CGCTGGCAGAGCCCGGT
+SRR1058032.16 HISEQ:653:H12WDADXX:1:1101:4599:2232 length=17
@BCFFFFFHHHHHJJJJ
@SRR1058032.17 HISEQ:653:H12WDADXX:1:1101:5428:2200 length=17
AGGCGGTGCATAGTCTT
+SRR1058032.17 HISEQ:653:H12WDADXX:1:1101:5428:2200 length=17
CCCFFFFFHHHHHIJIH
@SRR1058032.18 HISEQ:653:H12WDADXX:1:1101:5336:2218 length=17
GTCCCCCGCGTGTGACT
+SRR1058032.18 HISEQ:653:H12WDADXX:1:1101:5336:2218 length=17
<BBFFFFFHHHHHIIJJ
@SRR1058032.19 HISEQ:653:H12WDADXX:1:1101:5397:2220 length=17
TATAGAAAAAACTTTTT
+SRR1058032.19 HISEQ:653:H12WDADXX:1:1101:5397:2220 length=17
B@BFDDFFGHHFHIJIJ
@SRR1058032.20 HISEQ:653:H12WDADXX:1:1101:5605:2194 length=17
CATTATGGGCTTATTTT
+SRR1058032.20 HISEQ:653:H12WDADXX:1:1101:5605:2194 length=17
BBBFFFFFHHHHHJJJJ
@SRR1058032.21 HISEQ:653:H12WDADXX:1:1101:5519:2196 length=17
AAATGTGCAGTTCAGAT
+SRR1058032.21 HISEQ:653:H12WDADXX:1:1101:5519:2196 length=17
BCCFFFFFHHHHHJJJJ
@SRR1058032.22 HISEQ:653:H12WDADXX:1:1101:5705:2220 length=17
TGGGGGCTAAAGGGACT
+SRR1058032.22 HISEQ:653:H12WDADXX:1:1101:5705:2220 length=17
BBBDFFFFHHHHHJIJI
@SRR1058032.23 HISEQ:653:H12WDADXX:1:1101:5558:2236 length=17
GATAATACTTACGGTGT
+SRR1058032.23 HISEQ:653:H12WDADXX:1:1101:5558:2236 length=17
CCCFFFFFHHHHHJFHI
@SRR1058032.24 HISEQ:653:H12WDADXX:1:1101:5649:2244 length=17
CGTTAATAATTGTGGTT
+SRR1058032.24 HISEQ:653:H12WDADXX:1:1101:5649:2244 length=17
BBBFFFFFHHHHHIIHG
@SRR1058032.25 HISEQ:653:H12WDADXX:1:1101:5910:2207 length=17
AAAAAAAAAAAAAAAAA
+SRR1058032.25 HISEQ:653:H12WDADXX:1:1101:5910:2207 length=17
@CCFFFFFGHAA<:46'
@SRR1058032.26 HISEQ:653:H12WDADXX:1:1101:5757:2217 length=17
GCCGACCAACGATTTTT
+SRR1058032.26 HISEQ:653:H12WDADXX:1:1101:5757:2217 length=17
:=?DD@?DH;AFBFDFF
@SRR1058032.27 HISEQ:653:H12WDADXX:1:1101:5790:2248 length=17
AATCAAGACCACTGAAT
+SRR1058032.27 HISEQ:653:H12WDADXX:1:1101:5790:2248 length=17
@CCFFFFFHHHHHJJJI
@SRR1058032.28 HISEQ:653:H12WDADXX:1:1101:6079:2195 length=17
CGCGCTTTTGTTTTTTT
+SRR1058032.28 HISEQ:653:H12WDADXX:1:1101:6079:2195 length=17
BB@FFFFFHHHHHJJJJ
@SRR1058032.29 HISEQ:653:H12WDADXX:1:1101:6133:2213 length=17
AAATACTTTGAGGGAAT
+SRR1058032.29 HISEQ:653:H12WDADXX:1:1101:6133:2213 length=17
@CCFFEFFHHFHGJJII
@SRR1058032.30 HISEQ:653:H12WDADXX:1:1101:6651:2198 length=17
AGCGGGGTTTTATCGGT
+SRR1058032.30 HISEQ:653:H12WDADXX:1:1101:6651:2198 length=17
CCCFFFFDHHHHHHJJJ
Loading

0 comments on commit 48e5fb4

Please sign in to comment.