diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml index 312d5078..0614cbb2 100644 --- a/src/umi_tools/umi_tools_dedup/config.vsh.yaml +++ b/src/umi_tools/umi_tools_dedup/config.vsh.yaml @@ -134,7 +134,7 @@ argument_groups: tags are “NH”, “X0” and “XT”. If not specified, the read with the highest mapping quality will be selected. - name: --read_length - type: integer + type: boolean_true description: | Use the read length as a criteria when deduping, for e.g sRNA-Seq. diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt index acbab88e..87baf322 100644 --- a/src/umi_tools/umi_tools_dedup/help.txt +++ b/src/umi_tools/umi_tools_dedup/help.txt @@ -33,15 +33,13 @@ Dedup Options: --output-stats= One can use the edit distance between UMIs at the same position as an quality control for the deduplication process by comparing with a null expectation of random sampling. For the random sampling, the observed frequency of UMIs is used to more reasonably model the null expectation. - + Use this option to generate a stats outfiles called: + [PREFIX]_stats_edit_distance.tsv + Reports the (binned) average edit distance between the UMIs at each position. In addition, this option will trigger reporting of further summary statistics for the UMIs which may be informative for selecting the optimal deduplication method or debugging. Each unique UMI sequence may be observed [0-many] times at multiple positions in the BAM. The following files report the distribution for the frequencies of each UMI. - - Use this option to generate a stats outfiles called: - [PREFIX]_stats_edit_distance.tsv - Reports the (binned) average edit distance between the UMIs at each position. [PREFIX]_stats_per_umi_per_position.tsv Tabulates the counts for unique combinations of UMI and position. 
[PREFIX]_stats_per_umi_per.tsv diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh index 00889600..8aa89d10 100644 --- a/src/umi_tools/umi_tools_dedup/script.sh +++ b/src/umi_tools/umi_tools_dedup/script.sh @@ -20,7 +20,7 @@ test_dir="${metal_executable}/test_data" [[ "$par_subset" == "false" ]] && unset par_subset [[ "$par_log2stderr" == "false" ]] && unset par_log2stderr [[ "$par_get_output_stats" == "false" ]] && unset par_get_output_stats - +[[ "$par_read_length" == "false" ]] && unset par_read_length umi_tools dedup \ --stdin "$par_input" \ @@ -43,7 +43,7 @@ umi_tools dedup \ ${par_spliced_is_unique:+--spliced-is-unique} \ ${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \ ${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \ - ${par_read_length:+--read-length "$par_read_length"} \ + ${par_read_length:+--read-length} \ ${par_per_gene:+--per-gene} \ ${par_gene_tag:+--gene-tag "$par_gene_tag"} \ ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \ diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh index 4b83ff5e..1b5a9053 100644 --- a/src/umi_tools/umi_tools_dedup/test.sh +++ b/src/umi_tools/umi_tools_dedup/test.sh @@ -7,27 +7,45 @@ mkdir -p "$out_dir" ############################################################################################ -echo ">>> Test 1: Basic usage of $meta_functionality_name" +echo ">>> Test 1: Basic usage of $meta_functionality_name with statistics output" "$meta_executable" \ --paired \ --input "$test_dir/sample.bam" \ --bai "$test_dir/sample.bam.bai" \ - --output "$out_dir/deduped.bam" + --output "$out_dir/deduped.bam" \ + --output_stats deduped \ + --random_seed 1 echo ">>> Checking whether output exists" [ ! -f "$out_dir/deduped.bam" ] && echo "File 'deduped.bam' does not exist!" && exit 1 +[ ! 
-f "$out_dir/deduped_edit_distance.tsv" ] && echo "File 'deduped_edit_distance.tsv' does not exist!" && exit 1 echo ">>> Checking whether output is non-empty" [ ! -s "$out_dir/deduped.bam" ] && echo "File 'deduped.bam' is empty!" && exit 1 +[ ! -s "$out_dir/deduped_edit_distance.tsv" ] && echo "File 'deduped_edit_distance.tsv' is empty!" && exit 1 echo ">>> Checking whether output is correct" diff "$out_dir/deduped.bam" "$test_dir/deduped.bam" || \ (echo "Output file deduped.bam does not match expected output" && exit 1) - +diff "$out_dir/deduped_edit_distance.tsv" "$test_dir/dedup_edit_distance.tsv" || \ + (echo "Output file deduped_edit_distance.tsv does not match expected output" && exit 1) ############################################################################################ +echo ">>> Test 2: $meta_functionality_name" + +"$meta_executable" \ + --paired \ + --input "$test_dir/sample.bam" \ + --bai "$test_dir/sample.bam.bai" \ + --output "$out_dir/deduped.bam" \ + --random_seed 1 + + +echo ">>> Checking whether output exists" +[ ! -f "$out_dir/deduped.bam" ] && echo "File 'deduped.bam' does not exist!" && exit 1 +[ ! -s "$out_dir/deduped.bam" ] && echo "File 'deduped.bam' is empty!" && exit 1 rm -rf "$out_dir" echo "All tests succeeded!" 
diff --git a/src/umi_tools/umi_tools_dedup/test_data/dedup_edit_distance.tsv b/src/umi_tools/umi_tools_dedup/test_data/dedup_edit_distance.tsv new file mode 100644 index 00000000..89684b04 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/test_data/dedup_edit_distance.tsv @@ -0,0 +1,5 @@ +unique unique_null directional directional_null edit_distance +3 3 4 4 Single_UMI +0 1 0 0 0 +1 0 0 0 1 +0 0 0 0 2 diff --git a/src/umi_tools/umi_tools_dedup/test_data/dedup_per_umi.tsv b/src/umi_tools/umi_tools_dedup/test_data/dedup_per_umi.tsv new file mode 100644 index 00000000..a1d364e2 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/test_data/dedup_per_umi.tsv @@ -0,0 +1,6 @@ +UMI median_counts_pre times_observed_pre total_counts_pre median_counts_post times_observed_post total_counts_post +ACCGGTTTA 74 1 74 74 1 74 +ACTGGTTTC 48 1 48 49 1 49 +AGCGGTTAC 1 1 1 1 1 1 +CCAGGTTCT 1 1 1 1 1 1 +TCTGGTTTC 1 1 1 0 0 0 diff --git a/src/umi_tools/umi_tools_dedup/test_data/dedup_per_umi_per_position.tsv b/src/umi_tools/umi_tools_dedup/test_data/dedup_per_umi_per_position.tsv new file mode 100644 index 00000000..d9211d0a --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/test_data/dedup_per_umi_per_position.tsv @@ -0,0 +1,5 @@ +counts instances_pre instances_post +1 3 2 +48 1 0 +49 0 1 +74 1 1 diff --git a/src/umi_tools/umi_tools_dedup/test_data/deduped.bam b/src/umi_tools/umi_tools_dedup/test_data/deduped.bam index 5ffd9fc2..a82e6c81 100644 Binary files a/src/umi_tools/umi_tools_dedup/test_data/deduped.bam and b/src/umi_tools/umi_tools_dedup/test_data/deduped.bam differ diff --git a/src/umi_tools/umi_tools_dedup/test_data/sample.bam b/src/umi_tools/umi_tools_dedup/test_data/sample.bam index 929d93b0..32192fc6 100644 Binary files a/src/umi_tools/umi_tools_dedup/test_data/sample.bam and b/src/umi_tools/umi_tools_dedup/test_data/sample.bam differ diff --git a/src/umi_tools/umi_tools_dedup/test_data/sample.bam.bai b/src/umi_tools/umi_tools_dedup/test_data/sample.bam.bai index 
bd87cde8..e9e2eee1 100644 Binary files a/src/umi_tools/umi_tools_dedup/test_data/sample.bam.bai and b/src/umi_tools/umi_tools_dedup/test_data/sample.bam.bai differ diff --git a/src/umi_tools/umi_tools_dedup/test_data/script.sh b/src/umi_tools/umi_tools_dedup/test_data/script.sh index 534c4af2..2253a0d1 100755 --- a/src/umi_tools/umi_tools_dedup/test_data/script.sh +++ b/src/umi_tools/umi_tools_dedup/test_data/script.sh @@ -2,6 +2,7 @@ # Download test data wget https://github.com/CGATOxford/UMI-tools/releases/download/v0.2.3/example.bam -samtools view -b -o sample.bam -s 0.00005 example.bam -samtools index sample.bam > sample.bam.bai +# extract the first 150 lines of the SAM stream (header lines included), with a maximum of two reads having the same start position +samtools view -h example.bam | head -n 150 | samtools view -bS - > sample.bam +samtools index sample.bam rm example.bam \ No newline at end of file