From 38f586bec0ac9e4312b016e29c3aa0bd53f292b2 Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Thu, 11 Apr 2024 11:04:14 +0100
Subject: [PATCH 01/12] initial commit dedup

---
 CHANGELOG.md                                  |   3 +
 src/umi_tools/umi_tools_dedup/config.vsh.yaml | 279 ++++++++++++++++++
 src/umi_tools/umi_tools_dedup/help.txt        |  13 +
 src/umi_tools/umi_tools_dedup/script.sh       |  65 ++++
 src/umi_tools/umi_tools_dedup/test.sh         |  49 +++
 5 files changed, 409 insertions(+)
 create mode 100644 src/umi_tools/umi_tools_dedup/config.vsh.yaml
 create mode 100644 src/umi_tools/umi_tools_dedup/help.txt
 create mode 100644 src/umi_tools/umi_tools_dedup/script.sh
 create mode 100644 src/umi_tools/umi_tools_dedup/test.sh

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4fd7f001..1bef9345 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -39,6 +39,9 @@
     - `samtools/flagstat`: Counts the number of alignments in SAM/BAM/CRAM files for each FLAG type (PR #31).
     - `samtools/idxstats`: Reports alignment summary statistics for a SAM/BAM/CRAM file (PR #32).
 
+* `umi_tools`:
+    - `umi_tools/umi_tools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #).
+    
 ## MAJOR CHANGES
 
 ## MINOR CHANGES
diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
new file mode 100644
index 00000000..75306541
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
@@ -0,0 +1,279 @@
+name: umi_tools_dedup
+namespace: umi_tools
+description: |
+  Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read.
+keywords: [umi_tools, deduplication, dedup]
+links:
+  homepage: https://umi-tools.readthedocs.io/en/latest/
+  documentation: [ https://umi-tools.readthedocs.io/en/latest/reference/dedup.html,
+                   https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options ]
+  repository: https://github.com/CGATOxford/UMI-tools
+references:
+  doi: 10.1101/gr.209601.116
+license: MIT
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: --input
+        alternatives: -I
+        type: file
+        description: Input BAM or SAM file. Use --in_sam to specify SAM format.
+        required: true
+      - name: --in_sam
+        type: boolean_true
+        description: |
+          By default, inputs are assumed to be in BAM format. Use this option
+          to specify the use of SAM format for input.
+      - name: --bai
+        type: file
+        description: BAM index
+      - name: --get_output_stats
+        type: boolean  # NOTE(review): presumably takes an explicit true/false value, unlike the boolean_true switches
+        description: Whether or not to generate output stats.
+      - name: --random_seed
+        type: integer
+        description: |
+          Random seed to initialize number generator with.
+          When omitted, no seed is passed on to umi_tools.
+
+  - name: Outputs
+    arguments:
+      - name: --output
+        alternatives: -S
+        type: file
+        description: Deduplicated BAM file
+        required: true
+        direction: output
+      - name: --out_sam
+        type: boolean_true
+        description: |
+          By default, outputs are written in BAM format. Use this option to
+          specify the use of SAM format for output.
+      - name: --paired
+        type: boolean_true
+        description: |
+          BAM is paired end - output both read pairs. This will also force the
+          use of the template length to determine reads with the same mapping
+          coordinates.
+      - name: --output_stats
+        type: file
+        description: Directory containing UMI based deduplication statistics files
+        direction: output
+      - name: --extract_umi_method
+        type: string
+        description: |
+          Specify the method by which the barcodes were encoded in the read.
+          The options are: [read_id, tag, umis].
+        default: read_id
+      - name: --umi_tag
+        type: string
+        description: |
+          The tag containing the UMI sequence.
+          This is only required if the extract_umi_method is set to tag.
+      - name: --umi_separator
+        type: string
+        description: |
+          The separator used to separate the UMI from the read sequence.
+          This is only required if the extract_umi_method is set to read_id.
+        default: '_'
+      - name: --umi_tag_split
+        type: string
+        description: |
+          Separate the UMI in tag by <SPLIT> and take the first element.
+      - name: --umi_tag_delimiter
+        type: string
+        description: |
+          Separate the UMI in tag by <DELIMITER> and concatenate the elements
+      - name: --cell_tag
+        type: string
+        description: |
+          The tag containing the cell barcode sequence.
+          This is only required if the extract_umi_method is set to tag.
+      - name: --cell_tag_split
+        type: string
+        description: |
+          Separate the cell barcode in tag by <SPLIT> and take the first element.
+      - name: --cell_tag_delimiter
+        type: string
+        description: |
+          Separate the cell barcode in tag by <DELIMITER> and concatenate the elements
+
+  - name: Grouping Options
+    arguments:
+      - name: --method
+        type: string
+        description: |
+          The method to use for grouping reads. The options are:
+          [unique, percentile, cluster, adjacency, directional].
+        default: directional
+      - name: --edit_distance_threshold
+        type: integer
+        description: |
+          For the adjacency and cluster methods the threshold for the edit
+          distance to connect two UMIs in the network can be increased. The
+          default value of 1 works best unless the UMI is very long (>14bp).
+        default: 1
+      - name: --spliced_is_unique
+        type: boolean_true
+        description: |
+          Causes two reads that start in the same position on the same strand
+          and having the same UMI to be considered unique if one is spliced
+          and the other is not. (Uses the ‘N’ cigar operation to test for splicing).
+      - name: --soft_clip_threshold
+        type: integer
+        description: |
+          Mappers that soft clip will sometimes do so rather than mapping a
+          spliced read if there is only a small overhang over the exon junction.
+          By setting this option, you can treat reads with at least this many
+          bases soft-clipped at the 3’ end as spliced.
+        default: 4
+      - name: --multimapping_detection_method
+        type: string
+        description: |
+          If the sam/bam contains tags to identify multimapping reads, you can
+          specify for use when selecting the best read at a given loci. Supported
+          tags are “NH”, “X0” and “XT”. If not specified, the read with the highest
+          mapping quality will be selected.
+      - name: --read_length
+        type: integer
+        description: |
+          Use the read length as a criteria when deduping, for e.g sRNA-Seq.
+
+  - name: Single-cell RNA-Seq Options
+    arguments:
+      - name: --per_gene
+        type: boolean_true
+        description: |
+          Reads will be grouped together if they have the same gene. This is useful
+          if your library prep generates PCR duplicates with non identical alignment
+          positions such as CEL-Seq. Note this option is hardcoded to be on with the
+          count command. I.e counting is always performed per-gene. Must be combined
+          with either --gene_tag or --per_contig option.
+      - name: --gene_tag
+        type: string
+        description: |
+          Deduplicate per gene. The gene information is encoded in the bam read tag
+          specified.
+      - name: --assigned_status_tag
+        type: string
+        description: |
+          BAM tag which describes whether a read is assigned to a gene. Defaults to
+          the same value as given for --gene_tag.
+      - name: --skip_tags_regex
+        type: string
+        description: |
+          Use in conjunction with the --assigned_status_tag option to skip any reads
+          where the tag matches this regex. Default ("^[__|Unassigned]") matches
+          anything which starts with “__” or “Unassigned”.
+      - name: --per_contig
+        type: boolean_true
+        description: |
+          Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same
+          contig will be considered to have the same alignment position. This is
+          useful if you have aligned to a reference transcriptome with one
+          transcript per gene. If you have aligned to a transcriptome with more
+          than one transcript per gene, you can supply a map between transcripts
+          and gene using the --gene_transcript_map option.
+      - name: --gene_transcript_map
+        type: file
+        description: |
+          A file containing a mapping between gene names and transcript names.
+          The file should be tab separated with the gene name in the first column
+          and the transcript name in the second column.
+      - name: --per_cell
+        type: boolean_true
+        description: |
+          Reads will only be grouped together if they have the same cell barcode.
+          Can be combined with --per_gene.
+
+  - name: SAM/BAM Options
+    arguments:
+      - name: --mapping_quality
+        type: integer
+        description: |
+          Minimum mapping quality (MAPQ) for a read to be retained.
+        default: 0
+      - name: --unmapped_reads
+        type: string
+        description: |
+          How unmapped reads should be handled.
+          The options are:
+          "discard": Discard all unmapped reads.
+          "use":     If read2 is unmapped, deduplicate using read1 only.
+                     Requires --paired.
+          "output":  Output unmapped reads/read pairs without UMI
+                     grouping/deduplication. Only available in umi_tools group.
+        default: discard
+      - name: --chimeric_pairs
+        type: string
+        description: |
+          How chimeric pairs should be handled.
+          The options are:
+          "discard": Discard all chimeric read pairs.
+          "use":     Deduplicate using read1 only.
+          "output":  Output chimeric pairs without UMI grouping/deduplication.
+                     Only available in umi_tools group.
+        default: use
+      - name: --unapired_reads  # NOTE(review): misspells "unpaired"; script.sh reads par_unapired_reads, so rename both together
+        type: string
+        description: |
+          How unpaired reads should be handled.
+          The options are:
+          "discard": Discard all unpaired reads.
+          "use":     Deduplicate using read1 only.
+          "output":  Output unpaired reads without UMI grouping/deduplication.
+                     Only available in umi_tools group.
+        default: use
+      - name: --ignore_umi
+        type: boolean_true
+        description: |
+          Ignore the UMI and group reads using mapping coordinates only.
+      - name: --subset
+        type: boolean_true
+        description: |
+          Only consider a fraction of the reads, chosen at random. This is useful
+          for doing saturation analyses.
+      - name: --chrom
+        type: string
+        description: |
+          Only consider a single chromosome. This is useful for debugging/testing
+          purposes.
+
+  - name: Group/Dedup Options
+    arguments:
+      - name: --no_sort_output
+        type: boolean_true
+        description: |
+          By default, output is sorted. This involves the use of a temporary unsorted
+          file (saved in --temp-dir). Use this option to turn off sorting.
+      - name: --buffer_whole_contig
+        type: boolean_true
+        description: |
+          Forces dedup to parse an entire contig before yielding any reads for
+          deduplication. This is the only way to absolutely guarantee that all reads
+          with the same start position are grouped together for deduplication since
+          dedup uses the start position of the read, not the alignment coordinate on
+          which the reads are sorted. However, by default, dedup reads for another
+          1000bp before outputting read groups which will avoid any reads being missed
+          with short read sequencing (<1000bp).
+
+# Wrapper script, test resources and the container/runner setup for the component.
+resources:
+  - type: bash_script
+    path: script.sh
+test_resources:
+  - type: bash_script
+    path: test.sh
+  - type: file
+    path: test_data
+engines:
+  - type: docker
+    image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1
+    setup:
+      - type: docker
+        run: |
+            umi_tools -v | sed 's/ version//g' > /var/software_versions.txt
+runners:
+  - type: executable
+  - type: nextflow
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt
new file mode 100644
index 00000000..d3c8fa44
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/help.txt
@@ -0,0 +1,13 @@
+```
+umi_tools dedup
+```
+
+dedup - Deduplicate reads using UMI and mapping coordinates
+
+Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM]
+
+       note: If --stdout is ommited, standard out is output. To
+             generate a valid BAM file on standard out, please
+             redirect log with --log=LOGFILE or --log2stderr 
+
+For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh
new file mode 100644
index 00000000..57c01258
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/script.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+
+# Wrapper that maps the Viash `par_*` variables onto `umi_tools dedup` options.
+# Parameters that are unset or empty are not forwarded to the tool.
+
+set -e
+
+# The `${var:+...}` expansions below emit their flag for ANY non-empty value,
+# so a switch that arrives as the literal string "false" must be unset first.
+for flag in paired in_sam out_sam spliced_is_unique per_gene per_contig \
+    per_cell no_sort_output buffer_whole_contig ignore_umi subset; do
+    var="par_${flag}"
+    if [[ "${!var}" == "false" ]]; then unset "$var"; fi
+done
+
+# The umi_tools dedup reference lists no --get-output-stats or --bai option, so
+# neither is forwarded. `--get_output_stats false` suppresses the statistics.
+if [[ "$par_get_output_stats" == "false" ]]; then unset par_output_stats; fi
+
+# NOTE: `par_unapired_reads` keeps the (misspelled) argument name declared in
+# config.vsh.yaml; the option it is forwarded to is umi_tools' --unpaired-reads.
+# Every option line except the last must end in a backslash continuation.
+umi_tools dedup \
+    -I "$par_input" \
+    ${par_in_sam:+--in-sam} \
+    ${par_random_seed:+--random-seed "$par_random_seed"} \
+    -S "$par_output" \
+    ${par_out_sam:+--out-sam} \
+    ${par_paired:+--paired} \
+    ${par_output_stats:+--output-stats "$par_output_stats"} \
+    ${par_extract_umi_method:+--extract-umi-method "$par_extract_umi_method"} \
+    ${par_umi_tag:+--umi-tag "$par_umi_tag"} \
+    ${par_umi_separator:+--umi-separator "$par_umi_separator"} \
+    ${par_umi_tag_split:+--umi-tag-split "$par_umi_tag_split"} \
+    ${par_umi_tag_delimiter:+--umi-tag-delimiter "$par_umi_tag_delimiter"} \
+    ${par_cell_tag:+--cell-tag "$par_cell_tag"} \
+    ${par_cell_tag_split:+--cell-tag-split "$par_cell_tag_split"} \
+    ${par_cell_tag_delimiter:+--cell-tag-delimiter "$par_cell_tag_delimiter"} \
+    ${par_method:+--method "$par_method"} \
+    ${par_edit_distance_threshold:+--edit-distance-threshold "$par_edit_distance_threshold"} \
+    ${par_spliced_is_unique:+--spliced-is-unique} \
+    ${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \
+    ${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \
+    ${par_read_length:+--read-length "$par_read_length"} \
+    ${par_per_gene:+--per-gene} \
+    ${par_gene_tag:+--gene-tag "$par_gene_tag"} \
+    ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \
+    ${par_skip_tags_regex:+--skip-tags-regex "$par_skip_tags_regex"} \
+    ${par_per_contig:+--per-contig} \
+    ${par_gene_transcript_map:+--gene-transcript-map "$par_gene_transcript_map"} \
+    ${par_per_cell:+--per-cell} \
+    ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \
+    ${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \
+    ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \
+    ${par_unapired_reads:+--unpaired-reads "$par_unapired_reads"} \
+    ${par_ignore_umi:+--ignore-umi} \
+    ${par_subset:+--subset} \
+    ${par_chrom:+--chrom "$par_chrom"} \
+    ${par_no_sort_output:+--no-sort-output} \
+    ${par_buffer_whole_contig:+--buffer-whole-contig}
+
+exit 0
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh
new file mode 100644
index 00000000..1459ec08
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/test.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# Smoke test for the component: runs the executable on the BAM files that the
+# original test referenced and checks that a non-empty output file is written.
+# TODO: add reference outputs to test_data and compare against them; the
+# previous `diff` compared against samtools idxstats files, which can never
+# match the output of a deduplication run.
+
+set -e
+
+test_dir="${meta_resources_dir}/test_data"
+
+# Write results to a scratch directory rather than into the test resources,
+# and remove it again when the script exits.
+out_dir="$(mktemp -d)"
+trap 'rm -rf "$out_dir"' EXIT
+
+# check_output FILE: abort the whole test run unless FILE exists and is non-empty.
+check_output() {
+  echo ">>> Checking whether output exists"
+  [ -f "$1" ] || { echo "File '$1' does not exist!"; exit 1; }
+  echo ">>> Checking whether output is non-empty"
+  [ -s "$1" ] || { echo "File '$1' is empty!"; exit 1; }
+}
+
+############################################################################################
+
+echo ">>> Testing $meta_functionality_name"
+
+"$meta_executable" \
+  --input "$test_dir/a.sorted.bam" \
+  --bai "$test_dir/a.sorted.bam.bai" \
+  --output "$out_dir/a.dedup.bam"
+
+check_output "$out_dir/a.dedup.bam"
+
+############################################################################################
+
+echo ">>> Testing $meta_functionality_name on the paired-end test file"
+
+"$meta_executable" \
+  --input "$test_dir/test.paired_end.sorted.bam" \
+  --bai "$test_dir/test.paired_end.sorted.bam.bai" \
+  --output "$out_dir/test.paired_end.dedup.bam"
+
+check_output "$out_dir/test.paired_end.dedup.bam"
+
+echo "All tests succeeded!"
+exit 0
\ No newline at end of file

From 2c269682620a407803e528652198646435ef2c03 Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Thu, 11 Apr 2024 11:38:57 +0100
Subject: [PATCH 02/12] Revert "initial commit dedup"

This reverts commit 38f586bec0ac9e4312b016e29c3aa0bd53f292b2.
---
 CHANGELOG.md                                  |   3 -
 src/umi_tools/umi_tools_dedup/config.vsh.yaml | 279 ------------------
 src/umi_tools/umi_tools_dedup/help.txt        |  13 -
 src/umi_tools/umi_tools_dedup/script.sh       |  65 ----
 src/umi_tools/umi_tools_dedup/test.sh         |  49 ---
 5 files changed, 409 deletions(-)
 delete mode 100644 src/umi_tools/umi_tools_dedup/config.vsh.yaml
 delete mode 100644 src/umi_tools/umi_tools_dedup/help.txt
 delete mode 100644 src/umi_tools/umi_tools_dedup/script.sh
 delete mode 100644 src/umi_tools/umi_tools_dedup/test.sh

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1bef9345..4fd7f001 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -39,9 +39,6 @@
     - `samtools/flagstat`: Counts the number of alignments in SAM/BAM/CRAM files for each FLAG type (PR #31).
     - `samtools/idxstats`: Reports alignment summary statistics for a SAM/BAM/CRAM file (PR #32).
 
-* `umi_tools`:
-    - `umi_tools/umi_tools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #).
-    
 ## MAJOR CHANGES
 
 ## MINOR CHANGES
diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
deleted file mode 100644
index 75306541..00000000
--- a/src/umi_tools/umi_tools_dedup/config.vsh.yaml
+++ /dev/null
@@ -1,279 +0,0 @@
-name: umi_tool_dedup
-namespace: umi_tools
-description: |
-  Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read.
-keywords: [umi_tools, deduplication, dedup]
-links:
-  homepage: https://umi-tools.readthedocs.io/en/latest/
-  documentation: [ https://umi-tools.readthedocs.io/en/latest/reference/dedup.html,
-                   https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options ]
-  repository: https://github.com/CGATOxford/UMI-tools
-references: 
-  doi: 10.1101/gr.209601.116
-license: MIT
-
-argument_groups:
-  - name: Inputs
-    arguments:
-      - name: --input
-        alternatives: -I
-        type: file
-        description: Input BAM or SAM file. Use --in_sam to specify SAM format.
-        required: true
-      - name: --in_sam
-        type: boolean_true
-        description: |
-          By default, inputs are assumed to be in BAM format. Use this options 
-          to specify the use of SAM format for input.
-      - name: --bai
-        type: file
-        description: BAM index
-      - name: --get_output_stats
-        type: boolean
-        description: Whether or not to generate output stats. 
-      - name: --random_seed
-        type: integer
-        description: |
-          Random seed to initialize number generator with.
-        default: none
-
-  - name: Outputs
-    arguments:
-      - name: --output
-        alternatives: -S
-        type: file
-        description: Deduplicated BAM file
-        required: true
-        direction: output
-      - name: --out_sam
-        type: boolean_true
-        description: |
-          By default, outputa are written in BAM format. Use this options to 
-          specify the use of SAM format for output.
-      - name: --paired
-        type: boolean_true
-        description: |
-          BAM is paired end - output both read pairs. This will also force the
-          use of the template length to determine reads with the same mapping
-          coordinates.
-      - name: --output_stats
-        type: file
-        description: Directory containing UMI based deduplication statistics files
-        direction: output
-      - name: --extract_umi_method
-        type: string
-        description: |
-          Specify the method by which the barcodes were encoded in the read.
-          The options are: [read_id, tag, umis].
-        default: read_id
-      - name: --umi_tag
-        type: string
-        description: |
-          The tag containing the UMI sequence. 
-          This is only required if the extract_umi_method is set to tag.
-      - name: --umi_separator
-        type: string
-        description: |
-          The separator used to separate the UMI from the read sequence. 
-          This is only required if the extract_umi_method is set to id_read.
-        default: '_'
-      - name: --umi_tag_split
-        type: string
-        description: |
-          Separate the UMI in tag by <SPLIT> and take the first element.
-      - name: --umi_tag_delimiter
-        type: string
-        description: |
-          Separate the UMI in by <DELIMITER> and concatenate the elements
-      - name: --cell_tag
-        type: string
-        description: |
-          The tag containing the cell barcode sequence. 
-          This is only required if the extract_umi_method is set to tag.
-      - name: --cell_tag_split
-        type: string
-        description: |
-          Separate the cell barcode in tag by <SPLIT> and take the first element.
-      - name: --cell_tag_delimiter
-        type: string
-        description: |
-          Separate the cell barcode in by <DELIMITER> and concatenate the elements
-  
-  - name: Grouping Options
-    arguments:    
-      - name: --method
-        type: string
-        description: |
-          The method to use for grouping reads. The options are: 
-          [unique, percentile, cluster, adjacency, directional].
-        default: directional
-      - name: --edit_distance_threshold
-        type: integer
-        description: |
-          For the adjacency and cluster methods the threshold for the edit 
-          distance to connect two UMIs in the network can be increased. The 
-          default value of 1 works best unless the UMI is very long (>14bp).
-        default: 1
-      - name: --spliced_is_unique
-        type: boolean_true
-        description: |
-          Causes two reads that start in the same position on the same strand
-          and having the same UMI to be considered unique if one is spliced 
-          and the other is not. (Uses the ‘N’ cigar operation to test for splicing).
-      - name: --soft_clip_threshold
-        type: integer
-        description: |
-          Mappers that soft clip will sometimes do so rather than mapping a
-          spliced read if there is only a small overhang over the exon junction.
-          By setting this option, you can treat reads with at least this many
-          bases soft-clipped at the 3’ end as spliced.
-        default: 4
-      - name: --multimapping_detection_method
-        type: string
-        description: |
-          If the sam/bam contains tags to identify multimapping reads, you can
-          specify for use when selecting the best read at a given loci. Supported
-          tags are “NH”, “X0” and “XT”. If not specified, the read with the highest
-          mapping quality will be selected.
-      - name: --read_length
-        type: integer
-        description: |
-          Use the read length as a criteria when deduping, for e.g sRNA-Seq.
-  
-  - name: Single-cell RNA-Seq Options
-    arguments:
-      - name: --per_gene
-        type: boolean_true
-        description: |
-          Reads will be grouped together if they have the same gene. This is useful
-          if your library prep generates PCR duplicates with non identical alignment
-          positions such as CEL-Seq. Note this option is hardcoded to be on with the
-          count command. I.e counting is always performed per-gene. Must be combined
-          with either --gene_tag or --per_contig option.
-      - name: --gene_tag
-        type: string
-        description: |
-          Deduplicate per gene. The gene information is encoded in the bam read tag
-          specified.
-      - name: --assigned_status_tag
-        type: string
-        description: |
-          BAM tag which describes whether a read is assigned to a gene. Defaults to
-          the same value as given for --gene_tag.
-      - name: --skip_tags_regex
-        type: string
-        description: |
-          Use in conjunction with the --assigned_status_tag option to skip any reads
-          where the tag matches this regex. Default ("^[__|Unassigned]") matches
-          anything which starts with “__” or “Unassigned”.
-      - name: --per_contig
-        type: boolean_true
-        description: |
-          Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same
-          contig will be considered to have the same alignment position. This is
-          useful if you have aligned to a reference transcriptome with one
-          transcript per gene. If you have aligned to a transcriptome with more
-          than one transcript per gene, you can supply a map between transcripts
-          and gene using the --gene_transcript_map option.
-      - name: --gene_transcript_map
-        type: file
-        description: |
-          A file containing a mapping between gene names and transcript names.
-          The file should be tab separated with the gene name in the first column
-          and the transcript name in the second column.
-      - name: --per_cell
-        type: boolean_true
-        description: |
-          Reads will only be grouped together if they have the same cell barcode.
-          Can be combined with --per_gene.
-  
-  - name: SAM/BAM Options
-    arguments:
-      - name: --mapping_quality
-        type: integer
-        description: |
-          Minimium mapping quality (MAPQ) for a read to be retained.
-        default: 0
-      - name: --unmapped_reads
-        type: string
-        description: |
-          How unmapped reads should be handled. 
-          The options are:
-          "discard": Discard all unmapped reads.
-          "use":     If read2 is unmapped, deduplicate using read1 only. 
-                     Requires --paired.
-          "output":  Output unmapped reads/read pairs without UMI 
-                     grouping/deduplication. Only available in umi_tools group.
-        default: discard
-      - name: --chimeric_pairs
-        type: string
-        description: |
-          How chimeric pairs should be handled. 
-          The options are:
-          "discard": Discard all chimeric read pairs.
-          "use":     Deduplicate using read1 only.
-          "output":  Output chimeric pairs without UMI grouping/deduplication. 
-                     Only available in umi_tools group.
-        default: use
-      - name: --unapired_reads
-        type: string
-        description: |
-          How unpaired reads should be handled. 
-          The options are:
-          "discard": Discard all unpaired reads.
-          "use":     Deduplicate using read1 only.
-          "output":  Output unpaired reads without UMI grouping/deduplication. 
-                     Only available in umi_tools group.
-        default: use
-      - name: --ignore_umi
-        type: boolean_true
-        description: |
-          Ignore the UMI and group reads using mapping coordinates only.
-      - name: --subset
-        type: boolean_true
-        description: |
-          Only consider a fraction of the reads, chosen at random. This is useful
-          for doing saturation analyses.
-      - name: --chrom
-        type: string
-        description: |
-          Only consider a single chromosome. This is useful for debugging/testing 
-          purposes.
-  
-  - name: Group/Dedup Options
-    arguments:
-      - name: --no_sort_output
-        type: boolean_true
-        description: |
-          By default, output is sorted. This involves the use of a temporary unsorted
-          file (saved in --temp-dir). Use this option to turn off sorting.
-      - name: --buffer_whole_contig
-        type: boolean_true
-        description: |
-          Forces dedup to parse an entire contig before yielding any reads for
-          deduplication. This is the only way to absolutely guarantee that all reads
-          with the same start position are grouped together for deduplication since
-          dedup uses the start position of the read, not the alignment coordinate on
-          which the reads are sorted. However, by default, dedup reads for another
-          1000bp before outputting read groups which will avoid any reads being missed
-          with short read sequencing (<1000bp).
-
-
-resources:
-  - type: bash_script
-    path: script.sh
-test_resources:
-  - type: bash_script
-    path: test.sh
-  - type: file
-    path: test_data
-engines:
-  - type: docker
-    image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1
-    setup:
-      - type: docker
-        run: |
-            umi_tools -v | sed 's/ version//g' > /var/software_versions.txt
-runners:
-- type: executable
-- type: nextflow
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt
deleted file mode 100644
index d3c8fa44..00000000
--- a/src/umi_tools/umi_tools_dedup/help.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-```
-umi_tools dedup
-```
-
-dedup - Deduplicate reads using UMI and mapping coordinates
-
-Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM]
-
-       note: If --stdout is ommited, standard out is output. To
-             generate a valid BAM file on standard out, please
-             redirect log with --log=LOGFILE or --log2stderr 
-
-For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh
deleted file mode 100644
index 57c01258..00000000
--- a/src/umi_tools/umi_tools_dedup/script.sh
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/bin/bash
-
-## VIASH START
-## VIASH END
-
-set -e
-
-test_dir="${metal_executable}/test_data"
-
-[[ "$par_paired" == "false" ]] && unset par_paired
-[[ "$par_in_sam" == "false" ]] && unset par_in_sam
-[[ "$par_out_sam" == "false" ]] && unset par_out_sam
-[[ "$par_spliced_is_unique" == "false" ]] && unset par_spliced_is_unique
-[[ "$par_per_gene" == "false" ]] && unset par_per_gene
-[[ "$par_per_contig" == "false" ]] && unset par_per_contig
-[[ "$par_per_cell" == "false" ]] && unset par_per_cell
-[[ "$par_no_sort_output" == "false" ]] && unset par_no_sort_output
-[[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig
-[[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi
-[[ "$par_subset" == "false" ]] && unset par_subset
-
-
-$(which umi_tools) dedup \
-    -I "$par_input" \
-    ${par_in_sam:+--in-sam} \
-    ${par_bai:+--bai "$par_bai"} \
-    ${par_get_output_stats:+--get-output-stats} \
-    ${par_random_seed:+--random-seed "$par_random_seed"} \
-    -S "$par_output" \
-    ${par_out_sam:+--out-sam} \
-    ${par_paired:+--paired} \
-    ${par_output_stats:+--output-stats "$par_output_stats"} \
-    ${par_extract_umi_method:+--extract-umi-method "$par_extract_umi_method"} \
-    ${par_umi_tag:+--umi-tag "$par_umi_tag"} \
-    ${par_umi_separator:+--umi-separator "$par_umi_separator"} \
-    ${par_umi_tag_split:+--umi-tag-split "$par_umi_tag_split"} \
-    ${par_umi_tag_delimiter:+--umi-tag-delimiter "$par_umi_tag_delimiter"} \
-    ${par_cell_tag:+--cell-tag "$par_cell_tag"} \
-    ${par_cell_tag_split:+--cell-tag-split "$par_cell_tag_split"} \
-    ${par_cell_tag_delimiter:+--cell-tag-delimiter "$par_cell_tag_delimiter"} \
-    ${par_method:+--method "$par_method"} \
-    ${par_edit_distance_threshold:+--edit-distance-threshold "$par_edit_distance_threshold"} \
-    ${par_spliced_is_unique:+--spliced-is-unique} \
-    ${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \
-    ${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \
-    ${par_read_length:+--read-length "$par_read_length"} \
-    ${par_per_gene:+--per-gene} \
-    ${par_gene_tag:+--gene-tag "$par_gene_tag"} \
-    ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \
-    ${par_skip_tags_regex:+--skip-tags-regex "$par_skip_tags_regex"} \
-    ${par_per_contig:+--per-contig}
-    ${par_gene_transcript_map:+--gene-transcript-map "$par_gene_transcript_map"} \
-    ${par_per_cell:+--per-cell} \
-    ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \
-    ${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \
-    ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \
-    ${par_unapired_reads:+--unapired-reads "$par_unapired_reads"} \
-    ${par_ignore_umi:+--ignore-umi} \
-    ${par_subset:+--subset} \
-    ${par_chrom:+--chrom "$par_chrom"} \
-    ${par_no_sort_output:+--no-sort-output} \
-    ${par_buffer_whole_contig:+--buffer-whole-contig}
-
-
-exit 0
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh
deleted file mode 100644
index 1459ec08..00000000
--- a/src/umi_tools/umi_tools_dedup/test.sh
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/bin/bash
-
-test_dir="${meta_resources_dir}/test_data"
-echo ">>> Testing $meta_functionality_name"
-
-"$meta_executable" \
-  --bam "$test_dir/a.sorted.bam" \
-  --bai "$test_dir/a.sorted.bam.bai" \
-  --output "$test_dir/a.sorted.idxstats"
-
-echo ">>> Checking whether output exists"
-[ ! -f "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' does not exist!" && exit 1
-
-echo ">>> Checking whether output is non-empty"
-[ ! -s "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' is empty!" && exit 1
-
-echo ">>> Checking whether output is correct"
-diff "$test_dir/a.sorted.idxstats" "$test_dir/a_ref.sorted.idxstats" || \
-    (echo "Output file a.sorted.idxstats does not match expected output" && exit 1)
-
-rm "$test_dir/a.sorted.idxstats"
-
-############################################################################################
-
-echo ">>> Testing $meta_functionality_name with singletons in the input"
-
-"$meta_executable" \
-  --bam "$test_dir/test.paired_end.sorted.bam" \
-  --bai "$test_dir/test.paired_end.sorted.bam.bai" \
-  --output "$test_dir/test.paired_end.sorted.idxstats"
-
-echo ">>> Checking whether output exists"
-[ ! -f "$test_dir/test.paired_end.sorted.idxstats" ] && \
-    echo "File 'test.paired_end.sorted.idxstats' does not exist!" && exit 1
-
-echo ">>> Checking whether output is non-empty"
-[ ! -s "$test_dir/test.paired_end.sorted.idxstats" ] && \
-    echo "File 'test.paired_end.sorted.idxstats' is empty!" && exit 1
-
-echo ">>> Checking whether output is correct"
-diff "$test_dir/test.paired_end.sorted.idxstats" "$test_dir/test_ref.paired_end.sorted.idxstats" || \
-    (echo "Output file test.paired_end.sorted.idxstats does not match expected output" && exit 1)
-
-rm "$test_dir/test.paired_end.sorted.idxstats"
-
-############################################################################################
-
-echo "All tests succeeded!"
-exit 0
\ No newline at end of file

From 0deebc544565f251a92bd50d53829452eb96fcaf Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Sat, 13 Apr 2024 17:25:01 +0100
Subject: [PATCH 03/12] initial commit dedup

---
 src/umi_tools/umi_tools_dedup/config.vsh.yaml | 279 ++++++++++++++++++
 src/umi_tools/umi_tools_dedup/help.txt        |  13 +
 src/umi_tools/umi_tools_dedup/script.sh       |  65 ++++
 src/umi_tools/umi_tools_dedup/test.sh         |  49 +++
 4 files changed, 406 insertions(+)
 create mode 100644 src/umi_tools/umi_tools_dedup/config.vsh.yaml
 create mode 100644 src/umi_tools/umi_tools_dedup/help.txt
 create mode 100644 src/umi_tools/umi_tools_dedup/script.sh
 create mode 100644 src/umi_tools/umi_tools_dedup/test.sh

diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
new file mode 100644
index 00000000..75306541
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
@@ -0,0 +1,279 @@
+name: umi_tool_dedup
+namespace: umi_tools
+description: |
+  Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read.
+keywords: [umi_tools, deduplication, dedup]
+links:
+  homepage: https://umi-tools.readthedocs.io/en/latest/
+  documentation: [ https://umi-tools.readthedocs.io/en/latest/reference/dedup.html,
+                   https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options ]
+  repository: https://github.com/CGATOxford/UMI-tools
+references: 
+  doi: 10.1101/gr.209601.116
+license: MIT
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: --input
+        alternatives: -I
+        type: file
+        description: Input BAM or SAM file. Use --in_sam to specify SAM format.
+        required: true
+      - name: --in_sam
+        type: boolean_true
+        description: |
+          By default, inputs are assumed to be in BAM format. Use this options 
+          to specify the use of SAM format for input.
+      - name: --bai
+        type: file
+        description: BAM index
+      - name: --get_output_stats
+        type: boolean
+        description: Whether or not to generate output stats. 
+      - name: --random_seed
+        type: integer
+        description: |
+          Random seed to initialize number generator with.
+        default: none
+
+  - name: Outputs
+    arguments:
+      - name: --output
+        alternatives: -S
+        type: file
+        description: Deduplicated BAM file
+        required: true
+        direction: output
+      - name: --out_sam
+        type: boolean_true
+        description: |
+          By default, outputa are written in BAM format. Use this options to 
+          specify the use of SAM format for output.
+      - name: --paired
+        type: boolean_true
+        description: |
+          BAM is paired end - output both read pairs. This will also force the
+          use of the template length to determine reads with the same mapping
+          coordinates.
+      - name: --output_stats
+        type: file
+        description: Directory containing UMI based deduplication statistics files
+        direction: output
+      - name: --extract_umi_method
+        type: string
+        description: |
+          Specify the method by which the barcodes were encoded in the read.
+          The options are: [read_id, tag, umis].
+        default: read_id
+      - name: --umi_tag
+        type: string
+        description: |
+          The tag containing the UMI sequence. 
+          This is only required if the extract_umi_method is set to tag.
+      - name: --umi_separator
+        type: string
+        description: |
+          The separator used to separate the UMI from the read sequence. 
+          This is only required if the extract_umi_method is set to id_read.
+        default: '_'
+      - name: --umi_tag_split
+        type: string
+        description: |
+          Separate the UMI in tag by <SPLIT> and take the first element.
+      - name: --umi_tag_delimiter
+        type: string
+        description: |
+          Separate the UMI in by <DELIMITER> and concatenate the elements
+      - name: --cell_tag
+        type: string
+        description: |
+          The tag containing the cell barcode sequence. 
+          This is only required if the extract_umi_method is set to tag.
+      - name: --cell_tag_split
+        type: string
+        description: |
+          Separate the cell barcode in tag by <SPLIT> and take the first element.
+      - name: --cell_tag_delimiter
+        type: string
+        description: |
+          Separate the cell barcode in by <DELIMITER> and concatenate the elements
+  
+  - name: Grouping Options
+    arguments:    
+      - name: --method
+        type: string
+        description: |
+          The method to use for grouping reads. The options are: 
+          [unique, percentile, cluster, adjacency, directional].
+        default: directional
+      - name: --edit_distance_threshold
+        type: integer
+        description: |
+          For the adjacency and cluster methods the threshold for the edit 
+          distance to connect two UMIs in the network can be increased. The 
+          default value of 1 works best unless the UMI is very long (>14bp).
+        default: 1
+      - name: --spliced_is_unique
+        type: boolean_true
+        description: |
+          Causes two reads that start in the same position on the same strand
+          and having the same UMI to be considered unique if one is spliced 
+          and the other is not. (Uses the ‘N’ cigar operation to test for splicing).
+      - name: --soft_clip_threshold
+        type: integer
+        description: |
+          Mappers that soft clip will sometimes do so rather than mapping a
+          spliced read if there is only a small overhang over the exon junction.
+          By setting this option, you can treat reads with at least this many
+          bases soft-clipped at the 3’ end as spliced.
+        default: 4
+      - name: --multimapping_detection_method
+        type: string
+        description: |
+          If the sam/bam contains tags to identify multimapping reads, you can
+          specify for use when selecting the best read at a given loci. Supported
+          tags are “NH”, “X0” and “XT”. If not specified, the read with the highest
+          mapping quality will be selected.
+      - name: --read_length
+        type: integer
+        description: |
+          Use the read length as a criteria when deduping, for e.g sRNA-Seq.
+  
+  - name: Single-cell RNA-Seq Options
+    arguments:
+      - name: --per_gene
+        type: boolean_true
+        description: |
+          Reads will be grouped together if they have the same gene. This is useful
+          if your library prep generates PCR duplicates with non identical alignment
+          positions such as CEL-Seq. Note this option is hardcoded to be on with the
+          count command. I.e counting is always performed per-gene. Must be combined
+          with either --gene_tag or --per_contig option.
+      - name: --gene_tag
+        type: string
+        description: |
+          Deduplicate per gene. The gene information is encoded in the bam read tag
+          specified.
+      - name: --assigned_status_tag
+        type: string
+        description: |
+          BAM tag which describes whether a read is assigned to a gene. Defaults to
+          the same value as given for --gene_tag.
+      - name: --skip_tags_regex
+        type: string
+        description: |
+          Use in conjunction with the --assigned_status_tag option to skip any reads
+          where the tag matches this regex. Default ("^[__|Unassigned]") matches
+          anything which starts with “__” or “Unassigned”.
+      - name: --per_contig
+        type: boolean_true
+        description: |
+          Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same
+          contig will be considered to have the same alignment position. This is
+          useful if you have aligned to a reference transcriptome with one
+          transcript per gene. If you have aligned to a transcriptome with more
+          than one transcript per gene, you can supply a map between transcripts
+          and gene using the --gene_transcript_map option.
+      - name: --gene_transcript_map
+        type: file
+        description: |
+          A file containing a mapping between gene names and transcript names.
+          The file should be tab separated with the gene name in the first column
+          and the transcript name in the second column.
+      - name: --per_cell
+        type: boolean_true
+        description: |
+          Reads will only be grouped together if they have the same cell barcode.
+          Can be combined with --per_gene.
+  
+  - name: SAM/BAM Options
+    arguments:
+      - name: --mapping_quality
+        type: integer
+        description: |
+          Minimium mapping quality (MAPQ) for a read to be retained.
+        default: 0
+      - name: --unmapped_reads
+        type: string
+        description: |
+          How unmapped reads should be handled. 
+          The options are:
+          "discard": Discard all unmapped reads.
+          "use":     If read2 is unmapped, deduplicate using read1 only. 
+                     Requires --paired.
+          "output":  Output unmapped reads/read pairs without UMI 
+                     grouping/deduplication. Only available in umi_tools group.
+        default: discard
+      - name: --chimeric_pairs
+        type: string
+        description: |
+          How chimeric pairs should be handled. 
+          The options are:
+          "discard": Discard all chimeric read pairs.
+          "use":     Deduplicate using read1 only.
+          "output":  Output chimeric pairs without UMI grouping/deduplication. 
+                     Only available in umi_tools group.
+        default: use
+      - name: --unapired_reads
+        type: string
+        description: |
+          How unpaired reads should be handled. 
+          The options are:
+          "discard": Discard all unpaired reads.
+          "use":     Deduplicate using read1 only.
+          "output":  Output unpaired reads without UMI grouping/deduplication. 
+                     Only available in umi_tools group.
+        default: use
+      - name: --ignore_umi
+        type: boolean_true
+        description: |
+          Ignore the UMI and group reads using mapping coordinates only.
+      - name: --subset
+        type: boolean_true
+        description: |
+          Only consider a fraction of the reads, chosen at random. This is useful
+          for doing saturation analyses.
+      - name: --chrom
+        type: string
+        description: |
+          Only consider a single chromosome. This is useful for debugging/testing 
+          purposes.
+  
+  - name: Group/Dedup Options
+    arguments:
+      - name: --no_sort_output
+        type: boolean_true
+        description: |
+          By default, output is sorted. This involves the use of a temporary unsorted
+          file (saved in --temp-dir). Use this option to turn off sorting.
+      - name: --buffer_whole_contig
+        type: boolean_true
+        description: |
+          Forces dedup to parse an entire contig before yielding any reads for
+          deduplication. This is the only way to absolutely guarantee that all reads
+          with the same start position are grouped together for deduplication since
+          dedup uses the start position of the read, not the alignment coordinate on
+          which the reads are sorted. However, by default, dedup reads for another
+          1000bp before outputting read groups which will avoid any reads being missed
+          with short read sequencing (<1000bp).
+
+
+resources:
+  - type: bash_script
+    path: script.sh
+test_resources:
+  - type: bash_script
+    path: test.sh
+  - type: file
+    path: test_data
+engines:
+  - type: docker
+    image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1
+    setup:
+      - type: docker
+        run: |
+            umi_tools -v | sed 's/ version//g' > /var/software_versions.txt
+runners:
+- type: executable
+- type: nextflow
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt
new file mode 100644
index 00000000..d3c8fa44
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/help.txt
@@ -0,0 +1,13 @@
+```
+umi_tools dedup
+```
+
+dedup - Deduplicate reads using UMI and mapping coordinates
+
+Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM]
+
+       note: If --stdout is ommited, standard out is output. To
+             generate a valid BAM file on standard out, please
+             redirect log with --log=LOGFILE or --log2stderr 
+
+For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh
new file mode 100644
index 00000000..57c01258
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/script.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+
+set -e
+
+test_dir="${metal_executable}/test_data"
+
+[[ "$par_paired" == "false" ]] && unset par_paired
+[[ "$par_in_sam" == "false" ]] && unset par_in_sam
+[[ "$par_out_sam" == "false" ]] && unset par_out_sam
+[[ "$par_spliced_is_unique" == "false" ]] && unset par_spliced_is_unique
+[[ "$par_per_gene" == "false" ]] && unset par_per_gene
+[[ "$par_per_contig" == "false" ]] && unset par_per_contig
+[[ "$par_per_cell" == "false" ]] && unset par_per_cell
+[[ "$par_no_sort_output" == "false" ]] && unset par_no_sort_output
+[[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig
+[[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi
+[[ "$par_subset" == "false" ]] && unset par_subset
+
+
+$(which umi_tools) dedup \
+    -I "$par_input" \
+    ${par_in_sam:+--in-sam} \
+    ${par_bai:+--bai "$par_bai"} \
+    ${par_get_output_stats:+--get-output-stats} \
+    ${par_random_seed:+--random-seed "$par_random_seed"} \
+    -S "$par_output" \
+    ${par_out_sam:+--out-sam} \
+    ${par_paired:+--paired} \
+    ${par_output_stats:+--output-stats "$par_output_stats"} \
+    ${par_extract_umi_method:+--extract-umi-method "$par_extract_umi_method"} \
+    ${par_umi_tag:+--umi-tag "$par_umi_tag"} \
+    ${par_umi_separator:+--umi-separator "$par_umi_separator"} \
+    ${par_umi_tag_split:+--umi-tag-split "$par_umi_tag_split"} \
+    ${par_umi_tag_delimiter:+--umi-tag-delimiter "$par_umi_tag_delimiter"} \
+    ${par_cell_tag:+--cell-tag "$par_cell_tag"} \
+    ${par_cell_tag_split:+--cell-tag-split "$par_cell_tag_split"} \
+    ${par_cell_tag_delimiter:+--cell-tag-delimiter "$par_cell_tag_delimiter"} \
+    ${par_method:+--method "$par_method"} \
+    ${par_edit_distance_threshold:+--edit-distance-threshold "$par_edit_distance_threshold"} \
+    ${par_spliced_is_unique:+--spliced-is-unique} \
+    ${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \
+    ${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \
+    ${par_read_length:+--read-length "$par_read_length"} \
+    ${par_per_gene:+--per-gene} \
+    ${par_gene_tag:+--gene-tag "$par_gene_tag"} \
+    ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \
+    ${par_skip_tags_regex:+--skip-tags-regex "$par_skip_tags_regex"} \
+    ${par_per_contig:+--per-contig}
+    ${par_gene_transcript_map:+--gene-transcript-map "$par_gene_transcript_map"} \
+    ${par_per_cell:+--per-cell} \
+    ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \
+    ${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \
+    ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \
+    ${par_unapired_reads:+--unapired-reads "$par_unapired_reads"} \
+    ${par_ignore_umi:+--ignore-umi} \
+    ${par_subset:+--subset} \
+    ${par_chrom:+--chrom "$par_chrom"} \
+    ${par_no_sort_output:+--no-sort-output} \
+    ${par_buffer_whole_contig:+--buffer-whole-contig}
+
+
+exit 0
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh
new file mode 100644
index 00000000..1459ec08
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/test.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+test_dir="${meta_resources_dir}/test_data"
+echo ">>> Testing $meta_functionality_name"
+
+"$meta_executable" \
+  --bam "$test_dir/a.sorted.bam" \
+  --bai "$test_dir/a.sorted.bam.bai" \
+  --output "$test_dir/a.sorted.idxstats"
+
+echo ">>> Checking whether output exists"
+[ ! -f "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' does not exist!" && exit 1
+
+echo ">>> Checking whether output is non-empty"
+[ ! -s "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' is empty!" && exit 1
+
+echo ">>> Checking whether output is correct"
+diff "$test_dir/a.sorted.idxstats" "$test_dir/a_ref.sorted.idxstats" || \
+    (echo "Output file a.sorted.idxstats does not match expected output" && exit 1)
+
+rm "$test_dir/a.sorted.idxstats"
+
+############################################################################################
+
+echo ">>> Testing $meta_functionality_name with singletons in the input"
+
+"$meta_executable" \
+  --bam "$test_dir/test.paired_end.sorted.bam" \
+  --bai "$test_dir/test.paired_end.sorted.bam.bai" \
+  --output "$test_dir/test.paired_end.sorted.idxstats"
+
+echo ">>> Checking whether output exists"
+[ ! -f "$test_dir/test.paired_end.sorted.idxstats" ] && \
+    echo "File 'test.paired_end.sorted.idxstats' does not exist!" && exit 1
+
+echo ">>> Checking whether output is non-empty"
+[ ! -s "$test_dir/test.paired_end.sorted.idxstats" ] && \
+    echo "File 'test.paired_end.sorted.idxstats' is empty!" && exit 1
+
+echo ">>> Checking whether output is correct"
+diff "$test_dir/test.paired_end.sorted.idxstats" "$test_dir/test_ref.paired_end.sorted.idxstats" || \
+    (echo "Output file test.paired_end.sorted.idxstats does not match expected output" && exit 1)
+
+rm "$test_dir/test.paired_end.sorted.idxstats"
+
+############################################################################################
+
+echo "All tests succeeded!"
+exit 0
\ No newline at end of file

From 9706b6c5119dc3222dbf28e416751e74820d88df Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Sun, 12 May 2024 17:01:50 +0200
Subject: [PATCH 04/12] Working component with one test

---
 CHANGELOG.md                                  |   3 +
 src/umi_tools/umi_tools_dedup/config.vsh.yaml |  55 +++++++--
 src/umi_tools/umi_tools_dedup/help.txt        | 110 +++++++++++++++++-
 src/umi_tools/umi_tools_dedup/script.sh       |  25 ++--
 src/umi_tools/umi_tools_dedup/test.sh         |  41 +++----
 .../umi_tools_dedup/test_data/deduped.bam     | Bin 0 -> 2154 bytes
 .../umi_tools_dedup/test_data/sample.bam      | Bin 0 -> 3268 bytes
 .../umi_tools_dedup/test_data/sample.bam.bai  | Bin 0 -> 889032 bytes
 .../umi_tools_dedup/test_data/script.sh       |   7 ++
 9 files changed, 191 insertions(+), 50 deletions(-)
 create mode 100644 src/umi_tools/umi_tools_dedup/test_data/deduped.bam
 create mode 100644 src/umi_tools/umi_tools_dedup/test_data/sample.bam
 create mode 100644 src/umi_tools/umi_tools_dedup/test_data/sample.bam.bai
 create mode 100755 src/umi_tools/umi_tools_dedup/test_data/script.sh

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ba7bf0e3..ceb53ac5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -42,6 +42,9 @@
     - `samtools/samtools_sort`: Sort SAM/BAM/CRAM files (PR #36).
     - `samtools/samtools_stats`: Reports alignment summary statistics for a BAM file (PR #39).
 
+* `umitools`:
+    - `umitools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #XXX).
+
 ## MAJOR CHANGES
 
 ## MINOR CHANGES
diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
index 75306541..312d5078 100644
--- a/src/umi_tools/umi_tools_dedup/config.vsh.yaml
+++ b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
@@ -1,12 +1,11 @@
-name: umi_tool_dedup
+name: umi_tools_dedup
 namespace: umi_tools
 description: |
   Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read.
 keywords: [umi_tools, deduplication, dedup]
 links:
   homepage: https://umi-tools.readthedocs.io/en/latest/
-  documentation: [ https://umi-tools.readthedocs.io/en/latest/reference/dedup.html,
-                   https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options ]
+  documentation: https://umi-tools.readthedocs.io/en/latest/reference/dedup.html
   repository: https://github.com/CGATOxford/UMI-tools
 references: 
   doi: 10.1101/gr.209601.116
@@ -16,7 +15,7 @@ argument_groups:
   - name: Inputs
     arguments:
       - name: --input
-        alternatives: -I
+        alternatives: --stdin
         type: file
         description: Input BAM or SAM file. Use --in_sam to specify SAM format.
         required: true
@@ -29,13 +28,12 @@ argument_groups:
         type: file
         description: BAM index
       - name: --get_output_stats
-        type: boolean
-        description: Whether or not to generate output stats. 
+        type: boolean_true
+        description: Generate output stats. 
       - name: --random_seed
         type: integer
         description: |
           Random seed to initialize number generator with.
-        default: none
 
   - name: Outputs
     arguments:
@@ -215,7 +213,7 @@ argument_groups:
           "output":  Output chimeric pairs without UMI grouping/deduplication. 
                      Only available in umi_tools group.
         default: use
-      - name: --unapired_reads
+      - name: --unpaired_reads
         type: string
         description: |
           How unpaired reads should be handled. 
@@ -246,7 +244,7 @@ argument_groups:
         type: boolean_true
         description: |
           By default, output is sorted. This involves the use of a temporary unsorted
-          file (saved in --temp-dir). Use this option to turn off sorting.
+          file (saved in --temp_dir). Use this option to turn off sorting.
       - name: --buffer_whole_contig
         type: boolean_true
         description: |
@@ -257,7 +255,44 @@ argument_groups:
           which the reads are sorted. However, by default, dedup reads for another
           1000bp before outputting read groups which will avoid any reads being missed
           with short read sequencing (<1000bp).
-
+  
+  - name: Common UMI-tools Options
+    arguments:
+      - name: --log
+        alternatives: -L
+        type: file
+        description: File with logging information.
+      - name: --log2stderr
+        type: boolean_true
+        description: Send logging information to stderr.
+      - name: --verbose
+        alternatives: -v
+        type: integer
+        description: Log level. The higher, the more output.
+        default: 1
+      - name: --error
+        alternatives: -E
+        type: file
+        description: File with error information.
+      - name: --temp_dir
+        type: string
+        description: |
+          Directory for temporary files. If not set, the bash environmental variable TMPDIR is used.
+      - name: --compresslevel
+        type: integer
+        description: |
+          Level of Gzip compression to use. Default=6 matches GNU gzip rather than python gzip default.
+        default: 6
+      - name: --timeit
+        type: file
+        description: Store timing information in file.
+      - name: --timeit_name
+        type: string
+        description: Name in timing file for this class of jobs.
+        default: "all"
+      - name: --timeit_header
+        type: string
+        description: Add header for timing information.
 
 resources:
   - type: bash_script
diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt
index d3c8fa44..acbab88e 100644
--- a/src/umi_tools/umi_tools_dedup/help.txt
+++ b/src/umi_tools/umi_tools_dedup/help.txt
@@ -1,6 +1,9 @@
-```
-umi_tools dedup
-```
+'''
+Generated from the following UMI-tools documentation:
+      https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options
+      https://umi-tools.readthedocs.io/en/latest/reference/dedup.html
+'''
+
 
 dedup - Deduplicate reads using UMI and mapping coordinates
 
@@ -10,4 +13,103 @@ Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM]
              generate a valid BAM file on standard out, please
              redirect log with --log=LOGFILE or --log2stderr 
 
-For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/
\ No newline at end of file
+Common UMI-tools Options:
+
+      -S, --stdout                  File where output is to go [default = stdout].
+      -L, --log                     File with logging information [default = stdout].
+      --log2stderr                  Send logging information to stderr [default = False].
+      -v, --verbose                 Log level. The higher, the more output [default = 1].
+      -E, --error                   File with error information [default = stderr].
+      --temp-dir                    Directory for temporary files. If not set, the bash environment variable TMPDIR is used [default = None].
+      --compresslevel               Level of Gzip compression to use. Default=6 matches GNU gzip rather than python gzip default (which is 9)
+
+      profiling and debugging options:
+      --timeit                      Store timing information in file [default=none].
+      --timeit-name                 Name in timing file for this class of jobs [default=all].
+      --timeit-header               Add header for timing information [default=none].
+      --random-seed                 Random seed to initialize number generator with [default=none].
+
+Dedup Options:
+      --output-stats=<prefix>             One can use the edit distance between UMIs at the same position as a quality control for the
+                                          deduplication process by comparing with a null expectation of random sampling. For the random
+                                          sampling, the observed frequency of UMIs is used to more reasonably model the null expectation.
+
+                                          In addition, this option will trigger reporting of further summary statistics for the UMIs which
+                                          may be informative for selecting the optimal deduplication method or debugging.
+                                          Each unique UMI sequence may be observed [0-many] times at multiple positions in the BAM. The
+                                          following files report the distribution for the frequencies of each UMI.
+
+                                          Use this option to generate stats outfiles called:
+                                                [PREFIX]_stats_edit_distance.tsv   
+                                                      Reports the (binned) average edit distance between the UMIs at each position.
+                                                [PREFIX]_stats_per_umi_per_position.tsv
+                                                      Tabulates the counts for unique combinations of UMI and position.
+                                                [PREFIX]_stats_per_umi_per.tsv
+                                                      The _stats_per_umi_per.tsv table provides UMI-level summary statistics. 
+      --extract-umi-method=<method>       How are the barcodes encoded in the read?
+                                          Options are: read_id (default), tag, umis
+      --umi-separator=<separator>         Separator between read id and UMI. See --extract-umi-method above. Default=_
+      --umi-tag=<tag>                     Tag which contains UMI. See --extract-umi-method above
+      --umi-tag-split=<split>             Separate the UMI in tag by SPLIT and take the first element
+      --umi-tag-delimiter=<delimiter>     Separate the UMI in tag by DELIMITER and concatenate the elements
+      --cell-tag=<tag>                    Tag which contains cell barcode. See --extract-umi-method above
+      --cell-tag-split=<split>            Separate the cell barcode in tag by SPLIT and take the first element
+      --cell-tag-delimiter=<delimiter>    Separate the cell barcode in tag by DELIMITER and concatenate the elements
+      --method=<method>                   What method to use to identify group of reads with the same (or similar) UMI(s)?
+                                          All methods start by identifying the reads with the same mapping position.
+                                          The simplest methods, unique and percentile, group reads with the exact same UMI.
+                                          The network-based methods, cluster, adjacency and directional, build networks where
+                                          nodes are UMIs and edges connect UMIs with an edit distance <= threshold (usually 1).
+                                          The groups of reads are then defined from the network in a method-specific manner.
+                                          For all the network-based methods, each read group is equivalent to one read count for the gene.
+      --edit-distance-threshold=<threshold>     For the adjacency and cluster methods the threshold for the edit distance to connect
+                                                two UMIs in the network can be increased. The default value of 1 works best unless
+                                                the UMI is very long (>14bp).
+      --spliced-is-unique           Causes two reads that start in the same position on the same strand and having the
+                                    same UMI to be considered unique if one is spliced and the other is not.
+                                    (Uses the 'N' cigar operation to test for splicing).
+      --soft-clip-threshold=<threshold>    Mappers that soft clip will sometimes do so rather than mapping a spliced read if
+                                          there is only a small overhang over the exon junction. By setting this option, you
+                                          can treat reads with at least this many bases soft-clipped at the 3' end as spliced.
+                                          Default=4.
+      --multimapping-detection-method=<method>  If the sam/bam contains tags to identify multimapping reads, you can specify
+                                                for use when selecting the best read at a given loci. Supported tags are "NH",
+                                                "X0" and "XT". If not specified, the read with the highest mapping quality will be selected.
+      --read-length                              Use the read length as a criteria when deduping, for e.g sRNA-Seq.
+      --per-gene                    Reads will be grouped together if they have the same gene. This is useful if your
+                                    library prep generates PCR duplicates with non identical alignment positions such as CEL-Seq.
+                                    Note this option is hardcoded to be on with the count command. I.e counting is always
+                                    performed per-gene. Must be combined with either --gene-tag or --per-contig option.
+      --gene-tag=<tag>              Deduplicate per gene. The gene information is encoded in the bam read tag specified
+      --assigned-status-tag=<tag>   BAM tag which describes whether a read is assigned to a gene. Defaults to the same value
+                                    as given for --gene-tag
+      --skip-tags-regex=<regex>     Use in conjunction with the --assigned-status-tag option to skip any reads where the
+                                    tag matches this regex. Default ("^[__|Unassigned]") matches anything which starts with "__"
+                                    or "Unassigned".
+      --per-contig                  Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same contig will be
+                                    considered to have the same alignment position. This is useful if you have aligned to a
+                                    reference transcriptome with one transcript per gene. If you have aligned to a transcriptome
+                                    with more than one transcript per gene, you can supply a map between transcripts and gene
+                                    using the --gene-transcript-map option
+      --gene-transcript-map=<file>  File mapping genes to transcripts (tab separated)
+      --per-cell                    Reads will only be grouped together if they have the same cell barcode. Can be combined with --per-gene.
+      --mapping-quality=<quality>   Minimum mapping quality (MAPQ) for a read to be retained. Default is 0.
+      --unmapped-reads=<option>     How should unmapped reads be handled.
+      --chimeric-pairs=<option>     How should chimeric read pairs be handled.
+      --unpaired-reads=<option>     How should unpaired reads be handled.
+      --ignore-umi                  Ignore the UMI and group reads using mapping coordinates only
+      --subset=<fraction>           Only consider a fraction of the reads, chosen at random. This is useful for doing saturation analyses.
+      --chrom=<chromosome>          Only consider a single chromosome. This is useful for debugging/testing purposes
+      --in-sam                      Input is in SAM format
+      --out-sam                     Output is in SAM format
+      --paired                      BAM is paired end - output both read pairs. This will also force the use of the template
+                                    length to determine reads with the same mapping coordinates.
+      --no-sort-output              By default, output is sorted. This involves the use of a temporary unsorted file since
+                                    reads are considered in the order of their start position which may not be the same as
+                                    their alignment coordinate due to soft-clipping and reverse alignments. The temp file
+                                    will be saved (in --temp-dir) and deleted when it has been sorted to the outfile. Use
+                                    this option to turn off sorting.
+      --buffer-whole-contig         Forces dedup to parse an entire contig before yielding any reads for deduplication.
+                                    This is the only way to absolutely guarantee that all reads with the same start position
+                                    are grouped together for deduplication since dedup uses the start position of the read,
+                                    not the alignment coordinate on which the reads are sorted.
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh
index 57c01258..00889600 100644
--- a/src/umi_tools/umi_tools_dedup/script.sh
+++ b/src/umi_tools/umi_tools_dedup/script.sh
@@ -18,14 +18,14 @@ test_dir="${metal_executable}/test_data"
 [[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig
 [[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi
 [[ "$par_subset" == "false" ]] && unset par_subset
+[[ "$par_log2stderr" == "false" ]] && unset par_log2stderr  # unset so ${par_log2stderr:+--log2stderr} expands to nothing
+[[ "$par_get_output_stats" == "false" ]] && unset par_get_output_stats  # unset so ${par_get_output_stats:+...} expands to nothing
 
 
-$(which umi_tools) dedup \
-    -I "$par_input" \
+umi_tools dedup \
+    --stdin "$par_input" \
     ${par_in_sam:+--in-sam} \
-    ${par_bai:+--bai "$par_bai"} \
     ${par_get_output_stats:+--get-output-stats} \
-    ${par_random_seed:+--random-seed "$par_random_seed"} \
     -S "$par_output" \
     ${par_out_sam:+--out-sam} \
     ${par_paired:+--paired} \
@@ -48,7 +48,7 @@ $(which umi_tools) dedup \
     ${par_gene_tag:+--gene-tag "$par_gene_tag"} \
     ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \
     ${par_skip_tags_regex:+--skip-tags-regex "$par_skip_tags_regex"} \
-    ${par_per_contig:+--per-contig}
+    ${par_per_contig:+--per-contig} \
     ${par_gene_transcript_map:+--gene-transcript-map "$par_gene_transcript_map"} \
     ${par_per_cell:+--per-cell} \
     ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \
@@ -59,7 +59,16 @@ $(which umi_tools) dedup \
     ${par_subset:+--subset} \
     ${par_chrom:+--chrom "$par_chrom"} \
     ${par_no_sort_output:+--no-sort-output} \
-    ${par_buffer_whole_contig:+--buffer-whole-contig}
+    ${par_buffer_whole_contig:+--buffer-whole-contig} \
+    ${par_log:+-L "$par_log"} \
+    ${par_log2stderr:+--log2stderr} \
+    ${par_verbose:+-v "$par_verbose"} \
+    ${par_error:+-E "$par_error"} \
+    ${par_temp_dir:+--temp-dir "$par_temp_dir"} \
+    ${par_compresslevel:+--compresslevel "$par_compresslevel"} \
+    ${par_timeit:+--timeit "$par_timeit"} \
+    ${par_timeit_name:+--timeit-name "$par_timeit_name"} \
+    ${par_timeit_header:+--timeit-header "$par_timeit_header"} \
+    ${par_random_seed:+--random-seed "$par_random_seed"}
 
-
-exit 0
\ No newline at end of file
+exit 0
diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh
index 1459ec08..4b83ff5e 100644
--- a/src/umi_tools/umi_tools_dedup/test.sh
+++ b/src/umi_tools/umi_tools_dedup/test.sh
@@ -1,49 +1,34 @@
 #!/bin/bash
 
 test_dir="${meta_resources_dir}/test_data"
-echo ">>> Testing $meta_functionality_name"
+out_dir="${meta_resources_dir}/out"
 
-"$meta_executable" \
-  --bam "$test_dir/a.sorted.bam" \
-  --bai "$test_dir/a.sorted.bam.bai" \
-  --output "$test_dir/a.sorted.idxstats"
-
-echo ">>> Checking whether output exists"
-[ ! -f "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' does not exist!" && exit 1
-
-echo ">>> Checking whether output is non-empty"
-[ ! -s "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' is empty!" && exit 1
-
-echo ">>> Checking whether output is correct"
-diff "$test_dir/a.sorted.idxstats" "$test_dir/a_ref.sorted.idxstats" || \
-    (echo "Output file a.sorted.idxstats does not match expected output" && exit 1)
-
-rm "$test_dir/a.sorted.idxstats"
+mkdir -p "$out_dir"
 
 ############################################################################################
 
-echo ">>> Testing $meta_functionality_name with singletons in the input"
+echo ">>> Test 1: Basic usage of $meta_functionality_name"
 
 "$meta_executable" \
-  --bam "$test_dir/test.paired_end.sorted.bam" \
-  --bai "$test_dir/test.paired_end.sorted.bam.bai" \
-  --output "$test_dir/test.paired_end.sorted.idxstats"
+  --paired \
+  --input "$test_dir/sample.bam" \
+  --bai "$test_dir/sample.bam.bai" \
+  --output "$out_dir/deduped.bam"
 
 echo ">>> Checking whether output exists"
-[ ! -f "$test_dir/test.paired_end.sorted.idxstats" ] && \
-    echo "File 'test.paired_end.sorted.idxstats' does not exist!" && exit 1
+[ ! -f "$out_dir/deduped.bam" ] && echo "File 'deduped.bam' does not exist!" && exit 1
 
 echo ">>> Checking whether output is non-empty"
-[ ! -s "$test_dir/test.paired_end.sorted.idxstats" ] && \
-    echo "File 'test.paired_end.sorted.idxstats' is empty!" && exit 1
+[ ! -s "$out_dir/deduped.bam" ] && echo "File 'deduped.bam' is empty!" && exit 1
 
 echo ">>> Checking whether output is correct"
-diff "$test_dir/test.paired_end.sorted.idxstats" "$test_dir/test_ref.paired_end.sorted.idxstats" || \
-    (echo "Output file test.paired_end.sorted.idxstats does not match expected output" && exit 1)
+diff "$out_dir/deduped.bam" "$test_dir/deduped.bam" || \
+    (echo "Output file deduped.bam does not match expected output" && exit 1)
 
-rm "$test_dir/test.paired_end.sorted.idxstats"
 
 ############################################################################################
 
+rm -rf "$out_dir"
+
 echo "All tests succeeded!"
 exit 0
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/test_data/deduped.bam b/src/umi_tools/umi_tools_dedup/test_data/deduped.bam
new file mode 100644
index 0000000000000000000000000000000000000000..5ffd9fc26f6dfaea2dd1f4cfe2c1cb18d93c59ba
GIT binary patch
literal 2154
zcmV-w2$lCAiwFb&00000{{{d;LjnL-0#%Z`YZO5o$A21<xMydf?T|(>?(MuEE98w2
zNX}?L6RhrX8?%_r9=pe#y^V;4rT;+!!CoxX-r7b)u(Z&^LR;td#oWTee14yuncsJ2
z*H*XAU7rD1xwGLt?1ZF^y`6jEewt?e__#ZaS}QyEy`4_Df0z+((|%wSF@iJd3S1a!
zjA2v=*OYT8(VS@^T|+BFg%S+AhLr|WJBdu~!%HKG;aYOnh{9mT4L54a$cZtU5kth(
zHmVSUDk=qWg)RhEiefe;%tT1SIOS6vmr*oPTw|>qQ6>>a7^5_EL>WW~$*>LTh%$$u
zf+;GDBZ?S=8$lFi?jEY1LAem_9z_hoh+<kRH-{-tkz*x^a?f|G5YpbGQ`{|!QVL9s
zdxLvbIBc)n-ud=z@9svpmX3#URQ^e%O}+KaaG_V!knfMP?qCpQsMpQoeIMJ(@Gy(I
z{Ty-Rj}S$Ef`~0YMFa)$K^`P=mZe$nBsxx$C=beof+R60=traApdUzPSIN7{LfwEJ
z57YE0_qIFEcze1rQ+8H+G>W4!@_Wcnk*y7mqIR#FAU{XAjqM{)^sFhjE(5@<eJ6he
zoc{fEajpUkS6_aduZh5GyxtIna4Bqv!JGbKLmUPVMhyvA5{q*+Noe2vHd~W|&mZ14
zq+$6|+>n9q=TB#<#wd(ezqBe~aI*Q_tAN9`_qS>gaQ5O#4HAB3KQC0Ez<bmf4gYR@
zov*;atCgsREjU|lPGc9|{rFwa*@MR?f8akV_XD<G1ONaZiwFb&00000{{{d;LjnNx
z1(lauh*eb>$JaSW8+FRm$x&*~@mP85GOYW>u-a{5r;Ik2U3rPNm{11EfG;LR`PNej
zB?k2{PZ3cmK}kepDfOTa`cnB21`-(sNj*fRv-{S*tg}y^Y1x4_oOw7uzV%)H-+w)f
z^3R3GC+D`(dAf!cWdEnd$9|s}AE#I-VZ<;Yl5;9e631Z_#c>d#sRlw@g0Q}6{Wcjg
z8552+Zwl*U@WUFM1?Q(`HPh3#d%*BlNFkCS=f)vASj)II;8GS63#~m=K*xUj0-*5<
z57ZDYj1ox{l~6n@gCZNOvPd)vKdHkx|H2cfmF}<m62KS}Lh%HyUL@dZ=2*de?8ig(
zW;!)gE0R0|6Rs{eW^nh&BEz^u+Pe!ADtC7a{b>)I-Ao5pd7xS;OgSzK6~#T`ERl!T
z9s}fxH6EnIj9>+E=IIP0G+B2@X1b8*!R^;I(~s`-RftkUm`cJZ&Z3Oar4EvJqJ$np
z=e|Rb&5@gZ*^~qmDiV8dTc8MyHXKqgS9FU!^Fh1ON>8;sNJ1IM+L$B?a^_6QoMjm=
zt%lzG`Bo2;YQu@tNfZWIZ^r2R%FShloG6ed21fw->}?(-1Zk+!NgS6qZ`a&x5*&yq
zo1%Id=``C6H9Fv{5ut#^2HG==GzidS%|%L?yZmf84?ME9)=b}D;%iQrAwp;pgjw0c
z03EE<NOd9ifB1Bxnf|`a6Q$x>NeulSl|wXIY0hrso|CTu^3Ap{N-njL3_2;Cu|%T<
z@{M2Lg-*KM_T>^wB{U--A9=(7S2A}eUqYBWmwT9!2}%fa^@+3Umd%-|F&c-T^uhW1
zZ`2q{M_RtI#2j0V%ZX4P+!1*WDj#9jNS1o&opaEiM|^UuIhLC8B+B~}?tZCLb=jFr
zi=037>FidzZjHA$ji82V$Bbr}%sFuShjxO1oV~?2nT#`TI5cNaY>ge2=Hy+-4b5vq
zQUzM9Ksn=LGLKd|iFF}=xi%!@oMU49J&!W#F+yVna-e-pNJ+S{xeUWn-z$4a#uB?d
zG#AdP={7)5`{Y<dg@lr2HltqqvP4eoVt^d<#Yu>f)XrtD#?YRVm8**`<hI4TV1~{u
z^C2;2jHoi3P@prFu{185^WZ#l&%J0U4F-IIL<Lv{(D%68C)i{%9Z;QRbG(2qe0?tn
z=aQvfn<OPO8X76+b~Kqy5+U#Sq<%P`9k_y;>3xHq+w;tOfx_o2GY)2`kwu=p`>wub
z`sc7OP@}mr6s{i@qI7>nj%62e_x8`{HPg-Oyw;0rL9ADZijKcJN}JB>;k<gWAIiSZ
zr*jI{I=BTWds(Ey(PZf)A<3=RXhB6+A`qpgeAWxxhk$;vcA?zu$1B~G6&}R}bN9d}
z;Jt*;dvQ#al6DEl-3}y^WExTyVCTCt?iS0HJ^|&@7{+Pomf|k&sUl%q0&akF;`!%l
zt#s{De*&1!_0A1_WkQLozGXJPJq$zt_K>fkG|^I8-yIaoqj&W!@|)xjFg7cPeG+O6
zv5T{O%S33Z^X4ElB+L!dd^mr7oWO{FwZs!B)m#wpilvmqJ^bfbfqZo3$vMq*tMCMB
zFi|j0PVitObppjP?F{{mR-cwh82aS{9w_WhiET$Y!*sl|leml1B3}s+G}FeFKBUG<
zYguX@lzp=Dh@%niigWSm_drRmVBc1fFr;AYidP4W{n1K3jVruGp1j<Ed~TsWdjsy%
z?xe!s{7Xe;z&n9jj=azhf$sB3o?!+H%-vfi=u!CpiR6??BUlpU(+LTEv4WKD?iO-v
zhk?5fGhaFlWf<Ii@$P`Kw>y$YOprg{v%4>XvvC^1D2?3VtGpC6bKiO8VS^m4C|>8K
zZIOL@7xuN%mwk5!Ye$)(rnGjzoT@Y^b@?2y4GKH7nQmC*dmB>43<Ikil#k-8d)^{%
zf;0DrC!yyruJV~$!6Bcx%xBPRD~%S&w@y#P{#frzCIR0-G^K7ddwHgR0Md6qGr17}
g03VA81ONa4009360763o02=@U00000000000FmJWb^rhX

literal 0
HcmV?d00001

diff --git a/src/umi_tools/umi_tools_dedup/test_data/sample.bam b/src/umi_tools/umi_tools_dedup/test_data/sample.bam
new file mode 100644
index 0000000000000000000000000000000000000000..929d93b0a7b427ea9072b9f4a814363ecfc0f8d9
GIT binary patch
literal 3268
zcmV;#3_J55iwFb&00000{{{d;LjnL)0#%YZZ`42(fFDFjaOfzY9i;&gyq^0oQf_fb
zNeHY2iKzB)h(#Q)?A0zk4MG%@{DVY_=n)0bQ_~=XL`eY!1+>ii(yS#*{@&-A_ujl&
zU)w%+eFk9l&ZhTpCnPQGweN+!G|l?)ac3AUthVoa?VYf9m=SNw{$La_f-~w0To`JM
zVN?j$lxrx_oM|ClLn}jt5)8YBl?GG0h)m_-r4ht%ExBt%VKC!{8#QI*#2C$pAz~_x
zDukelN<mzq3xSoQm`w>Y5t1-Y`BcSaE1D>-u~v>KI}t`0qcn2_FNKf{+eRHBN<jrv
zR2WCdLU1F9!pzN~>J^j=;pWhVU_>#kmD`6YPmyCKiE__(yAaaO(FyLAK`8~M#+~4P
z)f~1Xw=>_p?cLoB*VFMZj>?-vTGZRv3KzRY4Eg>z>kI}_hPs_R?)lh8hKE_y>F0<e
ze}pLV6GUwIDIzF{5Aq<1vn<VmC(&`5M0rqd6eNj3K|dM=2mL@YyG!0l7V84+_b^S5
za&LFX`QDu_%#@YY9*yE?jQlS0Q)FX<qo~#GB*@PZZejZf6g_Lg3zq?4*1nS)0jGaI
zU7V``!_}7`=W8PH8gDd2AzTg{V(_NF)DVZkgHb~Qmc`OsO%hr+zs=U9;PZ#K4QW`p
z6gOnx`}xzEsxS)UwJ!@5FgV$I?p46y+WT8I2snFjr3MMVvY!_!P~bgkjD~+VzRp)*
z;MHnW!#13)G^eo#?|%HQ*X+aNlRxkuD)$4nUIYLDABzYC000000RIL6LPG)oQVOM7
zTZmm(8QwF~n0BJIO>9zh&Sa`KX}pAWzf`K-U4`A2B<8Z!v=vXO1&v?{^raR>`{t7t
zv{cZCJc)>xAZQU0s|I~&9@-Z*4^p9s6$Ohvh}z`%t-beJd!KZsYxX(^_GA)rKK|?X
z-}Xm}_Ve;5&R(}ZTQy6?4b9(Y-N*la>f}jAq?T45rc?>Rl%3YKtE###U2$%uC{CAd
z`RLsbD92SzMRDw?TRwp=F5$W1rE}LzW*6U`0LD+Lq@0#5ZtaSvmKxk8;4<#02VMGb
z2YTY4zXE7|YXa0#A+46vstTawng+D+!fFp$9LHCd@x1iKPZsO5CzewN;G9dT#T2S{
z26#Q@i4ODQe|dU&GCQ}n)ERjTOsFme;n2PEkgg7pUUzAoj=Fo$zdp)$OlErzCP0nW
zgb5NHRn?ow86cmz{aHZXdME*@2&Y6t&eoi+D$XuPNN(pKi+dh?(`5FO_oh6=n5A4#
zT~#+yR>iA5r0Atw!E@=41+01G?I~+2N+^?4ueWzlRUBW5kdjBId&n1lw6U^2dwx9u
zNf{T!SUauC7IQ0O;vb9Imv2LC{_UL!OlGX0%1kR)HnCY3e;R168pu-}<gfQ00p#c2
zm4JkhmT5Ds>!A6@Qp`<CN?A2ov>YV8!jHj6Po{iC8DNP;cs7!jWpQ>XO3HXteg~er
zK6?MsWcGu7so+EoQOc&JYuxV2;;Dg;^c>_9KmN?hWcIK92`RNOS`oy16_V)qAUNkC
zAHDE(Kz?f@C8bctDvn5UTe|3Y2l>r^d=HWI+D6KjL}_U_1%0#u|3=0<GyM(3eE60G
zrs9%O%A<ek#=9XnMFV*fU)hQ0cmFF^)@Dc6Q)x*A@f_D8!abxXc>$KMNDL&8`t)}%
zAwGYYqGKb7GE7XXHa?;ItG(A%pP3%=`n8|$SfAZ-C>fhpGRsWFthUI^Yw_$_dl&@r
z;yY5A$psfyAUMmeYCJRuPB900?__(BOoNLxn6vIO^ZX!^_#EWlw+G3E5QO@8Z>6j@
zNud)R<erUfK`JV&x21D|-v@O_)r!PAbRC}OuWSJHVv3FpJS4RW)~wtX4)Tzv9^ru8
zo05}KtC-KrmW>gfX9udwImicYdIULiX@3fm5YDL%)`Wvz7^J0*)?CH&*$;fMSeuo5
zQiQ}LS_SZXJ>MsIV{%h6(^&Il2YTZ-9s_aiyE)k=DaEZpAeHkSP17bxC;C2V7oJ}{
zdAXR(KD0NXy=ZzbFnn7w>v9VpdB_*v|Gu4**?$hF1hqzJ%b>pNNa@Z29jiIWGY|f9
zbuv44N3!)2M#4v~YNgyP)xA02jOS~w?1I}LUr!bdV$dzX?ZHW1N7I;-I@-2g$2-*G
zas^WQ^uc7iASL3-w+o@RKR1Y`#_>hnVV>FZGxT2aP|8JwX{~$_uID?DhLV|Myd<&j
zZg5X~T$LhFp{(VCg>I>y>pgWPZ8X4Jd?WIO&o8adZofHg0B%~oi%CDoP#V4OF<1WZ
z43hpkYpH-T)F|cq?iRA#!uuZb_tSr)u{m%!MWNPGUpT`oQx)g>nS(M%c}%A3@%;Cv
zQ>5r`_ay{nMo5ZYF;GdpN&k7GgZ$Wm3)fC&_sfK!mQam!iiC%X)C)>#)+ha2JwNSN
zNcvm$BtTI&74Z=j8>Z(5HAzG{J>*wifnd7tKnl_jZHx+>2e+Rc3~{U?b8_Bv+xNjF
zmr1HBDI_TxyKd@0+MgfT(?-X8$P2Hnz@DdBs=Yz?>1$Hg-(1C_suaDT9!I^r3xYnL
zqCCO4vXW1$HkF|Lmj;6Nls5v>a!Vu;aXczFiKUYr<g*W1=zf@|%xM`X=;ph*18(2w
zDUaH+9p4+>cjLM5N`X{5@}89Em1Ir#-FhAs<otl~`bpbE?tJXVo$Ir&98R}(j9X?y
zYX{78g8*gGo_l<NqJ~ap_wG*3hD>wI(JGf=D1Lp+d&sx>?%~g$Ld?H%FxAvb0eg~Q
z&$8NL$KWBqeenwF$K5GoV#gq<VJxQRW<ArLgYT~u_v~&*x~(xKYs+ycAE%GH)sYyW
zHEHRflgq$RMiS{UT)rI2DvQp_m0`>UMo|v_|708XAWBfoBCZ|^G8)7P?uU;yJ!_XG
z3AMs<5@z8{Ntnc1&J1S297zx&R4Ev?><v3!5=)d{W_3}Lrw+H2AoV*z>O6Ij>QwaO
zdzl3yjli@K)(Ul=IvCLf14q=u%oz=0(Wgs*xNhpn0nzv{8C{$wiQx)EatO9O?=T<t
zP41NWxx*-@iqas7%kmBr8ZMTDm3EFKibzgam?T`431YCl4CTpXB_Pw-&<izh*wG-^
z07=dk%i?A?gQC+Bb)I)P5!BC8&V{8>hG99;ihELOw{CToVL>KJ847j6t~v*V8Bnzi
z`@cM*=CCYsa9Y=OPX(hvIAV?3FjQ1|l1NL4X5nm;M@n323O9%CdS0+W8~ka*iY+fr
z1fxPXOB<ITvBD75Y<}YQGz-L9v^fa-7Q|(NSc#@j2X*p_m+#jY=&_1xmn;yC)h|vH
zuFP+80Ahp`@v@U;6Ui`8k;rT=i|s*;ocpC_xj^xf0yAhcFS-Rla5=<TF3@8hpD2M<
zh}Q7{nH90hQfP$jaa}JU3Br|N_wE+)jwW2;6C}x_M~%&pR63LlSLcCf%XL38<w?S@
ztx)~NIm<hmQ)+|>%Za>T)7)^2u<vh{IUtz3VA@at^Y}!2hHVCfdI6hg!8L5st-bT=
zjyBlNQ5m)rd10@u-|y1-GCCH-5NgnIg?T5hcp1V7R0_G1*A*CwS}bXFvCEQ#V~MQe
z#=FXjSi=n_I1&$Uc{UOFP5SYz&ZkElli^_+m@U&L2v;lOsW%e@JfIP5)zHObL9Dk(
zW0M8C0*i4}STjFmjCHYPSUyq_hNe6hTZS44?{xdiJU+p?!59qJ<&daHtAjPF^gY{f
z8XYZ(Cd^NxRk=W0Zt;^?8eS)<sw{Pk=4h!0?{#H?7~y-ig>nZSj#B-Rv#rIecMZO<
zzMYL9MIX!JOxuO*XcUHnGg0w<qB0W&_fY@xRJ>nhX@s*smpZ;RROg{+2!o!hBigQU
z8;xR!RRXO}H7sFcQ7FL%#K&CgX;)>UApEr)T(GlH;2`3wSh)}i_pfG%tJb^YQILzA
z#aSWGN7#a3OlDCdm#2}$FaEMs4tZfbipKOKLd#Q?c_<h;uy|Y4E{0Km0xR7@v5SkB
zL{IAGYRFg=Mw9~W#v+o5f&qv^BUcXE{m~jRfq7fS(ZAb_jz<AYHC(p{?=tLWh%Y2n
zxd1aTsVQz@VIxvzYNWV7N5-{trghP<3&lfe%trAqfy8=L=V`>z5H|0I&m9-WqZo~o
ze{=ik*Q}#a7^Y?=uDWH7M&W33jSlgWX<dKL6-EMO5oUmcu+6h!4N&K007tN>=6LEB
zsMAmgwL}eGE<b(Lt^OY}&EBU>FaQ7_iwFb&00000{{{d;LjnLB00RI3000000001Q
Caw19q

literal 0
HcmV?d00001

diff --git a/src/umi_tools/umi_tools_dedup/test_data/sample.bam.bai b/src/umi_tools/umi_tools_dedup/test_data/sample.bam.bai
new file mode 100644
index 0000000000000000000000000000000000000000..bd87cde81a00c7699faed37ab04f88fc5f77bcfa
GIT binary patch
literal 889032
zcmeI*L8zQp6$kM9&CGk)G-;9sUeffDrpA|$1|CwWs1R(6SULph!h+cf1)(5FL7LKq
zA}-uW9bAb8p&M}#cY>=z7VRP}Asf++tD=JF#wrL6cyG@=&gX+HViTCj`+ble=FYv}
zOV9kz?={V1=GkYSAGS&<lltqx7fKmwE#<}buw3qSzKG}duE*ZCGp=hpt;ShZA0id{
zpbuQ&AqP0X0S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h0
z0S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K0
z2ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`2
z9N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8
zaDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0W
zzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h
z00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70
z103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<6rogA3f-|T!*
z%B0rIC0@H>h}zc1*L$BSWvC0e?`q%w^_^JH$9ewXvE|-8qegG_K_A*|$$=cmfjQs+
z2ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`2
z9N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8
zaDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0W
zzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IB@?vaJ59vRefmlE(daOCpp-x
zn}4xg%24ZzL-g)fz3!Zk^;`Y%3zK+$>s+kozI^^S7h*m4z18=>Hi`A2-zUG$d{Ih$
zs^og^-)R2=qwCv`EcfJIHC_vS(1$iEav%qCU=BFI0S<7019!)P{IiN0{nQ72Xy0Gt
zKn~=<9B_aG9N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`2
z9N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02RN`!4&-}fs*ziL(1$j^
zav%qCU=BFI0S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h0
z0S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K0
z2ROh14(wkCZVi!tRUeuIav%qCU=BFI0S<70103K02ROiiyXn9Sljx_a4=q=6AO~__
z4miL84sd`29N+*4IKTl8aDW3GxEBsg>rLm2QYN*&)&HO5d>XxOYvb$ssZxfz@UK1l
zE}e|^JfF}1;NbF}`J5WP)dziOvn2;|AP44v103K02ROh14sd`29N+*4IKTl8aDW3G
z-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$
zU{xH*_X<$MC4JC`=Cd5gfgG3v4sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h0
z0S<700~}a&2WB;-SI)%ehP>H7Z{)*Xzkeat^YL@DnCZWMI@a^^N%CBNeo3z9{`T1>
z<Lir$)XzHUUqg+a>VrPC8Ic1ykOOnT0S<70103K02ROh14sd`29N+*4IKTl8aDW3G
z-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$
zfCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h0
z0S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCKy8fj6h&fT|D8LphLx
zdzpg|yEkg&Ods^2&6^y^fgG3v4sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h0
z0S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h0fpv3WvxfGY
zkC!sk`qPb=;Xn8KkH=#DXR}!UTd(s)DU-VYx&Ao!Z?pjzT|Yk^=kwgt{k|Xd^?aOP
z_v-Q3pZCqLd-Cw|Yvprlye9gf4{hY+Kn~=<9B_aG9N+*4IKTl8aDW3G-~b0WzyS_$
zfCC)h00%h00S<70103K02ROh14sd`299Vw`@@Hq&m}h;^hvu3b$blS~0}gP2103K0
z2ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`2
z9N+*4IKTl8aDW3G-~b0WzyS_$U=<z6&t_92pZcH=ZBFGt4(=oe`B|1~<VGL%S0C~-
zT-A8p^+6xn_mA7<pgq^FS^brtr<|Xo{K01QZu;?#<v%W+iqA*ReHZ(_i|1qgjs7^#
z|9U6Zb6-CH&#hAO9%}SjAM~M3njFZ19GC+RaDW3G-~b0WzyS_$fCC)h00%h00S<70
z103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh1
z4sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROiiyX`=JW`G*L
z=z~5qXXQW+<iH$ofCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0W
zzyS_$fCC)h00%h00S<70103K02ROh14sd`299TUE-q?&>s`}98Qx4=n4$J`uIKTl8
zaDW3G-~b0WzyS_$fCC)h00%h00S<7019#%Uq;9_XXemRjAN0R_eDYJ#!}+3=_Wa)U
zS^EN`>qieP_vBtRdZ-Wj&_+cL<UkJ00S7q10S<70103K02ROh14sd`29N+*4IKTl8
zaDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0W
zzyS_$fCC)h00%h00S>IH1MR)V{_t45AKCBw`;c9DAZ9k-k1XGxEZ3(k(4*_UlW{)x
z{jlF>tG~}#KF<69<WMQOSB+llgFdv8k^?!A19QLu4sd`29N+*4IKTl8aDW3G-~b0W
zzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h
z00%h00S<701FPadzGsLUF6qN3>qEZxj2eB^2YqPY@8v)a<iH$ofCC)h00%h00S<70
z103K02ROh14sd`29N+*4IKTl8aDW3G-~b0Wu+9#&&oX=FSbTn2e!kiF_I&<XuK%vD
zPujom=sJEP&gZ$@cVnZJJgdf>=z~7A(UAi=kOOnDE)L}Pb~W762YqOM%YhuofjQs+
z2ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<7019#tnS#9dScjM>M
z&mV}4z2575QA+z6_W%BzdS7op3qQWTb}`Q9xf^}om$za)&*%L&FD&nw&#BRSeb9$C
zTXG-=a$pWPzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4?zIE?clc^}qYwJf
z+?4}4kOOnT0S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h0
z0S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K0
z2ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`2
z9N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8
zaDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0W
zzyS_$fCC)({~UPzK%7(cp*boCav%rhfCC)h00%h00S<70103K02ROh14sd`29N+*4
zIKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G
z-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$
zfCC)h00%h00S<70103MM`Z};tU%XdOl`_<Nqkm@Q&V!NZZ=H+ve0-&^&ljakM*Ghm
zjQz6~gwgfX`8c2FZuaw^@9X(E&;NaE`FqIUiyFPv2YqOxB?odK2j+kS9N+*4IKTl8
zaDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0W
zzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h
z00%h00S<701MA^He%6B;ua`dPL;L<E2XY_>=70kn-~b0WzyS_$fCC)h00%h00S<70
z103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh1
z4sd`29N+*4IKTl8-2V<-c`$OW>O-4%IgkT6Fb5pq00%h00S<701E17^oBhw3vl^yf
zZkICD`r5&mt@nHV(ur6<+aKri^F=9>x<A*?9Ex+(7R=Fgb~n!F{c_(QA6=f!d#KSv
zeb9$CLUJGna$pWPzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G
z-~b0WzyS_$fCC)h00(a8K>lpM8cyhgJ~S`oKn~=<9B_aG9N+*4IKTl8aDW3G-~b0W
zzyS_$fCC)h00%h00S<70103MM?HtJW#8krxeb9&Ir5wnC9GC+RaDW3G-~b0WzyS_$
zfCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h0
z0S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K0
z2ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`2
z9N+*4IKTl8aDW5%j|0>CVqSi#l%dw|9*S&V?{&T?Wm4DA>{)MX<LlkyaX$CueU6@4
zp3Udf=&?TNLz^KvkOMg|2OQu42ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h
z00%h00S<70103K02RN`Q4&=`OtKpJ9=tJ{a4&*=%%mD{DzyS_$fCC)h00%h00S<70
z103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh1
z4sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4
zIKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G
z-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$
zfCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h0
z0S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0Wz=8E~V57cR-*~c=q1JaEitKJ5
zj{5kqSigQK9-r#Z&(FqsK7J+F7o|+<eR6-^XVwBSx^8_o&gZ%Ox<^hg&*nYU=)FGZ
zLmMGEkOMhb{W-|*jq3mYw}XD@NApV#<UkJ00S7q10S<70103K02ROh14sd`29N+*4
zIKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G
z-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$
zfCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h0
z0S<70103K02ROh14sd`29N@ryci@$W!U0tunul^A2XbHzIKTl8aDW3G-~b0WzyS_$
zfCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h0
z0S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)Z4-UN3-<MU5yz7HL
zw7Hi9IgkT$zyS_$VEr7J))(#7-BN~Hw+}~#uk|`#lrkBupEwfFw=?5w_j7SR&*i>9
zonGEE@2y4;^+6xn49S5U$bmWF00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G
z-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$
zfCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h0
z0S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K0
z2ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`2
z9N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8
zaDW3GSQiJb9gcZb^`W^X2XY_>=70kn-~b0WzyS_$fCC)h00%h00S<70103K02ROh1
z4sd`29N+*4IKTl8+&c$aZ`(DkzvhclCbgb85}A9u*S0pkE<I7oP#1FF%YEOEcVj&t
z=lQR0FZbpdHF~TM`p{-e4&*=%%mD{DzyS_$fCC)h00%h00S<70103K02ROh14sd`2
z9N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8
zaDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0W
zzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h
z00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70
z103K02ROh14sd`299SI(UOp0eRP~|Fr5wnC9GC+RaDW3G-~b0WzyS_$fCC)h00%h0
z0S<70103K02ROh14sd`2>+is%zKG91T*^@E+x=%l9zGh`o-ayi&+lDtd&k%K?6f}d
zmveEyyw6K}?w9Mo?d$DpjIXae5$E$<?)&I?DS1|n*GM1qq0NpQ$blS~0}gP2103K0
z2ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`2
z9N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8
zaDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0W
zzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h
z00%h00S<700~}aY2VOcF^Qr1Xb4m{6Kn~0S2ROh14sd`29N+*4IKTl8teXQZv+ep$
Gl=2^72Ijl~

literal 0
HcmV?d00001

diff --git a/src/umi_tools/umi_tools_dedup/test_data/script.sh b/src/umi_tools/umi_tools_dedup/test_data/script.sh
new file mode 100755
index 00000000..534c4af2
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/test_data/script.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# Download test data
+wget https://github.com/CGATOxford/UMI-tools/releases/download/v0.2.3/example.bam
+samtools view -b -o sample.bam -s 0.00005 example.bam
+samtools index sample.bam > sample.bam.bai
+rm example.bam
\ No newline at end of file

From 2e227e4960d022129b190cb3b1ba99aa6da4a33b Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Mon, 13 May 2024 09:18:30 +0200
Subject: [PATCH 05/12] Update test 1 and test data, fix some arg types in
 config and script

---
 src/umi_tools/umi_tools_dedup/config.vsh.yaml |   2 +-
 src/umi_tools/umi_tools_dedup/help.txt        |   8 +++---
 src/umi_tools/umi_tools_dedup/script.sh       |   4 +--
 src/umi_tools/umi_tools_dedup/test.sh         |  24 +++++++++++++++---
 .../test_data/dedup_edit_distance.tsv         |   5 ++++
 .../test_data/dedup_per_umi.tsv               |   6 +++++
 .../test_data/dedup_per_umi_per_position.tsv  |   5 ++++
 .../umi_tools_dedup/test_data/deduped.bam     | Bin 2154 -> 840 bytes
 .../umi_tools_dedup/test_data/sample.bam      | Bin 3268 -> 1584 bytes
 .../umi_tools_dedup/test_data/sample.bam.bai  | Bin 889032 -> 2656 bytes
 .../umi_tools_dedup/test_data/script.sh       |   5 ++--
 11 files changed, 46 insertions(+), 13 deletions(-)
 create mode 100644 src/umi_tools/umi_tools_dedup/test_data/dedup_edit_distance.tsv
 create mode 100644 src/umi_tools/umi_tools_dedup/test_data/dedup_per_umi.tsv
 create mode 100644 src/umi_tools/umi_tools_dedup/test_data/dedup_per_umi_per_position.tsv

diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
index 312d5078..0614cbb2 100644
--- a/src/umi_tools/umi_tools_dedup/config.vsh.yaml
+++ b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
@@ -134,7 +134,7 @@ argument_groups:
           tags are “NH”, “X0” and “XT”. If not specified, the read with the highest
           mapping quality will be selected.
       - name: --read_length
-        type: integer
+        type: boolean_true
         description: |
           Use the read length as a criteria when deduping, for e.g sRNA-Seq.
   
diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt
index acbab88e..87baf322 100644
--- a/src/umi_tools/umi_tools_dedup/help.txt
+++ b/src/umi_tools/umi_tools_dedup/help.txt
@@ -33,15 +33,13 @@ Dedup Options:
       --output-stats=<prefix>             One can use the edit distance between UMIs at the same position as an quality control for the 
                                           deduplication process by comparing with a null expectation of random sampling. For the random
                                           sampling, the observed frequency of UMIs is used to more reasonably model the null expectation.
-
+                                          Use this option to generate a stats outfiles called: 
+                                                [PREFIX]_stats_edit_distance.tsv   
+                                                      Reports the (binned) average edit distance between the UMIs at each position.
                                           In addition, this option will trigger reporting of further summary statistics for the UMIs which
                                           may be informative for selecting the optimal deduplication method or debugging.
                                           Each unique UMI sequence may be observed [0-many] times at multiple positions in the BAM. The
                                           following files report the distribution for the frequencies of each UMI.
-
-                                          Use this option to generate a stats outfiles called: 
-                                                [PREFIX]_stats_edit_distance.tsv   
-                                                      Reports the (binned) average edit distance between the UMIs at each position.
                                                 [PREFIX]_stats_per_umi_per_position.tsv
                                                       Tabulates the counts for unique combinations of UMI and position.
                                                 [PREFIX]_stats_per_umi_per.tsv
diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh
index 00889600..8aa89d10 100644
--- a/src/umi_tools/umi_tools_dedup/script.sh
+++ b/src/umi_tools/umi_tools_dedup/script.sh
@@ -20,7 +20,7 @@ test_dir="${metal_executable}/test_data"
 [[ "$par_subset" == "false" ]] && unset par_subset
 [[ "$par_log2stderr" == "false" ]] && unset par_log2stderr
 [[ "$par_get_output_stats" == "false" ]] && unset par_get_output_stats
-
+[[ "$par_read_length" == "false" ]] && unset par_read_length
 
 umi_tools dedup \
     --stdin "$par_input" \
@@ -43,7 +43,7 @@ umi_tools dedup \
     ${par_spliced_is_unique:+--spliced-is-unique} \
     ${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \
     ${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \
-    ${par_read_length:+--read-length "$par_read_length"} \
+    ${par_read_length:+--read-length} \
     ${par_per_gene:+--per-gene} \
     ${par_gene_tag:+--gene-tag "$par_gene_tag"} \
     ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \
diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh
index 4b83ff5e..1b5a9053 100644
--- a/src/umi_tools/umi_tools_dedup/test.sh
+++ b/src/umi_tools/umi_tools_dedup/test.sh
@@ -7,27 +7,45 @@ mkdir -p "$out_dir"
 
 ############################################################################################
 
-echo ">>> Test 1: Basic usage of $meta_functionality_name"
+echo ">>> Test 1: Basic usage of $meta_functionality_name with statistics output"
 
 "$meta_executable" \
   --paired \
   --input "$test_dir/sample.bam" \
   --bai "$test_dir/sample.bam.bai" \
-  --output "$out_dir/deduped.bam"
+  --output "$out_dir/deduped.bam" \
+  --output_stats deduped \
+  --random_seed 1
 
 echo ">>> Checking whether output exists"
 [ ! -f "$out_dir/deduped.bam" ] && echo "File 'deduped.bam' does not exist!" && exit 1
+[ ! -f "$out_dir/deduped_edit_distance.tsv" ] && echo "File 'deduped_edit_distance.tsv' does not exist!" && exit 1
 
 echo ">>> Checking whether output is non-empty"
 [ ! -s "$out_dir/deduped.bam" ] && echo "File 'deduped.bam' is empty!" && exit 1
+[ ! -s "$out_dir/deduped_edit_distance.tsv" ] && echo "File 'deduped_edit_distance.tsv' is empty!" && exit 1
 
 echo ">>> Checking whether output is correct"
 diff "$out_dir/deduped.bam" "$test_dir/deduped.bam" || \
     (echo "Output file deduped.bam does not match expected output" && exit 1)
-
+diff "$out_dir/deduped_edit_distance.tsv" "$test_dir/deduped_edit_distance.tsv" || \
+    (echo "Output file deduped_edit_distance.tsv does not match expected output" && exit 1)
 
 ############################################################################################
 
+echo ">>> Test 2: $meta_functionality_name"
+
+"$meta_executable" \
+  --paired \
+  --input "$test_dir/sample.bam" \
+  --bai "$test_dir/sample.bam.bai" \
+  --output "$out_dir/deduped.bam" \
+  --random_seed 1 \
+
+
+echo ">>> Checking whether output exists"
+[ ! -f "$out_dir/deduped.bam" ] && echo "File 'deduped.bam' does not exist!" && exit 1
+[]
 rm -rf "$out_dir"
 
 echo "All tests succeeded!"
diff --git a/src/umi_tools/umi_tools_dedup/test_data/dedup_edit_distance.tsv b/src/umi_tools/umi_tools_dedup/test_data/dedup_edit_distance.tsv
new file mode 100644
index 00000000..89684b04
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/test_data/dedup_edit_distance.tsv
@@ -0,0 +1,5 @@
+unique	unique_null	directional	directional_null	edit_distance
+3	3	4	4	Single_UMI
+0	1	0	0	0
+1	0	0	0	1
+0	0	0	0	2
diff --git a/src/umi_tools/umi_tools_dedup/test_data/dedup_per_umi.tsv b/src/umi_tools/umi_tools_dedup/test_data/dedup_per_umi.tsv
new file mode 100644
index 00000000..a1d364e2
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/test_data/dedup_per_umi.tsv
@@ -0,0 +1,6 @@
+UMI	median_counts_pre	times_observed_pre	total_counts_pre	median_counts_post	times_observed_post	total_counts_post
+ACCGGTTTA	74	1	74	74	1	74
+ACTGGTTTC	48	1	48	49	1	49
+AGCGGTTAC	1	1	1	1	1	1
+CCAGGTTCT	1	1	1	1	1	1
+TCTGGTTTC	1	1	1	0	0	0
diff --git a/src/umi_tools/umi_tools_dedup/test_data/dedup_per_umi_per_position.tsv b/src/umi_tools/umi_tools_dedup/test_data/dedup_per_umi_per_position.tsv
new file mode 100644
index 00000000..d9211d0a
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/test_data/dedup_per_umi_per_position.tsv
@@ -0,0 +1,5 @@
+counts	instances_pre	instances_post
+1	3	2
+48	1	0
+49	0	1
+74	1	1
diff --git a/src/umi_tools/umi_tools_dedup/test_data/deduped.bam b/src/umi_tools/umi_tools_dedup/test_data/deduped.bam
index 5ffd9fc26f6dfaea2dd1f4cfe2c1cb18d93c59ba..a82e6c81a47c472acd45c379a42b294f31966e87 100644
GIT binary patch
delta 825
zcmV-91IGO75Xc69ABzYC000000RIL6LPG)oTmpTPxo;Ce6o(%~N!ZX)Ks!nUQdsZY
z$0FqxhZF~cB}fF#I^M)9vRP|w+4M9BQBd+9BvM3=D1e@t1|cL$3MeR`WxbciDJU$>
z_dd^?_nX=EwdT3UGXSf%H=PIDK51a5eb?VhlC&Eg1;cQEVYPkFX>a>`y_7gx*&mD|
zMsP-LfpbHRF^mdfn_>?onlmk=ZD?tzP=aCGu)<)PZ6Z_W@WKdUxR%^DA~%?E!;P9U
za$<~T#1JucjLL<eib_Fjp>u(iqL@tyGZB(7PWjZtMHEdG*H|k{6iI{;#wg7!Q3Me}
zGMojqM3F;(P{9-x#u9l9!i^vbGy4ox?Vwx;`;0sWVMH;lm7T*Bx5%-QMA`S-%!SO}
z(JJl~MJWZQ#(u%wG91pHJbUxjE$7arzn+YTQCNIQq(PmHEq}3-*O2Ru)1coEQ`89t
z(VmO5%CMJ)L3e;Saz}_FH%25YKS2a}(f+`Tqclx_Qtu!<O5$+f6^*<&HpuISBX7U!
zNtSgP1o2`uK(-zx$>G3hZCmTs^kAl#tnz3Sg=6IQQ1~>6`-fqp6U1lxHOSeH_4<E3
zJ8k4HTn2#I>^J#_;N;K8i*qGlxccJ5d_@Fa;f<Oo_{)Ax3|@DaYU0qpKdMQ<vRIm{
zNJ8U(#@E@36nuLBrX~$5m!g^sd^>+KQw~O9y!Lsa1O~@j&zurCTzhx30s*JbuT&u6
zXZqtp2@0HtwbAhR`j`0<47^+oD`>*$N_`tU@b>$!YR@h_I{pp+0PvNx?2rTi03VA8
z1ONa4009360763o0L}n~&^-=<KoAAs0WV;GVy!pO><`P2S~!jgHdwGz7%ez}=g`8d
z7!Tkryb&%7A)%31%`{(Lo`p7_<r?O-oCPrTf8C|IZCNT|5|f#g5k`qPu8=6tnHfP$
z0M1BZakxZ^6&pAe6mrMjpIcben=w{u$0kNrMA*2*unT%uDijoZ*LLppy?1r3bK##b
zn`S~rT2+|)x<~_-Bb0i;K3+%1-o{v|q|j07R$#vZ!8hz#K?&-%?GJ$Vim+V)lS~L6
DG)a52

literal 2154
zcmV-w2$lCAiwFb&00000{{{d;LjnL-0#%Z`YZO5o$A21<xMydf?T|(>?(MuEE98w2
zNX}?L6RhrX8?%_r9=pe#y^V;4rT;+!!CoxX-r7b)u(Z&^LR;td#oWTee14yuncsJ2
z*H*XAU7rD1xwGLt?1ZF^y`6jEewt?e__#ZaS}QyEy`4_Df0z+((|%wSF@iJd3S1a!
zjA2v=*OYT8(VS@^T|+BFg%S+AhLr|WJBdu~!%HKG;aYOnh{9mT4L54a$cZtU5kth(
zHmVSUDk=qWg)RhEiefe;%tT1SIOS6vmr*oPTw|>qQ6>>a7^5_EL>WW~$*>LTh%$$u
zf+;GDBZ?S=8$lFi?jEY1LAem_9z_hoh+<kRH-{-tkz*x^a?f|G5YpbGQ`{|!QVL9s
zdxLvbIBc)n-ud=z@9svpmX3#URQ^e%O}+KaaG_V!knfMP?qCpQsMpQoeIMJ(@Gy(I
z{Ty-Rj}S$Ef`~0YMFa)$K^`P=mZe$nBsxx$C=beof+R60=traApdUzPSIN7{LfwEJ
z57YE0_qIFEcze1rQ+8H+G>W4!@_Wcnk*y7mqIR#FAU{XAjqM{)^sFhjE(5@<eJ6he
zoc{fEajpUkS6_aduZh5GyxtIna4Bqv!JGbKLmUPVMhyvA5{q*+Noe2vHd~W|&mZ14
zq+$6|+>n9q=TB#<#wd(ezqBe~aI*Q_tAN9`_qS>gaQ5O#4HAB3KQC0Ez<bmf4gYR@
zov*;atCgsREjU|lPGc9|{rFwa*@MR?f8akV_XD<G1ONaZiwFb&00000{{{d;LjnNx
z1(lauh*eb>$JaSW8+FRm$x&*~@mP85GOYW>u-a{5r;Ik2U3rPNm{11EfG;LR`PNej
zB?k2{PZ3cmK}kepDfOTa`cnB21`-(sNj*fRv-{S*tg}y^Y1x4_oOw7uzV%)H-+w)f
z^3R3GC+D`(dAf!cWdEnd$9|s}AE#I-VZ<;Yl5;9e631Z_#c>d#sRlw@g0Q}6{Wcjg
z8552+Zwl*U@WUFM1?Q(`HPh3#d%*BlNFkCS=f)vASj)II;8GS63#~m=K*xUj0-*5<
z57ZDYj1ox{l~6n@gCZNOvPd)vKdHkx|H2cfmF}<m62KS}Lh%HyUL@dZ=2*de?8ig(
zW;!)gE0R0|6Rs{eW^nh&BEz^u+Pe!ADtC7a{b>)I-Ao5pd7xS;OgSzK6~#T`ERl!T
z9s}fxH6EnIj9>+E=IIP0G+B2@X1b8*!R^;I(~s`-RftkUm`cJZ&Z3Oar4EvJqJ$np
z=e|Rb&5@gZ*^~qmDiV8dTc8MyHXKqgS9FU!^Fh1ON>8;sNJ1IM+L$B?a^_6QoMjm=
zt%lzG`Bo2;YQu@tNfZWIZ^r2R%FShloG6ed21fw->}?(-1Zk+!NgS6qZ`a&x5*&yq
zo1%Id=``C6H9Fv{5ut#^2HG==GzidS%|%L?yZmf84?ME9)=b}D;%iQrAwp;pgjw0c
z03EE<NOd9ifB1Bxnf|`a6Q$x>NeulSl|wXIY0hrso|CTu^3Ap{N-njL3_2;Cu|%T<
z@{M2Lg-*KM_T>^wB{U--A9=(7S2A}eUqYBWmwT9!2}%fa^@+3Umd%-|F&c-T^uhW1
zZ`2q{M_RtI#2j0V%ZX4P+!1*WDj#9jNS1o&opaEiM|^UuIhLC8B+B~}?tZCLb=jFr
zi=037>FidzZjHA$ji82V$Bbr}%sFuShjxO1oV~?2nT#`TI5cNaY>ge2=Hy+-4b5vq
zQUzM9Ksn=LGLKd|iFF}=xi%!@oMU49J&!W#F+yVna-e-pNJ+S{xeUWn-z$4a#uB?d
zG#AdP={7)5`{Y<dg@lr2HltqqvP4eoVt^d<#Yu>f)XrtD#?YRVm8**`<hI4TV1~{u
z^C2;2jHoi3P@prFu{185^WZ#l&%J0U4F-IIL<Lv{(D%68C)i{%9Z;QRbG(2qe0?tn
z=aQvfn<OPO8X76+b~Kqy5+U#Sq<%P`9k_y;>3xHq+w;tOfx_o2GY)2`kwu=p`>wub
z`sc7OP@}mr6s{i@qI7>nj%62e_x8`{HPg-Oyw;0rL9ADZijKcJN}JB>;k<gWAIiSZ
zr*jI{I=BTWds(Ey(PZf)A<3=RXhB6+A`qpgeAWxxhk$;vcA?zu$1B~G6&}R}bN9d}
z;Jt*;dvQ#al6DEl-3}y^WExTyVCTCt?iS0HJ^|&@7{+Pomf|k&sUl%q0&akF;`!%l
zt#s{De*&1!_0A1_WkQLozGXJPJq$zt_K>fkG|^I8-yIaoqj&W!@|)xjFg7cPeG+O6
zv5T{O%S33Z^X4ElB+L!dd^mr7oWO{FwZs!B)m#wpilvmqJ^bfbfqZo3$vMq*tMCMB
zFi|j0PVitObppjP?F{{mR-cwh82aS{9w_WhiET$Y!*sl|leml1B3}s+G}FeFKBUG<
zYguX@lzp=Dh@%niigWSm_drRmVBc1fFr;AYidP4W{n1K3jVruGp1j<Ed~TsWdjsy%
z?xe!s{7Xe;z&n9jj=azhf$sB3o?!+H%-vfi=u!CpiR6??BUlpU(+LTEv4WKD?iO-v
zhk?5fGhaFlWf<Ii@$P`Kw>y$YOprg{v%4>XvvC^1D2?3VtGpC6bKiO8VS^m4C|>8K
zZIOL@7xuN%mwk5!Ye$)(rnGjzoT@Y^b@?2y4GKH7nQmC*dmB>43<Ikil#k-8d)^{%
zf;0DrC!yyruJV~$!6Bcx%xBPRD~%S&w@y#P{#frzCIR0-G^K7ddwHgR0Md6qGr17}
g03VA81ONa4009360763o02=@U00000000000FmJWb^rhX

diff --git a/src/umi_tools/umi_tools_dedup/test_data/sample.bam b/src/umi_tools/umi_tools_dedup/test_data/sample.bam
index 929d93b0a7b427ea9072b9f4a814363ecfc0f8d9..32192fc6e9bb58449cc20acd27d747a6c60399d5 100644
GIT binary patch
literal 1584
zcmV-02G98)iwFb&00000{{{d;LjnL<0)3J@ZxcZffCr)^T<9pEU8MmjoNwQcL&{q`
zq$C(DK_Y0*_648F<*c)1)6*bCLCHTzq=+6-06jGgLP(SpP*6b2`CeM5NO97B^WDzQ
z?Ckp5_PNJ10IRn*od-KUX<(;y*Y6}r+Kmo_VYsl`y63cZ{7x?=&Q|t=QN#$&s4Z}A
zs4<37A#79Zp+s}0g|rPV4HZf-Y#UY>OtVd7>KtAeK@8WD+eYLDGj6z1Q$|jV(To@(
zrjAj$5L8hqh%Iz3uu>GWDPbl;62>W?s<>!H6U8;w$`VB<!U$uOW|rWE5R&1nQA>zI
zP{9-x#u73Y+z6sDv(He~4$6hF&(OJGL@}+E-G?b|kz*x^vhR007c!fp72GX?QVL9s
zJ;A-QIh>6=n|bq=b7#|EPsYP2EM5|6P-kPyUu@?w<htWD==Z}EwSz&_ad8$I_R=uu
z4iHD~2vOw5h-BfXh#)W8A9!(;rb+5O2@jJv9C$?{FOCiJy5Y#%?|PDDSq4G8SQU`1
zhe>iUaGE>Tx;b5#DJrWx8b#q4xjhs<3*!Dk*k}jw>3j`xI%B;)D`&ff+=a^kFq?go
zHw4FjKV6(F0mIdoALlC~@EUK_M8RM7Yhv)GyHpd0{{2x+0+z+nTtyNZH@?kQq~P<1
zw>4>4xfIo8;QRUGnX)hn<FzjfB``SJdhV3K;oAF~6$m(aaiszYztW!<N>JcDtc`|$
z*T2q}VBpniSiv@&tkk!$3-5mXuJ-J~qoY6YAMlm5?2rTi03VA81ONa4009360763o
z0J#IT+B=TqR1^kace4ScNvB6_V9<F#27)P!)FgvYmjeVrld}mD2-pfDLp){`?5L2c
znvpF)Hl^Q{Ir((--*X?nu7~0B`{m`U;rWT5-3;%apHEM}VHk$v!-J5gw}05i-tl-C
z$NT$fnhwKv?+wG3hw=8!r(ao4GWqcNoALJer2hKy)o{12x|S2?jQwWD`qivSX%w6d
zBGvV3ry2WVon@{JQdxB$p4{KI4Vea!_e}j_rrNb?P)EqBdYq{nqLnU}NHiOso$6>d
z*@dLwY*>-6v(t>-)L%kvxr3V>p$vDGxXpZX=l;I2b6a~cdtc8~y;jYX7F|`3Gj&5N
zaB@;Nnl%c^DmWWv#tCODy$znNTZ=TD%}&cj*Zw;@&Dbrofha0c@Gh4a7u;1g)e5|8
z<jt~q{{FFT{#sY-ycuYw%C)MS7a3X&Ez4L{k27`4S`(JUgl2_Uy&cX<W?R8o)i4q`
zYg``iAyZQ!hi7XU9R#n|$%XJCxQnwCo-ND8W`<!Si5$*`6sy3q4NrB$S6gmTie^)}
zqYTcva5=oeia7(nFm_faHJmMT%k@r}X6&|OO{rwTU8WZ89b7SOR=k_+%S#1}t4z7Y
z{rd>=cH0qziI-OuAIwz#N0YtXcmxskCPuR&DedrT#ZyTM&8l%2IhwWTwrn&T$|Q>y
zUkq#Pve?uyY^p71FN966QioTYVaf_;%dHt7LWgjqt+LaM-L^w8$KoJ{aoO0y8SV;K
zmF(bZx+y*f=_uJDUlnVyJ=|8Q!n;N(zP*3H|MBDD#p$DU*M-vgaU2i#_tP{^!|R)O
zF3W+&SWLICfBs~+I~||hJU#!{EN)^`c|6>o%RP(>cCrKOVq^c|r_YAxr*GF-&c56X
zPUjHD;m_N5Dvf#2xJs?!b1OhqDd!Tjs?;R}S4{$+fQHfeMIEcAl;IPox_sk<s>YHA
z?}2PdfzN_!>hqf!RBelLcmnN|_|a--J#URzHBxe2J<im-4Qm#emjs^mu=CvqmSy3R
z#IsIR_-r}aL?0(GD@<wyuU3rO1J5R@IGj~26~(g|wqaNJzpOcn;;Xd~cYL;?)nRxx
zS*q}CXE=Vyvbzeu)RYwM(6`-f0nb*5Ylk5d67e0X<%kG|v!a~%6->mNnfG8l6V9qq
z5S}$^Yr?ZE)s1H*xbQ<(?V|W<o7HLuHskWgEihYr`A~#rMH_yqQKG6`Xa53j9M$l7
iDgXc<iwFb&00000{{{d;LjnLB00RI3000000000a8~aoM

literal 3268
zcmV;#3_J55iwFb&00000{{{d;LjnL)0#%YZZ`42(fFDFjaOfzY9i;&gyq^0oQf_fb
zNeHY2iKzB)h(#Q)?A0zk4MG%@{DVY_=n)0bQ_~=XL`eY!1+>ii(yS#*{@&-A_ujl&
zU)w%+eFk9l&ZhTpCnPQGweN+!G|l?)ac3AUthVoa?VYf9m=SNw{$La_f-~w0To`JM
zVN?j$lxrx_oM|ClLn}jt5)8YBl?GG0h)m_-r4ht%ExBt%VKC!{8#QI*#2C$pAz~_x
zDukelN<mzq3xSoQm`w>Y5t1-Y`BcSaE1D>-u~v>KI}t`0qcn2_FNKf{+eRHBN<jrv
zR2WCdLU1F9!pzN~>J^j=;pWhVU_>#kmD`6YPmyCKiE__(yAaaO(FyLAK`8~M#+~4P
z)f~1Xw=>_p?cLoB*VFMZj>?-vTGZRv3KzRY4Eg>z>kI}_hPs_R?)lh8hKE_y>F0<e
ze}pLV6GUwIDIzF{5Aq<1vn<VmC(&`5M0rqd6eNj3K|dM=2mL@YyG!0l7V84+_b^S5
za&LFX`QDu_%#@YY9*yE?jQlS0Q)FX<qo~#GB*@PZZejZf6g_Lg3zq?4*1nS)0jGaI
zU7V``!_}7`=W8PH8gDd2AzTg{V(_NF)DVZkgHb~Qmc`OsO%hr+zs=U9;PZ#K4QW`p
z6gOnx`}xzEsxS)UwJ!@5FgV$I?p46y+WT8I2snFjr3MMVvY!_!P~bgkjD~+VzRp)*
z;MHnW!#13)G^eo#?|%HQ*X+aNlRxkuD)$4nUIYLDABzYC000000RIL6LPG)oQVOM7
zTZmm(8QwF~n0BJIO>9zh&Sa`KX}pAWzf`K-U4`A2B<8Z!v=vXO1&v?{^raR>`{t7t
zv{cZCJc)>xAZQU0s|I~&9@-Z*4^p9s6$Ohvh}z`%t-beJd!KZsYxX(^_GA)rKK|?X
z-}Xm}_Ve;5&R(}ZTQy6?4b9(Y-N*la>f}jAq?T45rc?>Rl%3YKtE###U2$%uC{CAd
z`RLsbD92SzMRDw?TRwp=F5$W1rE}LzW*6U`0LD+Lq@0#5ZtaSvmKxk8;4<#02VMGb
z2YTY4zXE7|YXa0#A+46vstTawng+D+!fFp$9LHCd@x1iKPZsO5CzewN;G9dT#T2S{
z26#Q@i4ODQe|dU&GCQ}n)ERjTOsFme;n2PEkgg7pUUzAoj=Fo$zdp)$OlErzCP0nW
zgb5NHRn?ow86cmz{aHZXdME*@2&Y6t&eoi+D$XuPNN(pKi+dh?(`5FO_oh6=n5A4#
zT~#+yR>iA5r0Atw!E@=41+01G?I~+2N+^?4ueWzlRUBW5kdjBId&n1lw6U^2dwx9u
zNf{T!SUauC7IQ0O;vb9Imv2LC{_UL!OlGX0%1kR)HnCY3e;R168pu-}<gfQ00p#c2
zm4JkhmT5Ds>!A6@Qp`<CN?A2ov>YV8!jHj6Po{iC8DNP;cs7!jWpQ>XO3HXteg~er
zK6?MsWcGu7so+EoQOc&JYuxV2;;Dg;^c>_9KmN?hWcIK92`RNOS`oy16_V)qAUNkC
zAHDE(Kz?f@C8bctDvn5UTe|3Y2l>r^d=HWI+D6KjL}_U_1%0#u|3=0<GyM(3eE60G
zrs9%O%A<ek#=9XnMFV*fU)hQ0cmFF^)@Dc6Q)x*A@f_D8!abxXc>$KMNDL&8`t)}%
zAwGYYqGKb7GE7XXHa?;ItG(A%pP3%=`n8|$SfAZ-C>fhpGRsWFthUI^Yw_$_dl&@r
z;yY5A$psfyAUMmeYCJRuPB900?__(BOoNLxn6vIO^ZX!^_#EWlw+G3E5QO@8Z>6j@
zNud)R<erUfK`JV&x21D|-v@O_)r!PAbRC}OuWSJHVv3FpJS4RW)~wtX4)Tzv9^ru8
zo05}KtC-KrmW>gfX9udwImicYdIULiX@3fm5YDL%)`Wvz7^J0*)?CH&*$;fMSeuo5
zQiQ}LS_SZXJ>MsIV{%h6(^&Il2YTZ-9s_aiyE)k=DaEZpAeHkSP17bxC;C2V7oJ}{
zdAXR(KD0NXy=ZzbFnn7w>v9VpdB_*v|Gu4**?$hF1hqzJ%b>pNNa@Z29jiIWGY|f9
zbuv44N3!)2M#4v~YNgyP)xA02jOS~w?1I}LUr!bdV$dzX?ZHW1N7I;-I@-2g$2-*G
zas^WQ^uc7iASL3-w+o@RKR1Y`#_>hnVV>FZGxT2aP|8JwX{~$_uID?DhLV|Myd<&j
zZg5X~T$LhFp{(VCg>I>y>pgWPZ8X4Jd?WIO&o8adZofHg0B%~oi%CDoP#V4OF<1WZ
z43hpkYpH-T)F|cq?iRA#!uuZb_tSr)u{m%!MWNPGUpT`oQx)g>nS(M%c}%A3@%;Cv
zQ>5r`_ay{nMo5ZYF;GdpN&k7GgZ$Wm3)fC&_sfK!mQam!iiC%X)C)>#)+ha2JwNSN
zNcvm$BtTI&74Z=j8>Z(5HAzG{J>*wifnd7tKnl_jZHx+>2e+Rc3~{U?b8_Bv+xNjF
zmr1HBDI_TxyKd@0+MgfT(?-X8$P2Hnz@DdBs=Yz?>1$Hg-(1C_suaDT9!I^r3xYnL
zqCCO4vXW1$HkF|Lmj;6Nls5v>a!Vu;aXczFiKUYr<g*W1=zf@|%xM`X=;ph*18(2w
zDUaH+9p4+>cjLM5N`X{5@}89Em1Ir#-FhAs<otl~`bpbE?tJXVo$Ir&98R}(j9X?y
zYX{78g8*gGo_l<NqJ~ap_wG*3hD>wI(JGf=D1Lp+d&sx>?%~g$Ld?H%FxAvb0eg~Q
z&$8NL$KWBqeenwF$K5GoV#gq<VJxQRW<ArLgYT~u_v~&*x~(xKYs+ycAE%GH)sYyW
zHEHRflgq$RMiS{UT)rI2DvQp_m0`>UMo|v_|708XAWBfoBCZ|^G8)7P?uU;yJ!_XG
z3AMs<5@z8{Ntnc1&J1S297zx&R4Ev?><v3!5=)d{W_3}Lrw+H2AoV*z>O6Ij>QwaO
zdzl3yjli@K)(Ul=IvCLf14q=u%oz=0(Wgs*xNhpn0nzv{8C{$wiQx)EatO9O?=T<t
zP41NWxx*-@iqas7%kmBr8ZMTDm3EFKibzgam?T`431YCl4CTpXB_Pw-&<izh*wG-^
z07=dk%i?A?gQC+Bb)I)P5!BC8&V{8>hG99;ihELOw{CToVL>KJ847j6t~v*V8Bnzi
z`@cM*=CCYsa9Y=OPX(hvIAV?3FjQ1|l1NL4X5nm;M@n323O9%CdS0+W8~ka*iY+fr
z1fxPXOB<ITvBD75Y<}YQGz-L9v^fa-7Q|(NSc#@j2X*p_m+#jY=&_1xmn;yC)h|vH
zuFP+80Ahp`@v@U;6Ui`8k;rT=i|s*;ocpC_xj^xf0yAhcFS-Rla5=<TF3@8hpD2M<
zh}Q7{nH90hQfP$jaa}JU3Br|N_wE+)jwW2;6C}x_M~%&pR63LlSLcCf%XL38<w?S@
ztx)~NIm<hmQ)+|>%Za>T)7)^2u<vh{IUtz3VA@at^Y}!2hHVCfdI6hg!8L5st-bT=
zjyBlNQ5m)rd10@u-|y1-GCCH-5NgnIg?T5hcp1V7R0_G1*A*CwS}bXFvCEQ#V~MQe
z#=FXjSi=n_I1&$Uc{UOFP5SYz&ZkElli^_+m@U&L2v;lOsW%e@JfIP5)zHObL9Dk(
zW0M8C0*i4}STjFmjCHYPSUyq_hNe6hTZS44?{xdiJU+p?!59qJ<&daHtAjPF^gY{f
z8XYZ(Cd^NxRk=W0Zt;^?8eS)<sw{Pk=4h!0?{#H?7~y-ig>nZSj#B-Rv#rIecMZO<
zzMYL9MIX!JOxuO*XcUHnGg0w<qB0W&_fY@xRJ>nhX@s*smpZ;RROg{+2!o!hBigQU
z8;xR!RRXO}H7sFcQ7FL%#K&CgX;)>UApEr)T(GlH;2`3wSh)}i_pfG%tJb^YQILzA
z#aSWGN7#a3OlDCdm#2}$FaEMs4tZfbipKOKLd#Q?c_<h;uy|Y4E{0Km0xR7@v5SkB
zL{IAGYRFg=Mw9~W#v+o5f&qv^BUcXE{m~jRfq7fS(ZAb_jz<AYHC(p{?=tLWh%Y2n
zxd1aTsVQz@VIxvzYNWV7N5-{trghP<3&lfe%trAqfy8=L=V`>z5H|0I&m9-WqZo~o
ze{=ik*Q}#a7^Y?=uDWH7M&W33jSlgWX<dKL6-EMO5oUmcu+6h!4N&K007tN>=6LEB
zsMAmgwL}eGE<b(Lt^OY}&EBU>FaQ7_iwFb&00000{{{d;LjnLB00RI3000000001Q
Caw19q

diff --git a/src/umi_tools/umi_tools_dedup/test_data/sample.bam.bai b/src/umi_tools/umi_tools_dedup/test_data/sample.bam.bai
index bd87cde81a00c7699faed37ab04f88fc5f77bcfa..e9e2eee1d579c04b0b58bff33862d56ff931ff5d 100644
GIT binary patch
literal 2656
zcmZ>A^kfucU|?VcVnbmD21X#wz!1d*B8?;=wAUmC2Cx8BoPj}v4J2O+B9K5ZRDTy#
z-$N*E4Hbu(2b0f+io?{w%vA<E0%AAJEu-{kctDccXgG|911JPW)4^yu7)=L+@&T;Q
Wr4|jVyN4<bYikTu8j|OS4haD7sht1-

literal 889032
zcmeI*L8zQp6$kM9&CGk)G-;9sUeffDrpA|$1|CwWs1R(6SULph!h+cf1)(5FL7LKq
zA}-uW9bAb8p&M}#cY>=z7VRP}Asf++tD=JF#wrL6cyG@=&gX+HViTCj`+ble=FYv}
zOV9kz?={V1=GkYSAGS&<lltqx7fKmwE#<}buw3qSzKG}duE*ZCGp=hpt;ShZA0id{
zpbuQ&AqP0X0S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h0
z0S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K0
z2ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`2
z9N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8
zaDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0W
zzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h
z00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70
z103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<6rogA3f-|T!*
z%B0rIC0@H>h}zc1*L$BSWvC0e?`q%w^_^JH$9ewXvE|-8qegG_K_A*|$$=cmfjQs+
z2ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`2
z9N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8
zaDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0W
zzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IB@?vaJ59vRefmlE(daOCpp-x
zn}4xg%24ZzL-g)fz3!Zk^;`Y%3zK+$>s+kozI^^S7h*m4z18=>Hi`A2-zUG$d{Ih$
zs^og^-)R2=qwCv`EcfJIHC_vS(1$iEav%qCU=BFI0S<7019!)P{IiN0{nQ72Xy0Gt
zKn~=<9B_aG9N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`2
z9N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02RN`!4&-}fs*ziL(1$j^
zav%qCU=BFI0S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h0
z0S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K0
z2ROh14(wkCZVi!tRUeuIav%qCU=BFI0S<70103K02ROiiyXn9Sljx_a4=q=6AO~__
z4miL84sd`29N+*4IKTl8aDW3GxEBsg>rLm2QYN*&)&HO5d>XxOYvb$ssZxfz@UK1l
zE}e|^JfF}1;NbF}`J5WP)dziOvn2;|AP44v103K02ROh14sd`29N+*4IKTl8aDW3G
z-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$
zU{xH*_X<$MC4JC`=Cd5gfgG3v4sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h0
z0S<700~}a&2WB;-SI)%ehP>H7Z{)*Xzkeat^YL@DnCZWMI@a^^N%CBNeo3z9{`T1>
z<Lir$)XzHUUqg+a>VrPC8Ic1ykOOnT0S<70103K02ROh14sd`29N+*4IKTl8aDW3G
z-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$
zfCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h0
z0S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCKy8fj6h&fT|D8LphLx
zdzpg|yEkg&Ods^2&6^y^fgG3v4sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h0
z0S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h0fpv3WvxfGY
zkC!sk`qPb=;Xn8KkH=#DXR}!UTd(s)DU-VYx&Ao!Z?pjzT|Yk^=kwgt{k|Xd^?aOP
z_v-Q3pZCqLd-Cw|Yvprlye9gf4{hY+Kn~=<9B_aG9N+*4IKTl8aDW3G-~b0WzyS_$
zfCC)h00%h00S<70103K02ROh14sd`299Vw`@@Hq&m}h;^hvu3b$blS~0}gP2103K0
z2ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`2
z9N+*4IKTl8aDW3G-~b0WzyS_$U=<z6&t_92pZcH=ZBFGt4(=oe`B|1~<VGL%S0C~-
zT-A8p^+6xn_mA7<pgq^FS^brtr<|Xo{K01QZu;?#<v%W+iqA*ReHZ(_i|1qgjs7^#
z|9U6Zb6-CH&#hAO9%}SjAM~M3njFZ19GC+RaDW3G-~b0WzyS_$fCC)h00%h00S<70
z103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh1
z4sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROiiyX`=JW`G*L
z=z~5qXXQW+<iH$ofCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0W
zzyS_$fCC)h00%h00S<70103K02ROh14sd`299TUE-q?&>s`}98Qx4=n4$J`uIKTl8
zaDW3G-~b0WzyS_$fCC)h00%h00S<7019#%Uq;9_XXemRjAN0R_eDYJ#!}+3=_Wa)U
zS^EN`>qieP_vBtRdZ-Wj&_+cL<UkJ00S7q10S<70103K02ROh14sd`29N+*4IKTl8
zaDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0W
zzyS_$fCC)h00%h00S>IH1MR)V{_t45AKCBw`;c9DAZ9k-k1XGxEZ3(k(4*_UlW{)x
z{jlF>tG~}#KF<69<WMQOSB+llgFdv8k^?!A19QLu4sd`29N+*4IKTl8aDW3G-~b0W
zzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h
z00%h00S<701FPadzGsLUF6qN3>qEZxj2eB^2YqPY@8v)a<iH$ofCC)h00%h00S<70
z103K02ROh14sd`29N+*4IKTl8aDW3G-~b0Wu+9#&&oX=FSbTn2e!kiF_I&<XuK%vD
zPujom=sJEP&gZ$@cVnZJJgdf>=z~7A(UAi=kOOnDE)L}Pb~W762YqOM%YhuofjQs+
z2ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<7019#tnS#9dScjM>M
z&mV}4z2575QA+z6_W%BzdS7op3qQWTb}`Q9xf^}om$za)&*%L&FD&nw&#BRSeb9$C
zTXG-=a$pWPzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4?zIE?clc^}qYwJf
z+?4}4kOOnT0S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h0
z0S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K0
z2ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`2
z9N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8
zaDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0W
zzyS_$fCC)({~UPzK%7(cp*boCav%rhfCC)h00%h00S<70103K02ROh14sd`29N+*4
zIKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G
z-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$
zfCC)h00%h00S<70103MM`Z};tU%XdOl`_<Nqkm@Q&V!NZZ=H+ve0-&^&ljakM*Ghm
zjQz6~gwgfX`8c2FZuaw^@9X(E&;NaE`FqIUiyFPv2YqOxB?odK2j+kS9N+*4IKTl8
zaDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0W
zzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h
z00%h00S<701MA^He%6B;ua`dPL;L<E2XY_>=70kn-~b0WzyS_$fCC)h00%h00S<70
z103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh1
z4sd`29N+*4IKTl8-2V<-c`$OW>O-4%IgkT6Fb5pq00%h00S<701E17^oBhw3vl^yf
zZkICD`r5&mt@nHV(ur6<+aKri^F=9>x<A*?9Ex+(7R=Fgb~n!F{c_(QA6=f!d#KSv
zeb9$CLUJGna$pWPzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G
z-~b0WzyS_$fCC)h00(a8K>lpM8cyhgJ~S`oKn~=<9B_aG9N+*4IKTl8aDW3G-~b0W
zzyS_$fCC)h00%h00S<70103MM?HtJW#8krxeb9&Ir5wnC9GC+RaDW3G-~b0WzyS_$
zfCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h0
z0S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K0
z2ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`2
z9N+*4IKTl8aDW5%j|0>CVqSi#l%dw|9*S&V?{&T?Wm4DA>{)MX<LlkyaX$CueU6@4
zp3Udf=&?TNLz^KvkOMg|2OQu42ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h
z00%h00S<70103K02RN`Q4&=`OtKpJ9=tJ{a4&*=%%mD{DzyS_$fCC)h00%h00S<70
z103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh1
z4sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4
zIKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G
z-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$
zfCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h0
z0S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0Wz=8E~V57cR-*~c=q1JaEitKJ5
zj{5kqSigQK9-r#Z&(FqsK7J+F7o|+<eR6-^XVwBSx^8_o&gZ%Ox<^hg&*nYU=)FGZ
zLmMGEkOMhb{W-|*jq3mYw}XD@NApV#<UkJ00S7q10S<70103K02ROh14sd`29N+*4
zIKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G
z-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$
zfCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h0
z0S<70103K02ROh14sd`29N@ryci@$W!U0tunul^A2XbHzIKTl8aDW3G-~b0WzyS_$
zfCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h0
z0S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)Z4-UN3-<MU5yz7HL
zw7Hi9IgkT$zyS_$VEr7J))(#7-BN~Hw+}~#uk|`#lrkBupEwfFw=?5w_j7SR&*i>9
zonGEE@2y4;^+6xn49S5U$bmWF00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G
z-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$
zfCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h0
z0S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K0
z2ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`2
z9N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8
zaDW3GSQiJb9gcZb^`W^X2XY_>=70kn-~b0WzyS_$fCC)h00%h00S<70103K02ROh1
z4sd`29N+*4IKTl8+&c$aZ`(DkzvhclCbgb85}A9u*S0pkE<I7oP#1FF%YEOEcVj&t
z=lQR0FZbpdHF~TM`p{-e4&*=%%mD{DzyS_$fCC)h00%h00S<70103K02ROh14sd`2
z9N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8
zaDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0W
zzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h
z00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70
z103K02ROh14sd`299SI(UOp0eRP~|Fr5wnC9GC+RaDW3G-~b0WzyS_$fCC)h00%h0
z0S<70103K02ROh14sd`2>+is%zKG91T*^@E+x=%l9zGh`o-ayi&+lDtd&k%K?6f}d
zmveEyyw6K}?w9Mo?d$DpjIXae5$E$<?)&I?DS1|n*GM1qq0NpQ$blS~0}gP2103K0
z2ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`2
z9N+*4IKTl8aDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8
zaDW3G-~b0WzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0W
zzyS_$fCC)h00%h00S<70103K02ROh14sd`29N+*4IKTl8aDW3G-~b0WzyS_$fCC)h
z00%h00S<700~}aY2VOcF^Qr1Xb4m{6Kn~0S2ROh14sd`29N+*4IKTl8teXQZv+ep$
Gl=2^72Ijl~

diff --git a/src/umi_tools/umi_tools_dedup/test_data/script.sh b/src/umi_tools/umi_tools_dedup/test_data/script.sh
index 534c4af2..2253a0d1 100755
--- a/src/umi_tools/umi_tools_dedup/test_data/script.sh
+++ b/src/umi_tools/umi_tools_dedup/test_data/script.sh
@@ -2,6 +2,7 @@
 
 # Download test data
 wget https://github.com/CGATOxford/UMI-tools/releases/download/v0.2.3/example.bam
-samtools view -b -o sample.bam -s 0.00005 example.bam
-samtools index sample.bam > sample.bam.bai
+# extract 150 reads with a maximum of two reads having the same start position
+samtools view -h example.bam | head -n 150 | samtools view -bS - > sample.bam
+samtools index sample.bam
 rm example.bam
\ No newline at end of file

From 13e8703a2b9a502dfd5db8a181952d89cade8d4c Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Wed, 22 May 2024 15:20:20 +0200
Subject: [PATCH 06/12] test data files and changes to script

---
 src/umi_tools/umi_tools_dedup/config.vsh.yaml |  14 +++---
 src/umi_tools/umi_tools_dedup/script.sh       |   4 +-
 src/umi_tools/umi_tools_dedup/test.sh         |  42 +++++++++++-------
 .../umi_tools_dedup/test_data/deduped.bam     | Bin 840 -> 0 bytes
 .../umi_tools_dedup/test_data/deduped.sam     |  30 +++++++++++++
 .../test_data/deduped_fraction.sam            |  29 ++++++++++++
 6 files changed, 93 insertions(+), 26 deletions(-)
 delete mode 100644 src/umi_tools/umi_tools_dedup/test_data/deduped.bam
 create mode 100644 src/umi_tools/umi_tools_dedup/test_data/deduped.sam
 create mode 100644 src/umi_tools/umi_tools_dedup/test_data/deduped_fraction.sam

diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
index 0614cbb2..0c34c20a 100644
--- a/src/umi_tools/umi_tools_dedup/config.vsh.yaml
+++ b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
@@ -27,9 +27,6 @@ argument_groups:
       - name: --bai
         type: file
         description: BAM index
-      - name: --get_output_stats
-        type: boolean_true
-        description: Generate output stats. 
       - name: --random_seed
         type: integer
         description: |
@@ -55,9 +52,10 @@ argument_groups:
           use of the template length to determine reads with the same mapping
           coordinates.
       - name: --output_stats
-        type: file
-        description: Directory containing UMI based deduplication statistics files
-        direction: output
+        type: string
+        description: |
+          Generate files containing UMI based deduplication statistics files with this prefix
+          in the file names.
       - name: --extract_umi_method
         type: string
         description: |
@@ -228,7 +226,7 @@ argument_groups:
         description: |
           Ignore the UMI and group reads using mapping coordinates only.
       - name: --subset
-        type: boolean_true
+        type: double
         description: |
           Only consider a fraction of the reads, chosen at random. This is useful
           for doing saturation analyses.
@@ -269,7 +267,7 @@ argument_groups:
         alternatives: -v
         type: integer
         description: Log level. The higher, the more output.
-        default: 1
+        default: 0
       - name: --error
         alternatives: -E
         type: file
diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh
index 8aa89d10..cb5b563a 100644
--- a/src/umi_tools/umi_tools_dedup/script.sh
+++ b/src/umi_tools/umi_tools_dedup/script.sh
@@ -19,13 +19,11 @@ test_dir="${metal_executable}/test_data"
 [[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi
 [[ "$par_subset" == "false" ]] && unset par_subset
 [[ "$par_log2stderr" == "false" ]] && unset par_log2stderr
-[[ "$par_get_output_stats" == "false" ]] && unset par_get_output_stats
 [[ "$par_read_length" == "false" ]] && unset par_read_length
 
 umi_tools dedup \
     --stdin "$par_input" \
     ${par_in_sam:+--in-sam} \
-    ${par_get_output_stats:+--get-output-stats} \
     -S "$par_output" \
     ${par_out_sam:+--out-sam} \
     ${par_paired:+--paired} \
@@ -56,7 +54,7 @@ umi_tools dedup \
     ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \
     ${par_unapired_reads:+--unapired-reads "$par_unapired_reads"} \
     ${par_ignore_umi:+--ignore-umi} \
-    ${par_subset:+--subset} \
+    ${par_subset:+--subset "$par_subset"} \
     ${par_chrom:+--chrom "$par_chrom"} \
     ${par_no_sort_output:+--no-sort-output} \
     ${par_buffer_whole_contig:+--buffer-whole-contig} \
diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh
index 1b5a9053..db5563bc 100644
--- a/src/umi_tools/umi_tools_dedup/test.sh
+++ b/src/umi_tools/umi_tools_dedup/test.sh
@@ -13,39 +13,51 @@ echo ">>> Test 1: Basic usage of $meta_functionality_name with statistics output
   --paired \
   --input "$test_dir/sample.bam" \
   --bai "$test_dir/sample.bam.bai" \
-  --output "$out_dir/deduped.bam" \
-  --output_stats deduped \
+  --output "$out_dir/deduped.sam" \
+  --out_sam \
+  --output_stats "$out_dir/dedup" \
   --random_seed 1
 
 echo ">>> Checking whether output exists"
-[ ! -f "$out_dir/deduped.bam" ] && echo "File 'deduped.bam' does not exist!" && exit 1
-[ ! -f "$out_dir/deduped_edit_distance.tsv" ] && echo "File 'deduped_edit_distance.tsv' does not exist!" && exit 1
+[ ! -f "$out_dir/deduped.sam" ] && echo "File 'deduped.sam' does not exist!" && exit 1
+[ ! -f "$out_dir/dedup_edit_distance.tsv" ] && echo "File 'dedup_edit_distance.tsv' does not exist!" && exit 1
 
 echo ">>> Checking whether output is non-empty"
-[ ! -s "$out_dir/deduped.bam" ] && echo "File 'deduped.bam' is empty!" && exit 1
-[ ! -s "$out_dir/deduped_edit_distance.tsv" ] && echo "File 'deduped_edit_distance.tsv' is empty!" && exit 1
+[ ! -s "$out_dir/deduped.sam" ] && echo "File 'deduped.sam' is empty!" && exit 1
+[ ! -s "$out_dir/dedup_edit_distance.tsv" ] && echo "File 'dedup_edit_distance.tsv' is empty!" && exit 1
 
 echo ">>> Checking whether output is correct"
-diff "$out_dir/deduped.bam" "$test_dir/deduped.bam" || \
-    (echo "Output file deduped.bam does not match expected output" && exit 1)
-diff "$out_dir/deduped_edit_distance.tsv" "$test_dir/deduped_edit_distance.tsv" || \
-    (echo "Output file deduped_edit_distance.tsv does not match expected output" && exit 1)
+diff "$out_dir/deduped.sam" "$test_dir/deduped.sam" || \
+    (echo "Output file deduped.sam does not match expected output" && exit 1)
+diff "$out_dir/dedup_edit_distance.tsv" "$test_dir/dedup_edit_distance.tsv" || \
+    (echo "Output file dedup_edit_distance.tsv does not match expected output" && exit 1)
 
 ############################################################################################
 
-echo ">>> Test 2: $meta_functionality_name"
+echo ">>> Test 2: $meta_functionality_name with random subset selection"
 
 "$meta_executable" \
   --paired \
   --input "$test_dir/sample.bam" \
   --bai "$test_dir/sample.bam.bai" \
-  --output "$out_dir/deduped.bam" \
-  --random_seed 1 \
+  --output "$out_dir/deduped_fraction.sam" \
+  --out_sam \
+  --subset 0.5 \
+  --random_seed 1
 
 
 echo ">>> Checking whether output exists"
-[ ! -f "$out_dir/deduped.bam" ] && echo "File 'deduped.bam' does not exist!" && exit 1
-[]
+[ ! -f "$out_dir/deduped_fraction.sam" ] && echo "File 'deduped_fraction.sam' does not exist!" && exit 1
+
+echo ">>> Checking whether output is non-empty"
+[ ! -s "$out_dir/deduped_fraction.sam" ] && echo "File 'deduped_fraction.sam' is empty!" && exit 1
+
+echo ">>> Checking whether output is correct"
+diff "$out_dir/deduped_fraction.sam" "$test_dir/deduped_fraction.sam" || \
+    (echo "Output file deduped_fraction.sam does not match expected output" && exit 1)
+
+############################################################################################
+
 rm -rf "$out_dir"
 
 echo "All tests succeeded!"
diff --git a/src/umi_tools/umi_tools_dedup/test_data/deduped.bam b/src/umi_tools/umi_tools_dedup/test_data/deduped.bam
deleted file mode 100644
index a82e6c81a47c472acd45c379a42b294f31966e87..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 840
zcmV-O1GoGiiwFb&00000{{{d;LjnL?0)3LXZxcZjhaW^q*w9fxJ4ypmSnu4&BIOo`
z6bFMPNCeF~-oz`iS!-?C^fU-jQ1TxnQbdm^fS#HLAtXu)C@7$1y_d!*C@jtQKF^!?
zo7wfX=DEi+0IRn*od?@KX<(;)*WXK$v>P1-!*F4>ea~rc`+L2VI9u5tj3P#GMs0y}
zLya+v3SpaK4<(v2Eu?K|X{b<wVcW36V47_rQ|Iu)2x7RF+%_UNm~q36nlf@?jAq0T
zF?Ec}g`kQ`L2RLOft8||O$jp*k}yvB)Wk&;O%&HyD@znfgb~Il%`8y_5kfMY1+_$x
zLr}pK6~+>I48n~d3N!l*Rqdc$2>XmY24O@ot(Bd_6t~E+l0@0}+suW`-q9-V6h$co
zrpA83-7*}`o;-W=)-C7GroWzyhf!F3Nu)uYjV*t%lh=^zj?<vu4^z|$2GO32v&yiS
zhCz3LIC4jbA~!}PD?dR5dC~sBi=#A6Qtu!<O5$+f6^*<&HpuISBX7U!NtSgP1o2`u
zK(-zx$>G3hZCmTs^kAl#tnz3Sg=6IQQ1~>6`-fqp6U1lxHOSeH_4<E3J8k4HTn2#I
z>^J#_;N;K8i*qGlxccJ5d_@Fa;f<Oo_{)Ax3|@DaYU0qpKdMQ<vRIm{NJ8Vr*V&2`
ze0u+;CJif>qM8hRJAX1$4n|?T_IaTM2FF{^oDw)(dv~(}0jJNeR3PDJ`r|?g3Y>?v
z(eU^Bm-!M5yj%?{Xu|1AeH%OQ_WQ4D&n`SV{tf>C@RhXekOTk#ABzYC000000RIL6
zLPG)o&H#nbJr06E5Cz}?FJNM=H_+@4%a2+(jtMqcuu~W<IDqHS!mAh$;4QonE(;-{
zkyp($UtXStHlO7h=Czy!F!g`krMYccDq#|nnUxVni8!v1D9@Q0K}-P7NMUifM2Zy~
zI29Cf$KIb?Sk#*_R%*v4Mpi`FxWupvdRHnG6nocp?)ANQb**#ZpPObvMp{*v`?^R2
zmLrsUz&>6_$KJ+Rsie?R>Q-RC0>L-zSwRWvw(Spq_KL7w0RR9WiwFb&00000{{{d;
SLjnLB00RI3000000000640s#>

diff --git a/src/umi_tools/umi_tools_dedup/test_data/deduped.sam b/src/umi_tools/umi_tools_dedup/test_data/deduped.sam
new file mode 100644
index 00000000..cce2efb4
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/test_data/deduped.sam
@@ -0,0 +1,30 @@
+@HD	VN:1.0	SO:coordinate
+@SQ	SN:chr1	LN:197195432
+@SQ	SN:chr10	LN:129993255
+@SQ	SN:chr11	LN:121843856
+@SQ	SN:chr12	LN:121257530
+@SQ	SN:chr13	LN:120284312
+@SQ	SN:chr14	LN:125194864
+@SQ	SN:chr15	LN:103494974
+@SQ	SN:chr16	LN:98319150
+@SQ	SN:chr17	LN:95272651
+@SQ	SN:chr18	LN:90772031
+@SQ	SN:chr19	LN:61342430
+@SQ	SN:chr2	LN:181748087
+@SQ	SN:chr3	LN:159599783
+@SQ	SN:chr4	LN:155630120
+@SQ	SN:chr5	LN:152537259
+@SQ	SN:chr6	LN:149517037
+@SQ	SN:chr7	LN:152524553
+@SQ	SN:chr8	LN:131738871
+@SQ	SN:chr9	LN:124076172
+@SQ	SN:chrM	LN:16299
+@SQ	SN:chrX	LN:166650296
+@SQ	SN:chrY	LN:15902555
+@PG	ID:Bowtie	VN:1.1.2	CL:"bowtie --wrapper basic-0 --threads 4 -v 2 -m 10 -k 1 /ifs/mirror/genomes/bowtie/mm9 /dev/fd/63 --sam"
+@PG	ID:samtools	PN:samtools	PP:Bowtie	VN:1.19.2	CL:samtools view -h example.bam
+@PG	ID:samtools.1	PN:samtools	PP:samtools	VN:1.19.2	CL:samtools view -bS -
+SRR2057595.5052066_ACCGGTTTA	16	chr1	3812794	255	51M	*	0	0	*	*	XA:i:2	MD:Z:42T2T5	NM:i:2
+SRR2057595.13520751_CCAGGTTCT	16	chr1	3967622	255	20M	*	0	0	*	*	XA:i:2	MD:Z:12A0C6	NM:i:2
+SRR2057595.8901432_AGCGGTTAC	0	chr1	4369756	255	20M	*	0	0	*	*	XA:i:2	MD:Z:1T4A13	NM:i:2
+SRR2057595.1210348_ACTGGTTTC	0	chr1	4762503	255	45M	*	0	0	*	*	XA:i:2	MD:Z:0C7A36	NM:i:2
diff --git a/src/umi_tools/umi_tools_dedup/test_data/deduped_fraction.sam b/src/umi_tools/umi_tools_dedup/test_data/deduped_fraction.sam
new file mode 100644
index 00000000..cf9e651a
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/test_data/deduped_fraction.sam
@@ -0,0 +1,29 @@
+@HD	VN:1.0	SO:coordinate
+@SQ	SN:chr1	LN:197195432
+@SQ	SN:chr10	LN:129993255
+@SQ	SN:chr11	LN:121843856
+@SQ	SN:chr12	LN:121257530
+@SQ	SN:chr13	LN:120284312
+@SQ	SN:chr14	LN:125194864
+@SQ	SN:chr15	LN:103494974
+@SQ	SN:chr16	LN:98319150
+@SQ	SN:chr17	LN:95272651
+@SQ	SN:chr18	LN:90772031
+@SQ	SN:chr19	LN:61342430
+@SQ	SN:chr2	LN:181748087
+@SQ	SN:chr3	LN:159599783
+@SQ	SN:chr4	LN:155630120
+@SQ	SN:chr5	LN:152537259
+@SQ	SN:chr6	LN:149517037
+@SQ	SN:chr7	LN:152524553
+@SQ	SN:chr8	LN:131738871
+@SQ	SN:chr9	LN:124076172
+@SQ	SN:chrM	LN:16299
+@SQ	SN:chrX	LN:166650296
+@SQ	SN:chrY	LN:15902555
+@PG	ID:Bowtie	VN:1.1.2	CL:"bowtie --wrapper basic-0 --threads 4 -v 2 -m 10 -k 1 /ifs/mirror/genomes/bowtie/mm9 /dev/fd/63 --sam"
+@PG	ID:samtools	PN:samtools	PP:Bowtie	VN:1.19.2	CL:samtools view -h example.bam
+@PG	ID:samtools.1	PN:samtools	PP:samtools	VN:1.19.2	CL:samtools view -bS -
+SRR2057595.4062788_ACCGGTTTA	16	chr1	3812793	255	52M	*	0	0	*	*	XA:i:2	MD:Z:43T2T5	NM:i:2
+SRR2057595.8901432_AGCGGTTAC	0	chr1	4369756	255	20M	*	0	0	*	*	XA:i:2	MD:Z:1T4A13	NM:i:2
+SRR2057595.1999468_ACTGGTTTC	0	chr1	4762503	255	45M	*	0	0	*	*	XA:i:2	MD:Z:0C7A36	NM:i:2

From 74ae973b5eba7a870c9fe128e422c97c8a778285 Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Thu, 23 May 2024 16:21:34 +0200
Subject: [PATCH 07/12] Add third test and test data

---
 CHANGELOG.md                                  |  2 +-
 src/umi_tools/umi_tools_dedup/config.vsh.yaml |  2 +-
 src/umi_tools/umi_tools_dedup/script.sh       |  4 +--
 src/umi_tools/umi_tools_dedup/test.sh         | 23 ++++++++++++++
 .../test_data/deduped_unique.sam              | 31 +++++++++++++++++++
 5 files changed, 58 insertions(+), 4 deletions(-)
 create mode 100644 src/umi_tools/umi_tools_dedup/test_data/deduped_unique.sam

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ceb53ac5..bbb4ef4e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -43,7 +43,7 @@
     - `samtools/samtools_stats`: Reports alignment summary statistics for a BAM file (PR #39).
 
 * `umitools`:
-    - `umitools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #XXX).
+    - `umitools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #54).
 
 ## MAJOR CHANGES
 
diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
index 0c34c20a..cc5113e8 100644
--- a/src/umi_tools/umi_tools_dedup/config.vsh.yaml
+++ b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
@@ -35,7 +35,7 @@ argument_groups:
   - name: Outputs
     arguments:
       - name: --output
-        alternatives: -S
+        alternatives: --stdout
         type: file
         description: Deduplicated BAM file
         required: true
diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh
index cb5b563a..1e227811 100644
--- a/src/umi_tools/umi_tools_dedup/script.sh
+++ b/src/umi_tools/umi_tools_dedup/script.sh
@@ -24,7 +24,7 @@ test_dir="${metal_executable}/test_data"
 umi_tools dedup \
     --stdin "$par_input" \
     ${par_in_sam:+--in-sam} \
-    -S "$par_output" \
+    --stdout "$par_output" \
     ${par_out_sam:+--out-sam} \
     ${par_paired:+--paired} \
     ${par_output_stats:+--output-stats "$par_output_stats"} \
@@ -52,7 +52,7 @@ umi_tools dedup \
     ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \
     ${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \
     ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \
-    ${par_unapired_reads:+--unapired-reads "$par_unapired_reads"} \
+    ${par_unpaired_reads:+--unapired-reads "$par_unapired_reads"} \
     ${par_ignore_umi:+--ignore-umi} \
     ${par_subset:+--subset "$par_subset"} \
     ${par_chrom:+--chrom "$par_chrom"} \
diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh
index db5563bc..adadb410 100644
--- a/src/umi_tools/umi_tools_dedup/test.sh
+++ b/src/umi_tools/umi_tools_dedup/test.sh
@@ -58,6 +58,29 @@ diff "$out_dir/deduped_fraction.sam" "$test_dir/deduped_fraction.sam" || \
 
 ############################################################################################
 
+echo ">>> Test 3: $meta_functionality_name with --method unique"
+
+"$meta_executable" \
+  --paired \
+  --input "$test_dir/sample.bam" \
+  --bai "$test_dir/sample.bam.bai" \
+  --output "$out_dir/deduped_unique.sam" \
+  --out_sam \
+  --method "unique" \
+  --random_seed 1
+
+echo ">>> Checking whether output exists"
+[ ! -f "$out_dir/deduped_unique.sam" ] && echo "File 'deduped_unique.sam' does not exist!" && exit 1
+
+echo ">>> Checking whether output is non-empty"
+[ ! -s "$out_dir/deduped_unique.sam" ] && echo "File 'deduped_unique.sam' is empty!" && exit 1
+
+echo ">>> Checking whether output is correct"
+diff "$out_dir/deduped_unique.sam" "$test_dir/deduped_unique.sam" || \
+    (echo "Output file deduped_unique.sam does not match expected output" && exit 1)
+
+############################################################################################
+
 rm -rf "$out_dir"
 
 echo "All tests succeeded!"
diff --git a/src/umi_tools/umi_tools_dedup/test_data/deduped_unique.sam b/src/umi_tools/umi_tools_dedup/test_data/deduped_unique.sam
new file mode 100644
index 00000000..570ea153
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/test_data/deduped_unique.sam
@@ -0,0 +1,31 @@
+@HD	VN:1.0	SO:coordinate
+@SQ	SN:chr1	LN:197195432
+@SQ	SN:chr10	LN:129993255
+@SQ	SN:chr11	LN:121843856
+@SQ	SN:chr12	LN:121257530
+@SQ	SN:chr13	LN:120284312
+@SQ	SN:chr14	LN:125194864
+@SQ	SN:chr15	LN:103494974
+@SQ	SN:chr16	LN:98319150
+@SQ	SN:chr17	LN:95272651
+@SQ	SN:chr18	LN:90772031
+@SQ	SN:chr19	LN:61342430
+@SQ	SN:chr2	LN:181748087
+@SQ	SN:chr3	LN:159599783
+@SQ	SN:chr4	LN:155630120
+@SQ	SN:chr5	LN:152537259
+@SQ	SN:chr6	LN:149517037
+@SQ	SN:chr7	LN:152524553
+@SQ	SN:chr8	LN:131738871
+@SQ	SN:chr9	LN:124076172
+@SQ	SN:chrM	LN:16299
+@SQ	SN:chrX	LN:166650296
+@SQ	SN:chrY	LN:15902555
+@PG	ID:Bowtie	VN:1.1.2	CL:"bowtie --wrapper basic-0 --threads 4 -v 2 -m 10 -k 1 /ifs/mirror/genomes/bowtie/mm9 /dev/fd/63 --sam"
+@PG	ID:samtools	PN:samtools	PP:Bowtie	VN:1.19.2	CL:samtools view -h example.bam
+@PG	ID:samtools.1	PN:samtools	PP:samtools	VN:1.19.2	CL:samtools view -bS -
+SRR2057595.5052066_ACCGGTTTA	16	chr1	3812794	255	51M	*	0	0	*	*	XA:i:2	MD:Z:42T2T5	NM:i:2
+SRR2057595.13520751_CCAGGTTCT	16	chr1	3967622	255	20M	*	0	0	*	*	XA:i:2	MD:Z:12A0C6	NM:i:2
+SRR2057595.8901432_AGCGGTTAC	0	chr1	4369756	255	20M	*	0	0	*	*	XA:i:2	MD:Z:1T4A13	NM:i:2
+SRR2057595.1210348_ACTGGTTTC	0	chr1	4762503	255	45M	*	0	0	*	*	XA:i:2	MD:Z:0C7A36	NM:i:2
+SRR2057595.1169423_TCTGGTTTC	0	chr1	4762503	255	45M	*	0	0	*	*	XA:i:2	MD:Z:0C7A36	NM:i:2

From 0d53a94fc9f018446beaf611097c77f48f794d19 Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Thu, 23 May 2024 16:25:21 +0200
Subject: [PATCH 08/12] Fix typo in script

---
 src/umi_tools/umi_tools_dedup/script.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh
index 1e227811..d57a5e76 100644
--- a/src/umi_tools/umi_tools_dedup/script.sh
+++ b/src/umi_tools/umi_tools_dedup/script.sh
@@ -52,7 +52,7 @@ umi_tools dedup \
     ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \
     ${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \
     ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \
-    ${par_unpaired_reads:+--unapired-reads "$par_unapired_reads"} \
+    ${par_unpaired_reads:+--unpaired-reads "$par_unpaired_reads"} \
     ${par_ignore_umi:+--ignore-umi} \
     ${par_subset:+--subset "$par_subset"} \
     ${par_chrom:+--chrom "$par_chrom"} \

From 9e65970646daeca4535c293474d07b5b05331b23 Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Thu, 23 May 2024 16:50:43 +0200
Subject: [PATCH 09/12] remove utf8 characters in config

---
 src/umi_tools/umi_tools_dedup/config.vsh.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
index cc5113e8..7c54369a 100644
--- a/src/umi_tools/umi_tools_dedup/config.vsh.yaml
+++ b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
@@ -115,21 +115,21 @@ argument_groups:
         description: |
           Causes two reads that start in the same position on the same strand
           and having the same UMI to be considered unique if one is spliced 
-          and the other is not. (Uses the ‘N’ cigar operation to test for splicing).
+          and the other is not. (Uses the 'N' cigar operation to test for splicing).
       - name: --soft_clip_threshold
         type: integer
         description: |
           Mappers that soft clip will sometimes do so rather than mapping a
           spliced read if there is only a small overhang over the exon junction.
           By setting this option, you can treat reads with at least this many
-          bases soft-clipped at the 3’ end as spliced.
+          bases soft-clipped at the 3' end as spliced.
         default: 4
       - name: --multimapping_detection_method
         type: string
         description: |
           If the sam/bam contains tags to identify multimapping reads, you can
           specify for use when selecting the best read at a given loci. Supported
-          tags are “NH”, “X0” and “XT”. If not specified, the read with the highest
+          tags are "NH", "X0" and "XT". If not specified, the read with the highest
           mapping quality will be selected.
       - name: --read_length
         type: boolean_true
@@ -161,7 +161,7 @@ argument_groups:
         description: |
           Use in conjunction with the --assigned_status_tag option to skip any reads
           where the tag matches this regex. Default ("^[__|Unassigned]") matches
-          anything which starts with “__” or “Unassigned”.
+          anything which starts with "__" or "Unassigned".
       - name: --per_contig
         type: boolean_true
         description: |

From 474109eb7976741024fb3e4391c7eee769ee9509 Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Mon, 1 Jul 2024 17:31:37 +0100
Subject: [PATCH 10/12] Add choices fields and change default fields to
 examples

---
 src/umi_tools/umi_tools_dedup/config.vsh.yaml | 82 ++++++++++---------
 1 file changed, 45 insertions(+), 37 deletions(-)

diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
index 7c54369a..5b03a6e5 100644
--- a/src/umi_tools/umi_tools_dedup/config.vsh.yaml
+++ b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
@@ -58,10 +58,14 @@ argument_groups:
           in the file names.
       - name: --extract_umi_method
         type: string
+        choices: ["read_id", "tag", "umis"]
         description: |
           Specify the method by which the barcodes were encoded in the read.
-          The options are: [read_id, tag, umis].
-        default: read_id
+          The options are:
+            * read_id (default) 
+            * tag
+            * umis
+        example: "read_id"
       - name: --umi_tag
         type: string
         description: |
@@ -71,8 +75,8 @@ argument_groups:
         type: string
         description: |
           The separator used to separate the UMI from the read sequence. 
-          This is only required if the extract_umi_method is set to id_read.
-        default: '_'
+          This is only required if the extract_umi_method is set to id_read. ['_']
+        example: '_'
       - name: --umi_tag_split
         type: string
         description: |
@@ -99,17 +103,23 @@ argument_groups:
     arguments:    
       - name: --method
         type: string
-        description: |
-          The method to use for grouping reads. The options are: 
-          [unique, percentile, cluster, adjacency, directional].
-        default: directional
+        choices: ["unique", "percentile", "cluster", "adjacency", "directional"]
+        description: |
+          The method to use for grouping reads. 
+          The options are: 
+            * unique
+            * percentile
+            * cluster
+            * adjacency
+            * directional (default)
+        example: "directional"
       - name: --edit_distance_threshold
         type: integer
         description: |
           For the adjacency and cluster methods the threshold for the edit 
           distance to connect two UMIs in the network can be increased. The 
-          default value of 1 works best unless the UMI is very long (>14bp).
-        default: 1
+          default value of 1 works best unless the UMI is very long (>14bp). [1]
+        example: 1
       - name: --spliced_is_unique
         type: boolean_true
         description: |
@@ -122,8 +132,8 @@ argument_groups:
           Mappers that soft clip will sometimes do so rather than mapping a
           spliced read if there is only a small overhang over the exon junction.
           By setting this option, you can treat reads with at least this many
-          bases soft-clipped at the 3' end as spliced.
-        default: 4
+          bases soft-clipped at the 3' end as spliced. [4]
+        example: 4
       - name: --multimapping_detection_method
         type: string
         description: |
@@ -188,39 +198,37 @@ argument_groups:
       - name: --mapping_quality
         type: integer
         description: |
-          Minimium mapping quality (MAPQ) for a read to be retained.
-        default: 0
+          Minimium mapping quality (MAPQ) for a read to be retained. [0]
+        example: 0
       - name: --unmapped_reads
         type: string
         description: |
           How unmapped reads should be handled. 
           The options are:
-          "discard": Discard all unmapped reads.
-          "use":     If read2 is unmapped, deduplicate using read1 only. 
-                     Requires --paired.
-          "output":  Output unmapped reads/read pairs without UMI 
-                     grouping/deduplication. Only available in umi_tools group.
-        default: discard
+            * "discard": Discard all unmapped reads. (default)
+            * "use":     If read2 is unmapped, deduplicate using read1 only. Requires --paired.
+            * "output":  Output unmapped reads/read pairs without UMI grouping/deduplication. Only available in umi_tools group.
+        example: "discard"
       - name: --chimeric_pairs
         type: string
+        choices: ["discard", "use", "output"]
         description: |
           How chimeric pairs should be handled. 
           The options are:
-          "discard": Discard all chimeric read pairs.
-          "use":     Deduplicate using read1 only.
-          "output":  Output chimeric pairs without UMI grouping/deduplication. 
-                     Only available in umi_tools group.
-        default: use
+            * "discard": Discard all chimeric read pairs.
+            * "use":     Deduplicate using read1 only. (default)
+            * "output":  Output chimeric pairs without UMI grouping/deduplication. Only available in umi_tools group.
+        example: "use"
       - name: --unpaired_reads
         type: string
+        choices: ["discard", "use", "output"]
         description: |
           How unpaired reads should be handled. 
-          The options are:
-          "discard": Discard all unpaired reads.
-          "use":     Deduplicate using read1 only.
-          "output":  Output unpaired reads without UMI grouping/deduplication. 
-                     Only available in umi_tools group.
-        default: use
+          The options are: 
+            * "discard": Discard all unmapped reads.
+            * "use": If read2 is unmapped, deduplicate using read1 only. Requires --paired. (default)
+            * "output":  Output unmapped reads/read pairs without UMI grouping/deduplication. Only available in umi_tools group.
+        example: "use"
       - name: --ignore_umi
         type: boolean_true
         description: |
@@ -266,8 +274,8 @@ argument_groups:
       - name: --verbose
         alternatives: -v
         type: integer
-        description: Log level. The higher, the more output.
-        default: 0
+        description: Log level. The higher, the more output. [0]
+        example: 0
       - name: --error
         alternatives: -E
         type: file
@@ -279,15 +287,15 @@ argument_groups:
       - name: --compresslevel
         type: integer
         description: |
-          Level of Gzip compression to use. Default=6 matches GNU gzip rather than python gzip default.
-        default: 6
+          Level of Gzip compression to use. Default=6 matches GNU gzip rather than python gzip default. [6]
+        example: 6
       - name: --timeit
         type: file
         description: Store timing information in file.
       - name: --timeit_name
         type: string
-        description: Name in timing file for this class of jobs.
-        default: "all"
+        description: Name in timing file for this class of jobs. [all]
+        example: "all"
       - name: --timeit_header
         type: string
         description: Add header for timing information.

From cebfead0795eddc06b0e8f4a7c79269ab75b1ed8 Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Mon, 1 Jul 2024 17:36:57 +0100
Subject: [PATCH 11/12] Minor formatting changes

---
 src/umi_tools/umi_tools_dedup/config.vsh.yaml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
index 5b03a6e5..50a5d624 100644
--- a/src/umi_tools/umi_tools_dedup/config.vsh.yaml
+++ b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
@@ -58,7 +58,7 @@ argument_groups:
           in the file names.
       - name: --extract_umi_method
         type: string
-        choices: ["read_id", "tag", "umis"]
+        choices: [read_id, tag, umis]
         description: |
           Specify the method by which the barcodes were encoded in the read.
           The options are:
@@ -75,7 +75,7 @@ argument_groups:
         type: string
         description: |
           The separator used to separate the UMI from the read sequence. 
-          This is only required if the extract_umi_method is set to id_read. ['_']
+          This is only required if the extract_umi_method is set to id_read. [_]
         example: '_'
       - name: --umi_tag_split
         type: string
@@ -103,7 +103,7 @@ argument_groups:
     arguments:    
       - name: --method
         type: string
-        choices: ["unique", "percentile", "cluster", "adjacency", "directional"]
+        choices: [unique, percentile, cluster, adjacency, directional]
         description: |
           The method to use for grouping reads. 
           The options are: 
@@ -211,7 +211,7 @@ argument_groups:
         example: "discard"
       - name: --chimeric_pairs
         type: string
-        choices: ["discard", "use", "output"]
+        choices: [discard, use, output]
         description: |
           How chimeric pairs should be handled. 
           The options are:
@@ -221,7 +221,7 @@ argument_groups:
         example: "use"
       - name: --unpaired_reads
         type: string
-        choices: ["discard", "use", "output"]
+        choices: [discard, use, output]
         description: |
           How unpaired reads should be handled. 
           The options are: 

From b4fe24c270dc6e00277a2718228e16f9e0d3da66 Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Fri, 5 Jul 2024 14:07:52 +0100
Subject: [PATCH 12/12] md formatting changes in config

---
 src/umi_tools/umi_tools_dedup/config.vsh.yaml | 159 ++++++++----------
 1 file changed, 71 insertions(+), 88 deletions(-)

diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
index 50a5d624..a02e70a1 100644
--- a/src/umi_tools/umi_tools_dedup/config.vsh.yaml
+++ b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
@@ -22,40 +22,37 @@ argument_groups:
       - name: --in_sam
         type: boolean_true
         description: |
-          By default, inputs are assumed to be in BAM format. Use this options 
-          to specify the use of SAM format for input.
+          By default, inputs are assumed to be in BAM format. Use this option to specify the use of SAM
+          format for input.
       - name: --bai
         type: file
         description: BAM index
       - name: --random_seed
         type: integer
-        description: |
-          Random seed to initialize number generator with.
+        description: Random seed to initialize number generator with.
 
   - name: Outputs
     arguments:
       - name: --output
         alternatives: --stdout
         type: file
-        description: Deduplicated BAM file
+        description: Deduplicated BAM file.
         required: true
         direction: output
       - name: --out_sam
         type: boolean_true
         description: |
-          By default, outputa are written in BAM format. Use this options to 
-          specify the use of SAM format for output.
+          By default, outputs are written in BAM format. Use this option to specify the use of SAM format
+          for output.
       - name: --paired
         type: boolean_true
         description: |
-          BAM is paired end - output both read pairs. This will also force the
-          use of the template length to determine reads with the same mapping
-          coordinates.
+          BAM is paired end - output both read pairs. This will also force the use of the template length
+          to determine reads with the same mapping coordinates.
       - name: --output_stats
         type: string
         description: |
-          Generate files containing UMI based deduplication statistics files with this prefix
-          in the file names.
+          Generate files containing UMI-based deduplication statistics, with this prefix in the file names.
       - name: --extract_umi_method
         type: string
         choices: [read_id, tag, umis]
@@ -69,35 +66,30 @@ argument_groups:
       - name: --umi_tag
         type: string
         description: |
-          The tag containing the UMI sequence. 
-          This is only required if the extract_umi_method is set to tag.
+          The tag containing the UMI sequence. This is only required if the extract_umi_method is set to tag.
       - name: --umi_separator
         type: string
         description: |
-          The separator used to separate the UMI from the read sequence. 
-          This is only required if the extract_umi_method is set to id_read. [_]
+          The separator used to separate the UMI from the read sequence. This is only required if the
+          extract_umi_method is set to read_id. Default: `_`.
         example: '_'
       - name: --umi_tag_split
         type: string
-        description: |
-          Separate the UMI in tag by <SPLIT> and take the first element.
+        description: Separate the UMI in tag by <SPLIT> and take the first element.
       - name: --umi_tag_delimiter
         type: string
-        description: |
-          Separate the UMI in by <DELIMITER> and concatenate the elements
+        description: Separate the UMI in tag by <DELIMITER> and concatenate the elements.
       - name: --cell_tag
         type: string
         description: |
-          The tag containing the cell barcode sequence. 
-          This is only required if the extract_umi_method is set to tag.
+          The tag containing the cell barcode sequence. This is only required if the extract_umi_method
+          is set to tag.
       - name: --cell_tag_split
         type: string
-        description: |
-          Separate the cell barcode in tag by <SPLIT> and take the first element.
+        description: Separate the cell barcode in tag by <SPLIT> and take the first element.
       - name: --cell_tag_delimiter
         type: string
-        description: |
-          Separate the cell barcode in by <DELIMITER> and concatenate the elements
+        description: Separate the cell barcode in tag by <DELIMITER> and concatenate the elements.
   
   - name: Grouping Options
     arguments:    
@@ -116,89 +108,80 @@ argument_groups:
       - name: --edit_distance_threshold
         type: integer
         description: |
-          For the adjacency and cluster methods the threshold for the edit 
-          distance to connect two UMIs in the network can be increased. The 
-          default value of 1 works best unless the UMI is very long (>14bp). [1]
+          For the adjacency and cluster methods the threshold for the edit distance to connect two
+          UMIs in the network can be increased. The default value of 1 works best unless the UMI is
+          very long (>14bp). Default: `1`.
         example: 1
       - name: --spliced_is_unique
         type: boolean_true
         description: |
-          Causes two reads that start in the same position on the same strand
-          and having the same UMI to be considered unique if one is spliced 
-          and the other is not. (Uses the 'N' cigar operation to test for splicing).
+          Causes two reads that start in the same position on the same strand and having the same UMI
+          to be considered unique if one is spliced and the other is not. (Uses the 'N' cigar operation
+          to test for splicing).
       - name: --soft_clip_threshold
         type: integer
         description: |
-          Mappers that soft clip will sometimes do so rather than mapping a
-          spliced read if there is only a small overhang over the exon junction.
-          By setting this option, you can treat reads with at least this many
-          bases soft-clipped at the 3' end as spliced. [4]
+          Mappers that soft clip will sometimes do so rather than mapping a spliced read if there is only
+          a small overhang over the exon junction. By setting this option, you can treat reads with at
+          least this many bases soft-clipped at the 3' end as spliced. Default: `4`.
         example: 4
       - name: --multimapping_detection_method
         type: string
         description: |
-          If the sam/bam contains tags to identify multimapping reads, you can
-          specify for use when selecting the best read at a given loci. Supported
-          tags are "NH", "X0" and "XT". If not specified, the read with the highest
-          mapping quality will be selected.
+          If the sam/bam contains tags to identify multimapping reads, you can specify which tag to use when
+          selecting the best read at a given locus. Supported tags are `NH`, `X0` and `XT`. If not specified,
+          the read with the highest mapping quality will be selected.
       - name: --read_length
         type: boolean_true
-        description: |
-          Use the read length as a criteria when deduping, for e.g sRNA-Seq.
+        description: Use the read length as a criterion when deduping, e.g. for sRNA-Seq.
   
   - name: Single-cell RNA-Seq Options
     arguments:
       - name: --per_gene
         type: boolean_true
         description: |
-          Reads will be grouped together if they have the same gene. This is useful
-          if your library prep generates PCR duplicates with non identical alignment
-          positions such as CEL-Seq. Note this option is hardcoded to be on with the
-          count command. I.e counting is always performed per-gene. Must be combined
-          with either --gene_tag or --per_contig option.
+          Reads will be grouped together if they have the same gene. This is useful if your library prep
+          generates PCR duplicates with non identical alignment positions such as CEL-Seq. Note this option
+          is hardcoded to be on with the count command. I.e. counting is always performed per-gene. Must be
+          combined with either --gene_tag or --per_contig option.
       - name: --gene_tag
         type: string
         description: |
-          Deduplicate per gene. The gene information is encoded in the bam read tag
-          specified.
+          Deduplicate per gene. The gene information is encoded in the bam read tag specified.
       - name: --assigned_status_tag
         type: string
         description: |
-          BAM tag which describes whether a read is assigned to a gene. Defaults to
-          the same value as given for --gene_tag.
+          BAM tag which describes whether a read is assigned to a gene. Defaults to the same value as given
+          for --gene_tag.
       - name: --skip_tags_regex
         type: string
         description: |
-          Use in conjunction with the --assigned_status_tag option to skip any reads
-          where the tag matches this regex. Default ("^[__|Unassigned]") matches
-          anything which starts with "__" or "Unassigned".
+          Use in conjunction with the --assigned_status_tag option to skip any reads where the tag matches
+          this regex. Default ("^[__|Unassigned]") matches anything which starts with "__" or "Unassigned".
       - name: --per_contig
         type: boolean_true
         description: |
-          Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same
-          contig will be considered to have the same alignment position. This is
-          useful if you have aligned to a reference transcriptome with one
-          transcript per gene. If you have aligned to a transcriptome with more
-          than one transcript per gene, you can supply a map between transcripts
-          and gene using the --gene_transcript_map option.
+          Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same contig will be considered to
+          have the same alignment position. This is useful if you have aligned to a reference transcriptome
+          with one transcript per gene. If you have aligned to a transcriptome with more than one transcript
+          per gene, you can supply a map between transcripts and gene using the --gene_transcript_map option.
       - name: --gene_transcript_map
         type: file
         description: |
-          A file containing a mapping between gene names and transcript names.
-          The file should be tab separated with the gene name in the first column
-          and the transcript name in the second column.
+          A file containing a mapping between gene names and transcript names. The file should be tab
+          separated with the gene name in the first column and the transcript name in the second column.
       - name: --per_cell
         type: boolean_true
         description: |
-          Reads will only be grouped together if they have the same cell barcode.
-          Can be combined with --per_gene.
+          Reads will only be grouped together if they have the same cell barcode. Can be combined with
+          --per_gene.
   
   - name: SAM/BAM Options
     arguments:
       - name: --mapping_quality
         type: integer
         description: |
-          Minimium mapping quality (MAPQ) for a read to be retained. [0]
+          Minimum mapping quality (MAPQ) for a read to be retained. Default: `0`.
         example: 0
       - name: --unmapped_reads
         type: string
@@ -217,7 +200,8 @@ argument_groups:
           The options are:
             * "discard": Discard all chimeric read pairs.
             * "use":     Deduplicate using read1 only. (default)
-            * "output":  Output chimeric pairs without UMI grouping/deduplication. Only available in umi_tools group.
+            * "output":  Output chimeric pairs without UMI grouping/deduplication. Only available in
+                         umi_tools group.
         example: "use"
       - name: --unpaired_reads
         type: string
@@ -227,42 +211,38 @@ argument_groups:
           The options are: 
             * "discard": Discard all unmapped reads.
             * "use": If read2 is unmapped, deduplicate using read1 only. Requires --paired. (default)
-            * "output":  Output unmapped reads/read pairs without UMI grouping/deduplication. Only available in umi_tools group.
+            * "output":  Output unmapped reads/read pairs without UMI grouping/deduplication. Only available
+                         in umi_tools group.
         example: "use"
       - name: --ignore_umi
         type: boolean_true
-        description: |
-          Ignore the UMI and group reads using mapping coordinates only.
+        description: Ignore the UMI and group reads using mapping coordinates only.
       - name: --subset
         type: double
         description: |
-          Only consider a fraction of the reads, chosen at random. This is useful
-          for doing saturation analyses.
+          Only consider a fraction of the reads, chosen at random. This is useful for doing saturation
+          analyses.
       - name: --chrom
         type: string
-        description: |
-          Only consider a single chromosome. This is useful for debugging/testing 
-          purposes.
+        description: Only consider a single chromosome. This is useful for debugging/testing purposes.
   
   - name: Group/Dedup Options
     arguments:
       - name: --no_sort_output
         type: boolean_true
         description: |
-          By default, output is sorted. This involves the use of a temporary unsorted
-          file (saved in --temp_dir). Use this option to turn off sorting.
+          By default, output is sorted. This involves the use of a temporary unsorted file (saved in
+          --temp_dir). Use this option to turn off sorting.
       - name: --buffer_whole_contig
         type: boolean_true
         description: |
-          Forces dedup to parse an entire contig before yielding any reads for
-          deduplication. This is the only way to absolutely guarantee that all reads
-          with the same start position are grouped together for deduplication since
-          dedup uses the start position of the read, not the alignment coordinate on
-          which the reads are sorted. However, by default, dedup reads for another
-          1000bp before outputting read groups which will avoid any reads being missed
-          with short read sequencing (<1000bp).
+          Forces dedup to parse an entire contig before yielding any reads for deduplication. This is the
+          only way to absolutely guarantee that all reads with the same start position are grouped together
+          for deduplication since dedup uses the start position of the read, not the alignment coordinate on
+          which the reads are sorted. However, by default, dedup reads for another 1000bp before outputting
+          read groups which will avoid any reads being missed with short read sequencing (<1000bp).
   
-  - name: Common UMI-tools Options
+  - name: Common Options
     arguments:
       - name: --log
         alternatives: -L
@@ -274,7 +254,8 @@ argument_groups:
       - name: --verbose
         alternatives: -v
         type: integer
-        description: Log level. The higher, the more output. [0]
+        description: |
+          Log level. The higher, the more output. Default: `0`.
         example: 0
       - name: --error
         alternatives: -E
@@ -287,14 +268,16 @@ argument_groups:
       - name: --compresslevel
         type: integer
         description: |
-          Level of Gzip compression to use. Default=6 matches GNU gzip rather than python gzip default. [6]
+          Level of Gzip compression to use. Default=6 matches GNU gzip rather than python gzip default.
+          Default: `6`.
         example: 6
       - name: --timeit
         type: file
         description: Store timing information in file.
       - name: --timeit_name
         type: string
-        description: Name in timing file for this class of jobs. [all]
+        description: |
+          Name in timing file for this class of jobs. Default: `all`.
         example: "all"
       - name: --timeit_header
         type: string