From 38f586bec0ac9e4312b016e29c3aa0bd53f292b2 Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Thu, 11 Apr 2024 11:04:14 +0100
Subject: [PATCH 1/6] initial commit dedup

---
 CHANGELOG.md                                  |   3 +
 src/umi_tools/umi_tools_dedup/config.vsh.yaml | 279 ++++++++++++++++++
 src/umi_tools/umi_tools_dedup/help.txt        |  13 +
 src/umi_tools/umi_tools_dedup/script.sh       |  65 ++++
 src/umi_tools/umi_tools_dedup/test.sh         |  49 +++
 5 files changed, 409 insertions(+)
 create mode 100644 src/umi_tools/umi_tools_dedup/config.vsh.yaml
 create mode 100644 src/umi_tools/umi_tools_dedup/help.txt
 create mode 100644 src/umi_tools/umi_tools_dedup/script.sh
 create mode 100644 src/umi_tools/umi_tools_dedup/test.sh

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4fd7f001..1bef9345 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -39,6 +39,9 @@
     - `samtools/flagstat`: Counts the number of alignments in SAM/BAM/CRAM files for each FLAG type (PR #31).
     - `samtools/idxstats`: Reports alignment summary statistics for a SAM/BAM/CRAM file (PR #32).
 
+* `umi_tools`:
+    - `umi_tools/umi_tools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #).
+    
 ## MAJOR CHANGES
 
 ## MINOR CHANGES
diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
new file mode 100644
index 00000000..75306541
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
@@ -0,0 +1,279 @@
+name: umi_tools_dedup  # matches the component directory umi_tools/umi_tools_dedup
+namespace: umi_tools
+description: |
+  Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read.
+keywords: [umi_tools, deduplication, dedup]
+links:
+  homepage: https://umi-tools.readthedocs.io/en/latest/
+  documentation: https://umi-tools.readthedocs.io/en/latest/reference/dedup.html
+  # Common options: https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options
+  repository: https://github.com/CGATOxford/UMI-tools
+references:
+  doi: 10.1101/gr.209601.116
+license: MIT
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: --input
+        alternatives: -I
+        type: file
+        description: Input BAM or SAM file. Use --in_sam to specify SAM format.
+        required: true
+      - name: --in_sam
+        type: boolean_true
+        description: |
+          By default, inputs are assumed to be in BAM format. Use this option
+          to specify the use of SAM format for input.
+      - name: --bai
+        type: file
+        description: BAM index
+      - name: --get_output_stats
+        type: boolean
+        description: Whether or not to generate output stats.
+      - name: --random_seed
+        type: integer
+        description: |
+          Random seed to initialize number generator with.
+        required: false  # optional; stays unset unless the user supplies a seed
+
+  - name: Outputs
+    arguments:
+      - name: --output
+        alternatives: -S
+        type: file
+        description: Deduplicated BAM file
+        required: true
+        direction: output
+      - name: --out_sam
+        type: boolean_true
+        description: |
+          By default, outputs are written in BAM format. Use this option to
+          specify the use of SAM format for output.
+      - name: --paired
+        type: boolean_true
+        description: |
+          BAM is paired end - output both read pairs. This will also force the
+          use of the template length to determine reads with the same mapping
+          coordinates.
+      - name: --output_stats
+        type: file
+        description: Directory containing UMI based deduplication statistics files
+        direction: output
+      - name: --extract_umi_method
+        type: string
+        description: |
+          Specify the method by which the barcodes were encoded in the read.
+          The options are: [read_id, tag, umis].
+        default: read_id
+      - name: --umi_tag
+        type: string
+        description: |
+          The tag containing the UMI sequence. 
+          This is only required if the extract_umi_method is set to tag.
+      - name: --umi_separator
+        type: string
+        description: |
+          The separator used to separate the UMI from the read sequence. 
+          This is only required if the extract_umi_method is set to read_id.
+        default: '_'
+      - name: --umi_tag_split
+        type: string
+        description: |
+          Separate the UMI in tag by <SPLIT> and take the first element.
+      - name: --umi_tag_delimiter
+        type: string
+        description: |
+          Separate the UMI in tag by <DELIMITER> and concatenate the elements.
+      - name: --cell_tag
+        type: string
+        description: |
+          The tag containing the cell barcode sequence. 
+          This is only required if the extract_umi_method is set to tag.
+      - name: --cell_tag_split
+        type: string
+        description: |
+          Separate the cell barcode in tag by <SPLIT> and take the first element.
+      - name: --cell_tag_delimiter
+        type: string
+        description: |
+          Separate the cell barcode in tag by <DELIMITER> and concatenate the elements.
+  
+  - name: Grouping Options
+    arguments:    
+      - name: --method
+        type: string
+        description: |
+          The method to use for grouping reads. The options are: 
+          [unique, percentile, cluster, adjacency, directional].
+        default: directional
+      - name: --edit_distance_threshold
+        type: integer
+        description: |
+          For the adjacency and cluster methods the threshold for the edit 
+          distance to connect two UMIs in the network can be increased. The 
+          default value of 1 works best unless the UMI is very long (>14bp).
+        default: 1
+      - name: --spliced_is_unique
+        type: boolean_true
+        description: |
+          Causes two reads that start in the same position on the same strand
+          and having the same UMI to be considered unique if one is spliced 
+          and the other is not. (Uses the ‘N’ cigar operation to test for splicing).
+      - name: --soft_clip_threshold
+        type: integer
+        description: |
+          Mappers that soft clip will sometimes do so rather than mapping a
+          spliced read if there is only a small overhang over the exon junction.
+          By setting this option, you can treat reads with at least this many
+          bases soft-clipped at the 3’ end as spliced.
+        default: 4
+      - name: --multimapping_detection_method
+        type: string
+        description: |
+          If the sam/bam contains tags to identify multimapping reads, you can
+          specify for use when selecting the best read at a given loci. Supported
+          tags are “NH”, “X0” and “XT”. If not specified, the read with the highest
+          mapping quality will be selected.
+      - name: --read_length
+        type: integer
+        description: |
+          Use the read length as a criteria when deduping, for e.g sRNA-Seq.
+  
+  - name: Single-cell RNA-Seq Options
+    arguments:
+      - name: --per_gene
+        type: boolean_true
+        description: |
+          Reads will be grouped together if they have the same gene. This is useful
+          if your library prep generates PCR duplicates with non identical alignment
+          positions such as CEL-Seq. Note this option is hardcoded to be on with the
+          count command. I.e counting is always performed per-gene. Must be combined
+          with either --gene_tag or --per_contig option.
+      - name: --gene_tag
+        type: string
+        description: |
+          Deduplicate per gene. The gene information is encoded in the bam read tag
+          specified.
+      - name: --assigned_status_tag
+        type: string
+        description: |
+          BAM tag which describes whether a read is assigned to a gene. Defaults to
+          the same value as given for --gene_tag.
+      - name: --skip_tags_regex
+        type: string
+        description: |
+          Use in conjunction with the --assigned_status_tag option to skip any reads
+          where the tag matches this regex. Default ("^[__|Unassigned]") matches
+          anything which starts with “__” or “Unassigned”.
+      - name: --per_contig
+        type: boolean_true
+        description: |
+          Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same
+          contig will be considered to have the same alignment position. This is
+          useful if you have aligned to a reference transcriptome with one
+          transcript per gene. If you have aligned to a transcriptome with more
+          than one transcript per gene, you can supply a map between transcripts
+          and gene using the --gene_transcript_map option.
+      - name: --gene_transcript_map
+        type: file
+        description: |
+          A file containing a mapping between gene names and transcript names.
+          The file should be tab separated with the gene name in the first column
+          and the transcript name in the second column.
+      - name: --per_cell
+        type: boolean_true
+        description: |
+          Reads will only be grouped together if they have the same cell barcode.
+          Can be combined with --per_gene.
+  
+  - name: SAM/BAM Options
+    arguments:
+      - name: --mapping_quality
+        type: integer
+        description: |
+          Minimum mapping quality (MAPQ) for a read to be retained.
+        default: 0
+      - name: --unmapped_reads
+        type: string
+        description: |
+          How unmapped reads should be handled. 
+          The options are:
+          "discard": Discard all unmapped reads.
+          "use":     If read2 is unmapped, deduplicate using read1 only. 
+                     Requires --paired.
+          "output":  Output unmapped reads/read pairs without UMI 
+                     grouping/deduplication. Only available in umi_tools group.
+        default: discard
+      - name: --chimeric_pairs
+        type: string
+        description: |
+          How chimeric pairs should be handled. 
+          The options are:
+          "discard": Discard all chimeric read pairs.
+          "use":     Deduplicate using read1 only.
+          "output":  Output chimeric pairs without UMI grouping/deduplication. 
+                     Only available in umi_tools group.
+        default: use
+      - name: --unapired_reads
+        type: string
+        description: |
+          How unpaired reads should be handled. 
+          The options are:
+          "discard": Discard all unpaired reads.
+          "use":     Deduplicate using read1 only.
+          "output":  Output unpaired reads without UMI grouping/deduplication. 
+                     Only available in umi_tools group.
+        default: use
+      - name: --ignore_umi
+        type: boolean_true
+        description: |
+          Ignore the UMI and group reads using mapping coordinates only.
+      - name: --subset
+        type: boolean_true
+        description: |
+          Only consider a fraction of the reads, chosen at random. This is useful
+          for doing saturation analyses.
+      - name: --chrom
+        type: string
+        description: |
+          Only consider a single chromosome. This is useful for debugging/testing 
+          purposes.
+  
+  - name: Group/Dedup Options
+    arguments:
+      - name: --no_sort_output
+        type: boolean_true
+        description: |
+          By default, output is sorted. This involves the use of a temporary unsorted
+          file (saved in --temp-dir). Use this option to turn off sorting.
+      - name: --buffer_whole_contig
+        type: boolean_true
+        description: |
+          Forces dedup to parse an entire contig before yielding any reads for
+          deduplication. This is the only way to absolutely guarantee that all reads
+          with the same start position are grouped together for deduplication since
+          dedup uses the start position of the read, not the alignment coordinate on
+          which the reads are sorted. However, by default, dedup reads for another
+          1000bp before outputting read groups which will avoid any reads being missed
+          with short read sequencing (<1000bp).
+
+
+resources:
+  - type: bash_script
+    path: script.sh
+test_resources:
+  - type: bash_script
+    path: test.sh
+  - type: file
+    path: test_data
+engines:
+  - type: docker
+    image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1
+    setup:
+      - type: docker
+        run: |
+            umi_tools -v | sed 's/ version//g' > /var/software_versions.txt
+runners:
+- type: executable
+- type: nextflow
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt
new file mode 100644
index 00000000..d3c8fa44
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/help.txt
@@ -0,0 +1,13 @@
+```
+umi_tools dedup
+```
+
+dedup - Deduplicate reads using UMI and mapping coordinates
+
+Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM]
+
+       note: If --stdout is ommited, standard out is output. To
+             generate a valid BAM file on standard out, please
+             redirect log with --log=LOGFILE or --log2stderr 
+
+For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh
new file mode 100644
index 00000000..57c01258
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/script.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+
+set -e
+
+# Boolean parameters that are switched off arrive as the string "false";
+# unset them so the ${par:+--flag} expansions below emit nothing for them.
+[[ "$par_get_output_stats" == "false" ]] && unset par_get_output_stats
+[[ "$par_paired" == "false" ]] && unset par_paired
+[[ "$par_in_sam" == "false" ]] && unset par_in_sam
+[[ "$par_out_sam" == "false" ]] && unset par_out_sam
+[[ "$par_spliced_is_unique" == "false" ]] && unset par_spliced_is_unique
+[[ "$par_per_gene" == "false" ]] && unset par_per_gene
+[[ "$par_per_contig" == "false" ]] && unset par_per_contig
+[[ "$par_per_cell" == "false" ]] && unset par_per_cell
+[[ "$par_no_sort_output" == "false" ]] && unset par_no_sort_output
+[[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig
+[[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi
+[[ "$par_subset" == "false" ]] && unset par_subset
+
+umi_tools dedup \
+    -I "$par_input" \
+    ${par_in_sam:+--in-sam} \
+    ${par_bai:+--bai "$par_bai"} \
+    ${par_get_output_stats:+--get-output-stats} \
+    ${par_random_seed:+--random-seed "$par_random_seed"} \
+    -S "$par_output" \
+    ${par_out_sam:+--out-sam} \
+    ${par_paired:+--paired} \
+    ${par_output_stats:+--output-stats "$par_output_stats"} \
+    ${par_extract_umi_method:+--extract-umi-method "$par_extract_umi_method"} \
+    ${par_umi_tag:+--umi-tag "$par_umi_tag"} \
+    ${par_umi_separator:+--umi-separator "$par_umi_separator"} \
+    ${par_umi_tag_split:+--umi-tag-split "$par_umi_tag_split"} \
+    ${par_umi_tag_delimiter:+--umi-tag-delimiter "$par_umi_tag_delimiter"} \
+    ${par_cell_tag:+--cell-tag "$par_cell_tag"} \
+    ${par_cell_tag_split:+--cell-tag-split "$par_cell_tag_split"} \
+    ${par_cell_tag_delimiter:+--cell-tag-delimiter "$par_cell_tag_delimiter"} \
+    ${par_method:+--method "$par_method"} \
+    ${par_edit_distance_threshold:+--edit-distance-threshold "$par_edit_distance_threshold"} \
+    ${par_spliced_is_unique:+--spliced-is-unique} \
+    ${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \
+    ${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \
+    ${par_read_length:+--read-length "$par_read_length"} \
+    ${par_per_gene:+--per-gene} \
+    ${par_gene_tag:+--gene-tag "$par_gene_tag"} \
+    ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \
+    ${par_skip_tags_regex:+--skip-tags-regex "$par_skip_tags_regex"} \
+    ${par_per_contig:+--per-contig} \
+    ${par_gene_transcript_map:+--gene-transcript-map "$par_gene_transcript_map"} \
+    ${par_per_cell:+--per-cell} \
+    ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \
+    ${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \
+    ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \
+    ${par_unapired_reads:+--unpaired-reads "$par_unapired_reads"} \
+    ${par_ignore_umi:+--ignore-umi} \
+    ${par_subset:+--subset} \
+    ${par_chrom:+--chrom "$par_chrom"} \
+    ${par_no_sort_output:+--no-sort-output} \
+    ${par_buffer_whole_contig:+--buffer-whole-contig}
+
+
+exit 0
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh
new file mode 100644
index 00000000..1459ec08
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/test.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+set -e
+test_dir="${meta_resources_dir}/test_data"
+echo ">>> Testing $meta_functionality_name"
+# NOTE(review): names keep an .idxstats suffix although --output is a deduplicated BAM; confirm against test_data.
+"$meta_executable" \
+  --input "$test_dir/a.sorted.bam" \
+  --bai "$test_dir/a.sorted.bam.bai" \
+  --output "$test_dir/a.sorted.idxstats"
+
+echo ">>> Checking whether output exists"
+[ ! -f "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' does not exist!" && exit 1
+
+echo ">>> Checking whether output is non-empty"
+[ ! -s "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' is empty!" && exit 1
+
+echo ">>> Checking whether output is correct"
+diff "$test_dir/a.sorted.idxstats" "$test_dir/a_ref.sorted.idxstats" || \
+    { echo "Output file a.sorted.idxstats does not match expected output"; exit 1; }
+
+rm "$test_dir/a.sorted.idxstats"
+
+############################################################################################
+
+echo ">>> Testing $meta_functionality_name with singletons in the input"
+
+"$meta_executable" \
+  --input "$test_dir/test.paired_end.sorted.bam" \
+  --bai "$test_dir/test.paired_end.sorted.bam.bai" \
+  --output "$test_dir/test.paired_end.sorted.idxstats"
+
+echo ">>> Checking whether output exists"
+[ ! -f "$test_dir/test.paired_end.sorted.idxstats" ] && \
+    echo "File 'test.paired_end.sorted.idxstats' does not exist!" && exit 1
+
+echo ">>> Checking whether output is non-empty"
+[ ! -s "$test_dir/test.paired_end.sorted.idxstats" ] && \
+    echo "File 'test.paired_end.sorted.idxstats' is empty!" && exit 1
+
+echo ">>> Checking whether output is correct"
+diff "$test_dir/test.paired_end.sorted.idxstats" "$test_dir/test_ref.paired_end.sorted.idxstats" || \
+    { echo "Output file test.paired_end.sorted.idxstats does not match expected output"; exit 1; }
+
+rm "$test_dir/test.paired_end.sorted.idxstats"
+
+############################################################################################
+
+echo "All tests succeeded!"
+exit 0
\ No newline at end of file

From 2c269682620a407803e528652198646435ef2c03 Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Thu, 11 Apr 2024 11:38:57 +0100
Subject: [PATCH 2/6] Revert "initial commit dedup"

This reverts commit 38f586bec0ac9e4312b016e29c3aa0bd53f292b2.
---
 CHANGELOG.md                                  |   3 -
 src/umi_tools/umi_tools_dedup/config.vsh.yaml | 279 ------------------
 src/umi_tools/umi_tools_dedup/help.txt        |  13 -
 src/umi_tools/umi_tools_dedup/script.sh       |  65 ----
 src/umi_tools/umi_tools_dedup/test.sh         |  49 ---
 5 files changed, 409 deletions(-)
 delete mode 100644 src/umi_tools/umi_tools_dedup/config.vsh.yaml
 delete mode 100644 src/umi_tools/umi_tools_dedup/help.txt
 delete mode 100644 src/umi_tools/umi_tools_dedup/script.sh
 delete mode 100644 src/umi_tools/umi_tools_dedup/test.sh

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1bef9345..4fd7f001 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -39,9 +39,6 @@
     - `samtools/flagstat`: Counts the number of alignments in SAM/BAM/CRAM files for each FLAG type (PR #31).
     - `samtools/idxstats`: Reports alignment summary statistics for a SAM/BAM/CRAM file (PR #32).
 
-* `umi_tools`:
-    - `umi_tools/umi_tools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #).
-    
 ## MAJOR CHANGES
 
 ## MINOR CHANGES
diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
deleted file mode 100644
index 75306541..00000000
--- a/src/umi_tools/umi_tools_dedup/config.vsh.yaml
+++ /dev/null
@@ -1,279 +0,0 @@
-name: umi_tool_dedup
-namespace: umi_tools
-description: |
-  Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read.
-keywords: [umi_tools, deduplication, dedup]
-links:
-  homepage: https://umi-tools.readthedocs.io/en/latest/
-  documentation: [ https://umi-tools.readthedocs.io/en/latest/reference/dedup.html,
-                   https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options ]
-  repository: https://github.com/CGATOxford/UMI-tools
-references: 
-  doi: 10.1101/gr.209601.116
-license: MIT
-
-argument_groups:
-  - name: Inputs
-    arguments:
-      - name: --input
-        alternatives: -I
-        type: file
-        description: Input BAM or SAM file. Use --in_sam to specify SAM format.
-        required: true
-      - name: --in_sam
-        type: boolean_true
-        description: |
-          By default, inputs are assumed to be in BAM format. Use this options 
-          to specify the use of SAM format for input.
-      - name: --bai
-        type: file
-        description: BAM index
-      - name: --get_output_stats
-        type: boolean
-        description: Whether or not to generate output stats. 
-      - name: --random_seed
-        type: integer
-        description: |
-          Random seed to initialize number generator with.
-        default: none
-
-  - name: Outputs
-    arguments:
-      - name: --output
-        alternatives: -S
-        type: file
-        description: Deduplicated BAM file
-        required: true
-        direction: output
-      - name: --out_sam
-        type: boolean_true
-        description: |
-          By default, outputa are written in BAM format. Use this options to 
-          specify the use of SAM format for output.
-      - name: --paired
-        type: boolean_true
-        description: |
-          BAM is paired end - output both read pairs. This will also force the
-          use of the template length to determine reads with the same mapping
-          coordinates.
-      - name: --output_stats
-        type: file
-        description: Directory containing UMI based deduplication statistics files
-        direction: output
-      - name: --extract_umi_method
-        type: string
-        description: |
-          Specify the method by which the barcodes were encoded in the read.
-          The options are: [read_id, tag, umis].
-        default: read_id
-      - name: --umi_tag
-        type: string
-        description: |
-          The tag containing the UMI sequence. 
-          This is only required if the extract_umi_method is set to tag.
-      - name: --umi_separator
-        type: string
-        description: |
-          The separator used to separate the UMI from the read sequence. 
-          This is only required if the extract_umi_method is set to id_read.
-        default: '_'
-      - name: --umi_tag_split
-        type: string
-        description: |
-          Separate the UMI in tag by <SPLIT> and take the first element.
-      - name: --umi_tag_delimiter
-        type: string
-        description: |
-          Separate the UMI in by <DELIMITER> and concatenate the elements
-      - name: --cell_tag
-        type: string
-        description: |
-          The tag containing the cell barcode sequence. 
-          This is only required if the extract_umi_method is set to tag.
-      - name: --cell_tag_split
-        type: string
-        description: |
-          Separate the cell barcode in tag by <SPLIT> and take the first element.
-      - name: --cell_tag_delimiter
-        type: string
-        description: |
-          Separate the cell barcode in by <DELIMITER> and concatenate the elements
-  
-  - name: Grouping Options
-    arguments:    
-      - name: --method
-        type: string
-        description: |
-          The method to use for grouping reads. The options are: 
-          [unique, percentile, cluster, adjacency, directional].
-        default: directional
-      - name: --edit_distance_threshold
-        type: integer
-        description: |
-          For the adjacency and cluster methods the threshold for the edit 
-          distance to connect two UMIs in the network can be increased. The 
-          default value of 1 works best unless the UMI is very long (>14bp).
-        default: 1
-      - name: --spliced_is_unique
-        type: boolean_true
-        description: |
-          Causes two reads that start in the same position on the same strand
-          and having the same UMI to be considered unique if one is spliced 
-          and the other is not. (Uses the ‘N’ cigar operation to test for splicing).
-      - name: --soft_clip_threshold
-        type: integer
-        description: |
-          Mappers that soft clip will sometimes do so rather than mapping a
-          spliced read if there is only a small overhang over the exon junction.
-          By setting this option, you can treat reads with at least this many
-          bases soft-clipped at the 3’ end as spliced.
-        default: 4
-      - name: --multimapping_detection_method
-        type: string
-        description: |
-          If the sam/bam contains tags to identify multimapping reads, you can
-          specify for use when selecting the best read at a given loci. Supported
-          tags are “NH”, “X0” and “XT”. If not specified, the read with the highest
-          mapping quality will be selected.
-      - name: --read_length
-        type: integer
-        description: |
-          Use the read length as a criteria when deduping, for e.g sRNA-Seq.
-  
-  - name: Single-cell RNA-Seq Options
-    arguments:
-      - name: --per_gene
-        type: boolean_true
-        description: |
-          Reads will be grouped together if they have the same gene. This is useful
-          if your library prep generates PCR duplicates with non identical alignment
-          positions such as CEL-Seq. Note this option is hardcoded to be on with the
-          count command. I.e counting is always performed per-gene. Must be combined
-          with either --gene_tag or --per_contig option.
-      - name: --gene_tag
-        type: string
-        description: |
-          Deduplicate per gene. The gene information is encoded in the bam read tag
-          specified.
-      - name: --assigned_status_tag
-        type: string
-        description: |
-          BAM tag which describes whether a read is assigned to a gene. Defaults to
-          the same value as given for --gene_tag.
-      - name: --skip_tags_regex
-        type: string
-        description: |
-          Use in conjunction with the --assigned_status_tag option to skip any reads
-          where the tag matches this regex. Default ("^[__|Unassigned]") matches
-          anything which starts with “__” or “Unassigned”.
-      - name: --per_contig
-        type: boolean_true
-        description: |
-          Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same
-          contig will be considered to have the same alignment position. This is
-          useful if you have aligned to a reference transcriptome with one
-          transcript per gene. If you have aligned to a transcriptome with more
-          than one transcript per gene, you can supply a map between transcripts
-          and gene using the --gene_transcript_map option.
-      - name: --gene_transcript_map
-        type: file
-        description: |
-          A file containing a mapping between gene names and transcript names.
-          The file should be tab separated with the gene name in the first column
-          and the transcript name in the second column.
-      - name: --per_cell
-        type: boolean_true
-        description: |
-          Reads will only be grouped together if they have the same cell barcode.
-          Can be combined with --per_gene.
-  
-  - name: SAM/BAM Options
-    arguments:
-      - name: --mapping_quality
-        type: integer
-        description: |
-          Minimium mapping quality (MAPQ) for a read to be retained.
-        default: 0
-      - name: --unmapped_reads
-        type: string
-        description: |
-          How unmapped reads should be handled. 
-          The options are:
-          "discard": Discard all unmapped reads.
-          "use":     If read2 is unmapped, deduplicate using read1 only. 
-                     Requires --paired.
-          "output":  Output unmapped reads/read pairs without UMI 
-                     grouping/deduplication. Only available in umi_tools group.
-        default: discard
-      - name: --chimeric_pairs
-        type: string
-        description: |
-          How chimeric pairs should be handled. 
-          The options are:
-          "discard": Discard all chimeric read pairs.
-          "use":     Deduplicate using read1 only.
-          "output":  Output chimeric pairs without UMI grouping/deduplication. 
-                     Only available in umi_tools group.
-        default: use
-      - name: --unapired_reads
-        type: string
-        description: |
-          How unpaired reads should be handled. 
-          The options are:
-          "discard": Discard all unpaired reads.
-          "use":     Deduplicate using read1 only.
-          "output":  Output unpaired reads without UMI grouping/deduplication. 
-                     Only available in umi_tools group.
-        default: use
-      - name: --ignore_umi
-        type: boolean_true
-        description: |
-          Ignore the UMI and group reads using mapping coordinates only.
-      - name: --subset
-        type: boolean_true
-        description: |
-          Only consider a fraction of the reads, chosen at random. This is useful
-          for doing saturation analyses.
-      - name: --chrom
-        type: string
-        description: |
-          Only consider a single chromosome. This is useful for debugging/testing 
-          purposes.
-  
-  - name: Group/Dedup Options
-    arguments:
-      - name: --no_sort_output
-        type: boolean_true
-        description: |
-          By default, output is sorted. This involves the use of a temporary unsorted
-          file (saved in --temp-dir). Use this option to turn off sorting.
-      - name: --buffer_whole_contig
-        type: boolean_true
-        description: |
-          Forces dedup to parse an entire contig before yielding any reads for
-          deduplication. This is the only way to absolutely guarantee that all reads
-          with the same start position are grouped together for deduplication since
-          dedup uses the start position of the read, not the alignment coordinate on
-          which the reads are sorted. However, by default, dedup reads for another
-          1000bp before outputting read groups which will avoid any reads being missed
-          with short read sequencing (<1000bp).
-
-
-resources:
-  - type: bash_script
-    path: script.sh
-test_resources:
-  - type: bash_script
-    path: test.sh
-  - type: file
-    path: test_data
-engines:
-  - type: docker
-    image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1
-    setup:
-      - type: docker
-        run: |
-            umi_tools -v | sed 's/ version//g' > /var/software_versions.txt
-runners:
-- type: executable
-- type: nextflow
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt
deleted file mode 100644
index d3c8fa44..00000000
--- a/src/umi_tools/umi_tools_dedup/help.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-```
-umi_tools dedup
-```
-
-dedup - Deduplicate reads using UMI and mapping coordinates
-
-Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM]
-
-       note: If --stdout is ommited, standard out is output. To
-             generate a valid BAM file on standard out, please
-             redirect log with --log=LOGFILE or --log2stderr 
-
-For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh
deleted file mode 100644
index 57c01258..00000000
--- a/src/umi_tools/umi_tools_dedup/script.sh
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/bin/bash
-
-## VIASH START
-## VIASH END
-
-set -e
-
-test_dir="${metal_executable}/test_data"
-
-[[ "$par_paired" == "false" ]] && unset par_paired
-[[ "$par_in_sam" == "false" ]] && unset par_in_sam
-[[ "$par_out_sam" == "false" ]] && unset par_out_sam
-[[ "$par_spliced_is_unique" == "false" ]] && unset par_spliced_is_unique
-[[ "$par_per_gene" == "false" ]] && unset par_per_gene
-[[ "$par_per_contig" == "false" ]] && unset par_per_contig
-[[ "$par_per_cell" == "false" ]] && unset par_per_cell
-[[ "$par_no_sort_output" == "false" ]] && unset par_no_sort_output
-[[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig
-[[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi
-[[ "$par_subset" == "false" ]] && unset par_subset
-
-
-$(which umi_tools) dedup \
-    -I "$par_input" \
-    ${par_in_sam:+--in-sam} \
-    ${par_bai:+--bai "$par_bai"} \
-    ${par_get_output_stats:+--get-output-stats} \
-    ${par_random_seed:+--random-seed "$par_random_seed"} \
-    -S "$par_output" \
-    ${par_out_sam:+--out-sam} \
-    ${par_paired:+--paired} \
-    ${par_output_stats:+--output-stats "$par_output_stats"} \
-    ${par_extract_umi_method:+--extract-umi-method "$par_extract_umi_method"} \
-    ${par_umi_tag:+--umi-tag "$par_umi_tag"} \
-    ${par_umi_separator:+--umi-separator "$par_umi_separator"} \
-    ${par_umi_tag_split:+--umi-tag-split "$par_umi_tag_split"} \
-    ${par_umi_tag_delimiter:+--umi-tag-delimiter "$par_umi_tag_delimiter"} \
-    ${par_cell_tag:+--cell-tag "$par_cell_tag"} \
-    ${par_cell_tag_split:+--cell-tag-split "$par_cell_tag_split"} \
-    ${par_cell_tag_delimiter:+--cell-tag-delimiter "$par_cell_tag_delimiter"} \
-    ${par_method:+--method "$par_method"} \
-    ${par_edit_distance_threshold:+--edit-distance-threshold "$par_edit_distance_threshold"} \
-    ${par_spliced_is_unique:+--spliced-is-unique} \
-    ${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \
-    ${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \
-    ${par_read_length:+--read-length "$par_read_length"} \
-    ${par_per_gene:+--per-gene} \
-    ${par_gene_tag:+--gene-tag "$par_gene_tag"} \
-    ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \
-    ${par_skip_tags_regex:+--skip-tags-regex "$par_skip_tags_regex"} \
-    ${par_per_contig:+--per-contig}
-    ${par_gene_transcript_map:+--gene-transcript-map "$par_gene_transcript_map"} \
-    ${par_per_cell:+--per-cell} \
-    ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \
-    ${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \
-    ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \
-    ${par_unapired_reads:+--unapired-reads "$par_unapired_reads"} \
-    ${par_ignore_umi:+--ignore-umi} \
-    ${par_subset:+--subset} \
-    ${par_chrom:+--chrom "$par_chrom"} \
-    ${par_no_sort_output:+--no-sort-output} \
-    ${par_buffer_whole_contig:+--buffer-whole-contig}
-
-
-exit 0
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh
deleted file mode 100644
index 1459ec08..00000000
--- a/src/umi_tools/umi_tools_dedup/test.sh
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/bin/bash
-
-test_dir="${meta_resources_dir}/test_data"
-echo ">>> Testing $meta_functionality_name"
-
-"$meta_executable" \
-  --bam "$test_dir/a.sorted.bam" \
-  --bai "$test_dir/a.sorted.bam.bai" \
-  --output "$test_dir/a.sorted.idxstats"
-
-echo ">>> Checking whether output exists"
-[ ! -f "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' does not exist!" && exit 1
-
-echo ">>> Checking whether output is non-empty"
-[ ! -s "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' is empty!" && exit 1
-
-echo ">>> Checking whether output is correct"
-diff "$test_dir/a.sorted.idxstats" "$test_dir/a_ref.sorted.idxstats" || \
-    (echo "Output file a.sorted.idxstats does not match expected output" && exit 1)
-
-rm "$test_dir/a.sorted.idxstats"
-
-############################################################################################
-
-echo ">>> Testing $meta_functionality_name with singletons in the input"
-
-"$meta_executable" \
-  --bam "$test_dir/test.paired_end.sorted.bam" \
-  --bai "$test_dir/test.paired_end.sorted.bam.bai" \
-  --output "$test_dir/test.paired_end.sorted.idxstats"
-
-echo ">>> Checking whether output exists"
-[ ! -f "$test_dir/test.paired_end.sorted.idxstats" ] && \
-    echo "File 'test.paired_end.sorted.idxstats' does not exist!" && exit 1
-
-echo ">>> Checking whether output is non-empty"
-[ ! -s "$test_dir/test.paired_end.sorted.idxstats" ] && \
-    echo "File 'test.paired_end.sorted.idxstats' is empty!" && exit 1
-
-echo ">>> Checking whether output is correct"
-diff "$test_dir/test.paired_end.sorted.idxstats" "$test_dir/test_ref.paired_end.sorted.idxstats" || \
-    (echo "Output file test.paired_end.sorted.idxstats does not match expected output" && exit 1)
-
-rm "$test_dir/test.paired_end.sorted.idxstats"
-
-############################################################################################
-
-echo "All tests succeeded!"
-exit 0
\ No newline at end of file

From 20b858983dffefc9e0c03fe46b5c3c244848e7fd Mon Sep 17 00:00:00 2001
From: Emma Rousseau <emmarou1@icloud.com>
Date: Thu, 31 Oct 2024 18:54:40 +0000
Subject: [PATCH 3/6] mkref component - config, test data, scripts, changelog

---
 CHANGELOG.md                                  |   5 ++
 .../cellranger_mkref/config.vsh.yaml          |  73 ++++++++++++++++++
 src/cellranger/cellranger_mkref/help.txt      |  71 +++++++++++++++++
 src/cellranger/cellranger_mkref/script.sh     |  38 +++++++++
 src/cellranger/cellranger_mkref/test.sh       |  42 ++++++++++
 .../test_data/reference_small.fa.gz           | Bin 0 -> 584 bytes
 .../test_data/reference_small.gtf.gz          | Bin 0 -> 514 bytes
 .../cellranger_mkref/test_data/script.sh      |  51 ++++++++++++
 8 files changed, 280 insertions(+)
 create mode 100644 src/cellranger/cellranger_mkref/config.vsh.yaml
 create mode 100644 src/cellranger/cellranger_mkref/help.txt
 create mode 100644 src/cellranger/cellranger_mkref/script.sh
 create mode 100644 src/cellranger/cellranger_mkref/test.sh
 create mode 100644 src/cellranger/cellranger_mkref/test_data/reference_small.fa.gz
 create mode 100644 src/cellranger/cellranger_mkref/test_data/reference_small.gtf.gz
 create mode 100755 src/cellranger/cellranger_mkref/test_data/script.sh

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a2aa5387..54138852 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,11 @@
 
 * `rsem/rsem_calculate_expression`: Calculate expression levels (PR #93).
 
+* `cellranger`:
+  - `cellranger/cellranger_count`: Align fastq files using Cell Ranger count (PR #163).
+  - `cellranger/cellranger_mkref`: Build a Cell Ranger-compatible reference folder from user-supplied genome FASTA and gene GTF files (PR #164).
+
+
 ## BREAKING CHANGES
 
 * `falco`: Fix a typo in the `--reverse_complement` argument (PR #157).
diff --git a/src/cellranger/cellranger_mkref/config.vsh.yaml b/src/cellranger/cellranger_mkref/config.vsh.yaml
new file mode 100644
index 00000000..b0ee993f
--- /dev/null
+++ b/src/cellranger/cellranger_mkref/config.vsh.yaml
@@ -0,0 +1,73 @@
+name: cellranger_mkref
+namespace: cellranger
+description: Build a Cell Ranger-compatible reference folder from user-supplied genome FASTA and gene GTF files.
+keywords: [ cellranger, single-cell, rna-seq, alignment, reference, gtf, fasta ]
+links:
+  documentation: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/advanced/references#header
+  repository: https://github.com/10XGenomics/cellranger/blob/main/lib/python/cellranger/reference_builder.py
+  homepage: https://www.10xgenomics.com/support/software/cell-ranger/latest
+  issue_tracker: https://github.com/10XGenomics/cellranger/issues
+references:
+  doi: 10.1038/ncomms14049
+license: Copyright (c) 2023 10x Genomics
+authors:
+  - __merge__: /src/_authors/emma_rousseau.yaml
+    roles: [ author ]
+arguments:
+  # inputs
+  - type: file
+    name: --genome_fasta
+    required: true
+    description: Reference genome fasta.
+    example: genome_sequence.fa.gz
+  - type: file
+    name: --transcriptome_gtf
+    required: true
+    description: Reference transcriptome annotation.
+    example: transcriptome_annotation.gtf.gz
+  - type: string
+    name: "--reference_version"
+    description: "Optional reference version string to include with reference"
+  - type: file
+    name: --output
+    direction: output
+    required: true
+    description: Output tar.gz archive containing the Cell Ranger reference.
+    example: cellranger_reference.tar.gz
+resources:
+  - type: bash_script
+    path: script.sh
+test_resources:
+  - type: bash_script
+    path: test.sh
+  - path: test_data
+
+engines:
+- type: docker
+  image: quay.io/nf-core/cellranger:8.0.0
+  setup:
+    - type: docker
+      run: |
+        DEBIAN_FRONTEND=noninteractive apt update && \
+        apt upgrade -y && apt install -y procps pigz && rm -rf /var/lib/apt/lists/*
+  test_setup:
+    - type: apt
+      packages: [ git, wget ]
+    - type: docker
+      run: |
+        TARGETARCH="${TARGETARCH:-$(dpkg --print-architecture)}" && \
+        TARGETOS="${TARGETOS:-linux}" && \
+        PATH="${PATH}:/usr/local/go/bin" && \
+        wget https://go.dev/dl/go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && tar -C /usr/local/ -xzf go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && \
+        rm go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && \
+        git clone --branch v2.5.0 https://github.com/shenwei356/seqkit.git && \
+        cd seqkit/seqkit/ && go build && cp seqkit /usr/bin/ && cd ../../ && rm -rf seqkit && rm -r /usr/local/go
+    - type: docker
+      run: |
+        cellranger --version | sed 's/ cellranger-/: /' > /var/software_versions.txt
+
+runners:
+- type: executable
+- type: nextflow
+  directives:
+    label: [ highmem, highcpu ]
diff --git a/src/cellranger/cellranger_mkref/help.txt b/src/cellranger/cellranger_mkref/help.txt
new file mode 100644
index 00000000..9cd45cb9
--- /dev/null
+++ b/src/cellranger/cellranger_mkref/help.txt
@@ -0,0 +1,71 @@
+```
+cellranger mkref -h
+```
+Prepare a reference for use with 10x analysis software. Requires a GTF and
+FASTA
+
+Usage: cellranger mkref [OPTIONS] --genome <GENOME_NAMES> --fasta <FASTA_FILES> --genes <GTF_FILES>
+
+Options:
+      --genome <GENOME_NAMES>
+          Unique genome name, used to name output folder [a-zA-Z0-9_-]+.
+          Specify multiple genomes by specifying this argument multiple
+          times; the output folder will be <name1>_and_<name2>
+      --fasta <FASTA_FILES>
+          Path to FASTA file containing your genome reference. Specify
+          multiple genomes by specifying this argument multiple times
+      --genes <GTF_FILES>
+          Path to genes GTF file containing annotated genes for your genome
+          reference. Specify multiple genomes by specifying this argument
+          multiple times
+      --nthreads <NUM_THREADS>
+          Number of threads used during STAR genome index generation.
+          Defaults to 1 [default: 1]
+      --memgb <MEM_GB>
+          Maximum memory (GB) used [default: 16]
+      --ref-version <REF_VERSION>
+          Optional reference version string to include with reference
+      --dry
+          Do not execute the pipeline. Generate a pipeline invocation (.mro)
+          file and stop
+      --jobmode <MODE>
+          Job manager to use. Valid options: local (default), sge, lsf,
+          slurm or path to a .template file. Search for help on "Cluster
+          Mode" at support.10xgenomics.com for more details on configuring
+          the pipeline to use a compute cluster
+      --localcores <NUM>
+          Set max cores the pipeline may request at one time. Only applies
+          to local jobs
+      --localmem <NUM>
+          Set max GB the pipeline may request at one time. Only applies to
+          local jobs
+      --localvmem <NUM>
+          Set max virtual address space in GB for the pipeline. Only applies
+          to local jobs
+      --mempercore <NUM>
+          Reserve enough threads for each job to ensure enough memory will
+          be available, assuming each core on your cluster has at least this
+          much memory available. Only applies to cluster jobmodes
+      --maxjobs <NUM>
+          Set max jobs submitted to cluster at one time. Only applies to
+          cluster jobmodes
+      --jobinterval <NUM>
+          Set delay between submitting jobs to cluster, in ms. Only applies
+          to cluster jobmodes
+      --overrides <PATH>
+          The path to a JSON file that specifies stage-level overrides for
+          cores and memory. Finer-grained than --localcores, --mempercore
+          and --localmem. Consult https://support.10xgenomics.com/ for an
+          example override file
+      --output-dir <PATH>
+          Output the results to this directory
+      --uiport <PORT>
+          Serve web UI at http://localhost:PORT
+      --disable-ui
+          Do not serve the web UI
+      --noexit
+          Keep web UI running after pipestance completes or fails
+      --nopreflight
+          Skip preflight checks
+  -h, --help
+          Print help
diff --git a/src/cellranger/cellranger_mkref/script.sh b/src/cellranger/cellranger_mkref/script.sh
new file mode 100644
index 00000000..9ee4b25e
--- /dev/null
+++ b/src/cellranger/cellranger_mkref/script.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+set -eo pipefail
+
+## VIASH START
+par_genome_fasta="resources_test/test_data/reference_small.fa.gz"
+par_transcriptome_gtf="resources_test/test_data/reference_small.gtf.gz"
+par_output="gencode_v41_annotation_cellranger.tar.gz"
+## VIASH END
+
+# create temporary directory
+tmpdir=$(mktemp -d "$VIASH_TEMP/$meta_name-XXXXXXXX")
+function clean_up {
+    rm -rf "$tmpdir"
+}
+trap clean_up EXIT
+
+# just to make sure
+par_genome_fasta=$(realpath $par_genome_fasta)
+par_transcriptome_gtf=$(realpath $par_transcriptome_gtf)
+par_output=$(realpath $par_output)
+
+
+echo "> Unzipping input files"
+unpigz -c "$par_genome_fasta" > "$tmpdir/genome.fa"
+
+echo "> Building star index"
+cd "$tmpdir"
+cellranger mkref \
+  --fasta "$tmpdir/genome.fa" \
+  --genes "$par_transcriptome_gtf" \
+  --genome output \
+  ${par_reference_version:+--ref-version $par_reference_version} \
+  ${meta_cpus:+--nthreads $meta_cpus} \
+  ${meta_memory_gb:+--memgb $(($meta_memory_gb-2))} # always keep 2 gb for the OS itseld
+
+echo "> Creating archive"
+tar --use-compress-program="pigz -k " -cf "$par_output" -C "$tmpdir/output" .
\ No newline at end of file
diff --git a/src/cellranger/cellranger_mkref/test.sh b/src/cellranger/cellranger_mkref/test.sh
new file mode 100644
index 00000000..663c1c59
--- /dev/null
+++ b/src/cellranger/cellranger_mkref/test.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+set -eou pipefail
+
+## VIASH START
+meta_executable="bin/viash run src/reference/make_reference/config.vsh.yaml --"
+## VIASH END
+
+# create temporary directory
+tmpdir=$(mktemp -d "$meta_temp_dir/$meta_name-XXXXXXXX")
+function clean_up {
+    rm -rf "$tmpdir"
+}
+trap clean_up EXIT
+
+function seqkit_head {
+  input="$1"
+  output="$2"
+  if [[ ! -f "$output" ]]; then
+    echo "> Processing $(basename $input)"
+    seqkit subseq -r 1:50000 "$input" | gzip > "$output"
+  fi
+}
+
+seqkit_head "$meta_resources_dir/test_data/reference_small.fa.gz" "$tmpdir/reference_small.fa.gz"
+zcat "$meta_resources_dir/test_data/reference_small.gtf.gz" | awk '$4 < 50001 {print ;}' | gzip > "$tmpdir/reference_small.gtf.gz"
+
+echo "> Running $meta_name, writing to $tmpdir."
+$meta_executable \
+  --genome_fasta "$tmpdir/reference_small.fa.gz" \
+  --transcriptome_gtf "$tmpdir/reference_small.gtf.gz" \
+  --output "$tmpdir/myreference.tar.gz" \
+  ---cpus "${meta_cpus:-1}" \
+  ---memory "${meta_memory_gb:-5}GB"
+
+exit_code=$?
+[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1
+
+echo ">> Checking whether output can be found"
+[[ ! -f "$tmpdir/myreference.tar.gz" ]] && echo "Output tar file could not be found!" && exit 1
+
+echo "> Test succeeded!"
\ No newline at end of file
diff --git a/src/cellranger/cellranger_mkref/test_data/reference_small.fa.gz b/src/cellranger/cellranger_mkref/test_data/reference_small.fa.gz
new file mode 100644
index 0000000000000000000000000000000000000000..e24b75a949df25d147877afbaf9a46e91775bb40
GIT binary patch
literal 584
zcmV-O0=NAiiwFpZ%_C<319D|%WpZV1V`X1+ZDDL|E@oi>ZIrQbLop0PdrgrQyg=@F
zD7XMONRvAI7a<<x#JP*Vzb#S{MSu|BKR&-6Z;vy*pQ_frtsVB*rS*(@*4i0+rkGoE
zXW2EUJ>!kLPz=|xt5`kE%CR<t>{wTgrZiJ$N7^a-_GgL^RmAW?d!9O5u+~>uH*<C@
z;trGk>~;>#&?zvyud`pbKj+S{Z+PwkZBG97rp5}H2Pko}`z~fM2%0ut<2FcQ??Bg^
zq#I*{`?TXzOrXIMd*Jj3BdM4zdvMBBs@M#6@AJmHvmG#unm~})vnDjc<mk9AMfHkQ
zZo-bq-$1{?fT0YdY;EM@ap{^6K^hHW!#7;}!-n@nB75iwE=5_yaSnn4DsQ22>L5FG
z0#3P9ke6yUMM86A;DnF^;taIlZAzt-aXRe4bAXldH&>-h_YDr36A?F6K#0H~6;<@#
zopOh6P)2v>skqP#L?7_W-hU6}cBwS=Vn3gLasQckTK+kWovUT0$=DQJ@`}7eyDvTb
z-nnzpSkDbd_Xea?n%q?rMZK*eIJWyt&WeU}QR`mLcJ{h0AaqZpIS#di4XVElAP{wv
znVb!m1qEp_LdF}*buR7!ea6m8D}she1ENRObtoNey+;*O6``dWVs!$iu4W(dMd}8t
zlRzP&xW+zPlOBS~xLT7QTF}9EYr9{r3{-R~lR_1Gn>3<JSVdyAA#O^Dcw%B+Gz_vt
W!RAY@KT7a{XU;d$_v!in1polOLl)8i

literal 0
HcmV?d00001

diff --git a/src/cellranger/cellranger_mkref/test_data/reference_small.gtf.gz b/src/cellranger/cellranger_mkref/test_data/reference_small.gtf.gz
new file mode 100644
index 0000000000000000000000000000000000000000..1e3ce7ce625c2102865bdee4a4d7bf0980eb2749
GIT binary patch
literal 514
zcmV+d0{#6TiwFn>&m(6519D|%WpZV1V`X1+ZDDL|E@yOR0OgilZ`v>vhOfO}5#@F&
z^7qGfMJoi829rA4+sGJ?MoJu%q-gv1vr{rcfOS<jCbmequ>Jbvob&h`$2^n>OosQv
z*$~*Tg#(N;z_|ni@YcU8>ui!Pqj)s?Jci-Hlu*eAEPfx=rFNAm({<fS=8c33LYN#N
zFKPDI-_|yYGiPpR!*<T;+|Nxv+#&+;6`qXu?e(^FRt`2^VTipXIG?!3d|}H7Z--m2
zM!_khFlGWZUS)P;v!LqOtsd9wqO6mrwK&B0uyUrcF&WQp-Z4yN_)x7^QS6F3S=h=4
z?h+qQk=w*>(xr9AwvuI0*S1Vd;l8A0ZkO%+LRYD22P0S6e32zayTYYLXW_GYqn!?B
zeW0J_^U2-xDfRwQObzf(v-7`I&bLA=DGaf`F@3;F2!NovY{KLiVTlD7>c1wulLPFF
z7$F2-#hEKC)fqXTja|#=ccDN*d01=#!fU(f-(b+KHFUJrDB|jjwRR?6O>?}&qIvfE
zlL|sk{|R6rrs?&^h7KTH00!x4V3HDo`!w?OG>jh^BK;f?h8695(P_Ca@a5Fu;ZQ)u
zU+C_^$!Ybc56)h%B|kd~jiIA?__f<^KU+<eU0$A^>xYCZCTREX^3Ue<JG^~7xwi`d
E0J_5jMgRZ+

literal 0
HcmV?d00001

diff --git a/src/cellranger/cellranger_mkref/test_data/script.sh b/src/cellranger/cellranger_mkref/test_data/script.sh
new file mode 100755
index 00000000..3b09498b
--- /dev/null
+++ b/src/cellranger/cellranger_mkref/test_data/script.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# Regenerates the small FASTA/GTF fixtures stored in OUT_DIR (run from the repository root).
+set -eo pipefail
+
+TMP_DIR=tmp/cellranger_make_reference
+OUT_DIR=src/cellranger/cellranger_mkref/test_data
+
+# check if seqkit is installed
+if ! command -v seqkit &> /dev/null; then
+  echo "seqkit could not be found"
+  exit 1
+fi
+
+# create temporary directory and clean up on exit
+mkdir -p "$TMP_DIR"
+function clean_up {
+    rm -rf "$TMP_DIR"
+}
+trap clean_up EXIT
+
+# fetch reference
+ORIG_FA="$TMP_DIR/reference.fa.gz"
+if [ ! -f "$ORIG_FA" ]; then
+  wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz \
+    -O "$ORIG_FA"
+fi
+
+ORIG_GTF="$TMP_DIR/reference.gtf.gz"
+if [ ! -f "$ORIG_GTF" ]; then
+  wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz \
+    -O "$ORIG_GTF"
+fi
+
+# create small reference
+START=30000
+END=31500
+CHR=chr1
+
+# subset to small region
+seqkit grep -r -p "^$CHR\$" "$ORIG_FA" | \
+  seqkit subseq -r "$START:$END" > "$OUT_DIR/reference_small.fa"
+
+gunzip -c "$ORIG_GTF" | awk -v FS='\t' -v OFS='\t' "
+    \$1 == \"$CHR\" && \$4 >= $START && \$5 <= $END {
+      \$4 = \$4 - $START + 1;
+      \$5 = \$5 - $START + 1;
+      print;
+    }" > "$OUT_DIR/reference_small.gtf"
+
+gzip -f "$OUT_DIR/reference_small.fa"  # -f: replace previously generated .gz fixtures
+gzip -f "$OUT_DIR/reference_small.gtf"

From 88ed15285e5757cc3833a7a57b928a9af1b81af5 Mon Sep 17 00:00:00 2001
From: DriesSchaumont <5946712+DriesSchaumont@users.noreply.github.com>
Date: Fri, 8 Nov 2024 09:26:23 +0000
Subject: [PATCH 4/6] Small updates

---
 CHANGELOG.md                                  |  2 --
 .../cellranger_mkref/config.vsh.yaml          | 29 ++++++++-----------
 src/cellranger/cellranger_mkref/script.sh     | 10 +++----
 src/cellranger/cellranger_mkref/test.sh       |  2 +-
 4 files changed, 18 insertions(+), 25 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6206a5a1..3905de17 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,7 +17,6 @@
 * `rsem/rsem_calculate_expression`: Calculate expression levels (PR #93).
 
 * `cellranger`:
-  - `cellranger/cellranger_count`: Align fastq files using Cell Ranger count (PR #163).
   - `cellranger/cellranger_mkref`: Build a Cell Ranger-compatible reference folder from user-supplied genome FASTA and gene GTF files (PR #164).
 
 
@@ -30,7 +29,6 @@
 
 * `nanoplot`: Plotting tool for long read sequencing data and alignments (PR #95).
 
-
 ## BUG FIXES
 
 * `falco`: Fix a typo in the `--reverse_complement` argument (PR #157).
diff --git a/src/cellranger/cellranger_mkref/config.vsh.yaml b/src/cellranger/cellranger_mkref/config.vsh.yaml
index b0ee993f..b00c8f90 100644
--- a/src/cellranger/cellranger_mkref/config.vsh.yaml
+++ b/src/cellranger/cellranger_mkref/config.vsh.yaml
@@ -3,13 +3,15 @@ namespace: cellranger
 description: Build a Cell Ranger-compatible reference folder from user-supplied genome FASTA and gene GTF files.
 keywords: [ cellranger, single-cell, rna-seq, alignment, reference, gtf, fasta ]
 links:
-  documentation: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/advanced/references#header
+  documentation: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/advanced/references
   repository: https://github.com/10XGenomics/cellranger/blob/main/lib/python/cellranger/reference_builder.py
   homepage: https://www.10xgenomics.com/support/software/cell-ranger/latest
   issue_tracker: https://github.com/10XGenomics/cellranger/issues
 references:
   doi: 10.1038/ncomms14049
-license: Copyright (c) 2023 10x Genomics
+license: Proprietary
+requirements:
+  commands: [cellranger, pigz, unpigz, tar]
 authors:
   - __merge__: /src/_authors/emma_rousseau.yaml
     roles: [ author ]
@@ -27,6 +29,7 @@ arguments:
     example: transcriptome_annotation.gtf.gz
   - type: string
     name: "--reference_version"
+    required: false
     description: "Optional reference version string to include with reference"
   - type: file
     name: --output
@@ -44,24 +47,16 @@ test_resources:
 
 engines:
 - type: docker
-  image: quay.io/nf-core/cellranger:8.0.0
+  image: ghcr.io/data-intuitive/cellranger:8.0
   setup:
-    - type: docker
-      run: |
-        DEBIAN_FRONTEND=noninteractive apt update && \
-        apt upgrade -y && apt install -y procps pigz && rm -rf /var/lib/apt/lists/*
+    - type: apt
+      packages:
+        - procps
+        - pigz
   test_setup:
     - type: apt
-      packages: [ git, wget ]
-    - type: docker
-      run: |
-        TARGETARCH="${TARGETARCH:-$(dpkg --print-architecture)}" && \
-        TARGETOS="${TARGETOS:-linux}" && \
-        PATH="${PATH}:/usr/local/go/bin" && \
-        wget https://go.dev/dl/go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && tar -C /usr/local/ -xzf go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && \
-        rm go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && \
-        git clone --branch v2.5.0 https://github.com/shenwei356/seqkit.git && \
-        cd seqkit/seqkit/ && go build && cp seqkit /usr/bin/ && cd ../../ && rm -rf seqkit && rm -r /usr/local/go
+      packages:
+        - seqkit
     - type: docker
       run: |
         cellranger --version | sed 's/ cellranger-/: /' > /var/software_versions.txt
diff --git a/src/cellranger/cellranger_mkref/script.sh b/src/cellranger/cellranger_mkref/script.sh
index 9ee4b25e..7179faae 100644
--- a/src/cellranger/cellranger_mkref/script.sh
+++ b/src/cellranger/cellranger_mkref/script.sh
@@ -3,19 +3,19 @@
 set -eo pipefail
 
 ## VIASH START
-par_genome_fasta="resources_test/test_data/reference_small.fa.gz"
-par_transcriptome_gtf="resources_test/test_data/reference_small.gtf.gz"
-par_output="gencode_v41_annotation_cellranger.tar.gz"
+par_genome_fasta="test_data/reference_small.fa.gz"
+par_transcriptome_gtf="test_data/reference_small.gtf.gz"
+par_output="output.tar.gz"
 ## VIASH END
 
 # create temporary directory
-tmpdir=$(mktemp -d "$VIASH_TEMP/$meta_name-XXXXXXXX")
+tmpdir=$(mktemp -d "$meta_temp_dir/$meta_name-XXXXXXXX")
 function clean_up {
     rm -rf "$tmpdir"
 }
 trap clean_up EXIT
 
-# just to make sure
+# We change into the tempdir later, so we need absolute paths.
 par_genome_fasta=$(realpath $par_genome_fasta)
 par_transcriptome_gtf=$(realpath $par_transcriptome_gtf)
 par_output=$(realpath $par_output)
diff --git a/src/cellranger/cellranger_mkref/test.sh b/src/cellranger/cellranger_mkref/test.sh
index 663c1c59..5c5c1f3d 100644
--- a/src/cellranger/cellranger_mkref/test.sh
+++ b/src/cellranger/cellranger_mkref/test.sh
@@ -3,7 +3,7 @@
 set -eou pipefail
 
 ## VIASH START
-meta_executable="bin/viash run src/reference/make_reference/config.vsh.yaml --"
+meta_executable="viash run src/cellranger/cellranger_mkref/config.vsh.yaml --"
 ## VIASH END
 
 # create temporary directory

From 48ce1beda9fcb65d6565346d18f952a384a4fb16 Mon Sep 17 00:00:00 2001
From: DriesSchaumont <5946712+DriesSchaumont@users.noreply.github.com>
Date: Fri, 8 Nov 2024 09:29:26 +0000
Subject: [PATCH 5/6] Update CHANGELOG

---
 CHANGELOG.md | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 76f73c03..d751ca81 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,9 +20,6 @@
 * `cellranger`:
   - `cellranger/cellranger_mkref`: Build a Cell Ranger-compatible reference folder from user-supplied genome FASTA and gene GTF files (PR #164).
 
-
-## BREAKING CHANGES
-
 * `rseqc`:
   - `rseqc/rseqc_inner_distance`: Calculate inner distance between read pairs (PR #159).
   - `rseqc/rseqc_inferexperiment`: Infer strandedness from sequencing reads (PR #158).

From 919df7a87e23962dc3248446c31a040593224378 Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt <rcannood@gmail.com>
Date: Fri, 8 Nov 2024 10:35:11 +0100
Subject: [PATCH 6/6] Update src/cellranger/cellranger_mkref/script.sh

---
 src/cellranger/cellranger_mkref/script.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/cellranger/cellranger_mkref/script.sh b/src/cellranger/cellranger_mkref/script.sh
index 7179faae..03755998 100644
--- a/src/cellranger/cellranger_mkref/script.sh
+++ b/src/cellranger/cellranger_mkref/script.sh
@@ -32,7 +32,7 @@ cellranger mkref \
   --genome output \
   ${par_reference_version:+--ref-version $par_reference_version} \
   ${meta_cpus:+--nthreads $meta_cpus} \
-  ${meta_memory_gb:+--memgb $(($meta_memory_gb-2))} # always keep 2 gb for the OS itseld
+  ${meta_memory_gb:+--memgb $(( meta_memory_gb > 2 ? meta_memory_gb - 2 : 1 ))} # keep 2 GB for the OS itself, but never request less than 1 GB
 
 echo "> Creating archive"
-tar --use-compress-program="pigz -k " -cf "$par_output" -C "$tmpdir/output" .
\ No newline at end of file
+tar --use-compress-program="pigz -k " -cf "$par_output" -C "$tmpdir/output" .