From 38f586bec0ac9e4312b016e29c3aa0bd53f292b2 Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Thu, 11 Apr 2024 11:04:14 +0100
Subject: [PATCH 01/11] initial commit dedup

---
 CHANGELOG.md                                  |   3 +
 src/umi_tools/umi_tools_dedup/config.vsh.yaml | 279 ++++++++++++++++++
 src/umi_tools/umi_tools_dedup/help.txt        |  13 +
 src/umi_tools/umi_tools_dedup/script.sh       |  65 ++++
 src/umi_tools/umi_tools_dedup/test.sh         |  49 +++
 5 files changed, 409 insertions(+)
 create mode 100644 src/umi_tools/umi_tools_dedup/config.vsh.yaml
 create mode 100644 src/umi_tools/umi_tools_dedup/help.txt
 create mode 100644 src/umi_tools/umi_tools_dedup/script.sh
 create mode 100644 src/umi_tools/umi_tools_dedup/test.sh

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4fd7f001..1bef9345 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -39,6 +39,9 @@
     - `samtools/flagstat`: Counts the number of alignments in SAM/BAM/CRAM files for each FLAG type (PR #31).
     - `samtools/idxstats`: Reports alignment summary statistics for a SAM/BAM/CRAM file (PR #32).
 
+* `umi_tools`:
+    - `umi_tools/umi_tools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #).
+    
 ## MAJOR CHANGES
 
 ## MINOR CHANGES
diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
new file mode 100644
index 00000000..75306541
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
@@ -0,0 +1,279 @@
+name: umi_tool_dedup
+namespace: umi_tools
+description: |
+  Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read.
+keywords: [umi_tools, deduplication, dedup]
+links:
+  homepage: https://umi-tools.readthedocs.io/en/latest/
+  documentation: [ https://umi-tools.readthedocs.io/en/latest/reference/dedup.html,
+                   https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options ]
+  repository: https://github.com/CGATOxford/UMI-tools
+references: 
+  doi: 10.1101/gr.209601.116
+license: MIT
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: --input
+        alternatives: -I
+        type: file
+        description: Input BAM or SAM file. Use --in_sam to specify SAM format.
+        required: true
+      - name: --in_sam
+        type: boolean_true
+        description: |
+          By default, inputs are assumed to be in BAM format. Use this options 
+          to specify the use of SAM format for input.
+      - name: --bai
+        type: file
+        description: BAM index
+      - name: --get_output_stats
+        type: boolean
+        description: Whether or not to generate output stats. 
+      - name: --random_seed
+        type: integer
+        description: |
+          Random seed to initialize number generator with.
+        default: none
+
+  - name: Outputs
+    arguments:
+      - name: --output
+        alternatives: -S
+        type: file
+        description: Deduplicated BAM file
+        required: true
+        direction: output
+      - name: --out_sam
+        type: boolean_true
+        description: |
+          By default, outputa are written in BAM format. Use this options to 
+          specify the use of SAM format for output.
+      - name: --paired
+        type: boolean_true
+        description: |
+          BAM is paired end - output both read pairs. This will also force the
+          use of the template length to determine reads with the same mapping
+          coordinates.
+      - name: --output_stats
+        type: file
+        description: Directory containing UMI based deduplication statistics files
+        direction: output
+      - name: --extract_umi_method
+        type: string
+        description: |
+          Specify the method by which the barcodes were encoded in the read.
+          The options are: [read_id, tag, umis].
+        default: read_id
+      - name: --umi_tag
+        type: string
+        description: |
+          The tag containing the UMI sequence. 
+          This is only required if the extract_umi_method is set to tag.
+      - name: --umi_separator
+        type: string
+        description: |
+          The separator used to separate the UMI from the read sequence. 
+          This is only required if the extract_umi_method is set to id_read.
+        default: '_'
+      - name: --umi_tag_split
+        type: string
+        description: |
+          Separate the UMI in tag by <SPLIT> and take the first element.
+      - name: --umi_tag_delimiter
+        type: string
+        description: |
+          Separate the UMI in by <DELIMITER> and concatenate the elements
+      - name: --cell_tag
+        type: string
+        description: |
+          The tag containing the cell barcode sequence. 
+          This is only required if the extract_umi_method is set to tag.
+      - name: --cell_tag_split
+        type: string
+        description: |
+          Separate the cell barcode in tag by <SPLIT> and take the first element.
+      - name: --cell_tag_delimiter
+        type: string
+        description: |
+          Separate the cell barcode in by <DELIMITER> and concatenate the elements
+  
+  - name: Grouping Options
+    arguments:    
+      - name: --method
+        type: string
+        description: |
+          The method to use for grouping reads. The options are: 
+          [unique, percentile, cluster, adjacency, directional].
+        default: directional
+      - name: --edit_distance_threshold
+        type: integer
+        description: |
+          For the adjacency and cluster methods the threshold for the edit 
+          distance to connect two UMIs in the network can be increased. The 
+          default value of 1 works best unless the UMI is very long (>14bp).
+        default: 1
+      - name: --spliced_is_unique
+        type: boolean_true
+        description: |
+          Causes two reads that start in the same position on the same strand
+          and having the same UMI to be considered unique if one is spliced 
+          and the other is not. (Uses the ‘N’ cigar operation to test for splicing).
+      - name: --soft_clip_threshold
+        type: integer
+        description: |
+          Mappers that soft clip will sometimes do so rather than mapping a
+          spliced read if there is only a small overhang over the exon junction.
+          By setting this option, you can treat reads with at least this many
+          bases soft-clipped at the 3’ end as spliced.
+        default: 4
+      - name: --multimapping_detection_method
+        type: string
+        description: |
+          If the sam/bam contains tags to identify multimapping reads, you can
+          specify for use when selecting the best read at a given loci. Supported
+          tags are “NH”, “X0” and “XT”. If not specified, the read with the highest
+          mapping quality will be selected.
+      - name: --read_length
+        type: integer
+        description: |
+          Use the read length as a criteria when deduping, for e.g sRNA-Seq.
+  
+  - name: Single-cell RNA-Seq Options
+    arguments:
+      - name: --per_gene
+        type: boolean_true
+        description: |
+          Reads will be grouped together if they have the same gene. This is useful
+          if your library prep generates PCR duplicates with non identical alignment
+          positions such as CEL-Seq. Note this option is hardcoded to be on with the
+          count command. I.e counting is always performed per-gene. Must be combined
+          with either --gene_tag or --per_contig option.
+      - name: --gene_tag
+        type: string
+        description: |
+          Deduplicate per gene. The gene information is encoded in the bam read tag
+          specified.
+      - name: --assigned_status_tag
+        type: string
+        description: |
+          BAM tag which describes whether a read is assigned to a gene. Defaults to
+          the same value as given for --gene_tag.
+      - name: --skip_tags_regex
+        type: string
+        description: |
+          Use in conjunction with the --assigned_status_tag option to skip any reads
+          where the tag matches this regex. Default ("^[__|Unassigned]") matches
+          anything which starts with “__” or “Unassigned”.
+      - name: --per_contig
+        type: boolean_true
+        description: |
+          Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same
+          contig will be considered to have the same alignment position. This is
+          useful if you have aligned to a reference transcriptome with one
+          transcript per gene. If you have aligned to a transcriptome with more
+          than one transcript per gene, you can supply a map between transcripts
+          and gene using the --gene_transcript_map option.
+      - name: --gene_transcript_map
+        type: file
+        description: |
+          A file containing a mapping between gene names and transcript names.
+          The file should be tab separated with the gene name in the first column
+          and the transcript name in the second column.
+      - name: --per_cell
+        type: boolean_true
+        description: |
+          Reads will only be grouped together if they have the same cell barcode.
+          Can be combined with --per_gene.
+  
+  - name: SAM/BAM Options
+    arguments:
+      - name: --mapping_quality
+        type: integer
+        description: |
+          Minimium mapping quality (MAPQ) for a read to be retained.
+        default: 0
+      - name: --unmapped_reads
+        type: string
+        description: |
+          How unmapped reads should be handled. 
+          The options are:
+          "discard": Discard all unmapped reads.
+          "use":     If read2 is unmapped, deduplicate using read1 only. 
+                     Requires --paired.
+          "output":  Output unmapped reads/read pairs without UMI 
+                     grouping/deduplication. Only available in umi_tools group.
+        default: discard
+      - name: --chimeric_pairs
+        type: string
+        description: |
+          How chimeric pairs should be handled. 
+          The options are:
+          "discard": Discard all chimeric read pairs.
+          "use":     Deduplicate using read1 only.
+          "output":  Output chimeric pairs without UMI grouping/deduplication. 
+                     Only available in umi_tools group.
+        default: use
+      - name: --unapired_reads
+        type: string
+        description: |
+          How unpaired reads should be handled. 
+          The options are:
+          "discard": Discard all unpaired reads.
+          "use":     Deduplicate using read1 only.
+          "output":  Output unpaired reads without UMI grouping/deduplication. 
+                     Only available in umi_tools group.
+        default: use
+      - name: --ignore_umi
+        type: boolean_true
+        description: |
+          Ignore the UMI and group reads using mapping coordinates only.
+      - name: --subset
+        type: boolean_true
+        description: |
+          Only consider a fraction of the reads, chosen at random. This is useful
+          for doing saturation analyses.
+      - name: --chrom
+        type: string
+        description: |
+          Only consider a single chromosome. This is useful for debugging/testing 
+          purposes.
+  
+  - name: Group/Dedup Options
+    arguments:
+      - name: --no_sort_output
+        type: boolean_true
+        description: |
+          By default, output is sorted. This involves the use of a temporary unsorted
+          file (saved in --temp-dir). Use this option to turn off sorting.
+      - name: --buffer_whole_contig
+        type: boolean_true
+        description: |
+          Forces dedup to parse an entire contig before yielding any reads for
+          deduplication. This is the only way to absolutely guarantee that all reads
+          with the same start position are grouped together for deduplication since
+          dedup uses the start position of the read, not the alignment coordinate on
+          which the reads are sorted. However, by default, dedup reads for another
+          1000bp before outputting read groups which will avoid any reads being missed
+          with short read sequencing (<1000bp).
+
+
+resources:
+  - type: bash_script
+    path: script.sh
+test_resources:
+  - type: bash_script
+    path: test.sh
+  - type: file
+    path: test_data
+engines:
+  - type: docker
+    image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1
+    setup:
+      - type: docker
+        run: |
+            umi_tools -v | sed 's/ version//g' > /var/software_versions.txt
+runners:
+- type: executable
+- type: nextflow
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt
new file mode 100644
index 00000000..d3c8fa44
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/help.txt
@@ -0,0 +1,13 @@
+```
+umi_tools dedup
+```
+
+dedup - Deduplicate reads using UMI and mapping coordinates
+
+Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM]
+
+       note: If --stdout is ommited, standard out is output. To
+             generate a valid BAM file on standard out, please
+             redirect log with --log=LOGFILE or --log2stderr 
+
+For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh
new file mode 100644
index 00000000..57c01258
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/script.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+
+set -e
+
+test_dir="${metal_executable}/test_data"
+
+[[ "$par_paired" == "false" ]] && unset par_paired
+[[ "$par_in_sam" == "false" ]] && unset par_in_sam
+[[ "$par_out_sam" == "false" ]] && unset par_out_sam
+[[ "$par_spliced_is_unique" == "false" ]] && unset par_spliced_is_unique
+[[ "$par_per_gene" == "false" ]] && unset par_per_gene
+[[ "$par_per_contig" == "false" ]] && unset par_per_contig
+[[ "$par_per_cell" == "false" ]] && unset par_per_cell
+[[ "$par_no_sort_output" == "false" ]] && unset par_no_sort_output
+[[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig
+[[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi
+[[ "$par_subset" == "false" ]] && unset par_subset
+
+
+$(which umi_tools) dedup \
+    -I "$par_input" \
+    ${par_in_sam:+--in-sam} \
+    ${par_bai:+--bai "$par_bai"} \
+    ${par_get_output_stats:+--get-output-stats} \
+    ${par_random_seed:+--random-seed "$par_random_seed"} \
+    -S "$par_output" \
+    ${par_out_sam:+--out-sam} \
+    ${par_paired:+--paired} \
+    ${par_output_stats:+--output-stats "$par_output_stats"} \
+    ${par_extract_umi_method:+--extract-umi-method "$par_extract_umi_method"} \
+    ${par_umi_tag:+--umi-tag "$par_umi_tag"} \
+    ${par_umi_separator:+--umi-separator "$par_umi_separator"} \
+    ${par_umi_tag_split:+--umi-tag-split "$par_umi_tag_split"} \
+    ${par_umi_tag_delimiter:+--umi-tag-delimiter "$par_umi_tag_delimiter"} \
+    ${par_cell_tag:+--cell-tag "$par_cell_tag"} \
+    ${par_cell_tag_split:+--cell-tag-split "$par_cell_tag_split"} \
+    ${par_cell_tag_delimiter:+--cell-tag-delimiter "$par_cell_tag_delimiter"} \
+    ${par_method:+--method "$par_method"} \
+    ${par_edit_distance_threshold:+--edit-distance-threshold "$par_edit_distance_threshold"} \
+    ${par_spliced_is_unique:+--spliced-is-unique} \
+    ${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \
+    ${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \
+    ${par_read_length:+--read-length "$par_read_length"} \
+    ${par_per_gene:+--per-gene} \
+    ${par_gene_tag:+--gene-tag "$par_gene_tag"} \
+    ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \
+    ${par_skip_tags_regex:+--skip-tags-regex "$par_skip_tags_regex"} \
+    ${par_per_contig:+--per-contig}
+    ${par_gene_transcript_map:+--gene-transcript-map "$par_gene_transcript_map"} \
+    ${par_per_cell:+--per-cell} \
+    ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \
+    ${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \
+    ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \
+    ${par_unapired_reads:+--unapired-reads "$par_unapired_reads"} \
+    ${par_ignore_umi:+--ignore-umi} \
+    ${par_subset:+--subset} \
+    ${par_chrom:+--chrom "$par_chrom"} \
+    ${par_no_sort_output:+--no-sort-output} \
+    ${par_buffer_whole_contig:+--buffer-whole-contig}
+
+
+exit 0
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh
new file mode 100644
index 00000000..1459ec08
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/test.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+test_dir="${meta_resources_dir}/test_data"
+echo ">>> Testing $meta_functionality_name"
+
+"$meta_executable" \
+  --bam "$test_dir/a.sorted.bam" \
+  --bai "$test_dir/a.sorted.bam.bai" \
+  --output "$test_dir/a.sorted.idxstats"
+
+echo ">>> Checking whether output exists"
+[ ! -f "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' does not exist!" && exit 1
+
+echo ">>> Checking whether output is non-empty"
+[ ! -s "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' is empty!" && exit 1
+
+echo ">>> Checking whether output is correct"
+diff "$test_dir/a.sorted.idxstats" "$test_dir/a_ref.sorted.idxstats" || \
+    (echo "Output file a.sorted.idxstats does not match expected output" && exit 1)
+
+rm "$test_dir/a.sorted.idxstats"
+
+############################################################################################
+
+echo ">>> Testing $meta_functionality_name with singletons in the input"
+
+"$meta_executable" \
+  --bam "$test_dir/test.paired_end.sorted.bam" \
+  --bai "$test_dir/test.paired_end.sorted.bam.bai" \
+  --output "$test_dir/test.paired_end.sorted.idxstats"
+
+echo ">>> Checking whether output exists"
+[ ! -f "$test_dir/test.paired_end.sorted.idxstats" ] && \
+    echo "File 'test.paired_end.sorted.idxstats' does not exist!" && exit 1
+
+echo ">>> Checking whether output is non-empty"
+[ ! -s "$test_dir/test.paired_end.sorted.idxstats" ] && \
+    echo "File 'test.paired_end.sorted.idxstats' is empty!" && exit 1
+
+echo ">>> Checking whether output is correct"
+diff "$test_dir/test.paired_end.sorted.idxstats" "$test_dir/test_ref.paired_end.sorted.idxstats" || \
+    (echo "Output file test.paired_end.sorted.idxstats does not match expected output" && exit 1)
+
+rm "$test_dir/test.paired_end.sorted.idxstats"
+
+############################################################################################
+
+echo "All tests succeeded!"
+exit 0
\ No newline at end of file

From 2c269682620a407803e528652198646435ef2c03 Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Thu, 11 Apr 2024 11:38:57 +0100
Subject: [PATCH 02/11] Revert "initial commit dedup"

This reverts commit 38f586bec0ac9e4312b016e29c3aa0bd53f292b2.
---
 CHANGELOG.md                                  |   3 -
 src/umi_tools/umi_tools_dedup/config.vsh.yaml | 279 ------------------
 src/umi_tools/umi_tools_dedup/help.txt        |  13 -
 src/umi_tools/umi_tools_dedup/script.sh       |  65 ----
 src/umi_tools/umi_tools_dedup/test.sh         |  49 ---
 5 files changed, 409 deletions(-)
 delete mode 100644 src/umi_tools/umi_tools_dedup/config.vsh.yaml
 delete mode 100644 src/umi_tools/umi_tools_dedup/help.txt
 delete mode 100644 src/umi_tools/umi_tools_dedup/script.sh
 delete mode 100644 src/umi_tools/umi_tools_dedup/test.sh

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1bef9345..4fd7f001 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -39,9 +39,6 @@
     - `samtools/flagstat`: Counts the number of alignments in SAM/BAM/CRAM files for each FLAG type (PR #31).
     - `samtools/idxstats`: Reports alignment summary statistics for a SAM/BAM/CRAM file (PR #32).
 
-* `umi_tools`:
-    - `umi_tools/umi_tools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #).
-    
 ## MAJOR CHANGES
 
 ## MINOR CHANGES
diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
deleted file mode 100644
index 75306541..00000000
--- a/src/umi_tools/umi_tools_dedup/config.vsh.yaml
+++ /dev/null
@@ -1,279 +0,0 @@
-name: umi_tool_dedup
-namespace: umi_tools
-description: |
-  Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read.
-keywords: [umi_tools, deduplication, dedup]
-links:
-  homepage: https://umi-tools.readthedocs.io/en/latest/
-  documentation: [ https://umi-tools.readthedocs.io/en/latest/reference/dedup.html,
-                   https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options ]
-  repository: https://github.com/CGATOxford/UMI-tools
-references: 
-  doi: 10.1101/gr.209601.116
-license: MIT
-
-argument_groups:
-  - name: Inputs
-    arguments:
-      - name: --input
-        alternatives: -I
-        type: file
-        description: Input BAM or SAM file. Use --in_sam to specify SAM format.
-        required: true
-      - name: --in_sam
-        type: boolean_true
-        description: |
-          By default, inputs are assumed to be in BAM format. Use this options 
-          to specify the use of SAM format for input.
-      - name: --bai
-        type: file
-        description: BAM index
-      - name: --get_output_stats
-        type: boolean
-        description: Whether or not to generate output stats. 
-      - name: --random_seed
-        type: integer
-        description: |
-          Random seed to initialize number generator with.
-        default: none
-
-  - name: Outputs
-    arguments:
-      - name: --output
-        alternatives: -S
-        type: file
-        description: Deduplicated BAM file
-        required: true
-        direction: output
-      - name: --out_sam
-        type: boolean_true
-        description: |
-          By default, outputa are written in BAM format. Use this options to 
-          specify the use of SAM format for output.
-      - name: --paired
-        type: boolean_true
-        description: |
-          BAM is paired end - output both read pairs. This will also force the
-          use of the template length to determine reads with the same mapping
-          coordinates.
-      - name: --output_stats
-        type: file
-        description: Directory containing UMI based deduplication statistics files
-        direction: output
-      - name: --extract_umi_method
-        type: string
-        description: |
-          Specify the method by which the barcodes were encoded in the read.
-          The options are: [read_id, tag, umis].
-        default: read_id
-      - name: --umi_tag
-        type: string
-        description: |
-          The tag containing the UMI sequence. 
-          This is only required if the extract_umi_method is set to tag.
-      - name: --umi_separator
-        type: string
-        description: |
-          The separator used to separate the UMI from the read sequence. 
-          This is only required if the extract_umi_method is set to id_read.
-        default: '_'
-      - name: --umi_tag_split
-        type: string
-        description: |
-          Separate the UMI in tag by <SPLIT> and take the first element.
-      - name: --umi_tag_delimiter
-        type: string
-        description: |
-          Separate the UMI in by <DELIMITER> and concatenate the elements
-      - name: --cell_tag
-        type: string
-        description: |
-          The tag containing the cell barcode sequence. 
-          This is only required if the extract_umi_method is set to tag.
-      - name: --cell_tag_split
-        type: string
-        description: |
-          Separate the cell barcode in tag by <SPLIT> and take the first element.
-      - name: --cell_tag_delimiter
-        type: string
-        description: |
-          Separate the cell barcode in by <DELIMITER> and concatenate the elements
-  
-  - name: Grouping Options
-    arguments:    
-      - name: --method
-        type: string
-        description: |
-          The method to use for grouping reads. The options are: 
-          [unique, percentile, cluster, adjacency, directional].
-        default: directional
-      - name: --edit_distance_threshold
-        type: integer
-        description: |
-          For the adjacency and cluster methods the threshold for the edit 
-          distance to connect two UMIs in the network can be increased. The 
-          default value of 1 works best unless the UMI is very long (>14bp).
-        default: 1
-      - name: --spliced_is_unique
-        type: boolean_true
-        description: |
-          Causes two reads that start in the same position on the same strand
-          and having the same UMI to be considered unique if one is spliced 
-          and the other is not. (Uses the ‘N’ cigar operation to test for splicing).
-      - name: --soft_clip_threshold
-        type: integer
-        description: |
-          Mappers that soft clip will sometimes do so rather than mapping a
-          spliced read if there is only a small overhang over the exon junction.
-          By setting this option, you can treat reads with at least this many
-          bases soft-clipped at the 3’ end as spliced.
-        default: 4
-      - name: --multimapping_detection_method
-        type: string
-        description: |
-          If the sam/bam contains tags to identify multimapping reads, you can
-          specify for use when selecting the best read at a given loci. Supported
-          tags are “NH”, “X0” and “XT”. If not specified, the read with the highest
-          mapping quality will be selected.
-      - name: --read_length
-        type: integer
-        description: |
-          Use the read length as a criteria when deduping, for e.g sRNA-Seq.
-  
-  - name: Single-cell RNA-Seq Options
-    arguments:
-      - name: --per_gene
-        type: boolean_true
-        description: |
-          Reads will be grouped together if they have the same gene. This is useful
-          if your library prep generates PCR duplicates with non identical alignment
-          positions such as CEL-Seq. Note this option is hardcoded to be on with the
-          count command. I.e counting is always performed per-gene. Must be combined
-          with either --gene_tag or --per_contig option.
-      - name: --gene_tag
-        type: string
-        description: |
-          Deduplicate per gene. The gene information is encoded in the bam read tag
-          specified.
-      - name: --assigned_status_tag
-        type: string
-        description: |
-          BAM tag which describes whether a read is assigned to a gene. Defaults to
-          the same value as given for --gene_tag.
-      - name: --skip_tags_regex
-        type: string
-        description: |
-          Use in conjunction with the --assigned_status_tag option to skip any reads
-          where the tag matches this regex. Default ("^[__|Unassigned]") matches
-          anything which starts with “__” or “Unassigned”.
-      - name: --per_contig
-        type: boolean_true
-        description: |
-          Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same
-          contig will be considered to have the same alignment position. This is
-          useful if you have aligned to a reference transcriptome with one
-          transcript per gene. If you have aligned to a transcriptome with more
-          than one transcript per gene, you can supply a map between transcripts
-          and gene using the --gene_transcript_map option.
-      - name: --gene_transcript_map
-        type: file
-        description: |
-          A file containing a mapping between gene names and transcript names.
-          The file should be tab separated with the gene name in the first column
-          and the transcript name in the second column.
-      - name: --per_cell
-        type: boolean_true
-        description: |
-          Reads will only be grouped together if they have the same cell barcode.
-          Can be combined with --per_gene.
-  
-  - name: SAM/BAM Options
-    arguments:
-      - name: --mapping_quality
-        type: integer
-        description: |
-          Minimium mapping quality (MAPQ) for a read to be retained.
-        default: 0
-      - name: --unmapped_reads
-        type: string
-        description: |
-          How unmapped reads should be handled. 
-          The options are:
-          "discard": Discard all unmapped reads.
-          "use":     If read2 is unmapped, deduplicate using read1 only. 
-                     Requires --paired.
-          "output":  Output unmapped reads/read pairs without UMI 
-                     grouping/deduplication. Only available in umi_tools group.
-        default: discard
-      - name: --chimeric_pairs
-        type: string
-        description: |
-          How chimeric pairs should be handled. 
-          The options are:
-          "discard": Discard all chimeric read pairs.
-          "use":     Deduplicate using read1 only.
-          "output":  Output chimeric pairs without UMI grouping/deduplication. 
-                     Only available in umi_tools group.
-        default: use
-      - name: --unapired_reads
-        type: string
-        description: |
-          How unpaired reads should be handled. 
-          The options are:
-          "discard": Discard all unpaired reads.
-          "use":     Deduplicate using read1 only.
-          "output":  Output unpaired reads without UMI grouping/deduplication. 
-                     Only available in umi_tools group.
-        default: use
-      - name: --ignore_umi
-        type: boolean_true
-        description: |
-          Ignore the UMI and group reads using mapping coordinates only.
-      - name: --subset
-        type: boolean_true
-        description: |
-          Only consider a fraction of the reads, chosen at random. This is useful
-          for doing saturation analyses.
-      - name: --chrom
-        type: string
-        description: |
-          Only consider a single chromosome. This is useful for debugging/testing 
-          purposes.
-  
-  - name: Group/Dedup Options
-    arguments:
-      - name: --no_sort_output
-        type: boolean_true
-        description: |
-          By default, output is sorted. This involves the use of a temporary unsorted
-          file (saved in --temp-dir). Use this option to turn off sorting.
-      - name: --buffer_whole_contig
-        type: boolean_true
-        description: |
-          Forces dedup to parse an entire contig before yielding any reads for
-          deduplication. This is the only way to absolutely guarantee that all reads
-          with the same start position are grouped together for deduplication since
-          dedup uses the start position of the read, not the alignment coordinate on
-          which the reads are sorted. However, by default, dedup reads for another
-          1000bp before outputting read groups which will avoid any reads being missed
-          with short read sequencing (<1000bp).
-
-
-resources:
-  - type: bash_script
-    path: script.sh
-test_resources:
-  - type: bash_script
-    path: test.sh
-  - type: file
-    path: test_data
-engines:
-  - type: docker
-    image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1
-    setup:
-      - type: docker
-        run: |
-            umi_tools -v | sed 's/ version//g' > /var/software_versions.txt
-runners:
-- type: executable
-- type: nextflow
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt
deleted file mode 100644
index d3c8fa44..00000000
--- a/src/umi_tools/umi_tools_dedup/help.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-```
-umi_tools dedup
-```
-
-dedup - Deduplicate reads using UMI and mapping coordinates
-
-Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM]
-
-       note: If --stdout is ommited, standard out is output. To
-             generate a valid BAM file on standard out, please
-             redirect log with --log=LOGFILE or --log2stderr 
-
-For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh
deleted file mode 100644
index 57c01258..00000000
--- a/src/umi_tools/umi_tools_dedup/script.sh
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/bin/bash
-
-## VIASH START
-## VIASH END
-
-set -e
-
-test_dir="${metal_executable}/test_data"
-
-[[ "$par_paired" == "false" ]] && unset par_paired
-[[ "$par_in_sam" == "false" ]] && unset par_in_sam
-[[ "$par_out_sam" == "false" ]] && unset par_out_sam
-[[ "$par_spliced_is_unique" == "false" ]] && unset par_spliced_is_unique
-[[ "$par_per_gene" == "false" ]] && unset par_per_gene
-[[ "$par_per_contig" == "false" ]] && unset par_per_contig
-[[ "$par_per_cell" == "false" ]] && unset par_per_cell
-[[ "$par_no_sort_output" == "false" ]] && unset par_no_sort_output
-[[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig
-[[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi
-[[ "$par_subset" == "false" ]] && unset par_subset
-
-
-$(which umi_tools) dedup \
-    -I "$par_input" \
-    ${par_in_sam:+--in-sam} \
-    ${par_bai:+--bai "$par_bai"} \
-    ${par_get_output_stats:+--get-output-stats} \
-    ${par_random_seed:+--random-seed "$par_random_seed"} \
-    -S "$par_output" \
-    ${par_out_sam:+--out-sam} \
-    ${par_paired:+--paired} \
-    ${par_output_stats:+--output-stats "$par_output_stats"} \
-    ${par_extract_umi_method:+--extract-umi-method "$par_extract_umi_method"} \
-    ${par_umi_tag:+--umi-tag "$par_umi_tag"} \
-    ${par_umi_separator:+--umi-separator "$par_umi_separator"} \
-    ${par_umi_tag_split:+--umi-tag-split "$par_umi_tag_split"} \
-    ${par_umi_tag_delimiter:+--umi-tag-delimiter "$par_umi_tag_delimiter"} \
-    ${par_cell_tag:+--cell-tag "$par_cell_tag"} \
-    ${par_cell_tag_split:+--cell-tag-split "$par_cell_tag_split"} \
-    ${par_cell_tag_delimiter:+--cell-tag-delimiter "$par_cell_tag_delimiter"} \
-    ${par_method:+--method "$par_method"} \
-    ${par_edit_distance_threshold:+--edit-distance-threshold "$par_edit_distance_threshold"} \
-    ${par_spliced_is_unique:+--spliced-is-unique} \
-    ${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \
-    ${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \
-    ${par_read_length:+--read-length "$par_read_length"} \
-    ${par_per_gene:+--per-gene} \
-    ${par_gene_tag:+--gene-tag "$par_gene_tag"} \
-    ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \
-    ${par_skip_tags_regex:+--skip-tags-regex "$par_skip_tags_regex"} \
-    ${par_per_contig:+--per-contig}
-    ${par_gene_transcript_map:+--gene-transcript-map "$par_gene_transcript_map"} \
-    ${par_per_cell:+--per-cell} \
-    ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \
-    ${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \
-    ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \
-    ${par_unapired_reads:+--unapired-reads "$par_unapired_reads"} \
-    ${par_ignore_umi:+--ignore-umi} \
-    ${par_subset:+--subset} \
-    ${par_chrom:+--chrom "$par_chrom"} \
-    ${par_no_sort_output:+--no-sort-output} \
-    ${par_buffer_whole_contig:+--buffer-whole-contig}
-
-
-exit 0
\ No newline at end of file
diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh
deleted file mode 100644
index 1459ec08..00000000
--- a/src/umi_tools/umi_tools_dedup/test.sh
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/bin/bash
-
-test_dir="${meta_resources_dir}/test_data"
-echo ">>> Testing $meta_functionality_name"
-
-"$meta_executable" \
-  --bam "$test_dir/a.sorted.bam" \
-  --bai "$test_dir/a.sorted.bam.bai" \
-  --output "$test_dir/a.sorted.idxstats"
-
-echo ">>> Checking whether output exists"
-[ ! -f "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' does not exist!" && exit 1
-
-echo ">>> Checking whether output is non-empty"
-[ ! -s "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' is empty!" && exit 1
-
-echo ">>> Checking whether output is correct"
-diff "$test_dir/a.sorted.idxstats" "$test_dir/a_ref.sorted.idxstats" || \
-    (echo "Output file a.sorted.idxstats does not match expected output" && exit 1)
-
-rm "$test_dir/a.sorted.idxstats"
-
-############################################################################################
-
-echo ">>> Testing $meta_functionality_name with singletons in the input"
-
-"$meta_executable" \
-  --bam "$test_dir/test.paired_end.sorted.bam" \
-  --bai "$test_dir/test.paired_end.sorted.bam.bai" \
-  --output "$test_dir/test.paired_end.sorted.idxstats"
-
-echo ">>> Checking whether output exists"
-[ ! -f "$test_dir/test.paired_end.sorted.idxstats" ] && \
-    echo "File 'test.paired_end.sorted.idxstats' does not exist!" && exit 1
-
-echo ">>> Checking whether output is non-empty"
-[ ! -s "$test_dir/test.paired_end.sorted.idxstats" ] && \
-    echo "File 'test.paired_end.sorted.idxstats' is empty!" && exit 1
-
-echo ">>> Checking whether output is correct"
-diff "$test_dir/test.paired_end.sorted.idxstats" "$test_dir/test_ref.paired_end.sorted.idxstats" || \
-    (echo "Output file test.paired_end.sorted.idxstats does not match expected output" && exit 1)
-
-rm "$test_dir/test.paired_end.sorted.idxstats"
-
-############################################################################################
-
-echo "All tests succeeded!"
-exit 0
\ No newline at end of file

From 1f02f94844b2196c4208afe64d50cc8d14094227 Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Sun, 2 Jun 2024 20:59:47 +0200
Subject: [PATCH 03/11] initial commit

---
 src/dupradar/config.vsh.yaml      | 117 +++++++++++++++++++++++
 src/dupradar/help.txt             |   0
 src/dupradar/script.R             | 154 ++++++++++++++++++++++++++++++
 src/dupradar/script.sh            |  32 +++++++
 src/dupradar/test.sh              |  51 ++++++++++
 src/dupradar/test_data/genes.gtf  |  36 +++++++
 src/dupradar/test_data/sample.bam | Bin 0 -> 8680 bytes
 src/dupradar/test_data/script.sh  |   9 ++
 8 files changed, 399 insertions(+)
 create mode 100644 src/dupradar/config.vsh.yaml
 create mode 100644 src/dupradar/help.txt
 create mode 100644 src/dupradar/script.R
 create mode 100644 src/dupradar/script.sh
 create mode 100644 src/dupradar/test.sh
 create mode 100644 src/dupradar/test_data/genes.gtf
 create mode 100644 src/dupradar/test_data/sample.bam
 create mode 100644 src/dupradar/test_data/script.sh

diff --git a/src/dupradar/config.vsh.yaml b/src/dupradar/config.vsh.yaml
new file mode 100644
index 00000000..cae6964f
--- /dev/null
+++ b/src/dupradar/config.vsh.yaml
@@ -0,0 +1,117 @@
+name: dupradar
+description: Assessment of duplication rates in RNA-Seq datasets
+keywords: [ rnaseq, duplication, genomics ]
+links:
+  homepage: https://bioconductor.org/packages/release/bioc/html/dupRadar.html
+  documentation: https://bioconductor.org/packages/release/bioc/vignettes/dupRadar/inst/doc/dupRadar.html
+  repository: https://github.com/ssayols/dupRadar
+references: 
+  doi: "10.1186/s12859-016-1276-2"
+license: GPL-3.0
+
+argument_groups:
+- name: "Input"
+  arguments: 
+  - name: "--id"
+    type: string
+    description: Sample ID
+    
+  - name: "--input"
+    type: file 
+    required: true
+    description: path to input alignment file in BAM format
+
+  - name: "--gtf_annotation"
+    type: file 
+    required: true
+    description: path to GTF annotation file.
+
+  - name: "--paired"
+    type: boolean
+    description: add flag if input alignment file consists of paired reads
+
+  - name: "--strandedness"
+    type: string
+    required: false
+    choices: ["forward", "reverse", "unstranded"]
+    description: strandedness of input bam file reads (forward, reverse or unstranded (default, applicable to paired reads))
+
+  - name: "--versions"
+    type: file
+    must_exist: false
+    
+- name: "Output"
+  arguments: 
+  - name: "--output_dupmatrix"
+    type: file
+    direction: output
+    required: false
+    must_exist: true
+    default: $id.dup_matrix.txt
+    description: path to output file (txt) of duplicate tag counts
+  
+  - name: "--output_dup_intercept_mqc"
+    type: file
+    direction: output
+    required: false
+    must_exist: true
+    default: $id.dup_intercept_mqc.txt
+    description: path to output file (txt) of multiqc intercept value DupRadar
+
+  - name: "--output_duprate_exp_boxplot"
+    type: file
+    direction: output
+    required: false
+    must_exist: true
+    default: $id.duprate_exp_boxplot.pdf
+    description: path to output file (pdf) of distribution of expression box plot
+  
+  - name: "--output_duprate_exp_densplot"
+    type: file
+    direction: output
+    required: false
+    must_exist: true
+    default: $id.duprate_exp_densityplot.pdf
+    description: path to output file (pdf) of 2D density scatter plot of duplicate tag counts
+  
+  - name: "--output_duprate_exp_denscurve_mqc"
+    type: file
+    direction: output
+    required: false
+    must_exist: true
+    default: $id.duprate_exp_density_curve_mqc.txt
+    description: path to output file (pdf) of density curve of gene duplication multiqc
+  
+  - name: "--output_expression_histogram"
+    type: file
+    direction: output
+    required: false
+    must_exist: true
+    default: $id.expression_hist.pdf
+    description: path to output file (pdf) of distribution of RPK values per gene histogram
+  
+  - name: "--output_intercept_slope"
+    type: file
+    direction: output
+    required: false
+    must_exist: true
+    default: $id.intercept_slope.txt
+    description: output file (txt) with progression of duplication rate values
+
+resources:
+  - type: bash_script
+    path: script.sh
+  # Copied from https://github.com/nf-core/rnaseq/blob/3.12.0/bin/dupradar.r
+  - type: R_script
+    path: script.R
+
+test_resources:
+  - type: bash_script
+    path: test.sh
+  
+engines:
+  - type: docker
+    image: quay.io/biocontainers/bioconductor-dupradar:1.32.0--r43hdfd78af_0
+runners:
+  - type: executable  
+  - type: nextflow
diff --git a/src/dupradar/help.txt b/src/dupradar/help.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/src/dupradar/script.R b/src/dupradar/script.R
new file mode 100644
index 00000000..82218c6e
--- /dev/null
+++ b/src/dupradar/script.R
@@ -0,0 +1,154 @@
+#!/usr/bin/env Rscript
+
+# Command line argument processing
+args = commandArgs(trailingOnly=TRUE)
+if (length(args) < 5) {
+    stop("Usage: dupRadar.r <input.bam> <sample_id> <annotation.gtf> <strandDirection:0=unstranded/1=forward/2=reverse> <paired/single> <nbThreads> <R-package-location (optional)>", call.=FALSE)
+}
+
+message("paired_end is", args[5])
+message("the type is is", class(args[5]))
+
+input_bam <- args[1]
+output_prefix <- args[2]
+annotation_gtf <- args[3]
+stranded <- as.numeric(args[4])
+paired_end <- ifelse(args[5] == "true", TRUE, FALSE)
+threads <- as.numeric(args[6])
+
+bamRegex <- "(.+)\\.bam$"
+
+if(!(grepl(bamRegex, input_bam) && file.exists(input_bam) &&  (!file.info(input_bam)$isdir))) stop("First argument '<input.bam>' must be an existing file (not a directory) with '.bam' extension...")
+if(!(file.exists(annotation_gtf) &&  (!file.info(annotation_gtf)$isdir))) stop("Third argument '<annotation.gtf>' must be an existing file (and not a directory)...")
+if(is.na(stranded) || (!(stranded %in% (0:2)))) stop("Fourth argument <strandDirection> must be a numeric value in 0(unstranded)/1(forward)/2(reverse)...")
+if(is.na(threads) || (threads<=0)) stop("Fifth argument <nbThreads> must be a strictly positive numeric value...")
+
+# Debug messages (stderr)
+message("Input bam      (Arg 1): ", input_bam)
+message("Output basename(Arg 2): ", output_prefix)
+message("Input gtf      (Arg 3): ", annotation_gtf)
+message("Strandness     (Arg 4): ", c("unstranded", "forward", "reverse")[stranded+1])
+message("paired_end     (Arg 5): ", paired_end)
+message("Nb threads     (Arg 6): ", threads)
+message("R package loc. (Arg 7): ", ifelse(length(args) > 4, args[5], "Not specified"))
+
+
+# Load / install packages
+if (length(args) > 5) { .libPaths( c( args[6], .libPaths() ) ) }
+if (!require("dupRadar")){
+    source("http://bioconductor.org/biocLite.R")
+    biocLite("dupRadar", suppressUpdates=TRUE)
+    library("dupRadar")
+}
+if (!require("parallel")) {
+    install.packages("parallel", dependencies=TRUE, repos='http://cloud.r-project.org/')
+    library("parallel")
+}
+
+# Duplicate stats
+dm <- analyzeDuprates(input_bam, annotation_gtf, stranded, paired_end, threads)
+write.table(dm, file=paste(output_prefix, "_dupMatrix.txt", sep=""), quote=F, row.name=F, sep="\t")
+
+# 2D density scatter plot
+pdf(paste0(output_prefix, "_duprateExpDens.pdf"))
+duprateExpDensPlot(DupMat=dm)
+title("Density scatter plot")
+mtext(output_prefix, side=3)
+dev.off()
+fit <- duprateExpFit(DupMat=dm)
+cat(
+    paste("- dupRadar Int (duprate at low read counts):", fit$intercept),
+    paste("- dupRadar Sl (progression of the duplication rate):", fit$slope),
+    fill=TRUE, labels=output_prefix,
+    file=paste0(output_prefix, "_intercept_slope.txt"), append=FALSE
+)
+
+# Create a multiqc file dupInt
+sample_name <- gsub("Aligned.sortedByCoord.out.markDups", "", output_prefix)
+line="#id: DupInt
+#plot_type: 'generalstats'
+#pconfig:
+#    dupRadar_intercept:
+#        title: 'dupInt'
+#        namespace: 'DupRadar'
+#        description: 'Intercept value from DupRadar'
+#        max: 100
+#        min: 0
+#        scale: 'RdYlGn-rev'
+#        format: '{:.2f}%'
+Sample dupRadar_intercept"
+
+write(line,file=paste0(output_prefix, "_dup_intercept_mqc.txt"),append=TRUE)
+write(paste(sample_name, fit$intercept),file=paste0(output_prefix, "_dup_intercept_mqc.txt"),append=TRUE)
+
+# Get numbers from dupRadar GLM
+curve_x <- sort(log10(dm$RPK))
+curve_y = 100*predict(fit$glm, data.frame(x=curve_x), type="response")
+# Remove all of the infinite values
+infs = which(curve_x %in% c(-Inf,Inf))
+curve_x = curve_x[-infs]
+curve_y = curve_y[-infs]
+# Reduce number of data points
+curve_x <- curve_x[seq(1, length(curve_x), 10)]
+curve_y <- curve_y[seq(1, length(curve_y), 10)]
+# Convert x values back to real counts
+curve_x = 10^curve_x
+# Write to file
+line="#id: dupradar
+#section_name: 'DupRadar'
+#section_href: 'bioconductor.org/packages/release/bioc/html/dupRadar.html'
+#description: \"provides duplication rate quality control for RNA-Seq datasets. Highly expressed genes can be expected to have a lot of duplicate reads, but high numbers of duplicates at low read counts can indicate low library complexity with technical duplication.
+#    This plot shows the general linear models - a summary of the gene duplication distributions. \"
+#pconfig:
+#    title: 'DupRadar General Linear Model'
+#    xLog: True
+#    xlab: 'expression (reads/kbp)'
+#    ylab: '% duplicate reads'
+#    ymax: 100
+#    ymin: 0
+#    tt_label: '<b>{point.x:.1f} reads/kbp</b>: {point.y:,.2f}% duplicates'
+#    xPlotLines:
+#        - color: 'green'
+#          dashStyle: 'LongDash'
+#          label:
+#                style: {color: 'green'}
+#                text: '0.5 RPKM'
+#                verticalAlign: 'bottom'
+#                y: -65
+#          value: 0.5
+#          width: 1
+#        - color: 'red'
+#          dashStyle: 'LongDash'
+#          label:
+#                style: {color: 'red'}
+#                text: '1 read/bp'
+#                verticalAlign: 'bottom'
+#                y: -65
+#          value: 1000
+#          width: 1"
+
+write(line,file=paste0(output_prefix, "_duprateExpDensCurve_mqc.txt"),append=TRUE)
+write.table(
+    cbind(curve_x, curve_y),
+    file=paste0(output_prefix, "_duprateExpDensCurve_mqc.txt"),
+    quote=FALSE, row.names=FALSE, col.names=FALSE, append=TRUE,
+)
+
+# Distribution of expression box plot
+pdf(paste0(output_prefix, "_duprateExpBoxplot.pdf"))
+duprateExpBoxplot(DupMat=dm)
+title("Percent Duplication by Expression")
+mtext(output_prefix, side=3)
+dev.off()
+
+# Distribution of RPK values per gene
+pdf(paste0(output_prefix, "_expressionHist.pdf"))
+expressionHist(DupMat=dm)
+title("Distribution of RPK values per gene")
+mtext(output_prefix, side=3)
+dev.off()
+
+# Print sessioninfo to standard out
+print(output_prefix)
+citation("dupRadar")
+sessionInfo()
diff --git a/src/dupradar/script.sh b/src/dupradar/script.sh
new file mode 100644
index 00000000..85d51167
--- /dev/null
+++ b/src/dupradar/script.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+set -eo pipefail 
+
+function num_strandness {
+    if [ $par_strandedness == 'unstranded' ]; then echo 0
+    elif [ $par_strandedness == 'forward' ]; then echo 1
+    elif [ $par_strandedness == 'reverse' ]; then echo 2
+    else echo "strandedness must be unstranded, forward or reverse." && \
+        exit 1
+    fi
+}
+
+Rscript "$meta_resources_dir/script.R" \
+    $par_input \
+    $par_id \
+    $par_gtf_annotation \
+    $(num_strandness) \
+    $par_paired
+
+mv "$par_id"_dupMatrix.txt $par_output_dupmatrix
+mv "$par_id"_dup_intercept_mqc.txt $par_output_dup_intercept_mqc
+mv "$par_id"_duprateExpBoxplot.pdf $par_output_duprate_exp_boxplot
+mv "$par_id"_duprateExpDens.pdf $par_output_duprate_exp_densplot
+mv "$par_id"_duprateExpDensCurve_mqc.txt $par_output_duprate_exp_denscurve_mqc
+mv "$par_id"_expressionHist.pdf $par_output_expression_histogram
+mv "$par_id"_intercept_slope.txt $par_output_intercept_slope
+
+
+dupradar_ver=$(Rscript -e "library(dupRadar); cat(as.character(packageVersion('dupRadar')))")
+text="bioconductor-dupradar: ${dupradar_ver}"
+echo "$text"
\ No newline at end of file
diff --git a/src/dupradar/test.sh b/src/dupradar/test.sh
new file mode 100644
index 00000000..01072e01
--- /dev/null
+++ b/src/dupradar/test.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# define input and output for script
+input_bam="$meta_resources_dir/sample.bam"
+input_gtf="$meta_resources_dir/genes.gtf"
+
+output_dupmatrix="dup_matrix.txt"
+output_dup_intercept_mqc="dup_intercept_mqc.txt"
+output_duprate_exp_boxplot="duprate_exp_boxplot.pdf"
+output_duprate_exp_densplot="duprate_exp_densityplot.pdf"
+output_duprate_exp_denscurve_mqc="duprate_exp_density_curve_mqc.pdf"
+output_expression_histogram="expression_hist.pdf"
+output_intercept_slope="intercept_slope.txt"
+
+# Run executable
+echo "> Running $meta_functionality_name for unpaired reads, writing to tmpdir $tmpdir."
+
+"$meta_executable" \
+    --input "$input_bam" \
+    --id "test" \
+    --gtf_annotation "$input_gtf" \
+    --strandedness "forward" \
+    --paired false \
+    --output_dupmatrix $output_dupmatrix \
+    --output_dup_intercept_mqc $output_dup_intercept_mqc \
+    --output_duprate_exp_boxplot $output_duprate_exp_boxplot \
+    --output_duprate_exp_densplot $output_duprate_exp_densplot \
+    --output_duprate_exp_denscurve_mqc $output_duprate_exp_denscurve_mqc \
+    --output_expression_histogram $output_expression_histogram \
+    --output_intercept_slope $output_intercept_slope
+
+exit_code=$?
+[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1
+
+echo ">> asserting output has been created for paired read input"
+[ ! -f "$output_dupmatrix" ] && echo "$output_dupmatrix was not created" && exit 1
+[ ! -s "$output_dupmatrix" ] && echo "$output_dupmatrix is empty" && exit 1
+[ ! -f "$output_dup_intercept_mqc" ] && echo "$output_dup_intercept_mqc was not created" && exit 1
+[ ! -s "$output_dup_intercept_mqc" ] && echo "$output_dup_intercept_mqc is empty" && exit 1
+[ ! -f "$output_duprate_exp_boxplot" ] && echo "$output_duprate_exp_boxplot was not created" && exit 1
+[ ! -s "$output_duprate_exp_boxplot" ] && echo "$output_duprate_exp_boxplot is empty" && exit 1
+[ ! -f "$output_duprate_exp_densplot" ] && echo "$output_duprate_exp_densplot was not created" && exit 1
+[ ! -s "$output_duprate_exp_densplot" ] && echo "$output_duprate_exp_densplot is empty" && exit 1
+[ ! -f "$output_duprate_exp_denscurve_mqc" ] && echo "$output_duprate_exp_denscurve_mqc was not created" && exit 1
+[ ! -s "$output_duprate_exp_denscurve_mqc" ] && echo "$output_duprate_exp_denscurve_mqc is empty" && exit 1
+[ ! -f "$output_expression_histogram" ] && echo "$output_expression_histogram was not created" && exit 1
+[ ! -s "$output_expression_histogram" ] && echo "$output_expression_histogram is empty" && exit 1
+[ ! -f "$output_intercept_slope" ] && echo "$output_intercept_slope was not created" && exit 1
+[ ! -s "$output_intercept_slope" ] && echo "$output_intercept_slope is empty" && exit 1
+
+exit 0
\ No newline at end of file
diff --git a/src/dupradar/test_data/genes.gtf b/src/dupradar/test_data/genes.gtf
new file mode 100644
index 00000000..423ba8f8
--- /dev/null
+++ b/src/dupradar/test_data/genes.gtf
@@ -0,0 +1,36 @@
+chr1	unknown	exon	14362	14829	.	-	.	gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514";
+chr1	unknown	exon	14970	15038	.	-	.	gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514";
+chr1	unknown	exon	15796	15947	.	-	.	gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514";
+chr1	unknown	exon	16607	16765	.	-	.	gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514";
+chr1	unknown	exon	16858	17055	.	-	.	gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514";
+chr1	unknown	exon	17233	17368	.	-	.	gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514";
+chr1	unknown	exon	17606	17742	.	-	.	gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514";
+chr1	unknown	exon	17915	18061	.	-	.	gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514";
+chr1	unknown	exon	18268	18366	.	-	.	gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514";
+chr1	unknown	exon	24738	24891	.	-	.	gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514";
+chr1	unknown	exon	29321	29370	.	-	.	gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514";
+chr1	unknown	exon	34611	35174	.	-	.	gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818_1"; tss_id "TSS8403";
+chr1	unknown	exon	34611	35174	.	-	.	gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820_1"; tss_id "TSS8403";
+chr1	unknown	exon	35277	35481	.	-	.	gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818_1"; tss_id "TSS8403";
+chr1	unknown	exon	35277	35481	.	-	.	gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820_1"; tss_id "TSS8403";
+chr1	unknown	exon	35721	36081	.	-	.	gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818_1"; tss_id "TSS8403";
+chr1	unknown	exon	35721	36081	.	-	.	gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820_1"; tss_id "TSS8403";
+chr1	unknown	CDS	69091	70005	.	+	0	gene_id "OR4F5"; gene_name "OR4F5"; p_id "P1230"; transcript_id "NM_001005484"; tss_id "TSS14428";
+chr1	unknown	exon	69091	70008	.	+	.	gene_id "OR4F5"; gene_name "OR4F5"; p_id "P1230"; transcript_id "NM_001005484"; tss_id "TSS14428";
+chr1	unknown	start_codon	69091	69093	.	+	.	gene_id "OR4F5"; gene_name "OR4F5"; p_id "P1230"; transcript_id "NM_001005484"; tss_id "TSS14428";
+chr1	unknown	stop_codon	70006	70008	.	+	.	gene_id "OR4F5"; gene_name "OR4F5"; p_id "P1230"; transcript_id "NM_001005484"; tss_id "TSS14428";
+chr1	unknown	exon	134773	139696	.	-	.	gene_id "LOC729737"; gene_name "LOC729737"; transcript_id "NR_039983"; tss_id "TSS18541";
+chr1	unknown	exon	139790	139847	.	-	.	gene_id "LOC729737"; gene_name "LOC729737"; transcript_id "NR_039983"; tss_id "TSS18541";
+chr1	unknown	exon	140075	140566	.	-	.	gene_id "LOC729737"; gene_name "LOC729737"; transcript_id "NR_039983"; tss_id "TSS18541";
+chr1	unknown	exon	323892	324060	.	+	.	gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322_1"; tss_id "TSS12303";
+chr1	unknown	exon	324288	324345	.	+	.	gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322_1"; tss_id "TSS12303";
+chr1	unknown	exon	324439	328581	.	+	.	gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322_1"; tss_id "TSS12303";
+chr19	unknown	exon	76220	76783	.	-	.	gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820"; tss_id "TSS16829";
+chr19	unknown	exon	76220	76783	.	-	.	gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818"; tss_id "TSS16829";
+chr19	unknown	exon	76886	77090	.	-	.	gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820"; tss_id "TSS16829";
+chr19	unknown	exon	76886	77090	.	-	.	gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818"; tss_id "TSS16829";
+chr19	unknown	exon	77330	77690	.	-	.	gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820"; tss_id "TSS16829";
+chr19	unknown	exon	77330	77690	.	-	.	gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818"; tss_id "TSS16829";
+chr5	unknown	exon	180750507	180750675	.	+	.	gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322"; tss_id "TSS11538";
+chr5	unknown	exon	180750903	180750960	.	+	.	gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322"; tss_id "TSS11538";
+chr5	unknown	exon	180751054	180755196	.	+	.	gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322"; tss_id "TSS11538";
diff --git a/src/dupradar/test_data/sample.bam b/src/dupradar/test_data/sample.bam
new file mode 100644
index 0000000000000000000000000000000000000000..04d8c95eeba209f13a4704d596a800e73d6ff955
GIT binary patch
literal 8680
zcmV<EAs5~siwFb&00000{{{d;LjnLq0#%YdXj4%X$B(V8eN{R+bP&nl+HlXg_uP-z
zLSiaaXk%?46u}`)V%pLswF#+%2rf=Kii3-rv*03v;8p}z5kzMf!L8ur>wO=+86Ldf
z`@MV5`~UB|QCU0pdjg<*YdLDvN?4Rp{Z6UX?;o~zdd*>Hwp_m()oZ2J_8~^AwGt^u
zEHq&tC=n8!lST$Yo(gnUP;5mYR3Z>rk<o!*FRXCLq{9{33xn|zp-mu+7wk}4t;6Bg
z3#Fx2@%%uzL?EGzzc~yjyphaO85IseOpOqlQHO>SBQ2wK9GWCO#_v08Y-p1B!bs&<
ziO_Ijh$Tu#q45PcAz2xgq3{J*M8||<d-(#46~>Sn8}AE<8zbv%xE1;QTE^p<!mY^1
zR~E^QXC$AVQ3}O6tYtGf6(%Ei>>Ss;&}`!3*H@z3%cVhcf7tKu4Wjkh*}eZ2(G@AG
zuI5h*qn*xCVPQ}ZML~syt-?|FX0O$6cdE_3VW+jd(QDQ_4_Ef%V8(9XBg5^=-cGkS
z*ytQkgNj?t{n>c{n2N8{9$=~ZV=e`P7k{2)AmQrEuNPBL&^~Twz_9rIcs2zM*C%8K
z2D;V9Q3@7DjSn*^I5_#VJ2RaQ74Ulga!v>;x|9>a$?m@k8A)i|`#GJFf@{+ca$+by
zy^@oL%KNRUj0}AF`|W&27RvYE=H%ej;<L$=NWj6n&p9D<yT2wfA{c$2n@l5Y!keW^
z7H$nbmI3|)FKa3`IRpRzABzYC000000RIL6LPG)oh9BKrTdd_~b^d?bdlDuPj$H>F
z63VgTBn}sG_WhCx9D5w%q%=)5g&A%kA<(o^AF7Insw!l*YlL4aAgHJp2~b-^)fN?1
zDlb)ON>HS>6aoYuN=w2;3hDz;2^2yK;altfe|zsed!NkSPX~6+#oK4*#NVv7zO~l3
zmi?$uulG!ZxL(}qNwFt>1@E8HFZ$V_clHgprRlkIZ!ONg?#!*PxH&yD>W$ypopooU
z@vJ+V^k)6RNPN-}BJ;B<uhJ|oqbS68&(D0%cZ(>>qA;tfDoetAUznGa^wrXDgtUbv
z>18>VCGmj+$F`+y<DDZNym1_yQV0CH{Z~)R+l$kk)2H8f?{EHYw<k{DaqGX`E5(=e
z|KWqcb?ru6x4nL-aWSf1ZwgeS*?2JP9vn<(<58DUMJsT!2rmQ2kNl{Pva&_B`Q_l3
zNk=kRMs7Qn6t)4QaoL2*UPbj!3#dLvsO(LsUOT;cEvm_Ea6rfgLq_FhVU}cZ64j}n
z<Yg9yVUblu8JAuZm64mfg<q_oa@c5&O}2pXB-;xBY%-JsGuiZ=Ok_Din~L9PR2EQK
z8r6_cz2=syPkdSb^5;&4xX#|%2m3J80S@{DhQ*hiq?w;oWs)aF5d}q<W^SBCZjt2p
zi1Tz${qq~g6U|$A9ej;X0h~pM=oBnx(@$&-W$~q5a0%84z%s`2rn|V{aMm4mhqD2U
zPpF<9l3y$v%hrJKpF^L(0YvaOn9l|t{8EDE(4mbNklBbaia8r6)&<rnxS%oC3&>bo
zqZ+|X-N~%m(>}NcRMmjWa~Gh(>X70@(7(ThE5?<Q`_WhBZ4R+SXW1q=(dlqJ**c_f
zhCZ|pF2R~CMAjdYvE-JK)dQ@-@L)C=E50bp2-Hv|nO|jjnC5;S<bGLsUY2LBpOs0J
zXVC?GK{zBMBS4O31+oPl2{sf5mu5iQVptZ;tWnvEsQ%+^QhZ(i>PLa<dPJ7JWn=-@
za0(|JO~m^V5)WM}#6J8AT%MP89ffIHMM)J$WspX&BZ$NgqBJbsB8&D`@yUe)x%vZy
z7{?y@D@7T<K_rs5st`MOCVj<1n2mt(%o9&Prhms{+8wt<*gXI#cDu8|Ks+uKz()YE
zE+ZhxqB;Y?S6-OaMdhYx8GAwKmr)qic@m$;mC7~n7LIIU5PW6UwD5}^TS&5)L+{u(
zZCg{L0eHzGk9_>*t4}<xf7v&tDn)MzaX6-ML)=Y<;z5n^`?!>&AXXY8LJ#&}qYwgu
zi;KsV6Zw<tk*O2{M{-o!&O&+?IuxCb=1H**<Hvx}IO=yaSK8aKN`EjLG0NdY+@TTP
zjWAqr&OkZmkBoDANtKmBo+RPLq-||gi?afiCz5Lwe7Et=CM$DTQrcdJ@WVj(4gG80
zH3REyZ^bdbbSWu}`V+ASaA#gA#7FUWj`C1oEWH|hb4Z4hEeg75rhGUeRUsBmNXl?j
zogkkxvrsl55+ycLwU;#MZXL*vWLKZ~f(CibDIsF9ZGAGDO?%zh^gyMCAP!5<3yUJj
ziqiALJS>B-@S_rh8kNY0ep$}<=&BuxCje22&8A#HRN67AKs33*W@qgzxM;`fnB$d<
z#{@I_sYwhkJGDa=A0h6#gTZXj?=vPZ+KeY6Jgz%0&#!ep{YAwgF(U;YyCOLtS&vf!
zY^k#zhoZ)GV$o1foNjBA$*CP{lkot#VKik_aUF&z{DZLc=UD<;p(4KwizIQ~Aiy!2
zg7DBFk^ISDE$x<6V-XI?Aq67>kkuw`Qx)5e=1XFd1xqoi|2o~e-{4h92G87TZJX|3
zt=_?`-y4hH!BzEd5aJL-86{<u=YA3;xmUTl?`4$QtE7m_BunxNp%%?AnnqMbW_lk=
z14OHw>umZbz(2(TjYyN<Mg&Q2g0#U<RB0Xb`Y29NZr!0F-3>@UROP7r<F$xlM$}kr
z-OJJ_C_h_Vk61W{%^bX^GLub(h(sg`UHWTC_M+8pdG*yN{zw1LQ-Bz6gt&n;rl`KA
zli6@G72g*M;DZ2I!CvuE_6qYnI@}D-k`7fMRF<$+Eb>i>UkvxekH~KHRcP$iMt~^*
z8XMkpqzyN^<zz7Hb*WStbB#g}sYc0Dw4qg=q<)&lal#nOBC4V)%aGlYz|XVzD7RGj
zh(|20B`pF;hm^|Th}6<qk`On2j=c`z;j7yg?%Rh1@wV-Vf5>jyqui$I=L-SF0OS#%
zxTyRb)~)h&)|1@!v~T!jH*X#KVE?2PN7mCO)HJyS>*av;i2fD7lp?2zEn!V&0}$n4
zdZ60UBFgeQ&5JxO^1R5r3hlQ?<e1=XUg2bxpO>n}pqdK~1{<ihjQ*ob=FCm!5pIM<
za=bYgyr?<Sm@ZlD32uCo6kpVs&SsROZ`+Rg`(rc%Q_#Nx6=Z(lr&WrggLpoKsUl?N
zJPhZlKQGF%%Dq<}a<L?A<~BwLA0xBi1pmZh=i>&Cn>#9#byKQqOGW+oBkS6?C*L52
z39<KQ%Kx_G5E2il*FlBFEOKw8Sfuc4kVqQlbrQowR6&Lr7$r&3s8&Hy;fMYn*@jTT
zLL}!f@!`nSA5z!A_-N}8+Z4rgmOeUo-&w4!oJC(ecVnBff8#Wz>}`D!(bb)dXXB}g
zuo{=2doFw~2wfD9NmN#_7O7f67*^=+s@mHlqY=kjDu|?(d&u>uRi$?%&Qap#R*k-K
z>Xxbuq5gpKxolGQk~e<moD`30RQCbZ^&3%b<i^2x)*Hef2R$YJR0+H>^m7zfuIDFZ
znItGfK$Jd;$pE54U2tO46$sZ-w&d?b`y^`FP0?X5)QRS><LJ-UVi(|g;#{jI{PvO7
z+7M-&vH@>087uMUSxx1K3;nhfO<zj&6dS?edeK&7V%beOW!Qt21e+jP+7DMzee+z)
z9fT_0)(64x!^sdXs1&W>qCl~9<*1&Ei+3UuTqe<=v4zcT3L)tjC|doYAF%zX%3o)y
zZ!VbX{#^pAH<^v8H#-<8M%?Ih8G1An9Vqib8AKLb`r1kp=DvjMUdq3c<vF4e;k0~c
zwVnv6$YyNLdI$UhQ2n<?^#P!|W@|=-srs}2urCg9bzi<*2x>|I$<vVHRY+do)k%r6
z>b!|8FrdJ-1C>;zlI(DV88Obbn+8p>qxuQ{6g*JDTT^i`zR#bP!c>hPBv;&sZ$p3$
z5MZb}`h%{xLj$~<dg;w3Yw*>JC6y71nF_Vs3Tu+X(%eEafl@AL(?O@rXG@aE62Q-$
zZ4LOxM;dVR1Y<flfGo0Pz>9&GTvR7xsen`z!yIpsR|>mvQtn7M;rsUm1ODkQ2Bg9Y
zTr%iQSj!@Sq-|XUytp_N&E(1OW%Es{%%Bt=^$C<n>XI>P9h#>AMl)Co@Zqy9S^gaX
zw72Ay(GX<`=4xXl955MT(v+jG&fPSeCuN;@D2e?n4N(!7e&%K{F@>6%Qjn4?97XmW
zTNP6{VGEW+U@Kji=+3&c5Q_GOp~l-!U46oU`iCPyZSu-KOXZWH0u^mWlq*D#%r9_9
z)ix^&5=t)Xsi1U7GIl7~EXDbvL1?MZb)fD7C{u(yzFQG8Bq`*8DiMWi89;<C24fCP
zH;@;ikW?R(C#t*F;wm$UDxbK6kxGuExT5QpE%)0OOrMMwuIeEN!1Yy)>&v+iH*Aj!
z<uJ`P=s-7Sq%mfG=%)bYqYSQ4N5XhW+t3_gzbrtHy)P61R;Cz^;Mo(mE?ib|LBdiG
z(S&L+Z8chR>?MHfklnaY41X=(p}ZQS4(=lZ9Oz>BSrNr~vM3fbVw|PlG8b|uiBGB7
zg`E-T^n{I50(aZ?T2y_Y`ilM^d>5#a?d>%ka|?~sHXbTqrbofJ8Z-DLDXS12b(v>n
zoYi@igm|?9X0@{!i#?LLM2$g79N@1snkQ0GG3L-)79IiBTAi*ju>BibdEmzYcD>v_
zzy?$gPiB*;5*&fZ+k_|vB+L!V<h((pN_*BD2_k>6>_#0ZPehsB9h;3~HEMTD9mZg~
zCu4@c|J#o=^1#!%N?}_uJlMTA0@06EIb3O{D;J$EPpTp-z0C9c6mU^{aF*35Y&x{r
z9*)*~)v@Yb?B;NhF`8p-t3^!j1}0N>gf}0RmN&JT14LF2n5N<pT;W3}!KwI{Z-HbQ
z2$%d-h%sme^C1c7w7mIcGi)U#Sjsht{FyX)Ixj6Vo0n_)XvX*PZ#~jXn%?#r#T;AV
z>rP=d)K-1e9iPPIeCxR?YzPV@s~oC+;njJFBNro>G|rIKG8#h#33Pojmjx4ALPeIk
ze4Ea*SxG#gf{}9pC2L-&AaAs6#hYwYmC3AiLzr()+Iiu7UZb+vmU82WFitVE==Hd0
zB#Z|v%6T4e<WZeKnFBaDDpVf`b9n~-OmRLz6RM|Us(R8QH)jGKIg#Q<OfOg?U~;i}
zj3M~P#)8~kFVt3-F4k6^*R@*+zjq6beYfO)cxksc0L-pV0A=Zx5QVEeBDok1Tp0#<
zOL7)yrL2+!S^T_s76&D}@ic>cRW@T^o?%NxpfowvYVigA_2a+Z&g(yX%Tdm`0TyKN
z9x49QZl9y?-anEYeHTKha*XLAmH0tH;~F5Yj;Q)o;E|+b5e|aIc})8vulZPXKr)fN
zJgr|6>RvUF!lbxw7dxf6?g4dfqwdTskRWNNRT0#l7l0+bJn`V7dFY2p>A4ZjeoK$2
zMK*cclv{immZ~Jxl8@cI6ieD0YOTeI8UuUhnYP-0%WXSIBjk}NAz?d}-%3~uwMZ2E
zv<y?=FsWS>jb#>v822W18CI+QNMUE2zJ}7VAWkG_lq$7bx}#y#oU{dJaYZNgsy{w`
zrxYf^Ikz9Be{X8CF;Qa?OKJL$xk<;uED-WYMCl;HyEOGN!YIAc_3Mn+vMk$_k2Vit
zm>kHlB%h}|4PdISYPS!Tvs`CG>G-@}tLcB|*LJ`(gpH82dwqq;&*zZuYx1%bbN8(D
zJ-4psZjPwR=V_9;rMKUVZB00t_S2?M0xAq!+FbWjNvx2nc_5`WI$_x>nAlFAdTlFV
zow$Rxm$r01_}-{bn!c8>G>mE)M(78rThGDto*NYD9*I<XBgsrM?L~#OqYo_{bXtXn
zb_$GiMGC6Zs*Ow2m&*>?UNbtgLwN<oyNAj9q^l&<qK3lgLSw|_Gr}l0t25e4t;z}`
z>yQqLAmWg6nA|8g)78?pqoo>Y1|((2F&<~zjV7$NQ_T{Z1$|P3T9Wa<_wu$BzT)*q
zP0BV2!(^)-m-)(8Fi=wZm>sb;hZ%M0mvbMSkylk2poK~DXh|8q@I@(EQu9pyNR67M
znixH6UwBD-wGvja{{My+);(vz>vD%y69ormx!r@V;`L%4)OA+-s2hnmYK&a+xQe_m
zC~GecsMhezro_@>8Mn|>Zb(JtJb_RnG^tI-=}JQ*@3a?mRO5tG+Wz?8r%Ygd;_MFE
zFzLd>5mk$d*3&Qn-9;gWm}QX7Gq-j_8l1&Z5)?%l1!b5e`@A3Quu+Hr1xuoQ^&Qe|
z$TK^%&dJj=$$hgW6qb&yr|t}n7YQ!NRh^sKq08&f?WT83XpV$rHC96xKP^&Bqr==Q
zDPN&)si5Kna6JmNtg0Y8t}UO;GLq^xdE<}QAgGH`@_=RA7p7oit0i1-KGjx*y*C_H
z6&|%nJVJdj987g=#fYlX_uV{mBQK-1DwK&}KsOA7Is)b(Kd+jxX5M1Rp2`Y6;Z)n4
zl(Cw%9NTHXwr$xuSm$qSVV!!*QLr`*D8LrO9@18Kq9~*Gi>M6oktdjMhE;_YrL0oO
zQ*{(VfJ0olyqqdcvBV3}l!M4Jf~RCWw&2sI^_0A@Ly6f@!Nr=D)sD!P67g4}R&~4T
zuAM}Iac?&1qud=%#|l^F(jIYAgozJd1k?Kw3fl@6tji=U=W{nm^&%X1B&_Iiza=S#
z%;tmTm?BWM&@51?{q|OVv)41mt$!ujWc=QHcHxZStbYJhM)*xb^!^bpKXMaPxB<~U
z%K0#zgB9@!(huwNypv+Fs7u7HmcpcRFsT;c$ZQ>!gn3RT9g~ndYF5@!Kb7m8@MZnh
z2|s+#4jtEMNc*Q$x{uX;B6C2PXEoh`;Q4`{rv*MSR1eTC`(!>p4(EqN434Uo94-Ng
z)rBzj(PD?%U9ou1Db*$>@duajM$K{7RZji_th?^pAsw?m+Q*maP{FEd*3{-zT)DoR
z7MMGfet@j(qSDPn2*OcO?#Xs<Sn8!Bo;piL=y1hHGY89&?WDoXYVD>&w%TsQS@gjt
zp3^e?hwj@=aGPQTNM1P53pu%~uH8+8EKO3tLQmpXMeTt*puxFd#w}LhmR@Ahj;+7x
z4IbcfR?Cdv4_eOn=zTk7u0E;^YLSN{Es!cmIYaUQ$y<UnM`scwx_oM@jHOE6>_v;_
ztNyMImtM}=+YhdIMcZgScHa(*Ml@JCm<=W>8>0aZ%L*)?A&zhyqRaBq03(*f^X3Sx
zG%cXfFVLAKY%}<wtuGjX%5uA51xFpwo{GaO0v-OMiud)6RySx%!f*W64xvRmrU%n0
zOx0J}*rzoC^d7EXRp320f<xwIofaUBd>#UGLOWr+s-hNBn7GwSWwTo3G<zMarY&ls
z%^FpwDKe}D7i3dC`cf&rreS^h9g^-i5ZevbK^A>l=Iafo;`_MR(_g(tJP3zLT|cRF
zSSu(apQITV-Iwd3-l$O;)>SYsvgD9}2}cbnd7P*B#dLmBTAJFxVc<e<-|$a_T1--V
z!i(w}as5pM?_U7vGaBCb`Ifl$nLD@EF-CNYS$C@V&`sxV=+?yO_^hJb%~RKf?3>^y
zi#^nhZnY<P*d<hLOTO6QD_6ePQP$E%(I>f!1Q^{_3f)mQt)%?CM)BYtP>3y2psE-V
zs}J@1O-|SDmAKZTCM7LR^Qed+JjIwA<q;w(kJ5dreH`9+An8R_>O3K*+)J^~J)*Fd
z9M0yyMg6{ZKi1L~b~@ND-xD42IgRK;t_6m;bGvAQoIB~^o2n_|JWFu(=on)!#<d4^
zP7+~?juCGu`C?xA>};-fs8;TKQ57OBR|1*ZHInv%+{t*Z1Rk@i?Ryr&=+14KoU+sO
z0D6pq<Q5eyF~=l<)lEp!G|}_R&`smI;4^-CxODZ+o)n3-Tr$xFf%IrC!te^LZEj*=
zuak?@6N2;{1L@D7aT*eTv2P(cZ4GI1K(aJ~Xsj47iHj&J;9PlxZXVNQ47oh7@{%M8
zvrM0VHdXo}U-a)_81DrJ)BV6<FunU11k+1jXo-jPANbxCaJ_9QZUk5ZYO4Cvp_a?s
zJWtR~rL@Bq$8a`KTMlXqe}MBLoiEQjL~V*4;906tXK_i5z;!oh>jGVsee?(Toyt|x
zSz67I#xys)@E5~05B$UthPUs&@+jWkw46MJFiVTB{hl6w=iss$%7hDPtV-ti1dfo*
z<NHzU)sQUq+A7Ynw0vYPk@j18W1Oj)l`$eX$;vbHHdX1C+F-Dq3wZm@Pq&CZwF@G?
z3m8?<XrK_qD0%4Op|V6Z^sqFCoD`)IrdDBHQCjtbhPk!u&$A9&&GZE~@w|9wF-OxK
z_dIjd8;7f6%R+6(h)z7+(%fIX2!1EG_B)t|O*9*+!9eKI{tpa<VKB_%#u1WG0d<yc
z0Y>J9nbLf3#d(_S!-uL(<l99ZzNMbKc*|;5oZ4H{mxuX`=|w+ot#tOP9qNVQa5fkM
z)Oe!0r96wX7}aT%lCx1==ap0=WPXJ<D$M3^%y>DHBu~}I9yZ^q?I>JA-AY4EEPJ7C
z$ie1Hg7)pm6Bq7WV?^(GszvnPt9I}^T0ZWLCMf1~pQNw6qAN%J0|kjFg6bkv4j6t%
zb(}BB%)DZuo)@6*6|xlF@p-;ynfJ5QqBu)^tXh%<7mDu&(GZCKQ~&mlzSzQqZA%*s
z`m^zfwpx`4Mj`{PGI8AlAq*4XB6Jx|jk%i_qv7@U1o@SEUISf}npLYCAvs<I9oT04
zB#mOEZeQSA0#w<6(+g581G@T2r^!RleaTUIXmbe!WqCY6B{b6dSB}U@s{q3_$U<pR
zWn`)xU|^~M;L>V-sK>cxXH}1G_+cA-%QdU~WECx+D1tDxskMe1q_N98uQ|#tn`ZId
z*_4_htqVoaAj8Nn>o6?x5-G?BJtT9?3Nij<EicK#^NO4K+B98Dy|KhXCEwUY)*+sA
z2m=?TO2u1QqSGLrZ|w5ax7yHo-!(gsL66z(U^W@4$~5#z_lZ1=GNKT6KpRJK6x1;W
z)TJ91AiDXX-k7kYUYaI{v-tK3iVZ<mPI-s9+A$BLV0p-_@Egg}AlfsCzP^Cy6FVUq
z&qiz_z23MU0-AFuuc&%TXvh^NxpW6EKPsf7bZ$ze9lf8^Az@hS3)HwtrB0HhrSGdy
zE2&DG)QEh8=)na<m;avwL~jNnu{)wVf|QC%2TqqJp`Vb;B}Elfm6y7hUL@f0@VvR4
zj?&-MJq8vxq`3Z$I_GF<uESAJe^}VMy6aG6;orpJ=QS)Lx+^)^AbR;uh*rFAJp{V-
zG0o=C?RbOdaLI2hrKC!gTl6jxHLYrCktWrAzdLb+5?PqP_2hch>kfg)5W_7%6mLXy
zal{a0a5*0Wc$QE%2ai*GRlr%KK4d#ouKr<@?oEI*W7CY2A9X?tIz-r(sudPULzfhm
z9)OAlP(uvYE|}=DAM9YF?npgUXr!yt+V@}<Ory&v=G`e&b(NdcN$i(s89Z1EQ!{<@
zJ#`K0;VkMQ62b@#_^?=Qu(%$e+CgwxAl2HirJ#`np_e=^H;C@Jwndcegb4jN-L^S7
z80$4E6aq;K{S?_WMmP~9pY~NCU_t6<h<iFGTG~ElphDcosy*=p7eTxn;iv>`D-e9k
zqik;R)HXV45arjlPS^jzQBJq1xtq+UlQAt0=$%ZMBh==|VU9A$>ZH#6h@s+rk_G#k
zsEEI$y3@$qd?YQ!2Z|Y(_5_YahuO{Uu=3wg_en1{5RIxZ1)$GrK=%XC_0Mbqw0WJ6
zo*6McK&dcQn2J1#)YT_RmPB!xqnHZ|%>D|zr#txrw5!XjPgdI<s7G?CT*~{vmMyKN
z2cfLIOqpO@?L$9=tH~YsZOHd}tKgyM&(OstxcG7&!n^7K@8crZron)s*jF_gS2fDr
zC}h+<0{k#1x_EVq2bbRJ`W$KIxcU)FmPTQARM;|Ky6OJQjjLVt<PNU3QjnGi67<(u
z5PBRmb(&NpdBaA;3rSJ|*q&`}Cw2ICX2&rb^*lY&&yq1^sk+O9pUgrx-16VzNt&d;
zy;_jo8J%o)*xvXfM<ilvSL@OTt*G?61{4CvuhIyS53&pLsw}+<AW_F<DU)Aa9xm3D
z&2vW7GPA?|oa(@(dHj>K^j6qLUu)V1FKj7p@YGQhcae=y8KpkYEMcQa!ShqvMWOdT
z5^e&3`<mf&t3X#wOvl9?uSrr8V!mz~BX%-Yh%B|{yntw_5t{6`h~ECx4y3)>tkRVy
z=E`Ft1X4|j*^&fpYyx`7qkZ*w5q@@&qZ?2)aj+bvQSt3I`WZ8hx<}pS+(fI0t2SC{
zRxbr2BPm>oR{VOogX1kr3MG*?GeH`K2_$=Z;1+tkGG@}QlQPM5^~0H`UBr-%ewL4<
z-07%Zf}aJ^hz+7?IWF+KMp8KQk~R_j{f~EG2a>nD5bcM`?PA*QN(fd2(?mpYbbnDx
zctM(#`_6?geQN+$c1jSTo{qNxxYAQk-BR+1e!2zUrn8z^|M{0&x9dE;Lm;gxS$W7j
z9l1dcvlPHVk45t^CxTj!k|@V$B05|_qnA9@nz5zpALekgRxX$pY_s3H5K61ma32tv
zO^rA0f@oE3q`SP}by<mvk0`UE?JH<(Ue+~qU?e0&+P+>LRI;I({V<8EnZJI93WpUX
zD`_o`nfVr@cKW<>=Wmyry&tu+{}22G!L&V#uj;~{2k!$d#7PP@p~U#rk0=}I^I;sP
zQF`cX*e=}RwV(?(xSsQ*Y$is3hEdCP+v8f97{y)*Du`Svdq|+;Y(!^#z^V!myyqt8
z8_$!nbwi=YS`L3u8KIu5$UE|SC?>_~Q|?wLM*SC^Y=(LP24dHUw`}Kt%R@c57$S&P
z?$XK&xM~R93T~K}d6g$oIY+b|>X{a0BnlmFjn)kDgnV{et`7D7@yeDWz6(6pZp3qu
zW7mSR5vjs34Ft!M?xsN@0cqF;6(UVh8NTl{#ht^q)NqArn|Z#z5mY}Wg>S1-ciyw@
zxK!d!mjBkOeE(RWN+S9j@458hlh=xSZu+^^84yZi0(I~EK6c4xG^oGa3Dj|s8va$I
z_%u+&8&Mo{dBONsdD#x8I8Ks+@vqw_u5CblWG7I^NL4VP-u1x8E_p<Q`kTv+a<}6z
zEEr7n11%9e?|Cqk+<^#=MNe*s;DO6`u*`Ak!ynZkzWDryPbM2d{CxD@#>Ia6*Jo_w
zVt>1L6c-%#_S>NN+TIR~c5GVi&uSF!cOO1^z1+daj!BzssQ5Z>hk!put+k1hAO1;8
z#ed-K5GTi`yEZ7kv<r%3(@~o^x%3Le$@W})Y&z(NG!FNL51+hldmP85V}4M>cpqR~
zv#nDdlP=i=#yf%?63DUXjDH3+EnOQfW&i*miwFb&00000{{{d;LjnLB00RI300000
G0001Z$Dr&0

literal 0
HcmV?d00001

diff --git a/src/dupradar/test_data/script.sh b/src/dupradar/test_data/script.sh
new file mode 100644
index 00000000..2b1c1aa5
--- /dev/null
+++ b/src/dupradar/test_data/script.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+wget https://github.com/ssayols/dupRadar/raw/master/inst/extdata/genes.gtf
+wget https://github.com/ssayols/dupRadar/raw/master/inst/extdata/wgEncodeCaltechRnaSeqGm12878R1x75dAlignsRep2V2.bam 
+
+# subsample the bam file
+samtools view -s 0.02 -b wgEncodeCaltechRnaSeqGm12878R1x75dAlignsRep2V2.bam > sample.bam
+
+rm wgEncodeCaltechRnaSeqGm12878R1x75dAlignsRep2V2.bam
\ No newline at end of file

From d6e5a45043128afcc3a1be88a1a76584a9248022 Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Fri, 5 Jul 2024 13:22:53 +0100
Subject: [PATCH 04/11] config formatting

---
 src/dupradar/config.vsh.yaml | 94 ++++++++++++++----------------------
 1 file changed, 35 insertions(+), 59 deletions(-)

diff --git a/src/dupradar/config.vsh.yaml b/src/dupradar/config.vsh.yaml
index cae6964f..cb220f47 100644
--- a/src/dupradar/config.vsh.yaml
+++ b/src/dupradar/config.vsh.yaml
@@ -10,105 +10,81 @@ references:
 license: GPL-3.0
 
 argument_groups:
-- name: "Input"
+- name: Input
   arguments: 
-  - name: "--id"
+  - name: --id
     type: string
     description: Sample ID
-    
-  - name: "--input"
+  - name: --input
     type: file 
     required: true
-    description: path to input alignment file in BAM format
-
-  - name: "--gtf_annotation"
+    description: Path to input alignment file in BAM format
+  - name: --gtf_annotation
     type: file 
     required: true
-    description: path to GTF annotation file.
-
-  - name: "--paired"
+    description: Path to GTF annotation file.
+  - name: --paired
     type: boolean
-    description: add flag if input alignment file consists of paired reads
-
-  - name: "--strandedness"
+    description: Add flag if input alignment file consists of paired reads
+  - name: --strandedness
     type: string
-    required: false
-    choices: ["forward", "reverse", "unstranded"]
-    description: strandedness of input bam file reads (forward, reverse or unstranded (default, applicable to paired reads))
-
-  - name: "--versions"
-    type: file
-    must_exist: false
+    choices: [forward, reverse, unstranded]
+    description: |
+      Strandedness of input bam file reads (forward, reverse or unstranded (default, applicable to paired reads))
     
-- name: "Output"
+- name: Output
   arguments: 
-  - name: "--output_dupmatrix"
+  - name: --output_dupmatrix
     type: file
     direction: output
-    required: false
-    must_exist: true
-    default: $id.dup_matrix.txt
     description: path to output file (txt) of duplicate tag counts
-  
-  - name: "--output_dup_intercept_mqc"
+    example: $id.dup_matrix.txt
+  - name: --output_dup_intercept_mqc
     type: file
     direction: output
-    required: false
-    must_exist: true
-    default: $id.dup_intercept_mqc.txt
     description: path to output file (txt) of multiqc intercept value DupRadar
-
-  - name: "--output_duprate_exp_boxplot"
+    example: $id.dup_intercept_mqc.txt
+  - name: --output_duprate_exp_boxplot
     type: file
     direction: output
     required: false
     must_exist: true
     default: $id.duprate_exp_boxplot.pdf
-    description: path to output file (pdf) of distribution of expression box plot
-  
-  - name: "--output_duprate_exp_densplot"
+    description: |
+      Path to output file (pdf) of distribution of expression box plot
+  - name: --output_duprate_exp_densplot
     type: file
     direction: output
-    required: false
-    must_exist: true
-    default: $id.duprate_exp_densityplot.pdf
-    description: path to output file (pdf) of 2D density scatter plot of duplicate tag counts
-  
-  - name: "--output_duprate_exp_denscurve_mqc"
+    description: |
+      Path to output file (pdf) of 2D density scatter plot of duplicate tag counts
+    example: $id.duprate_exp_densityplot.pdf
+  - name: --output_duprate_exp_denscurve_mqc
     type: file
     direction: output
-    required: false
-    must_exist: true
-    default: $id.duprate_exp_density_curve_mqc.txt
-    description: path to output file (pdf) of density curve of gene duplication multiqc
-  
-  - name: "--output_expression_histogram"
+    description: |
+      Path to output file (pdf) of density curve of gene duplication multiqc
+    example: $id.duprate_exp_density_curve_mqc.txt
+  - name: --output_expression_histogram
     type: file
     direction: output
-    required: false
-    must_exist: true
-    default: $id.expression_hist.pdf
-    description: path to output file (pdf) of distribution of RPK values per gene histogram
-  
-  - name: "--output_intercept_slope"
+    description: |
+      Path to output file (pdf) of distribution of RPK values per gene histogram
+    example: $id.expression_hist.pdf
+  - name: --output_intercept_slope
     type: file
     direction: output
-    required: false
-    must_exist: true
-    default: $id.intercept_slope.txt
     description: output file (txt) with progression of duplication rate values
+    example: $id.intercept_slope.txt
 
 resources:
   - type: bash_script
     path: script.sh
   # Copied from https://github.com/nf-core/rnaseq/blob/3.12.0/bin/dupradar.r
-  - type: R_script
+  - type: r_script
     path: script.R
-
 test_resources:
   - type: bash_script
     path: test.sh
-  
 engines:
   - type: docker
     image: quay.io/biocontainers/bioconductor-dupradar:1.32.0--r43hdfd78af_0

From 0a798864756f48f577b4810e1c02afcef7d7daaa Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Fri, 5 Jul 2024 14:58:03 +0100
Subject: [PATCH 05/11] scripts and tests

---
 src/dupradar/script.R  | 0
 src/dupradar/script.sh | 7 ++++++-
 src/dupradar/test.sh   | 0
 3 files changed, 6 insertions(+), 1 deletion(-)
 mode change 100644 => 100755 src/dupradar/script.R
 mode change 100644 => 100755 src/dupradar/script.sh
 mode change 100644 => 100755 src/dupradar/test.sh

diff --git a/src/dupradar/script.R b/src/dupradar/script.R
old mode 100644
new mode 100755
diff --git a/src/dupradar/script.sh b/src/dupradar/script.sh
old mode 100644
new mode 100755
index 85d51167..a590401a
--- a/src/dupradar/script.sh
+++ b/src/dupradar/script.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+## VIASH START
+## VIASH END
+
 
 set -eo pipefail 
 
@@ -29,4 +32,6 @@ mv "$par_id"_intercept_slope.txt $par_output_intercept_slope
 
 dupradar_ver=$(Rscript -e "library(dupRadar); cat(as.character(packageVersion('dupRadar')))")
 text="bioconductor-dupradar: ${dupradar_ver}"
-echo "$text"
\ No newline at end of file
+echo "$text"
+
+exit 0
diff --git a/src/dupradar/test.sh b/src/dupradar/test.sh
old mode 100644
new mode 100755

From aeb8419f4e17ca9c07d6fb02cb78181ffd68492f Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Wed, 10 Jul 2024 23:39:30 +0200
Subject: [PATCH 06/11] expected output (not correct) and update to R script
 package installation

---
 src/dupradar/config.vsh.yaml                  | 18 ++++-----
 src/dupradar/script.R                         | 10 +++--
 src/dupradar/script.sh                        |  3 +-
 src/dupradar/test.sh                          | 24 +++++++++++-
 src/dupradar/test_data/test_dupMatrix.txt     |  7 ++++
 .../test_data/test_dup_intercept_mqc.txt      | 39 +++++++++++++++++++
 .../test_duprateExpDensCurve_mqc.txt          | 33 ++++++++++++++++
 .../test_data/test_intercept_slope.txt        |  2 +
 8 files changed, 120 insertions(+), 16 deletions(-)
 create mode 100644 src/dupradar/test_data/test_dupMatrix.txt
 create mode 100644 src/dupradar/test_data/test_dup_intercept_mqc.txt
 create mode 100644 src/dupradar/test_data/test_duprateExpDensCurve_mqc.txt
 create mode 100644 src/dupradar/test_data/test_intercept_slope.txt

diff --git a/src/dupradar/config.vsh.yaml b/src/dupradar/config.vsh.yaml
index cb220f47..9727fdec 100644
--- a/src/dupradar/config.vsh.yaml
+++ b/src/dupradar/config.vsh.yaml
@@ -38,18 +38,16 @@ argument_groups:
     type: file
     direction: output
     description: path to output file (txt) of duplicate tag counts
-    example: $id.dup_matrix.txt
+    example: dup_matrix.txt
   - name: --output_dup_intercept_mqc
     type: file
     direction: output
     description: path to output file (txt) of multiqc intercept value DupRadar
-    example: $id.dup_intercept_mqc.txt
+    example: dup_intercept_mqc.txt
   - name: --output_duprate_exp_boxplot
     type: file
     direction: output
-    required: false
-    must_exist: true
-    default: $id.duprate_exp_boxplot.pdf
+    default: duprate_exp_boxplot.pdf
     description: |
       Path to output file (pdf) of distribution of expression box plot
   - name: --output_duprate_exp_densplot
@@ -57,24 +55,24 @@ argument_groups:
     direction: output
     description: |
       Path to output file (pdf) of 2D density scatter plot of duplicate tag counts
-    example: $id.duprate_exp_densityplot.pdf
+    example: duprate_exp_densityplot.pdf
   - name: --output_duprate_exp_denscurve_mqc
     type: file
     direction: output
     description: |
       Path to output file (pdf) of density curve of gene duplication multiqc
-    example: $id.duprate_exp_density_curve_mqc.txt
+    example: duprate_exp_density_curve_mqc.txt
   - name: --output_expression_histogram
     type: file
     direction: output
     description: |
       Path to output file (pdf) of distribution of RPK values per gene histogram
-    example: $id.expression_hist.pdf
+    example: expression_hist.pdf
   - name: --output_intercept_slope
     type: file
     direction: output
     description: output file (txt) with progression of duplication rate values
-    example: $id.intercept_slope.txt
+    example: intercept_slope.txt
 
 resources:
   - type: bash_script
@@ -85,6 +83,8 @@ resources:
 test_resources:
   - type: bash_script
     path: test.sh
+  - type: file
+    path: test_data
 engines:
   - type: docker
     image: quay.io/biocontainers/bioconductor-dupradar:1.32.0--r43hdfd78af_0
diff --git a/src/dupradar/script.R b/src/dupradar/script.R
index 82218c6e..24ebce2d 100755
--- a/src/dupradar/script.R
+++ b/src/dupradar/script.R
@@ -35,10 +35,12 @@ message("R package loc. (Arg 7): ", ifelse(length(args) > 4, args[5], "Not speci
 
 # Load / install packages
 if (length(args) > 5) { .libPaths( c( args[6], .libPaths() ) ) }
-if (!require("dupRadar")){
-    source("http://bioconductor.org/biocLite.R")
-    biocLite("dupRadar", suppressUpdates=TRUE)
-    library("dupRadar")
+if (!require("dupRadar")) {
+  if (!requireNamespace("BiocManager", quietly = TRUE)) {
+    install.packages("BiocManager")
+  }
+  BiocManager::install("dupRadar", update = TRUE, ask=FALSE)
+  library("dupRadar")
 }
 if (!require("parallel")) {
     install.packages("parallel", dependencies=TRUE, repos='http://cloud.r-project.org/')
diff --git a/src/dupradar/script.sh b/src/dupradar/script.sh
index a590401a..c84440a7 100755
--- a/src/dupradar/script.sh
+++ b/src/dupradar/script.sh
@@ -19,7 +19,8 @@ Rscript "$meta_resources_dir/script.R" \
     $par_id \
     $par_gtf_annotation \
     $(num_strandness) \
-    $par_paired
+    $par_paired \
+    ${meta_cpus:-1}
 
 mv "$par_id"_dupMatrix.txt $par_output_dupmatrix
 mv "$par_id"_dup_intercept_mqc.txt $par_output_dup_intercept_mqc
diff --git a/src/dupradar/test.sh b/src/dupradar/test.sh
index 01072e01..8a85af27 100755
--- a/src/dupradar/test.sh
+++ b/src/dupradar/test.sh
@@ -1,8 +1,8 @@
 #!/bin/bash
 
 # define input and output for script
-input_bam="$meta_resources_dir/sample.bam"
-input_gtf="$meta_resources_dir/genes.gtf"
+input_bam="${meta_resources_dir}/test_data/sample.bam"
+input_gtf="${meta_resources_dir}/test_data/genes.gtf"
 
 output_dupmatrix="dup_matrix.txt"
 output_dup_intercept_mqc="dup_intercept_mqc.txt"
@@ -48,4 +48,24 @@ echo ">> asserting output has been created for paired read input"
 [ ! -f "$output_intercept_slope" ] && echo "$output_intercept_slope was not created" && exit 1
 [ ! -s "$output_intercept_slope" ] && echo "$output_intercept_slope is empty" && exit 1
 
+
+echo ">> Check if output is empty"
+[ ! -s "$output_dupmatrix" ] \
+    && echo "Output file $output_dupmatrix is empty" && exit 1
+[ ! -s "$output_dup_intercept_mqc" ] \
+    && echo "Output file $output_dup_intercept_mqc is empty" && exit 1
+[ ! -s "$output_intercept_slope" ] \
+    && echo "Output file $output_intercept_slope is empty" && exit 1
+
+echo ">> Check if output is correct"
+cat "$output_dupmatrix"
+cat "$output_dup_intercept_mqc"
+cat "$output_intercept_slope"
+# diff ignoring white spaces
+diff -B -b "test_dupMatrix.pdf" "${meta_resources_dir}/test_data/test_dupMatrix.txt" || 
+    (echo "Output file $output_dupmatrix is not correct" && exit 1)
+diff -B -b "$output_dup_intercept_mqc" "${meta_resources_dir}/test_data/test_duprateExpDensCurve_mqc.txt" || \
+    (echo "Output file $output_dup_intercept_mqc is not correct" && exit 1)
+diff -B -b "$output_intercept_slope" "${meta_resources_dir}/test_data/test_intercept_slope.txt" || \
+    (echo "Output file $output_intercept_slope is not correct" && exit 1)
 exit 0
\ No newline at end of file
diff --git a/src/dupradar/test_data/test_dupMatrix.txt b/src/dupradar/test_data/test_dupMatrix.txt
new file mode 100644
index 00000000..8ccda72e
--- /dev/null
+++ b/src/dupradar/test_data/test_dupMatrix.txt
@@ -0,0 +1,7 @@
+ID	geneLength	allCountsMulti	filteredCountsMulti	dupRateMulti	dupsPerIdMulti	RPKMulti	PKMMulti	allCounts	filteredCounts	dupRate	dupsPerId	RPK	RPKM
+WASH7P	1769	41	41	0	0	23.1769361221029	188430.374976446	1	1	0	0	0.565291124929339	4595.86280430357
+FAM138A	2260	0	0	NA	0	0	0	0	0	NA	0	0	0
+FAM138F	2260	0	0	NA	0	0	0	0	0	NA	0	0	0
+OR4F5	918	0	0	NA	0	0	0	0	0	NA	0	0	0
+LOC729737	5474	18	18	0	0	3.28827183047132	26733.917320905	3	3	0	0	0.548045305078553	4455.65288681751
+LOC100132287	8740	39	39	0	0	4.46224256292906	36278.3948205615	1	1	0	0	0.11441647597254	930.215251809269
diff --git a/src/dupradar/test_data/test_dup_intercept_mqc.txt b/src/dupradar/test_data/test_dup_intercept_mqc.txt
new file mode 100644
index 00000000..0d0b4e1a
--- /dev/null
+++ b/src/dupradar/test_data/test_dup_intercept_mqc.txt
@@ -0,0 +1,39 @@
+#id: DupInt
+#plot_type: 'generalstats'
+#pconfig:
+#    dupRadar_intercept:
+#        title: 'dupInt'
+#        namespace: 'DupRadar'
+#        description: 'Intercept value from DupRadar'
+#        max: 100
+#        min: 0
+#        scale: 'RdYlGn-rev'
+#        format: '{:.2f}%'
+Sample dupRadar_intercept
+test 5.8262146393079e-11
+#id: DupInt
+#plot_type: 'generalstats'
+#pconfig:
+#    dupRadar_intercept:
+#        title: 'dupInt'
+#        namespace: 'DupRadar'
+#        description: 'Intercept value from DupRadar'
+#        max: 100
+#        min: 0
+#        scale: 'RdYlGn-rev'
+#        format: '{:.2f}%'
+Sample dupRadar_intercept
+test 5.8262146393079e-11
+#id: DupInt
+#plot_type: 'generalstats'
+#pconfig:
+#    dupRadar_intercept:
+#        title: 'dupInt'
+#        namespace: 'DupRadar'
+#        description: 'Intercept value from DupRadar'
+#        max: 100
+#        min: 0
+#        scale: 'RdYlGn-rev'
+#        format: '{:.2f}%'
+Sample dupRadar_intercept
+test 5.8262146393079e-11
diff --git a/src/dupradar/test_data/test_duprateExpDensCurve_mqc.txt b/src/dupradar/test_data/test_duprateExpDensCurve_mqc.txt
new file mode 100644
index 00000000..ca9f199e
--- /dev/null
+++ b/src/dupradar/test_data/test_duprateExpDensCurve_mqc.txt
@@ -0,0 +1,33 @@
+#id: dupradar
+#section_name: 'DupRadar'
+#section_href: 'bioconductor.org/packages/release/bioc/html/dupRadar.html'
+#description: "provides duplication rate quality control for RNA-Seq datasets. Highly expressed genes can be expected to have a lot of duplicate reads, but high numbers of duplicates at low read counts can indicate low library complexity with technical duplication.
+#    This plot shows the general linear models - a summary of the gene duplication distributions. "
+#pconfig:
+#    title: 'DupRadar General Linear Model'
+#    xLog: True
+#    xlab: 'expression (reads/kbp)'
+#    ylab: '% duplicate reads'
+#    ymax: 100
+#    ymin: 0
+#    tt_label: '<b>{point.x:.1f} reads/kbp</b>: {point.y:,.2f}% duplicates'
+#    xPlotLines:
+#        - color: 'green'
+#          dashStyle: 'LongDash'
+#          label:
+#                style: {color: 'green'}
+#                text: '0.5 RPKM'
+#                verticalAlign: 'bottom'
+#                y: -65
+#          value: 0.5
+#          width: 1
+#        - color: 'red'
+#          dashStyle: 'LongDash'
+#          label:
+#                style: {color: 'red'}
+#                text: '1 read/bp'
+#                verticalAlign: 'bottom'
+#                y: -65
+#          value: 1000
+#          width: 1
+0.11441647597254 5.82621463896864e-09
diff --git a/src/dupradar/test_data/test_intercept_slope.txt b/src/dupradar/test_data/test_intercept_slope.txt
new file mode 100644
index 00000000..5294b8ba
--- /dev/null
+++ b/src/dupradar/test_data/test_intercept_slope.txt
@@ -0,0 +1,2 @@
+test - dupRadar Int (duprate at low read counts): 5.8262146393079e-11 
+test - dupRadar Sl (progression of the duplication rate): 0.999999999999966

From a893a37f70a665bfd238fa2833eb9bfdc1acc1c5 Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Mon, 23 Sep 2024 11:15:35 +0200
Subject: [PATCH 07/11] Version info, functional test

---
 src/dupradar/config.vsh.yaml                  |  15 +-
 src/dupradar/{script.R => dupradar.r}         |  23 +-
 src/dupradar/help.txt                         | 583 ++++++++++++++++++
 src/dupradar/script.sh                        |  13 +-
 src/dupradar/test.sh                          |  13 +-
 src/dupradar/test_data/test_dupMatrix.txt     |   2 +-
 .../test_data/test_dup_intercept_mqc.txt      |  39 --
 7 files changed, 628 insertions(+), 60 deletions(-)
 rename src/dupradar/{script.R => dupradar.r} (92%)
 delete mode 100644 src/dupradar/test_data/test_dup_intercept_mqc.txt

diff --git a/src/dupradar/config.vsh.yaml b/src/dupradar/config.vsh.yaml
index 9727fdec..d25547db 100644
--- a/src/dupradar/config.vsh.yaml
+++ b/src/dupradar/config.vsh.yaml
@@ -79,15 +79,26 @@ resources:
     path: script.sh
   # Copied from https://github.com/nf-core/rnaseq/blob/3.12.0/bin/dupradar.r
   - type: r_script
-    path: script.R
+    path: dupradar.r
 test_resources:
   - type: bash_script
     path: test.sh
   - type: file
     path: test_data
+    
 engines:
   - type: docker
-    image: quay.io/biocontainers/bioconductor-dupradar:1.32.0--r43hdfd78af_0
+    image: ubuntu:22.04
+    setup:   
+      - type: apt
+        packages: [ r-base ]
+      - type: r
+        bioc: [ dupRadar ]
+        packages: [ parallel ]
+      - type: docker
+        run: |
+          echo "DupRadar: $(Rscript -e "library(dupRadar); cat(as.character(packageVersion('dupRadar')))")" > /var/software_versions.txt
+        
 runners:
   - type: executable  
   - type: nextflow
diff --git a/src/dupradar/script.R b/src/dupradar/dupradar.r
similarity index 92%
rename from src/dupradar/script.R
rename to src/dupradar/dupradar.r
index 24ebce2d..4294a42e 100755
--- a/src/dupradar/script.R
+++ b/src/dupradar/dupradar.r
@@ -47,17 +47,29 @@ if (!require("parallel")) {
     library("parallel")
 }
 
+
 # Duplicate stats
 dm <- analyzeDuprates(input_bam, annotation_gtf, stranded, paired_end, threads)
+print(input_bam)
+print(output_prefix)
+print(annotation_gtf)
+print(stranded)
+print(paired_end)
+print(threads)
+print("analyzeDuprates done")
 write.table(dm, file=paste(output_prefix, "_dupMatrix.txt", sep=""), quote=F, row.name=F, sep="\t")
+print("write.table done")
 
 # 2D density scatter plot
 pdf(paste0(output_prefix, "_duprateExpDens.pdf"))
+print("pdf done")
 duprateExpDensPlot(DupMat=dm)
 title("Density scatter plot")
 mtext(output_prefix, side=3)
 dev.off()
+print("duprateExpDensPlot done")
 fit <- duprateExpFit(DupMat=dm)
+print("duprateExpFit done")
 cat(
     paste("- dupRadar Int (duprate at low read counts):", fit$intercept),
     paste("- dupRadar Sl (progression of the duplication rate):", fit$slope),
@@ -82,6 +94,7 @@ Sample dupRadar_intercept"
 
 write(line,file=paste0(output_prefix, "_dup_intercept_mqc.txt"),append=TRUE)
 write(paste(sample_name, fit$intercept),file=paste0(output_prefix, "_dup_intercept_mqc.txt"),append=TRUE)
+print("write dup_intercept_mqc done")
 
 # Get numbers from dupRadar GLM
 curve_x <- sort(log10(dm$RPK))
@@ -135,22 +148,24 @@ write.table(
     file=paste0(output_prefix, "_duprateExpDensCurve_mqc.txt"),
     quote=FALSE, row.names=FALSE, col.names=FALSE, append=TRUE,
 )
-
+print("write duprateExpDensCurve_mqc done")
 # Distribution of expression box plot
 pdf(paste0(output_prefix, "_duprateExpBoxplot.pdf"))
 duprateExpBoxplot(DupMat=dm)
 title("Percent Duplication by Expression")
 mtext(output_prefix, side=3)
 dev.off()
-
+print("duprateExpBoxplot done")
 # Distribution of RPK values per gene
 pdf(paste0(output_prefix, "_expressionHist.pdf"))
 expressionHist(DupMat=dm)
 title("Distribution of RPK values per gene")
 mtext(output_prefix, side=3)
 dev.off()
+print("expressionHist done")
 
 # Print sessioninfo to standard out
 print(output_prefix)
-citation("dupRadar")
-sessionInfo()
+# citation("dupRadar")
+# print("citation done")
+# sessionInfo()
diff --git a/src/dupradar/help.txt b/src/dupradar/help.txt
index e69de29b..819c352d 100644
--- a/src/dupradar/help.txt
+++ b/src/dupradar/help.txt
@@ -0,0 +1,583 @@
+---
+title: "Using the dupRadar package"
+author: "Sergi Sayols, Holger Klein"
+date: "June 23, 2015"
+output:
+    BiocStyle::html_document:
+    toc: true
+---
+<!--
+% \VignetteEngine{knitr::rmarkdown}
+% \VignetteIndexEntry{Using dupRadar}
+-->
+
+```{r pre,echo=FALSE,results='hide'}
+library(knitr)
+opts_chunk$set(warning=FALSE,message=FALSE,cache=TRUE)
+```
+```{r style,echo=FALSE,results='asis'}
+BiocStyle::markdown()
+```
+
+
+
+# Introduction to dupRadar
+
+RNA-Seq experiments are a common strategy nowadays to quantify the gene 
+expression levels in cells. Due to its higher sensitivity compared to 
+arrays and the ability to discover novel features, makes them the choice for 
+most modern experiments. 
+
+In RNA-Seq - as in all other NGS applications - quality control is essential.
+Current NGS workflows usually involve PCR steps at some point, which involves
+the danger of PCR artefacts and over-amplification of reads.
+For common DNA-based assays PCR duplicates are often simply removed before 
+further analysis and their overall fraction or the read multiplicity taken 
+as quality metrics.
+In RNA-Seq however, apart from the technical PCR duplicates, there is a strong
+source for biological read duplicates: for a given gene length and sequencing 
+depth there exists an expression level beyond which it is not possible to add
+place more reads inside a gene locus without placing a second read exactly 
+on the position of an already existing read.
+For this reason the overall duplication rate is not a useful measure for 
+RNA-Seq.
+
+As in the NGS/RNA-Seq QC ecosystem there were no suitable tools to address this
+question, we set out to develop a new tool. The `r Rpackage("dupRadar")` 
+package gives an insight into the duplication problem by graphically 
+relating the gene expression level and the duplication rate present on it. 
+Thus, failed experiments can be easily identified at a glance.
+
+
+**Note: by now RNA-Seq protocols have matured so that for bulk RNA protocols data 
+rarely suffer from technical duplicates. With newer low-input or single cell 
+RNA-Seq protocols technical duplicates possible problems are worth to be 
+checked for by default, especially if protocols are pushed to or beyond 
+their boundaries.
+Paired-end libraries make the distinction between duplicates due to highly 
+expressed genes and PCR duplicates easier, but the problem itself is not 
+completely solved, especially at higher sequencing depths.**
+
+# Getting started using dupRadar
+
+## Preparing your data
+
+Previous to running the duplication rate analysis, the BAM file with your
+mapped reads has to be duplicate marked with a like Picard, or the faster 
+BamUtil dedup BioBamBam. 
+The `r Rpackage("dupRadar")` package only works with duplicate marked BAM files.
+
+If you do not have/want a duplication marking step in your default pipeline, 
+the `r Rpackage("dupRadar")` package includes, for your convenience, 
+wrappers to properly call some of these tools from your R session. Note that
+you still have to supply the path of the dupmarker installation though:
+
+```{r loadLibrary}
+library(dupRadar)
+```
+
+Now, simply call the wrapper:
+
+```{r eval=FALSE}
+# call the duplicate marker and analyze the reads
+bamDuprm <- markDuplicates(dupremover="bamutil",
+			   bam="test.bam",
+	    	   path="/opt/bamUtil-master/bin",
+			   rminput=FALSE)
+```
+
+Simply specify which tool to use, the path where this tool is installed, 
+and the input bam file to be analyzed. After marking duplicates, it's safe 
+to remove to original BAM file in order to save space.
+
+The `r Rpackage("dupRadar")` package currently comes with support for:
+
+- [Picard MarkDuplicates](http://broadinstitute.github.io/picard)
+- [BamUtil](http://genome.sph.umich.edu/wiki/BamUtil)
+
+After the BAM file is marked for duplicates, dupRadar is ready to 
+analyze how the duplication rate is related with the estimated gene expression 
+levels.
+
+## A GTF file
+
+Unless there is any specific reason, dupRadar can use the same GTF file 
+that will be later used to count the reads falling on features.
+
+A valid GTF file can be obtained from UCSC, Ensembl, the iGenomes or other 
+projects.
+
+Note that the resulting duplication rate plots depend on the GTF annotation 
+used. GTF files from the gencode projects result in a less clear picture of 
+duplication rates, as there are many more features and feature types 
+annotated, which overlap heavily as well. In some cases creating the plots 
+only using subsets of gencode annotation files (e.g. just protein 
+coding genes) serve the QC purpose of this tool better.
+
+## AnnotationHub as a source of GTF files
+
+The Bioconductor `r Rpackage("AnnotationHub")` package provides an alternative
+approach to obtain annotation in GTF format from entities such as Ensembl, UCSC,
+ENCODE, and 1000 Genomes projects.
+
+This is partly outlined in the AnnotationHub 'HOWTO' vignette
+section "Ensembl GTF and FASTA files for TxDb gene models and sequence
+queries"; for the Takifugu example, the downloaded GTF file is
+available from the cache.
+
+```{r}
+if(suppressWarnings(require(AnnotationHub))) {
+    ah = AnnotationHub()
+    query(ah, c("ensembl", "80", "Takifugu", "gtf"))    # discovery
+    cache(ah["AH47101"])                                # retrieve / file path
+}
+```
+
+# dupRate demo data
+
+In the package we include two precomputed duplication matrices for two RNASeq
+experiments used as examples of a good and a failed (in terms of high redundancy
+of reads) experiments. The experiments come from the ENCODE project, as a source
+of a wide variety of protocols, library types and sequencing facilities.
+
+Load the example dataset with:
+
+```{r}
+attach(dupRadar_examples)
+```
+
+# The duplication rate analysis
+
+The analysis requires some info about the sequencing run and library 
+preparation protocol:
+
+- The strandess information about the reads: is the sequencing strand 
+specific? if so, are the reads reversely sequenced?
+- Are the reads paired, or single?
+
+Due to its phenomenal performance, internally we use the featureCounts() 
+function from the Bioconductor `r Biocpkg("Rsubread")` package, which also
+supports multiple threads.
+
+```{r eval=FALSE}
+# The call parameters: 
+bamDuprm <- "test_duprm.bam"	# the duplicate marked bam file
+gtf <- "genes.gtf"	# the gene model
+stranded <- 2		# '0' (unstranded), '1' (stranded) and '2' (reversely stranded)
+paired   <- FALSE	# is the library paired end?
+threads  <- 4		# number of threads to be used
+
+# Duplication rate analysis
+dm <- analyzeDuprates(bamDuprm,gtf,stranded,paired,threads)
+```
+
+The duplication matrix contains read counts in different scenarios and RPK and RPKM values for every gene.
+
+|ID     | geneLength| allCountsMulti| filteredCountsMulti| dupRateMulti| dupsPerIdMulti|     RPKMulti|  RPKMMulti| allCounts| filteredCounts|   dupRate| dupsPerId|         RPK|       RPKM|
+|:------|----------:|--------------:|-------------------:|------------:|--------------:|------------:|----------:|---------:|--------------:|---------:|---------:|-----------:|----------:|
+|Xkr4   |       3634|              2|                   2|    0.0000000|              0|    0.5503577|  0.0117159|         2|              2| 0.0000000|         0|   0.5503577|  0.0117159|
+|Rp1    |       9747|              0|                   0|          NaN|              0|    0.0000000|  0.0000000|         0|              0|       NaN|         0|   0.0000000|  0.0000000|
+|Sox17  |       3130|              0|                   0|          NaN|              0|    0.0000000|  0.0000000|         0|              0|       NaN|         0|   0.0000000|  0.0000000|
+|Mrpl15 |       4203|            429|                 261|    0.3916084|            168|  102.0699500|  2.1728436|       419|            258| 0.3842482|       161|  99.6906971|  2.1221946|
+|Lypla1 |       2433|           1106|                 576|    0.4792043|            530|  454.5828196|  9.6770634|      1069|            562| 0.4742750|       507| 439.3752569|  9.3533280|
+|Tcea1  |       2847|           2890|                1046|    0.6380623|           1844| 1015.1036178| 21.6093121|      1822|            696| 0.6180022|      1126| 639.9719002| 13.6235871|
+
+# Plotting and interpretation
+
+The number of reads per base assigned to a gene in an ideal RNA-Seq data set 
+is expected to be proportional to the abundance of its transcripts in the 
+sample. For lowly expressed genes we expect read duplication to happen rarely by
+chance, while for highly expressed genes - depending on the total sequencing
+depth - we expect read duplication to happen often.
+
+A good way to learn if a dataset is following this trend is by relating the 
+normalized number of counts per gene (RPK, as a quantification of the gene 
+expression) and the fraction represented by duplicated reads.
+
+The `r Rpackage("dupRadar")` offers several functions to assess this 
+relationship. The most prominent may be the 2d density scatter plot:
+
+```{r fig.width=14,fig.height=7}
+## make a duprate plot (blue cloud)
+par(mfrow=c(1,2))
+duprateExpDensPlot(DupMat=dm)		# a good looking plot
+title("good experiment")
+duprateExpDensPlot(DupMat=dm.bad)	# a dataset with duplication problems
+title("duplication problems")
+
+## duprate boxplot
+duprateExpBoxplot(DupMat=dm)		# a good looking plot
+duprateExpBoxplot(DupMat=dm.bad)	# a dataset with duplication problems
+```
+
+The *duprateExpDensPlot* has helper lines at the threshold of 1 read/bp and at 
+0.5 RPKM. Moreover by default a generalized linear model is fit to the data and
+overplotted (see also below).
+
+## Fitting a model into the data
+
+To summarize the relationship between duplication rates and gene expression, 
+we propose fitting a logistic regression curve onto the data.  With the 
+coefficients of the fitted model, one can get an idea of the initial 
+duplication rate at low read counts (Intercept) and the progression of the 
+duplication rate along with the progression of the read counts (Slope).
+
+
+```{r fig.width=7,fig.height=7}
+duprateExpDensPlot(DupMat=dm)
+
+# or, just to get the fitted model without plot
+fit <- duprateExpFit(DupMat=dm)
+cat("duprate at low read counts: ",fit$intercept,"\n",
+	"progression of the duplication rate: ",fit$slope,fill=TRUE)
+```
+
+Our main use case for that function is the condensation
+of the plots into quality metrics that can be used for automatic flagging of 
+possibly problematic samples in large experiments or aggregation with other
+quality metrics in large tables to analyse interdependencies.
+
+
+The *duprateExpBoxplot* plot shows the range of the duplication rates at 5% 
+bins (default) along the distribution of RPK gene counts. The x-axis displays 
+the quantile of the RPK distribution, and the average RPK of the genes 
+contained in this quantile.
+
+Individual genes can be identified in the plot:
+
+```{r eval=FALSE}
+## INTERACTIVE: identify single points on screen (name="ID" column of dm)
+duprateExpPlot(DupMat=dm)		# a good looking plot
+duprateExpIdentify(DupMat=dm)
+```
+
+One can also call the function *duprateExpPlot* to get smooth color density 
+representation of the same data:
+
+```{r fig.width=14,fig.height=7}
+par(mfrow=c(1,2))
+cols <- colorRampPalette(c("black","blue","green","yellow","red"))
+duprateExpPlot(DupMat=dm,addLegend=FALSE)
+duprateExpPlot(DupMat=dm.bad,addLegend=FALSE,nrpoints=10,nbin=500,colramp=cols)
+```
+
+Any further parm sent to the *duprateExpPlot* is also sent to the 
+*smoothScatter* function.
+
+## Comparing the fitted parameters to other datasets
+
+The parameters of the fitted model may mean very little (or just nothing) for
+many, unless there's other data to compare with. We provide the pre-computed
+duplication matrices for all the RNA-Seq experiments publicly available from 
+the ENCODE project, for human and mouse.
+
+![alt text](figures/encode.svg)
+
+With the experience from the ENCODE datasets, we expect from single read 
+experiments little duplication at low RPKM (low **intercept**) rapidly rising 
+because of natural duplication (high **slope**). In contrast, paired-end
+experiments have a more mild rising of the natural duplication (low **slope**)
+due to having higher diversity of reads pairs since pairs with same start may
+still have different end.
+
+The common denominator for both designs is the importance of having a low
+intercept, suggesting that duplication rate at lowly expressed genes may serve
+as a quality measure.
+
+All the pre-computed duplication matrices are available in the [dupRadar Github site](https://github.com/ssayols/dupRadar).
+
+
+## Other plots
+
+**CAVEAT: Sometimes in discussions duplicate reads (i.e. two physically 
+different reads are mapped to the exact same position) and multi-mapping 
+reads (i.e. a single read is mapped to two or more locations in the genome)
+are mixed up. `r Rpackage("dupRadar")`'s main focus are PCR duplicates in RNA-Seq.
+However internally we keep track of unique mappers and multimappers, and we use
+both in some of the examples, to illustrate use cases of our package beyond
+the main aim. Multi-mapping reads are completely independent of PCR duplicates.**
+
+Apart from the plots relating RPK and duplication rate, the 
+`r Rpackage("dupRadar")` package provides some other useful plots to extract 
+information about the gene expression levels.
+
+An interesting quality metric are the fraction of reads taken up by groups of 
+genes binned by 5% expression levels. 
+```{r fig.width=7,fig.height=7}
+readcountExpBoxplot(DupMat=dm)
+```
+
+In the example we see that the 5% of 
+highest expressed genes in our sample data set take up around 60% of all reads.
+
+The distribution of RPK values per genes can be plotted with:
+
+```{r fig.width=7,fig.height=7}
+expressionHist(DupMat=dm)
+```
+
+This would help in identifying skewed distributions with unusual amount of 
+lowly expressed genes, or to detect no consensus between replicates.
+
+
+# Other information deduced from the data
+
+The duplication rate matrix calculated by the function *analyzeDuprates()* 
+contains some useful information about the sequencing experiment, that can be 
+used to assess the quality of the data.
+
+## Fraction of multimappers per gene
+
+Analogous to per gene duplication rate, the fraction of mutimappers 
+can be easily calculated fom the duplication matrix.
+
+Taking the counts from the column *allCountsMulti*, and substracting form it 
+the counts from the column *allCounts*, one can get the total number of 
+multihits. Thus, the fraction of multihits per gene can be calculating then 
+dividing by *allCountsMulti*.
+
+```{r}
+# calculate the fraction of multimappers per gene
+dm$mhRate <- (dm$allCountsMulti - dm$allCounts) / dm$allCountsMulti
+# how many genes are exclusively covered by multimappers
+sum(dm$mhRate == 1, na.rm=TRUE)
+
+# and how many have an RPKM (including multimappers) > 5
+sum(dm$mhRate==1 & dm$RPKMMulti > 5, na.rm=TRUE)
+
+# and which are they?
+dm[dm$mhRate==1 & dm$RPKMMulti > 5, "ID"]
+```
+
+We can also generate an overall picture about less extreme cases:
+```{r fig.width=7,fig.height=7}
+hist(dm$mhRate, 
+     breaks=50, 
+     col="red", 
+     main="Frequency of multimapping rates", 
+     xlab="Multimapping rate per gene", 
+     ylab="Frequency")
+
+```
+
+Also the direct comparison of reads per kilobase between uniquely and 
+multimappers is possible.
+
+```{r fig.width=7,fig.height=7}
+# comparison of multi-mapping RPK and uniquely-mapping RPK
+plot(log2(dm$RPK), 
+     log2(dm$RPKMulti), 
+     xlab="Reads per kb (uniquely mapping reads only)",
+     ylab="Reads per kb (all including multimappers, non-weighted)"
+)
+```
+
+Use with identify() to annotate interesting cases interactively.
+```{r, eval=F}
+identify(log2(dm$RPK), 
+         log2(dm$RPKMulti),
+	 labels=dm$ID)
+```
+
+## Connection between possible PCR artefacts and GC content
+In some cases we wondered about influence of GC content on PCR artefacts. An
+easy way to check using our `r Rpackage("dupRadar")` package in conjunction
+with `r Rpackage("biomaRt")` is demonstrated in the following. For simplicity
+we use our demo data here in which we *do not* see a big influence.
+
+
+```{r eval=F}
+library(dupRadar)
+library(biomaRt)
+
+## for detailed explanations on biomaRt, please see the respective
+## vignette
+
+## set up biomart connection for mouse (needs internet connection)
+ensm <- useMart("ensembl")
+ensm <- useDataset("mmusculus_gene_ensembl", mart=ensm)
+
+## get a table which has the gene GC content for the IDs that have been
+## used to generate the table (depends on the GTF annotation that you
+## use)
+tr <- getBM(attributes=c("mgi_symbol", "percentage_gc_content"),
+            values=TRUE, mart=ensm)
+
+## create a GC vector with IDs as element names
+mgi.gc <- tr$percentage_gc_content
+names(mgi.gc) <- tr$mgi_symbol
+
+## using dm demo duplication matrix that comes with the package
+## add GC content to our demo data and keep only subset for which we can
+## retrieve data
+keep <- dm$ID %in% tr$mgi_symbol
+dm.gc <- dm[keep,]
+dm.gc$gc <- mgi.gc[dm.gc$ID]
+
+## check distribution of annotated gene GC content (in %)
+boxplot(dm.gc$gc, main="Gene GC content", ylab="% GC")
+```
+
+![alt text](figures/GCcontent.svg)
+
+Now we can compare the dependence of duplication rate on expression level
+independently for below and above median GC genes (and to mention again,
+in this data set we don't have a big difference).
+
+```{r eval=F}
+par(mfrow=c(1,2))
+
+## below median GC genes
+duprateExpDensPlot(dm.gc[dm.gc$gc<=45,], main="below median GC genes")
+
+## above median GC genes
+duprateExpDensPlot(dm.gc[dm.gc$gc>=45,], main="above median GC genes")
+```
+
+![alt text](figures/GCdependence.png)
+
+# Conclusion
+
+The `r Rpackage("dupRadar")` package provides a framework for the analysis of 
+duplicate rates in RNAseq datasets. In addition, it includes a set of 
+convenient wrappers to correctly call some common tools used for marking PCR 
+duplicated reads in the BAM file. It's shipped as Bioconductor package in 
+order to offer a common framework for all the tools involved, and simplify its 
+use.
+
+# Including dupRadar into pipelines
+
+To include dupRadar as a single step in an RNA-Seq pipeline, integration into a short R-script can be done like in the following:
+
+```{r,eval=F}
+#!/usr/bin/env Rscript
+
+########################################
+##
+## dupRadar shell script
+## call dupRadar R package from the shell for 
+## easy integration into pipelines
+##
+## Holger Klein & Sergi Sayols
+##
+## https://github.com/ssayols/dupRadar
+##
+## input:
+## - _duplicate marked_ bam file
+## - gtf file
+## - parameters for duplication counting routine:
+##   stranded, paired, outdir, threads.
+##
+########################################
+
+library(dupRadar)
+
+####################
+##
+## get name patterns from command line
+##
+args   <- commandArgs(TRUE)
+
+## the bam file to analyse
+bam <- args[1]
+## usually, same GTF file as used in htseq-count
+gtf <- gsub("gtf=","",args[2])
+## no|yes|reverse
+stranded <- gsub("stranded=","",args[3])
+## is a paired end experiment
+paired   <- gsub("paired=","",args[4])
+## output directory
+outdir   <- gsub("outdir=","",args[5])
+## number of threads to be used
+threads  <- as.integer(gsub("threads=","",args[6]))
+
+if(length(args) != 6) { 
+  stop (paste0("Usage: ./dupRadar.sh <file.bam> <genes.gtf> ",
+			   "<stranded=[no|yes|reverse]> paired=[yes|no] ",
+			   "outdir=./ threads=1"))
+}
+
+if(!file.exists(bam)) { 
+  stop(paste("File",bam,"does NOT exist"))
+}
+
+if(!file.exists(gtf)) {
+  stop(paste("File",gtf,"does NOT exist"))
+}
+
+if(!file.exists(outdir)) { 
+  stop(paste("Dir",outdir,"does NOT exist"))
+}
+
+if(is.na(stranded) | !(grepl("no|yes|reverse",stranded))) {
+  stop("Stranded has to be no|yes|reverse")
+}
+
+if(is.na(paired) | !(grepl("no|yes",paired))) {
+  stop("Paired has to be no|yes")
+}
+
+if(is.na(threads)) {
+  stop("Threads has to be an integer number")
+}
+
+stranded <- if(stranded == "no") 0 else if(stranded == "yes") 1 else 2
+
+## end command line parsing
+##
+########################################
+
+########################################
+##
+## analyze duprates and create plots
+##
+cat("Processing file ", bam, " with GTF ", gtf, "\n")
+
+## calculate duplication rate matrix
+dm <- analyzeDuprates(bam,
+                      gtf,
+                      stranded,
+                      (paired == "yes"),
+                      threads)
+
+## produce plots
+
+## duprate vs. expression smooth scatter
+png(file=paste0(outdir,"/",gsub("(.*)\\.[^.]+","\\1",basename(bam)),"_dupRadar_drescatter.png"),
+    width=1000, height=1000)
+duprateExpDensPlot(dm, main=basename(bam))
+dev.off()
+
+## expression histogram
+png(file=paste0(outdir,"/",gsub("(.*)\\.[^.]+","\\1",basename(bam)),"_dupRadar_ehist.png"),
+    width=1000, height=1000)
+expressionHist(dm)
+dev.off()
+
+## duprate vs. expression boxplot
+png(file=paste0(outdir,"/",gsub("(.*)\\.[^.]+","\\1",basename(bam)),"_dupRadar_drebp.png"),
+    width=1000, height=1000)
+par(mar=c(10,4,4,2)+.1)
+duprateExpBoxplot(dm, main=basename(bam))
+dev.off()
+```
+
+## Citing dupRadar
+
+Please consider citing dupRadar if used in support of your own research:
+
+```{r citation}
+citation("dupRadar")
+```
+
+## Reporting problems or bugs
+
+If you run into problems using dupRadar, the [Bioconductor Support site](https://support.bioconductor.org/) is a good first place to ask for help. If you think there is a bug or an unreported feature, you can report it using the [dupRadar Github site](https://github.com/ssayols/dupRadar).
+
+# Session info
+
+The following package and versions were used in the production of this vignette.
+
+```{r echo=FALSE}
+sessionInfo()
+```
\ No newline at end of file
diff --git a/src/dupradar/script.sh b/src/dupradar/script.sh
index c84440a7..6ac70bf0 100755
--- a/src/dupradar/script.sh
+++ b/src/dupradar/script.sh
@@ -5,6 +5,7 @@
 
 set -eo pipefail 
 
+
 function num_strandness {
     if [ $par_strandedness == 'unstranded' ]; then echo 0
     elif [ $par_strandedness == 'forward' ]; then echo 1
@@ -14,12 +15,16 @@ function num_strandness {
     fi
 }
 
-Rscript "$meta_resources_dir/script.R" \
+
+if [ $par_paired ]; then paired=TRUE; else paired=FALSE; fi
+
+
+Rscript "$meta_resources_dir/dupradar.r" \
     $par_input \
     $par_id \
     $par_gtf_annotation \
     $(num_strandness) \
-    $par_paired \
+    ${paired} \
     ${meta_cpus:-1}
 
 mv "$par_id"_dupMatrix.txt $par_output_dupmatrix
@@ -31,8 +36,4 @@ mv "$par_id"_expressionHist.pdf $par_output_expression_histogram
 mv "$par_id"_intercept_slope.txt $par_output_intercept_slope
 
 
-dupradar_ver=$(Rscript -e "library(dupRadar); cat(as.character(packageVersion('dupRadar')))")
-text="bioconductor-dupradar: ${dupradar_ver}"
-echo "$text"
-
 exit 0
diff --git a/src/dupradar/test.sh b/src/dupradar/test.sh
index 8a85af27..c4afd67a 100755
--- a/src/dupradar/test.sh
+++ b/src/dupradar/test.sh
@@ -58,14 +58,11 @@ echo ">> Check if output is empty"
     && echo "Output file $output_intercept_slope is empty" && exit 1
 
 echo ">> Check if output is correct"
-cat "$output_dupmatrix"
-cat "$output_dup_intercept_mqc"
-cat "$output_intercept_slope"
 # diff ignoring white spaces
-diff -B -b "test_dupMatrix.pdf" "${meta_resources_dir}/test_data/test_dupMatrix.txt" || 
-    (echo "Output file $output_dupmatrix is not correct" && exit 1)
-diff -B -b "$output_dup_intercept_mqc" "${meta_resources_dir}/test_data/test_duprateExpDensCurve_mqc.txt" || \
-    (echo "Output file $output_dup_intercept_mqc is not correct" && exit 1)
+diff -B -b "$meta_resources_dir/test_data/test_dupMatrix.txt" "${meta_resources_dir}/test_data/test_dupMatrix.txt" || \
+    { echo "Output file $output_dupmatrix is not correct"; exit 1; }
+diff -B -b "$output_duprate_exp_denscurve_mqc" "${meta_resources_dir}/test_data/test_duprateExpDensCurve_mqc.txt" || \
+    { echo "Output file $output_duprate_exp_denscurve_mqc is not correct"; exit 1; }
 diff -B -b "$output_intercept_slope" "${meta_resources_dir}/test_data/test_intercept_slope.txt" || \
-    (echo "Output file $output_intercept_slope is not correct" && exit 1)
+    { echo "Output file $output_intercept_slope is not correct"; exit 1; }
 exit 0
\ No newline at end of file
diff --git a/src/dupradar/test_data/test_dupMatrix.txt b/src/dupradar/test_data/test_dupMatrix.txt
index 8ccda72e..9035e993 100644
--- a/src/dupradar/test_data/test_dupMatrix.txt
+++ b/src/dupradar/test_data/test_dupMatrix.txt
@@ -1,4 +1,4 @@
-ID	geneLength	allCountsMulti	filteredCountsMulti	dupRateMulti	dupsPerIdMulti	RPKMulti	PKMMulti	allCounts	filteredCounts	dupRate	dupsPerId	RPK	RPKM
+ID	geneLength	allCountsMulti	filteredCountsMulti	dupRateMulti	dupsPerIdMulti	RPKMulti	RPKMMulti	allCounts	filteredCounts	dupRate	dupsPerId	RPK	RPKM
 WASH7P	1769	41	41	0	0	23.1769361221029	188430.374976446	1	1	0	0	0.565291124929339	4595.86280430357
 FAM138A	2260	0	0	NA	0	0	0	0	0	NA	0	0	0
 FAM138F	2260	0	0	NA	0	0	0	0	0	NA	0	0	0
diff --git a/src/dupradar/test_data/test_dup_intercept_mqc.txt b/src/dupradar/test_data/test_dup_intercept_mqc.txt
deleted file mode 100644
index 0d0b4e1a..00000000
--- a/src/dupradar/test_data/test_dup_intercept_mqc.txt
+++ /dev/null
@@ -1,39 +0,0 @@
-#id: DupInt
-#plot_type: 'generalstats'
-#pconfig:
-#    dupRadar_intercept:
-#        title: 'dupInt'
-#        namespace: 'DupRadar'
-#        description: 'Intercept value from DupRadar'
-#        max: 100
-#        min: 0
-#        scale: 'RdYlGn-rev'
-#        format: '{:.2f}%'
-Sample dupRadar_intercept
-test 5.8262146393079e-11
-#id: DupInt
-#plot_type: 'generalstats'
-#pconfig:
-#    dupRadar_intercept:
-#        title: 'dupInt'
-#        namespace: 'DupRadar'
-#        description: 'Intercept value from DupRadar'
-#        max: 100
-#        min: 0
-#        scale: 'RdYlGn-rev'
-#        format: '{:.2f}%'
-Sample dupRadar_intercept
-test 5.8262146393079e-11
-#id: DupInt
-#plot_type: 'generalstats'
-#pconfig:
-#    dupRadar_intercept:
-#        title: 'dupInt'
-#        namespace: 'DupRadar'
-#        description: 'Intercept value from DupRadar'
-#        max: 100
-#        min: 0
-#        scale: 'RdYlGn-rev'
-#        format: '{:.2f}%'
-Sample dupRadar_intercept
-test 5.8262146393079e-11

From b51e6fb4111ddb12415644b3eff355414cdf3a6a Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Mon, 23 Sep 2024 11:24:50 +0200
Subject: [PATCH 08/11] update chagelof

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5d380e54..d8016713 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,8 @@
 
 * `rsem/rsem_calculate_expression`: Calculate expression levels (PR #93).
 
+* `dupradar`: Assessment of duplication rates in RNA-Seq datasets (PR #154).
+
 ## MINOR CHANGES
 
 * Upgrade to Viash 0.9.0.

From 0aa46130fcb04c9baf2748001335702ded651f99 Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Thu, 26 Sep 2024 14:24:14 +0200
Subject: [PATCH 09/11] Remove script.sh middleman, add test

---
 src/dupradar/config.vsh.yaml | 15 ++++---
 src/dupradar/dupradar.r      | 78 ++++++++++++++++++++++--------------
 src/dupradar/script.sh       | 39 ------------------
 src/dupradar/test.sh         | 45 +++++++++++++++++++--
 4 files changed, 98 insertions(+), 79 deletions(-)
 delete mode 100755 src/dupradar/script.sh

diff --git a/src/dupradar/config.vsh.yaml b/src/dupradar/config.vsh.yaml
index d25547db..ce4f79b1 100644
--- a/src/dupradar/config.vsh.yaml
+++ b/src/dupradar/config.vsh.yaml
@@ -24,13 +24,15 @@ argument_groups:
     required: true
     description: Path to GTF annotation file.
   - name: --paired
-    type: boolean
+    type: boolean_true
     description: Add flag if input alignment file consists of paired reads
   - name: --strandedness
-    type: string
-    choices: [forward, reverse, unstranded]
+    type: integer
+    choices: [0, 1, 2]
     description: |
-      Strandedness of input bam file reads (forward, reverse or unstranded (default, applicable to paired reads))
+      Strandedness of input bam file reads (1 = forward, 2 = reverse or 0 = unstranded (default, 
+      applicable to paired reads))
+    example: 0
     
 - name: Output
   arguments: 
@@ -47,7 +49,6 @@ argument_groups:
   - name: --output_duprate_exp_boxplot
     type: file
     direction: output
-    default: duprate_exp_boxplot.pdf
     description: |
       Path to output file (pdf) of distribution of expression box plot
   - name: --output_duprate_exp_densplot
@@ -75,9 +76,7 @@ argument_groups:
     example: intercept_slope.txt
 
 resources:
-  - type: bash_script
-    path: script.sh
-  # Copied from https://github.com/nf-core/rnaseq/blob/3.12.0/bin/dupradar.r
+  # Adapted from https://github.com/nf-core/rnaseq/blob/3.12.0/bin/dupradar.r
   - type: r_script
     path: dupradar.r
 test_resources:
diff --git a/src/dupradar/dupradar.r b/src/dupradar/dupradar.r
index 4294a42e..a3497e7a 100755
--- a/src/dupradar/dupradar.r
+++ b/src/dupradar/dupradar.r
@@ -1,24 +1,48 @@
 #!/usr/bin/env Rscript
 
-# Command line argument processing
-args = commandArgs(trailingOnly=TRUE)
-if (length(args) < 5) {
+## VIASH START
+par <- list(
+  "input" = 'test_data/sample.bam',
+  "id" = 'test',
+  "gtf_annotation" = 'test_data/genes.gtf',
+  "strandedness" = 1,
+  "paired" = TRUE,
+  "threads" = 1,
+  "output_dupmatrix"="dup_matrix.txt",
+  "output_dup_intercept_mqc"="dup_intercept_mqc.txt",
+  "output_duprate_exp_boxplot"="duprate_exp_boxplot.pdf",
+  "output_duprate_exp_densplot"="duprate_exp_densityplot.pdf",
+  "output_duprate_exp_denscurve_mqc"="duprate_exp_density_curve_mqc.pdf",
+  "output_expression_histogram"="expression_hist.pdf",
+  "output_intercept_slope"="intercept_slope.txt"
+)
+## VIASH END
+
+
+if (length(par) < 5) {
     stop("Usage: dupRadar.r <input.bam> <sample_id> <annotation.gtf> <strandDirection:0=unstranded/1=forward/2=reverse> <paired/single> <nbThreads> <R-package-location (optional)>", call.=FALSE)
 }
 
-message("paired_end is", args[5])
-message("the type is is", class(args[5]))
 
-input_bam <- args[1]
-output_prefix <- args[2]
-annotation_gtf <- args[3]
-stranded <- as.numeric(args[4])
-paired_end <- ifelse(args[5] == "true", TRUE, FALSE)
-threads <- as.numeric(args[6])
+input_bam <- par$input
+output_prefix <- par$id
+annotation_gtf <- par$gtf_annotation
+stranded <- ifelse(is.null(par$strandedness), 0, as.integer(par$strandedness))
+paired_end <- ifelse(par$paired, TRUE, FALSE)
+threads <- ifelse(is.null(par$threads), 1, as.integer(par$threads))
+
+output_dupmatrix <- ifelse(is.null(par$output_dupmatrix), paste0(output_prefix, "_dupMatrix.txt", sep=""), par$output_dupmatrix)
+output_dup_intercept_mqc <- ifelse(is.null(par$output_dup_intercept_mqc), paste0(output_prefix, "_dup_intercept_mqc.txt", sep=""), par$output_dup_intercept_mqc)
+output_duprate_exp_boxplot <- ifelse(is.null(par$output_duprate_exp_boxplot), paste0(output_prefix, "_duprateExpBoxplot.pdf", sep=""), par$output_duprate_exp_boxplot)
+output_duprate_exp_densplot <- ifelse(is.null(par$output_duprate_exp_densplot), paste0(output_prefix, "_duprate_exp_densplot.pdf", sep=""), par$output_duprate_exp_densplot)
+output_duprate_exp_denscurve_mqc <- ifelse(is.null(par$output_duprate_exp_denscurve_mqc), paste0(output_prefix, "_duprateExpDensCurve_mqc.txt", sep=""), par$output_duprate_exp_denscurve_mqc)
+output_expression_histogram <- ifelse(is.null(par$output_expression_histogram), paste0(output_prefix, "_expressionHist.pdf", sep=""), par$output_expression_histogram)
+output_intercept_slope <- ifelse(is.null(par$output_intercept_slope), paste0(output_prefix, "_intercept_slope.txt", sep=""), par$output_intercept_slope)
 
 bamRegex <- "(.+)\\.bam$"
 
-if(!(grepl(bamRegex, input_bam) && file.exists(input_bam) &&  (!file.info(input_bam)$isdir))) stop("First argument '<input.bam>' must be an existing file (not a directory) with '.bam' extension...")
+
+if(!(grepl(bamRegex, input_bam) && file.exists(input_bam) && (!file.info(input_bam)$isdir))) stop("First argument '<input.bam>' must be an existing file (not a directory) with '.bam' extension...")
 if(!(file.exists(annotation_gtf) &&  (!file.info(annotation_gtf)$isdir))) stop("Third argument '<annotation.gtf>' must be an existing file (and not a directory)...")
 if(is.na(stranded) || (!(stranded %in% (0:2)))) stop("Fourth argument <strandDirection> must be a numeric value in 0(unstranded)/1(forward)/2(reverse)...")
 if(is.na(threads) || (threads<=0)) stop("Fifth argument <nbThreads> must be a strictly positive numeric value...")
@@ -27,7 +51,7 @@ if(is.na(threads) || (threads<=0)) stop("Fifth argument <nbThreads> must be a st
 message("Input bam      (Arg 1): ", input_bam)
 message("Output basename(Arg 2): ", output_prefix)
 message("Input gtf      (Arg 3): ", annotation_gtf)
-message("Strandness     (Arg 4): ", c("unstranded", "forward", "reverse")[stranded+1])
+message("Strandness     (Arg 4): ", c("unstranded", "forward", "reverse")[stranded])
 message("paired_end     (Arg 5): ", paired_end)
 message("Nb threads     (Arg 6): ", threads)
 message("R package loc. (Arg 7): ", ifelse(length(args) > 4, args[5], "Not specified"))
@@ -43,25 +67,18 @@ if (!require("dupRadar")) {
   library("dupRadar")
 }
 if (!require("parallel")) {
-    install.packages("parallel", dependencies=TRUE, repos='http://cloud.r-project.org/')
     library("parallel")
 }
 
 
 # Duplicate stats
 dm <- analyzeDuprates(input_bam, annotation_gtf, stranded, paired_end, threads)
-print(input_bam)
-print(output_prefix)
-print(annotation_gtf)
-print(stranded)
-print(paired_end)
-print(threads)
 print("analyzeDuprates done")
-write.table(dm, file=paste(output_prefix, "_dupMatrix.txt", sep=""), quote=F, row.name=F, sep="\t")
+write.table(dm, file=output_dupmatrix, quote=F, row.name=F, sep="\t")
 print("write.table done")
 
 # 2D density scatter plot
-pdf(paste0(output_prefix, "_duprateExpDens.pdf"))
+pdf(output_duprate_exp_densplot)
 print("pdf done")
 duprateExpDensPlot(DupMat=dm)
 title("Density scatter plot")
@@ -74,7 +91,7 @@ cat(
     paste("- dupRadar Int (duprate at low read counts):", fit$intercept),
     paste("- dupRadar Sl (progression of the duplication rate):", fit$slope),
     fill=TRUE, labels=output_prefix,
-    file=paste0(output_prefix, "_intercept_slope.txt"), append=FALSE
+    file=output_intercept_slope, append=FALSE
 )
 
 # Create a multiqc file dupInt
@@ -92,8 +109,8 @@ line="#id: DupInt
 #        format: '{:.2f}%'
 Sample dupRadar_intercept"
 
-write(line,file=paste0(output_prefix, "_dup_intercept_mqc.txt"),append=TRUE)
-write(paste(sample_name, fit$intercept),file=paste0(output_prefix, "_dup_intercept_mqc.txt"),append=TRUE)
+write(line,file=output_dup_intercept_mqc, append=TRUE)
+write(paste(sample_name, fit$intercept),file=output_dup_intercept_mqc, append=TRUE)
 print("write dup_intercept_mqc done")
 
 # Get numbers from dupRadar GLM
@@ -142,28 +159,31 @@ line="#id: dupradar
 #          value: 1000
 #          width: 1"
 
-write(line,file=paste0(output_prefix, "_duprateExpDensCurve_mqc.txt"),append=TRUE)
+write(line,file=output_duprate_exp_denscurve_mqc, append=TRUE)
 write.table(
     cbind(curve_x, curve_y),
-    file=paste0(output_prefix, "_duprateExpDensCurve_mqc.txt"),
+    file=output_duprate_exp_denscurve_mqc,
     quote=FALSE, row.names=FALSE, col.names=FALSE, append=TRUE,
 )
 print("write duprateExpDensCurve_mqc done")
 # Distribution of expression box plot
-pdf(paste0(output_prefix, "_duprateExpBoxplot.pdf"))
+pdf(output_duprate_exp_boxplot)
 duprateExpBoxplot(DupMat=dm)
 title("Percent Duplication by Expression")
 mtext(output_prefix, side=3)
 dev.off()
 print("duprateExpBoxplot done")
 # Distribution of RPK values per gene
-pdf(paste0(output_prefix, "_expressionHist.pdf"))
+pdf(output_expression_histogram)
 expressionHist(DupMat=dm)
 title("Distribution of RPK values per gene")
 mtext(output_prefix, side=3)
 dev.off()
 print("expressionHist done")
 
+
+# 
+
 # Print sessioninfo to standard out
 print(output_prefix)
 # citation("dupRadar")
diff --git a/src/dupradar/script.sh b/src/dupradar/script.sh
deleted file mode 100755
index 6ac70bf0..00000000
--- a/src/dupradar/script.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/bin/bash
-## VIASH START
-## VIASH END
-
-
-set -eo pipefail 
-
-
-function num_strandness {
-    if [ $par_strandedness == 'unstranded' ]; then echo 0
-    elif [ $par_strandedness == 'forward' ]; then echo 1
-    elif [ $par_strandedness == 'reverse' ]; then echo 2
-    else echo "strandedness must be unstranded, forward or reverse." && \
-        exit 1
-    fi
-}
-
-
-if [ $par_paired ]; then paired=TRUE; else paired=FALSE; fi
-
-
-Rscript "$meta_resources_dir/dupradar.r" \
-    $par_input \
-    $par_id \
-    $par_gtf_annotation \
-    $(num_strandness) \
-    ${paired} \
-    ${meta_cpus:-1}
-
-mv "$par_id"_dupMatrix.txt $par_output_dupmatrix
-mv "$par_id"_dup_intercept_mqc.txt $par_output_dup_intercept_mqc
-mv "$par_id"_duprateExpBoxplot.pdf $par_output_duprate_exp_boxplot
-mv "$par_id"_duprateExpDens.pdf $par_output_duprate_exp_densplot
-mv "$par_id"_duprateExpDensCurve_mqc.txt $par_output_duprate_exp_denscurve_mqc
-mv "$par_id"_expressionHist.pdf $par_output_expression_histogram
-mv "$par_id"_intercept_slope.txt $par_output_intercept_slope
-
-
-exit 0
diff --git a/src/dupradar/test.sh b/src/dupradar/test.sh
index c4afd67a..99610357 100755
--- a/src/dupradar/test.sh
+++ b/src/dupradar/test.sh
@@ -8,7 +8,7 @@ output_dupmatrix="dup_matrix.txt"
 output_dup_intercept_mqc="dup_intercept_mqc.txt"
 output_duprate_exp_boxplot="duprate_exp_boxplot.pdf"
 output_duprate_exp_densplot="duprate_exp_densityplot.pdf"
-output_duprate_exp_denscurve_mqc="duprate_exp_density_curve_mqc.pdf"
+output_duprate_exp_denscurve_mqc="duprate_exp_density_curve_mqc.txt"
 output_expression_histogram="expression_hist.pdf"
 output_intercept_slope="intercept_slope.txt"
 
@@ -19,8 +19,7 @@ echo "> Running $meta_functionality_name for unpaired reads, writing to tmpdir $
     --input "$input_bam" \
     --id "test" \
     --gtf_annotation "$input_gtf" \
-    --strandedness "forward" \
-    --paired false \
+    --strandedness 1 \
     --output_dupmatrix $output_dupmatrix \
     --output_dup_intercept_mqc $output_dup_intercept_mqc \
     --output_duprate_exp_boxplot $output_duprate_exp_boxplot \
@@ -65,4 +64,44 @@ diff -B -b "$output_duprate_exp_denscurve_mqc" "${meta_resources_dir}/test_data/
     { echo "Output file $output_duprate_exp_denscurve_mqc is not correct"; exit 1; }
 diff -B -b "$output_intercept_slope" "${meta_resources_dir}/test_data/test_intercept_slope.txt" || \
     { echo "Output file $output_intercept_slope is not correct"; exit 1; }
+
+
+echo ">>> Test 2: Example without specified output files"
+
+"$meta_executable" \
+    --input "$input_bam" \
+    --id "test" \
+    --gtf_annotation "$input_gtf" \
+    --strandedness 1
+
+exit_code=$?
+[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1
+
+echo ">> asserting output has been created for paired read input"
+# output files with original names from DupRadar and id prefix
+[ ! -f "test_dupMatrix.txt" ] && echo "test_dupMatrix.txt was not created" && exit 1
+[ ! -s "test_dupMatrix.txt" ] && echo "test_dupMatrix.txt is empty" && exit 1
+[ ! -f "test_dup_intercept_mqc.txt" ] && echo "test_dup_intercept_mqc.txt was not created" && exit 1
+[ ! -s "test_dup_intercept_mqc.txt" ] && echo "test_dup_intercept_mqc.txt is empty" && exit 1
+[ ! -f "test_duprateExpBoxplot.pdf" ] && echo "test_duprateExpBoxplot.pdf was not created" && exit 1
+[ ! -s "test_duprateExpBoxplot.pdf" ] && echo "test_duprateExpBoxplot.pdf is empty" && exit 1
+[ ! -f "test_duprate_exp_densplot.pdf" ] && echo "test_duprate_exp_densplot.pdf was not created" && exit 1
+[ ! -s "test_duprate_exp_densplot.pdf" ] && echo "test_duprate_exp_densplot.pdf is empty" && exit 1
+[ ! -f "test_duprateExpDensCurve_mqc.txt" ] && echo "test_duprateExpDensCurve_mqc.txt was not created" && exit 1
+[ ! -s "test_duprateExpDensCurve_mqc.txt" ] && echo "test_duprateExpDensCurve_mqc.txt is empty" && exit 1
+[ ! -f "test_expressionHist.pdf" ] && echo "test_expressionHist.pdf was not created" && exit 1
+[ ! -s "test_expressionHist.pdf" ] && echo "test_expressionHist.pdf is empty" && exit 1
+[ ! -f "test_intercept_slope.txt" ] && echo "test_intercept_slope.txt was not created" && exit 1
+[ ! -s "test_intercept_slope.txt" ] && echo "test_intercept_slope.txt is empty" && exit 1
+
+
+echo ">> Check if output is correct"
+diff -B -b "test_dupMatrix.txt" "${meta_resources_dir}/test_data/test_dupMatrix.txt" || \
+    { echo "Output file test_dupMatrix.txt is not correct"; exit 1; }
+diff -B -b "test_duprateExpDensCurve_mqc.txt" "${meta_resources_dir}/test_data/test_duprateExpDensCurve_mqc.txt" || \
+    { echo "Output file test_duprateExpDensCurve_mqc.txt is not correct"; exit 1; }
+diff -B -b "test_intercept_slope.txt" "${meta_resources_dir}/test_data/test_intercept_slope.txt" || \
+    { echo "Output file test_intercept_slope.txt is not correct"; exit 1; }
+
+echo ">>> All tests passed"
 exit 0
\ No newline at end of file

From 81fe5001b1bfdf5e05a503e89e4acfa81b0fb43e Mon Sep 17 00:00:00 2001
From: Emma Rousseau <emmarou1@icloud.com>
Date: Wed, 30 Oct 2024 15:06:33 +0100
Subject: [PATCH 10/11] R-related syntax fixes

---
 src/dupradar/config.vsh.yaml | 2 +-
 src/dupradar/dupradar.r      | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/dupradar/config.vsh.yaml b/src/dupradar/config.vsh.yaml
index ce4f79b1..52063368 100644
--- a/src/dupradar/config.vsh.yaml
+++ b/src/dupradar/config.vsh.yaml
@@ -93,7 +93,7 @@ engines:
         packages: [ r-base ]
       - type: r
         bioc: [ dupRadar ]
-        packages: [ parallel ]
+        packages: [ parallel, rlang ]
       - type: docker
         run: |
           echo "DupRadar: $(Rscript -e "library(dupRadar); cat(as.character(packageVersion('dupRadar')))")" > /var/software_versions.txt
diff --git a/src/dupradar/dupradar.r b/src/dupradar/dupradar.r
index a3497e7a..700e3428 100755
--- a/src/dupradar/dupradar.r
+++ b/src/dupradar/dupradar.r
@@ -18,6 +18,7 @@ par <- list(
 )
 ## VIASH END
 
+library(rlang)
 
 if (length(par) < 5) {
     stop("Usage: dupRadar.r <input.bam> <sample_id> <annotation.gtf> <strandDirection:0=unstranded/1=forward/2=reverse> <paired/single> <nbThreads> <R-package-location (optional)>", call.=FALSE)
@@ -27,11 +28,11 @@ if (length(par) < 5) {
 input_bam <- par$input
 output_prefix <- par$id
 annotation_gtf <- par$gtf_annotation
-stranded <- ifelse(is.null(par$strandedness), 0, as.integer(par$strandedness))
+stranded <- par$strandedness %||% 0L
 paired_end <- ifelse(par$paired, TRUE, FALSE)
 threads <- ifelse(is.null(par$threads), 1, as.integer(par$threads))
 
-output_dupmatrix <- ifelse(is.null(par$output_dupmatrix), paste0(output_prefix, "_dupMatrix.txt", sep=""), par$output_dupmatrix)
+output_dupmatrix <- par$output_dupmatrix %||% paste0(output_prefix, "_dupMatrix.txt")
 output_dup_intercept_mqc <- ifelse(is.null(par$output_dup_intercept_mqc), paste0(output_prefix, "_dup_intercept_mqc.txt", sep=""), par$output_dup_intercept_mqc)
 output_duprate_exp_boxplot <- ifelse(is.null(par$output_duprate_exp_boxplot), paste0(output_prefix, "_duprateExpBoxplot.pdf", sep=""), par$output_duprate_exp_boxplot)
 output_duprate_exp_densplot <- ifelse(is.null(par$output_duprate_exp_densplot), paste0(output_prefix, "_duprate_exp_densplot.pdf", sep=""), par$output_duprate_exp_densplot)
@@ -96,7 +97,7 @@ cat(
 
 # Create a multiqc file dupInt
 sample_name <- gsub("Aligned.sortedByCoord.out.markDups", "", output_prefix)
-line="#id: DupInt
+line <- "#id: DupInt
 #plot_type: 'generalstats'
 #pconfig:
 #    dupRadar_intercept:

From feb97e0f19ae286e2a5aa8799e5143d82567cbf4 Mon Sep 17 00:00:00 2001
From: Emma Rousseau <emmarou1@icloud.com>
Date: Thu, 31 Oct 2024 09:26:50 +0000
Subject: [PATCH 11/11] uniform R syntax changes

---
 src/dupradar/dupradar.r | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/src/dupradar/dupradar.r b/src/dupradar/dupradar.r
index 700e3428..f813e902 100755
--- a/src/dupradar/dupradar.r
+++ b/src/dupradar/dupradar.r
@@ -8,13 +8,13 @@ par <- list(
   "strandedness" = 1,
   "paired" = TRUE,
   "threads" = 1,
-  "output_dupmatrix"="dup_matrix.txt",
-  "output_dup_intercept_mqc"="dup_intercept_mqc.txt",
-  "output_duprate_exp_boxplot"="duprate_exp_boxplot.pdf",
-  "output_duprate_exp_densplot"="duprate_exp_densityplot.pdf",
-  "output_duprate_exp_denscurve_mqc"="duprate_exp_density_curve_mqc.pdf",
-  "output_expression_histogram"="expression_hist.pdf",
-  "output_intercept_slope"="intercept_slope.txt"
+  "output_dupmatrix" = "dup_matrix.txt",
+  "output_dup_intercept_mqc" = "dup_intercept_mqc.txt",
+  "output_duprate_exp_boxplot" = "duprate_exp_boxplot.pdf",
+  "output_duprate_exp_densplot" = "duprate_exp_densityplot.pdf",
+  "output_duprate_exp_denscurve_mqc" = "duprate_exp_density_curve_mqc.pdf",
+  "output_expression_histogram" = "expression_hist.pdf",
+  "output_intercept_slope" = "intercept_slope.txt"
 )
 ## VIASH END
 
@@ -30,15 +30,15 @@ output_prefix <- par$id
 annotation_gtf <- par$gtf_annotation
 stranded <- par$strandedness %||% 0L
 paired_end <- ifelse(par$paired, TRUE, FALSE)
-threads <- ifelse(is.null(par$threads), 1, as.integer(par$threads))
+threads <- par$threads %||% 1L
 
 output_dupmatrix <- par$output_dupmatrix %||% paste0(output_prefix, "_dupMatrix.txt")
-output_dup_intercept_mqc <- ifelse(is.null(par$output_dup_intercept_mqc), paste0(output_prefix, "_dup_intercept_mqc.txt", sep=""), par$output_dup_intercept_mqc)
-output_duprate_exp_boxplot <- ifelse(is.null(par$output_duprate_exp_boxplot), paste0(output_prefix, "_duprateExpBoxplot.pdf", sep=""), par$output_duprate_exp_boxplot)
-output_duprate_exp_densplot <- ifelse(is.null(par$output_duprate_exp_densplot), paste0(output_prefix, "_duprate_exp_densplot.pdf", sep=""), par$output_duprate_exp_densplot)
-output_duprate_exp_denscurve_mqc <- ifelse(is.null(par$output_duprate_exp_denscurve_mqc), paste0(output_prefix, "_duprateExpDensCurve_mqc.txt", sep=""), par$output_duprate_exp_denscurve_mqc)
-output_expression_histogram <- ifelse(is.null(par$output_expression_histogram), paste0(output_prefix, "_expressionHist.pdf", sep=""), par$output_expression_histogram)
-output_intercept_slope <- ifelse(is.null(par$output_intercept_slope), paste0(output_prefix, "_intercept_slope.txt", sep=""), par$output_intercept_slope)
+output_dup_intercept_mqc <- par$output_dup_intercept_mqc %||% paste0(output_prefix, "_dup_intercept_mqc.txt")
+output_duprate_exp_boxplot <- par$output_duprate_exp_boxplot %||% paste0(output_prefix, "_duprateExpBoxplot.pdf")
+output_duprate_exp_densplot <- par$output_duprate_exp_densplot %||% paste0(output_prefix, "_duprate_exp_densplot.pdf")
+output_duprate_exp_denscurve_mqc <- par$output_duprate_exp_denscurve_mqc %||% paste0(output_prefix, "_duprateExpDensCurve_mqc.txt")
+output_expression_histogram <- par$output_expression_histogram %||% paste0(output_prefix, "_expressionHist.pdf")
+output_intercept_slope <- par$output_intercept_slope %||% paste0(output_prefix, "_intercept_slope.txt")
 
 bamRegex <- "(.+)\\.bam$"
 
@@ -116,18 +116,18 @@ print("write dup_intercept_mqc done")
 
 # Get numbers from dupRadar GLM
 curve_x <- sort(log10(dm$RPK))
-curve_y = 100*predict(fit$glm, data.frame(x=curve_x), type="response")
+curve_y <- 100*predict(fit$glm, data.frame(x=curve_x), type="response")
 # Remove all of the infinite values
 infs = which(curve_x %in% c(-Inf,Inf))
-curve_x = curve_x[-infs]
-curve_y = curve_y[-infs]
+curve_x <- curve_x[-infs]
+curve_y <- curve_y[-infs]
 # Reduce number of data points
 curve_x <- curve_x[seq(1, length(curve_x), 10)]
 curve_y <- curve_y[seq(1, length(curve_y), 10)]
 # Convert x values back to real counts
-curve_x = 10^curve_x
+curve_x <- 10^curve_x
 # Write to file
-line="#id: dupradar
+line <- "#id: dupradar
 #section_name: 'DupRadar'
 #section_href: 'bioconductor.org/packages/release/bioc/html/dupRadar.html'
 #description: \"provides duplication rate quality control for RNA-Seq datasets. Highly expressed genes can be expected to have a lot of duplicate reads, but high numbers of duplicates at low read counts can indicate low library complexity with technical duplication.