From a893a37f70a665bfd238fa2833eb9bfdc1acc1c5 Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Mon, 23 Sep 2024 11:15:35 +0200
Subject: [PATCH] Version info, functional test

---
 src/dupradar/config.vsh.yaml                  |  15 +-
 src/dupradar/{script.R => dupradar.r}         |  23 +-
 src/dupradar/help.txt                         | 583 ++++++++++++++++++
 src/dupradar/script.sh                        |  13 +-
 src/dupradar/test.sh                          |  13 +-
 src/dupradar/test_data/test_dupMatrix.txt     |   2 +-
 .../test_data/test_dup_intercept_mqc.txt      |  39 --
 7 files changed, 628 insertions(+), 60 deletions(-)
 rename src/dupradar/{script.R => dupradar.r} (92%)
 delete mode 100644 src/dupradar/test_data/test_dup_intercept_mqc.txt

diff --git a/src/dupradar/config.vsh.yaml b/src/dupradar/config.vsh.yaml
index 9727fdec..d25547db 100644
--- a/src/dupradar/config.vsh.yaml
+++ b/src/dupradar/config.vsh.yaml
@@ -79,15 +79,26 @@ resources:
     path: script.sh
   # Copied from https://github.com/nf-core/rnaseq/blob/3.12.0/bin/dupradar.r
   - type: r_script
-    path: script.R
+    path: dupradar.r
 test_resources:
   - type: bash_script
     path: test.sh
   - type: file
     path: test_data
+    
 engines:
   - type: docker
-    image: quay.io/biocontainers/bioconductor-dupradar:1.32.0--r43hdfd78af_0
+    image: ubuntu:22.04
+    setup:   
+      - type: apt
+        packages: [ r-base ]
+      - type: r
+        bioc: [ dupRadar ]
+        packages: [ parallel ]
+      - type: docker
+        run: |
+          echo "DupRadar: $(Rscript -e "library(dupRadar); cat(as.character(packageVersion('dupRadar')))")" > /var/software_versions.txt
+        
 runners:
   - type: executable  
   - type: nextflow
diff --git a/src/dupradar/script.R b/src/dupradar/dupradar.r
similarity index 92%
rename from src/dupradar/script.R
rename to src/dupradar/dupradar.r
index 24ebce2d..4294a42e 100755
--- a/src/dupradar/script.R
+++ b/src/dupradar/dupradar.r
@@ -47,17 +47,29 @@ if (!require("parallel")) {
     library("parallel")
 }
 
+
 # Duplicate stats
 dm <- analyzeDuprates(input_bam, annotation_gtf, stranded, paired_end, threads)
+print(input_bam)
+print(output_prefix)
+print(annotation_gtf)
+print(stranded)
+print(paired_end)
+print(threads)
+print("analyzeDuprates done")
 write.table(dm, file=paste(output_prefix, "_dupMatrix.txt", sep=""), quote=F, row.name=F, sep="\t")
+print("write.table done")
 
 # 2D density scatter plot
 pdf(paste0(output_prefix, "_duprateExpDens.pdf"))
+print("pdf done")
 duprateExpDensPlot(DupMat=dm)
 title("Density scatter plot")
 mtext(output_prefix, side=3)
 dev.off()
+print("duprateExpDensPlot done")
 fit <- duprateExpFit(DupMat=dm)
+print("duprateExpFit done")
 cat(
     paste("- dupRadar Int (duprate at low read counts):", fit$intercept),
     paste("- dupRadar Sl (progression of the duplication rate):", fit$slope),
@@ -82,6 +94,7 @@ Sample dupRadar_intercept"
 
 write(line,file=paste0(output_prefix, "_dup_intercept_mqc.txt"),append=TRUE)
 write(paste(sample_name, fit$intercept),file=paste0(output_prefix, "_dup_intercept_mqc.txt"),append=TRUE)
+print("write dup_intercept_mqc done")
 
 # Get numbers from dupRadar GLM
 curve_x <- sort(log10(dm$RPK))
@@ -135,22 +148,24 @@ write.table(
     file=paste0(output_prefix, "_duprateExpDensCurve_mqc.txt"),
     quote=FALSE, row.names=FALSE, col.names=FALSE, append=TRUE,
 )
-
+print("write duprateExpDensCurve_mqc done")
 # Distribution of expression box plot
 pdf(paste0(output_prefix, "_duprateExpBoxplot.pdf"))
 duprateExpBoxplot(DupMat=dm)
 title("Percent Duplication by Expression")
 mtext(output_prefix, side=3)
 dev.off()
-
+print("duprateExpBoxplot done")
 # Distribution of RPK values per gene
 pdf(paste0(output_prefix, "_expressionHist.pdf"))
 expressionHist(DupMat=dm)
 title("Distribution of RPK values per gene")
 mtext(output_prefix, side=3)
 dev.off()
+print("expressionHist done")
 
 # Print sessioninfo to standard out
 print(output_prefix)
-citation("dupRadar")
-sessionInfo()
+# citation("dupRadar")
+# print("citation done")
+# sessionInfo()
diff --git a/src/dupradar/help.txt b/src/dupradar/help.txt
index e69de29b..819c352d 100644
--- a/src/dupradar/help.txt
+++ b/src/dupradar/help.txt
@@ -0,0 +1,583 @@
+---
+title: "Using the dupRadar package"
+author: "Sergi Sayols, Holger Klein"
+date: "June 23, 2015"
+output:
+    BiocStyle::html_document:
+    toc: true
+---
+<!--
+% \VignetteEngine{knitr::rmarkdown}
+% \VignetteIndexEntry{Using dupRadar}
+-->
+
+```{r pre,echo=FALSE,results='hide'}
+library(knitr)
+opts_chunk$set(warning=FALSE,message=FALSE,cache=TRUE)
+```
+```{r style,echo=FALSE,results='asis'}
+BiocStyle::markdown()
+```
+
+
+
+# Introduction to dupRadar
+
+RNA-Seq experiments are a common strategy nowadays to quantify the gene 
+expression levels in cells. Due to its higher sensitivity compared to 
+arrays and the ability to discover novel features, makes them the choice for 
+most modern experiments. 
+
+In RNA-Seq - as in all other NGS applications - quality control is essential.
+Current NGS workflows usually involve PCR steps at some point, which involves
+the danger of PCR artefacts and over-amplification of reads.
+For common DNA-based assays PCR duplicates are often simply removed before 
+further analysis and their overall fraction or the read multiplicity taken 
+as quality metrics.
+In RNA-Seq however, apart from the technical PCR duplicates, there is a strong
+source for biological read duplicates: for a given gene length and sequencing 
+depth there exists an expression level beyond which it is not possible to add
+place more reads inside a gene locus without placing a second read exactly 
+on the position of an already existing read.
+For this reason the overall duplication rate is not a useful measure for 
+RNA-Seq.
+
+As in the NGS/RNA-Seq QC ecosystem there were no suitable tools to address this
+question, we set out to develop a new tool. The `r Rpackage("dupRadar")` 
+package gives an insight into the duplication problem by graphically 
+relating the gene expression level and the duplication rate present on it. 
+Thus, failed experiments can be easily identified at a glance.
+
+
+**Note: by now RNA-Seq protocols have matured so that for bulk RNA protocols data 
+rarely suffer from technical duplicates. With newer low-input or single cell 
+RNA-Seq protocols technical duplicates possible problems are worth to be 
+checked for by default, especially if protocols are pushed to or beyond 
+their boundaries.
+Paired-end libraries make the distinction between duplicates due to highly 
+expressed genes and PCR duplicates easier, but the problem itself is not 
+completely solved, especially at higher sequencing depths.**
+
+# Getting started using dupRadar
+
+## Preparing your data
+
+Previous to running the duplication rate analysis, the BAM file with your
+mapped reads has to be duplicate marked with a like Picard, or the faster 
+BamUtil dedup BioBamBam. 
+The `r Rpackage("dupRadar")` package only works with duplicate marked BAM files.
+
+If you do not have/want a duplication marking step in your default pipeline, 
+the `r Rpackage("dupRadar")` package includes, for your convenience, 
+wrappers to properly call some of these tools from your R session. Note that
+you still have to supply the path of the dupmarker installation though:
+
+```{r loadLibrary}
+library(dupRadar)
+```
+
+Now, simply call the wrapper:
+
+```{r eval=FALSE}
+# call the duplicate marker and analyze the reads
+bamDuprm <- markDuplicates(dupremover="bamutil",
+			   bam="test.bam",
+	    	   path="/opt/bamUtil-master/bin",
+			   rminput=FALSE)
+```
+
+Simply specify which tool to use, the path where this tool is installed, 
+and the input bam file to be analyzed. After marking duplicates, it's safe 
+to remove to original BAM file in order to save space.
+
+The `r Rpackage("dupRadar")` package currently comes with support for:
+
+- [Picard MarkDuplicates](http://broadinstitute.github.io/picard)
+- [BamUtil](http://genome.sph.umich.edu/wiki/BamUtil)
+
+After the BAM file is marked for duplicates, dupRadar is ready to 
+analyze how the duplication rate is related with the estimated gene expression 
+levels.
+
+## A GTF file
+
+Unless there is any specific reason, dupRadar can use the same GTF file 
+that will be later used to count the reads falling on features.
+
+A valid GTF file can be obtained from UCSC, Ensembl, the iGenomes or other 
+projects.
+
+Note that the resulting duplication rate plots depend on the GTF annotation 
+used. GTF files from the gencode projects result in a less clear picture of 
+duplication rates, as there are many more features and feature types 
+annotated, which overlap heavily as well. In some cases creating the plots 
+only using subsets of gencode annotation files (e.g. just protein 
+coding genes) serve the QC purpose of this tool better.
+
+## AnnotationHub as a source of GTF files
+
+The Bioconductor `r Rpackage("AnnotationHub")` package provides an alternative
+approach to obtain annotation in GTF format from entities such as Ensembl, UCSC,
+ENCODE, and 1000 Genomes projects.
+
+This is partly outlined in the AnnotationHub 'HOWTO' vignette
+section "Ensembl GTF and FASTA files for TxDb gene models and sequence
+queries"; for the Takifugu example, the downloaded GTF file is
+available from the cache.
+
+```{r}
+if(suppressWarnings(require(AnnotationHub))) {
+    ah = AnnotationHub()
+    query(ah, c("ensembl", "80", "Takifugu", "gtf"))    # discovery
+    cache(ah["AH47101"])                                # retrieve / file path
+}
+```
+
+# dupRate demo data
+
+In the package we include two precomputed duplication matrices for two RNASeq
+experiments used as examples of a good and a failed (in terms of high redundancy
+of reads) experiments. The experiments come from the ENCODE project, as a source
+of a wide variety of protocols, library types and sequencing facilities.
+
+Load the example dataset with:
+
+```{r}
+attach(dupRadar_examples)
+```
+
+# The duplication rate analysis
+
+The analysis requires some info about the sequencing run and library 
+preparation protocol:
+
+- The strandess information about the reads: is the sequencing strand 
+specific? if so, are the reads reversely sequenced?
+- Are the reads paired, or single?
+
+Due to its phenomenal performance, internally we use the featureCounts() 
+function from the Bioconductor `r Biocpkg("Rsubread")` package, which also
+supports multiple threads.
+
+```{r eval=FALSE}
+# The call parameters: 
+bamDuprm <- "test_duprm.bam"	# the duplicate marked bam file
+gtf <- "genes.gtf"	# the gene model
+stranded <- 2		# '0' (unstranded), '1' (stranded) and '2' (reversely stranded)
+paired   <- FALSE	# is the library paired end?
+threads  <- 4		# number of threads to be used
+
+# Duplication rate analysis
+dm <- analyzeDuprates(bamDuprm,gtf,stranded,paired,threads)
+```
+
+The duplication matrix contains read counts in different scenarios and RPK and RPKM values for every gene.
+
+|ID     | geneLength| allCountsMulti| filteredCountsMulti| dupRateMulti| dupsPerIdMulti|     RPKMulti|  RPKMMulti| allCounts| filteredCounts|   dupRate| dupsPerId|         RPK|       RPKM|
+|:------|----------:|--------------:|-------------------:|------------:|--------------:|------------:|----------:|---------:|--------------:|---------:|---------:|-----------:|----------:|
+|Xkr4   |       3634|              2|                   2|    0.0000000|              0|    0.5503577|  0.0117159|         2|              2| 0.0000000|         0|   0.5503577|  0.0117159|
+|Rp1    |       9747|              0|                   0|          NaN|              0|    0.0000000|  0.0000000|         0|              0|       NaN|         0|   0.0000000|  0.0000000|
+|Sox17  |       3130|              0|                   0|          NaN|              0|    0.0000000|  0.0000000|         0|              0|       NaN|         0|   0.0000000|  0.0000000|
+|Mrpl15 |       4203|            429|                 261|    0.3916084|            168|  102.0699500|  2.1728436|       419|            258| 0.3842482|       161|  99.6906971|  2.1221946|
+|Lypla1 |       2433|           1106|                 576|    0.4792043|            530|  454.5828196|  9.6770634|      1069|            562| 0.4742750|       507| 439.3752569|  9.3533280|
+|Tcea1  |       2847|           2890|                1046|    0.6380623|           1844| 1015.1036178| 21.6093121|      1822|            696| 0.6180022|      1126| 639.9719002| 13.6235871|
+
+# Plotting and interpretation
+
+The number of reads per base assigned to a gene in an ideal RNA-Seq data set 
+is expected to be proportional to the abundance of its transcripts in the 
+sample. For lowly expressed genes we expect read duplication to happen rarely by
+chance, while for highly expressed genes - depending on the total sequencing
+depth - we expect read duplication to happen often.
+
+A good way to learn if a dataset is following this trend is by relating the 
+normalized number of counts per gene (RPK, as a quantification of the gene 
+expression) and the fraction represented by duplicated reads.
+
+The `r Rpackage("dupRadar")` offers several functions to assess this 
+relationship. The most prominent may be the 2d density scatter plot:
+
+```{r fig.width=14,fig.height=7}
+## make a duprate plot (blue cloud)
+par(mfrow=c(1,2))
+duprateExpDensPlot(DupMat=dm)		# a good looking plot
+title("good experiment")
+duprateExpDensPlot(DupMat=dm.bad)	# a dataset with duplication problems
+title("duplication problems")
+
+## duprate boxplot
+duprateExpBoxplot(DupMat=dm)		# a good looking plot
+duprateExpBoxplot(DupMat=dm.bad)	# a dataset with duplication problems
+```
+
+The *duprateExpDensPlot* has helper lines at the threshold of 1 read/bp and at 
+0.5 RPKM. Moreover by default a generalized linear model is fit to the data and
+overplotted (see also below).
+
+## Fitting a model into the data
+
+To summarize the relationship between duplication rates and gene expression, 
+we propose fitting a logistic regression curve onto the data.  With the 
+coefficients of the fitted model, one can get an idea of the initial 
+duplication rate at low read counts (Intercept) and the progression of the 
+duplication rate along with the progression of the read counts (Slope).
+
+
+```{r fig.width=7,fig.height=7}
+duprateExpDensPlot(DupMat=dm)
+
+# or, just to get the fitted model without plot
+fit <- duprateExpFit(DupMat=dm)
+cat("duprate at low read counts: ",fit$intercept,"\n",
+	"progression of the duplication rate: ",fit$slope,fill=TRUE)
+```
+
+Our main use case for that function is the condensation
+of the plots into quality metrics that can be used for automatic flagging of 
+possibly problematic samples in large experiments or aggregation with other
+quality metrics in large tables to analyse interdependencies.
+
+
+The *duprateExpBoxplot* plot shows the range of the duplication rates at 5% 
+bins (default) along the distribution of RPK gene counts. The x-axis displays 
+the quantile of the RPK distribution, and the average RPK of the genes 
+contained in this quantile.
+
+Individual genes can be identified in the plot:
+
+```{r eval=FALSE}
+## INTERACTIVE: identify single points on screen (name="ID" column of dm)
+duprateExpPlot(DupMat=dm)		# a good looking plot
+duprateExpIdentify(DupMat=dm)
+```
+
+One can also call the function *duprateExpPlot* to get smooth color density 
+representation of the same data:
+
+```{r fig.width=14,fig.height=7}
+par(mfrow=c(1,2))
+cols <- colorRampPalette(c("black","blue","green","yellow","red"))
+duprateExpPlot(DupMat=dm,addLegend=FALSE)
+duprateExpPlot(DupMat=dm.bad,addLegend=FALSE,nrpoints=10,nbin=500,colramp=cols)
+```
+
+Any further parm sent to the *duprateExpPlot* is also sent to the 
+*smoothScatter* function.
+
+## Comparing the fitted parameters to other datasets
+
+The parameters of the fitted model may mean very little (or just nothing) for
+many, unless there's other data to compare with. We provide the pre-computed
+duplication matrices for all the RNA-Seq experiments publicly available from 
+the ENCODE project, for human and mouse.
+
+![alt text](figures/encode.svg)
+
+With the experience from the ENCODE datasets, we expect from single read 
+experiments little duplication at low RPKM (low **intercept**) rapidly rising 
+because of natural duplication (high **slope**). In contrast, paired-end
+experiments have a more mild rising of the natural duplication (low **slope**)
+due to having higher diversity of reads pairs since pairs with same start may
+still have different end.
+
+The common denominator for both designs is the importance of having a low
+intercept, suggesting that duplication rate at lowly expressed genes may serve
+as a quality measure.
+
+All the pre-computed duplication matrices are available in the [dupRadar Github site](https://github.com/ssayols/dupRadar).
+
+
+## Other plots
+
+**CAVEAT: Sometimes in discussions duplicate reads (i.e. two physically 
+different reads are mapped to the exact same position) and multi-mapping 
+reads (i.e. a single read is mapped to two or more locations in the genome)
+are mixed up. `r Rpackage("dupRadar")`'s main focus are PCR duplicates in RNA-Seq.
+However internally we keep track of unique mappers and multimappers, and we use
+both in some of the examples, to illustrate use cases of our package beyond
+the main aim. Multi-mapping reads are completely independent of PCR duplicates.**
+
+Apart from the plots relating RPK and duplication rate, the 
+`r Rpackage("dupRadar")` package provides some other useful plots to extract 
+information about the gene expression levels.
+
+An interesting quality metric are the fraction of reads taken up by groups of 
+genes binned by 5% expression levels. 
+```{r fig.width=7,fig.height=7}
+readcountExpBoxplot(DupMat=dm)
+```
+
+In the example we see that the 5% of 
+highest expressed genes in our sample data set take up around 60% of all reads.
+
+The distribution of RPK values per genes can be plotted with:
+
+```{r fig.width=7,fig.height=7}
+expressionHist(DupMat=dm)
+```
+
+This would help in identifying skewed distributions with unusual amount of 
+lowly expressed genes, or to detect no consensus between replicates.
+
+
+# Other information deduced from the data
+
+The duplication rate matrix calculated by the function *analyzeDuprates()* 
+contains some useful information about the sequencing experiment, that can be 
+used to assess the quality of the data.
+
+## Fraction of multimappers per gene
+
+Analogous to per gene duplication rate, the fraction of mutimappers 
+can be easily calculated fom the duplication matrix.
+
+Taking the counts from the column *allCountsMulti*, and substracting form it 
+the counts from the column *allCounts*, one can get the total number of 
+multihits. Thus, the fraction of multihits per gene can be calculating then 
+dividing by *allCountsMulti*.
+
+```{r}
+# calculate the fraction of multimappers per gene
+dm$mhRate <- (dm$allCountsMulti - dm$allCounts) / dm$allCountsMulti
+# how many genes are exclusively covered by multimappers
+sum(dm$mhRate == 1, na.rm=TRUE)
+
+# and how many have an RPKM (including multimappers) > 5
+sum(dm$mhRate==1 & dm$RPKMMulti > 5, na.rm=TRUE)
+
+# and which are they?
+dm[dm$mhRate==1 & dm$RPKMMulti > 5, "ID"]
+```
+
+We can also generate an overall picture about less extreme cases:
+```{r fig.width=7,fig.height=7}
+hist(dm$mhRate, 
+     breaks=50, 
+     col="red", 
+     main="Frequency of multimapping rates", 
+     xlab="Multimapping rate per gene", 
+     ylab="Frequency")
+
+```
+
+Also the direct comparison of reads per kilobase between uniquely and 
+multimappers is possible.
+
+```{r fig.width=7,fig.height=7}
+# comparison of multi-mapping RPK and uniquely-mapping RPK
+plot(log2(dm$RPK), 
+     log2(dm$RPKMulti), 
+     xlab="Reads per kb (uniquely mapping reads only)",
+     ylab="Reads per kb (all including multimappers, non-weighted)"
+)
+```
+
+Use with identify() to annotate interesting cases interactively.
+```{r, eval=F}
+identify(log2(dm$RPK), 
+         log2(dm$RPKMulti),
+	 labels=dm$ID)
+```
+
+## Connection between possible PCR artefacts and GC content
+In some cases we wondered about influence of GC content on PCR artefacts. An
+easy way to check using our `r Rpackage("dupRadar")` package in conjunction
+with `r Rpackage("biomaRt")` is demonstrated in the following. For simplicity
+we use our demo data here in which we *do not* see a big influence.
+
+
+```{r eval=F}
+library(dupRadar)
+library(biomaRt)
+
+## for detailed explanations on biomaRt, please see the respective
+## vignette
+
+## set up biomart connection for mouse (needs internet connection)
+ensm <- useMart("ensembl")
+ensm <- useDataset("mmusculus_gene_ensembl", mart=ensm)
+
+## get a table which has the gene GC content for the IDs that have been
+## used to generate the table (depends on the GTF annotation that you
+## use)
+tr <- getBM(attributes=c("mgi_symbol", "percentage_gc_content"),
+            values=TRUE, mart=ensm)
+
+## create a GC vector with IDs as element names
+mgi.gc <- tr$percentage_gc_content
+names(mgi.gc) <- tr$mgi_symbol
+
+## using dm demo duplication matrix that comes with the package
+## add GC content to our demo data and keep only subset for which we can
+## retrieve data
+keep <- dm$ID %in% tr$mgi_symbol
+dm.gc <- dm[keep,]
+dm.gc$gc <- mgi.gc[dm.gc$ID]
+
+## check distribution of annotated gene GC content (in %)
+boxplot(dm.gc$gc, main="Gene GC content", ylab="% GC")
+```
+
+![alt text](figures/GCcontent.svg)
+
+Now we can compare the dependence of duplication rate on expression level
+independently for below and above median GC genes (and to mention again,
+in this data set we don't have a big difference).
+
+```{r eval=F}
+par(mfrow=c(1,2))
+
+## below median GC genes
+duprateExpDensPlot(dm.gc[dm.gc$gc<=45,], main="below median GC genes")
+
+## above median GC genes
+duprateExpDensPlot(dm.gc[dm.gc$gc>=45,], main="above median GC genes")
+```
+
+![alt text](figures/GCdependence.png)
+
+# Conclusion
+
+The `r Rpackage("dupRadar")` package provides a framework for the analysis of 
+duplicate rates in RNAseq datasets. In addition, it includes a set of 
+convenient wrappers to correctly call some common tools used for marking PCR 
+duplicated reads in the BAM file. It's shipped as Bioconductor package in 
+order to offer a common framework for all the tools involved, and simplify its 
+use.
+
+# Including dupRadar into pipelines
+
+To include dupRadar as a single step in an RNA-Seq pipeline, integration into a short R-script can be done like in the following:
+
+```{r,eval=F}
+#!/usr/bin/env Rscript
+
+########################################
+##
+## dupRadar shell script
+## call dupRadar R package from the shell for 
+## easy integration into pipelines
+##
+## Holger Klein & Sergi Sayols
+##
+## https://github.com/ssayols/dupRadar
+##
+## input:
+## - _duplicate marked_ bam file
+## - gtf file
+## - parameters for duplication counting routine:
+##   stranded, paired, outdir, threads.
+##
+########################################
+
+library(dupRadar)
+
+####################
+##
+## get name patterns from command line
+##
+args   <- commandArgs(TRUE)
+
+## the bam file to analyse
+bam <- args[1]
+## usually, same GTF file as used in htseq-count
+gtf <- gsub("gtf=","",args[2])
+## no|yes|reverse
+stranded <- gsub("stranded=","",args[3])
+## is a paired end experiment
+paired   <- gsub("paired=","",args[4])
+## output directory
+outdir   <- gsub("outdir=","",args[5])
+## number of threads to be used
+threads  <- as.integer(gsub("threads=","",args[6]))
+
+if(length(args) != 6) { 
+  stop (paste0("Usage: ./dupRadar.sh <file.bam> <genes.gtf> ",
+			   "<stranded=[no|yes|reverse]> paired=[yes|no] ",
+			   "outdir=./ threads=1"))
+}
+
+if(!file.exists(bam)) { 
+  stop(paste("File",bam,"does NOT exist"))
+}
+
+if(!file.exists(gtf)) {
+  stop(paste("File",gtf,"does NOT exist"))
+}
+
+if(!file.exists(outdir)) { 
+  stop(paste("Dir",outdir,"does NOT exist"))
+}
+
+if(is.na(stranded) | !(grepl("no|yes|reverse",stranded))) {
+  stop("Stranded has to be no|yes|reverse")
+}
+
+if(is.na(paired) | !(grepl("no|yes",paired))) {
+  stop("Paired has to be no|yes")
+}
+
+if(is.na(threads)) {
+  stop("Threads has to be an integer number")
+}
+
+stranded <- if(stranded == "no") 0 else if(stranded == "yes") 1 else 2
+
+## end command line parsing
+##
+########################################
+
+########################################
+##
+## analyze duprates and create plots
+##
+cat("Processing file ", bam, " with GTF ", gtf, "\n")
+
+## calculate duplication rate matrix
+dm <- analyzeDuprates(bam,
+                      gtf,
+                      stranded,
+                      (paired == "yes"),
+                      threads)
+
+## produce plots
+
+## duprate vs. expression smooth scatter
+png(file=paste0(outdir,"/",gsub("(.*)\\.[^.]+","\\1",basename(bam)),"_dupRadar_drescatter.png"),
+    width=1000, height=1000)
+duprateExpDensPlot(dm, main=basename(bam))
+dev.off()
+
+## expression histogram
+png(file=paste0(outdir,"/",gsub("(.*)\\.[^.]+","\\1",basename(bam)),"_dupRadar_ehist.png"),
+    width=1000, height=1000)
+expressionHist(dm)
+dev.off()
+
+## duprate vs. expression boxplot
+png(file=paste0(outdir,"/",gsub("(.*)\\.[^.]+","\\1",basename(bam)),"_dupRadar_drebp.png"),
+    width=1000, height=1000)
+par(mar=c(10,4,4,2)+.1)
+duprateExpBoxplot(dm, main=basename(bam))
+dev.off()
+```
+
+## Citing dupRadar
+
+Please consider citing dupRadar if used in support of your own research:
+
+```{r citation}
+citation("dupRadar")
+```
+
+## Reporting problems or bugs
+
+If you run into problems using dupRadar, the [Bioconductor Support site](https://support.bioconductor.org/) is a good first place to ask for help. If you think there is a bug or an unreported feature, you can report it using the [dupRadar Github site](https://github.com/ssayols/dupRadar).
+
+# Session info
+
+The following package and versions were used in the production of this vignette.
+
+```{r echo=FALSE}
+sessionInfo()
+```
\ No newline at end of file
diff --git a/src/dupradar/script.sh b/src/dupradar/script.sh
index c84440a7..6ac70bf0 100755
--- a/src/dupradar/script.sh
+++ b/src/dupradar/script.sh
@@ -5,6 +5,7 @@
 
 set -eo pipefail 
 
+
 function num_strandness {
     if [ $par_strandedness == 'unstranded' ]; then echo 0
     elif [ $par_strandedness == 'forward' ]; then echo 1
@@ -14,12 +15,16 @@ function num_strandness {
     fi
 }
 
-Rscript "$meta_resources_dir/script.R" \
+
+if [ $par_paired ]; then paired=TRUE; else paired=FALSE; fi
+
+
+Rscript "$meta_resources_dir/dupradar.r" \
     $par_input \
     $par_id \
     $par_gtf_annotation \
     $(num_strandness) \
-    $par_paired \
+    ${paired} \
     ${meta_cpus:-1}
 
 mv "$par_id"_dupMatrix.txt $par_output_dupmatrix
@@ -31,8 +36,4 @@ mv "$par_id"_expressionHist.pdf $par_output_expression_histogram
 mv "$par_id"_intercept_slope.txt $par_output_intercept_slope
 
 
-dupradar_ver=$(Rscript -e "library(dupRadar); cat(as.character(packageVersion('dupRadar')))")
-text="bioconductor-dupradar: ${dupradar_ver}"
-echo "$text"
-
 exit 0
diff --git a/src/dupradar/test.sh b/src/dupradar/test.sh
index 8a85af27..c4afd67a 100755
--- a/src/dupradar/test.sh
+++ b/src/dupradar/test.sh
@@ -58,14 +58,11 @@ echo ">> Check if output is empty"
     && echo "Output file $output_intercept_slope is empty" && exit 1
 
 echo ">> Check if output is correct"
-cat "$output_dupmatrix"
-cat "$output_dup_intercept_mqc"
-cat "$output_intercept_slope"
 # diff ignoring white spaces
-diff -B -b "test_dupMatrix.pdf" "${meta_resources_dir}/test_data/test_dupMatrix.txt" || 
-    (echo "Output file $output_dupmatrix is not correct" && exit 1)
-diff -B -b "$output_dup_intercept_mqc" "${meta_resources_dir}/test_data/test_duprateExpDensCurve_mqc.txt" || \
-    (echo "Output file $output_dup_intercept_mqc is not correct" && exit 1)
+diff -B -b "$meta_resources_dir/test_data/test_dupMatrix.txt" "${meta_resources_dir}/test_data/test_dupMatrix.txt" || \
+    { echo "Output file $output_dupmatrix is not correct"; exit 1; }
+diff -B -b "$output_duprate_exp_denscurve_mqc" "${meta_resources_dir}/test_data/test_duprateExpDensCurve_mqc.txt" || \
+    { echo "Output file $output_duprate_exp_denscurve_mqc is not correct"; exit 1; }
 diff -B -b "$output_intercept_slope" "${meta_resources_dir}/test_data/test_intercept_slope.txt" || \
-    (echo "Output file $output_intercept_slope is not correct" && exit 1)
+    { echo "Output file $output_intercept_slope is not correct"; exit 1; }
 exit 0
\ No newline at end of file
diff --git a/src/dupradar/test_data/test_dupMatrix.txt b/src/dupradar/test_data/test_dupMatrix.txt
index 8ccda72e..9035e993 100644
--- a/src/dupradar/test_data/test_dupMatrix.txt
+++ b/src/dupradar/test_data/test_dupMatrix.txt
@@ -1,4 +1,4 @@
-ID	geneLength	allCountsMulti	filteredCountsMulti	dupRateMulti	dupsPerIdMulti	RPKMulti	PKMMulti	allCounts	filteredCounts	dupRate	dupsPerId	RPK	RPKM
+ID	geneLength	allCountsMulti	filteredCountsMulti	dupRateMulti	dupsPerIdMulti	RPKMulti	RPKMMulti	allCounts	filteredCounts	dupRate	dupsPerId	RPK	RPKM
 WASH7P	1769	41	41	0	0	23.1769361221029	188430.374976446	1	1	0	0	0.565291124929339	4595.86280430357
 FAM138A	2260	0	0	NA	0	0	0	0	0	NA	0	0	0
 FAM138F	2260	0	0	NA	0	0	0	0	0	NA	0	0	0
diff --git a/src/dupradar/test_data/test_dup_intercept_mqc.txt b/src/dupradar/test_data/test_dup_intercept_mqc.txt
deleted file mode 100644
index 0d0b4e1a..00000000
--- a/src/dupradar/test_data/test_dup_intercept_mqc.txt
+++ /dev/null
@@ -1,39 +0,0 @@
-#id: DupInt
-#plot_type: 'generalstats'
-#pconfig:
-#    dupRadar_intercept:
-#        title: 'dupInt'
-#        namespace: 'DupRadar'
-#        description: 'Intercept value from DupRadar'
-#        max: 100
-#        min: 0
-#        scale: 'RdYlGn-rev'
-#        format: '{:.2f}%'
-Sample dupRadar_intercept
-test 5.8262146393079e-11
-#id: DupInt
-#plot_type: 'generalstats'
-#pconfig:
-#    dupRadar_intercept:
-#        title: 'dupInt'
-#        namespace: 'DupRadar'
-#        description: 'Intercept value from DupRadar'
-#        max: 100
-#        min: 0
-#        scale: 'RdYlGn-rev'
-#        format: '{:.2f}%'
-Sample dupRadar_intercept
-test 5.8262146393079e-11
-#id: DupInt
-#plot_type: 'generalstats'
-#pconfig:
-#    dupRadar_intercept:
-#        title: 'dupInt'
-#        namespace: 'DupRadar'
-#        description: 'Intercept value from DupRadar'
-#        max: 100
-#        min: 0
-#        scale: 'RdYlGn-rev'
-#        format: '{:.2f}%'
-Sample dupRadar_intercept
-test 5.8262146393079e-11