diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e32edb1..e36fa5e0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,8 @@ * `rsem/rsem_calculate_expression`: Calculate expression levels (PR #93). +* `dupradar`: Assessment of duplication rates in RNA-Seq datasets (PR #154). + * `rseqc`: - `rseqc/rseqc_inner_distance`: Calculate inner distance between read pairs (PR #159). - `rseqc/rseqc_inferexperiment`: Infer strandedness from sequencing reads (PR #158). @@ -29,6 +31,7 @@ * `cutadapt`: Fix the the non-functional `action` parameter (PR #161). + ## MINOR CHANGES * `agat_convert_bed2gff`: change type of argument `inflate_off` from `boolean_false` to `boolean_true` (PR #160). diff --git a/src/dupradar/config.vsh.yaml b/src/dupradar/config.vsh.yaml new file mode 100644 index 00000000..52063368 --- /dev/null +++ b/src/dupradar/config.vsh.yaml @@ -0,0 +1,103 @@ +name: dupradar +description: Assessment of duplication rates in RNA-Seq datasets +keywords: [ rnaseq, duplication, genomics ] +links: + homepage: https://bioconductor.org/packages/release/bioc/html/dupRadar.html + documentation: https://bioconductor.org/packages/release/bioc/vignettes/dupRadar/inst/doc/dupRadar.html + repository: https://github.com/ssayols/dupRadar +references: + doi: "10.1186/s12859-016-1276-2" +license: GPL-3.0 + +argument_groups: +- name: Input + arguments: + - name: --id + type: string + description: Sample ID + - name: --input + type: file + required: true + description: Path to input alignment file in BAM format + - name: --gtf_annotation + type: file + required: true + description: Path to GTF annotation file. + - name: --paired + type: boolean_true + description: Add flag if input alignment file consists of paired reads + - name: --strandedness + type: integer + choices: [0, 1, 2] + description: | + Strandedness of input bam file reads (1 = forward, 2 = reverse or 0 = unstranded (default, + applicable to paired reads)) + example: 0 + +- name: Output + arguments: + - name: --output_dupmatrix + type: file + direction: output + description: path to output file (txt) of duplicate tag counts + example: dup_matrix.txt + - name: --output_dup_intercept_mqc + type: file + direction: output + description: path to output file (txt) of multiqc intercept value DupRadar + example: dup_intercept_mqc.txt + - name: --output_duprate_exp_boxplot + type: file + direction: output + description: | + Path to output file (pdf) of distribution of expression box plot + - name: --output_duprate_exp_densplot + type: file + direction: output + description: | + Path to output file (pdf) of 2D density scatter plot of duplicate tag counts + example: duprate_exp_densityplot.pdf + - name: --output_duprate_exp_denscurve_mqc + type: file + direction: output + description: | + Path to output file (pdf) of density curve of gene duplication multiqc + example: duprate_exp_density_curve_mqc.txt + - name: --output_expression_histogram + type: file + direction: output + description: | + Path to output file (pdf) of distribution of RPK values per gene histogram + example: expression_hist.pdf + - name: --output_intercept_slope + type: file + direction: output + description: output file (txt) with progression of duplication rate values + example: intercept_slope.txt + +resources: + # Adapted from https://github.com/nf-core/rnaseq/blob/3.12.0/bin/dupradar.r + - type: r_script + path: dupradar.r +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data + +engines: + - type: docker + image: ubuntu:22.04 + setup: + - type: apt + packages: [ r-base ] + - type: r + bioc: [ dupRadar ] + packages: [ parallel, rlang ] + - type: docker + run: | + echo "DupRadar: $(Rscript -e "library(dupRadar); cat(as.character(packageVersion('dupRadar')))")" > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow diff --git a/src/dupradar/dupradar.r b/src/dupradar/dupradar.r new file mode 100755 index 00000000..f813e902 --- /dev/null +++ b/src/dupradar/dupradar.r @@ -0,0 +1,192 @@ +#!/usr/bin/env Rscript + +## VIASH START +par <- list( + "input" = 'test_data/sample.bam', + "id" = 'test', + "gtf_annotation" = 'test_data/genes.gtf', + "strandedness" = 1, + "paired" = TRUE, + "threads" = 1, + "output_dupmatrix" = "dup_matrix.txt", + "output_dup_intercept_mqc" = "dup_intercept_mqc.txt", + "output_duprate_exp_boxplot" = "duprate_exp_boxplot.pdf", + "output_duprate_exp_densplot" = "duprate_exp_densityplot.pdf", + "output_duprate_exp_denscurve_mqc" = "duprate_exp_density_curve_mqc.pdf", + "output_expression_histogram" = "expression_hist.pdf", + "output_intercept_slope" = "intercept_slope.txt" +) +## VIASH END + +library(rlang) + +if (length(par) < 5) { + stop("Usage: dupRadar.r ", call.=FALSE) +} + + +input_bam <- par$input +output_prefix <- par$id +annotation_gtf <- par$gtf_annotation +stranded <- par$strandedness %||% 0L +paired_end <- ifelse(par$paired, TRUE, FALSE) +threads <- par$threads %||% 1L + +output_dupmatrix <- par$output_dupmatrix %||% paste0(output_prefix, "_dupMatrix.txt") +output_dup_intercept_mqc <- par$output_dup_intercept_mqc %||% paste0(output_prefix, "_dup_intercept_mqc.txt") +output_duprate_exp_boxplot <- par$output_duprate_exp_boxplot %||% paste0(output_prefix, "_duprateExpBoxplot.pdf") +output_duprate_exp_densplot <- par$output_duprate_exp_densplot %||% paste0(output_prefix, "_duprate_exp_densplot.pdf") +output_duprate_exp_denscurve_mqc <- par$output_duprate_exp_denscurve_mqc %||% paste0(output_prefix, "_duprateExpDensCurve_mqc.txt") +output_expression_histogram <- par$output_expression_histogram %||% paste0(output_prefix, "_expressionHist.pdf") +output_intercept_slope <- par$output_intercept_slope %||% paste0(output_prefix, "_intercept_slope.txt") + +bamRegex <- "(.+)\\.bam$" + + +if(!(grepl(bamRegex, input_bam) && file.exists(input_bam) && (!file.info(input_bam)$isdir))) stop("First argument '' must be an existing file (not a directory) with '.bam' extension...") +if(!(file.exists(annotation_gtf) && (!file.info(annotation_gtf)$isdir))) stop("Third argument '' must be an existing file (and not a directory)...") +if(is.na(stranded) || (!(stranded %in% (0:2)))) stop("Fourth argument must be a numeric value in 0(unstranded)/1(forward)/2(reverse)...") +if(is.na(threads) || (threads<=0)) stop("Fifth argument must be a strictly positive numeric value...") + +# Debug messages (stderr) +message("Input bam (Arg 1): ", input_bam) +message("Output basename(Arg 2): ", output_prefix) +message("Input gtf (Arg 3): ", annotation_gtf) +message("Strandness (Arg 4): ", c("unstranded", "forward", "reverse")[stranded]) +message("paired_end (Arg 5): ", paired_end) +message("Nb threads (Arg 6): ", threads) +message("R package loc. (Arg 7): ", ifelse(length(args) > 4, args[5], "Not specified")) + + +# Load / install packages +if (length(args) > 5) { .libPaths( c( args[6], .libPaths() ) ) } +if (!require("dupRadar")) { + if (!requireNamespace("BiocManager", quietly = TRUE)) { + install.packages("BiocManager") + } + BiocManager::install("dupRadar", update = TRUE, ask=FALSE) + library("dupRadar") +} +if (!require("parallel")) { + library("parallel") +} + + +# Duplicate stats +dm <- analyzeDuprates(input_bam, annotation_gtf, stranded, paired_end, threads) +print("analyzeDuprates done") +write.table(dm, file=output_dupmatrix, quote=F, row.name=F, sep="\t") +print("write.table done") + +# 2D density scatter plot +pdf(output_duprate_exp_densplot) +print("pdf done") +duprateExpDensPlot(DupMat=dm) +title("Density scatter plot") +mtext(output_prefix, side=3) +dev.off() +print("duprateExpDensPlot done") +fit <- duprateExpFit(DupMat=dm) +print("duprateExpFit done") +cat( + paste("- dupRadar Int (duprate at low read counts):", fit$intercept), + paste("- dupRadar Sl (progression of the duplication rate):", fit$slope), + fill=TRUE, labels=output_prefix, + file=output_intercept_slope, append=FALSE +) + +# Create a multiqc file dupInt +sample_name <- gsub("Aligned.sortedByCoord.out.markDups", "", output_prefix) +line <- "#id: DupInt +#plot_type: 'generalstats' +#pconfig: +# dupRadar_intercept: +# title: 'dupInt' +# namespace: 'DupRadar' +# description: 'Intercept value from DupRadar' +# max: 100 +# min: 0 +# scale: 'RdYlGn-rev' +# format: '{:.2f}%' +Sample dupRadar_intercept" + +write(line,file=output_dup_intercept_mqc, append=TRUE) +write(paste(sample_name, fit$intercept),file=output_dup_intercept_mqc, append=TRUE) +print("write dup_intercept_mqc done") + +# Get numbers from dupRadar GLM +curve_x <- sort(log10(dm$RPK)) +curve_y <- 100*predict(fit$glm, data.frame(x=curve_x), type="response") +# Remove all of the infinite values +infs = which(curve_x %in% c(-Inf,Inf)) +curve_x <- curve_x[-infs] +curve_y <- curve_y[-infs] +# Reduce number of data points +curve_x <- curve_x[seq(1, length(curve_x), 10)] +curve_y <- curve_y[seq(1, length(curve_y), 10)] +# Convert x values back to real counts +curve_x <- 10^curve_x +# Write to file +line <- "#id: dupradar +#section_name: 'DupRadar' +#section_href: 'bioconductor.org/packages/release/bioc/html/dupRadar.html' +#description: \"provides duplication rate quality control for RNA-Seq datasets. Highly expressed genes can be expected to have a lot of duplicate reads, but high numbers of duplicates at low read counts can indicate low library complexity with technical duplication. +# This plot shows the general linear models - a summary of the gene duplication distributions. \" +#pconfig: +# title: 'DupRadar General Linear Model' +# xLog: True +# xlab: 'expression (reads/kbp)' +# ylab: '% duplicate reads' +# ymax: 100 +# ymin: 0 +# tt_label: '{point.x:.1f} reads/kbp: {point.y:,.2f}% duplicates' +# xPlotLines: +# - color: 'green' +# dashStyle: 'LongDash' +# label: +# style: {color: 'green'} +# text: '0.5 RPKM' +# verticalAlign: 'bottom' +# y: -65 +# value: 0.5 +# width: 1 +# - color: 'red' +# dashStyle: 'LongDash' +# label: +# style: {color: 'red'} +# text: '1 read/bp' +# verticalAlign: 'bottom' +# y: -65 +# value: 1000 +# width: 1" + +write(line,file=output_duprate_exp_denscurve_mqc, append=TRUE) +write.table( + cbind(curve_x, curve_y), + file=output_duprate_exp_denscurve_mqc, + quote=FALSE, row.names=FALSE, col.names=FALSE, append=TRUE, +) +print("write duprateExpDensCurve_mqc done") +# Distribution of expression box plot +pdf(output_duprate_exp_boxplot) +duprateExpBoxplot(DupMat=dm) +title("Percent Duplication by Expression") +mtext(output_prefix, side=3) +dev.off() +print("duprateExpBoxplot done") +# Distribution of RPK values per gene +pdf(output_expression_histogram) +expressionHist(DupMat=dm) +title("Distribution of RPK values per gene") +mtext(output_prefix, side=3) +dev.off() +print("expressionHist done") + + +# + +# Print sessioninfo to standard out +print(output_prefix) +# citation("dupRadar") +# print("citation done") +# sessionInfo() diff --git a/src/dupradar/help.txt b/src/dupradar/help.txt new file mode 100644 index 00000000..819c352d --- /dev/null +++ b/src/dupradar/help.txt @@ -0,0 +1,583 @@ +--- +title: "Using the dupRadar package" +author: "Sergi Sayols, Holger Klein" +date: "June 23, 2015" +output: + BiocStyle::html_document: + toc: true +--- + + +```{r pre,echo=FALSE,results='hide'} +library(knitr) +opts_chunk$set(warning=FALSE,message=FALSE,cache=TRUE) +``` +```{r style,echo=FALSE,results='asis'} +BiocStyle::markdown() +``` + + + +# Introduction to dupRadar + +RNA-Seq experiments are a common strategy nowadays to quantify the gene +expression levels in cells. Due to its higher sensitivity compared to +arrays and the ability to discover novel features, makes them the choice for +most modern experiments. + +In RNA-Seq - as in all other NGS applications - quality control is essential. +Current NGS workflows usually involve PCR steps at some point, which involves +the danger of PCR artefacts and over-amplification of reads. +For common DNA-based assays PCR duplicates are often simply removed before +further analysis and their overall fraction or the read multiplicity taken +as quality metrics. +In RNA-Seq however, apart from the technical PCR duplicates, there is a strong +source for biological read duplicates: for a given gene length and sequencing +depth there exists an expression level beyond which it is not possible to add +place more reads inside a gene locus without placing a second read exactly +on the position of an already existing read. +For this reason the overall duplication rate is not a useful measure for +RNA-Seq. + +As in the NGS/RNA-Seq QC ecosystem there were no suitable tools to address this +question, we set out to develop a new tool. The `r Rpackage("dupRadar")` +package gives an insight into the duplication problem by graphically +relating the gene expression level and the duplication rate present on it. +Thus, failed experiments can be easily identified at a glance. + + +**Note: by now RNA-Seq protocols have matured so that for bulk RNA protocols data +rarely suffer from technical duplicates. With newer low-input or single cell +RNA-Seq protocols technical duplicates possible problems are worth to be +checked for by default, especially if protocols are pushed to or beyond +their boundaries. +Paired-end libraries make the distinction between duplicates due to highly +expressed genes and PCR duplicates easier, but the problem itself is not +completely solved, especially at higher sequencing depths.** + +# Getting started using dupRadar + +## Preparing your data + +Previous to running the duplication rate analysis, the BAM file with your +mapped reads has to be duplicate marked with a like Picard, or the faster +BamUtil dedup BioBamBam. +The `r Rpackage("dupRadar")` package only works with duplicate marked BAM files. + +If you do not have/want a duplication marking step in your default pipeline, +the `r Rpackage("dupRadar")` package includes, for your convenience, +wrappers to properly call some of these tools from your R session. Note that +you still have to supply the path of the dupmarker installation though: + +```{r loadLibrary} +library(dupRadar) +``` + +Now, simply call the wrapper: + +```{r eval=FALSE} +# call the duplicate marker and analyze the reads +bamDuprm <- markDuplicates(dupremover="bamutil", + bam="test.bam", + path="/opt/bamUtil-master/bin", + rminput=FALSE) +``` + +Simply specify which tool to use, the path where this tool is installed, +and the input bam file to be analyzed. After marking duplicates, it's safe +to remove to original BAM file in order to save space. + +The `r Rpackage("dupRadar")` package currently comes with support for: + +- [Picard MarkDuplicates](http://broadinstitute.github.io/picard) +- [BamUtil](http://genome.sph.umich.edu/wiki/BamUtil) + +After the BAM file is marked for duplicates, dupRadar is ready to +analyze how the duplication rate is related with the estimated gene expression +levels. + +## A GTF file + +Unless there is any specific reason, dupRadar can use the same GTF file +that will be later used to count the reads falling on features. + +A valid GTF file can be obtained from UCSC, Ensembl, the iGenomes or other +projects. + +Note that the resulting duplication rate plots depend on the GTF annotation +used. GTF files from the gencode projects result in a less clear picture of +duplication rates, as there are many more features and feature types +annotated, which overlap heavily as well. In some cases creating the plots +only using subsets of gencode annotation files (e.g. just protein +coding genes) serve the QC purpose of this tool better. + +## AnnotationHub as a source of GTF files + +The Bioconductor `r Rpackage("AnnotationHub")` package provides an alternative +approach to obtain annotation in GTF format from entities such as Ensembl, UCSC, +ENCODE, and 1000 Genomes projects. + +This is partly outlined in the AnnotationHub 'HOWTO' vignette +section "Ensembl GTF and FASTA files for TxDb gene models and sequence +queries"; for the Takifugu example, the downloaded GTF file is +available from the cache. + +```{r} +if(suppressWarnings(require(AnnotationHub))) { + ah = AnnotationHub() + query(ah, c("ensembl", "80", "Takifugu", "gtf")) # discovery + cache(ah["AH47101"]) # retrieve / file path +} +``` + +# dupRate demo data + +In the package we include two precomputed duplication matrices for two RNASeq +experiments used as examples of a good and a failed (in terms of high redundancy +of reads) experiments. The experiments come from the ENCODE project, as a source +of a wide variety of protocols, library types and sequencing facilities. + +Load the example dataset with: + +```{r} +attach(dupRadar_examples) +``` + +# The duplication rate analysis + +The analysis requires some info about the sequencing run and library +preparation protocol: + +- The strandess information about the reads: is the sequencing strand +specific? if so, are the reads reversely sequenced? +- Are the reads paired, or single? + +Due to its phenomenal performance, internally we use the featureCounts() +function from the Bioconductor `r Biocpkg("Rsubread")` package, which also +supports multiple threads. + +```{r eval=FALSE} +# The call parameters: +bamDuprm <- "test_duprm.bam" # the duplicate marked bam file +gtf <- "genes.gtf" # the gene model +stranded <- 2 # '0' (unstranded), '1' (stranded) and '2' (reversely stranded) +paired <- FALSE # is the library paired end? +threads <- 4 # number of threads to be used + +# Duplication rate analysis +dm <- analyzeDuprates(bamDuprm,gtf,stranded,paired,threads) +``` + +The duplication matrix contains read counts in different scenarios and RPK and RPKM values for every gene. + +|ID | geneLength| allCountsMulti| filteredCountsMulti| dupRateMulti| dupsPerIdMulti| RPKMulti| RPKMMulti| allCounts| filteredCounts| dupRate| dupsPerId| RPK| RPKM| +|:------|----------:|--------------:|-------------------:|------------:|--------------:|------------:|----------:|---------:|--------------:|---------:|---------:|-----------:|----------:| +|Xkr4 | 3634| 2| 2| 0.0000000| 0| 0.5503577| 0.0117159| 2| 2| 0.0000000| 0| 0.5503577| 0.0117159| +|Rp1 | 9747| 0| 0| NaN| 0| 0.0000000| 0.0000000| 0| 0| NaN| 0| 0.0000000| 0.0000000| +|Sox17 | 3130| 0| 0| NaN| 0| 0.0000000| 0.0000000| 0| 0| NaN| 0| 0.0000000| 0.0000000| +|Mrpl15 | 4203| 429| 261| 0.3916084| 168| 102.0699500| 2.1728436| 419| 258| 0.3842482| 161| 99.6906971| 2.1221946| +|Lypla1 | 2433| 1106| 576| 0.4792043| 530| 454.5828196| 9.6770634| 1069| 562| 0.4742750| 507| 439.3752569| 9.3533280| +|Tcea1 | 2847| 2890| 1046| 0.6380623| 1844| 1015.1036178| 21.6093121| 1822| 696| 0.6180022| 1126| 639.9719002| 13.6235871| + +# Plotting and interpretation + +The number of reads per base assigned to a gene in an ideal RNA-Seq data set +is expected to be proportional to the abundance of its transcripts in the +sample. For lowly expressed genes we expect read duplication to happen rarely by +chance, while for highly expressed genes - depending on the total sequencing +depth - we expect read duplication to happen often. + +A good way to learn if a dataset is following this trend is by relating the +normalized number of counts per gene (RPK, as a quantification of the gene +expression) and the fraction represented by duplicated reads. + +The `r Rpackage("dupRadar")` offers several functions to assess this +relationship. The most prominent may be the 2d density scatter plot: + +```{r fig.width=14,fig.height=7} +## make a duprate plot (blue cloud) +par(mfrow=c(1,2)) +duprateExpDensPlot(DupMat=dm) # a good looking plot +title("good experiment") +duprateExpDensPlot(DupMat=dm.bad) # a dataset with duplication problems +title("duplication problems") + +## duprate boxplot +duprateExpBoxplot(DupMat=dm) # a good looking plot +duprateExpBoxplot(DupMat=dm.bad) # a dataset with duplication problems +``` + +The *duprateExpDensPlot* has helper lines at the threshold of 1 read/bp and at +0.5 RPKM. Moreover by default a generalized linear model is fit to the data and +overplotted (see also below). + +## Fitting a model into the data + +To summarize the relationship between duplication rates and gene expression, +we propose fitting a logistic regression curve onto the data. With the +coefficients of the fitted model, one can get an idea of the initial +duplication rate at low read counts (Intercept) and the progression of the +duplication rate along with the progression of the read counts (Slope). + + +```{r fig.width=7,fig.height=7} +duprateExpDensPlot(DupMat=dm) + +# or, just to get the fitted model without plot +fit <- duprateExpFit(DupMat=dm) +cat("duprate at low read counts: ",fit$intercept,"\n", + "progression of the duplication rate: ",fit$slope,fill=TRUE) +``` + +Our main use case for that function is the condensation +of the plots into quality metrics that can be used for automatic flagging of +possibly problematic samples in large experiments or aggregation with other +quality metrics in large tables to analyse interdependencies. + + +The *duprateExpBoxplot* plot shows the range of the duplication rates at 5% +bins (default) along the distribution of RPK gene counts. The x-axis displays +the quantile of the RPK distribution, and the average RPK of the genes +contained in this quantile. + +Individual genes can be identified in the plot: + +```{r eval=FALSE} +## INTERACTIVE: identify single points on screen (name="ID" column of dm) +duprateExpPlot(DupMat=dm) # a good looking plot +duprateExpIdentify(DupMat=dm) +``` + +One can also call the function *duprateExpPlot* to get smooth color density +representation of the same data: + +```{r fig.width=14,fig.height=7} +par(mfrow=c(1,2)) +cols <- colorRampPalette(c("black","blue","green","yellow","red")) +duprateExpPlot(DupMat=dm,addLegend=FALSE) +duprateExpPlot(DupMat=dm.bad,addLegend=FALSE,nrpoints=10,nbin=500,colramp=cols) +``` + +Any further parm sent to the *duprateExpPlot* is also sent to the +*smoothScatter* function. + +## Comparing the fitted parameters to other datasets + +The parameters of the fitted model may mean very little (or just nothing) for +many, unless there's other data to compare with. We provide the pre-computed +duplication matrices for all the RNA-Seq experiments publicly available from +the ENCODE project, for human and mouse. + +![alt text](figures/encode.svg) + +With the experience from the ENCODE datasets, we expect from single read +experiments little duplication at low RPKM (low **intercept**) rapidly rising +because of natural duplication (high **slope**). In contrast, paired-end +experiments have a more mild rising of the natural duplication (low **slope**) +due to having higher diversity of reads pairs since pairs with same start may +still have different end. + +The common denominator for both designs is the importance of having a low +intercept, suggesting that duplication rate at lowly expressed genes may serve +as a quality measure. + +All the pre-computed duplication matrices are available in the [dupRadar Github site](https://github.com/ssayols/dupRadar). + + +## Other plots + +**CAVEAT: Sometimes in discussions duplicate reads (i.e. two physically +different reads are mapped to the exact same position) and multi-mapping +reads (i.e. a single read is mapped to two or more locations in the genome) +are mixed up. `r Rpackage("dupRadar")`'s main focus are PCR duplicates in RNA-Seq. +However internally we keep track of unique mappers and multimappers, and we use +both in some of the examples, to illustrate use cases of our package beyond +the main aim. Multi-mapping reads are completely independent of PCR duplicates.** + +Apart from the plots relating RPK and duplication rate, the +`r Rpackage("dupRadar")` package provides some other useful plots to extract +information about the gene expression levels. + +An interesting quality metric are the fraction of reads taken up by groups of +genes binned by 5% expression levels. +```{r fig.width=7,fig.height=7} +readcountExpBoxplot(DupMat=dm) +``` + +In the example we see that the 5% of +highest expressed genes in our sample data set take up around 60% of all reads. + +The distribution of RPK values per genes can be plotted with: + +```{r fig.width=7,fig.height=7} +expressionHist(DupMat=dm) +``` + +This would help in identifying skewed distributions with unusual amount of +lowly expressed genes, or to detect no consensus between replicates. + + +# Other information deduced from the data + +The duplication rate matrix calculated by the function *analyzeDuprates()* +contains some useful information about the sequencing experiment, that can be +used to assess the quality of the data. + +## Fraction of multimappers per gene + +Analogous to per gene duplication rate, the fraction of mutimappers +can be easily calculated fom the duplication matrix. + +Taking the counts from the column *allCountsMulti*, and substracting form it +the counts from the column *allCounts*, one can get the total number of +multihits. Thus, the fraction of multihits per gene can be calculating then +dividing by *allCountsMulti*. + +```{r} +# calculate the fraction of multimappers per gene +dm$mhRate <- (dm$allCountsMulti - dm$allCounts) / dm$allCountsMulti +# how many genes are exclusively covered by multimappers +sum(dm$mhRate == 1, na.rm=TRUE) + +# and how many have an RPKM (including multimappers) > 5 +sum(dm$mhRate==1 & dm$RPKMMulti > 5, na.rm=TRUE) + +# and which are they? +dm[dm$mhRate==1 & dm$RPKMMulti > 5, "ID"] +``` + +We can also generate an overall picture about less extreme cases: +```{r fig.width=7,fig.height=7} +hist(dm$mhRate, + breaks=50, + col="red", + main="Frequency of multimapping rates", + xlab="Multimapping rate per gene", + ylab="Frequency") + +``` + +Also the direct comparison of reads per kilobase between uniquely and +multimappers is possible. + +```{r fig.width=7,fig.height=7} +# comparison of multi-mapping RPK and uniquely-mapping RPK +plot(log2(dm$RPK), + log2(dm$RPKMulti), + xlab="Reads per kb (uniquely mapping reads only)", + ylab="Reads per kb (all including multimappers, non-weighted)" +) +``` + +Use with identify() to annotate interesting cases interactively. +```{r, eval=F} +identify(log2(dm$RPK), + log2(dm$RPKMulti), + labels=dm$ID) +``` + +## Connection between possible PCR artefacts and GC content +In some cases we wondered about influence of GC content on PCR artefacts. An +easy way to check using our `r Rpackage("dupRadar")` package in conjunction +with `r Rpackage("biomaRt")` is demonstrated in the following. For simplicity +we use our demo data here in which we *do not* see a big influence. + + +```{r eval=F} +library(dupRadar) +library(biomaRt) + +## for detailed explanations on biomaRt, please see the respective +## vignette + +## set up biomart connection for mouse (needs internet connection) +ensm <- useMart("ensembl") +ensm <- useDataset("mmusculus_gene_ensembl", mart=ensm) + +## get a table which has the gene GC content for the IDs that have been +## used to generate the table (depends on the GTF annotation that you +## use) +tr <- getBM(attributes=c("mgi_symbol", "percentage_gc_content"), + values=TRUE, mart=ensm) + +## create a GC vector with IDs as element names +mgi.gc <- tr$percentage_gc_content +names(mgi.gc) <- tr$mgi_symbol + +## using dm demo duplication matrix that comes with the package +## add GC content to our demo data and keep only subset for which we can +## retrieve data +keep <- dm$ID %in% tr$mgi_symbol +dm.gc <- dm[keep,] +dm.gc$gc <- mgi.gc[dm.gc$ID] + +## check distribution of annotated gene GC content (in %) +boxplot(dm.gc$gc, main="Gene GC content", ylab="% GC") +``` + +![alt text](figures/GCcontent.svg) + +Now we can compare the dependence of duplication rate on expression level +independently for below and above median GC genes (and to mention again, +in this data set we don't have a big difference). + +```{r eval=F} +par(mfrow=c(1,2)) + +## below median GC genes +duprateExpDensPlot(dm.gc[dm.gc$gc<=45,], main="below median GC genes") + +## above median GC genes +duprateExpDensPlot(dm.gc[dm.gc$gc>=45,], main="above median GC genes") +``` + +![alt text](figures/GCdependence.png) + +# Conclusion + +The `r Rpackage("dupRadar")` package provides a framework for the analysis of +duplicate rates in RNAseq datasets. In addition, it includes a set of +convenient wrappers to correctly call some common tools used for marking PCR +duplicated reads in the BAM file. It's shipped as Bioconductor package in +order to offer a common framework for all the tools involved, and simplify its +use. + +# Including dupRadar into pipelines + +To include dupRadar as a single step in an RNA-Seq pipeline, integration into a short R-script can be done like in the following: + +```{r,eval=F} +#!/usr/bin/env Rscript + +######################################## +## +## dupRadar shell script +## call dupRadar R package from the shell for +## easy integration into pipelines +## +## Holger Klein & Sergi Sayols +## +## https://github.com/ssayols/dupRadar +## +## input: +## - _duplicate marked_ bam file +## - gtf file +## - parameters for duplication counting routine: +## stranded, paired, outdir, threads. +## +######################################## + +library(dupRadar) + +#################### +## +## get name patterns from command line +## +args <- commandArgs(TRUE) + +## the bam file to analyse +bam <- args[1] +## usually, same GTF file as used in htseq-count +gtf <- gsub("gtf=","",args[2]) +## no|yes|reverse +stranded <- gsub("stranded=","",args[3]) +## is a paired end experiment +paired <- gsub("paired=","",args[4]) +## output directory +outdir <- gsub("outdir=","",args[5]) +## number of threads to be used +threads <- as.integer(gsub("threads=","",args[6])) + +if(length(args) != 6) { + stop (paste0("Usage: ./dupRadar.sh ", + " paired=[yes|no] ", + "outdir=./ threads=1")) +} + +if(!file.exists(bam)) { + stop(paste("File",bam,"does NOT exist")) +} + +if(!file.exists(gtf)) { + stop(paste("File",gtf,"does NOT exist")) +} + +if(!file.exists(outdir)) { + stop(paste("Dir",outdir,"does NOT exist")) +} + +if(is.na(stranded) | !(grepl("no|yes|reverse",stranded))) { + stop("Stranded has to be no|yes|reverse") +} + +if(is.na(paired) | !(grepl("no|yes",paired))) { + stop("Paired has to be no|yes") +} + +if(is.na(threads)) { + stop("Threads has to be an integer number") +} + +stranded <- if(stranded == "no") 0 else if(stranded == "yes") 1 else 2 + +## end command line parsing +## +######################################## + +######################################## +## +## analyze duprates and create plots +## +cat("Processing file ", bam, " with GTF ", gtf, "\n") + +## calculate duplication rate matrix +dm <- analyzeDuprates(bam, + gtf, + stranded, + (paired == "yes"), + threads) + +## produce plots + +## duprate vs. expression smooth scatter +png(file=paste0(outdir,"/",gsub("(.*)\\.[^.]+","\\1",basename(bam)),"_dupRadar_drescatter.png"), + width=1000, height=1000) +duprateExpDensPlot(dm, main=basename(bam)) +dev.off() + +## expression histogram +png(file=paste0(outdir,"/",gsub("(.*)\\.[^.]+","\\1",basename(bam)),"_dupRadar_ehist.png"), + width=1000, height=1000) +expressionHist(dm) +dev.off() + +## duprate vs. expression boxplot +png(file=paste0(outdir,"/",gsub("(.*)\\.[^.]+","\\1",basename(bam)),"_dupRadar_drebp.png"), + width=1000, height=1000) +par(mar=c(10,4,4,2)+.1) +duprateExpBoxplot(dm, main=basename(bam)) +dev.off() +``` + +## Citing dupRadar + +Please consider citing dupRadar if used in support of your own research: + +```{r citation} +citation("dupRadar") +``` + +## Reporting problems or bugs + +If you run into problems using dupRadar, the [Bioconductor Support site](https://support.bioconductor.org/) is a good first place to ask for help. If you think there is a bug or an unreported feature, you can report it using the [dupRadar Github site](https://github.com/ssayols/dupRadar). + +# Session info + +The following package and versions were used in the production of this vignette. + +```{r echo=FALSE} +sessionInfo() +``` \ No newline at end of file diff --git a/src/dupradar/test.sh b/src/dupradar/test.sh new file mode 100755 index 00000000..99610357 --- /dev/null +++ b/src/dupradar/test.sh @@ -0,0 +1,107 @@ +#!/bin/bash + +# define input and output for script +input_bam="${meta_resources_dir}/test_data/sample.bam" +input_gtf="${meta_resources_dir}/test_data/genes.gtf" + +output_dupmatrix="dup_matrix.txt" +output_dup_intercept_mqc="dup_intercept_mqc.txt" +output_duprate_exp_boxplot="duprate_exp_boxplot.pdf" +output_duprate_exp_densplot="duprate_exp_densityplot.pdf" +output_duprate_exp_denscurve_mqc="duprate_exp_density_curve_mqc.txt" +output_expression_histogram="expression_hist.pdf" +output_intercept_slope="intercept_slope.txt" + +# Run executable +echo "> Running $meta_functionality_name for unpaired reads, writing to tmpdir $tmpdir." + +"$meta_executable" \ + --input "$input_bam" \ + --id "test" \ + --gtf_annotation "$input_gtf" \ + --strandedness 1 \ + --output_dupmatrix $output_dupmatrix \ + --output_dup_intercept_mqc $output_dup_intercept_mqc \ + --output_duprate_exp_boxplot $output_duprate_exp_boxplot \ + --output_duprate_exp_densplot $output_duprate_exp_densplot \ + --output_duprate_exp_denscurve_mqc $output_duprate_exp_denscurve_mqc \ + --output_expression_histogram $output_expression_histogram \ + --output_intercept_slope $output_intercept_slope + +exit_code=$? +[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1 + +echo ">> asserting output has been created for paired read input" +[ ! -f "$output_dupmatrix" ] && echo "$output_dupmatrix was not created" && exit 1 +[ ! -s "$output_dupmatrix" ] && echo "$output_dupmatrix is empty" && exit 1 +[ ! -f "$output_dup_intercept_mqc" ] && echo "$output_dup_intercept_mqc was not created" && exit 1 +[ ! -s "$output_dup_intercept_mqc" ] && echo "$output_dup_intercept_mqc is empty" && exit 1 +[ ! -f "$output_duprate_exp_boxplot" ] && echo "$output_duprate_exp_boxplot was not created" && exit 1 +[ ! -s "$output_duprate_exp_boxplot" ] && echo "$output_duprate_exp_boxplot is empty" && exit 1 +[ ! -f "$output_duprate_exp_densplot" ] && echo "$output_duprate_exp_densplot was not created" && exit 1 +[ ! -s "$output_duprate_exp_densplot" ] && echo "$output_duprate_exp_densplot is empty" && exit 1 +[ ! -f "$output_duprate_exp_denscurve_mqc" ] && echo "$output_duprate_exp_denscurve_mqc was not created" && exit 1 +[ ! -s "$output_duprate_exp_denscurve_mqc" ] && echo "$output_duprate_exp_denscurve_mqc is empty" && exit 1 +[ ! -f "$output_expression_histogram" ] && echo "$output_expression_histogram was not created" && exit 1 +[ ! -s "$output_expression_histogram" ] && echo "$output_expression_histogram is empty" && exit 1 +[ ! -f "$output_intercept_slope" ] && echo "$output_intercept_slope was not created" && exit 1 +[ ! -s "$output_intercept_slope" ] && echo "$output_intercept_slope is empty" && exit 1 + + +echo ">> Check if output is empty" +[ ! -s "$output_dupmatrix" ] \ + && echo "Output file $output_dupmatrix is empty" && exit 1 +[ ! -s "$output_dup_intercept_mqc" ] \ + && echo "Output file $output_dup_intercept_mqc is empty" && exit 1 +[ ! -s "$output_intercept_slope" ] \ + && echo "Output file $output_intercept_slope is empty" && exit 1 + +echo ">> Check if output is correct" +# diff ignoring white spaces +diff -B -b "$meta_resources_dir/test_data/test_dupMatrix.txt" "${meta_resources_dir}/test_data/test_dupMatrix.txt" || \ + { echo "Output file $output_dupmatrix is not correct"; exit 1; } +diff -B -b "$output_duprate_exp_denscurve_mqc" "${meta_resources_dir}/test_data/test_duprateExpDensCurve_mqc.txt" || \ + { echo "Output file $output_duprate_exp_denscurve_mqc is not correct"; exit 1; } +diff -B -b "$output_intercept_slope" "${meta_resources_dir}/test_data/test_intercept_slope.txt" || \ + { echo "Output file $output_intercept_slope is not correct"; exit 1; } + + +echo ">>> Test 2: Example without specified output files" + +"$meta_executable" \ + --input "$input_bam" \ + --id "test" \ + --gtf_annotation "$input_gtf" \ + --strandedness 1 + +exit_code=$? +[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1 + +echo ">> asserting output has been created for paired read input" +# output files with original names from DupRadar and id prefix +[ ! -f "test_dupMatrix.txt" ] && echo "test_dupMatrix.txt was not created" && exit 1 +[ ! -s "test_dupMatrix.txt" ] && echo "test_dupMatrix.txt is empty" && exit 1 +[ ! -f "test_dup_intercept_mqc.txt" ] && echo "test_dup_intercept_mqc.txt was not created" && exit 1 +[ ! -s "test_dup_intercept_mqc.txt" ] && echo "test_dup_intercept_mqc.txt is empty" && exit 1 +[ ! -f "test_duprateExpBoxplot.pdf" ] && echo "test_duprateExpBoxplot.pdf was not created" && exit 1 +[ ! -s "test_duprateExpBoxplot.pdf" ] && echo "test_duprateExpBoxplot.pdf is empty" && exit 1 +[ ! -f "test_duprate_exp_densplot.pdf" ] && echo "test_duprate_exp_densplot.pdf was not created" && exit 1 +[ ! -s "test_duprate_exp_densplot.pdf" ] && echo "test_duprate_exp_densplot.pdf is empty" && exit 1 +[ ! -f "test_duprateExpDensCurve_mqc.txt" ] && echo "test_duprateExpDensCurve_mqc.txt was not created" && exit 1 +[ ! -s "test_duprateExpDensCurve_mqc.txt" ] && echo "test_duprateExpDensCurve_mqc.txt is empty" && exit 1 +[ ! -f "test_expressionHist.pdf" ] && echo "test_expressionHist.pdf was not created" && exit 1 +[ ! -s "test_expressionHist.pdf" ] && echo "test_expressionHist.pdf is empty" && exit 1 +[ ! -f "test_intercept_slope.txt" ] && echo "test_intercept_slope.txt was not created" && exit 1 +[ ! -s "test_intercept_slope.txt" ] && echo "test_intercept_slope.txt is empty" && exit 1 + + +echo ">> Check if output is correct" +diff -B -b "test_dupMatrix.txt" "${meta_resources_dir}/test_data/test_dupMatrix.txt" || \ + { echo "Output file test_dupMatrix.txt is not correct"; exit 1; } +diff -B -b "test_duprateExpDensCurve_mqc.txt" "${meta_resources_dir}/test_data/test_duprateExpDensCurve_mqc.txt" || \ + { echo "Output file test_duprateExpDensCurve_mqc.txt is not correct"; exit 1; } +diff -B -b "test_intercept_slope.txt" "${meta_resources_dir}/test_data/test_intercept_slope.txt" || \ + { echo "Output file test_intercept_slope.txt is not correct"; exit 1; } + +echo ">>> All tests passed" +exit 0 \ No newline at end of file diff --git a/src/dupradar/test_data/genes.gtf b/src/dupradar/test_data/genes.gtf new file mode 100644 index 00000000..423ba8f8 --- /dev/null +++ b/src/dupradar/test_data/genes.gtf @@ -0,0 +1,36 @@ +chr1 unknown exon 14362 14829 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 14970 15038 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 15796 15947 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 16607 16765 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 16858 17055 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 17233 17368 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 17606 17742 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 17915 18061 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 18268 18366 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 24738 24891 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 29321 29370 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 34611 35174 . - . gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818_1"; tss_id "TSS8403"; +chr1 unknown exon 34611 35174 . - . gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820_1"; tss_id "TSS8403"; +chr1 unknown exon 35277 35481 . - . gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818_1"; tss_id "TSS8403"; +chr1 unknown exon 35277 35481 . - . gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820_1"; tss_id "TSS8403"; +chr1 unknown exon 35721 36081 . - . gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818_1"; tss_id "TSS8403"; +chr1 unknown exon 35721 36081 . - . gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820_1"; tss_id "TSS8403"; +chr1 unknown CDS 69091 70005 . + 0 gene_id "OR4F5"; gene_name "OR4F5"; p_id "P1230"; transcript_id "NM_001005484"; tss_id "TSS14428"; +chr1 unknown exon 69091 70008 . + . gene_id "OR4F5"; gene_name "OR4F5"; p_id "P1230"; transcript_id "NM_001005484"; tss_id "TSS14428"; +chr1 unknown start_codon 69091 69093 . + . gene_id "OR4F5"; gene_name "OR4F5"; p_id "P1230"; transcript_id "NM_001005484"; tss_id "TSS14428"; +chr1 unknown stop_codon 70006 70008 . + . gene_id "OR4F5"; gene_name "OR4F5"; p_id "P1230"; transcript_id "NM_001005484"; tss_id "TSS14428"; +chr1 unknown exon 134773 139696 . - . gene_id "LOC729737"; gene_name "LOC729737"; transcript_id "NR_039983"; tss_id "TSS18541"; +chr1 unknown exon 139790 139847 . - . gene_id "LOC729737"; gene_name "LOC729737"; transcript_id "NR_039983"; tss_id "TSS18541"; +chr1 unknown exon 140075 140566 . - . gene_id "LOC729737"; gene_name "LOC729737"; transcript_id "NR_039983"; tss_id "TSS18541"; +chr1 unknown exon 323892 324060 . + . gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322_1"; tss_id "TSS12303"; +chr1 unknown exon 324288 324345 . + . gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322_1"; tss_id "TSS12303"; +chr1 unknown exon 324439 328581 . + . gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322_1"; tss_id "TSS12303"; +chr19 unknown exon 76220 76783 . - . gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820"; tss_id "TSS16829"; +chr19 unknown exon 76220 76783 . - . gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818"; tss_id "TSS16829"; +chr19 unknown exon 76886 77090 . - . gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820"; tss_id "TSS16829"; +chr19 unknown exon 76886 77090 . - . gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818"; tss_id "TSS16829"; +chr19 unknown exon 77330 77690 . - . gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820"; tss_id "TSS16829"; +chr19 unknown exon 77330 77690 . - . gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818"; tss_id "TSS16829"; +chr5 unknown exon 180750507 180750675 . + . gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322"; tss_id "TSS11538"; +chr5 unknown exon 180750903 180750960 . + . gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322"; tss_id "TSS11538"; +chr5 unknown exon 180751054 180755196 . + . gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322"; tss_id "TSS11538"; diff --git a/src/dupradar/test_data/sample.bam b/src/dupradar/test_data/sample.bam new file mode 100644 index 00000000..04d8c95e Binary files /dev/null and b/src/dupradar/test_data/sample.bam differ diff --git a/src/dupradar/test_data/script.sh b/src/dupradar/test_data/script.sh new file mode 100644 index 00000000..2b1c1aa5 --- /dev/null +++ b/src/dupradar/test_data/script.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +wget https://github.com/ssayols/dupRadar/raw/master/inst/extdata/genes.gtf +wget https://github.com/ssayols/dupRadar/raw/master/inst/extdata/wgEncodeCaltechRnaSeqGm12878R1x75dAlignsRep2V2.bam + +# subsample the bam file +samtools view -s 0.02 -b wgEncodeCaltechRnaSeqGm12878R1x75dAlignsRep2V2.bam > sample.bam + +rm wgEncodeCaltechRnaSeqGm12878R1x75dAlignsRep2V2.bam \ No newline at end of file diff --git a/src/dupradar/test_data/test_dupMatrix.txt b/src/dupradar/test_data/test_dupMatrix.txt new file mode 100644 index 00000000..9035e993 --- /dev/null +++ b/src/dupradar/test_data/test_dupMatrix.txt @@ -0,0 +1,7 @@ +ID geneLength allCountsMulti filteredCountsMulti dupRateMulti dupsPerIdMulti RPKMulti RPKMMulti allCounts filteredCounts dupRate dupsPerId RPK RPKM +WASH7P 1769 41 41 0 0 23.1769361221029 188430.374976446 1 1 0 0 0.565291124929339 4595.86280430357 +FAM138A 2260 0 0 NA 0 0 0 0 0 NA 0 0 0 +FAM138F 2260 0 0 NA 0 0 0 0 0 NA 0 0 0 +OR4F5 918 0 0 NA 0 0 0 0 0 NA 0 0 0 +LOC729737 5474 18 18 0 0 3.28827183047132 26733.917320905 3 3 0 0 0.548045305078553 4455.65288681751 +LOC100132287 8740 39 39 0 0 4.46224256292906 36278.3948205615 1 1 0 0 0.11441647597254 930.215251809269 diff --git a/src/dupradar/test_data/test_duprateExpDensCurve_mqc.txt b/src/dupradar/test_data/test_duprateExpDensCurve_mqc.txt new file mode 100644 index 00000000..ca9f199e --- /dev/null +++ b/src/dupradar/test_data/test_duprateExpDensCurve_mqc.txt @@ -0,0 +1,33 @@ +#id: dupradar +#section_name: 'DupRadar' +#section_href: 'bioconductor.org/packages/release/bioc/html/dupRadar.html' +#description: "provides duplication rate quality control for RNA-Seq datasets. Highly expressed genes can be expected to have a lot of duplicate reads, but high numbers of duplicates at low read counts can indicate low library complexity with technical duplication. +# This plot shows the general linear models - a summary of the gene duplication distributions. " +#pconfig: +# title: 'DupRadar General Linear Model' +# xLog: True +# xlab: 'expression (reads/kbp)' +# ylab: '% duplicate reads' +# ymax: 100 +# ymin: 0 +# tt_label: '{point.x:.1f} reads/kbp: {point.y:,.2f}% duplicates' +# xPlotLines: +# - color: 'green' +# dashStyle: 'LongDash' +# label: +# style: {color: 'green'} +# text: '0.5 RPKM' +# verticalAlign: 'bottom' +# y: -65 +# value: 0.5 +# width: 1 +# - color: 'red' +# dashStyle: 'LongDash' +# label: +# style: {color: 'red'} +# text: '1 read/bp' +# verticalAlign: 'bottom' +# y: -65 +# value: 1000 +# width: 1 +0.11441647597254 5.82621463896864e-09 diff --git a/src/dupradar/test_data/test_intercept_slope.txt b/src/dupradar/test_data/test_intercept_slope.txt new file mode 100644 index 00000000..5294b8ba --- /dev/null +++ b/src/dupradar/test_data/test_intercept_slope.txt @@ -0,0 +1,2 @@ +test - dupRadar Int (duprate at low read counts): 5.8262146393079e-11 +test - dupRadar Sl (progression of the duplication rate): 0.999999999999966