diff --git a/src/dupradar/config.vsh.yaml b/src/dupradar/config.vsh.yaml new file mode 100644 index 00000000..cae6964f --- /dev/null +++ b/src/dupradar/config.vsh.yaml @@ -0,0 +1,117 @@ +name: dupradar +description: Assessment of duplication rates in RNA-Seq datasets +keywords: [ rnaseq, duplication, genomics ] +links: + homepage: https://bioconductor.org/packages/release/bioc/html/dupRadar.html + documentation: https://bioconductor.org/packages/release/bioc/vignettes/dupRadar/inst/doc/dupRadar.html + repository: https://github.com/ssayols/dupRadar +references: + doi: "10.1186/s12859-016-1276-2" +license: GPL-3.0 + +argument_groups: +- name: "Input" + arguments: + - name: "--id" + type: string + description: Sample ID + + - name: "--input" + type: file + required: true + description: path to input alignment file in BAM format + + - name: "--gtf_annotation" + type: file + required: true + description: path to GTF annotation file. + + - name: "--paired" + type: boolean + description: add flag if input alignment file consists of paired reads + + - name: "--strandedness" + type: string + required: false + choices: ["forward", "reverse", "unstranded"] + description: strandedness of input bam file reads (forward, reverse or unstranded (default, applicable to paired reads)) + + - name: "--versions" + type: file + must_exist: false + +- name: "Output" + arguments: + - name: "--output_dupmatrix" + type: file + direction: output + required: false + must_exist: true + default: $id.dup_matrix.txt + description: path to output file (txt) of duplicate tag counts + + - name: "--output_dup_intercept_mqc" + type: file + direction: output + required: false + must_exist: true + default: $id.dup_intercept_mqc.txt + description: path to output file (txt) of multiqc intercept value DupRadar + + - name: "--output_duprate_exp_boxplot" + type: file + direction: output + required: false + must_exist: true + default: $id.duprate_exp_boxplot.pdf + description: path to output file (pdf) of distribution of expression box plot + + - name: "--output_duprate_exp_densplot" + type: file + direction: output + required: false + must_exist: true + default: $id.duprate_exp_densityplot.pdf + description: path to output file (pdf) of 2D density scatter plot of duplicate tag counts + + - name: "--output_duprate_exp_denscurve_mqc" + type: file + direction: output + required: false + must_exist: true + default: $id.duprate_exp_density_curve_mqc.txt + description: path to output file (pdf) of density curve of gene duplication multiqc + + - name: "--output_expression_histogram" + type: file + direction: output + required: false + must_exist: true + default: $id.expression_hist.pdf + description: path to output file (pdf) of distribution of RPK values per gene histogram + + - name: "--output_intercept_slope" + type: file + direction: output + required: false + must_exist: true + default: $id.intercept_slope.txt + description: output file (txt) with progression of duplication rate values + +resources: + - type: bash_script + path: script.sh + # Copied from https://github.com/nf-core/rnaseq/blob/3.12.0/bin/dupradar.r + - type: R_script + path: script.R + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: quay.io/biocontainers/bioconductor-dupradar:1.32.0--r43hdfd78af_0 +runners: + - type: executable + - type: nextflow diff --git a/src/dupradar/help.txt b/src/dupradar/help.txt new file mode 100644 index 00000000..e69de29b diff --git a/src/dupradar/script.R b/src/dupradar/script.R new file mode 100644 index 00000000..82218c6e --- /dev/null +++ b/src/dupradar/script.R @@ -0,0 +1,154 @@ +#!/usr/bin/env Rscript + +# Command line argument processing +args = commandArgs(trailingOnly=TRUE) +if (length(args) < 5) { + stop("Usage: dupRadar.r ", call.=FALSE) +} + +message("paired_end is", args[5]) +message("the type is is", class(args[5])) + +input_bam <- args[1] +output_prefix <- args[2] +annotation_gtf <- args[3] +stranded <- as.numeric(args[4]) +paired_end <- ifelse(args[5] == "true", TRUE, FALSE) +threads <- as.numeric(args[6]) + +bamRegex <- "(.+)\\.bam$" + +if(!(grepl(bamRegex, input_bam) && file.exists(input_bam) && (!file.info(input_bam)$isdir))) stop("First argument '' must be an existing file (not a directory) with '.bam' extension...") +if(!(file.exists(annotation_gtf) && (!file.info(annotation_gtf)$isdir))) stop("Third argument '' must be an existing file (and not a directory)...") +if(is.na(stranded) || (!(stranded %in% (0:2)))) stop("Fourth argument must be a numeric value in 0(unstranded)/1(forward)/2(reverse)...") +if(is.na(threads) || (threads<=0)) stop("Fifth argument must be a strictly positive numeric value...") + +# Debug messages (stderr) +message("Input bam (Arg 1): ", input_bam) +message("Output basename(Arg 2): ", output_prefix) +message("Input gtf (Arg 3): ", annotation_gtf) +message("Strandness (Arg 4): ", c("unstranded", "forward", "reverse")[stranded+1]) +message("paired_end (Arg 5): ", paired_end) +message("Nb threads (Arg 6): ", threads) +message("R package loc. (Arg 7): ", ifelse(length(args) > 4, args[5], "Not specified")) + + +# Load / install packages +if (length(args) > 5) { .libPaths( c( args[6], .libPaths() ) ) } +if (!require("dupRadar")){ + source("http://bioconductor.org/biocLite.R") + biocLite("dupRadar", suppressUpdates=TRUE) + library("dupRadar") +} +if (!require("parallel")) { + install.packages("parallel", dependencies=TRUE, repos='http://cloud.r-project.org/') + library("parallel") +} + +# Duplicate stats +dm <- analyzeDuprates(input_bam, annotation_gtf, stranded, paired_end, threads) +write.table(dm, file=paste(output_prefix, "_dupMatrix.txt", sep=""), quote=F, row.name=F, sep="\t") + +# 2D density scatter plot +pdf(paste0(output_prefix, "_duprateExpDens.pdf")) +duprateExpDensPlot(DupMat=dm) +title("Density scatter plot") +mtext(output_prefix, side=3) +dev.off() +fit <- duprateExpFit(DupMat=dm) +cat( + paste("- dupRadar Int (duprate at low read counts):", fit$intercept), + paste("- dupRadar Sl (progression of the duplication rate):", fit$slope), + fill=TRUE, labels=output_prefix, + file=paste0(output_prefix, "_intercept_slope.txt"), append=FALSE +) + +# Create a multiqc file dupInt +sample_name <- gsub("Aligned.sortedByCoord.out.markDups", "", output_prefix) +line="#id: DupInt +#plot_type: 'generalstats' +#pconfig: +# dupRadar_intercept: +# title: 'dupInt' +# namespace: 'DupRadar' +# description: 'Intercept value from DupRadar' +# max: 100 +# min: 0 +# scale: 'RdYlGn-rev' +# format: '{:.2f}%' +Sample dupRadar_intercept" + +write(line,file=paste0(output_prefix, "_dup_intercept_mqc.txt"),append=TRUE) +write(paste(sample_name, fit$intercept),file=paste0(output_prefix, "_dup_intercept_mqc.txt"),append=TRUE) + +# Get numbers from dupRadar GLM +curve_x <- sort(log10(dm$RPK)) +curve_y = 100*predict(fit$glm, data.frame(x=curve_x), type="response") +# Remove all of the infinite values +infs = which(curve_x %in% c(-Inf,Inf)) +curve_x = curve_x[-infs] +curve_y = curve_y[-infs] +# Reduce number of data points +curve_x <- curve_x[seq(1, length(curve_x), 10)] +curve_y <- curve_y[seq(1, length(curve_y), 10)] +# Convert x values back to real counts +curve_x = 10^curve_x +# Write to file +line="#id: dupradar +#section_name: 'DupRadar' +#section_href: 'bioconductor.org/packages/release/bioc/html/dupRadar.html' +#description: \"provides duplication rate quality control for RNA-Seq datasets. Highly expressed genes can be expected to have a lot of duplicate reads, but high numbers of duplicates at low read counts can indicate low library complexity with technical duplication. +# This plot shows the general linear models - a summary of the gene duplication distributions. \" +#pconfig: +# title: 'DupRadar General Linear Model' +# xLog: True +# xlab: 'expression (reads/kbp)' +# ylab: '% duplicate reads' +# ymax: 100 +# ymin: 0 +# tt_label: '{point.x:.1f} reads/kbp: {point.y:,.2f}% duplicates' +# xPlotLines: +# - color: 'green' +# dashStyle: 'LongDash' +# label: +# style: {color: 'green'} +# text: '0.5 RPKM' +# verticalAlign: 'bottom' +# y: -65 +# value: 0.5 +# width: 1 +# - color: 'red' +# dashStyle: 'LongDash' +# label: +# style: {color: 'red'} +# text: '1 read/bp' +# verticalAlign: 'bottom' +# y: -65 +# value: 1000 +# width: 1" + +write(line,file=paste0(output_prefix, "_duprateExpDensCurve_mqc.txt"),append=TRUE) +write.table( + cbind(curve_x, curve_y), + file=paste0(output_prefix, "_duprateExpDensCurve_mqc.txt"), + quote=FALSE, row.names=FALSE, col.names=FALSE, append=TRUE, +) + +# Distribution of expression box plot +pdf(paste0(output_prefix, "_duprateExpBoxplot.pdf")) +duprateExpBoxplot(DupMat=dm) +title("Percent Duplication by Expression") +mtext(output_prefix, side=3) +dev.off() + +# Distribution of RPK values per gene +pdf(paste0(output_prefix, "_expressionHist.pdf")) +expressionHist(DupMat=dm) +title("Distribution of RPK values per gene") +mtext(output_prefix, side=3) +dev.off() + +# Print sessioninfo to standard out +print(output_prefix) +citation("dupRadar") +sessionInfo() diff --git a/src/dupradar/script.sh b/src/dupradar/script.sh new file mode 100644 index 00000000..85d51167 --- /dev/null +++ b/src/dupradar/script.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -eo pipefail + +function num_strandness { + if [ $par_strandedness == 'unstranded' ]; then echo 0 + elif [ $par_strandedness == 'forward' ]; then echo 1 + elif [ $par_strandedness == 'reverse' ]; then echo 2 + else echo "strandedness must be unstranded, forward or reverse." && \ + exit 1 + fi +} + +Rscript "$meta_resources_dir/script.R" \ + $par_input \ + $par_id \ + $par_gtf_annotation \ + $(num_strandness) \ + $par_paired + +mv "$par_id"_dupMatrix.txt $par_output_dupmatrix +mv "$par_id"_dup_intercept_mqc.txt $par_output_dup_intercept_mqc +mv "$par_id"_duprateExpBoxplot.pdf $par_output_duprate_exp_boxplot +mv "$par_id"_duprateExpDens.pdf $par_output_duprate_exp_densplot +mv "$par_id"_duprateExpDensCurve_mqc.txt $par_output_duprate_exp_denscurve_mqc +mv "$par_id"_expressionHist.pdf $par_output_expression_histogram +mv "$par_id"_intercept_slope.txt $par_output_intercept_slope + + +dupradar_ver=$(Rscript -e "library(dupRadar); cat(as.character(packageVersion('dupRadar')))") +text="bioconductor-dupradar: ${dupradar_ver}" +echo "$text" \ No newline at end of file diff --git a/src/dupradar/test.sh b/src/dupradar/test.sh new file mode 100644 index 00000000..01072e01 --- /dev/null +++ b/src/dupradar/test.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# define input and output for script +input_bam="$meta_resources_dir/sample.bam" +input_gtf="$meta_resources_dir/genes.gtf" + +output_dupmatrix="dup_matrix.txt" +output_dup_intercept_mqc="dup_intercept_mqc.txt" +output_duprate_exp_boxplot="duprate_exp_boxplot.pdf" +output_duprate_exp_densplot="duprate_exp_densityplot.pdf" +output_duprate_exp_denscurve_mqc="duprate_exp_density_curve_mqc.pdf" +output_expression_histogram="expression_hist.pdf" +output_intercept_slope="intercept_slope.txt" + +# Run executable +echo "> Running $meta_functionality_name for unpaired reads, writing to tmpdir $tmpdir." + +"$meta_executable" \ + --input "$input_bam" \ + --id "test" \ + --gtf_annotation "$input_gtf" \ + --strandedness "forward" \ + --paired false \ + --output_dupmatrix $output_dupmatrix \ + --output_dup_intercept_mqc $output_dup_intercept_mqc \ + --output_duprate_exp_boxplot $output_duprate_exp_boxplot \ + --output_duprate_exp_densplot $output_duprate_exp_densplot \ + --output_duprate_exp_denscurve_mqc $output_duprate_exp_denscurve_mqc \ + --output_expression_histogram $output_expression_histogram \ + --output_intercept_slope $output_intercept_slope + +exit_code=$? +[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1 + +echo ">> asserting output has been created for paired read input" +[ ! -f "$output_dupmatrix" ] && echo "$output_dupmatrix was not created" && exit 1 +[ ! -s "$output_dupmatrix" ] && echo "$output_dupmatrix is empty" && exit 1 +[ ! -f "$output_dup_intercept_mqc" ] && echo "$output_dup_intercept_mqc was not created" && exit 1 +[ ! -s "$output_dup_intercept_mqc" ] && echo "$output_dup_intercept_mqc is empty" && exit 1 +[ ! -f "$output_duprate_exp_boxplot" ] && echo "$output_duprate_exp_boxplot was not created" && exit 1 +[ ! -s "$output_duprate_exp_boxplot" ] && echo "$output_duprate_exp_boxplot is empty" && exit 1 +[ ! -f "$output_duprate_exp_densplot" ] && echo "$output_duprate_exp_densplot was not created" && exit 1 +[ ! -s "$output_duprate_exp_densplot" ] && echo "$output_duprate_exp_densplot is empty" && exit 1 +[ ! -f "$output_duprate_exp_denscurve_mqc" ] && echo "$output_duprate_exp_denscurve_mqc was not created" && exit 1 +[ ! -s "$output_duprate_exp_denscurve_mqc" ] && echo "$output_duprate_exp_denscurve_mqc is empty" && exit 1 +[ ! -f "$output_expression_histogram" ] && echo "$output_expression_histogram was not created" && exit 1 +[ ! -s "$output_expression_histogram" ] && echo "$output_expression_histogram is empty" && exit 1 +[ ! -f "$output_intercept_slope" ] && echo "$output_intercept_slope was not created" && exit 1 +[ ! -s "$output_intercept_slope" ] && echo "$output_intercept_slope is empty" && exit 1 + +exit 0 \ No newline at end of file diff --git a/src/dupradar/test_data/genes.gtf b/src/dupradar/test_data/genes.gtf new file mode 100644 index 00000000..423ba8f8 --- /dev/null +++ b/src/dupradar/test_data/genes.gtf @@ -0,0 +1,36 @@ +chr1 unknown exon 14362 14829 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 14970 15038 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 15796 15947 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 16607 16765 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 16858 17055 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 17233 17368 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 17606 17742 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 17915 18061 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 18268 18366 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 24738 24891 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 29321 29370 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 34611 35174 . - . gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818_1"; tss_id "TSS8403"; +chr1 unknown exon 34611 35174 . - . gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820_1"; tss_id "TSS8403"; +chr1 unknown exon 35277 35481 . - . gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818_1"; tss_id "TSS8403"; +chr1 unknown exon 35277 35481 . - . gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820_1"; tss_id "TSS8403"; +chr1 unknown exon 35721 36081 . - . gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818_1"; tss_id "TSS8403"; +chr1 unknown exon 35721 36081 . - . gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820_1"; tss_id "TSS8403"; +chr1 unknown CDS 69091 70005 . + 0 gene_id "OR4F5"; gene_name "OR4F5"; p_id "P1230"; transcript_id "NM_001005484"; tss_id "TSS14428"; +chr1 unknown exon 69091 70008 . + . gene_id "OR4F5"; gene_name "OR4F5"; p_id "P1230"; transcript_id "NM_001005484"; tss_id "TSS14428"; +chr1 unknown start_codon 69091 69093 . + . gene_id "OR4F5"; gene_name "OR4F5"; p_id "P1230"; transcript_id "NM_001005484"; tss_id "TSS14428"; +chr1 unknown stop_codon 70006 70008 . + . gene_id "OR4F5"; gene_name "OR4F5"; p_id "P1230"; transcript_id "NM_001005484"; tss_id "TSS14428"; +chr1 unknown exon 134773 139696 . - . gene_id "LOC729737"; gene_name "LOC729737"; transcript_id "NR_039983"; tss_id "TSS18541"; +chr1 unknown exon 139790 139847 . - . gene_id "LOC729737"; gene_name "LOC729737"; transcript_id "NR_039983"; tss_id "TSS18541"; +chr1 unknown exon 140075 140566 . - . gene_id "LOC729737"; gene_name "LOC729737"; transcript_id "NR_039983"; tss_id "TSS18541"; +chr1 unknown exon 323892 324060 . + . gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322_1"; tss_id "TSS12303"; +chr1 unknown exon 324288 324345 . + . gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322_1"; tss_id "TSS12303"; +chr1 unknown exon 324439 328581 . + . gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322_1"; tss_id "TSS12303"; +chr19 unknown exon 76220 76783 . - . gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820"; tss_id "TSS16829"; +chr19 unknown exon 76220 76783 . - . gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818"; tss_id "TSS16829"; +chr19 unknown exon 76886 77090 . - . gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820"; tss_id "TSS16829"; +chr19 unknown exon 76886 77090 . - . gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818"; tss_id "TSS16829"; +chr19 unknown exon 77330 77690 . - . gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820"; tss_id "TSS16829"; +chr19 unknown exon 77330 77690 . - . gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818"; tss_id "TSS16829"; +chr5 unknown exon 180750507 180750675 . + . gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322"; tss_id "TSS11538"; +chr5 unknown exon 180750903 180750960 . + . gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322"; tss_id "TSS11538"; +chr5 unknown exon 180751054 180755196 . + . gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322"; tss_id "TSS11538"; diff --git a/src/dupradar/test_data/sample.bam b/src/dupradar/test_data/sample.bam new file mode 100644 index 00000000..04d8c95e Binary files /dev/null and b/src/dupradar/test_data/sample.bam differ diff --git a/src/dupradar/test_data/script.sh b/src/dupradar/test_data/script.sh new file mode 100644 index 00000000..2b1c1aa5 --- /dev/null +++ b/src/dupradar/test_data/script.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +wget https://github.com/ssayols/dupRadar/raw/master/inst/extdata/genes.gtf +wget https://github.com/ssayols/dupRadar/raw/master/inst/extdata/wgEncodeCaltechRnaSeqGm12878R1x75dAlignsRep2V2.bam + +# subsample the bam file +samtools view -s 0.02 -b wgEncodeCaltechRnaSeqGm12878R1x75dAlignsRep2V2.bam > sample.bam + +rm wgEncodeCaltechRnaSeqGm12878R1x75dAlignsRep2V2.bam \ No newline at end of file