Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
emmarousseau committed Jun 2, 2024
1 parent ea0383c commit 1f02f94
Show file tree
Hide file tree
Showing 8 changed files with 399 additions and 0 deletions.
117 changes: 117 additions & 0 deletions src/dupradar/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
name: dupradar
description: Assessment of duplication rates in RNA-Seq datasets
keywords: [ rnaseq, duplication, genomics ]
links:
homepage: https://bioconductor.org/packages/release/bioc/html/dupRadar.html
documentation: https://bioconductor.org/packages/release/bioc/vignettes/dupRadar/inst/doc/dupRadar.html
repository: https://github.com/ssayols/dupRadar
references:
doi: "10.1186/s12859-016-1276-2"
license: GPL-3.0

argument_groups:
- name: "Input"
arguments:
- name: "--id"
type: string
description: Sample ID

- name: "--input"
type: file
required: true
description: path to input alignment file in BAM format

- name: "--gtf_annotation"
type: file
required: true
description: path to GTF annotation file.

- name: "--paired"
type: boolean
description: add flag if input alignment file consists of paired reads

- name: "--strandedness"
type: string
required: false
choices: ["forward", "reverse", "unstranded"]
description: strandedness of input bam file reads (forward, reverse or unstranded (default, applicable to paired reads))

- name: "--versions"
type: file
must_exist: false

- name: "Output"
arguments:
- name: "--output_dupmatrix"
type: file
direction: output
required: false
must_exist: true
default: $id.dup_matrix.txt
description: path to output file (txt) of duplicate tag counts

- name: "--output_dup_intercept_mqc"
type: file
direction: output
required: false
must_exist: true
default: $id.dup_intercept_mqc.txt
description: path to output file (txt) of multiqc intercept value DupRadar

- name: "--output_duprate_exp_boxplot"
type: file
direction: output
required: false
must_exist: true
default: $id.duprate_exp_boxplot.pdf
description: path to output file (pdf) of distribution of expression box plot

- name: "--output_duprate_exp_densplot"
type: file
direction: output
required: false
must_exist: true
default: $id.duprate_exp_densityplot.pdf
description: path to output file (pdf) of 2D density scatter plot of duplicate tag counts

- name: "--output_duprate_exp_denscurve_mqc"
type: file
direction: output
required: false
must_exist: true
default: $id.duprate_exp_density_curve_mqc.txt
description: path to output file (pdf) of density curve of gene duplication multiqc

- name: "--output_expression_histogram"
type: file
direction: output
required: false
must_exist: true
default: $id.expression_hist.pdf
description: path to output file (pdf) of distribution of RPK values per gene histogram

- name: "--output_intercept_slope"
type: file
direction: output
required: false
must_exist: true
default: $id.intercept_slope.txt
description: output file (txt) with progression of duplication rate values

resources:
- type: bash_script
path: script.sh
# Copied from https://github.com/nf-core/rnaseq/blob/3.12.0/bin/dupradar.r
- type: R_script
path: script.R

test_resources:
- type: bash_script
path: test.sh

engines:
- type: docker
image: quay.io/biocontainers/bioconductor-dupradar:1.32.0--r43hdfd78af_0
runners:
- type: executable
- type: nextflow
Empty file added src/dupradar/help.txt
Empty file.
154 changes: 154 additions & 0 deletions src/dupradar/script.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
#!/usr/bin/env Rscript

# Command line argument processing
args = commandArgs(trailingOnly=TRUE)
if (length(args) < 5) {
stop("Usage: dupRadar.r <input.bam> <sample_id> <annotation.gtf> <strandDirection:0=unstranded/1=forward/2=reverse> <paired/single> <nbThreads> <R-package-location (optional)>", call.=FALSE)
}

message("paired_end is", args[5])
message("the type is is", class(args[5]))

input_bam <- args[1]
output_prefix <- args[2]
annotation_gtf <- args[3]
stranded <- as.numeric(args[4])
paired_end <- ifelse(args[5] == "true", TRUE, FALSE)
threads <- as.numeric(args[6])

bamRegex <- "(.+)\\.bam$"

if(!(grepl(bamRegex, input_bam) && file.exists(input_bam) && (!file.info(input_bam)$isdir))) stop("First argument '<input.bam>' must be an existing file (not a directory) with '.bam' extension...")
if(!(file.exists(annotation_gtf) && (!file.info(annotation_gtf)$isdir))) stop("Third argument '<annotation.gtf>' must be an existing file (and not a directory)...")
if(is.na(stranded) || (!(stranded %in% (0:2)))) stop("Fourth argument <strandDirection> must be a numeric value in 0(unstranded)/1(forward)/2(reverse)...")
if(is.na(threads) || (threads<=0)) stop("Fifth argument <nbThreads> must be a strictly positive numeric value...")

# Debug messages (stderr)
message("Input bam (Arg 1): ", input_bam)
message("Output basename(Arg 2): ", output_prefix)
message("Input gtf (Arg 3): ", annotation_gtf)
message("Strandness (Arg 4): ", c("unstranded", "forward", "reverse")[stranded+1])
message("paired_end (Arg 5): ", paired_end)
message("Nb threads (Arg 6): ", threads)
message("R package loc. (Arg 7): ", ifelse(length(args) > 4, args[5], "Not specified"))


# Load / install packages
if (length(args) > 5) { .libPaths( c( args[6], .libPaths() ) ) }
if (!require("dupRadar")){
source("http://bioconductor.org/biocLite.R")
biocLite("dupRadar", suppressUpdates=TRUE)
library("dupRadar")
}
if (!require("parallel")) {
install.packages("parallel", dependencies=TRUE, repos='http://cloud.r-project.org/')
library("parallel")
}

# Duplicate stats
dm <- analyzeDuprates(input_bam, annotation_gtf, stranded, paired_end, threads)
write.table(dm, file=paste(output_prefix, "_dupMatrix.txt", sep=""), quote=F, row.name=F, sep="\t")

# 2D density scatter plot
pdf(paste0(output_prefix, "_duprateExpDens.pdf"))
duprateExpDensPlot(DupMat=dm)
title("Density scatter plot")
mtext(output_prefix, side=3)
dev.off()
fit <- duprateExpFit(DupMat=dm)
cat(
paste("- dupRadar Int (duprate at low read counts):", fit$intercept),
paste("- dupRadar Sl (progression of the duplication rate):", fit$slope),
fill=TRUE, labels=output_prefix,
file=paste0(output_prefix, "_intercept_slope.txt"), append=FALSE
)

# Create a multiqc file dupInt
sample_name <- gsub("Aligned.sortedByCoord.out.markDups", "", output_prefix)
line="#id: DupInt
#plot_type: 'generalstats'
#pconfig:
# dupRadar_intercept:
# title: 'dupInt'
# namespace: 'DupRadar'
# description: 'Intercept value from DupRadar'
# max: 100
# min: 0
# scale: 'RdYlGn-rev'
# format: '{:.2f}%'
Sample dupRadar_intercept"

write(line,file=paste0(output_prefix, "_dup_intercept_mqc.txt"),append=TRUE)
write(paste(sample_name, fit$intercept),file=paste0(output_prefix, "_dup_intercept_mqc.txt"),append=TRUE)

# Get numbers from dupRadar GLM
curve_x <- sort(log10(dm$RPK))
curve_y = 100*predict(fit$glm, data.frame(x=curve_x), type="response")
# Remove all of the infinite values
infs = which(curve_x %in% c(-Inf,Inf))
curve_x = curve_x[-infs]
curve_y = curve_y[-infs]
# Reduce number of data points
curve_x <- curve_x[seq(1, length(curve_x), 10)]
curve_y <- curve_y[seq(1, length(curve_y), 10)]
# Convert x values back to real counts
curve_x = 10^curve_x
# Write to file
line="#id: dupradar
#section_name: 'DupRadar'
#section_href: 'bioconductor.org/packages/release/bioc/html/dupRadar.html'
#description: \"provides duplication rate quality control for RNA-Seq datasets. Highly expressed genes can be expected to have a lot of duplicate reads, but high numbers of duplicates at low read counts can indicate low library complexity with technical duplication.
# This plot shows the general linear models - a summary of the gene duplication distributions. \"
#pconfig:
# title: 'DupRadar General Linear Model'
# xLog: True
# xlab: 'expression (reads/kbp)'
# ylab: '% duplicate reads'
# ymax: 100
# ymin: 0
# tt_label: '<b>{point.x:.1f} reads/kbp</b>: {point.y:,.2f}% duplicates'
# xPlotLines:
# - color: 'green'
# dashStyle: 'LongDash'
# label:
# style: {color: 'green'}
# text: '0.5 RPKM'
# verticalAlign: 'bottom'
# y: -65
# value: 0.5
# width: 1
# - color: 'red'
# dashStyle: 'LongDash'
# label:
# style: {color: 'red'}
# text: '1 read/bp'
# verticalAlign: 'bottom'
# y: -65
# value: 1000
# width: 1"

write(line,file=paste0(output_prefix, "_duprateExpDensCurve_mqc.txt"),append=TRUE)
write.table(
cbind(curve_x, curve_y),
file=paste0(output_prefix, "_duprateExpDensCurve_mqc.txt"),
quote=FALSE, row.names=FALSE, col.names=FALSE, append=TRUE,
)

# Distribution of expression box plot
pdf(paste0(output_prefix, "_duprateExpBoxplot.pdf"))
duprateExpBoxplot(DupMat=dm)
title("Percent Duplication by Expression")
mtext(output_prefix, side=3)
dev.off()

# Distribution of RPK values per gene
pdf(paste0(output_prefix, "_expressionHist.pdf"))
expressionHist(DupMat=dm)
title("Distribution of RPK values per gene")
mtext(output_prefix, side=3)
dev.off()

# Print sessioninfo to standard out
print(output_prefix)
citation("dupRadar")
sessionInfo()
32 changes: 32 additions & 0 deletions src/dupradar/script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash

set -eo pipefail

function num_strandness {
if [ $par_strandedness == 'unstranded' ]; then echo 0
elif [ $par_strandedness == 'forward' ]; then echo 1
elif [ $par_strandedness == 'reverse' ]; then echo 2
else echo "strandedness must be unstranded, forward or reverse." && \
exit 1
fi
}

Rscript "$meta_resources_dir/script.R" \
$par_input \
$par_id \
$par_gtf_annotation \
$(num_strandness) \
$par_paired

mv "$par_id"_dupMatrix.txt $par_output_dupmatrix
mv "$par_id"_dup_intercept_mqc.txt $par_output_dup_intercept_mqc
mv "$par_id"_duprateExpBoxplot.pdf $par_output_duprate_exp_boxplot
mv "$par_id"_duprateExpDens.pdf $par_output_duprate_exp_densplot
mv "$par_id"_duprateExpDensCurve_mqc.txt $par_output_duprate_exp_denscurve_mqc
mv "$par_id"_expressionHist.pdf $par_output_expression_histogram
mv "$par_id"_intercept_slope.txt $par_output_intercept_slope


dupradar_ver=$(Rscript -e "library(dupRadar); cat(as.character(packageVersion('dupRadar')))")
text="bioconductor-dupradar: ${dupradar_ver}"
echo "$text"
51 changes: 51 additions & 0 deletions src/dupradar/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/bin/bash

# define input and output for script
input_bam="$meta_resources_dir/sample.bam"
input_gtf="$meta_resources_dir/genes.gtf"

output_dupmatrix="dup_matrix.txt"
output_dup_intercept_mqc="dup_intercept_mqc.txt"
output_duprate_exp_boxplot="duprate_exp_boxplot.pdf"
output_duprate_exp_densplot="duprate_exp_densityplot.pdf"
output_duprate_exp_denscurve_mqc="duprate_exp_density_curve_mqc.pdf"
output_expression_histogram="expression_hist.pdf"
output_intercept_slope="intercept_slope.txt"

# Run executable
echo "> Running $meta_functionality_name for unpaired reads, writing to tmpdir $tmpdir."

"$meta_executable" \
--input "$input_bam" \
--id "test" \
--gtf_annotation "$input_gtf" \
--strandedness "forward" \
--paired false \
--output_dupmatrix $output_dupmatrix \
--output_dup_intercept_mqc $output_dup_intercept_mqc \
--output_duprate_exp_boxplot $output_duprate_exp_boxplot \
--output_duprate_exp_densplot $output_duprate_exp_densplot \
--output_duprate_exp_denscurve_mqc $output_duprate_exp_denscurve_mqc \
--output_expression_histogram $output_expression_histogram \
--output_intercept_slope $output_intercept_slope

exit_code=$?
[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1

echo ">> asserting output has been created for paired read input"
[ ! -f "$output_dupmatrix" ] && echo "$output_dupmatrix was not created" && exit 1
[ ! -s "$output_dupmatrix" ] && echo "$output_dupmatrix is empty" && exit 1
[ ! -f "$output_dup_intercept_mqc" ] && echo "$output_dup_intercept_mqc was not created" && exit 1
[ ! -s "$output_dup_intercept_mqc" ] && echo "$output_dup_intercept_mqc is empty" && exit 1
[ ! -f "$output_duprate_exp_boxplot" ] && echo "$output_duprate_exp_boxplot was not created" && exit 1
[ ! -s "$output_duprate_exp_boxplot" ] && echo "$output_duprate_exp_boxplot is empty" && exit 1
[ ! -f "$output_duprate_exp_densplot" ] && echo "$output_duprate_exp_densplot was not created" && exit 1
[ ! -s "$output_duprate_exp_densplot" ] && echo "$output_duprate_exp_densplot is empty" && exit 1
[ ! -f "$output_duprate_exp_denscurve_mqc" ] && echo "$output_duprate_exp_denscurve_mqc was not created" && exit 1
[ ! -s "$output_duprate_exp_denscurve_mqc" ] && echo "$output_duprate_exp_denscurve_mqc is empty" && exit 1
[ ! -f "$output_expression_histogram" ] && echo "$output_expression_histogram was not created" && exit 1
[ ! -s "$output_expression_histogram" ] && echo "$output_expression_histogram is empty" && exit 1
[ ! -f "$output_intercept_slope" ] && echo "$output_intercept_slope was not created" && exit 1
[ ! -s "$output_intercept_slope" ] && echo "$output_intercept_slope is empty" && exit 1

exit 0
36 changes: 36 additions & 0 deletions src/dupradar/test_data/genes.gtf
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
chr1 unknown exon 14362 14829 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514";
chr1 unknown exon 14970 15038 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514";
chr1 unknown exon 15796 15947 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514";
chr1 unknown exon 16607 16765 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514";
chr1 unknown exon 16858 17055 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514";
chr1 unknown exon 17233 17368 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514";
chr1 unknown exon 17606 17742 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514";
chr1 unknown exon 17915 18061 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514";
chr1 unknown exon 18268 18366 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514";
chr1 unknown exon 24738 24891 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514";
chr1 unknown exon 29321 29370 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514";
chr1 unknown exon 34611 35174 . - . gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818_1"; tss_id "TSS8403";
chr1 unknown exon 34611 35174 . - . gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820_1"; tss_id "TSS8403";
chr1 unknown exon 35277 35481 . - . gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818_1"; tss_id "TSS8403";
chr1 unknown exon 35277 35481 . - . gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820_1"; tss_id "TSS8403";
chr1 unknown exon 35721 36081 . - . gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818_1"; tss_id "TSS8403";
chr1 unknown exon 35721 36081 . - . gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820_1"; tss_id "TSS8403";
chr1 unknown CDS 69091 70005 . + 0 gene_id "OR4F5"; gene_name "OR4F5"; p_id "P1230"; transcript_id "NM_001005484"; tss_id "TSS14428";
chr1 unknown exon 69091 70008 . + . gene_id "OR4F5"; gene_name "OR4F5"; p_id "P1230"; transcript_id "NM_001005484"; tss_id "TSS14428";
chr1 unknown start_codon 69091 69093 . + . gene_id "OR4F5"; gene_name "OR4F5"; p_id "P1230"; transcript_id "NM_001005484"; tss_id "TSS14428";
chr1 unknown stop_codon 70006 70008 . + . gene_id "OR4F5"; gene_name "OR4F5"; p_id "P1230"; transcript_id "NM_001005484"; tss_id "TSS14428";
chr1 unknown exon 134773 139696 . - . gene_id "LOC729737"; gene_name "LOC729737"; transcript_id "NR_039983"; tss_id "TSS18541";
chr1 unknown exon 139790 139847 . - . gene_id "LOC729737"; gene_name "LOC729737"; transcript_id "NR_039983"; tss_id "TSS18541";
chr1 unknown exon 140075 140566 . - . gene_id "LOC729737"; gene_name "LOC729737"; transcript_id "NR_039983"; tss_id "TSS18541";
chr1 unknown exon 323892 324060 . + . gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322_1"; tss_id "TSS12303";
chr1 unknown exon 324288 324345 . + . gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322_1"; tss_id "TSS12303";
chr1 unknown exon 324439 328581 . + . gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322_1"; tss_id "TSS12303";
chr19 unknown exon 76220 76783 . - . gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820"; tss_id "TSS16829";
chr19 unknown exon 76220 76783 . - . gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818"; tss_id "TSS16829";
chr19 unknown exon 76886 77090 . - . gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820"; tss_id "TSS16829";
chr19 unknown exon 76886 77090 . - . gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818"; tss_id "TSS16829";
chr19 unknown exon 77330 77690 . - . gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820"; tss_id "TSS16829";
chr19 unknown exon 77330 77690 . - . gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818"; tss_id "TSS16829";
chr5 unknown exon 180750507 180750675 . + . gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322"; tss_id "TSS11538";
chr5 unknown exon 180750903 180750960 . + . gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322"; tss_id "TSS11538";
chr5 unknown exon 180751054 180755196 . + . gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322"; tss_id "TSS11538";
Binary file added src/dupradar/test_data/sample.bam
Binary file not shown.
Loading

0 comments on commit 1f02f94

Please sign in to comment.