diff --git a/NEWS b/NEWS index 0736609..8784ca0 100755 --- a/NEWS +++ b/NEWS @@ -1,6 +1,10 @@ Changes in version 2.2.0 ------------------------ +NEW FEATURES + o Added chunks parameter to Coverage.R and calculateBamCoverageByInterval + to reduce memory usage (#218) + SIGNIFICANT USER-VISIBLE CHANGES o When base quality scores are found in the VCF, they are now used to diff --git a/R/calculateBamCoverageByInterval.R b/R/calculateBamCoverageByInterval.R index 24c304d..0d8dec1 100644 --- a/R/calculateBamCoverageByInterval.R +++ b/R/calculateBamCoverageByInterval.R @@ -16,6 +16,8 @@ #' @param index.file The bai index. This is expected without the .bai file #' suffix, see \code{?scanBam}. #' @param keep.duplicates Keep or remove duplicated reads. +#' @param chunks Split \code{interval.file} into specified number of chunks +#' to reduce memory usage. #' @param ... Additional parameters passed to \code{ScanBamParam}. #' @return Returns total and average coverage by intervals. #' @author Markus Riester @@ -39,36 +41,56 @@ #' scanBam scanFa scanFaIndex TabixFile calculateBamCoverageByInterval <- function(bam.file, interval.file, output.file = NULL, index.file = bam.file, keep.duplicates = FALSE, - ...) { - intervalGr <- readIntervalFile(interval.file, strict = FALSE, + chunks = 20, ...) { + intervalGrAll <- readIntervalFile(interval.file, strict = FALSE, verbose = FALSE) + # skip splitting with small panels + if (is.null(chunks) || is.na(chunks) || chunks < 2 || + length(intervalGrAll) < chunks * 50) { + chunkIds <- rep(1, length(intervalGrAll)) + } else { + chunkIds <- cut(seq_along(intervalGrAll), chunks, labels = FALSE) + } + .calculateBamCoverageByInterval <- function(intervalGr, ...) { + param <- ScanBamParam(what = c("pos", "qwidth", "flag"), + which = intervalGr, + flag = scanBamFlag(isUnmappedQuery = FALSE, + isNotPassingQualityControls = FALSE, + isSecondaryAlignment = FALSE, + isDuplicate = NA + ), + ... + ) - param <- ScanBamParam(what = c("pos", "qwidth", "flag"), - which = intervalGr, - flag = scanBamFlag(isUnmappedQuery = FALSE, - isNotPassingQualityControls = FALSE, - isSecondaryAlignment = FALSE, - isDuplicate = NA - ), - ... - ) + xAll <- scanBam(bam.file, index = index.file, param = param) + xDupFiltered <- .filterDuplicates(xAll) - xAll <- scanBam(bam.file, index = index.file, param = param) - xDupFiltered <- .filterDuplicates(xAll) + x <- xDupFiltered + if (keep.duplicates) x <- xAll - x <- xDupFiltered - if (keep.duplicates) x <- xAll + intervalGr$coverage <- vapply(seq_along(x), function(i) + sum(coverage(IRanges(x[[i]][["pos"]], width = x[[i]][["qwidth"]]), + shift = -start(intervalGr)[i], width = width(intervalGr)[i])), integer(1)) - intervalGr$coverage <- vapply(seq_along(x), function(i) - sum(coverage(IRanges(x[[i]][["pos"]], width = x[[i]][["qwidth"]]), - shift = -start(intervalGr)[i], width = width(intervalGr)[i])), integer(1)) + intervalGr$average.coverage <- intervalGr$coverage / width(intervalGr) - intervalGr$average.coverage <- intervalGr$coverage / width(intervalGr) + intervalGr$counts <- as.numeric(vapply(x, function(y) length(y$pos), integer(1))) + intervalGr$duplication.rate <- 1 - + vapply(xDupFiltered, function(y) length(y$pos), integer(1)) / + vapply(xAll, function(y) length(y$pos), integer(1)) + intervalGr + } + intervalGrAll <- split(intervalGrAll, chunkIds) - intervalGr$counts <- as.numeric(vapply(x, function(y) length(y$pos), integer(1))) - intervalGr$duplication.rate <- 1 - - vapply(xDupFiltered, function(y) length(y$pos), integer(1)) / - vapply(xAll, function(y) length(y$pos), integer(1)) + intervalGr <- unlist(GRangesList( + lapply(seq_along(intervalGrAll), function(i) { + if (length(intervalGrAll) > 1) { + flog.info("Processing %s (%i/%i)...", as.character(intervalGrAll[[i]][1]), + i, length(intervalGrAll)) + } + .calculateBamCoverageByInterval(intervalGrAll[[i]], ...) + } + ))) if (!is.null(output.file)) { .writeCoverage(intervalGr, output.file) diff --git a/inst/extdata/Coverage.R b/inst/extdata/Coverage.R index f84a900..14ce923 100644 --- a/inst/extdata/Coverage.R +++ b/inst/extdata/Coverage.R @@ -25,6 +25,9 @@ option_list <- list( make_option(c("--out-dir"), action = "store", type = "character", default = NULL, help = "Output directory to which results should be written"), + make_option(c("--chunks"), action = "store", type = "integer", + default = formals(PureCN::calculateBamCoverageByInterval)$chunks, + help = "Split intervals into specified number of chunks to reduce memory usage [default %default]"), make_option(c("--parallel"), action = "store_true", default = FALSE, help = "Use BiocParallel to calculate coverage in parallel whem --bam is a list of BAM files."), make_option(c("--cores"), action = "store", type = "integer", default = 1, @@ -132,6 +135,7 @@ getCoverageBams <- function(bamFiles, indexFiles, outdir, interval.file, PureCN::calculateBamCoverageByInterval(bam.file = bam.file, interval.file = interval.file, output.file = output.file, index.file = index.file, keep.duplicates = keep.duplicates, + chunks = opt$chunks, mapqFilter = if (remove_mapq0) 1 else NA) } output.file diff --git a/man/calculateBamCoverageByInterval.Rd b/man/calculateBamCoverageByInterval.Rd index 7d62e7a..2a62d84 100644 --- a/man/calculateBamCoverageByInterval.Rd +++ b/man/calculateBamCoverageByInterval.Rd @@ -10,6 +10,7 @@ calculateBamCoverageByInterval( output.file = NULL, index.file = bam.file, keep.duplicates = FALSE, + chunks = 20, ... ) } @@ -27,6 +28,9 @@ suffix, see \code{?scanBam}.} \item{keep.duplicates}{Keep or remove duplicated reads.} +\item{chunks}{Split \code{interval.file} into specified number of chunks +to reduce memory usage.} + \item{...}{Additional parameters passed to \code{ScanBamParam}.} } \value{ diff --git a/vignettes/Quick.Rmd b/vignettes/Quick.Rmd index b863193..ee158b2 100755 --- a/vignettes/Quick.Rmd +++ b/vignettes/Quick.Rmd @@ -733,9 +733,10 @@ Argument name | Corresponding PureCN argument | PureCN function `--intervals` | `interval.file` | `correctCoverageBias` `--method` | `method` | `correctCoverageBias` `--keep-duplicates` | `keep.duplicates` | `calculateBamCoverageByInterval` -`--remove-mapq0` | `mapqFilter` | `ScanBamParam` -`--skip-gc-norm` | | `correctCoverageBias` -`--out-dir` | | +`--chunks` | `chunks` | `calculateBamCoverageByInterval` +`--remove-mapq0` | `mapqFilter` | `ScanBamParam` +`--skip-gc-norm` | | `correctCoverageBias` +`--out-dir` | | `--cores` | | Number of CPUs to use when multiple BAMs are provided `--parallel` | | Use default `r Biocpkg("BiocParallel")` backend when multiple BAMs are provided `--seed` | |