Skip to content

Commit

Permalink
Added chunks paramater to calculateBamCoverageByInternal (lima1#218).
Browse files Browse the repository at this point in the history
  • Loading branch information
lima1 committed Feb 2, 2022
1 parent 26f9dbb commit f058d5e
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 26 deletions.
4 changes: 4 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
Changes in version 2.2.0
------------------------

NEW FEATURES
o Added chunks parameter to Coverage.R and calculateBamCoverageByInterval
to reduce memory usage (#218)

SIGNIFICANT USER-VISIBLE CHANGES

o When base quality scores are found in the VCF, they are now used to
Expand Down
68 changes: 45 additions & 23 deletions R/calculateBamCoverageByInterval.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
#' @param index.file The bai index. This is expected without the .bai file
#' suffix, see \code{?scanBam}.
#' @param keep.duplicates Keep or remove duplicated reads.
#' @param chunks Split \code{interval.file} into specified number of chunks
#' to reduce memory usage.
#' @param ... Additional parameters passed to \code{ScanBamParam}.
#' @return Returns total and average coverage by intervals.
#' @author Markus Riester
Expand All @@ -39,36 +41,56 @@
#' scanBam scanFa scanFaIndex TabixFile
calculateBamCoverageByInterval <- function(bam.file, interval.file,
output.file = NULL, index.file = bam.file, keep.duplicates = FALSE,
...) {
intervalGr <- readIntervalFile(interval.file, strict = FALSE,
chunks = 20, ...) {
intervalGrAll <- readIntervalFile(interval.file, strict = FALSE,
verbose = FALSE)
# skip splitting with small panels
if (is.null(chunks) || is.na(chunks) || chunks < 2 ||
length(intervalGrAll) < chunks * 50) {
chunkIds <- rep(1, length(intervalGrAll))
} else {
chunkIds <- cut(seq_along(intervalGrAll), chunks, labels = FALSE)
}
.calculateBamCoverageByInterval <- function(intervalGr, ...) {
param <- ScanBamParam(what = c("pos", "qwidth", "flag"),
which = intervalGr,
flag = scanBamFlag(isUnmappedQuery = FALSE,
isNotPassingQualityControls = FALSE,
isSecondaryAlignment = FALSE,
isDuplicate = NA
),
...
)

param <- ScanBamParam(what = c("pos", "qwidth", "flag"),
which = intervalGr,
flag = scanBamFlag(isUnmappedQuery = FALSE,
isNotPassingQualityControls = FALSE,
isSecondaryAlignment = FALSE,
isDuplicate = NA
),
...
)
xAll <- scanBam(bam.file, index = index.file, param = param)
xDupFiltered <- .filterDuplicates(xAll)

xAll <- scanBam(bam.file, index = index.file, param = param)
xDupFiltered <- .filterDuplicates(xAll)
x <- xDupFiltered
if (keep.duplicates) x <- xAll

x <- xDupFiltered
if (keep.duplicates) x <- xAll
intervalGr$coverage <- vapply(seq_along(x), function(i)
sum(coverage(IRanges(x[[i]][["pos"]], width = x[[i]][["qwidth"]]),
shift = -start(intervalGr)[i], width = width(intervalGr)[i])), integer(1))

intervalGr$coverage <- vapply(seq_along(x), function(i)
sum(coverage(IRanges(x[[i]][["pos"]], width = x[[i]][["qwidth"]]),
shift = -start(intervalGr)[i], width = width(intervalGr)[i])), integer(1))
intervalGr$average.coverage <- intervalGr$coverage / width(intervalGr)

intervalGr$average.coverage <- intervalGr$coverage / width(intervalGr)
intervalGr$counts <- as.numeric(vapply(x, function(y) length(y$pos), integer(1)))
intervalGr$duplication.rate <- 1 -
vapply(xDupFiltered, function(y) length(y$pos), integer(1)) /
vapply(xAll, function(y) length(y$pos), integer(1))
intervalGr
}
intervalGrAll <- split(intervalGrAll, chunkIds)

intervalGr$counts <- as.numeric(vapply(x, function(y) length(y$pos), integer(1)))
intervalGr$duplication.rate <- 1 -
vapply(xDupFiltered, function(y) length(y$pos), integer(1)) /
vapply(xAll, function(y) length(y$pos), integer(1))
intervalGr <- unlist(GRangesList(
lapply(seq_along(intervalGrAll), function(i) {
if (length(intervalGrAll) > 1) {
flog.info("Processing %s (%i/%i)...", as.character(intervalGrAll[[i]][1]),
i, length(intervalGrAll))
}
.calculateBamCoverageByInterval(intervalGrAll[[i]], ...)
}
)))

if (!is.null(output.file)) {
.writeCoverage(intervalGr, output.file)
Expand Down
4 changes: 4 additions & 0 deletions inst/extdata/Coverage.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ option_list <- list(
make_option(c("--out-dir"), action = "store", type = "character",
default = NULL,
help = "Output directory to which results should be written"),
make_option(c("--chunks"), action = "store", type = "integer",
default = formals(PureCN::calculateBamCoverageByInterval)$chunks,
help = "Split intervals into specified number of chunks to reduce memory usage [default %default]"),
make_option(c("--parallel"), action = "store_true", default = FALSE,
help = "Use BiocParallel to calculate coverage in parallel whem --bam is a list of BAM files."),
make_option(c("--cores"), action = "store", type = "integer", default = 1,
Expand Down Expand Up @@ -132,6 +135,7 @@ getCoverageBams <- function(bamFiles, indexFiles, outdir, interval.file,
PureCN::calculateBamCoverageByInterval(bam.file = bam.file,
interval.file = interval.file, output.file = output.file,
index.file = index.file, keep.duplicates = keep.duplicates,
chunks = opt$chunks,
mapqFilter = if (remove_mapq0) 1 else NA)
}
output.file
Expand Down
4 changes: 4 additions & 0 deletions man/calculateBamCoverageByInterval.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 4 additions & 3 deletions vignettes/Quick.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -733,9 +733,10 @@ Argument name | Corresponding PureCN argument | PureCN function
`--intervals` | `interval.file` | `correctCoverageBias`
`--method` | `method` | `correctCoverageBias`
`--keep-duplicates` | `keep.duplicates` | `calculateBamCoverageByInterval`
`--remove-mapq0` | `mapqFilter` | `ScanBamParam`
`--skip-gc-norm` | | `correctCoverageBias`
`--out-dir` | |
`--chunks` | `chunks` | `calculateBamCoverageByInterval`
`--remove-mapq0` | `mapqFilter` | `ScanBamParam`
`--skip-gc-norm` | | `correctCoverageBias`
`--out-dir` | |
`--cores` | | Number of CPUs to use when multiple BAMs are provided
`--parallel` | | Use default `r Biocpkg("BiocParallel")` backend when multiple BAMs are provided
`--seed` | |
Expand Down

0 comments on commit f058d5e

Please sign in to comment.