Skip to content

Commit

Permalink
getPrevalence, NA values (microbiome#486)
Browse files Browse the repository at this point in the history
  • Loading branch information
TuomasBorman authored Mar 5, 2024
1 parent 6a356cd commit aa1d488
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 12 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: mia
Type: Package
Version: 1.11.7
Version: 1.11.8
Authors@R:
c(person(given = "Felix G.M.", family = "Ernst", role = c("aut"),
email = "[email protected]",
Expand Down
1 change: 1 addition & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,4 @@ Changes in version 1.11.x
+ perSampleDominantFeatures: add new arguments (n, other.name, complete)
+ loadFromMetaphlan: support "taxonomy" column for specifying taxonomy
+ cluster: Overwrite old results instead of failing
+ getPrevalence: bugfix, if assay contains NA values, it does not end up to NA anymore.
44 changes: 34 additions & 10 deletions R/getPrevalence.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,17 @@
#'
#' @param rank a single character defining a taxonomic rank. Must be a value of
#' \code{taxonomyRanks()} function.
#'
#'
#' @param na.rm logical scalar: Should NA values be omitted when calculating
#' prevalence? (default: \code{na.rm = TRUE})
#'
#' @param ... additional arguments
#' \itemize{
#' \item{If \code{!is.null(rank)} arguments are passed on to
#' \code{\link[=agglomerate-methods]{agglomerateByRank}}. See
#' \code{\link[=agglomerate-methods]{?agglomerateByRank}} for more details.
#' Note that you can specify whether to remove empty ranks with
#' \code{agg.na.rm} instead of \code{na.rm}. (default: \code{FALSE})
#' }
#' \item{for \code{getPrevalentFeatures}, \code{getRareFeatures},
#' \code{subsetByPrevalentFeatures} and \code{subsetByRareFeatures} additional
Expand Down Expand Up @@ -186,16 +191,19 @@ setGeneric("getPrevalence", signature = "x",

#' @rdname getPrevalence
#' @export
setMethod("getPrevalence", signature = c(x = "ANY"),
function(x, detection = 0, include_lowest = FALSE, sort = FALSE, ...){

setMethod("getPrevalence", signature = c(x = "ANY"), function(
x, detection = 0, include_lowest = FALSE, sort = FALSE, na.rm = TRUE, ...){
# input check
if (!.is_numeric_string(detection)) {
stop("'detection' must be a single numeric value or coercible to ",
"one.",
call. = FALSE)
}

#
if(!.is_a_bool(na.rm)){
stop("'na.rm' must be TRUE or FALSE.", call. = FALSE)
}
#
detection <- as.numeric(detection)
if(!.is_a_bool(include_lowest)){
stop("'include_lowest' must be TRUE or FALSE.", call. = FALSE)
Expand All @@ -204,13 +212,21 @@ setMethod("getPrevalence", signature = c(x = "ANY"),
stop("'sort' must be TRUE or FALSE.", call. = FALSE)
}
#

# Give warning if there are taxa with NA values
if( any( is.na(x) ) ){
msg <- paste0(
"The abundance table contains NA values and they are",
ifelse(na.rm, " not ", " "), "excluded (see 'na.rm').")
warning(msg, call. = FALSE)
}
#
if (include_lowest) {
prev <- x >= detection
} else {
prev <- x > detection
}
prev <- rowSums(prev)
# Calculate prevalence for each taxa
prev <- rowSums(prev, na.rm = na.rm)
# Always return prevalence as a relative frequency.
# This helps to avoid confusion with detection limit
prev <- prev / ncol(x)
Expand All @@ -222,15 +238,23 @@ setMethod("getPrevalence", signature = c(x = "ANY"),
)

.agg_for_prevalence <- function(
x, rank, relabel = FALSE, make_unique = TRUE, na.rm = FALSE, ...){
# Check na.rm
x, rank, relabel = FALSE, make_unique = TRUE, na.rm = FALSE,
agg.na.rm = FALSE, ...){
# Check na.rm. It is not used in this function, it is only catched so that
# it can be passed to getPrevalence(matrix) and not use it here in
# agglomerateByRank function.
if(!.is_a_bool(na.rm)){
stop("'na.rm' must be TRUE or FALSE.", call. = FALSE)
}
#
# Check drop.empty.rank
if(!.is_a_bool(agg.na.rm)){
stop("'agg.na.rm' must be TRUE or FALSE.", call. = FALSE)
}
#
if(!is.null(rank)){
.check_taxonomic_rank(rank, x)
args <- c(list(x = x, rank = rank, na.rm = na.rm), list(...))
args <- c(list(x = x, rank = rank, na.rm = agg.na.rm), list(...))
argNames <- c("x","rank","onRankOnly","na.rm","empty.fields",
"archetype","mergeTree","average","BPPARAM")
args <- args[names(args) %in% argNames]
Expand Down
14 changes: 13 additions & 1 deletion man/getPrevalence.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

50 changes: 50 additions & 0 deletions tests/testthat/test-5prevalence.R
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,56 @@ test_that("getPrevalence", {
pr2 <- getPrevalence(gp_null, detection=0.004, as_relative=TRUE, rank = "Family")
expect_equal(pr1, pr2)

# Check that na.rm works correctly
tse <- GlobalPatterns
# Get reference value
ref <- getPrevalence(tse, assay.type = "counts")
# Add NA values to matrix
remove <- c(1, 3, 10)
assay(tse, "counts")[remove, ] <- NA
# There should be 3 NA values if na.rm = FALSE. Otherwise there should be 0
expect_warning(
res <- getPrevalence(tse, assay.type = "counts", na.rm = FALSE) )
expect_true( sum(is.na(res)) == 3)
expect_warning(
res <- getPrevalence(tse, assay.type = "counts", na.rm = TRUE) )
expect_true( sum(is.na(res)) == 0)
# Expect that other than features with NA values are the same as in reference
expect_warning(
res <- getPrevalence(tse, assay.type = "counts", na.rm = TRUE))
res <- res[ !names(res) %in% remove ]
ref <- ref[ !names(ref) %in% remove ]
expect_equal( res[ names(ref) ], res[ names(ref) ] )

# Now test that the number of samples where feature was detected is correct
tse <- GlobalPatterns
ref <- getPrevalence(tse, assay.type = "counts")
# Add NA values to specific feature that has non-zero value
feature <- rownames(tse)[[7]]
assay(tse, "counts")[feature, 1] <- NA
expect_warning(
res <- getPrevalence(tse, assay.type = "counts", na.rm = TRUE))
# Get the feature values and check that they have correct number of samples
res <- res[ feature ]
ref <- ref[ feature ]
expect_true(res*ncol(tse) == 2)
expect_true(ref*ncol(tse) == 3)

#
tse <- GlobalPatterns
rank <- "Genus"
# Add NA values to matrix
remove <- c(15, 200)
assay(tse, "counts")[remove, ] <- NA
# Check that agglomeration works
tse_agg <- agglomerateByRank(tse, rank = rank)
expect_warning(ref <- getPrevalence(tse_agg, na.rm = FALSE))
expect_warning(res <- getPrevalence(tse, na.rm = FALSE, rank = "Genus"))
expect_true( all(res == ref, na.rm = TRUE) )
#
expect_warning(ref <- getPrevalence(tse_agg, na.rm = TRUE))
expect_warning(res <- getPrevalence(tse, na.rm = TRUE, rank = "Genus"))
expect_true( all(res == ref, na.rm = TRUE) )
})


Expand Down

0 comments on commit aa1d488

Please sign in to comment.