Skip to content

Commit

Permalink
Fixes to agglomeration (microbiome#556)
Browse files Browse the repository at this point in the history
Signed-off-by: Daena Rys <[email protected]>
Co-authored-by: Leo Lahti <[email protected]>
Co-authored-by: Daena Rys <[email protected]>
  • Loading branch information
3 people authored Jun 11, 2024
1 parent ba42574 commit 9546b55
Show file tree
Hide file tree
Showing 8 changed files with 236 additions and 239 deletions.
85 changes: 44 additions & 41 deletions R/agglomerate.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,20 +45,32 @@
#' to \code{getPrevalence} and \code{getPrevalentTaxa} and used in
#' \code{agglomeratebyPrevalence}
#' \itemize{
#' \item \code{remove_empty_ranks}: A single boolean value for selecting
#' \item \code{remove_empty_ranks}: A single boolean value for selecting
#' whether to remove those columns of rowData that include only NAs after
#' agglomeration. (By default: \code{remove_empty_ranks = FALSE})
#' \item \code{make_unique}: A single boolean value for selecting
#' \item \code{make_unique}: A single boolean value for selecting
#' whether to make rownames unique. (By default: \code{make_unique = TRUE})
#' \item \code{detection}: Detection threshold for absence/presence.
#' Either an absolute value compared directly to the values of \code{x}
#' \item \code{detection}: Detection threshold for absence/presence.
#' Either an absolute value compared directly to the values of \code{x}
#' or a relative value between 0 and 1, if \code{as_relative = FALSE}.
#' \item \code{prevalence}: Prevalence threshold (in 0 to 1). The
#' required prevalence is strictly greater by default. To include the
#' \item \code{prevalence}: Prevalence threshold (in 0 to 1). The
#' required prevalence is strictly greater by default. To include the
#' limit, set \code{include_lowest} to \code{TRUE}.
#' \item \code{as.relative}: Logical scalar: Should the detection
#' threshold be applied on compositional (relative) abundances?
#' \item \code{as.relative}: Logical scalar: Should the detection
#' threshold be applied on compositional (relative) abundances?
#' (default: \code{FALSE})
#' \item \code{mergeRefSeq} \code{TRUE} or \code{FALSE}: Should a
#' consensus sequence be calculated? If set to \code{FALSE}, the result
#' from \code{archetype} is returned; If set to \code{TRUE} the result
#' from
#' \code{\link[DECIPHER:ConsensusSequence]{DECIPHER::ConsensusSequence}}
#' is returned. (Default: \code{mergeRefSeq = FALSE})
#' \item \code{archetype} Of each level of \code{f}, which element should
#' be regarded as the archetype and metadata in the columns or rows kept,
#' while merging? This can be single integer value or an integer vector
#' of the same length as \code{levels(f)}. (Default:
#' \code{archetype = 1L}, which means the first element encountered per
#' factor level will be kept)
#' }
#'
#' @param altexp String or integer scalar specifying an alternative experiment
Expand All @@ -78,28 +90,16 @@
#' merged. If \code{length(levels(f)) == nrow(x)/ncol(x)}, \code{x} will be
#' returned unchanged.
#'
#' @param archetype Of each level of \code{f}, which element should be regarded
#' as the archetype and metadata in the columns or rows kept, while merging?
#' This can be single integer value or an integer vector of the same length
#' as \code{levels(f)}. (Default: \code{archetype = 1L}, which means the first
#' element encountered per factor level will be kept)
#'
#' @param mergeTree \code{TRUE} or \code{FALSE}: Should
#' \code{rowTree()} also be merged? (Default: \code{mergeTree = FALSE})
#'
#' @param mergeRefSeq \code{TRUE} or \code{FALSE}: Should a consensus sequence
#' be calculated? If set to \code{FALSE}, the result from \code{archetype} is
#' returned; If set to \code{TRUE} the result from
#' \code{\link[DECIPHER:ConsensusSequence]{DECIPHER::ConsensusSequence}} is
#' returned. (Default: \code{mergeRefSeq = FALSE})
#'
#' @details
#'
#' Agglomeration sums up the values of assays at the specified taxonomic level. With
#' certain assays, e.g. those that include binary or negative values, this summing
#' can produce meaningless values. In those cases, consider performing agglomeration
#' first, and then applying the transformation afterwards.
#'
#'
#' \code{agglomerateByVariable} works similarly to
#' \code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}.
#' However, additional support for \code{TreeSummarizedExperiment} was added and
Expand All @@ -108,12 +108,12 @@
#'
#' For merge data of assays the function from \code{scuttle} are used.
#'
#' @return
#' \code{agglomerateByRank} returns a taxonomically-agglomerated,
#' @return
#' \code{agglomerateByRank} returns a taxonomically-agglomerated,
#' optionally-pruned object of the same class as \code{x}.
#' \code{agglomerateByVariable} returns an object of the same class as \code{x}
#' \code{agglomerateByVariable} returns an object of the same class as \code{x}
#' with the specified entries merged into one entry in all relevant components.
#' \code{agglomerateByRank} returns a taxonomically-agglomerated,
#' \code{agglomerateByRank} returns a taxonomically-agglomerated,
#' optionally-pruned object of the same class as \code{x}.
#'
#' @name agglomerate-methods
Expand Down Expand Up @@ -232,7 +232,7 @@ setGeneric("agglomerateByVariable",
#'
#' @export
setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"),
function(x, rank = taxonomyRanks(x)[1], na.rm = FALSE,
function(x, rank = taxonomyRanks(x)[1], na.rm = TRUE,
empty.fields = c(NA, "", " ", "\t", "-", "_"), ...){
# input check
if(nrow(x) == 0L){
Expand Down Expand Up @@ -273,9 +273,14 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"),

# get groups of taxonomy entries
tax_factors <- .get_tax_groups(x, col = col, ...)
# Convert to factors. Use na.rm so that NA values are not preserved.
# i.e. they are not convrted into character values.
# NA values are handled earlier in this function.
tax_factors <- .norm_f(nrow(x), tax_factors, na.rm = TRUE)

# merge taxa
x <- agglomerateByVariable(x, MARGIN = "rows", f = tax_factors, ...)
x <- agglomerateByVariable(
x, MARGIN = "rows", f = tax_factors, na.rm = TRUE, ...)

# "Empty" the values to the right of the rank, using NA_character_.
if( col < length(taxonomyRanks(x)) ){
Expand Down Expand Up @@ -303,10 +308,11 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"),
#' @aliases agglomerateByVariable
#' @export
setMethod("agglomerateByVariable", signature = c(x = "SummarizedExperiment"),
function(x, MARGIN, f, archetype = 1L, ...){
function(x, MARGIN, f, ...){
MARGIN <- .check_MARGIN(MARGIN)
FUN <- switch(MARGIN, .merge_rows_SE, .merge_cols_SE)
FUN(x, f, archetype = archetype, ...)
FUN <- switch(MARGIN, .merge_rows, .merge_cols)
x <- FUN(x, f, ...)
return(x)
}
)

Expand All @@ -315,17 +321,14 @@ setMethod("agglomerateByVariable", signature = c(x = "SummarizedExperiment"),
#' @export
setMethod("agglomerateByVariable",
signature = c(x = "TreeSummarizedExperiment"),
function(x, MARGIN, f, archetype = 1L, mergeTree = FALSE,
mergeRefSeq = FALSE, ...){
function(x, MARGIN, f, mergeTree = FALSE, ...){
# Check MARGIN
MARGIN <- .check_MARGIN(MARGIN)
if ( MARGIN == 1L ){
.merge_rows_TSE(x, f, archetype = 1L, mergeTree = mergeTree,
mergeRefSeq = mergeRefSeq, ...)
}
else{
.merge_cols_TSE(x, f, archetype = 1L, mergeTree = mergeTree,
...)
}
# Get function based on MARGIN
FUN <- switch(MARGIN, .merge_rows_TSE, .merge_cols_TSE)
# Agglomerate
x <- FUN(x, f, mergeTree = mergeTree, ...)
return(x)
}
)

Expand Down Expand Up @@ -507,4 +510,4 @@ setMethod(
tree <- collapse.singles(tree)
}
return(tree)
}
}
44 changes: 22 additions & 22 deletions R/getPrevalence.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,11 @@
#' \code{\link[=agglomerate-methods]{?agglomerateByRank}} for more details.
#' Note that you can specify whether to remove empty ranks with
#' \code{agg.na.rm} instead of \code{na.rm}. (default: \code{FALSE})
#'
#'
#' \item for \code{getPrevalent}, \code{getRare}, \code{subsetByPrevalent}
#' and \code{subsetByRare} additional parameters passed to
#' \code{getPrevalence}
#'
#'
#' \item for \code{getPrevalentAbundance} additional parameters passed to
#' \code{getPrevalent}
#' }
Expand All @@ -64,27 +64,27 @@
#' proportion of the core species (in between 0 and 1). The core taxa are
#' defined as those that exceed the given population prevalence threshold at the
#' given detection level as set for \code{getPrevalent}.
#'
#'
#' \code{subsetPrevalent} and \code{subsetRareFeatures} return a subset of
#' \code{x}.
#' The subset includes the most prevalent or rare taxa that are calculated with
#' \code{x}.
#' The subset includes the most prevalent or rare taxa that are calculated with
#' \code{getPrevalent} or \code{getRare} respectively.
#'
#' @return
#' \code{subsetPrevalent} and \code{subsetRareFeatures} return subset of
#' \code{x}.
#'
#'
#' All other functions return a named vectors:
#' \itemize{
#' \item \code{getPrevalence} returns a \code{numeric} vector with the
#' names being set to either the row names of \code{x} or the names after
#' \item \code{getPrevalence} returns a \code{numeric} vector with the
#' names being set to either the row names of \code{x} or the names after
#' agglomeration.
#'
#'
#' \item \code{getPrevalentAbundance} returns a \code{numeric} vector with
#' the names corresponding to the column name of \code{x} and include the
#' the names corresponding to the column name of \code{x} and include the
#' joint abundance of prevalent taxa.
#'
#' \item \code{getPrevalent} and \code{getRare} return a
#'
#' \item \code{getPrevalent} and \code{getRare} return a
#' \code{character} vector with only the names exceeding the threshold set
#' by \code{prevalence}, if the \code{rownames} of \code{x} is set.
#' Otherwise an \code{integer} vector is returned matching the rows in
Expand Down Expand Up @@ -153,7 +153,7 @@
#' detection = 0.001,
#' prevalence = 0.55,
#' as_relative = TRUE)
#' altExp(tse, "prevalent")
#' altExp(tse, "prevalent")
#'
#' # getRare returns the inverse
#' rare <- getRare(tse,
Expand All @@ -169,8 +169,8 @@
#' detection = 0.001,
#' prevalence = 0.001,
#' as_relative = TRUE)
#' altExp(tse, "rare")
#'
#' altExp(tse, "rare")
#'
#' # Names of both experiments, prevalent and rare, can be found from slot
#' # altExpNames
#' tse
Expand Down Expand Up @@ -236,7 +236,7 @@ setMethod("getPrevalence", signature = c(x = "ANY"), function(

.agg_for_prevalence <- function(
x, rank, relabel = FALSE, make_unique = TRUE, na.rm = FALSE,
agg.na.rm = FALSE, ...){
agg.na.rm = TRUE, ...){
# Check na.rm. It is not used in this function, it is only catched so that
# it can be passed to getPrevalence(matrix) and not use it here in
# agglomerateByRank function.
Expand Down Expand Up @@ -295,7 +295,7 @@ setMethod("getPrevalence", signature = c(x = "SummarizedExperiment"),
#' given detection threshold for the selected taxonomic rank.
#'
#' @aliases getPrevalent
#'
#'
#' @export
setGeneric("getPrevalent", signature = "x",
function(x, ...)
Expand Down Expand Up @@ -376,7 +376,7 @@ setMethod("getPrevalent", signature = c(x = "ANY"),
#' @rdname getPrevalence
#' @export
setMethod("getPrevalent", signature = c(x = "SummarizedExperiment"),
function(x, rank = NULL, prevalence = 50/100,
function(x, rank = NULL, prevalence = 50/100,
include_lowest = FALSE, ...){
.get_prevalent_taxa(x, rank = rank, prevalence = prevalence,
include_lowest = include_lowest, ...)
Expand All @@ -389,7 +389,7 @@ setMethod("getPrevalent", signature = c(x = "SummarizedExperiment"),
#'
#' @details
#' \code{getRare} returns complement of \code{getPrevalent}.
#'
#'
#' @export
setGeneric("getRare", signature = "x",
function(x, ...)
Expand Down Expand Up @@ -432,7 +432,7 @@ setMethod("getRare", signature = c(x = "ANY"),
#' @rdname getPrevalence
#' @export
setMethod("getRare", signature = c(x = "SummarizedExperiment"),
function(x, rank = NULL, prevalence = 50/100,
function(x, rank = NULL, prevalence = 50/100,
include_lowest = FALSE, ...){
.get_rare_taxa(x, rank = rank, prevalence = prevalence,
include_lowest = include_lowest, ...)
Expand Down Expand Up @@ -571,8 +571,8 @@ setMethod("agglomerateByPrevalence", signature = c(x = "SummarizedExperiment"),
pr <- getPrevalent(x, rank = NULL, ...)
f <- rownames(x) %in% pr
if(any(!f)){
other_x <- agglomerateByVariable(x[!f,], MARGIN = "rows",
factor(rep(1L,sum(!f))),
other_x <- agglomerateByVariable(x[!f,], MARGIN = "rows",
factor(rep(1L,sum(!f))),
check_assays = FALSE)
rowData(other_x)[,colnames(rowData(other_x))] <- NA
# set the other label
Expand Down
22 changes: 7 additions & 15 deletions R/merge.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
.norm_f <- function(i, f, dim.type = c("rows","columns"), na.rm = FALSE, ...){
dim.type <- match.arg(dim.type)
if(!.is_a_bool(na.rm)){
stop("'na.rm' must be TRUE or FALSE.", call. = FALSE)
stop("'na.rm' must be TRUE or FALSE.", call. = FALSE)
}
dim.type <- match.arg(dim.type)
if(!is.character(f) && !is.factor(f)){
stop("'f' must be a factor or character vector coercible to a ",
"meaningful factor.",
Expand All @@ -13,7 +13,7 @@
call. = FALSE)
}
# This is done otherwise we lose NA values
if (!na.rm && any(is.na(f))) {
if( !na.rm && any(is.na(f)) ){
f <- as.character(f)
f[ is.na(f) ] <- "NA"
}
Expand Down Expand Up @@ -75,7 +75,7 @@
if( .is_a_string(f) && f %in% colnames(rowData(x)) ){
f <- rowData(x)[[ f ]]
}
f <- .norm_f(nrow(x), f)
f <- .norm_f(nrow(x), f, ...)
if(length(levels(f)) == nrow(x)){
return(x)
}
Expand Down Expand Up @@ -128,7 +128,7 @@
if( .is_a_string(f) && f %in% colnames(colData(x)) ){
f <- colData(x)[[ f ]]
}
f <- .norm_f(ncol(x), f, "columns")
f <- .norm_f(ncol(x), f, "columns", ...)

if(length(levels(f)) == ncol(x)){
return(x)
Expand Down Expand Up @@ -163,14 +163,6 @@
x
}

.merge_rows_SE <- function(x, f, archetype = 1L, ...){
.merge_rows(x, f, archetype = archetype, ...)
}

.merge_cols_SE <- function(x, f, archetype = 1L, ...){
.merge_cols(x, f, archetype = archetype, ...)
}

#' @importFrom Biostrings DNAStringSetList
.merge_refseq_list <- function(sequences_list, f, names, ...){
threshold <- list(...)[["threshold"]]
Expand Down Expand Up @@ -211,7 +203,7 @@
refSeq <- referenceSeq(x)
}
#
x <- .merge_rows_SE(x, f, archetype = 1L, ...)
x <- .merge_rows(x, f, archetype = 1L, ...)
# optionally merge rowTree
if( mergeTree ){
x <- .agglomerate_trees(x, 1)
Expand All @@ -229,7 +221,7 @@
stop("'mergeTree' must be TRUE or FALSE.", call. = FALSE)
}
#
x <- .merge_cols_SE(x, f, archetype = 1L, ...)
x <- .merge_cols(x, f, archetype = 1L, ...)
# optionally merge colTree
if( mergeTree ){
x <- .agglomerate_trees(x, 2)
Expand Down
Loading

0 comments on commit 9546b55

Please sign in to comment.