Fixes to agglomeration (microbiome#556)

Signed-off-by: Daena Rys <[email protected]> Co-authored-by: Leo Lahti <[email protected]> Co-authored-by: Daena Rys <[email protected]>
Daenarys8 · Jun 11, 2024 · 9546b55 · 9546b55
1 parent ba42574
commit 9546b55
Show file tree

Hide file tree

Showing 8 changed files with 236 additions and 239 deletions.
diff --git a/R/agglomerate.R b/R/agglomerate.R
@@ -45,20 +45,32 @@
 #'   to \code{getPrevalence} and \code{getPrevalentTaxa} and used in
 #'   \code{agglomeratebyPrevalence}
 #'   \itemize{
-#'        \item \code{remove_empty_ranks}: A single boolean value for selecting 
+#'        \item \code{remove_empty_ranks}: A single boolean value for selecting
 #'        whether to remove those columns of rowData that include only NAs after
 #'        agglomeration. (By default: \code{remove_empty_ranks = FALSE})
-#'        \item \code{make_unique}: A single boolean value for selecting 
+#'        \item \code{make_unique}: A single boolean value for selecting
 #'        whether to make rownames unique. (By default: \code{make_unique = TRUE})
-#'        \item \code{detection}: Detection threshold for absence/presence. 
-#'        Either an absolute value compared directly to the values of \code{x} 
+#'        \item \code{detection}: Detection threshold for absence/presence.
+#'        Either an absolute value compared directly to the values of \code{x}
 #'        or a relative value between 0 and 1, if \code{as_relative = FALSE}.
-#'        \item \code{prevalence}: Prevalence threshold (in 0 to 1). The 
-#'        required prevalence is strictly greater by default. To include the 
+#'        \item \code{prevalence}: Prevalence threshold (in 0 to 1). The
+#'        required prevalence is strictly greater by default. To include the
 #'        limit, set \code{include_lowest} to \code{TRUE}.
-#'        \item \code{as.relative}: Logical scalar: Should the detection 
-#'        threshold be applied on compositional (relative) abundances? 
+#'        \item \code{as.relative}: Logical scalar: Should the detection
+#'        threshold be applied on compositional (relative) abundances?
 #'        (default: \code{FALSE})
+#'        \item \code{mergeRefSeq} \code{TRUE} or \code{FALSE}: Should a
+#'        consensus sequence be calculated? If set to \code{FALSE}, the result
+#'        from \code{archetype} is returned; If set to \code{TRUE} the result
+#'        from
+#'        \code{\link[DECIPHER:ConsensusSequence]{DECIPHER::ConsensusSequence}}
+#'        is returned. (Default: \code{mergeRefSeq = FALSE})
+#'        \item \code{archetype} Of each level of \code{f}, which element should
+#'        be regarded as the archetype and metadata in the columns or rows kept,
+#'        while merging? This can be single integer value or an integer vector
+#'        of the same length as \code{levels(f)}. (Default:
+#'        \code{archetype = 1L}, which means the first element encountered per
+#'        factor level will be kept)
 #'    }
 #'
 #' @param altexp String or integer scalar specifying an alternative experiment
@@ -78,28 +90,16 @@
 #'   merged. If \code{length(levels(f)) == nrow(x)/ncol(x)}, \code{x} will be
 #'   returned unchanged.
 #'
-#' @param archetype Of each level of \code{f}, which element should be regarded
-#'   as the archetype and metadata in the columns or rows kept, while merging?
-#'   This can be single integer value or an integer vector of the same length
-#'   as \code{levels(f)}. (Default: \code{archetype = 1L}, which means the first
-#'   element encountered per factor level will be kept)
-#'
 #' @param mergeTree \code{TRUE} or \code{FALSE}: Should
 #'   \code{rowTree()} also be merged? (Default: \code{mergeTree = FALSE})
 #'
-#' @param mergeRefSeq \code{TRUE} or \code{FALSE}: Should a consensus sequence
-#'   be calculated? If set to \code{FALSE}, the result from \code{archetype} is
-#'   returned; If set to \code{TRUE} the result from
-#'   \code{\link[DECIPHER:ConsensusSequence]{DECIPHER::ConsensusSequence}} is
-#'   returned. (Default: \code{mergeRefSeq = FALSE})
-#'
 #' @details
 #' 
 #' Agglomeration sums up the values of assays at the specified taxonomic level. With
 #' certain assays, e.g. those that include binary or negative values, this summing
 #' can produce meaningless values. In those cases, consider performing agglomeration
 #' first, and then applying the transformation afterwards.
-#' 
+#'
 #' \code{agglomerateByVariable} works similarly to
 #' \code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}.
 #' However, additional support for \code{TreeSummarizedExperiment} was added and
@@ -108,12 +108,12 @@
 #'
 #' For merge data of assays the function from \code{scuttle} are used.
 #'
-#' @return 
-#' \code{agglomerateByRank} returns a taxonomically-agglomerated, 
+#' @return
+#' \code{agglomerateByRank} returns a taxonomically-agglomerated,
 #' optionally-pruned object of the same class as \code{x}.
-#' \code{agglomerateByVariable} returns an object of the same class as \code{x} 
+#' \code{agglomerateByVariable} returns an object of the same class as \code{x}
 #' with the specified entries merged into one entry in all relevant components.
-#' \code{agglomerateByRank} returns a taxonomically-agglomerated, 
+#' \code{agglomerateByRank} returns a taxonomically-agglomerated,
 #' optionally-pruned object of the same class as \code{x}.
 #'
 #' @name agglomerate-methods
@@ -232,7 +232,7 @@ setGeneric("agglomerateByVariable",
 #'
 #' @export
 setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"),
-    function(x, rank = taxonomyRanks(x)[1], na.rm = FALSE,
+    function(x, rank = taxonomyRanks(x)[1], na.rm = TRUE,
         empty.fields = c(NA, "", " ", "\t", "-", "_"), ...){
         # input check
         if(nrow(x) == 0L){
@@ -273,9 +273,14 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"),
 
         # get groups of taxonomy entries
         tax_factors <- .get_tax_groups(x, col = col, ...)
+        # Convert to factors. Use na.rm so that NA values are not preserved.
+        # i.e. they are not convrted into character values.
+        # NA values are handled earlier in this function.
+        tax_factors <- .norm_f(nrow(x), tax_factors, na.rm = TRUE)
 
         # merge taxa
-        x <- agglomerateByVariable(x, MARGIN = "rows", f = tax_factors, ...)
+        x <- agglomerateByVariable(
+            x, MARGIN = "rows", f = tax_factors, na.rm = TRUE, ...)
 
         # "Empty" the values to the right of the rank, using NA_character_.
         if( col < length(taxonomyRanks(x)) ){
@@ -303,10 +308,11 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"),
 #' @aliases agglomerateByVariable
 #' @export
 setMethod("agglomerateByVariable", signature = c(x = "SummarizedExperiment"),
-            function(x, MARGIN, f, archetype = 1L, ...){
+            function(x, MARGIN, f, ...){
                 MARGIN <- .check_MARGIN(MARGIN)
-                FUN <- switch(MARGIN, .merge_rows_SE, .merge_cols_SE)
-                FUN(x, f, archetype = archetype, ...)
+                FUN <- switch(MARGIN, .merge_rows, .merge_cols)
+                x <- FUN(x, f, ...)
+                return(x)
             }
 )
 
@@ -315,17 +321,14 @@ setMethod("agglomerateByVariable", signature = c(x = "SummarizedExperiment"),
 #' @export
 setMethod("agglomerateByVariable",
             signature = c(x = "TreeSummarizedExperiment"),
-            function(x, MARGIN, f, archetype = 1L, mergeTree = FALSE,
-                     mergeRefSeq = FALSE, ...){
+            function(x, MARGIN, f, mergeTree = FALSE, ...){
+                # Check MARGIN
                 MARGIN <- .check_MARGIN(MARGIN)
-                if ( MARGIN == 1L ){
-                    .merge_rows_TSE(x, f, archetype = 1L, mergeTree = mergeTree,
-                                   mergeRefSeq = mergeRefSeq, ...)
-                }
-                else{
-                    .merge_cols_TSE(x, f, archetype = 1L, mergeTree = mergeTree,
-                                   ...)
-                }
+                # Get function based on MARGIN
+                FUN <- switch(MARGIN, .merge_rows_TSE, .merge_cols_TSE)
+                # Agglomerate
+                x <- FUN(x, f, mergeTree = mergeTree, ...)
+                return(x)
             }
 )
 
@@ -507,4 +510,4 @@ setMethod(
         tree <- collapse.singles(tree)
     }
     return(tree)
-}
+}
diff --git a/R/getPrevalence.R b/R/getPrevalence.R
@@ -42,11 +42,11 @@
 #'   \code{\link[=agglomerate-methods]{?agglomerateByRank}} for more details.
 #'   Note that you can specify whether to remove empty ranks with
 #'   \code{agg.na.rm} instead of \code{na.rm}. (default: \code{FALSE})
-#'     
+#'
 #'   \item for \code{getPrevalent}, \code{getRare}, \code{subsetByPrevalent}
 #'   and \code{subsetByRare} additional parameters passed to
 #'   \code{getPrevalence}
-#'   
+#'
 #'   \item for \code{getPrevalentAbundance} additional parameters passed to
 #'   \code{getPrevalent}
 #' }
@@ -64,27 +64,27 @@
 #' proportion of the core species (in between 0 and 1). The core taxa are
 #' defined as those that exceed the given population prevalence threshold at the
 #' given detection level as set for \code{getPrevalent}.
-#' 
+#'
 #' \code{subsetPrevalent} and \code{subsetRareFeatures} return a subset of
-#' \code{x}. 
-#' The subset includes the most prevalent or rare taxa that are calculated with 
+#' \code{x}.
+#' The subset includes the most prevalent or rare taxa that are calculated with
 #' \code{getPrevalent} or \code{getRare} respectively.
 #'
 #' @return
 #' \code{subsetPrevalent} and \code{subsetRareFeatures} return subset of
 #' \code{x}.
-#' 
+#'
 #' All other functions return a named vectors:
 #' \itemize{
-#'   \item \code{getPrevalence} returns a \code{numeric} vector with the 
-#'     names being set to either the row names of \code{x} or the names after 
+#'   \item \code{getPrevalence} returns a \code{numeric} vector with the
+#'     names being set to either the row names of \code{x} or the names after
 #'     agglomeration.
-#'   
+#'
 #'   \item \code{getPrevalentAbundance} returns a \code{numeric} vector with
-#'     the names corresponding to the column name of \code{x} and include the 
+#'     the names corresponding to the column name of \code{x} and include the
 #'     joint abundance of prevalent taxa.
-#'   
-#'   \item \code{getPrevalent} and \code{getRare} return a 
+#'
+#'   \item \code{getPrevalent} and \code{getRare} return a
 #'     \code{character} vector with only the names exceeding the threshold set
 #'     by \code{prevalence}, if the \code{rownames} of \code{x} is set.
 #'     Otherwise an \code{integer} vector is returned matching the rows in
@@ -153,7 +153,7 @@
 #'                                              detection = 0.001,
 #'                                              prevalence = 0.55,
 #'                                              as_relative = TRUE)
-#' altExp(tse, "prevalent")                                 
+#' altExp(tse, "prevalent")
 #'
 #' # getRare returns the inverse
 #' rare <- getRare(tse,
@@ -169,8 +169,8 @@
 #'                                     detection = 0.001,
 #'                                     prevalence = 0.001,
 #'                                     as_relative = TRUE)
-#' altExp(tse, "rare")      
-#' 
+#' altExp(tse, "rare")
+#'
 #' # Names of both experiments, prevalent and rare, can be found from slot
 #' # altExpNames
 #' tse
@@ -236,7 +236,7 @@ setMethod("getPrevalence", signature = c(x = "ANY"), function(
 
 .agg_for_prevalence <- function(
         x, rank, relabel = FALSE, make_unique = TRUE, na.rm = FALSE,
-        agg.na.rm = FALSE, ...){
+        agg.na.rm = TRUE, ...){
     # Check na.rm. It is not used in this function, it is only catched so that
     # it can be passed to getPrevalence(matrix) and not use it here in
     # agglomerateByRank function.
@@ -295,7 +295,7 @@ setMethod("getPrevalence", signature = c(x = "SummarizedExperiment"),
 #' given detection threshold for the selected taxonomic rank.
 #'
 #' @aliases getPrevalent
-#' 
+#'
 #' @export
 setGeneric("getPrevalent", signature = "x",
            function(x, ...)
@@ -376,7 +376,7 @@ setMethod("getPrevalent", signature = c(x = "ANY"),
 #' @rdname getPrevalence
 #' @export
 setMethod("getPrevalent", signature = c(x = "SummarizedExperiment"),
-    function(x, rank = NULL, prevalence = 50/100, 
+    function(x, rank = NULL, prevalence = 50/100,
              include_lowest = FALSE, ...){
         .get_prevalent_taxa(x, rank = rank, prevalence = prevalence,
                             include_lowest = include_lowest, ...)
@@ -389,7 +389,7 @@ setMethod("getPrevalent", signature = c(x = "SummarizedExperiment"),
 #'
 #' @details
 #' \code{getRare} returns complement of \code{getPrevalent}.
-#' 
+#'
 #' @export
 setGeneric("getRare", signature = "x",
            function(x, ...)
@@ -432,7 +432,7 @@ setMethod("getRare", signature = c(x = "ANY"),
 #' @rdname getPrevalence
 #' @export
 setMethod("getRare", signature = c(x = "SummarizedExperiment"),
-    function(x, rank = NULL, prevalence = 50/100, 
+    function(x, rank = NULL, prevalence = 50/100,
              include_lowest = FALSE, ...){
         .get_rare_taxa(x, rank = rank, prevalence = prevalence,
                        include_lowest = include_lowest, ...)
@@ -571,8 +571,8 @@ setMethod("agglomerateByPrevalence", signature = c(x = "SummarizedExperiment"),
         pr <- getPrevalent(x, rank = NULL, ...)
         f <- rownames(x) %in% pr
         if(any(!f)){
-            other_x <- agglomerateByVariable(x[!f,], MARGIN = "rows", 
-                                            factor(rep(1L,sum(!f))), 
+            other_x <- agglomerateByVariable(x[!f,], MARGIN = "rows",
+                                            factor(rep(1L,sum(!f))),
                                             check_assays = FALSE)
             rowData(other_x)[,colnames(rowData(other_x))] <- NA
             # set the other label

diff --git a/R/merge.R b/R/merge.R
@@ -1,8 +1,8 @@
 .norm_f <- function(i, f, dim.type = c("rows","columns"), na.rm = FALSE, ...){
-    dim.type <- match.arg(dim.type)
     if(!.is_a_bool(na.rm)){
-      stop("'na.rm' must be TRUE or FALSE.", call. = FALSE)
+        stop("'na.rm' must be TRUE or FALSE.", call. = FALSE)
     }
+    dim.type <- match.arg(dim.type)
     if(!is.character(f) && !is.factor(f)){
         stop("'f' must be a factor or character vector coercible to a ",
             "meaningful factor.",
@@ -13,7 +13,7 @@
             call. = FALSE)
     }
     # This is done otherwise we lose NA values
-    if (!na.rm && any(is.na(f))) {
+    if( !na.rm && any(is.na(f)) ){
         f <- as.character(f)
         f[ is.na(f) ] <- "NA"
     }
@@ -75,7 +75,7 @@
     if( .is_a_string(f) && f %in% colnames(rowData(x)) ){
         f <- rowData(x)[[ f ]]
     }
-    f <- .norm_f(nrow(x), f)
+    f <- .norm_f(nrow(x), f, ...)
     if(length(levels(f)) == nrow(x)){
         return(x)
     }
@@ -128,7 +128,7 @@
     if( .is_a_string(f) && f %in% colnames(colData(x)) ){
       f <- colData(x)[[ f ]]
     }
-    f <- .norm_f(ncol(x), f, "columns")
+    f <- .norm_f(ncol(x), f, "columns", ...)
 
     if(length(levels(f)) == ncol(x)){
         return(x)
@@ -163,14 +163,6 @@
     x
 }
 
-.merge_rows_SE <- function(x, f, archetype = 1L, ...){
-    .merge_rows(x, f, archetype = archetype, ...)
-}
-
-.merge_cols_SE <- function(x, f, archetype = 1L, ...){
-    .merge_cols(x, f, archetype = archetype, ...)
-}
-
 #' @importFrom Biostrings DNAStringSetList
 .merge_refseq_list <- function(sequences_list, f, names, ...){
     threshold <- list(...)[["threshold"]]
@@ -211,7 +203,7 @@
         refSeq <- referenceSeq(x)
     }
     #
-    x <- .merge_rows_SE(x, f, archetype = 1L, ...)
+    x <- .merge_rows(x, f, archetype = 1L, ...)
     # optionally merge rowTree
     if( mergeTree ){
         x <- .agglomerate_trees(x, 1)
@@ -229,7 +221,7 @@
         stop("'mergeTree' must be TRUE or FALSE.", call. = FALSE)
     }
     #
-    x <- .merge_cols_SE(x, f, archetype = 1L, ...)
+    x <- .merge_cols(x, f, archetype = 1L, ...)
     # optionally merge colTree
     if( mergeTree ){
         x <- .agglomerate_trees(x, 2)