From 13f7dcefca9da4eadfc866327e5810bcaec96eea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9otime=20Pralas?= <151254073+thpralas@users.noreply.github.com> Date: Wed, 8 May 2024 17:40:52 +0300 Subject: [PATCH] new generic function agglomerateByVariable (#507) Co-authored-by: Tuomas Borman <60338854+TuomasBorman@users.noreply.github.com> Co-authored-by: TuomasBorman --- DESCRIPTION | 2 +- NAMESPACE | 3 +- NEWS | 1 + R/agglomerate.R | 246 +++++++++++++++--------- R/deprecate.R | 192 ++++++++++++++++-- R/getPrevalence.R | 4 +- R/merge.R | 299 +++++++---------------------- R/splitByRanks.R | 2 +- R/splitOn.R | 7 +- R/utils.R | 6 +- man/agglomerate-methods.Rd | 134 +++++++++---- man/deprecate.Rd | 60 +++++- man/merge-methods.Rd | 121 ------------ man/splitByRanks.Rd | 2 +- man/splitOn.Rd | 7 +- tests/testthat/test-2merge.R | 31 ++- tests/testthat/test-3agglomerate.R | 6 +- 17 files changed, 592 insertions(+), 531 deletions(-) delete mode 100644 man/merge-methods.Rd diff --git a/DESCRIPTION b/DESCRIPTION index d4fdd9f96..a4ac31c39 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: mia Type: Package -Version: 1.13.2 +Version: 1.13.3 Authors@R: c(person(given = "Felix G.M.", family = "Ernst", role = c("aut"), email = "felix.gm.ernst@outlook.com", diff --git a/NAMESPACE b/NAMESPACE index 0c47fba0c..4424a74c5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,6 +13,7 @@ export(addPerSampleDominantTaxa) export(addTaxonomyTree) export(agglomerateByPrevalence) export(agglomerateByRank) +export(agglomerateByVariable) export(bestDMNFit) export(calculateDMN) export(calculateDMNgroup) @@ -124,6 +125,7 @@ exportMethods(addPerSampleDominantTaxa) exportMethods(addTaxonomyTree) exportMethods(agglomerateByPrevalence) exportMethods(agglomerateByRank) +exportMethods(agglomerateByVariable) exportMethods(bestDMNFit) exportMethods(calculateCCA) exportMethods(calculateDMN) @@ -288,7 +290,6 @@ importFrom(SummarizedExperiment,rowRanges) importFrom(ape,cophenetic.phylo) importFrom(ape,drop.tip) importFrom(ape,is.rooted) -importFrom(ape,keep.tip) importFrom(ape,node.depth) importFrom(ape,node.depth.edgelength) importFrom(ape,prop.part) diff --git a/NEWS b/NEWS index 0e675243e..b72e7755f 100644 --- a/NEWS +++ b/NEWS @@ -116,3 +116,4 @@ Changes in version 1.11.x Changes in version 1.13.x + Added new functions getMediation and addMediation + replace getExperiment* and testExperiment* functions with getCrossAssociation ++ Replace mergeRows and mergeCols with new function agglomerateByVariable diff --git a/R/agglomerate.R b/R/agglomerate.R index ab2581760..ac2026477 100644 --- a/R/agglomerate.R +++ b/R/agglomerate.R @@ -1,11 +1,24 @@ -#' Agglomerate data using taxonomic information -#' +#' Agglomerate or merge data using taxonomic information +#' #' Agglomeration functions can be used to sum-up data based on specific criteria #' such as taxonomic ranks, variables or prevalence. #' -#' @param x a -#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} -#' object +#' \code{agglomerateByRank} can be used to sum up data based on associations +#' with certain taxonomic ranks, as defined in \code{rowData}. Only available +#' \code{\link{taxonomyRanks}} can be used. +#' +#' \code{agglomerateByVariable} merges data on rows or columns of a +#' \code{SummarizedExperiment} as defined by a \code{factor} alongside the +#' chosen dimension. This function allows agglomeration of data based on other +#' variables than taxonomy ranks. +#' Metadata from the \code{rowData} or \code{colData} are +#' retained as defined by \code{archetype}. +#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{assay}} are +#' agglomerated, i.e. summed up. If the assay contains values other than counts +#' or absolute values, this can lead to meaningless values being produced. +#' +#' @param x a \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} or +#' a \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} #' #' @param rank a single character defining a taxonomic rank. Must be a value of #' \code{taxonomyRanks()} function. @@ -31,10 +44,10 @@ #' #' @param ... arguments passed to \code{agglomerateByRank} function for #' \code{SummarizedExperiment} objects, +#' to \code{\link[=agglomerate-methods]{agglomerateByVariable}} and +#' \code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}, #' to \code{getPrevalence} and \code{getPrevalentTaxa} and used in -#' \code{agglomeratebyPrevalence}, -#' to \code{\link[=merge-methods]{mergeRows}} and -#' \code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}. +#' \code{agglomeratebyPrevalence} #' \itemize{ #' \item{\code{remove_empty_ranks}}{A single boolean value for selecting #' whether to remove those columns of rowData that include only NAs after @@ -60,6 +73,30 @@ #' nested alternative experiments by default (default: #' \code{strip_altexp = TRUE}) #' +#' @param MARGIN A character value for selecting if data is merged +#' row-wise / for features ('rows') or column-wise / for samples ('cols'). +#' Must be \code{'rows'} or \code{'cols'}. +#' +#' @param f A factor for merging. Must be the same length as +#' \code{nrow(x)/ncol(x)}. Rows/Cols corresponding to the same level will be +#' merged. If \code{length(levels(f)) == nrow(x)/ncol(x)}, \code{x} will be +#' returned unchanged. +#' +#' @param archetype Of each level of \code{f}, which element should be regarded +#' as the archetype and metadata in the columns or rows kept, while merging? +#' This can be single integer value or an integer vector of the same length +#' as \code{levels(f)}. (Default: \code{archetype = 1L}, which means the first +#' element encountered per factor level will be kept) +#' +#' @param mergeTree \code{TRUE} or \code{FALSE}: Should +#' \code{rowTree()} also be merged? (Default: \code{mergeTree = FALSE}) +#' +#' @param mergeRefSeq \code{TRUE} or \code{FALSE}: Should a consensus sequence +#' be calculated? If set to \code{FALSE}, the result from \code{archetype} is +#' returned; If set to \code{TRUE} the result from +#' \code{\link[DECIPHER:ConsensusSequence]{DECIPHER::ConsensusSequence}} is +#' returned. (Default: \code{mergeRefSeq = FALSE}) +#' #' @details #' Depending on the available taxonomic data and its structure, setting #' \code{onRankOnly = TRUE} has certain implications on the interpretability of @@ -67,21 +104,33 @@ #' the same lower rank), the results should be comparable. You can check for #' loops using \code{\link[TreeSummarizedExperiment:detectLoop]{detectLoop}}. #' -#' Agglomeration sums up the values of assays at the specified taxonomic level. With -#' certain assays, e.g. those that include binary or negative values, this summing -#' can produce meaningless values. In those cases, consider performing agglomeration -#' first, and then applying the transformation afterwards. +#' Agglomeration sums up the values of assays at the specified taxonomic level. +#' With certain assays, e.g. those that include binary or negative values, this +#' summing can produce meaningless values. In those cases, consider performing +#' agglomeration first, and then applying the transformation afterwards. +#' +#' \code{agglomerateByVariable} works similarly to +#' \code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}. +#' However, additional support for \code{TreeSummarizedExperiment} was added and +#' science field agnostic names were used. In addition the \code{archetype} +#' argument lets the user select how to preserve row or column data. +#' +#' For merge data of assays the function from \code{scuttle} are used. #' -#' @return -#' \code{agglomerateByRank} returns a taxonomically-agglomerated, -#' optionally-pruned object of the same class as \code{x}. +#' @return \code{agglomerateByRank} returns a taxonomically-agglomerated, +#' optionally-pruned object of the same class as \code{x} while +#' \code{agglomerateByVariable} returns an object of the same class as \code{x} +#' with the specified entries merged into one entry in all relevant components. #' #' @name agglomerate-methods +#' #' @seealso -#' \code{\link[=merge-methods]{mergeRows}}, #' \code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}} #' #' @examples +#' +#' ### Agglomerate data based on taxonomic information +#' #' data(GlobalPatterns) #' # print the available taxonomic ranks #' colnames(rowData(GlobalPatterns)) @@ -100,12 +149,12 @@ #' rowTree(x1) # ... different #' rowTree(x2) # ... tree #' -#' # If assay contains binary or negative values, summing might lead to meaningless -#' # values, and you will get a warning. In these cases, you might want to do -#' # agglomeration again at chosen taxonomic level. -#' tse <- transformAssay(GlobalPatterns, method = "pa") -#' tse <- agglomerateByRank(tse, rank = "Genus") -#' tse <- transformAssay(tse, method = "pa") +#' # If assay contains binary or negative values, summing might lead to +#' # meaningless values, and you will get a warning. In these cases, you might +#' # want to do agglomeration again at chosen taxonomic level. +#' tse <- transformAssay(GlobalPatterns, method = "pa") +#' tse <- agglomerateByRank(tse, rank = "Genus") +#' tse <- transformAssay(tse, method = "pa") #' #' # removing empty labels by setting na.rm = TRUE #' sum(is.na(rowData(GlobalPatterns)$Family)) @@ -121,7 +170,8 @@ #' print(rownames(x3[1:3,])) #' #' # use 'remove_empty_ranks' to remove columns that include only NAs -#' x4 <- agglomerateByRank(GlobalPatterns, rank="Phylum", remove_empty_ranks = TRUE) +#' x4 <- agglomerateByRank(GlobalPatterns, rank="Phylum", +#' remove_empty_ranks = TRUE) #' head(rowData(x4)) #' #' # If the assay contains NAs, you might want to consider replacing them, @@ -141,10 +191,27 @@ #' ## Print the available taxonomic ranks. Shows only 1 available rank, #' ## not useful for agglomerateByRank #' taxonomyRanks(enterotype) +#' +#' ### Merge TreeSummarizedExperiments on rows and columns +#' +#' data(esophagus) +#' esophagus +#' plot(rowTree(esophagus)) +#' # get a factor for merging +#' f <- factor(regmatches(rownames(esophagus), +#' regexpr("^[0-9]*_[0-9]*",rownames(esophagus)))) +#' merged <- agglomerateByVariable(esophagus, MARGIN = "rows", f, +#' mergeTree = TRUE) +#' plot(rowTree(merged)) +#' # +#' data(GlobalPatterns) +#' GlobalPatterns +#' merged <- agglomerateByVariable(GlobalPatterns, MARGIN = "cols", +#' colData(GlobalPatterns)$SampleType) +#' merged NULL #' @rdname agglomerate-methods -#' @aliases mergeFeaturesByRank #' @export setGeneric("agglomerateByRank", signature = "x", @@ -152,15 +219,14 @@ setGeneric("agglomerateByRank", standardGeneric("agglomerateByRank")) #' @rdname agglomerate-methods -#' @aliases agglomerateByRank +#' @aliases agglomerateByVariable #' @export -setGeneric("mergeFeaturesByRank", - signature = "x", - function(x, ...) - standardGeneric("mergeFeaturesByRank")) +setGeneric("agglomerateByVariable", + signature = "x", + function(x, ...) + standardGeneric("agglomerateByVariable")) #' @rdname agglomerate-methods -#' @aliases mergeFeaturesByRank #' #' @importFrom SummarizedExperiment rowData rowData<- #' @@ -171,7 +237,7 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"), # input check if(nrow(x) == 0L){ stop("No data available in `x` ('x' has nrow(x) == 0L.)", - call. = FALSE) + call. = FALSE) } if(!.is_non_empty_string(rank)){ stop("'rank' must be an non empty single character value.", @@ -198,7 +264,7 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"), # tree will be pruned later, if agglomerate.tree = TRUE if( na.rm ){ x <- .remove_with_empty_taxonomic_info(x, tax_cols[col], - empty.fields) + empty.fields) } # If rank is the only rank that is available and this data is unique, # then the data is already 'aggregated' and no further operations @@ -212,7 +278,7 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"), tax_factors <- .get_tax_groups(x, col = col, onRankOnly = onRankOnly) # merge taxa - x <- mergeRows(x, f = tax_factors, ...) + x <- agglomerateByVariable(x, MARGIN = "rows", f = tax_factors, ...) # "Empty" the values to the right of the rank, using NA_character_. if( col < length(taxonomyRanks(x)) ){ @@ -225,7 +291,8 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"), } # adjust rownames rownames(x) <- getTaxonomyLabels(x, empty.fields, ..., - with_rank = FALSE, resolve_loops = FALSE) + with_rank = FALSE, + resolve_loops = FALSE) # Remove those columns from rowData that include only NAs x <- .remove_NA_cols_from_rowdata(x, ...) x <- .add_values_to_metadata(x, "agglomerated_by_rank", rank) @@ -234,19 +301,33 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"), ) #' @rdname agglomerate-methods -#' @aliases agglomerateByRank -#' -#' @importFrom SummarizedExperiment rowData rowData<- -#' +#' @aliases agglomerateByVariable +#' @export +setMethod("agglomerateByVariable", signature = c(x = "SummarizedExperiment"), + function(x, MARGIN, f, archetype = 1L, ...){ + MARGIN <- .check_MARGIN(MARGIN) + FUN <- switch(MARGIN, .merge_rows_SE, .merge_cols_SE) + FUN(x, f, archetype = archetype, ...) + } +) + +#' @rdname agglomerate-methods +#' @aliases agglomerateByVariable #' @export -setMethod("mergeFeaturesByRank", signature = c(x = "SummarizedExperiment"), - function(x, rank = taxonomyRanks(x)[1], onRankOnly = FALSE, na.rm = FALSE, - empty.fields = c(NA, "", " ", "\t", "-", "_"), ...){ - .Deprecated(old="agglomerateByRank", new="mergeFeaturesByRank", "Now agglomerateByRank is deprecated. Use mergeFeaturesByRank instead.") - x <- agglomerateByRank(x, rank = rank, onRankOnly = onRankOnly, na.rm = na.rm, - empty.fields = empty.fields, ...) - x - } +setMethod("agglomerateByVariable", + signature = c(x = "TreeSummarizedExperiment"), + function(x, MARGIN, f, archetype = 1L, mergeTree = FALSE, + mergeRefSeq = FALSE, ...){ + MARGIN <- .check_MARGIN(MARGIN) + if ( MARGIN == 1L ){ + .merge_rows_TSE(x, f, archetype = 1L, mergeTree = mergeTree, + mergeRefSeq = mergeRefSeq, ...) + } + else{ + .merge_cols_TSE(x, f, archetype = 1L, mergeTree = mergeTree, + ...) + } + } ) #' @rdname agglomerate-methods @@ -269,63 +350,42 @@ setMethod("agglomerateByRank", signature = c(x = "SingleCellExperiment"), } ) -#' @rdname agglomerate-methods -#' @aliases agglomerateByRank -#' @importFrom SingleCellExperiment altExp altExp<- altExps<- -#' @export -setMethod("mergeFeaturesByRank", signature = c(x = "SingleCellExperiment"), - function(x, ..., altexp = NULL, strip_altexp = TRUE){ - .Deprecated(old="agglomerateByRank", new="mergeFeaturesByRank", "Now agglomerateByRank is deprecated. Use mergeFeaturesByRank instead.") - x <- agglomerateByRank(x, ..., altexp = altexp, strip_altexp = strip_altexp) - x - } -) - - #' @rdname agglomerate-methods #' @export setMethod( "agglomerateByRank", signature = c(x = "TreeSummarizedExperiment"), function( x, ..., agglomerate.tree = agglomerateTree, agglomerateTree = FALSE){ - # input check - if(!.is_a_bool(agglomerate.tree)){ - stop("'agglomerate.tree' must be TRUE or FALSE.", call. = FALSE) - } - # If there are multipe rowTrees, it might be that multiple - # trees are preserved after agglomeration even though the dataset - # could be presented with one tree. --> order the data so that - # the taxa are searched from one tree first. - if( length(x@rowTree) > 1 ){ - x <- .order_based_on_trees(x) - } - # Agglomerate data - x <- callNextMethod(x, ...) - # Agglomerate also tree, if the data includes only one - # rowTree --> otherwise it is not possible to agglomerate - # since all rownames are not found from individual tree. - if(agglomerate.tree){ - x <- .agglomerate_trees(x) - } - x - } + # input check + if(!.is_a_bool(agglomerate.tree)){ + stop("'agglomerate.tree' must be TRUE or FALSE.", + call. = FALSE) + } + # If there are multipe rowTrees, it might be that multiple + # trees are preserved after agglomeration even though the + # dataset could be presented with one tree. + # --> order the data so that the taxa are searched from one tree + # first. + if( length(x@rowTree) > 1 ){ + x <- .order_based_on_trees(x) + } + # Agglomerate data + x <- callNextMethod(x, ...) + # Agglomerate also tree, if the data includes only one + # rowTree --> otherwise it is not possible to agglomerate + # since all rownames are not found from individual tree. + if(agglomerate.tree){ + x <- .agglomerate_trees(x) + } + x + } ) -#' @rdname agglomerate-methods -#' @aliases agglomerateByRank -#' @export -setMethod("mergeFeaturesByRank", signature = c(x = "TreeSummarizedExperiment"), - function(x, ..., agglomerate.tree = FALSE){ - .Deprecated(old="agglomerateByRank", new="mergeFeaturesByRank", "Now agglomerateByRank is deprecated. Use mergeFeaturesByRank instead.") - x <- agglomerateByRank(x, ..., agglomerate.tree = agglomerate.tree) - x - } -) ################################ HELP FUNCTIONS ################################ .remove_with_empty_taxonomic_info <- function(x, column, empty.fields = c(NA,""," ","\t","-","_")) - { + { tax <- as.character(rowData(x)[,column]) f <- !(tax %in% empty.fields) if(any(!f)){ @@ -340,7 +400,7 @@ setMethod("mergeFeaturesByRank", signature = c(x = "TreeSummarizedExperiment"), # Check remove_empty_ranks if( !.is_a_bool(remove_empty_ranks) ){ stop("'remove_empty_ranks' must be a boolean value.", - call. = FALSE) + call. = FALSE) } # If user wants to remove those columns if( remove_empty_ranks ){ @@ -367,7 +427,7 @@ setMethod("mergeFeaturesByRank", signature = c(x = "TreeSummarizedExperiment"), # Calculate, how many rows each tree has, and add it to data freq <- as.data.frame(table(links$whichTree)) links <- merge(links, freq, all.x = TRUE, all.y = FALSE, - by.x = "whichTree", by.y = "Var1") + by.x = "whichTree", by.y = "Var1") # Factorize the names of trees links$whichTree <- factor(links$whichTree, levels = uniq_trees) # Order the data back to its original order based on row indices diff --git a/R/deprecate.R b/R/deprecate.R index cf42cd9e0..af3545ce4 100644 --- a/R/deprecate.R +++ b/R/deprecate.R @@ -60,6 +60,169 @@ setMethod("taxonomyTree", signature = c(x = "SummarizedExperiment"), } ) +#' @rdname deprecate +#' @export +setGeneric("mergeRows", + signature = "x", + function(x, ...) + standardGeneric("mergeRows")) + +#' @rdname deprecate +#' @export +setMethod("mergeRows", signature = c(x = "SummarizedExperiment"), + function(x, ...){ + .Deprecated(msg = paste0("'mergeRows' is deprecated. ", + "Use 'agglomerateByVariable' with ", + "parameter MARGIN = 'rows' instead.")) + agglomerateByVariable(x, MARGIN = "rows", ...) + } +) + +#' @rdname deprecate +#' @export +setMethod("mergeRows", signature = c(x = "TreeSummarizedExperiment"), + function(x, ...){ + .Deprecated(msg = paste0("'mergeRows' is deprecated. ", + "Use 'agglomerateByVariable' with ", + "parameter MARGIN = 'rows' instead.")) + agglomerateByVariable(x, MARGIN = "rows", ...) + } +) + +#' @rdname deprecate +#' @export +setGeneric("mergeCols", + signature = "x", + function(x, ...) + standardGeneric("mergeCols")) + +#' @rdname deprecate +#' @export +setMethod("mergeCols", signature = c(x = "SummarizedExperiment"), + function(x, ...){ + .Deprecated(msg = paste0("'mergeCols' is deprecated. ", + "Use 'agglomerateByVariable' with ", + "parameter MARGIN = 'cols' instead.")) + agglomerateByVariable(x, MARGIN = "cols", ...) + } +) + +#' @rdname deprecate +#' @export +setMethod("mergeCols", signature = c(x = "TreeSummarizedExperiment"), + function(x, ...){ + .Deprecated(msg = paste0("'mergeCols' is deprecated. ", + "Use 'agglomerateByVariable' with ", + "parameter MARGIN = 'cols' instead.")) + agglomerateByVariable(x, MARGIN = "cols", ...) + } +) + +#' @rdname deprecate +#' @export +setGeneric("mergeFeatures", + signature = "x", + function(x, ...) + standardGeneric("mergeFeatures")) + +#' @rdname deprecate +#' @export +setMethod("mergeFeatures", signature = c(x = "SummarizedExperiment"), + function(x, ...){ + .Deprecated(msg = paste0("'mergeFeatures' is deprecated. ", + "Use 'agglomerateByVariable' with ", + "parameter MARGIN = 'rows' instead.")) + agglomerateByVariable(x, MARGIN = "rows", ...) + } +) + +#' @rdname deprecate +#' @export +setMethod("mergeFeatures", signature = c(x = "TreeSummarizedExperiment"), + function(x, ...){ + .Deprecated(msg = paste0("'mergeFeatures' is deprecated. ", + "Use 'agglomerateByVariable' with ", + "parameter MARGIN = 'rows' instead.")) + agglomerateByVariable(x, MARGIN = "rows", ...) + } +) + +#' @rdname deprecate +#' @export +setGeneric("mergeSamples", + signature = "x", + function(x, ...) + standardGeneric("mergeSamples")) + +#' @rdname deprecate +#' @export +setMethod("mergeSamples", signature = c(x = "SummarizedExperiment"), + function(x, ...){ + .Deprecated(msg = paste0("'mergeSamples' is deprecated. ", + "Use 'agglomerateByVariable' with ", + "parameter MARGIN = 'cols' instead.")) + agglomerateByVariable(x, MARGIN = "cols", ...) + } +) + +#' @rdname deprecate +#' @export +setMethod("mergeSamples", signature = c(x = "TreeSummarizedExperiment"), + function(x, ...){ + .Deprecated(msg = paste0("'mergeSamples' is deprecated. ", + "Use 'agglomerateByVariable' with ", + "parameter MARGIN = 'cols' instead.")) + agglomerateByVariable(x, MARGIN = "cols", ...) + } +) + +#' @rdname deprecate +#' @export +setGeneric("mergeFeaturesByRank", + signature = "x", + function(x, ...) + standardGeneric("mergeFeaturesByRank")) + +#' @rdname deprecate +#' @export +setMethod("mergeFeaturesByRank", signature = c(x = "SummarizedExperiment"), + function(x, ...){ + .Deprecated(msg = paste0("'mergeFeaturesByRank' is deprecated. ", + "Use 'agglomerateByRank' instead.")) + x <- agglomerateByRank(x, ...) + x + } +) + +#' @rdname deprecate +#' @export +setMethod("mergeFeaturesByRank", signature = c(x = "SingleCellExperiment"), + function(x, ...){ + .Deprecated(msg = paste0("'mergeFeaturesByRank' is deprecated. ", + "Use 'agglomerateByRank' instead.")) + x <- agglomerateByRank(x, ...) + x + } +) + +#' @rdname deprecate +#' @export +setGeneric("mergeFeaturesByPrevalence", signature = "x", + function(x, ...) + standardGeneric("mergeFeaturesByPrevalence")) + +#' @rdname deprecate +#' @export +setMethod("mergeFeaturesByPrevalence", signature = c(x = "SummarizedExperiment"), + function(x, ...){ + .Deprecated(msg = paste0( + "'mergeFeaturesByPrevalence' is deprecated. ", + "Use agglomerateByPrevalence instead.")) + x <- agglomerateByPrevalence(x, ...) + x + } +) + #' @rdname deprecate #' @export setGeneric("getExperimentCrossAssociation", signature = c("x"), @@ -89,6 +252,17 @@ setMethod("getExperimentCrossAssociation", signature = "SummarizedExperiment", } ) +#' @rdname deprecate +#' @export +setMethod("mergeFeaturesByRank", signature = c(x = "TreeSummarizedExperiment"), + function(x, ...){ + .Deprecated(msg = paste0("'mergeFeaturesByRank' is deprecated. ", + "Use 'agglomerateByRank' instead.")) + x <- agglomerateByRank(x, ...) + x + } +) + #' @rdname deprecate #' @export setGeneric("testExperimentCrossAssociation", signature = c("x"), @@ -140,24 +314,6 @@ setMethod("getExperimentCrossCorrelation", signature = c(x = "ANY"), } ) -#' @rdname deprecate -#' @export -setGeneric("mergeFeaturesByPrevalence", signature = "x", - function(x, ...) - standardGeneric("mergeFeaturesByPrevalence")) - -#' @rdname deprecate -#' @export -setMethod("mergeFeaturesByPrevalence", signature = c(x = "SummarizedExperiment"), - function(x, ...){ - .Deprecated(msg = paste0( - "'mergeFeaturesByPrevalence' is deprecated. ", - "Use agglomerateByPrevalence instead.")) - x <- agglomerateByPrevalence(x, ...) - x - } -) - #' @rdname deprecate #' @export loadFromBiom <- function(...) { diff --git a/R/getPrevalence.R b/R/getPrevalence.R index 8ad3e16cb..239afdd73 100644 --- a/R/getPrevalence.R +++ b/R/getPrevalence.R @@ -643,7 +643,9 @@ setMethod("agglomerateByPrevalence", signature = c(x = "SummarizedExperiment"), pr <- getPrevalentTaxa(x, rank = NULL, ...) f <- rownames(x) %in% pr if(any(!f)){ - other_x <- mergeRows(x[!f,], factor(rep(1L,sum(!f))), check_assays = FALSE) + other_x <- agglomerateByVariable(x[!f,], MARGIN = "rows", + factor(rep(1L,sum(!f))), + check_assays = FALSE) rowData(other_x)[,colnames(rowData(other_x))] <- NA # set the other label rownames(other_x) <- other_label diff --git a/R/merge.R b/R/merge.R index c9713e722..b0857f3bc 100644 --- a/R/merge.R +++ b/R/merge.R @@ -1,119 +1,13 @@ -#' Merge a subset of the rows or columns of a \code{SummarizedExperiment} -#' -#' \code{mergeRows}/\code{mergeCols} merge data on rows or columns of a -#' \code{SummarizedExperiment} as defined by a \code{factor} alongside the -#' chosen dimension. Metadata from the \code{rowData} or \code{colData} are -#' retained as defined by \code{archetype}. -#' -#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{assay}} are -#' agglomerated, i.e. summed up. If the assay contains values other than counts -#' or absolute values, this can lead to meaningless values being produced. -#' -#' @param x a \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} or -#' a \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} -#' -#' @param f A factor for merging. Must be the same length as -#' \code{nrow(x)/ncol(x)}. Rows/Cols corresponding to the same level will be -#' merged. If \code{length(levels(f)) == nrow(x)/ncol(x)}, \code{x} will be -#' returned unchanged. -#' -#' @param archetype Of each level of \code{f}, which element should be regarded -#' as the archetype and metadata in the columns or rows kept, while merging? -#' This can be single integer value or an integer vector of the same length -#' as \code{levels(f)}. (Default: \code{archetype = 1L}, which means the first -#' element encountered per factor level will be kept) -#' -#' @param mergeTree \code{TRUE} or \code{FALSE}: Should -#' \code{rowTree()} also be merged? (Default: \code{mergeTree = FALSE}) -#' -#' @param mergeRefSeq \code{TRUE} or \code{FALSE}: Should a consensus sequence -#' be calculated? If set to \code{FALSE}, the result from \code{archetype} is -#' returned; If set to \code{TRUE} the result from -#' \code{\link[DECIPHER:ConsensusSequence]{DECIPHER::ConsensusSequence}} is -#' returned. (Default: \code{mergeRefSeq = FALSE}) -#' -#' @param ... Optional arguments: -#' \itemize{ -#' \item{Passed on to \code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}, -#' with the exception of \code{subset_row}, \code{subset_col}} -#' } -#' -#' @details -#' These functions are similar to -#' \code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}. -#' However, additional support for \code{TreeSummarizedExperiment} was added and -#' science field agnostic names were used. In addition the \code{archetype} -#' argument lets the user select how to preserve row or column data. -#' -#' For merge data of assays the function from \code{scuttle} are used. -#' -#' @name merge-methods -#' @aliases mergeRows mergeCols -#' -#' @return An object of the same class as \code{x} with the specified entries -#' merged into one entry in all relevant components. -#' -#' @seealso -#' \code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}} -#' -#' @examples -#' data(esophagus) -#' esophagus -#' plot(rowTree(esophagus)) -#' # get a factor for merging -#' f <- factor(regmatches(rownames(esophagus), -#' regexpr("^[0-9]*_[0-9]*",rownames(esophagus)))) -#' merged <- mergeRows(esophagus,f, mergeTree = TRUE) -#' plot(rowTree(merged)) -#' # -#' data(GlobalPatterns) -#' GlobalPatterns -#' merged <- mergeCols(GlobalPatterns,colData(GlobalPatterns)$SampleType) -#' merged -NULL - -#' @rdname merge-methods -#' @aliases mergeFeatures -#' @export -setGeneric("mergeRows", - signature = "x", - function(x, f, archetype = 1L, ...) - standardGeneric("mergeRows")) - -#' @rdname merge-methods -#' @aliases mergeSamples -#' @export -setGeneric("mergeCols", - signature = "x", - function(x, f, archetype = 1L, ...) - standardGeneric("mergeCols")) - -#' @rdname merge-methods -#' @aliases mergeRows -#' @export -setGeneric("mergeFeatures", - signature = "x", - function(x, f, archetype = 1L, ...) - standardGeneric("mergeFeatures")) - -#' @rdname merge-methods -#' @aliases mergeCols -#' @export -setGeneric("mergeSamples", - signature = "x", - function(x, f, archetype = 1L, ...) - standardGeneric("mergeSamples")) - .norm_f <- function(i, f, dim.type = c("rows","columns")){ dim.type <- match.arg(dim.type) if(!is.character(f) && !is.factor(f)){ stop("'f' must be a factor or character vector coercible to a ", - "meaningful factor.", - call. = FALSE) + "meaningful factor.", + call. = FALSE) } if(i != length(f)){ stop("'f' must have the same number of ",dim.type," as 'x'", - call. = FALSE) + call. = FALSE) } if(is.character(f)){ f <- factor(f) @@ -125,21 +19,21 @@ setGeneric("mergeSamples", if(length(archetype) > 1L){ if(length(levels(f)) != length(archetype)){ stop("length of 'archetype' must have the same length as ", - "levels('f')", - call. = FALSE) + "levels('f')", + call. = FALSE) } } f_table <- table(f) if(!is.null(names(archetype))){ if(anyNA(names(archetype)) || anyDuplicated(names(archetype))){ stop("If 'archetype' is named, names must be non-NA and unqiue.", - call. = FALSE) + call. = FALSE) } archetype <- archetype[names(f_table)] } if(any(f_table < archetype)){ stop("'archetype' out of bounds for some levels of 'f'. The maximum of", - " 'archetype' is defined as table('f')", call. = FALSE) + " 'archetype' is defined as table('f')", call. = FALSE) } if(length(archetype) == 1L){ archetype <- rep(archetype,length(levels(f))) @@ -191,12 +85,12 @@ setGeneric("mergeSamples", mapply(.check_assays_for_merge, names(assays), assays) } assays <- S4Vectors::SimpleList(lapply(assays, - scuttle::sumCountsAcrossFeatures, - ids = f, - subset.row = NULL, - subset.col = NULL, - average = average, - BPPARAM = BPPARAM)) + scuttle::sumCountsAcrossFeatures, + ids = f, + subset.row = NULL, + subset.col = NULL, + average = average, + BPPARAM = BPPARAM)) names(assays) <- names(assays(x)) # merge to result x <- x[.get_element_pos(f, archetype = archetype),] @@ -212,15 +106,15 @@ setGeneric("mergeSamples", if( all(assay == 0 | assay == 1) ){ warning("'",assay.type,"'", " includes binary values.", "\nAgglomeration of it might lead to meaningless values.", - "\nCheck the assay, and consider doing transformation again manually", - " with agglomerated data.", + "\nCheck the assay, and consider doing transformation again", + "manually with agglomerated data.", call. = FALSE) } if( !all( assay >= 0 | is.na(assay) ) ){ warning("'",assay.type,"'", " includes negative values.", "\nAgglomeration of it might lead to meaningless values.", - "\nCheck the assay, and consider doing transformation again manually", - " with agglomerated data.", + "\nCheck the assay, and consider doing transformation again", + "manually with agglomerated data.", call. = FALSE) } } @@ -250,18 +144,18 @@ setGeneric("mergeSamples", mapply(.check_assays_for_merge, names(assays), assays) FUN <- function(mat, ...){ temp <- scuttle::summarizeAssayByGroup(mat, - statistics = "sum", - ...) + statistics = "sum", + ...) # "sum" includes agglomerated (summed up) data mat <- assay(temp, "sum") return(mat) } assays <- S4Vectors::SimpleList(lapply(assays, - FUN = FUN, - ids = f, - subset.row = NULL, - subset.col = NULL, - ...)) + FUN = FUN, + ids = f, + subset.row = NULL, + subset.col = NULL, + ...)) names(assays) <- names(assays(x)) # merge to result x <- x[,.get_element_pos(f, archetype = archetype)] @@ -271,43 +165,13 @@ setGeneric("mergeSamples", x } -#' @rdname merge-methods -#' @aliases mergeFeatures -#' @export -setMethod("mergeRows", signature = c(x = "SummarizedExperiment"), - function(x, f, archetype = 1L, ...){ - .merge_rows(x, f, archetype = archetype, ...) - } -) - -#' @rdname merge-methods -#' @aliases mergeSmaples -#' @export -setMethod("mergeCols", signature = c(x = "SummarizedExperiment"), - function(x, f, archetype = 1L, ...){ - .merge_cols(x, f, archetype = archetype, ...) - } -) - -#' @rdname merge-methods -#' @aliases mergeRows -#' @export -setMethod("mergeFeatures", signature = c(x = "SummarizedExperiment"), - function(x, f, archetype = 1L, ...){ - .Deprecated(old="mergeRows", new="mergeFeatures", "Now mergeRows is deprecated. Use mergeFeatures instead.") - .merge_rows(x, f, archetype = archetype, ...) - } -) +.merge_rows_SE <- function(x, f, archetype = 1L, ...){ + .merge_rows(x, f, archetype = archetype, ...) +} -#' @rdname merge-methods -#' @aliases mergeCols -#' @export -setMethod("mergeSamples", signature = c(x = "SummarizedExperiment"), - function(x, f, archetype = 1L, ...){ - .Deprecated(old="mergeCols", new="mergeSamples", "Now mergeCols is deprecated. Use mergeSamples instead.") - .merge_cols(x, f, archetype = archetype, ...) - } -) +.merge_cols_SE <- function(x, f, archetype = 1L, ...){ + .merge_cols(x, f, archetype = archetype, ...) +} .merge_tree <- function(tree, links){ tips <- sort(setdiff(tree$edge[, 2], tree$edge[, 1])) @@ -386,76 +250,45 @@ setMethod("mergeSamples", signature = c(x = "SummarizedExperiment"), .merge_refseq <- function(sequences, f, names, threshold){ sequences <- split(sequences,f) seq <- unlist(DNAStringSetList(lapply(sequences, ConsensusSequence, - threshold = threshold))) + threshold = threshold))) seq } -#' @rdname merge-methods -#' @importFrom ape keep.tip -#' @export -setMethod("mergeRows", signature = c(x = "TreeSummarizedExperiment"), - function(x, f, archetype = 1L, mergeTree = FALSE, mergeRefSeq = FALSE, ...){ - # input check - if(!.is_a_bool(mergeTree)){ - stop("'mergeTree' must be TRUE or FALSE.", call. = FALSE) - } - if(!.is_a_bool(mergeRefSeq)){ - stop("'mergeRefSeq' must be TRUE or FALSE.", call. = FALSE) - } - # for optionally merging referenceSeq - refSeq <- NULL - if(mergeRefSeq){ - refSeq <- referenceSeq(x) - } - # - x <- callNextMethod(x, f, archetype = 1L, ...) - # optionally merge rowTree - x <- .merge_trees(x, mergeTree, 1) - # optionally merge referenceSeq - if(!is.null(refSeq)){ - referenceSeq(x) <- .merge_refseq_list(refSeq, f, rownames(x), ...) - } - x - } -) +.merge_rows_TSE <- function(x, f, archetype = 1L, mergeTree = FALSE, + mergeRefSeq = FALSE, ...){ + # input check + if(!.is_a_bool(mergeTree)){ + stop("'mergeTree' must be TRUE or FALSE.", call. = FALSE) + } + if(!.is_a_bool(mergeRefSeq)){ + stop("'mergeRefSeq' must be TRUE or FALSE.", call. = FALSE) + } + # for optionally merging referenceSeq + refSeq <- NULL + if(mergeRefSeq){ + refSeq <- referenceSeq(x) + } + # + x <- .merge_rows_SE(x, f, archetype = 1L, ...) + # optionally merge rowTree + x <- .merge_trees(x, mergeTree, 1) + # optionally merge referenceSeq + if(!is.null(refSeq)){ + referenceSeq(x) <- .merge_refseq_list(refSeq, f, rownames(x), ...) + } + x +} -#' @rdname merge-methods -#' @importFrom ape keep.tip -#' @export -setMethod("mergeCols", signature = c(x = "TreeSummarizedExperiment"), - function(x, f, archetype = 1L, mergeTree = FALSE, ...){ - # input check - if(!.is_a_bool(mergeTree)){ - stop("'mergeTree' must be TRUE or FALSE.", call. = FALSE) - } - # - x <- callNextMethod(x, f, archetype = 1L, ...) - # optionally merge colTree - x <- .merge_trees(x, mergeTree, 2) - return(x) - } -) -#' @rdname merge-methods -#' @importFrom ape keep.tip -#' @aliases mergeRows -#' @export -setMethod("mergeFeatures", signature = c(x = "TreeSummarizedExperiment"), - function(x, f, archetype = 1L, mergeTree = FALSE, mergeRefSeq = FALSE, ...){ - .Deprecated(old="mergeRows", new="mergeFeatures", "Now mergeRows is deprecated. Use mergeFeatures instead.") - x <- mergeRows(x = x, f = f, archetype = 1L, mergeTree = mergeTree, mergeRefSeq = mergeRefSeq, ...) - return(x) - } -) -#' @rdname merge-methods -#' @importFrom ape keep.tip -#' @aliases mergeCols -#' @export -setMethod("mergeSamples", signature = c(x = "TreeSummarizedExperiment"), - function(x, f, archetype = 1L, mergeTree = FALSE, ...){ - .Deprecated(old="mergeCols", new="mergeSamples", "Now mergeCols is deprecated. Use mergeSamples instead.") - x <- mergeCols(x, f, archetype = 1L, mergeTree =mergeTree, ...) - return(x) - } -) +.merge_cols_TSE <- function(x, f, archetype = 1L, mergeTree = FALSE, ...){ + # input check + if(!.is_a_bool(mergeTree)){ + stop("'mergeTree' must be TRUE or FALSE.", call. = FALSE) + } + # + x <- .merge_cols_SE(x, f, archetype = 1L, ...) + # optionally merge colTree + x <- .merge_trees(x, mergeTree, 2) + return(x) +} diff --git a/R/splitByRanks.R b/R/splitByRanks.R index 908479674..483750e05 100644 --- a/R/splitByRanks.R +++ b/R/splitByRanks.R @@ -58,7 +58,7 @@ #' @seealso #' \code{\link[=splitOn]{splitOn}} #' \code{\link[=unsplitOn]{unsplitOn}} -#' \code{\link[=merge-methods]{mergeRows}}, +#' \code{\link[=agglomerate-methods]{agglomerateByVariable}}, #' \code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}, #' \code{\link[=agglomerate-methods]{agglomerateByRank}}, #' \code{\link[SingleCellExperiment:altExps]{altExps}}, diff --git a/R/splitOn.R b/R/splitOn.R index a81d9d9f6..d4f704bef 100644 --- a/R/splitOn.R +++ b/R/splitOn.R @@ -26,9 +26,10 @@ #' @param altExpNames a \code{character} vector specifying the alternative experiments #' to be unsplit. (By default: \code{altExpNames = names(altExps(x))}) #' -#' @param ... Arguments passed to \code{mergeRows}/\code{mergeCols} function for +#' @param ... Arguments passed to \code{agglomerateByVariable} function for #' \code{SummarizedExperiment} objects and other functions. -#' See \code{\link[=agglomerate-methods]{mergeRows}} for more details. +#' See \code{\link[=agglomerate-methods]{agglomerateByVariable}} for more +#' details. #' \itemize{ #' \item{\code{use_names} A single boolean value to select whether to name elements of #' list by their group names.} @@ -53,7 +54,7 @@ #' @seealso #' \code{\link[=splitByRanks]{splitByRanks}} #' \code{\link[=unsplitByRanks]{unsplitByRanks}} -#' \code{\link[=merge-methods]{mergeRows}}, +#' \code{\link[=agglomerate-methods]{agglomerateByVariable}}, #' \code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}, #' \code{\link[=agglomerate-methods]{agglomerateByRank}}, #' \code{\link[SingleCellExperiment:altExps]{altExps}}, diff --git a/R/utils.R b/R/utils.R index 61a13949c..df95cd31e 100644 --- a/R/utils.R +++ b/R/utils.R @@ -427,15 +427,15 @@ } ################################################################################ -# internal wrappers for agglomerateByRank/mergeRows +# internal wrappers for agglomerateByRank/agglomerateByVariable .merge_features <- function(x, merge.by, ...) { # Check if merge.by parameter belongs to taxonomyRanks if (is.character(merge.by) && length(merge.by) == 1 && merge.by %in% taxonomyRanks(x)) { #Merge using agglomerateByRank x <- agglomerateByRank(x, rank = merge.by, ...) } else { - # Merge using mergeRows - x <- mergeRows(x, f = merge.by, ...) + # Merge using agglomerateByVariable + x <- agglomerateByVariable(x, MARGIN = "rows", f = merge.by, ...) } return(x) } diff --git a/man/agglomerate-methods.Rd b/man/agglomerate-methods.Rd index 42dfbe2c6..04909b067 100644 --- a/man/agglomerate-methods.Rd +++ b/man/agglomerate-methods.Rd @@ -3,20 +3,19 @@ \name{agglomerate-methods} \alias{agglomerate-methods} \alias{agglomerateByRank} -\alias{mergeFeaturesByRank} +\alias{agglomerateByVariable} \alias{agglomerateByRank,SummarizedExperiment-method} -\alias{mergeFeaturesByRank,SummarizedExperiment-method} +\alias{agglomerateByVariable,SummarizedExperiment-method} +\alias{agglomerateByVariable,TreeSummarizedExperiment-method} \alias{agglomerateByRank,SingleCellExperiment-method} -\alias{mergeFeaturesByRank,SingleCellExperiment-method} \alias{agglomerateByRank,TreeSummarizedExperiment-method} -\alias{mergeFeaturesByRank,TreeSummarizedExperiment-method} \alias{agglomerateByPrevalence} \alias{agglomerateByPrevalence,SummarizedExperiment-method} -\title{Agglomerate data using taxonomic information} +\title{Agglomerate or merge data using taxonomic information} \usage{ agglomerateByRank(x, ...) -mergeFeaturesByRank(x, ...) +agglomerateByVariable(x, ...) \S4method{agglomerateByRank}{SummarizedExperiment}( x, @@ -27,19 +26,20 @@ mergeFeaturesByRank(x, ...) ... ) -\S4method{mergeFeaturesByRank}{SummarizedExperiment}( +\S4method{agglomerateByVariable}{SummarizedExperiment}(x, MARGIN, f, archetype = 1L, ...) + +\S4method{agglomerateByVariable}{TreeSummarizedExperiment}( x, - rank = taxonomyRanks(x)[1], - onRankOnly = FALSE, - na.rm = FALSE, - empty.fields = c(NA, "", " ", "\\t", "-", "_"), + MARGIN, + f, + archetype = 1L, + mergeTree = FALSE, + mergeRefSeq = FALSE, ... ) \S4method{agglomerateByRank}{SingleCellExperiment}(x, ..., altexp = NULL, strip_altexp = TRUE) -\S4method{mergeFeaturesByRank}{SingleCellExperiment}(x, ..., altexp = NULL, strip_altexp = TRUE) - \S4method{agglomerateByRank}{TreeSummarizedExperiment}( x, ..., @@ -47,8 +47,6 @@ mergeFeaturesByRank(x, ...) agglomerateTree = FALSE ) -\S4method{mergeFeaturesByRank}{TreeSummarizedExperiment}(x, ..., agglomerate.tree = FALSE) - agglomerateByPrevalence(x, ...) \S4method{agglomerateByPrevalence}{SummarizedExperiment}( @@ -59,16 +57,15 @@ agglomerateByPrevalence(x, ...) ) } \arguments{ -\item{x}{a -\code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} -object} +\item{x}{a \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} or +a \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}} \item{...}{arguments passed to \code{agglomerateByRank} function for \code{SummarizedExperiment} objects, +to \code{\link[=agglomerate-methods]{agglomerateByVariable}} and +\code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}, to \code{getPrevalence} and \code{getPrevalentTaxa} and used in -\code{agglomeratebyPrevalence}, -to \code{\link[=merge-methods]{mergeRows}} and -\code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}. +\code{agglomeratebyPrevalence} \itemize{ \item{\code{remove_empty_ranks}}{A single boolean value for selecting whether to remove those columns of rowData that include only NAs after @@ -102,6 +99,30 @@ will be dropped. This setting can be tweaked by defining regarded as empty. (Default: \code{c(NA, "", " ", "\t")}). They will be removed if \code{na.rm = TRUE} before agglomeration.} +\item{MARGIN}{A character value for selecting if data is merged +row-wise / for features ('rows') or column-wise / for samples ('cols'). +Must be \code{'rows'} or \code{'cols'}.} + +\item{f}{A factor for merging. Must be the same length as +\code{nrow(x)/ncol(x)}. Rows/Cols corresponding to the same level will be +merged. If \code{length(levels(f)) == nrow(x)/ncol(x)}, \code{x} will be +returned unchanged.} + +\item{archetype}{Of each level of \code{f}, which element should be regarded +as the archetype and metadata in the columns or rows kept, while merging? +This can be single integer value or an integer vector of the same length +as \code{levels(f)}. (Default: \code{archetype = 1L}, which means the first +element encountered per factor level will be kept)} + +\item{mergeTree}{\code{TRUE} or \code{FALSE}: Should +\code{rowTree()} also be merged? (Default: \code{mergeTree = FALSE})} + +\item{mergeRefSeq}{\code{TRUE} or \code{FALSE}: Should a consensus sequence +be calculated? If set to \code{FALSE}, the result from \code{archetype} is +returned; If set to \code{TRUE} the result from +\code{\link[DECIPHER:ConsensusSequence]{DECIPHER::ConsensusSequence}} is +returned. (Default: \code{mergeRefSeq = FALSE})} + \item{altexp}{String or integer scalar specifying an alternative experiment containing the input data.} @@ -121,7 +142,9 @@ summary of non-prevalent taxa. (default: \code{other_label = "Other"})} } \value{ \code{agglomerateByRank} returns a taxonomically-agglomerated, -optionally-pruned object of the same class as \code{x}. +optionally-pruned object of the same class as \code{x} while +\code{agglomerateByVariable} returns an object of the same class as \code{x} +with the specified entries merged into one entry in all relevant components. \code{agglomerateByPrevalence} returns a taxonomically-agglomerated object of the same class as x and based on prevalent taxonomic results. @@ -131,16 +154,38 @@ Agglomeration functions can be used to sum-up data based on specific criteria such as taxonomic ranks, variables or prevalence. } \details{ +\code{agglomerateByRank} can be used to sum up data based on associations +with certain taxonomic ranks, as defined in \code{rowData}. Only available +\code{\link{taxonomyRanks}} can be used. + +\code{agglomerateByVariable} merges data on rows or columns of a +\code{SummarizedExperiment} as defined by a \code{factor} alongside the +chosen dimension. This function allows agglomeration of data based on other +variables than taxonomy ranks. +Metadata from the \code{rowData} or \code{colData} are +retained as defined by \code{archetype}. +\code{\link[SummarizedExperiment:SummarizedExperiment-class]{assay}} are +agglomerated, i.e. summed up. If the assay contains values other than counts +or absolute values, this can lead to meaningless values being produced. + Depending on the available taxonomic data and its structure, setting \code{onRankOnly = TRUE} has certain implications on the interpretability of your results. If no loops exist (loops meaning two higher ranks containing the same lower rank), the results should be comparable. You can check for loops using \code{\link[TreeSummarizedExperiment:detectLoop]{detectLoop}}. -Agglomeration sums up the values of assays at the specified taxonomic level. With -certain assays, e.g. those that include binary or negative values, this summing -can produce meaningless values. In those cases, consider performing agglomeration -first, and then applying the transformation afterwards. +Agglomeration sums up the values of assays at the specified taxonomic level. +With certain assays, e.g. those that include binary or negative values, this +summing can produce meaningless values. In those cases, consider performing +agglomeration first, and then applying the transformation afterwards. + +\code{agglomerateByVariable} works similarly to +\code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}. +However, additional support for \code{TreeSummarizedExperiment} was added and +science field agnostic names were used. In addition the \code{archetype} +argument lets the user select how to preserve row or column data. + +For merge data of assays the function from \code{scuttle} are used. \code{agglomerateByPrevalence} sums up the values of assays at the taxonomic level specified by \code{rank} (by default the highest taxonomic level @@ -150,6 +195,9 @@ threshold) are agglomerated in an additional row taking the name indicated by \code{other_label} (by default "Other"). } \examples{ + +### Agglomerate data based on taxonomic information + data(GlobalPatterns) # print the available taxonomic ranks colnames(rowData(GlobalPatterns)) @@ -168,12 +216,12 @@ nrow(x2) # same number of rows, but rowTree(x1) # ... different rowTree(x2) # ... tree - # If assay contains binary or negative values, summing might lead to meaningless - # values, and you will get a warning. In these cases, you might want to do - # agglomeration again at chosen taxonomic level. - tse <- transformAssay(GlobalPatterns, method = "pa") - tse <- agglomerateByRank(tse, rank = "Genus") - tse <- transformAssay(tse, method = "pa") +# If assay contains binary or negative values, summing might lead to +# meaningless values, and you will get a warning. In these cases, you might +# want to do agglomeration again at chosen taxonomic level. +tse <- transformAssay(GlobalPatterns, method = "pa") +tse <- agglomerateByRank(tse, rank = "Genus") +tse <- transformAssay(tse, method = "pa") # removing empty labels by setting na.rm = TRUE sum(is.na(rowData(GlobalPatterns)$Family)) @@ -189,7 +237,8 @@ rownames(x3) <- getTaxonomyLabels(x3, with_rank = TRUE) print(rownames(x3[1:3,])) # use 'remove_empty_ranks' to remove columns that include only NAs -x4 <- agglomerateByRank(GlobalPatterns, rank="Phylum", remove_empty_ranks = TRUE) +x4 <- agglomerateByRank(GlobalPatterns, rank="Phylum", + remove_empty_ranks = TRUE) head(rowData(x4)) # If the assay contains NAs, you might want to consider replacing them, @@ -209,6 +258,24 @@ data(enterotype) ## Print the available taxonomic ranks. Shows only 1 available rank, ## not useful for agglomerateByRank taxonomyRanks(enterotype) + +### Merge TreeSummarizedExperiments on rows and columns + +data(esophagus) +esophagus +plot(rowTree(esophagus)) +# get a factor for merging +f <- factor(regmatches(rownames(esophagus), + regexpr("^[0-9]*_[0-9]*",rownames(esophagus)))) +merged <- agglomerateByVariable(esophagus, MARGIN = "rows", f, + mergeTree = TRUE) +plot(rowTree(merged)) +# +data(GlobalPatterns) +GlobalPatterns +merged <- agglomerateByVariable(GlobalPatterns, MARGIN = "cols", + colData(GlobalPatterns)$SampleType) +merged ## Data can be aggregated based on prevalent taxonomic results tse <- GlobalPatterns tse <- agglomerateByPrevalence(tse, @@ -229,6 +296,5 @@ assay(tse)[,1:5] } \seealso{ -\code{\link[=merge-methods]{mergeRows}}, \code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}} } diff --git a/man/deprecate.Rd b/man/deprecate.Rd index 01744dda0..d606033e5 100644 --- a/man/deprecate.Rd +++ b/man/deprecate.Rd @@ -8,17 +8,33 @@ \alias{addTaxonomyTree,SummarizedExperiment-method} \alias{taxonomyTree} \alias{taxonomyTree,SummarizedExperiment-method} +\alias{mergeRows} +\alias{mergeRows,SummarizedExperiment-method} +\alias{mergeRows,TreeSummarizedExperiment-method} +\alias{mergeCols} +\alias{mergeCols,SummarizedExperiment-method} +\alias{mergeCols,TreeSummarizedExperiment-method} +\alias{mergeFeatures} +\alias{mergeFeatures,SummarizedExperiment-method} +\alias{mergeFeatures,TreeSummarizedExperiment-method} +\alias{mergeSamples} +\alias{mergeSamples,SummarizedExperiment-method} +\alias{mergeSamples,TreeSummarizedExperiment-method} +\alias{mergeFeaturesByRank} +\alias{mergeFeaturesByRank,SummarizedExperiment-method} +\alias{mergeFeaturesByRank,SingleCellExperiment-method} \alias{getExperimentCrossAssociation} \alias{getExperimentCrossAssociation,MultiAssayExperiment-method} \alias{getExperimentCrossAssociation,SummarizedExperiment-method} +\alias{mergeFeaturesByRank,TreeSummarizedExperiment-method} +\alias{mergeFeaturesByPrevalence} +\alias{mergeFeaturesByPrevalence,SummarizedExperiment-method} \alias{testExperimentCrossAssociation} \alias{testExperimentCrossAssociation,ANY-method} \alias{testExperimentCrossCorrelation} \alias{testExperimentCrossCorrelation,ANY-method} \alias{getExperimentCrossCorrelation} \alias{getExperimentCrossCorrelation,ANY-method} -\alias{mergeFeaturesByPrevalence} -\alias{mergeFeaturesByPrevalence,SummarizedExperiment-method} \alias{loadFromBiom} \alias{loadFromQIIME2} \alias{readQZA} @@ -50,12 +66,48 @@ taxonomyTree(x, ...) \S4method{taxonomyTree}{SummarizedExperiment}(x, ...) +mergeRows(x, ...) + +\S4method{mergeRows}{SummarizedExperiment}(x, ...) + +\S4method{mergeRows}{TreeSummarizedExperiment}(x, ...) + +mergeCols(x, ...) + +\S4method{mergeCols}{SummarizedExperiment}(x, ...) + +\S4method{mergeCols}{TreeSummarizedExperiment}(x, ...) + +mergeFeatures(x, ...) + +\S4method{mergeFeatures}{SummarizedExperiment}(x, ...) + +\S4method{mergeFeatures}{TreeSummarizedExperiment}(x, ...) + +mergeSamples(x, ...) + +\S4method{mergeSamples}{SummarizedExperiment}(x, ...) + +\S4method{mergeSamples}{TreeSummarizedExperiment}(x, ...) + +mergeFeaturesByRank(x, ...) + +\S4method{mergeFeaturesByRank}{SummarizedExperiment}(x, ...) + +\S4method{mergeFeaturesByRank}{SingleCellExperiment}(x, ...) + getExperimentCrossAssociation(x, ...) \S4method{getExperimentCrossAssociation}{MultiAssayExperiment}(x, ...) \S4method{getExperimentCrossAssociation}{SummarizedExperiment}(x, ...) +\S4method{mergeFeaturesByRank}{TreeSummarizedExperiment}(x, ...) + +mergeFeaturesByPrevalence(x, ...) + +\S4method{mergeFeaturesByPrevalence}{SummarizedExperiment}(x, ...) + testExperimentCrossAssociation(x, ...) \S4method{testExperimentCrossAssociation}{ANY}(x, ...) @@ -68,10 +120,6 @@ getExperimentCrossCorrelation(x, ...) \S4method{getExperimentCrossCorrelation}{ANY}(x, ...) -mergeFeaturesByPrevalence(x, ...) - -\S4method{mergeFeaturesByPrevalence}{SummarizedExperiment}(x, ...) - loadFromBiom(...) loadFromQIIME2(...) diff --git a/man/merge-methods.Rd b/man/merge-methods.Rd deleted file mode 100644 index 2fe46498b..000000000 --- a/man/merge-methods.Rd +++ /dev/null @@ -1,121 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/merge.R -\name{merge-methods} -\alias{merge-methods} -\alias{mergeRows} -\alias{mergeCols} -\alias{mergeFeatures} -\alias{mergeSamples} -\alias{mergeRows,SummarizedExperiment-method} -\alias{mergeCols,SummarizedExperiment-method} -\alias{mergeSmaples} -\alias{mergeFeatures,SummarizedExperiment-method} -\alias{mergeSamples,SummarizedExperiment-method} -\alias{mergeRows,TreeSummarizedExperiment-method} -\alias{mergeCols,TreeSummarizedExperiment-method} -\alias{mergeFeatures,TreeSummarizedExperiment-method} -\alias{mergeSamples,TreeSummarizedExperiment-method} -\title{Merge a subset of the rows or columns of a \code{SummarizedExperiment}} -\usage{ -mergeRows(x, f, archetype = 1L, ...) - -mergeCols(x, f, archetype = 1L, ...) - -mergeFeatures(x, f, archetype = 1L, ...) - -mergeSamples(x, f, archetype = 1L, ...) - -\S4method{mergeRows}{SummarizedExperiment}(x, f, archetype = 1L, ...) - -\S4method{mergeCols}{SummarizedExperiment}(x, f, archetype = 1L, ...) - -\S4method{mergeFeatures}{SummarizedExperiment}(x, f, archetype = 1L, ...) - -\S4method{mergeSamples}{SummarizedExperiment}(x, f, archetype = 1L, ...) - -\S4method{mergeRows}{TreeSummarizedExperiment}(x, f, archetype = 1L, mergeTree = FALSE, mergeRefSeq = FALSE, ...) - -\S4method{mergeCols}{TreeSummarizedExperiment}(x, f, archetype = 1L, mergeTree = FALSE, ...) - -\S4method{mergeFeatures}{TreeSummarizedExperiment}( - x, - f, - archetype = 1L, - mergeTree = FALSE, - mergeRefSeq = FALSE, - ... -) - -\S4method{mergeSamples}{TreeSummarizedExperiment}(x, f, archetype = 1L, mergeTree = FALSE, ...) -} -\arguments{ -\item{x}{a \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} or -a \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}} - -\item{f}{A factor for merging. Must be the same length as -\code{nrow(x)/ncol(x)}. Rows/Cols corresponding to the same level will be -merged. If \code{length(levels(f)) == nrow(x)/ncol(x)}, \code{x} will be -returned unchanged.} - -\item{archetype}{Of each level of \code{f}, which element should be regarded -as the archetype and metadata in the columns or rows kept, while merging? -This can be single integer value or an integer vector of the same length -as \code{levels(f)}. (Default: \code{archetype = 1L}, which means the first -element encountered per factor level will be kept)} - -\item{...}{Optional arguments: -\itemize{ -\item{Passed on to \code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}, -with the exception of \code{subset_row}, \code{subset_col}} -}} - -\item{mergeTree}{\code{TRUE} or \code{FALSE}: Should -\code{rowTree()} also be merged? (Default: \code{mergeTree = FALSE})} - -\item{mergeRefSeq}{\code{TRUE} or \code{FALSE}: Should a consensus sequence -be calculated? If set to \code{FALSE}, the result from \code{archetype} is -returned; If set to \code{TRUE} the result from -\code{\link[DECIPHER:ConsensusSequence]{DECIPHER::ConsensusSequence}} is -returned. (Default: \code{mergeRefSeq = FALSE})} -} -\value{ -An object of the same class as \code{x} with the specified entries -merged into one entry in all relevant components. -} -\description{ -\code{mergeRows}/\code{mergeCols} merge data on rows or columns of a -\code{SummarizedExperiment} as defined by a \code{factor} alongside the -chosen dimension. Metadata from the \code{rowData} or \code{colData} are -retained as defined by \code{archetype}. -} -\details{ -\code{\link[SummarizedExperiment:SummarizedExperiment-class]{assay}} are -agglomerated, i.e. summed up. If the assay contains values other than counts -or absolute values, this can lead to meaningless values being produced. - -These functions are similar to -\code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}. -However, additional support for \code{TreeSummarizedExperiment} was added and -science field agnostic names were used. In addition the \code{archetype} -argument lets the user select how to preserve row or column data. - -For merge data of assays the function from \code{scuttle} are used. -} -\examples{ -data(esophagus) -esophagus -plot(rowTree(esophagus)) -# get a factor for merging -f <- factor(regmatches(rownames(esophagus), - regexpr("^[0-9]*_[0-9]*",rownames(esophagus)))) -merged <- mergeRows(esophagus,f, mergeTree = TRUE) -plot(rowTree(merged)) -# -data(GlobalPatterns) -GlobalPatterns -merged <- mergeCols(GlobalPatterns,colData(GlobalPatterns)$SampleType) -merged -} -\seealso{ -\code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}} -} diff --git a/man/splitByRanks.Rd b/man/splitByRanks.Rd index ba19d759b..b5b1bd718 100644 --- a/man/splitByRanks.Rd +++ b/man/splitByRanks.Rd @@ -99,7 +99,7 @@ x \seealso{ \code{\link[=splitOn]{splitOn}} \code{\link[=unsplitOn]{unsplitOn}} -\code{\link[=merge-methods]{mergeRows}}, +\code{\link[=agglomerate-methods]{agglomerateByVariable}}, \code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}, \code{\link[=agglomerate-methods]{agglomerateByRank}}, \code{\link[SingleCellExperiment:altExps]{altExps}}, diff --git a/man/splitOn.Rd b/man/splitOn.Rd index dceae5d4e..50e10298d 100644 --- a/man/splitOn.Rd +++ b/man/splitOn.Rd @@ -34,9 +34,10 @@ object or a list of \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} objects.} -\item{...}{Arguments passed to \code{mergeRows}/\code{mergeCols} function for +\item{...}{Arguments passed to \code{agglomerateByVariable} function for \code{SummarizedExperiment} objects and other functions. -See \code{\link[=agglomerate-methods]{mergeRows}} for more details. +See \code{\link[=agglomerate-methods]{agglomerateByVariable}} for more +details. \itemize{ \item{\code{use_names} A single boolean value to select whether to name elements of list by their group names.} @@ -113,7 +114,7 @@ unsplitOn(se_list) \seealso{ \code{\link[=splitByRanks]{splitByRanks}} \code{\link[=unsplitByRanks]{unsplitByRanks}} -\code{\link[=merge-methods]{mergeRows}}, +\code{\link[=agglomerate-methods]{agglomerateByVariable}}, \code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}, \code{\link[=agglomerate-methods]{agglomerateByRank}}, \code{\link[SingleCellExperiment:altExps]{altExps}}, diff --git a/tests/testthat/test-2merge.R b/tests/testthat/test-2merge.R index d70b2bd42..0c7de4c16 100644 --- a/tests/testthat/test-2merge.R +++ b/tests/testthat/test-2merge.R @@ -61,7 +61,7 @@ test_that("merge", { expect_error(mia:::.merge_rows(x), 'argument "f" is missing') FUN_check_x <- function(x,archetype=1){ - actual <- mergeFeatures(x, f, archetype) + actual <- agglomerateByVariable(x, MARGIN = "rows", f, archetype) expect_s4_class(actual,class(x)) expect_equal(dim(actual),c(2,10)) } @@ -77,7 +77,7 @@ test_that("merge", { xtse <- TreeSummarizedExperiment(assays = list(mat = mat), rowRanges = unname(grl)) FUN_check_x <- function(x,archetype=1){ - actual <- mergeFeatures(x, f, archetype) + actual <- agglomerateByVariable(x, MARGIN = "rows", f, archetype) expect_s4_class(actual,class(x)) expect_equal(dim(actual),c(2,10)) } @@ -99,22 +99,33 @@ test_that("merge", { # (trees are pruned differently --> first instance represent specific branch) tse <- tse[c(rownames(esophagus), rownames(GlobalPatterns)), ] # Only esophagus has these groups --> the merge should contain only esophagus - merged <- mergeFeatures(tse, f=rowData(tse)$group2, mergeTree=TRUE) - merged2 <- mergeFeatures(tse, f = rowData(tse)$group2, mergeTree = FALSE) - merged3 <- mergeFeatures(esophagus, f = rowData(esophagus)$group2, mergeTree = TRUE) - merged4 <- .merge_features(tse, merge.by = rowData(tse)$group2, mergeTree = TRUE) - merged5 <- mergeFeatures(tse, f = rowData(tse)$group2, mergeTree = TRUE) + merged <- agglomerateByVariable(tse, MARGIN = "rows", + f = rowData(tse)$group2, mergeTree=TRUE) + merged2 <- agglomerateByVariable(tse, MARGIN = "rows", + f = rowData(tse)$group2, mergeTree = FALSE) + merged3 <- agglomerateByVariable(esophagus, MARGIN = "rows", + f = rowData(esophagus)$group2, + mergeTree = TRUE) + merged4 <- .merge_features(tse, merge.by = rowData(tse)$group2, + mergeTree = TRUE) + merged5 <- agglomerateByVariable(tse, MARGIN = "rows", + f = rowData(tse)$group2, mergeTree = TRUE) expect_equal( rowLinks(merged)$whichTree, rowLinks(merged2)$whichTree ) expect_false( all(rowLinks(merged) == rowLinks(merged2)) ) expect_equal(rowTree(tse), rowTree(merged2)) expect_equal(rowTree(merged), rowTree(merged3)) expect_equal(merged4, merged5) - expect_equal(mergeFeatures(tse, f=rowData(tse)$group2), mergeFeatures(tse, f=rowData(tse)$group2)) + expect_equal(agglomerateByVariable(tse, MARGIN = "rows", + f=rowData(tse)$group2), + agglomerateByVariable(tse, MARGIN = "rows", + f=rowData(tse)$group2)) # Both datasets have group variable - merged <- mergeFeatures(tse, f = rowData(tse)$group, mergeTree = TRUE) - merged2 <- mergeFeatures(tse, f = rowData(tse)$group, mergeTree = FALSE) + merged <- agglomerateByVariable(tse, MARGIN = "rows", + f = rowData(tse)$group, mergeTree = TRUE) + merged2 <- agglomerateByVariable(tse, MARGIN = "rows", + f = rowData(tse)$group, mergeTree = FALSE) expect_equal( rowLinks(merged)$whichTree, rowLinks(merged2)$whichTree ) expect_false( all(rowLinks(merged) == rowLinks(merged2)) ) diff --git a/tests/testthat/test-3agglomerate.R b/tests/testthat/test-3agglomerate.R index 2836c773a..4d1a9d06e 100644 --- a/tests/testthat/test-3agglomerate.R +++ b/tests/testthat/test-3agglomerate.R @@ -14,13 +14,15 @@ test_that("agglomerate", { rowData(xtse) <- tax_data # mergeRows for mergeFeaturesByRank tax_factors <- mia:::.get_tax_groups(xtse, col = 2) - actual_family <- actual <- mergeRows(xtse, f = tax_factors) + actual_family <- actual <- agglomerateByVariable(xtse, MARGIN = "rows", + f = tax_factors) expect_s4_class(actual,class(xtse)) expect_equal(dim(actual),c(8,10)) expect_equal(assays(actual)$mat[8,1],c(c_NA = 8)) expect_equal(assays(actual)$mat[7,1],c(c_h = 16)) tax_factors <- mia:::.get_tax_groups(xtse, col = 1) - actual_phylum <- actual <- mergeRows(xtse, f = tax_factors) + actual_phylum <- actual <- agglomerateByVariable(xtse, MARGIN = "rows", + f = tax_factors) expect_s4_class(actual,class(xtse)) expect_equal(dim(actual),c(3,10)) expect_equal(assays(actual)$mat[1,1],c(a = 6))