diff --git a/R/agglomerate.R b/R/agglomerate.R index b50cf15bd..1f3a2987e 100644 --- a/R/agglomerate.R +++ b/R/agglomerate.R @@ -45,20 +45,32 @@ #' to \code{getPrevalence} and \code{getPrevalentTaxa} and used in #' \code{agglomeratebyPrevalence} #' \itemize{ -#' \item \code{remove_empty_ranks}: A single boolean value for selecting +#' \item \code{remove_empty_ranks}: A single boolean value for selecting #' whether to remove those columns of rowData that include only NAs after #' agglomeration. (By default: \code{remove_empty_ranks = FALSE}) -#' \item \code{make_unique}: A single boolean value for selecting +#' \item \code{make_unique}: A single boolean value for selecting #' whether to make rownames unique. (By default: \code{make_unique = TRUE}) -#' \item \code{detection}: Detection threshold for absence/presence. -#' Either an absolute value compared directly to the values of \code{x} +#' \item \code{detection}: Detection threshold for absence/presence. +#' Either an absolute value compared directly to the values of \code{x} #' or a relative value between 0 and 1, if \code{as_relative = FALSE}. -#' \item \code{prevalence}: Prevalence threshold (in 0 to 1). The -#' required prevalence is strictly greater by default. To include the +#' \item \code{prevalence}: Prevalence threshold (in 0 to 1). The +#' required prevalence is strictly greater by default. To include the #' limit, set \code{include_lowest} to \code{TRUE}. -#' \item \code{as.relative}: Logical scalar: Should the detection -#' threshold be applied on compositional (relative) abundances? +#' \item \code{as.relative}: Logical scalar: Should the detection +#' threshold be applied on compositional (relative) abundances? #' (default: \code{FALSE}) +#' \item \code{mergeRefSeq} \code{TRUE} or \code{FALSE}: Should a +#' consensus sequence be calculated? If set to \code{FALSE}, the result +#' from \code{archetype} is returned; If set to \code{TRUE} the result +#' from +#' \code{\link[DECIPHER:ConsensusSequence]{DECIPHER::ConsensusSequence}} +#' is returned. (Default: \code{mergeRefSeq = FALSE}) +#' \item \code{archetype} Of each level of \code{f}, which element should +#' be regarded as the archetype and metadata in the columns or rows kept, +#' while merging? This can be single integer value or an integer vector +#' of the same length as \code{levels(f)}. (Default: +#' \code{archetype = 1L}, which means the first element encountered per +#' factor level will be kept) #' } #' #' @param altexp String or integer scalar specifying an alternative experiment @@ -78,28 +90,16 @@ #' merged. If \code{length(levels(f)) == nrow(x)/ncol(x)}, \code{x} will be #' returned unchanged. #' -#' @param archetype Of each level of \code{f}, which element should be regarded -#' as the archetype and metadata in the columns or rows kept, while merging? -#' This can be single integer value or an integer vector of the same length -#' as \code{levels(f)}. (Default: \code{archetype = 1L}, which means the first -#' element encountered per factor level will be kept) -#' #' @param mergeTree \code{TRUE} or \code{FALSE}: Should #' \code{rowTree()} also be merged? (Default: \code{mergeTree = FALSE}) #' -#' @param mergeRefSeq \code{TRUE} or \code{FALSE}: Should a consensus sequence -#' be calculated? If set to \code{FALSE}, the result from \code{archetype} is -#' returned; If set to \code{TRUE} the result from -#' \code{\link[DECIPHER:ConsensusSequence]{DECIPHER::ConsensusSequence}} is -#' returned. (Default: \code{mergeRefSeq = FALSE}) -#' #' @details #' #' Agglomeration sums up the values of assays at the specified taxonomic level. With #' certain assays, e.g. those that include binary or negative values, this summing #' can produce meaningless values. In those cases, consider performing agglomeration #' first, and then applying the transformation afterwards. -#' +#' #' \code{agglomerateByVariable} works similarly to #' \code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}. #' However, additional support for \code{TreeSummarizedExperiment} was added and @@ -108,12 +108,12 @@ #' #' For merge data of assays the function from \code{scuttle} are used. #' -#' @return -#' \code{agglomerateByRank} returns a taxonomically-agglomerated, +#' @return +#' \code{agglomerateByRank} returns a taxonomically-agglomerated, #' optionally-pruned object of the same class as \code{x}. -#' \code{agglomerateByVariable} returns an object of the same class as \code{x} +#' \code{agglomerateByVariable} returns an object of the same class as \code{x} #' with the specified entries merged into one entry in all relevant components. -#' \code{agglomerateByRank} returns a taxonomically-agglomerated, +#' \code{agglomerateByRank} returns a taxonomically-agglomerated, #' optionally-pruned object of the same class as \code{x}. #' #' @name agglomerate-methods @@ -232,7 +232,7 @@ setGeneric("agglomerateByVariable", #' #' @export setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"), - function(x, rank = taxonomyRanks(x)[1], na.rm = FALSE, + function(x, rank = taxonomyRanks(x)[1], na.rm = TRUE, empty.fields = c(NA, "", " ", "\t", "-", "_"), ...){ # input check if(nrow(x) == 0L){ @@ -273,9 +273,14 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"), # get groups of taxonomy entries tax_factors <- .get_tax_groups(x, col = col, ...) + # Convert to factors. Use na.rm so that NA values are not preserved. + # i.e. they are not convrted into character values. + # NA values are handled earlier in this function. + tax_factors <- .norm_f(nrow(x), tax_factors, na.rm = TRUE) # merge taxa - x <- agglomerateByVariable(x, MARGIN = "rows", f = tax_factors, ...) + x <- agglomerateByVariable( + x, MARGIN = "rows", f = tax_factors, na.rm = TRUE, ...) # "Empty" the values to the right of the rank, using NA_character_. if( col < length(taxonomyRanks(x)) ){ @@ -303,10 +308,11 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"), #' @aliases agglomerateByVariable #' @export setMethod("agglomerateByVariable", signature = c(x = "SummarizedExperiment"), - function(x, MARGIN, f, archetype = 1L, ...){ + function(x, MARGIN, f, ...){ MARGIN <- .check_MARGIN(MARGIN) - FUN <- switch(MARGIN, .merge_rows_SE, .merge_cols_SE) - FUN(x, f, archetype = archetype, ...) + FUN <- switch(MARGIN, .merge_rows, .merge_cols) + x <- FUN(x, f, ...) + return(x) } ) @@ -315,17 +321,14 @@ setMethod("agglomerateByVariable", signature = c(x = "SummarizedExperiment"), #' @export setMethod("agglomerateByVariable", signature = c(x = "TreeSummarizedExperiment"), - function(x, MARGIN, f, archetype = 1L, mergeTree = FALSE, - mergeRefSeq = FALSE, ...){ + function(x, MARGIN, f, mergeTree = FALSE, ...){ + # Check MARGIN MARGIN <- .check_MARGIN(MARGIN) - if ( MARGIN == 1L ){ - .merge_rows_TSE(x, f, archetype = 1L, mergeTree = mergeTree, - mergeRefSeq = mergeRefSeq, ...) - } - else{ - .merge_cols_TSE(x, f, archetype = 1L, mergeTree = mergeTree, - ...) - } + # Get function based on MARGIN + FUN <- switch(MARGIN, .merge_rows_TSE, .merge_cols_TSE) + # Agglomerate + x <- FUN(x, f, mergeTree = mergeTree, ...) + return(x) } ) @@ -507,4 +510,4 @@ setMethod( tree <- collapse.singles(tree) } return(tree) -} \ No newline at end of file +} diff --git a/R/getPrevalence.R b/R/getPrevalence.R index 6e0484edb..3a1e59e9d 100644 --- a/R/getPrevalence.R +++ b/R/getPrevalence.R @@ -42,11 +42,11 @@ #' \code{\link[=agglomerate-methods]{?agglomerateByRank}} for more details. #' Note that you can specify whether to remove empty ranks with #' \code{agg.na.rm} instead of \code{na.rm}. (default: \code{FALSE}) -#' +#' #' \item for \code{getPrevalent}, \code{getRare}, \code{subsetByPrevalent} #' and \code{subsetByRare} additional parameters passed to #' \code{getPrevalence} -#' +#' #' \item for \code{getPrevalentAbundance} additional parameters passed to #' \code{getPrevalent} #' } @@ -64,27 +64,27 @@ #' proportion of the core species (in between 0 and 1). The core taxa are #' defined as those that exceed the given population prevalence threshold at the #' given detection level as set for \code{getPrevalent}. -#' +#' #' \code{subsetPrevalent} and \code{subsetRareFeatures} return a subset of -#' \code{x}. -#' The subset includes the most prevalent or rare taxa that are calculated with +#' \code{x}. +#' The subset includes the most prevalent or rare taxa that are calculated with #' \code{getPrevalent} or \code{getRare} respectively. #' #' @return #' \code{subsetPrevalent} and \code{subsetRareFeatures} return subset of #' \code{x}. -#' +#' #' All other functions return a named vectors: #' \itemize{ -#' \item \code{getPrevalence} returns a \code{numeric} vector with the -#' names being set to either the row names of \code{x} or the names after +#' \item \code{getPrevalence} returns a \code{numeric} vector with the +#' names being set to either the row names of \code{x} or the names after #' agglomeration. -#' +#' #' \item \code{getPrevalentAbundance} returns a \code{numeric} vector with -#' the names corresponding to the column name of \code{x} and include the +#' the names corresponding to the column name of \code{x} and include the #' joint abundance of prevalent taxa. -#' -#' \item \code{getPrevalent} and \code{getRare} return a +#' +#' \item \code{getPrevalent} and \code{getRare} return a #' \code{character} vector with only the names exceeding the threshold set #' by \code{prevalence}, if the \code{rownames} of \code{x} is set. #' Otherwise an \code{integer} vector is returned matching the rows in @@ -153,7 +153,7 @@ #' detection = 0.001, #' prevalence = 0.55, #' as_relative = TRUE) -#' altExp(tse, "prevalent") +#' altExp(tse, "prevalent") #' #' # getRare returns the inverse #' rare <- getRare(tse, @@ -169,8 +169,8 @@ #' detection = 0.001, #' prevalence = 0.001, #' as_relative = TRUE) -#' altExp(tse, "rare") -#' +#' altExp(tse, "rare") +#' #' # Names of both experiments, prevalent and rare, can be found from slot #' # altExpNames #' tse @@ -236,7 +236,7 @@ setMethod("getPrevalence", signature = c(x = "ANY"), function( .agg_for_prevalence <- function( x, rank, relabel = FALSE, make_unique = TRUE, na.rm = FALSE, - agg.na.rm = FALSE, ...){ + agg.na.rm = TRUE, ...){ # Check na.rm. It is not used in this function, it is only catched so that # it can be passed to getPrevalence(matrix) and not use it here in # agglomerateByRank function. @@ -295,7 +295,7 @@ setMethod("getPrevalence", signature = c(x = "SummarizedExperiment"), #' given detection threshold for the selected taxonomic rank. #' #' @aliases getPrevalent -#' +#' #' @export setGeneric("getPrevalent", signature = "x", function(x, ...) @@ -376,7 +376,7 @@ setMethod("getPrevalent", signature = c(x = "ANY"), #' @rdname getPrevalence #' @export setMethod("getPrevalent", signature = c(x = "SummarizedExperiment"), - function(x, rank = NULL, prevalence = 50/100, + function(x, rank = NULL, prevalence = 50/100, include_lowest = FALSE, ...){ .get_prevalent_taxa(x, rank = rank, prevalence = prevalence, include_lowest = include_lowest, ...) @@ -389,7 +389,7 @@ setMethod("getPrevalent", signature = c(x = "SummarizedExperiment"), #' #' @details #' \code{getRare} returns complement of \code{getPrevalent}. -#' +#' #' @export setGeneric("getRare", signature = "x", function(x, ...) @@ -432,7 +432,7 @@ setMethod("getRare", signature = c(x = "ANY"), #' @rdname getPrevalence #' @export setMethod("getRare", signature = c(x = "SummarizedExperiment"), - function(x, rank = NULL, prevalence = 50/100, + function(x, rank = NULL, prevalence = 50/100, include_lowest = FALSE, ...){ .get_rare_taxa(x, rank = rank, prevalence = prevalence, include_lowest = include_lowest, ...) @@ -571,8 +571,8 @@ setMethod("agglomerateByPrevalence", signature = c(x = "SummarizedExperiment"), pr <- getPrevalent(x, rank = NULL, ...) f <- rownames(x) %in% pr if(any(!f)){ - other_x <- agglomerateByVariable(x[!f,], MARGIN = "rows", - factor(rep(1L,sum(!f))), + other_x <- agglomerateByVariable(x[!f,], MARGIN = "rows", + factor(rep(1L,sum(!f))), check_assays = FALSE) rowData(other_x)[,colnames(rowData(other_x))] <- NA # set the other label diff --git a/R/merge.R b/R/merge.R index 3df5d6df8..8a960a9d8 100644 --- a/R/merge.R +++ b/R/merge.R @@ -1,8 +1,8 @@ .norm_f <- function(i, f, dim.type = c("rows","columns"), na.rm = FALSE, ...){ - dim.type <- match.arg(dim.type) if(!.is_a_bool(na.rm)){ - stop("'na.rm' must be TRUE or FALSE.", call. = FALSE) + stop("'na.rm' must be TRUE or FALSE.", call. = FALSE) } + dim.type <- match.arg(dim.type) if(!is.character(f) && !is.factor(f)){ stop("'f' must be a factor or character vector coercible to a ", "meaningful factor.", @@ -13,7 +13,7 @@ call. = FALSE) } # This is done otherwise we lose NA values - if (!na.rm && any(is.na(f))) { + if( !na.rm && any(is.na(f)) ){ f <- as.character(f) f[ is.na(f) ] <- "NA" } @@ -75,7 +75,7 @@ if( .is_a_string(f) && f %in% colnames(rowData(x)) ){ f <- rowData(x)[[ f ]] } - f <- .norm_f(nrow(x), f) + f <- .norm_f(nrow(x), f, ...) if(length(levels(f)) == nrow(x)){ return(x) } @@ -128,7 +128,7 @@ if( .is_a_string(f) && f %in% colnames(colData(x)) ){ f <- colData(x)[[ f ]] } - f <- .norm_f(ncol(x), f, "columns") + f <- .norm_f(ncol(x), f, "columns", ...) if(length(levels(f)) == ncol(x)){ return(x) @@ -163,14 +163,6 @@ x } -.merge_rows_SE <- function(x, f, archetype = 1L, ...){ - .merge_rows(x, f, archetype = archetype, ...) -} - -.merge_cols_SE <- function(x, f, archetype = 1L, ...){ - .merge_cols(x, f, archetype = archetype, ...) -} - #' @importFrom Biostrings DNAStringSetList .merge_refseq_list <- function(sequences_list, f, names, ...){ threshold <- list(...)[["threshold"]] @@ -211,7 +203,7 @@ refSeq <- referenceSeq(x) } # - x <- .merge_rows_SE(x, f, archetype = 1L, ...) + x <- .merge_rows(x, f, archetype = 1L, ...) # optionally merge rowTree if( mergeTree ){ x <- .agglomerate_trees(x, 1) @@ -229,7 +221,7 @@ stop("'mergeTree' must be TRUE or FALSE.", call. = FALSE) } # - x <- .merge_cols_SE(x, f, archetype = 1L, ...) + x <- .merge_cols(x, f, archetype = 1L, ...) # optionally merge colTree if( mergeTree ){ x <- .agglomerate_trees(x, 2) diff --git a/man/agglomerate-methods.Rd b/man/agglomerate-methods.Rd index 9a9878c96..5723ac926 100644 --- a/man/agglomerate-methods.Rd +++ b/man/agglomerate-methods.Rd @@ -29,22 +29,14 @@ agglomerateByVariable(x, ...) \S4method{agglomerateByRank}{SummarizedExperiment}( x, rank = taxonomyRanks(x)[1], - na.rm = FALSE, + na.rm = TRUE, empty.fields = c(NA, "", " ", "\\t", "-", "_"), ... ) -\S4method{agglomerateByVariable}{SummarizedExperiment}(x, MARGIN, f, archetype = 1L, ...) +\S4method{agglomerateByVariable}{SummarizedExperiment}(x, MARGIN, f, ...) -\S4method{agglomerateByVariable}{TreeSummarizedExperiment}( - x, - MARGIN, - f, - archetype = 1L, - mergeTree = FALSE, - mergeRefSeq = FALSE, - ... -) +\S4method{agglomerateByVariable}{TreeSummarizedExperiment}(x, MARGIN, f, mergeTree = FALSE, ...) \S4method{agglomerateByRank}{SingleCellExperiment}(x, ..., altexp = NULL, strip_altexp = TRUE) @@ -123,21 +115,9 @@ Must be \code{'rows'} or \code{'cols'}.} merged. If \code{length(levels(f)) == nrow(x)/ncol(x)}, \code{x} will be returned unchanged.} -\item{archetype}{Of each level of \code{f}, which element should be regarded -as the archetype and metadata in the columns or rows kept, while merging? -This can be single integer value or an integer vector of the same length -as \code{levels(f)}. (Default: \code{archetype = 1L}, which means the first -element encountered per factor level will be kept)} - \item{mergeTree}{\code{TRUE} or \code{FALSE}: Should \code{rowTree()} also be merged? (Default: \code{mergeTree = FALSE})} -\item{mergeRefSeq}{\code{TRUE} or \code{FALSE}: Should a consensus sequence -be calculated? If set to \code{FALSE}, the result from \code{archetype} is -returned; If set to \code{TRUE} the result from -\code{\link[DECIPHER:ConsensusSequence]{DECIPHER::ConsensusSequence}} is -returned. (Default: \code{mergeRefSeq = FALSE})} - \item{altexp}{String or integer scalar specifying an alternative experiment containing the input data.} diff --git a/man/getPrevalence.Rd b/man/getPrevalence.Rd index 6898c23e4..f60c800a5 100644 --- a/man/getPrevalence.Rd +++ b/man/getPrevalence.Rd @@ -218,7 +218,7 @@ altExp(tse, "prevalent") <- subsetByPrevalent(tse, detection = 0.001, prevalence = 0.55, as_relative = TRUE) -altExp(tse, "prevalent") +altExp(tse, "prevalent") # getRare returns the inverse rare <- getRare(tse, @@ -234,7 +234,7 @@ altExp(tse, "rare") <- subsetByRare(tse, detection = 0.001, prevalence = 0.001, as_relative = TRUE) -altExp(tse, "rare") +altExp(tse, "rare") # Names of both experiments, prevalent and rare, can be found from slot # altExpNames diff --git a/tests/testthat/test-2merge.R b/tests/testthat/test-2merge.R index 4c7fbd1c2..1f1ad9fc9 100644 --- a/tests/testthat/test-2merge.R +++ b/tests/testthat/test-2merge.R @@ -77,7 +77,8 @@ test_that("merge", { xtse <- TreeSummarizedExperiment(assays = list(mat = mat), rowRanges = unname(grl)) FUN_check_x <- function(x,archetype=1){ - actual <- agglomerateByVariable(x, MARGIN = "rows", f, archetype) + actual <- agglomerateByVariable(x, MARGIN = "rows", f, archetype, + mergeTree = FALSE) expect_s4_class(actual,class(x)) expect_equal(dim(actual),c(2,10)) } diff --git a/tests/testthat/test-3agglomerate.R b/tests/testthat/test-3agglomerate.R index 1756a167d..e1f1e8d82 100644 --- a/tests/testthat/test-3agglomerate.R +++ b/tests/testthat/test-3agglomerate.R @@ -12,7 +12,7 @@ test_that("agglomerate", { Family = c("c",NA,"d","e","f","g","h",NA,"h","e","f"), n = 7:17) rowData(xtse) <- tax_data - # mergeRows for mergeFeaturesByRank + # mergeRows for agglomerateByRank tax_factors <- mia:::.get_tax_groups(xtse, col = 2) actual_family <- actual <- agglomerateByVariable(xtse, MARGIN = "rows", f = tax_factors) @@ -29,137 +29,155 @@ test_that("agglomerate", { expect_equal(assays(actual)$mat[2,1],c(b = 36)) expect_equal(assays(actual)$mat[3,1],c(c = 24)) # - expect_error(mergeFeaturesByRank(xtse,"",na.rm=FALSE), + expect_error(agglomerateByRank(xtse,"",na.rm=FALSE), "'rank' must be a non-empty single character value") - expect_error(mergeFeaturesByRank(xtse,"Family",na.rm=""), + expect_error(agglomerateByRank(xtse,"Family",na.rm=""), "'na.rm' must be TRUE or FALSE") expect_error( - mergeFeaturesByRank(xtse,"Family",na.rm=FALSE,agglomerate.tree=""), + agglomerateByRank(xtse,"Family",na.rm=FALSE,agglomerate.tree=""), "'agglomerate.tree' must be TRUE or FALSE") xtse2 <- xtse rowData(xtse2) <- NULL - expect_error(mergeFeaturesByRank(xtse2,"Family",na.rm=FALSE), + expect_error(agglomerateByRank(xtse2,"Family",na.rm=FALSE), "taxonomyData needs to be populated") # - actual <- mergeFeaturesByRank(xtse,"Family",na.rm=FALSE) + actual <- agglomerateByRank(xtse,"Family",na.rm=FALSE) expect_equivalent(rowData(actual),rowData(actual_family)) - actual <- mergeFeaturesByRank(xtse,"Phylum",na.rm=FALSE) + actual <- agglomerateByRank(xtse,"Phylum",na.rm=FALSE) expect_equivalent(rowData(actual),rowData(actual_phylum)) # - actual <- mergeFeaturesByRank(xtse,"Family", onRankOnly = FALSE, na.rm = TRUE) + actual <- agglomerateByRank(xtse,"Family", onRankOnly = FALSE, na.rm = TRUE) expect_equal(dim(actual),c(6,10)) expect_equal(rowData(actual)$Family,c("c","d","e","f","g","h")) - actual <- mergeFeaturesByRank(xtse,"Family", onRankOnly = FALSE, na.rm = FALSE) # the default + actual <- agglomerateByRank(xtse,"Family", onRankOnly = FALSE, na.rm = FALSE) # the default expect_equal(dim(actual),c(8,10)) expect_equal(rowData(actual)$Family,c("c","d","e","f","g","h",NA,NA)) - actual <- mergeFeaturesByRank(xtse,"Phylum") + actual <- agglomerateByRank(xtse,"Phylum") expect_equivalent(rowData(actual),rowData(actual_phylum)) # - actual1 <- mergeFeaturesByRank(xtse,"Family") + actual1 <- agglomerateByRank(xtse,"Family") actual2 <- .merge_features(xtse, merge.by = "Family") expect_equal(actual1, actual2) - expect_equal(mergeFeaturesByRank(xtse,"Family"), mergeFeaturesByRank(xtse,"Family")) + expect_equal(agglomerateByRank(xtse,"Family"), agglomerateByRank(xtse,"Family")) # Only one rank available in the object - # the same dimensionality is retained data(enterotype, package="mia") expect_equal(length(unique(rowData(enterotype)[,"Genus"])), - nrow(mergeFeaturesByRank(enterotype,"Genus", onRankOnly = FALSE))) + nrow(agglomerateByRank(enterotype,"Genus", onRankOnly = FALSE, + na.rm = FALSE))) # agglomeration in all its forms data(GlobalPatterns, package="mia") se <- GlobalPatterns - actual <- mergeFeaturesByRank(se, rank = "Family", onRankOnly = FALSE) + actual <- agglomerateByRank(se, rank = "Family", + onRankOnly = FALSE, na.rm = FALSE) expect_equal(dim(actual),c(603,26)) expect_equal(length(rowTree(actual)$tip.label), length(rowTree(se)$tip.label)) - actual <- mergeFeaturesByRank(se, rank = "Family", onRankOnly = FALSE, agglomerate.tree = TRUE) + actual <- agglomerateByRank(se, rank = "Family", + onRankOnly = FALSE, na.rm = FALSE, agglomerate.tree = TRUE) expect_equal(dim(actual),c(603,26)) expect_equal(length(rowTree(actual)$tip.label), 603) - actual <- mergeFeaturesByRank(se, rank = "Family", onRankOnly = FALSE, agglomerate.tree = TRUE) + actual <- agglomerateByRank(se, rank = "Family", + onRankOnly = FALSE, na.rm = FALSE, agglomerate.tree = TRUE) expect_equal(dim(actual),c(603,26)) expect_equal(length(rowTree(actual)$tip.label), nrow(actual)) # Test that warning occurs when assay contian binary or negative values se1 <- transformAssay(se, method = "pa") se2 <- se1 assay(se2, "pa")[1, 1] <- -1 - expect_warning(mergeFeaturesByRank(se1, rank = "Phylum")) - expect_warning(mergeFeaturesByRank(se1, rank = "Order")) - + expect_warning(agglomerateByRank(se1, rank = "Phylum")) + expect_warning(agglomerateByRank(se1, rank = "Order")) + + # Load data data(GlobalPatterns, package="mia") tse <- GlobalPatterns - tse <- transformAssay(tse, assay.type="counts", method="relabundance") - altExp(tse, "Family") <- agglomerateByRank(tse, rank="Family", onRankOnly = TRUE) - altExp(tse, "Family1") <- agglomerateByRank(tse, rank="Family", onRankOnly = TRUE) - altExp(tse, "Family2") <- agglomerateByRank(tse, rank="Family", onRankOnly = FALSE) - altExp(tse, "Family3") <- agglomerateByRank(tse, rank="Family", onRankOnly = TRUE, na.rm = TRUE) - altExp(tse, "Family4") <- agglomerateByRank(tse, rank="Family", onRankOnly = TRUE, na.rm = FALSE) - altExp(tse, "Family5") <- agglomerateByVariable(tse, f="Family", MARGIN = 'row') + + # Check that na.rm works + # Get all phyla + all_phyla <- unique( rowData(tse)$Phylum ) + + # When na.rm = FALSE, then phyla should also include NA --> one extra row + test0 <- agglomerateByVariable(tse, MARGIN = 1, f = "Phylum", na.rm = FALSE) + test1 <- agglomerateByRank(tse, rank = "Phylum", na.rm = FALSE) + + # Test that dimentionality is the same for merging object by agglomerateByRank + # and agglomerateByVariable. + expect_equal(nrow(test0), length(all_phyla)) + expect_equal(nrow(test1), length(all_phyla)) + + # When na.rm = TRUE, there should be as many rows as there are non-NA phyla + test0 <- agglomerateByVariable(tse, MARGIN = 1, f = "Phylum", na.rm = TRUE) + test1 <- agglomerateByRank(tse, rank = "Phylum", na.rm = TRUE) - # Other group is added by agglomerateByPrevalence function to collect features under threshold - actual <- agglomerateByPrevalence(tse, rank="Family", assay.type="relabundance", - detection = 0.5/100, prevalence = 20/100, onRankOnly = TRUE) - actual0 <- agglomerateByPrevalence(altExp(tse, "Family"), assay.type="relabundance", - detection = 0.5/100, prevalence = 20/100) - actual1 <- agglomerateByPrevalence(altExp(tse, "Family1"), assay.type="relabundance", - detection = 0.5/100, prevalence = 20/100) - actual2 <- agglomerateByPrevalence(altExp(tse, "Family2"), assay.type="relabundance", - detection = 0.5/100, prevalence = 20/100) - actual3 <- agglomerateByPrevalence(altExp(tse, "Family3"), assay.type="relabundance", - detection = 0.5/100, prevalence = 20/100) - actual4 <- agglomerateByPrevalence(altExp(tse, "Family4"), assay.type="relabundance", - detection = 0.5/100, prevalence = 20/100) - actual5 <- agglomerateByPrevalence(altExp(tse, "Family5"), assay.type="relabundance", - detection = 0.5/100, prevalence = 20/100) + # Test that dimentionality is the same when NA values are removed. + expect_equal(nrow(test0), length( all_phyla[!is.na(all_phyla)] )) + expect_equal(nrow(test1), length( all_phyla[!is.na(all_phyla)] )) - # The values of actual( "", 0, 1, 4, 5) are equal since the factor group is created with groups - # at the family level i.e onRankOnly (tax_cols[tax_col_n == col]). - # However, actual2 creates groups based on the full taxonomic hierarchy up to family level - # i.e !onRankOnly (tax_cols[tax_col_n <= col]). While actual3 is less since empty or missing - # fields are removed from the tse object i.e na.rm. - expect_equal(nrow(actual), nrow(actual0)) - expect_equal(nrow(actual), nrow(actual1)) - expect_equal(nrow(actual2), 27) - expect_equal(nrow(actual3), 20) - expect_equal(nrow(actual), nrow(actual4)) - expect_equal(nrow(actual), nrow(actual5)) + # Check that there are more taxa when agglomeration is to "Species" level + test0 <- agglomerateByVariable(tse, MARGIN = 1, f = "Species", na.rm = FALSE) + test1 <- agglomerateByRank(tse, rank = "Species", na.rm = FALSE) + expect_equal(nrow(test0), 945) + expect_equal(nrow(test1), 2307) # Load data from miaTime package skip_if_not(require("miaTime", quietly = TRUE)) data(SilvermanAGutData) se <- SilvermanAGutData + + # checking reference consensus sequence generation + actual <- agglomerateByRank(se,"Genus", mergeRefSeq = FALSE) + # There should be only one exact match for each sequence + seqs_test <- as.character( referenceSeq(actual) ) + seqs_ref <- as.character( referenceSeq(se) ) + expect_true(all(vapply( + seqs_test, function(seq) sum(seqs_ref %in% seq) == 1, + FUN.VALUE = logical(1) )) ) + # Merging creates consensus sequences. + th <- runif(1, 0, 1) + actual <- agglomerateByRank( + se, "Genus", mergeRefSeq = TRUE, threshold = th) + seqs_test <- referenceSeq(actual) + # Get single taxon as reference. Merge those sequences and test that it + # equals to one that is output of agglomerateByRank + seqs_ref <- referenceSeq(se) + feature <- sample(na.omit(rowData(se)[["Genus"]]), 1) + seqs_ref <- seqs_ref[ rowData(se)[["Genus"]] %in% feature ] + seqs_ref <- .merge_refseq( + seqs_ref, factor(rep(feature, length(seqs_ref))), rownames(seqs_ref), + threshold = th) + seqs_test <- seqs_test[ names(seqs_test) %in% feature ] + expect_equal(seqs_test, seqs_ref) + # checking reference consensus sequence generation using 'Genus:Alistipes' - actual <- mergeFeaturesByRank(se,"Genus", mergeRefSeq = FALSE) - expect_equal(as.character(referenceSeq(actual)[["Genus:Alistipes"]]), + actual <- agglomerateByRank(se,"Genus", mergeRefSeq = FALSE) + expect_equal(as.character(referenceSeq(actual)[["Alistipes"]]), paste0("TCAAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGTTTGATAA", "GTTAGAGGTGAAATCCCGGGGCTTAACTCCGGAACTGCCTCTAATACTGTTAG", "ACTAGAGAGTAGTTGCGGTAGGCGGAATGTATGGTGTAGCGGTGAAATGCTTA", "GAGATCATACAGAACACCGATTGCGAAGGCAGCTTACCAAACTATATCTGACG", "TTGAGGCACGAAAGCGTGGGG")) - actual <- mergeFeaturesByRank(se,"Genus", mergeRefSeq = TRUE) - expect_equal(as.character(referenceSeq(actual)[["Genus:Alistipes"]]), + actual <- agglomerateByRank(se,"Genus", mergeRefSeq = TRUE) + expect_equal(as.character(referenceSeq(actual)[["Alistipes"]]), paste0("BCNMKCKTTVWYCKKMHTTMYTKKKYKTMMMKNKHDYKYMKDYKKNHNNNYMM", "KHHNDNNKTKMMMDNBHNBKKCTYMMCHNBNDDDNKSSHBNNRWDMYKKBNND", "NYTDRRKDVHNKNDRVGRNDRSBRRAWTBYNHRKKKWRSSRKKRAAWKSSKWR", "RWDWTNDBRVRRAMHHCMRDKKSSRARGSSVSYYHNYBRRVHNDNNHYKRMVV", "YKVRDNNNSRAARSBDKGGKK")) # Test that remove_empty_ranks work - expect_error(mergeFeaturesByRank(se, rank = "Class", remove_empty_ranks = NULL)) - expect_error(mergeFeaturesByRank(se, rank = "Class", remove_empty_ranks = "NULL")) - expect_error(mergeFeaturesByRank(se, rank = "Class", remove_empty_ranks = 1)) - expect_error(mergeFeaturesByRank(se, rank = "Class", remove_empty_ranks = c(TRUE, TRUE))) - x <- mergeFeaturesByRank(se, rank = "Class") + expect_error(agglomerateByRank(se, rank = "Class", remove_empty_ranks = NULL)) + expect_error(agglomerateByRank(se, rank = "Class", remove_empty_ranks = "NULL")) + expect_error(agglomerateByRank(se, rank = "Class", remove_empty_ranks = 1)) + expect_error(agglomerateByRank(se, rank = "Class", remove_empty_ranks = c(TRUE, TRUE))) + x <- agglomerateByRank(se, rank = "Class") rd1 <- rowData(x)[, 1:3] - x <- mergeFeaturesByRank(se, rank = "Class", remove_empty_ranks = TRUE) + x <- agglomerateByRank(se, rank = "Class", remove_empty_ranks = TRUE) rd2 <- rowData(x) expect_equal(rd1, rd2) # Test that make_unique work - uniq <- mergeFeaturesByRank(se, rank = "Species") - not_uniq <- mergeFeaturesByRank(se, rank = "Species", make_unique = FALSE) + uniq <- agglomerateByRank(se, rank = "Species") + not_uniq <- agglomerateByRank(se, rank = "Species", make_unique = FALSE) expect_true( !any( duplicated(rownames(uniq)) ) ) expect_true( any( duplicated(rownames(not_uniq)) ) ) }) - - - - diff --git a/tests/testthat/test-5prevalence.R b/tests/testthat/test-5prevalence.R index 3b5c3bfd2..f780bf147 100644 --- a/tests/testthat/test-5prevalence.R +++ b/tests/testthat/test-5prevalence.R @@ -53,15 +53,15 @@ test_that("getPrevalence", { # Check that works also when rownames is NULL gp_null <- GlobalPatterns rownames(gp_null) <- NULL - + pr1 <- unname(getPrevalence(GlobalPatterns, detection=0.004, as_relative=TRUE)) - pr2 <- getPrevalence(gp_null, detection=0.004, as_relative=TRUE) + pr2 <- getPrevalence(gp_null, detection=0.004, as_relative=TRUE) expect_equal(pr1, pr2) - + pr1 <- getPrevalence(GlobalPatterns, detection=0.004, as_relative=TRUE, rank = "Family") - pr2 <- getPrevalence(gp_null, detection=0.004, as_relative=TRUE, rank = "Family") + pr2 <- getPrevalence(gp_null, detection=0.004, as_relative=TRUE, rank = "Family") expect_equal(pr1, pr2) - + # Check that na.rm works correctly tse <- GlobalPatterns # Get reference value @@ -82,7 +82,7 @@ test_that("getPrevalence", { res <- res[ !names(res) %in% remove ] ref <- ref[ !names(ref) %in% remove ] expect_equal( res[ names(ref) ], res[ names(ref) ] ) - + # Now test that the number of samples where feature was detected is correct tse <- GlobalPatterns ref <- getPrevalence(tse, assay.type = "counts") @@ -96,21 +96,24 @@ test_that("getPrevalence", { ref <- ref[ feature ] expect_true(res*ncol(tse) == 2) expect_true(ref*ncol(tse) == 3) - - # + + # tse <- GlobalPatterns rank <- "Genus" # Add NA values to matrix remove <- c(15, 200) assay(tse, "counts")[remove, ] <- NA # Check that agglomeration works - tse_agg <- agglomerateByRank(tse, onRankOnly = FALSE, rank = rank) + tse_agg <- agglomerateByRank(tse, onRankOnly = FALSE, na.rm = FALSE, rank = rank) expect_warning(ref <- getPrevalence(tse_agg, na.rm = FALSE)) - expect_warning(res <- getPrevalence(tse, onRankOnly = FALSE, na.rm = FALSE, rank = "Genus")) + expect_warning(res <- getPrevalence(tse, rank = "Genus", agg.na.rm = FALSE)) expect_true( all(res == ref, na.rm = TRUE) ) # - expect_warning(ref <- getPrevalence(tse_agg, na.rm = TRUE)) - expect_warning(res <- getPrevalence(tse, onRankOnly = FALSE, na.rm = TRUE, rank = "Genus")) + tse_agg <- agglomerateByRank( + tse, onRankOnly = FALSE, na.rm = TRUE, rank = rank) + ref <- getPrevalence(tse_agg, na.rm = TRUE) + res <- getPrevalence( + tse, na.rm = TRUE, rank = "Genus", agg.na.rm = TRUE) expect_true( all(res == ref, na.rm = TRUE) ) }) @@ -139,24 +142,24 @@ test_that("getPrevalent", { pr1 <- getPrevalent(GlobalPatterns, detection=0, prevalence=0, as_relative=TRUE) pr2 <- getPrevalent(GlobalPatterns, detection=0, prevalence=0, as_relative=FALSE) expect_true(all(pr1 == pr2)) - + # Check that works also when rownames is NULL gp_null <- GlobalPatterns rownames(gp_null) <- NULL - + pr1 <- getPrevalent(GlobalPatterns, detection=0.0045, prevalence = 0.25, as_relative=TRUE) pr1 <- which(rownames(GlobalPatterns) %in% pr1) - pr2 <- getPrevalent(gp_null, detection=0.0045, prevalence = 0.25, as_relative=TRUE) + pr2 <- getPrevalent(gp_null, detection=0.0045, prevalence = 0.25, as_relative=TRUE) expect_equal(pr1, pr2) - + # Test alias - alias <- getPrevalent(gp_null, detection=0.0045, prevalence = 0.25, as_relative=TRUE) + alias <- getPrevalent(gp_null, detection=0.0045, prevalence = 0.25, as_relative=TRUE) expect_equal(pr1, alias) - - pr1 <- getPrevalent(GlobalPatterns, detection=0.004, prevalence = 0.1, + + pr1 <- getPrevalent(GlobalPatterns, detection=0.004, prevalence = 0.1, + as_relative=TRUE, rank = "Family") + pr2 <- getPrevalent(gp_null, detection=0.004, prevalence = 0.1, as_relative=TRUE, rank = "Family") - pr2 <- getPrevalent(gp_null, detection=0.004, prevalence = 0.1, - as_relative=TRUE, rank = "Family") expect_equal(pr1, pr2) }) @@ -206,7 +209,7 @@ test_that("getRare", { for( rank in ranks ){ # Agglomerates data by rank - se <- mergeFeaturesByRank(GlobalPatterns, rank = rank) + se <- agglomerateByRank(GlobalPatterns, rank = rank) # Gets rownames for all the taxa all_taxa <- rownames(se) @@ -238,24 +241,24 @@ test_that("getRare", { expect_true(!anyDuplicated(rare_taxa)) } - + # Check that works also when rownames is NULL gp_null <- GlobalPatterns rownames(gp_null) <- NULL - + pr1 <- getRare(GlobalPatterns, detection=0.0045, prevalence = 0.25, as_relative=TRUE) pr1 <- which(rownames(GlobalPatterns) %in% pr1) - pr2 <- getRare(gp_null, detection=0.0045, prevalence = 0.25, as_relative=TRUE) + pr2 <- getRare(gp_null, detection=0.0045, prevalence = 0.25, as_relative=TRUE) expect_equal(pr1, pr2) - + # Test lias alias <- getRare(gp_null, detection=0.0045, prevalence = 0.25, as_relative=TRUE) expect_equal(pr1, alias) - - pr1 <- getRare(GlobalPatterns, detection=0.004, prevalence = 0.1, + + pr1 <- getRare(GlobalPatterns, detection=0.004, prevalence = 0.1, + as_relative=TRUE, rank = "Family") + pr2 <- getRare(gp_null, detection=0.004, prevalence = 0.1, as_relative=TRUE, rank = "Family") - pr2 <- getRare(gp_null, detection=0.004, prevalence = 0.1, - as_relative=TRUE, rank = "Family") expect_equal(pr1, pr2) }) @@ -266,46 +269,46 @@ test_that("subsetByPrevalent", { "'prevalence' must be a single numeric value or coercible to one") # Expect TSE object expect_equal(class(subsetByPrevalent(GlobalPatterns)), class(GlobalPatterns)) - + # Results compatible with getPrevalent pr1 <- rownames(subsetByPrevalent( GlobalPatterns, rank = "Class", detection=0.1/100, as_relative=TRUE, sort=TRUE)) - pr2 <- getPrevalent(GlobalPatterns, rank = "Class", detection=0.1/100, + pr2 <- getPrevalent(GlobalPatterns, rank = "Class", detection=0.1/100, as_relative=TRUE, sort=TRUE) expect_true(all(pr1 == pr2)) - + # Retrieved taxa are the same for counts and relative abundances pr1 <- assay(subsetByPrevalent(GlobalPatterns, prevalence=0.1/100, as_relative=TRUE), "counts") pr2 <- assay(subsetByPrevalent(GlobalPatterns, prevalence=0.1/100, as_relative=FALSE), "counts") expect_true(all(pr1 == pr2)) - + # Prevalence and detection threshold at 0 has the same impact on counts and relative abundances pr1 <- rownames(subsetByPrevalent(GlobalPatterns, detection=0, prevalence=0, as_relative=TRUE)) pr2 <- rownames(subsetByPrevalent(GlobalPatterns, detection=0, prevalence=0, as_relative=FALSE)) expect_true(all(pr1 == pr2)) - + # Check that works also when rownames is NULL gp_null <- GlobalPatterns rownames(gp_null) <- NULL - + pr1 <- subsetByPrevalent(GlobalPatterns, detection=12, prevalence = 0.33) pr1 <- unname(assay(pr1, "counts")) - pr2 <- subsetByPrevalent(gp_null, detection=12, prevalence = 0.33) + pr2 <- subsetByPrevalent(gp_null, detection=12, prevalence = 0.33) pr2 <- unname(assay(pr2, "counts")) expect_equal(pr1, pr2) - + pr1 <- subsetByPrevalent(GlobalPatterns, detection=5, prevalence = 0.33, rank = "Phylum") pr1 <- unname(assay(pr1, "counts")) - pr2 <- subsetByPrevalent(gp_null, detection=5, prevalence = 0.33, rank = "Phylum") + pr2 <- subsetByPrevalent(gp_null, detection=5, prevalence = 0.33, rank = "Phylum") pr2 <- unname(assay(pr2, "counts")) expect_equal(pr1, pr2) - + # Test alias - alias <- subsetByPrevalent(gp_null, detection=5, prevalence = 0.33, rank = "Phylum") + alias <- subsetByPrevalent(gp_null, detection=5, prevalence = 0.33, rank = "Phylum") alias <- unname(assay(alias, "counts")) expect_equal(alias, pr2) - + }) test_that("subsetByRare", { @@ -314,71 +317,71 @@ test_that("subsetByRare", { "'prevalence' must be a single numeric value or coercible to one") # Expect TSE object expect_equal(class(subsetByRare(GlobalPatterns)), class(GlobalPatterns)) - + # Results compatible with getRare pr1 <- rownames(subsetByRare( GlobalPatterns, rank = "Phylum", detection=0.1/100, as_relative=TRUE, sort=TRUE)) - pr2 <- getRare(GlobalPatterns, rank = "Phylum", detection=0.1/100, + pr2 <- getRare(GlobalPatterns, rank = "Phylum", detection=0.1/100, as_relative=TRUE, sort=TRUE) expect_true(all(pr1 == pr2)) - + # Retrieved taxa are the same for counts and relative abundances pr1 <- assay(subsetByRare(GlobalPatterns, prevalence=0.1/100, as_relative=TRUE), "counts") pr2 <- assay(subsetByRare(GlobalPatterns, prevalence=0.1/100, as_relative=FALSE), "counts") expect_true(all(pr1 == pr2)) - + # Prevalence and detection threshold at 0 has the same impact on counts and relative abundances pr1 <- rownames(subsetByRare(GlobalPatterns, detection=0, prevalence=0, as_relative=TRUE)) pr2 <- rownames(subsetByRare(GlobalPatterns, detection=0, prevalence=0, as_relative=FALSE)) expect_true(all(pr1 == pr2)) - + # subsetByRare + subsetByPrevalent should include all the taxa in OTU level d <- runif(1, 0.0001, 0.1) p <- runif(1, 0.0001, 0.5) - rare <- rownames(subsetByRare(GlobalPatterns, detection=d, prevalence=p, + rare <- rownames(subsetByRare(GlobalPatterns, detection=d, prevalence=p, as_relative=TRUE)) - - prevalent <- rownames(subsetByPrevalent(GlobalPatterns, detection=d, prevalence=p, + + prevalent <- rownames(subsetByPrevalent(GlobalPatterns, detection=d, prevalence=p, as_relative=TRUE)) - + all_taxa <- c(rare, prevalent) - + expect_true( all(all_taxa %in% rownames(GlobalPatterns)) ) - + # Check that works also when rownames is NULL gp_null <- GlobalPatterns rownames(gp_null) <- NULL - + pr1 <- subsetByRare(GlobalPatterns, detection=12, prevalence = 0.33) pr1 <- unname(assay(pr1, "counts")) - pr2 <- subsetByRare(gp_null, detection=12, prevalence = 0.33) + pr2 <- subsetByRare(gp_null, detection=12, prevalence = 0.33) pr2 <- unname(assay(pr2, "counts")) expect_equal(pr1, pr2) - + pr1 <- subsetByRare(GlobalPatterns, detection=5, prevalence = 0.33, rank = "Phylum") pr1 <- unname(assay(pr1, "counts")) - pr2 <- subsetByRare(gp_null, detection=5, prevalence = 0.33, rank = "Phylum") + pr2 <- subsetByRare(gp_null, detection=5, prevalence = 0.33, rank = "Phylum") pr2 <- unname(assay(pr2, "counts")) expect_equal(pr1, pr2) - + # Test alias - alias <- subsetByRare(gp_null, detection=5, prevalence = 0.33, rank = "Phylum") + alias <- subsetByRare(gp_null, detection=5, prevalence = 0.33, rank = "Phylum") alias <- unname(assay(alias, "counts")) expect_equal(alias, pr2) - + }) -test_that("mergeFeaturesByPrevalence", { +test_that("agglomerateByPrevalence", { data(GlobalPatterns, package="mia") - expect_error(mergeFeaturesByPrevalence(GlobalPatterns, other_label=TRUE), + expect_error(agglomerateByPrevalence(GlobalPatterns, other_label=TRUE), "'other_label' must be a single character value") - actual <- mergeFeaturesByPrevalence(GlobalPatterns, rank = "Kingdom") + actual <- agglomerateByPrevalence(GlobalPatterns, rank = "Kingdom") expect_s4_class(actual,class(GlobalPatterns)) expect_equal(dim(actual),c(2,26)) - actual <- mergeFeaturesByPrevalence(GlobalPatterns, + actual <- agglomerateByPrevalence(GlobalPatterns, rank = "Phylum", detection = 1/100, prevalence = 50/100, @@ -388,19 +391,19 @@ test_that("mergeFeaturesByPrevalence", { expect_equal(dim(actual),c(6,26)) expect_equal(rowData(actual)[6,"Phylum"],"test") - actual <- mergeFeaturesByPrevalence(GlobalPatterns, + actual <- agglomerateByPrevalence(GlobalPatterns, rank = NULL, detection = 0.0001, prevalence = 50/100, as_relative = TRUE, other_label = "test") - expect_equal(mergeFeaturesByPrevalence(GlobalPatterns, + expect_equal(agglomerateByPrevalence(GlobalPatterns, rank = NULL, detection = 0.0001, prevalence = 50/100, as_relative = TRUE, other_label = "test"), - mergeFeaturesByPrevalence(GlobalPatterns, + agglomerateByPrevalence(GlobalPatterns, rank = NULL, detection = 0.0001, prevalence = 50/100,