diff --git a/DESCRIPTION b/DESCRIPTION index 4d2b9bf68..d1bdd0acf 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: mia Type: Package -Version: 1.13.17 +Version: 1.13.18 Authors@R: c(person(given = "Felix G.M.", family = "Ernst", role = c("aut"), email = "felix.gm.ernst@outlook.com", diff --git a/NEWS b/NEWS index 1887d0ba5..398cdd13c 100644 --- a/NEWS +++ b/NEWS @@ -133,3 +133,4 @@ Changes in version 1.13.x + Rename splitByRanks to agglomerateByRanks and add option as.list + Explain that rarefyAssay returns a new SummarizedExperiment object that includes the newly added subsampled assay. ++ Fix bug in mergeFeaturesByPrevalence diff --git a/R/agglomerate.R b/R/agglomerate.R index 99873fe34..d3d9d5a4a 100644 --- a/R/agglomerate.R +++ b/R/agglomerate.R @@ -1,21 +1,21 @@ #' Agglomerate or merge data using taxonomic information -#' -#' Agglomeration functions can be used to sum-up data based on specific criteria +#' +#' Agglomeration functions can be used to sum-up data based on specific criteria #' such as taxonomic ranks, variables or prevalence. #' #' \code{agglomerateByRank} can be used to sum up data based on associations #' with certain taxonomic ranks, as defined in \code{rowData}. Only available -#' \code{\link{taxonomyRanks}} can be used. -#' -#' \code{agglomerateByVariable} merges data on rows or columns of a -#' \code{SummarizedExperiment} as defined by a \code{factor} alongside the -#' chosen dimension. This function allows agglomeration of data based on other +#' \code{\link{taxonomyRanks}} can be used. +#' +#' \code{agglomerateByVariable} merges data on rows or columns of a +#' \code{SummarizedExperiment} as defined by a \code{factor} alongside the +#' chosen dimension. This function allows agglomeration of data based on other #' variables than taxonomy ranks. -#' Metadata from the \code{rowData} or \code{colData} are +#' Metadata from the \code{rowData} or \code{colData} are #' retained as defined by \code{archetype}. -#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{assay}} are -#' agglomerated, i.e. summed up. If the assay contains values other than counts -#' or absolute values, this can lead to meaningless values being produced. +#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{assay}} are +#' agglomerated, i.e. summed up. If the assay contains values other than counts +#' or absolute values, this can lead to meaningless values being produced. #' #' @param x a \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} or #' a \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} @@ -39,14 +39,14 @@ #' @param agglomerate.tree \code{TRUE} or \code{FALSE}: should #' \code{rowTree()} also be agglomerated? (Default: #' \code{agglomerate.tree = FALSE}) -#' +#' #' @param agglomerateTree alias for \code{agglomerate.tree}. #' #' @param ... arguments passed to \code{agglomerateByRank} function for #' \code{SummarizedExperiment} objects, #' to \code{\link[=agglomerate-methods]{agglomerateByVariable}} and #' \code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}, -#' to \code{getPrevalence} and \code{getPrevalentTaxa} and used in +#' to \code{getPrevalence} and \code{getPrevalentTaxa} and used in #' \code{agglomeratebyPrevalence} #' \itemize{ #' \item \code{remove_empty_ranks}: A single boolean value for selecting @@ -74,7 +74,7 @@ #' \code{strip_altexp = TRUE}) #' #' @param MARGIN A character value for selecting if data is merged -#' row-wise / for features ('rows') or column-wise / for samples ('cols'). +#' row-wise / for features ('rows') or column-wise / for samples ('cols'). #' Must be \code{'rows'} or \code{'cols'}. #' #' @param f A factor for merging. Must be the same length as @@ -87,7 +87,7 @@ #' This can be single integer value or an integer vector of the same length #' as \code{levels(f)}. (Default: \code{archetype = 1L}, which means the first #' element encountered per factor level will be kept) -#' +#' #' @param mergeTree \code{TRUE} or \code{FALSE}: Should #' \code{rowTree()} also be merged? (Default: \code{mergeTree = FALSE}) #' @@ -127,7 +127,7 @@ #' optionally-pruned object of the same class as \code{x}. #' #' @name agglomerate-methods -#' +#' #' @seealso #' \code{\link[=splitOn]{splitOn}} #' \code{\link[=unsplitOn]{unsplitOn}} @@ -138,9 +138,9 @@ #' \code{\link[SingleCellExperiment:splitAltExps]{splitAltExps}} #' #' @examples -#' +#' #' ### Agglomerate data based on taxonomic information -#' +#' #' data(GlobalPatterns) #' # print the available taxonomic ranks #' colnames(rowData(GlobalPatterns)) @@ -151,16 +151,16 @@ #' ## How many taxa before/after agglomeration? #' nrow(GlobalPatterns) #' nrow(x1) -#' +#' #' # agglomerate the tree as well #' x2 <- agglomerateByRank(GlobalPatterns, rank="Family", #' agglomerate.tree = TRUE) #' nrow(x2) # same number of rows, but #' rowTree(x1) # ... different #' rowTree(x2) # ... tree -#' -#' # If assay contains binary or negative values, summing might lead to -#' # meaningless values, and you will get a warning. In these cases, you might +#' +#' # If assay contains binary or negative values, summing might lead to +#' # meaningless values, and you will get a warning. In these cases, you might #' # want to do agglomeration again at chosen taxonomic level. #' tse <- transformAssay(GlobalPatterns, method = "pa") #' tse <- agglomerateByRank(tse, rank = "Genus") @@ -170,20 +170,20 @@ #' sum(is.na(rowData(GlobalPatterns)$Family)) #' x3 <- agglomerateByRank(GlobalPatterns, rank="Family", na.rm = TRUE) #' nrow(x3) # different from x2 -#' -#' # Because all the rownames are from the same rank, rownames do not include -#' # prefixes, in this case "Family:". +#' +#' # Because all the rownames are from the same rank, rownames do not include +#' # prefixes, in this case "Family:". #' print(rownames(x3[1:3,])) -#' +#' #' # To add them, use getTaxonomyLabels function. #' rownames(x3) <- getTaxonomyLabels(x3, with_rank = TRUE) #' print(rownames(x3[1:3,])) -#' +#' #' # use 'remove_empty_ranks' to remove columns that include only NAs -#' x4 <- agglomerateByRank(GlobalPatterns, rank="Phylum", +#' x4 <- agglomerateByRank(GlobalPatterns, rank="Phylum", #' remove_empty_ranks = TRUE) #' head(rowData(x4)) -#' +#' #' # If the assay contains NAs, you might want to consider replacing them, #' # since summing-up NAs lead to NA #' x5 <- GlobalPatterns @@ -195,28 +195,28 @@ #' assay(x5)[ is.na(assay(x5)) ] <- 0 #' x6 <- agglomerateByRank(x5, "Kingdom") #' head( assay(x6) ) -#' +#' #' ## Look at enterotype dataset... #' data(enterotype) #' ## Print the available taxonomic ranks. Shows only 1 available rank, #' ## not useful for agglomerateByRank #' taxonomyRanks(enterotype) -#' +#' #' ### Merge TreeSummarizedExperiments on rows and columns -#' +#' #' data(esophagus) #' esophagus #' plot(rowTree(esophagus)) #' # get a factor for merging #' f <- factor(regmatches(rownames(esophagus), #' regexpr("^[0-9]*_[0-9]*",rownames(esophagus)))) -#' merged <- agglomerateByVariable(esophagus, MARGIN = "rows", f, +#' merged <- agglomerateByVariable(esophagus, MARGIN = "rows", f, #' mergeTree = TRUE) #' plot(rowTree(merged)) #' # #' data(GlobalPatterns) #' GlobalPatterns -#' merged <- agglomerateByVariable(GlobalPatterns, MARGIN = "cols", +#' merged <- agglomerateByVariable(GlobalPatterns, MARGIN = "cols", #' colData(GlobalPatterns)$SampleType) #' merged NULL @@ -242,7 +242,7 @@ setGeneric("agglomerateByVariable", #' #' @export setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"), - function(x, rank = taxonomyRanks(x)[1], onRankOnly = FALSE, na.rm = FALSE, + function(x, rank = taxonomyRanks(x)[1], onRankOnly = TRUE, na.rm = FALSE, empty.fields = c(NA, "", " ", "\t", "-", "_"), ...){ # input check if(nrow(x) == 0L){ @@ -250,7 +250,7 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"), call. = FALSE) } if(!.is_non_empty_string(rank)){ - stop("'rank' must be an non empty single character value.", + stop("'rank' must be a non-empty single character value", call. = FALSE) } if(!.is_a_bool(onRankOnly)){ @@ -265,7 +265,7 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"), .check_taxonomic_rank(rank, x) .check_for_taxonomic_data_order(x) # - + # Make a vector from the taxonomic data. col <- which( taxonomyRanks(x) %in% rank ) tax_cols <- .get_tax_cols_from_se(x) @@ -301,12 +301,14 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"), } # adjust rownames rownames(x) <- getTaxonomyLabels(x, empty.fields, ..., - with_rank = FALSE, + with_rank = FALSE, resolve_loops = FALSE) # Remove those columns from rowData that include only NAs x <- .remove_NA_cols_from_rowdata(x, ...) x <- .add_values_to_metadata(x, "agglomerated_by_rank", rank) - x + + # Order the data in alphabetical order + x <- x[ order(rownames(x)), ] } ) @@ -324,7 +326,7 @@ setMethod("agglomerateByVariable", signature = c(x = "SummarizedExperiment"), #' @rdname agglomerate-methods #' @aliases agglomerateByVariable #' @export -setMethod("agglomerateByVariable", +setMethod("agglomerateByVariable", signature = c(x = "TreeSummarizedExperiment"), function(x, MARGIN, f, archetype = 1L, mergeTree = FALSE, mergeRefSeq = FALSE, ...){ @@ -368,13 +370,13 @@ setMethod( x, ..., agglomerate.tree = agglomerateTree, agglomerateTree = FALSE){ # input check if(!.is_a_bool(agglomerate.tree)){ - stop("'agglomerate.tree' must be TRUE or FALSE.", + stop("'agglomerate.tree' must be TRUE or FALSE.", call. = FALSE) } # If there are multipe rowTrees, it might be that multiple - # trees are preserved after agglomeration even though the - # dataset could be presented with one tree. - # --> order the data so that the taxa are searched from one tree + # trees are preserved after agglomeration even though the + # dataset could be presented with one tree. + # --> order the data so that the taxa are searched from one tree # first. if( length(rowTreeNames(x)) > 1 ){ x <- .order_based_on_trees(x) @@ -403,7 +405,7 @@ setMethod( .remove_NA_cols_from_rowdata <- function(x, remove_empty_ranks = FALSE, ...){ # Check remove_empty_ranks if( !.is_a_bool(remove_empty_ranks) ){ - stop("'remove_empty_ranks' must be a boolean value.", + stop("'remove_empty_ranks' must be a boolean value.", call. = FALSE) } # If user wants to remove those columns @@ -518,4 +520,4 @@ setMethod( tree <- collapse.singles(tree) } return(tree) -} +} \ No newline at end of file diff --git a/R/estimateDominance.R b/R/estimateDominance.R index 269bd2064..7160a7ae0 100644 --- a/R/estimateDominance.R +++ b/R/estimateDominance.R @@ -251,8 +251,8 @@ setMethod("estimateDominance", signature = c(x = "SummarizedExperiment"), # Check indices index <- match.arg(index, several.ok = TRUE) if(!.is_non_empty_character(name) || length(name) != length(index)){ - stop("'name' must be a non-empty character value and have the ", - "same length than 'index'.", + stop("'name' must be a non-empty character value and have the + same length as 'index'", call. = FALSE) } diff --git a/R/getPrevalence.R b/R/getPrevalence.R index 9177e1721..6e0484edb 100644 --- a/R/getPrevalence.R +++ b/R/getPrevalence.R @@ -23,18 +23,18 @@ #' @param assay.type A single character value for selecting the #' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{assay}} #' to use for prevalence calculation. -#' +#' #' @param assay_name a single \code{character} value for specifying which #' assay to use for calculation. #' (Please use \code{assay.type} instead. At some point \code{assay_name} #' will be disabled.) -#' +#' #' @param rank a single character defining a taxonomic rank. Must be a value of #' \code{taxonomyRanks()} function. -#' +#' #' @param na.rm logical scalar: Should NA values be omitted when calculating #' prevalence? (default: \code{na.rm = TRUE}) -#' +#' #' @param ... additional arguments #' \itemize{ #' \item If \code{!is.null(rank)} arguments are passed on to @@ -86,7 +86,7 @@ #' #' \item \code{getPrevalent} and \code{getRare} return a #' \code{character} vector with only the names exceeding the threshold set -#' by \code{prevalence}, if the \code{rownames} of \code{x} is set. +#' by \code{prevalence}, if the \code{rownames} of \code{x} is set. #' Otherwise an \code{integer} vector is returned matching the rows in #' \code{x}. #' } @@ -146,7 +146,7 @@ #' prevalence = 50/100, #' as_relative = FALSE) #' head(prevalent) -#' +#' #' # Gets a subset of object that includes prevalent taxa #' altExp(tse, "prevalent") <- subsetByPrevalent(tse, #' rank = "Family", @@ -162,7 +162,7 @@ #' prevalence = 50/100, #' as_relative = TRUE) #' head(rare) -#' +#' #' # Gets a subset of object that includes rare taxa #' altExp(tse, "rare") <- subsetByRare(tse, #' rank = "Class", @@ -174,7 +174,7 @@ #' # Names of both experiments, prevalent and rare, can be found from slot #' # altExpNames #' tse -#' +#' #' data(esophagus) #' getPrevalentAbundance(esophagus, assay.type = "counts") #' @@ -266,7 +266,7 @@ setMethod("getPrevalence", signature = c(x = "ANY"), function( #' @rdname getPrevalence #' @export setMethod("getPrevalence", signature = c(x = "SummarizedExperiment"), - function(x, assay.type = assay_name, assay_name = "counts", + function(x, assay.type = assay_name, assay_name = "counts", as_relative = FALSE, rank = NULL, ...){ # input check if(!.is_a_bool(as_relative)){ @@ -318,16 +318,16 @@ setGeneric("getPrevalent", signature = "x", "one.", call. = FALSE) } - + prevalence <- as.numeric(prevalence) if(!.is_a_bool(include_lowest)){ stop("'include_lowest' must be TRUE or FALSE.", call. = FALSE) } - # rownames must bet set and unique, because if sort = TRUE, the order is + # rownames must bet set and unique, because if sort = TRUE, the order is # not preserved x <- .norm_rownames(x) pr <- getPrevalence(x, rank = NULL, ...) - + # get logical vector which row does exceed threshold if (include_lowest) { f <- pr >= prevalence @@ -343,7 +343,7 @@ setGeneric("getPrevalent", signature = "x", m <- match(names(f),names(indices)) m <- m[!is.na(m)] indices <- indices[m] - # + # indices } @@ -403,7 +403,7 @@ setGeneric("getRare", signature = "x", indices_new <- indices_x[f] indices_new } - + .get_rare_taxa <- function(x, rank = NULL, ...){ if(is(x,"SummarizedExperiment")){ x <- .agg_for_prevalence(x, rank = rank, ...) @@ -514,22 +514,22 @@ setMethod("getPrevalentAbundance", signature = c(x = "SummarizedExperiment"), ############################# agglomerateByPrevalence ########################## #' @rdname agglomerate-methods -#' +#' #' @param other_label A single \code{character} valued used as the label for the #' summary of non-prevalent taxa. (default: \code{other_label = "Other"}) -#' +#' #' @details -#' \code{agglomerateByPrevalence} sums up the values of assays at the taxonomic -#' level specified by \code{rank} (by default the highest taxonomic level -#' available) and selects the summed results that exceed the given population -#' prevalence at the given detection level. The other summed values (below the +#' \code{agglomerateByPrevalence} sums up the values of assays at the taxonomic +#' level specified by \code{rank} (by default the highest taxonomic level +#' available) and selects the summed results that exceed the given population +#' prevalence at the given detection level. The other summed values (below the #' threshold) are agglomerated in an additional row taking the name indicated by #' \code{other_label} (by default "Other"). -#' -#' @return +#' +#' @return #' \code{agglomerateByPrevalence} returns a taxonomically-agglomerated object #' of the same class as x and based on prevalent taxonomic results. -#' +#' #' @examples #' ## Data can be aggregated based on prevalent taxonomic results #' tse <- GlobalPatterns @@ -538,17 +538,17 @@ setMethod("getPrevalentAbundance", signature = c(x = "SummarizedExperiment"), #' detection = 1/100, #' prevalence = 50/100, #' as_relative = TRUE) -#' +#' #' tse -#' +#' #' # Here data is aggregated at the taxonomic level "Phylum". The five phyla -#' # that exceed the population prevalence threshold of 50/100 represent the +#' # that exceed the population prevalence threshold of 50/100 represent the #' # five first rows of the assay in the aggregated data. The sixth and last row -#' # named by default "Other" takes the summed up values of all the other phyla +#' # named by default "Other" takes the summed up values of all the other phyla #' # that are below the prevalence threshold. -#' +#' #' assay(tse)[,1:5] -#' +#' #' @export setGeneric("agglomerateByPrevalence", signature = "x", function(x, ...) @@ -557,7 +557,7 @@ setGeneric("agglomerateByPrevalence", signature = "x", #' @rdname agglomerate-methods #' @export setMethod("agglomerateByPrevalence", signature = c(x = "SummarizedExperiment"), - function(x, rank = taxonomyRanks(x)[1L], other_label = "Other", ...){ + function(x, rank = NULL, other_label = "Other", ...){ # input check if(!.is_a_string(other_label)){ stop("'other_label' must be a single character value.", diff --git a/R/merge.R b/R/merge.R index c36e5aa2b..3df5d6df8 100644 --- a/R/merge.R +++ b/R/merge.R @@ -1,5 +1,8 @@ -.norm_f <- function(i, f, dim.type = c("rows","columns")){ +.norm_f <- function(i, f, dim.type = c("rows","columns"), na.rm = FALSE, ...){ dim.type <- match.arg(dim.type) + if(!.is_a_bool(na.rm)){ + stop("'na.rm' must be TRUE or FALSE.", call. = FALSE) + } if(!is.character(f) && !is.factor(f)){ stop("'f' must be a factor or character vector coercible to a ", "meaningful factor.", @@ -9,6 +12,11 @@ stop("'f' must have the same number of ",dim.type," as 'x'", call. = FALSE) } + # This is done otherwise we lose NA values + if (!na.rm && any(is.na(f))) { + f <- as.character(f) + f[ is.na(f) ] <- "NA" + } if(is.character(f)){ f <- factor(f) } @@ -52,7 +60,7 @@ #' @importFrom S4Vectors SimpleList #' @importFrom scuttle sumCountsAcrossFeatures -.merge_rows <- function(x, f, archetype = 1L, +.merge_rows <- function(x, f, archetype = 1L, average = FALSE, BPPARAM = SerialParam(), check.assays = TRUE, @@ -64,20 +72,14 @@ if( !.is_a_bool(check.assays) ){ stop("'check.assays' must be TRUE or FALSE.", call. = FALSE) } - # - if(is.character(f) && length(f)==1 && f %in% colnames(rowData(x))){ - f <- factor(as.character(rowData(x)[, f])) - } - else if(is.character(f) && length(f)==1 && f %in% colnames(colData(x))){ - f <- factor(as.character(colData(x)[, f])) - } else - { - f <- .norm_f(nrow(x), f) + if( .is_a_string(f) && f %in% colnames(rowData(x)) ){ + f <- rowData(x)[[ f ]] } + f <- .norm_f(nrow(x), f) if(length(levels(f)) == nrow(x)){ return(x) } - + archetype <- .norm_archetype(f, archetype) # merge assays assays <- assays(x) @@ -86,8 +88,8 @@ } assays <- S4Vectors::SimpleList(lapply(assays, scuttle::sumCountsAcrossFeatures, - ids = f, - subset.row = NULL, + ids = f, + subset.row = NULL, subset.col = NULL, average = average, BPPARAM = BPPARAM)) @@ -95,7 +97,7 @@ # merge to result x <- x[.get_element_pos(f, archetype = archetype),] assays(x, withDimnames = FALSE) <- assays - # Change rownames to group names + # Change rownames to group names rownames(x) <- rownames(assays[[1]]) x } @@ -105,7 +107,7 @@ # Check if assays include binary or negative values if( all(assay == 0 | assay == 1) ){ warning("'",assay.type,"'", " includes binary values.", - "\nAgglomeration of it might lead to meaningless values.", + "\nAgglomeration of it might lead to meaningless values.", "\nCheck the assay, and consider doing transformation again", "manually with agglomerated data.", call. = FALSE) @@ -123,15 +125,11 @@ #' @importFrom scuttle summarizeAssayByGroup .merge_cols <- function(x, f, archetype = 1L, ...){ # input check - if(is.character(f) && length(f)==1 && f %in% colnames(rowData(x))){ - f <- factor(as.character(rowData(x)[, f])) - } - else if(is.character(f) && length(f)==1 && f %in% colnames(colData(x))){ - f <- factor(as.character(colData(x)[, f])) - } else - { - f <- .norm_f(ncol(x), f, "columns") + if( .is_a_string(f) && f %in% colnames(colData(x)) ){ + f <- colData(x)[[ f ]] } + f <- .norm_f(ncol(x), f, "columns") + if(length(levels(f)) == ncol(x)){ return(x) } @@ -152,15 +150,15 @@ } assays <- S4Vectors::SimpleList(lapply(assays, FUN = FUN, - ids = f, - subset.row = NULL, + ids = f, + subset.row = NULL, subset.col = NULL, ...)) names(assays) <- names(assays(x)) # merge to result x <- x[,.get_element_pos(f, archetype = archetype)] assays(x, withDimnames = FALSE) <- assays - # Change colnames to group names + # Change colnames to group names colnames(x) <- colnames(assays[[1]]) x } @@ -198,7 +196,7 @@ seq } -.merge_rows_TSE <- function(x, f, archetype = 1L, mergeTree = FALSE, +.merge_rows_TSE <- function(x, f, archetype = 1L, mergeTree = FALSE, mergeRefSeq = FALSE, ...){ # input check if(!.is_a_bool(mergeTree)){ diff --git a/man/agglomerate-methods.Rd b/man/agglomerate-methods.Rd index 85108324e..acf30360d 100644 --- a/man/agglomerate-methods.Rd +++ b/man/agglomerate-methods.Rd @@ -29,7 +29,7 @@ agglomerateByVariable(x, ...) \S4method{agglomerateByRank}{SummarizedExperiment}( x, rank = taxonomyRanks(x)[1], - onRankOnly = FALSE, + onRankOnly = TRUE, na.rm = FALSE, empty.fields = c(NA, "", " ", "\\t", "-", "_"), ... @@ -58,12 +58,7 @@ agglomerateByVariable(x, ...) agglomerateByPrevalence(x, ...) -\S4method{agglomerateByPrevalence}{SummarizedExperiment}( - x, - rank = taxonomyRanks(x)[1L], - other_label = "Other", - ... -) +\S4method{agglomerateByPrevalence}{SummarizedExperiment}(x, rank = NULL, other_label = "Other", ...) agglomerateByRanks(x, ...) @@ -292,8 +287,8 @@ nrow(x2) # same number of rows, but rowTree(x1) # ... different rowTree(x2) # ... tree -# If assay contains binary or negative values, summing might lead to -# meaningless values, and you will get a warning. In these cases, you might +# If assay contains binary or negative values, summing might lead to +# meaningless values, and you will get a warning. In these cases, you might # want to do agglomeration again at chosen taxonomic level. tse <- transformAssay(GlobalPatterns, method = "pa") tse <- agglomerateByRank(tse, rank = "Genus") @@ -304,8 +299,8 @@ sum(is.na(rowData(GlobalPatterns)$Family)) x3 <- agglomerateByRank(GlobalPatterns, rank="Family", na.rm = TRUE) nrow(x3) # different from x2 -# Because all the rownames are from the same rank, rownames do not include -# prefixes, in this case "Family:". +# Because all the rownames are from the same rank, rownames do not include +# prefixes, in this case "Family:". print(rownames(x3[1:3,])) # To add them, use getTaxonomyLabels function. @@ -313,7 +308,7 @@ rownames(x3) <- getTaxonomyLabels(x3, with_rank = TRUE) print(rownames(x3[1:3,])) # use 'remove_empty_ranks' to remove columns that include only NAs -x4 <- agglomerateByRank(GlobalPatterns, rank="Phylum", +x4 <- agglomerateByRank(GlobalPatterns, rank="Phylum", remove_empty_ranks = TRUE) head(rowData(x4)) @@ -343,13 +338,13 @@ plot(rowTree(esophagus)) # get a factor for merging f <- factor(regmatches(rownames(esophagus), regexpr("^[0-9]*_[0-9]*",rownames(esophagus)))) -merged <- agglomerateByVariable(esophagus, MARGIN = "rows", f, +merged <- agglomerateByVariable(esophagus, MARGIN = "rows", f, mergeTree = TRUE) plot(rowTree(merged)) # data(GlobalPatterns) GlobalPatterns -merged <- agglomerateByVariable(GlobalPatterns, MARGIN = "cols", +merged <- agglomerateByVariable(GlobalPatterns, MARGIN = "cols", colData(GlobalPatterns)$SampleType) merged ## Data can be aggregated based on prevalent taxonomic results @@ -363,9 +358,9 @@ tse <- agglomerateByPrevalence(tse, tse # Here data is aggregated at the taxonomic level "Phylum". The five phyla -# that exceed the population prevalence threshold of 50/100 represent the +# that exceed the population prevalence threshold of 50/100 represent the # five first rows of the assay in the aggregated data. The sixth and last row -# named by default "Other" takes the summed up values of all the other phyla +# named by default "Other" takes the summed up values of all the other phyla # that are below the prevalence threshold. assay(tse)[,1:5] diff --git a/man/getPrevalence.Rd b/man/getPrevalence.Rd index 2d43b0cd5..6898c23e4 100644 --- a/man/getPrevalence.Rd +++ b/man/getPrevalence.Rd @@ -239,7 +239,7 @@ altExp(tse, "rare") # Names of both experiments, prevalent and rare, can be found from slot # altExpNames tse - + data(esophagus) getPrevalentAbundance(esophagus, assay.type = "counts") diff --git a/tests/testthat/test-2merge.R b/tests/testthat/test-2merge.R index 0c7de4c16..4c7fbd1c2 100644 --- a/tests/testthat/test-2merge.R +++ b/tests/testthat/test-2merge.R @@ -31,20 +31,20 @@ test_that("merge", { expect_equal(actual, c(1,2)) actual <- mia:::.norm_archetype(f, c(1)) expect_equal(actual, c(1,1)) - + # .get_element_pos expect_error(mia:::.get_element_pos(), 'argument "archetype" is missing') expect_error(mia:::.get_element_pos(f), 'argument "archetype" is missing') - + actual <- mia:::.get_element_pos(f, archetype = mia:::.norm_archetype(f, 1)) expect_equal(actual,c(a = 1, b = 4)) actual <- mia:::.get_element_pos(f, archetype = mia:::.norm_archetype(f, 2)) expect_equal(actual,c(a = 2, b = 5)) actual <- mia:::.get_element_pos(f, archetype = c(2,1)) expect_equal(actual,c(a = 2, b = 4)) - + # .merge_rows mat <- matrix(1:60, nrow = 6) gr <- GRanges("chr1",rep("1-6",6)) @@ -87,11 +87,11 @@ test_that("merge", { data(esophagus, package="mia") data(GlobalPatterns, package="mia") # Add arbitrary groups - rowData(esophagus)$group <- c(rep(c("A", "B", "C"), each = nrow(esophagus)/3), + rowData(esophagus)$group <- c(rep(c("A", "B", "C"), each = nrow(esophagus)/3), rep("A", nrow(esophagus)-round(nrow(esophagus)/3)*3) ) - rowData(esophagus)$group2 <- c(rep(c("A", "B", "C"), each = nrow(esophagus)/3), + rowData(esophagus)$group2 <- c(rep(c("A", "B", "C"), each = nrow(esophagus)/3), rep("A", nrow(esophagus)-round(nrow(esophagus)/3)*3) ) - rowData(GlobalPatterns)$group <- c(rep(c("C", "D", "E"), each = nrow(GlobalPatterns)/3), + rowData(GlobalPatterns)$group <- c(rep(c("C", "D", "E"), each = nrow(GlobalPatterns)/3), rep("C", nrow(GlobalPatterns)-round(nrow(GlobalPatterns)/3)*3) ) # Merge tse <- mergeSEs(esophagus, GlobalPatterns, assay.type="counts") @@ -99,34 +99,34 @@ test_that("merge", { # (trees are pruned differently --> first instance represent specific branch) tse <- tse[c(rownames(esophagus), rownames(GlobalPatterns)), ] # Only esophagus has these groups --> the merge should contain only esophagus - merged <- agglomerateByVariable(tse, MARGIN = "rows", + merged <- agglomerateByVariable(tse, MARGIN = "rows", f = rowData(tse)$group2, mergeTree=TRUE) - merged2 <- agglomerateByVariable(tse, MARGIN = "rows", + merged2 <- agglomerateByVariable(tse, MARGIN = "rows", f = rowData(tse)$group2, mergeTree = FALSE) merged3 <- agglomerateByVariable(esophagus, MARGIN = "rows", - f = rowData(esophagus)$group2, + f = rowData(esophagus)$group2, mergeTree = TRUE) - merged4 <- .merge_features(tse, merge.by = rowData(tse)$group2, + merged4 <- .merge_features(tse, merge.by = rowData(tse)$group2, mergeTree = TRUE) merged5 <- agglomerateByVariable(tse, MARGIN = "rows", f = rowData(tse)$group2, mergeTree = TRUE) - expect_equal( rowLinks(merged)$whichTree, + expect_equal( rowLinks(merged)$whichTree, rowLinks(merged2)$whichTree ) expect_false( all(rowLinks(merged) == rowLinks(merged2)) ) expect_equal(rowTree(tse), rowTree(merged2)) expect_equal(rowTree(merged), rowTree(merged3)) expect_equal(merged4, merged5) - expect_equal(agglomerateByVariable(tse, MARGIN = "rows", - f=rowData(tse)$group2), + expect_equal(agglomerateByVariable(tse, MARGIN = "rows", + f=rowData(tse)$group2), agglomerateByVariable(tse, MARGIN = "rows", f=rowData(tse)$group2)) - + # Both datasets have group variable - merged <- agglomerateByVariable(tse, MARGIN = "rows", + merged <- agglomerateByVariable(tse, MARGIN = "rows", f = rowData(tse)$group, mergeTree = TRUE) - merged2 <- agglomerateByVariable(tse, MARGIN = "rows", + merged2 <- agglomerateByVariable(tse, MARGIN = "rows", f = rowData(tse)$group, mergeTree = FALSE) - expect_equal( rowLinks(merged)$whichTree, + expect_equal( rowLinks(merged)$whichTree, rowLinks(merged2)$whichTree ) expect_false( all(rowLinks(merged) == rowLinks(merged2)) ) expect_true( rowTree(merged, "phylo")$Nnode < rowTree(merged2, "phylo")$Nnode ) diff --git a/tests/testthat/test-3agglomerate.R b/tests/testthat/test-3agglomerate.R index 4d1a9d06e..0b099279e 100644 --- a/tests/testthat/test-3agglomerate.R +++ b/tests/testthat/test-3agglomerate.R @@ -30,7 +30,7 @@ test_that("agglomerate", { expect_equal(assays(actual)$mat[3,1],c(c = 24)) # expect_error(mergeFeaturesByRank(xtse,"",na.rm=FALSE), - "'rank' must be an non empty single character value") + "'rank' must be a non-empty single character value") expect_error(mergeFeaturesByRank(xtse,"Family",na.rm=""), "'na.rm' must be TRUE or FALSE") expect_error( @@ -46,12 +46,12 @@ test_that("agglomerate", { actual <- mergeFeaturesByRank(xtse,"Phylum",na.rm=FALSE) expect_equivalent(rowData(actual),rowData(actual_phylum)) # - actual <- mergeFeaturesByRank(xtse,"Family", na.rm = TRUE) + actual <- mergeFeaturesByRank(xtse,"Family", onRankOnly = FALSE, na.rm = TRUE) expect_equal(dim(actual),c(6,10)) expect_equal(rowData(actual)$Family,c("c","d","e","f","g","h")) - actual <- mergeFeaturesByRank(xtse,"Family", na.rm = FALSE) # the default + actual <- mergeFeaturesByRank(xtse,"Family", onRankOnly = FALSE, na.rm = FALSE) # the default expect_equal(dim(actual),c(8,10)) - expect_equal(rowData(actual)$Family,c("c",NA,"d","e","f","g","h",NA)) + expect_equal(rowData(actual)$Family,c("c","d","e","f","g","h",NA,NA)) actual <- mergeFeaturesByRank(xtse,"Phylum") expect_equivalent(rowData(actual),rowData(actual_phylum)) # @@ -64,19 +64,19 @@ test_that("agglomerate", { # the same dimensionality is retained data(enterotype, package="mia") expect_equal(length(unique(rowData(enterotype)[,"Genus"])), - nrow(mergeFeaturesByRank(enterotype,"Genus"))) + nrow(mergeFeaturesByRank(enterotype,"Genus", onRankOnly = FALSE))) # agglomeration in all its forms data(GlobalPatterns, package="mia") se <- GlobalPatterns - actual <- mergeFeaturesByRank(se, rank = "Family") + actual <- mergeFeaturesByRank(se, rank = "Family", onRankOnly = FALSE) expect_equal(dim(actual),c(603,26)) expect_equal(length(rowTree(actual)$tip.label), length(rowTree(se)$tip.label)) - actual <- mergeFeaturesByRank(se, rank = "Family", agglomerate.tree = TRUE) + actual <- mergeFeaturesByRank(se, rank = "Family", onRankOnly = FALSE, agglomerate.tree = TRUE) expect_equal(dim(actual),c(603,26)) expect_equal(length(rowTree(actual)$tip.label), 603) - actual <- mergeFeaturesByRank(se, rank = "Family", agglomerate.tree = TRUE) + actual <- mergeFeaturesByRank(se, rank = "Family", onRankOnly = FALSE, agglomerate.tree = TRUE) expect_equal(dim(actual),c(603,26)) expect_equal(length(rowTree(actual)$tip.label), nrow(actual)) # Test that warning occurs when assay contian binary or negative values @@ -120,6 +120,44 @@ test_that("agglomerate", { not_uniq <- mergeFeaturesByRank(se, rank = "Species", make_unique = FALSE) expect_true( !any( duplicated(rownames(uniq)) ) ) expect_true( any( duplicated(rownames(not_uniq)) ) ) + + data(GlobalPatterns, package="mia") + tse <- GlobalPatterns + tse <- transformAssay(tse, assay.type="counts", method="relabundance") + altExp(tse, "Family") <- agglomerateByRank(tse, rank="Family") + altExp(tse, "Family1") <- agglomerateByRank(tse, rank="Family", onRankOnly = TRUE) + altExp(tse, "Family2") <- agglomerateByRank(tse, rank="Family", onRankOnly = FALSE) + altExp(tse, "Family3") <- agglomerateByRank(tse, rank="Family", onRankOnly = TRUE, na.rm = TRUE) + altExp(tse, "Family4") <- agglomerateByRank(tse, rank="Family", onRankOnly = TRUE, na.rm = FALSE) + altExp(tse, "Family5") <- agglomerateByVariable(tse, f="Family", MARGIN = 'row') + + # Other group is added by agglomerateByPrevalence function to collect features under threshold + actual <- agglomerateByPrevalence(tse, rank="Family", assay.type="relabundance", + detection = 0.5/100, prevalence = 20/100) + actual0 <- agglomerateByPrevalence(altExp(tse, "Family"), assay.type="relabundance", + detection = 0.5/100, prevalence = 20/100) + actual1 <- agglomerateByPrevalence(altExp(tse, "Family1"), assay.type="relabundance", + detection = 0.5/100, prevalence = 20/100) + actual2 <- agglomerateByPrevalence(altExp(tse, "Family2"), assay.type="relabundance", + detection = 0.5/100, prevalence = 20/100) + actual3 <- agglomerateByPrevalence(altExp(tse, "Family3"), assay.type="relabundance", + detection = 0.5/100, prevalence = 20/100) + actual4 <- agglomerateByPrevalence(altExp(tse, "Family4"), assay.type="relabundance", + detection = 0.5/100, prevalence = 20/100) + actual5 <- agglomerateByPrevalence(altExp(tse, "Family5"), assay.type="relabundance", + detection = 0.5/100, prevalence = 20/100) + + # The values of actual( "", 0, 1, 4, 5) are equal since the factor group is created with groups + # at the family level i.e onRankOnly (tax_cols[tax_col_n == col]). + # However, actual2 creates groups based on the full taxonomic hierarchy up to family level + # i.e !onRankOnly (tax_cols[tax_col_n <= col]). While actual3 is less since empty or missing + # fields are removed from the tse object i.e na.rm. + expect_equal(nrow(actual), nrow(actual0)) + expect_equal(nrow(actual), nrow(actual1)) + expect_equal(nrow(actual2), 27) + expect_equal(nrow(actual3), 20) + expect_equal(nrow(actual), nrow(actual4)) + expect_equal(nrow(actual), nrow(actual5)) }) diff --git a/tests/testthat/test-5dominantTaxa.R b/tests/testthat/test-5dominantTaxa.R index a39d236c5..1e1d87acd 100644 --- a/tests/testthat/test-5dominantTaxa.R +++ b/tests/testthat/test-5dominantTaxa.R @@ -33,6 +33,7 @@ test_that("getDominant", { names(exp.vals.two) <- exp.names.one expect_equal(getDominant(tse, rank = "Genus", + onRankOnly = FALSE, na.rm = FALSE)[1:15], exp.vals.two) diff --git a/tests/testthat/test-5prevalence.R b/tests/testthat/test-5prevalence.R index 02ef14673..3b5c3bfd2 100644 --- a/tests/testthat/test-5prevalence.R +++ b/tests/testthat/test-5prevalence.R @@ -104,13 +104,13 @@ test_that("getPrevalence", { remove <- c(15, 200) assay(tse, "counts")[remove, ] <- NA # Check that agglomeration works - tse_agg <- agglomerateByRank(tse, rank = rank) + tse_agg <- agglomerateByRank(tse, onRankOnly = FALSE, rank = rank) expect_warning(ref <- getPrevalence(tse_agg, na.rm = FALSE)) - expect_warning(res <- getPrevalence(tse, na.rm = FALSE, rank = "Genus")) + expect_warning(res <- getPrevalence(tse, onRankOnly = FALSE, na.rm = FALSE, rank = "Genus")) expect_true( all(res == ref, na.rm = TRUE) ) # expect_warning(ref <- getPrevalence(tse_agg, na.rm = TRUE)) - expect_warning(res <- getPrevalence(tse, na.rm = TRUE, rank = "Genus")) + expect_warning(res <- getPrevalence(tse, onRankOnly = FALSE, na.rm = TRUE, rank = "Genus")) expect_true( all(res == ref, na.rm = TRUE) ) }) @@ -374,7 +374,7 @@ test_that("mergeFeaturesByPrevalence", { data(GlobalPatterns, package="mia") expect_error(mergeFeaturesByPrevalence(GlobalPatterns, other_label=TRUE), "'other_label' must be a single character value") - actual <- mergeFeaturesByPrevalence(GlobalPatterns) + actual <- mergeFeaturesByPrevalence(GlobalPatterns, rank = "Kingdom") expect_s4_class(actual,class(GlobalPatterns)) expect_equal(dim(actual),c(2,26))