From ba42574a62539d99f092c93480b31576e6676a44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9otime=20Pralas?= <151254073+thpralas@users.noreply.github.com> Date: Mon, 10 Jun 2024 11:17:57 +0300 Subject: [PATCH] Use Rbiom unifrac implementation instead of runUnifrac (#523) Co-authored-by: Leo Lahti Co-authored-by: TuomasBorman Co-authored-by: Tuomas Borman <60338854+TuomasBorman@users.noreply.github.com> --- DESCRIPTION | 5 +- NAMESPACE | 5 +- NEWS | 3 +- R/calculateUnifrac.R | 294 +++++++-------------------------- man/calculateUnifrac.Rd | 49 +----- tests/testthat/test-5Unifrac.R | 117 ++++++------- 6 files changed, 119 insertions(+), 354 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index e19ec6b91..35a4c6914 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: mia Type: Package -Version: 1.13.20 +Version: 1.13.21 Authors@R: c(person(given = "Felix G.M.", family = "Ernst", role = c("aut"), email = "felix.gm.ernst@outlook.com", @@ -68,7 +68,8 @@ Imports: tidyr, bluster, MatrixGenerics, - mediation + mediation, + rbiom Suggests: testthat, knitr, diff --git a/NAMESPACE b/NAMESPACE index 4f7de7b58..d392d1279 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -323,11 +323,7 @@ importFrom(ape,cophenetic.phylo) importFrom(ape,drop.tip) importFrom(ape,has.singles) importFrom(ape,is.rooted) -importFrom(ape,node.depth) -importFrom(ape,node.depth.edgelength) -importFrom(ape,prop.part) importFrom(ape,read.tree) -importFrom(ape,reorder.phylo) importFrom(ape,root) importFrom(bluster,clusterRows) importFrom(decontam,isContaminant) @@ -344,6 +340,7 @@ importFrom(dplyr,rename) importFrom(dplyr,select) importFrom(dplyr,tally) importFrom(mediation,mediate) +importFrom(rbiom,unifrac) importFrom(rlang,":=") importFrom(rlang,sym) importFrom(scuttle,sumCountsAcrossFeatures) diff --git a/NEWS b/NEWS index adc135ddc..23badbede 100644 --- a/NEWS +++ b/NEWS @@ -136,4 +136,5 @@ Changes in version 1.13.x + Fix bug in mergeFeaturesByPrevalence + new aliases calculateDPCoA to getDPCoA, calculateNMDS to getNMDS, calculateRDA to getRDA, calculateCCA to getCCA -+ add informative error message in rarefyAssay on assays with strictly-negative values \ No newline at end of file ++ add informative error message in rarefyAssay on assays with strictly-negative values ++ Use rbiom package in unifrac implementation diff --git a/R/calculateUnifrac.R b/R/calculateUnifrac.R index 0be2fcf3a..322c12fee 100644 --- a/R/calculateUnifrac.R +++ b/R/calculateUnifrac.R @@ -1,8 +1,8 @@ #' Calculate weighted or unweighted (Fast) Unifrac distance #' -#' This function calculates the (Fast) Unifrac distance for all sample-pairs +#' This function calculates the Unifrac distance for all sample-pairs #' in a \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} -#' object. +#' object. The function utilizes \code{\link[rbiom:unifrac]{rbiom:unifrac()}}. #' #' Please note that if \code{calculateUnifrac} is used as a \code{FUN} for #' \code{runMDS}, the argument \code{ntop} has to be set to \code{nrow(x)}. @@ -48,16 +48,6 @@ #' considers presence/absence. Default is \code{FALSE}, meaning the #' unweighted-Unifrac distance is calculated for all pairs of samples. #' -#' @param normalized \code{TRUE} or \code{FALSE}: Should the output be -#' normalized such that values range from 0 to 1 independent of branch length -#' values? Default is \code{TRUE}. Note that (unweighted) \code{Unifrac} is -#' always normalized by total branch-length, and so this value is ignored when -#' \code{weighted == FALSE}. -#' -#' @param BPPARAM A -#' \code{\link[BiocParallel:BiocParallelParam-class]{BiocParallelParam}} -#' object specifying whether the Unifrac calculation should be parallelized. -#' #' @param transposed Logical scalar, is x transposed with cells in rows, i.e., #' is Unifrac distance calculated based on rows (FALSE) or columns (TRUE). #' (By default: \code{transposed = FALSE}) @@ -69,15 +59,6 @@ #' @references #' \url{http://bmf.colorado.edu/unifrac/} #' -#' The main implementation (Fast Unifrac) is adapted from the algorithm's -#' description in: -#' -#' Hamady, Lozupone, and Knight, -#' ``\href{http://www.nature.com/ismej/journal/v4/n1/full/ismej200997a.html}{Fast -#' UniFrac:} facilitating high-throughput phylogenetic analyses of microbial -#' communities including analysis of pyrosequencing and PhyloChip data.'' The -#' ISME Journal (2010) 4, 17--27. -#' #' See also additional descriptions of Unifrac in the following articles: #' #' Lozupone, Hamady and Knight, ``Unifrac - An Online Tool for Comparing @@ -95,22 +76,17 @@ #' #' @export #' -#' @author -#' Paul J. McMurdie. -#' Adapted for mia by Felix G.M. Ernst -#' #' @examples #' data(esophagus) #' library(scater) #' calculateUnifrac(esophagus, weighted = FALSE) #' calculateUnifrac(esophagus, weighted = TRUE) -#' calculateUnifrac(esophagus, weighted = TRUE, normalized = FALSE) #' # for using calculateUnifrac in conjunction with runMDS the tree argument #' # has to be given separately. In addition, subsetting using ntop must #' # be disabled #' esophagus <- runMDS(esophagus, FUN = calculateUnifrac, name = "Unifrac", #' tree = rowTree(esophagus), -#' exprs_values = "counts", +#' assay.type = "counts", #' ntop = nrow(esophagus)) #' reducedDim(esophagus) NULL @@ -118,23 +94,21 @@ NULL #' @rdname calculateUnifrac #' @export setGeneric("calculateUnifrac", signature = c("x", "tree"), - function(x, tree, ... ) - standardGeneric("calculateUnifrac")) + function(x, tree, ... ) + standardGeneric("calculateUnifrac")) #' @rdname calculateUnifrac #' @export setMethod("calculateUnifrac", signature = c(x = "ANY", tree = "phylo"), - function(x, tree, weighted = FALSE, normalized = TRUE, - BPPARAM = SerialParam(), ...){ + function(x, tree, weighted = FALSE, ...){ if(is(x,"SummarizedExperiment")){ - stop("When providing a 'tree', please provide a matrix-like as 'x'", + stop("When providing a 'tree', please provide a matrix-like as 'x'", " and not a 'SummarizedExperiment' object. Please consider ", "combining both into a 'TreeSummarizedExperiment' object.", call. = FALSE) } .calculate_distance(x, FUN = runUnifrac, tree = tree, - weighted = weighted, normalized = normalized, - BPPARAM = BPPARAM, ...) + weighted = weighted, ...) } ) @@ -144,10 +118,10 @@ setMethod("calculateUnifrac", signature = c(x = "ANY", tree = "phylo"), #' #' @export setMethod("calculateUnifrac", - signature = c(x = "TreeSummarizedExperiment", - tree = "missing"), + signature = c(x = "TreeSummarizedExperiment", + tree = "missing"), function(x, assay.type = assay_name, assay_name = exprs_values, exprs_values = "counts", - tree_name = "phylo", transposed = FALSE, ...){ + tree_name = "phylo", transposed = FALSE, ...){ # Check assay.type and get assay .check_assay_present(assay.type, x) mat <- assay(x, assay.type) @@ -191,28 +165,19 @@ setMethod("calculateUnifrac", links[ links$whichTree != tree_name, ] <- NA # Take only nodeLabs links <- links[ , "nodeLab" ] - calculateUnifrac(mat, tree = tree, nodeLab = links, ...) + res <- calculateUnifrac(mat, tree = tree, nodeLab = links, ...) + return(res) } ) -################################################################################ -# Fast Unifrac for R. -# Adapted from The ISME Journal (2010) 4, 17-27; doi:10.1038/ismej.2009.97 -# -# adopted from original implementation in phyloseq implemented by -# Paul J. McMurdie (https://github.com/joey711/phyloseq) ################################################################################ #' @rdname calculateUnifrac #' -#' @importFrom ape prop.part reorder.phylo node.depth node.depth.edgelength -#' @importFrom utils combn -#' @importFrom stats as.dist -#' @importFrom BiocParallel SerialParam register bplapply bpisup bpstart bpstop -#' @importFrom DelayedArray getAutoBPPARAM setAutoBPPARAM -#' +#' @importFrom ape drop.tip +#' @importFrom rbiom unifrac #' @export -runUnifrac <- function(x, tree, weighted = FALSE, normalized = TRUE, - nodeLab = NULL, BPPARAM = SerialParam(), ...){ +runUnifrac <- function( + x, tree, weighted = FALSE, nodeLab = NULL, ...){ # Check x if( !is.matrix(as.matrix(x)) ){ stop("'x' must be a matrix", call. = FALSE) @@ -228,25 +193,23 @@ runUnifrac <- function(x, tree, weighted = FALSE, normalized = TRUE, if(!.is_a_bool(weighted)){ stop("'weighted' must be TRUE or FALSE.", call. = FALSE) } - if(!.is_a_bool(normalized)){ - stop("'normalized' must be TRUE or FALSE.", call. = FALSE) - } if(is.null(colnames(x)) || is.null(rownames(x))){ stop("colnames and rownames must not be NULL", call. = FALSE) } # nodeLab should be NULL or character vector specifying links between # rows and tree labels if( !(is.null(nodeLab) || - (is.character(nodeLab) && length(nodeLab) == nrow(x) && - all(nodeLab[ !is.na(nodeLab) ] %in% c(tree$tip.label)))) ){ - stop("'nodeLab' must be NULL or character specifying links between ", - "abundance table and tree labels.", call. = FALSE) + (is.character(nodeLab) && length(nodeLab) == nrow(x) && + all(nodeLab[ !is.na(nodeLab) ] %in% c(tree$tip.label)))) ){ + stop( + "'nodeLab' must be NULL or character specifying links between ", + "abundance table and tree labels.", call. = FALSE) } # check that matrix and tree are compatible - if( is.null(nodeLab) && - !all(rownames(x) %in% c(tree$tip.label)) ) { - stop("Incompatible tree and abundance table! Please try to provide ", - "'nodeLab'.", call. = FALSE) + if( is.null(nodeLab) && !all(rownames(x) %in% c(tree$tip.label)) ) { + stop( + "Incompatible tree and abundance table! Please try to provide ", + "'nodeLab'.", call. = FALSE) } # Merge rows, so that rows that are assigned to same tree node are agglomerated # together. If nodeLabs were provided, merge based on those. Otherwise merge @@ -254,144 +217,52 @@ runUnifrac <- function(x, tree, weighted = FALSE, normalized = TRUE, if( is.null(nodeLab) ){ nodeLab <- rownames(x) } - # Merge assay + # Prune tree if there are nodes that cannot be found from tips or if there + # are tips that cannot be found from abundance matrix. It might be + # that certain row is linked to internal node or that the tree has extra + # tips that do not match with rows (e.g. after subsetting). + if( any( !nodeLab %in% tree$tip.label ) || + any( !tree$tip.label %in% nodeLab) ){ + tree <- .prune_tree(tree, nodeLab) + warning("Pruning tree...", call. = FALSE) + } + # If node labels cannot be found from tips even after pruning, give error. + # This kind of tree cannot be used in unifrac since it expects that every + # row is linked to tips. + if( any( !nodeLab %in% tree$tip.label ) ){ + stop( + "Unifrac cannot be calculated since tree is not compatible. ", + "Each row must be linked to tip of the tree.", call. = FALSE) + } + # Merge assay so that each row represent single tip. It might be that + # multiple rows are linked to single tip. x <- .merge_assay_by_rows(x, nodeLab, ...) - # Modify tree + # Modify tree so that it will become rooted. tree <- .norm_tree_to_be_rooted(tree, rownames(x)) # Remove those tips that are not present in the data if( any(!tree$tip.label %in% rownames(x)) ){ - tree <- ape::drop.tip( + tree <- drop.tip( tree, tree$tip.label[!tree$tip.label %in% rownames(x)]) - warning("The tree is pruned so that tips that cannot be found from ", - "the abundance matrix are removed.", call. = FALSE) - } - # - old <- getAutoBPPARAM() - setAutoBPPARAM(BPPARAM) - on.exit(setAutoBPPARAM(old)) - if (!(bpisup(BPPARAM) || is(BPPARAM, "MulticoreParam"))) { - bpstart(BPPARAM) - on.exit(bpstop(BPPARAM), add = TRUE) + warning( + "The tree is pruned so that tips that cannot be found from ", + "the abundance matrix are removed.", call. = FALSE) } - # - # create N x 2 matrix of all pairwise combinations of samples. - spn <- utils::combn(colnames(x), 2, simplify = FALSE) - - ######################################## - # Build the requisite matrices as defined - # in the Fast Unifrac article. - ######################################## - ## This only needs to happen once in a call to Unifrac. - ## Notice that A and B do not appear in this section. - # Begin by building the edge descendants matrix (edge-by-sample) - # `edge_array` - # - # Create a list of descendants, starting from the first internal node (root) - ntip <- length(tree$tip.label) - # Create a matrix that maps each internal node to its 2 descendants - # This matrix doesn't include the tips, so must use node#-ntip to index into - # it - ## to suppress the warning if the number of node edges is uneven, we add the - ## first node again - node.desc <- tree$edge[order(tree$edge[,1]),][,2] - if(length(node.desc) %% 2 == 1){ - node.desc <- c(node.desc,node.desc[1]) - } - node.desc <- matrix(node.desc, byrow = TRUE, ncol = 2) - # Define the edge_array object - # Right now this is a node_array object, each row is a node (including tips) - # It will be subset and ordered to match tree$edge later - edge_array <- matrix(0, nrow = ntip+tree$Nnode, ncol = ncol(x), - dimnames = list(NULL, sample_names = colnames(x))) - # Load the tip counts in directly - x_tip <- x[ rownames(x) %in% tree$tip.label, ] - edge_array[seq_len(ntip),] <- x_tip - # Get a list of internal nodes ordered by increasing depth - ord.node <- order(node.depth(tree))[(ntip+1):(ntip+tree$Nnode)] - # Loop over internal nodes, summing their descendants to get that nodes - # count - for(i in ord.node){ - edge_array[i,] <- colSums(edge_array[node.desc[i-ntip,], , drop=FALSE], - na.rm = TRUE) - } - # Keep only those with a parental edge (drops root) and order to match - # tree$edge - edge_array <- edge_array[tree$edge[,2],] - # calculate the sums per sample - samplesums <- colSums(x) - # Remove unneeded variables. - rm(node.desc) - ############################################################################ - # calculate the distances - ############################################################################ - if(weighted){ - if(!normalized){ - distlist <- BiocParallel::bplapply(spn, - unifrac_weighted_not_norm, - tree = tree, - samplesums = samplesums, - edge_array = edge_array, - BPPARAM = BPPARAM) - } else { - # This is only relevant to weighted-Unifrac. - # For denominator in the normalized distance, we need the age of each - # tip. - # 'z' is the tree in postorder order used in calls to .C - # Descending order of left-hand side of edge (the ancestor to the node) - z <- ape::reorder.phylo(tree, order="postorder") - # Call phyloseq-internal function that in-turn calls ape's internal - # horizontal position function, in C, using the re-ordered phylo object, - # `z` - tipAges = node.depth.edgelength(tree) - # Keep only the tips, and add the tip labels in case `z` order differs - # from `tree` - tipAges <- tipAges[seq.int(1L, length(tree$tip.label))] - names(tipAges) <- z$tip.label - # Explicitly re-order tipAges to match x - tipAges <- tipAges[rownames(x)] - distlist <- BiocParallel::bplapply(spn, - unifrac_weighted_norm, - mat = x, - tree = tree, - samplesums = samplesums, - edge_array = edge_array, - tipAges = tipAges, - BPPARAM = BPPARAM) - } - } else { - # For unweighted Unifrac, convert the edge_array to an occurrence - # (presence/absence binary) array - edge_occ <- (edge_array > 0) - 0 - distlist <- BiocParallel::bplapply(spn, - unifrac_unweighted, - tree = tree, - samplesums = samplesums, - edge_occ = edge_occ, - BPPARAM = BPPARAM) - } - # Initialize UnifracMat with NAs - UnifracMat <- matrix(NA_real_, ncol(x), ncol(x)) - rownames(UnifracMat) <- colnames(UnifracMat) <- colnames(x) - # Matrix-assign lower-triangle of UnifracMat. Then coerce to dist and - # return. - matIndices <- matIndices <- matrix(c(vapply(spn,"[",character(1),2L), - vapply(spn,"[",character(1),1L)), - ncol = 2) - UnifracMat[matIndices] <- unlist(distlist) - # - stats::as.dist(UnifracMat) + # Calculate unifrac. Use implementation from rbiom package + res <- unifrac(x, tree = tree, weighted = weighted) + return(res) } # Aggregate matrix based on nodeLabs. At the same time, rename rows based on nodeLab # --> each row represent specific node of tree +#' @importFrom scuttle sumCountsAcrossFeatures .merge_assay_by_rows <- function(x, nodeLab, average = FALSE, ...){ if( !.is_a_bool(average) ){ stop("'average' must be TRUE or FALSE.", call. = FALSE) } # Merge assay based on nodeLabs - x <- scuttle::sumCountsAcrossFeatures(x, ids = nodeLab, - subset.row = NULL, subset.col = NULL, - average = average) + x <- sumCountsAcrossFeatures( + x, ids = nodeLab, subset.row = NULL, subset.col = NULL, + average = average) # Remove NAs from nodeLab nodeLab <- nodeLab[ !is.na(nodeLab) ] # Get the original order back @@ -399,57 +270,6 @@ runUnifrac <- function(x, tree, weighted = FALSE, normalized = TRUE, return(x) } -unifrac_unweighted <- function(i, tree, samplesums, edge_occ){ - A <- i[1] - B <- i[2] - AT <- samplesums[A] - BT <- samplesums[B] - # Unweighted Unifrac - # Subset matrix to just columns A and B - edge_occ_AB <- edge_occ[, c(A, B)] - edge_occ_AB_rS <- rowSums(edge_occ_AB, na.rm = TRUE) - # Keep only the unique branches. Sum the lengths - edge_uni_AB_sum <- sum((tree$edge.length * edge_occ_AB)[edge_occ_AB_rS < 2,], - na.rm=TRUE) - # Normalize this sum to the total branches among these two samples, A and - # B - uwUFpairdist <- edge_uni_AB_sum / - sum(tree$edge.length[edge_occ_AB_rS > 0]) - uwUFpairdist -} -# if not-normalized weighted Unifrac, just return "numerator"; -# the u-value in the w-Unifrac description -unifrac_weighted_not_norm <- function(i, tree, samplesums, edge_array){ - A <- i[1] - B <- i[2] - AT <- samplesums[A] - BT <- samplesums[B] - # weighted Unifrac - wUF_branchweight <- abs(edge_array[, A]/AT - edge_array[, B]/BT) - # calculate the w-UF numerator - numerator <- sum((tree$edge.length * wUF_branchweight), na.rm = TRUE) - # if not-normalized weighted Unifrac, just return "numerator"; - # the u-value in the w-Unifrac description - numerator -} -unifrac_weighted_norm <- function(i, mat, tree, samplesums, edge_array, - tipAges){ - A <- i[1] - B <- i[2] - AT <- samplesums[A] - BT <- samplesums[B] - # weighted Unifrac - wUF_branchweight <- abs(edge_array[, A]/AT - edge_array[, B]/BT) - # calculate the w-UF numerator - numerator <- sum((tree$edge.length * wUF_branchweight), na.rm = TRUE) - # denominator (assumes tree-indices and matrix indices are same order) - denominator <- sum((tipAges * (mat[, A]/AT + mat[, B]/BT)), na.rm = TRUE) - # return the normalized weighted Unifrac values - numerator / denominator -} - -################################################################################ - #' @importFrom ape is.rooted root .norm_tree_to_be_rooted <- function(tree, names){ if( !is.rooted(tree) ){ diff --git a/man/calculateUnifrac.Rd b/man/calculateUnifrac.Rd index 836958a1c..0ff762973 100644 --- a/man/calculateUnifrac.Rd +++ b/man/calculateUnifrac.Rd @@ -9,14 +9,7 @@ \usage{ calculateUnifrac(x, tree, ...) -\S4method{calculateUnifrac}{ANY,phylo}( - x, - tree, - weighted = FALSE, - normalized = TRUE, - BPPARAM = SerialParam(), - ... -) +\S4method{calculateUnifrac}{ANY,phylo}(x, tree, weighted = FALSE, ...) \S4method{calculateUnifrac}{TreeSummarizedExperiment,missing}( x, @@ -28,15 +21,7 @@ calculateUnifrac(x, tree, ...) ... ) -runUnifrac( - x, - tree, - weighted = FALSE, - normalized = TRUE, - nodeLab = NULL, - BPPARAM = SerialParam(), - ... -) +runUnifrac(x, tree, weighted = FALSE, nodeLab = NULL, ...) } \arguments{ \item{x}{a numeric matrix or a @@ -61,16 +46,6 @@ species/taxa shared between samples, whereas unweighted-Unifrac only considers presence/absence. Default is \code{FALSE}, meaning the unweighted-Unifrac distance is calculated for all pairs of samples.} -\item{normalized}{\code{TRUE} or \code{FALSE}: Should the output be -normalized such that values range from 0 to 1 independent of branch length -values? Default is \code{TRUE}. Note that (unweighted) \code{Unifrac} is -always normalized by total branch-length, and so this value is ignored when -\code{weighted == FALSE}.} - -\item{BPPARAM}{A -\code{\link[BiocParallel:BiocParallelParam-class]{BiocParallelParam}} -object specifying whether the Unifrac calculation should be parallelized.} - \item{assay.type}{a single \code{character} value for specifying which assay to use for calculation.} @@ -100,9 +75,9 @@ node labs must be present in \code{tree}.} a sample-by-sample distance matrix, suitable for NMDS, etc. } \description{ -This function calculates the (Fast) Unifrac distance for all sample-pairs +This function calculates the Unifrac distance for all sample-pairs in a \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} -object. +object. The function utilizes \code{\link[rbiom:unifrac]{rbiom:unifrac()}}. } \details{ Please note that if \code{calculateUnifrac} is used as a \code{FUN} for @@ -113,28 +88,18 @@ data(esophagus) library(scater) calculateUnifrac(esophagus, weighted = FALSE) calculateUnifrac(esophagus, weighted = TRUE) -calculateUnifrac(esophagus, weighted = TRUE, normalized = FALSE) # for using calculateUnifrac in conjunction with runMDS the tree argument # has to be given separately. In addition, subsetting using ntop must # be disabled esophagus <- runMDS(esophagus, FUN = calculateUnifrac, name = "Unifrac", tree = rowTree(esophagus), - exprs_values = "counts", + assay.type = "counts", ntop = nrow(esophagus)) reducedDim(esophagus) } \references{ \url{http://bmf.colorado.edu/unifrac/} -The main implementation (Fast Unifrac) is adapted from the algorithm's -description in: - -Hamady, Lozupone, and Knight, -``\href{http://www.nature.com/ismej/journal/v4/n1/full/ismej200997a.html}{Fast -UniFrac:} facilitating high-throughput phylogenetic analyses of microbial -communities including analysis of pyrosequencing and PhyloChip data.'' The -ISME Journal (2010) 4, 17--27. - See also additional descriptions of Unifrac in the following articles: Lozupone, Hamady and Knight, ``Unifrac - An Online Tool for Comparing @@ -148,7 +113,3 @@ microbial communities.'' Appl Environ Microbiol. 2007 Lozupone C, Knight R. ``Unifrac: a new phylogenetic method for comparing microbial communities.'' Appl Environ Microbiol. 2005 71 (12):8228-35. } -\author{ -Paul J. McMurdie. -Adapted for mia by Felix G.M. Ernst -} diff --git a/tests/testthat/test-5Unifrac.R b/tests/testthat/test-5Unifrac.R index 6080e2f22..023842946 100644 --- a/tests/testthat/test-5Unifrac.R +++ b/tests/testthat/test-5Unifrac.R @@ -1,100 +1,85 @@ context("Unifrac beta diversity") test_that("Unifrac beta diversity", { - skip_if_not(require("phyloseq", quietly = TRUE)) - data(esophagus, package="mia") tse <- esophagus tse <- transformAssay(tse, assay.type="counts", method="relabundance") expect_error( calculateUnifrac(tse, assay.type = "test", tree_name = "phylo", - weighted = FALSE, normalized = TRUE, - BPPARAM = SerialParam()) + weighted = FALSE) ) expect_error( calculateUnifrac(tse, assay.type = 2, tree_name = "phylo", - weighted = FALSE, normalized = TRUE, - BPPARAM = SerialParam()) + weighted = FALSE) ) expect_error( calculateUnifrac(tse, assay.type = TRUE, tree_name = "phylo", - weighted = FALSE, normalized = TRUE, - BPPARAM = SerialParam()) + weighted = FALSE) ) expect_error( calculateUnifrac(tse, assay.type = "counts", tree_name = "test", - weighted = FALSE, normalized = TRUE, - BPPARAM = SerialParam()) + weighted = FALSE) ) expect_error( calculateUnifrac(tse, assay.type = "counts", tree_name = 1, - weighted = FALSE, normalized = TRUE, - BPPARAM = SerialParam()) + weighted = FALSE) ) expect_error( calculateUnifrac(tse, assay.type = "counts", tree_name = TRUE, - weighted = "FALSE", normalized = TRUE, - BPPARAM = SerialParam()) - ) - expect_error( - calculateUnifrac(tse, assay.type = "counts", tree_name = "phylo", - weighted = 1, normalized = TRUE, - BPPARAM = SerialParam()) - ) - expect_error( - calculateUnifrac(tse, assay.type = "counts", tree_name = "phylo", - weighted = FALSE, normalized = "TRUE", - BPPARAM = SerialParam()) + weighted = "FALSE") ) expect_error( calculateUnifrac(tse, assay.type = "counts", tree_name = "phylo", - weighted = FALSE, normalized = 1, - BPPARAM = SerialParam()) + weighted = 1) ) data(GlobalPatterns, package="mia") tse <- GlobalPatterns - # Compare to phyloseq function - .require_package("phyloseq") - # Convert data into phyloseq - pseq <- makePhyloseqFromTreeSE(tse) - # Calculate unifrac - unifrac_tse <- as.matrix(calculateUnifrac(tse)) - unifrac_pseq <- as.matrix(phyloseq::UniFrac(pseq)) - expect_equal(unifrac_tse, unifrac_pseq) - # Calculate unifrac - unifrac_tse <- as.matrix(calculateUnifrac(tse, weighted = TRUE, normalized = FALSE)) - unifrac_pseq <- as.matrix(phyloseq::UniFrac(pseq, weighted = TRUE, normalized = FALSE)) - expect_equal(unifrac_tse, unifrac_pseq) + # Calculate unweighted unifrac + unifrac_mia <- as.matrix(calculateUnifrac(tse, weighted = FALSE)) + unifrac_rbiom <- as.matrix(rbiom::unifrac(assay(tse), weighted = FALSE, + rowTree(tse))) + expect_equal(unifrac_mia, unifrac_rbiom) + # Calculate weighted unifrac. Allow tolerance since weighted unifrac + # calculation in rbiom has some stochasticity. That is most likely due + # multithreading and complex structure of tree (loops). + unifrac_mia <- as.matrix(calculateUnifrac(tse, weighted = TRUE)) + unifrac_rbiom <- as.matrix(rbiom::unifrac(assay(tse), weighted = TRUE, + rowTree(tse))) + expect_equal(unifrac_mia, unifrac_rbiom, tolerance = 1e-4) - # Test with merged object with multiple trees + # Test with merged object with multiple trees. runUnifrac takes subset of + # data based on provided tree. tse <- mergeSEs(GlobalPatterns, esophagus, assay.type="counts", missing_values = 0) + tse_ref <- tse + tse_ref <- tse_ref[ rowLinks(tse_ref)[["whichTree"]] == "phylo", ] + # Calculate unweighted unifrac + unifrac_mia <- as.matrix(calculateUnifrac(tse, weighted = FALSE)) + unifrac_rbiom <- as.matrix(rbiom::unifrac(assay(tse_ref), weighted = FALSE, + rowTree(tse_ref))) + expect_equal(unifrac_mia, unifrac_rbiom) + # Calculate weighted unifrac + unifrac_mia <- as.matrix(calculateUnifrac(tse, weighted = TRUE)) + unifrac_rbiom <- as.matrix(rbiom::unifrac(assay(tse_ref), weighted = TRUE, + rowTree(tse_ref))) + expect_equal(unifrac_mia, unifrac_rbiom, tolerance = 1e-4) - # Convert data into phyloseq - pseq <- makePhyloseqFromTreeSE(tse, assay.type="counts") - # Convert back to TreeSE (pseq has pruned tree) - tse <- makeTreeSEFromPhyloseq(pseq) - # Calculate unifrac - unifrac_tse <- as.matrix(calculateUnifrac(tse)) - unifrac_pseq <- as.matrix(phyloseq::UniFrac(pseq)) - expect_equal(unifrac_tse, unifrac_pseq) - # Calculate unifrac - unifrac_tse <- as.matrix(calculateUnifrac(tse, weighted = TRUE, normalized = FALSE, - tree_name = "phylo")) - unifrac_pseq <- as.matrix(phyloseq::UniFrac(pseq, weighted = TRUE, normalized = FALSE)) - expect_equal(unifrac_tse, unifrac_pseq) - - # Test the function with agglomerated data - tse <- agglomerateByRank(tse, rank = "Phylum") - # Convert data into phyloseq - suppressWarnings( pseq <- makePhyloseqFromTreeSE(tse) ) - # Convert back to TreeSE (pseq has pruned tree) - tse <- makeTreeSEFromPhyloseq(pseq) - # Calculate unifrac - unifrac_tse <- as.matrix(calculateUnifrac(tse, normalized = TRUE)) - unifrac_pseq <- as.matrix(phyloseq::UniFrac(pseq, normalized = TRUE)) - expect_equal(unifrac_tse, unifrac_pseq) - - # Detach phyloseq package - unloadNamespace("phyloseq") + # Test the function with agglomerated data. runUnifrac renames rownames + # based on tips and links to them. Then it also prunes the tree so that + # rows are in tips. + tse <- GlobalPatterns + tse <- agglomerateByRank(tse, rank = "Species") + tse_ref <- tse + rownames(tse_ref) <- rowLinks(tse_ref)[["nodeLab"]] + rowTree(tse_ref) <- .prune_tree(rowTree(tse_ref), rowLinks(tse_ref)[["nodeLab"]]) + # Calculate unweighted unifrac + unifrac_mia <- as.matrix(calculateUnifrac(tse, weighted = FALSE)) + unifrac_rbiom <- as.matrix(rbiom::unifrac(assay(tse_ref), weighted = FALSE, + rowTree(tse_ref))) + # Calculate weighted unifrac. No tolerance needed since the tree has + # simpler structure after pruning. + unifrac_mia <- as.matrix(calculateUnifrac(tse, weighted = TRUE)) + unifrac_rbiom <- as.matrix(rbiom::unifrac(assay(tse_ref), weighted = TRUE, + rowTree(tse_ref))) + expect_equal(unifrac_mia, unifrac_rbiom) })