diff --git a/DESCRIPTION b/DESCRIPTION index 211ca1a..0bf9226 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: SigGenes Title: Automated differential analysis and signature generation for bulk and single-cell data Description: Automated differential analysis between groups based on limma-voom and generation of per-group signatures via custom filtering. -Version: 1.0.4 +Version: 1.0.5 Author: Alexander Bender [aut,cre] Maintainer: Alexander Bender License: LGPL (>=2) diff --git a/NAMESPACE b/NAMESPACE index c8b1625..a23bc3b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,10 +13,14 @@ importFrom(SummarizedExperiment,assayNames) importFrom(SummarizedExperiment,colData) importFrom(edgeR,DGEList) importFrom(edgeR,calcNormFactors) +importFrom(edgeR,cpm) importFrom(edgeR,filterByExpr) importFrom(edgeR,voomLmFit) +importFrom(limma,arrayWeights) importFrom(limma,contrasts.fit) +importFrom(limma,duplicateCorrelation) importFrom(limma,is.fullrank) +importFrom(limma,lmFit) importFrom(limma,makeContrasts) importFrom(limma,topTreat) importFrom(limma,treat) diff --git a/NEWS b/NEWS index 5eb60d5..a8ba40b 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,9 @@ +CHANGES IN VERSION 1.0.5 +------------------------ + + o option to use limma-trend for testing rather than default limma-voom + + CHANGES IN VERSION 1.0.4 ------------------------ diff --git a/R/de_testing.R b/R/de_testing.R index 5f7fbb6..f4413ae 100644 --- a/R/de_testing.R +++ b/R/de_testing.R @@ -31,27 +31,36 @@ #' @param use_weights Logical, whether to use empirical sample weights in \code{edgeR::voomLmFit} #' @param mode Either "pairwise" to run all pairwise comparisons between `main_covariate` levels, #' or "average" to test each level against the average of all other levels. +#' @param use_trend Logical, if TRUE will convert data to logCPM with edgeR first and then use limma-trend for testing. #' @param use_filterByExpr Logical, whether to run \code{filterByExpr} from `edgeR` for prefiltering of counts. #' This makes sense when testing for example bulk RNA-seq data. #' @param return_object Logical, whether to return the DGEList in the output #' @param delim A delimiter string used in the names of the output list. #' #' @examples +#' # Testing on pseudobulk level with limma-voom #' set.seed(1) -#' sce <- scuttle::mockSCE(ncells=1000, ngenes=1000, nspikes=0) -#' sce$group <- rep(LETTERS[1:4], each=250) +#' sce <- scuttle::mockSCE(ncells = 1000, ngenes = 1000, nspikes = 0) +#' sce$group <- rep(LETTERS[1:4], each = 250) #' sce$donor <- rep(LETTERS[1:4], 250) -#' res_pairwise <- de_limma(x = sce, aggregate_by = c("group", "donor"), -#' main_covariate = "group", other_covariates = c("donor")) +#' res_pairwise <- de_limma( +#' x = sce, aggregate_by = c("group", "donor"), +#' main_covariate = "group", other_covariates = c("donor") +#' ) #' -#' res_pairwise_block <- de_limma(x = sce, aggregate_by = c("group", "donor"), -#' main_covariate = "group", block = "donor") +#' res_pairwise_block <- de_limma( +#' x = sce, aggregate_by = c("group", "donor"), +#' main_covariate = "group", block = "donor" +#' ) #' -#' res_average <- de_limma(x = sce, aggregate_by = c("group", "donor"), -#' main_covariate = "group", other_covariates = c("donor"), -#' mode="average") +#' res_average <- de_limma( +#' x = sce, aggregate_by = c("group", "donor"), +#' main_covariate = "group", other_covariates = c("donor"), +#' mode = "average" +#' ) #' -#' res_singlecell <- de_limma(x = sce, main_covariate = "group") +#' # Testing on single-cell level with limma-trend +#' res_singlecell <- de_limma(x = sce, main_covariate = "group", use_trend = TRUE) #' #' @author Alexander Bender #' @@ -65,99 +74,98 @@ #' @importFrom SingleCellExperiment colData sizeFactors sizeFactors<- #' @importFrom SummarizedExperiment assay assayNames colData #' @importFrom scuttle aggregateAcrossCells librarySizeFactors -#' @importFrom edgeR calcNormFactors DGEList filterByExpr voomLmFit +#' @importFrom edgeR calcNormFactors cpm DGEList filterByExpr voomLmFit #' @importFrom stats model.matrix as.formula median p.adjust -#' @importFrom limma contrasts.fit is.fullrank makeContrasts treat topTreat +#' @importFrom limma arrayWeights contrasts.fit duplicateCorrelation is.fullrank makeContrasts lmFit treat topTreat #' @importFrom utils combn head #' @importFrom S4Vectors SimpleList #' @importFrom methods as is #' #' @export -de_limma <- function(x, use_assay = "counts", aggregate_by=NULL, use_existing_sf = TRUE, - main_covariate, other_covariates=NULL, block=NULL, - min_pct=0, min_fc = 1.0, use_weights = FALSE, - mode=c("pairwise", "average"), - use_filterByExpr = FALSE, return_object = FALSE, delim="_vs_"){ - +de_limma <- function(x, use_assay = "counts", aggregate_by = NULL, use_existing_sf = TRUE, + main_covariate, other_covariates = NULL, + test_method = c("trend", "voom"), block = NULL, + min_pct = 0, min_fc = 1.0, use_weights = FALSE, + mode = c("pairwise", "average"), use_trend = FALSE, + use_filterByExpr = FALSE, return_object = FALSE, delim = "_vs_") { # Checks is_sce <- is(x, "SingleCellExperiment") - if(!is_sce) stop("x must be a SingleCellExperiment") + if (!is_sce) stop("x must be a SingleCellExperiment") - if(!use_assay %in% assayNames(x)) + if (!use_assay %in% assayNames(x)) { stop(paste("x contains no assay named", use_assay)) + } - if(!is.null(aggregate_by)){ + if (!is.null(aggregate_by)) { is_agg <- aggregate_by %in% colnames(colData(x)) - if(!sum(is_agg) == length(aggregate_by)) stop("Not all entries in are in colData(x)") + if (!sum(is_agg) == length(aggregate_by)) stop("Not all entries in are in colData(x)") } - if(!is.null(block)){ - if(!block %in% colnames(colData(x))) + if (!is.null(block)) { + if (!block %in% colnames(colData(x))) { stop("Name of block not in colData(x)") + } } - if(!is(use_existing_sf, "logical")) + if (!is(use_existing_sf, "logical")) { stop("use_existing_sf must be logical") + } is_too_long_main <- length(main_covariate) - if(is_too_long_main > 1) stop("main_covariate must be length 1") + if (is_too_long_main > 1) stop("main_covariate must be length 1") is_main <- all(main_covariate %in% colnames(colData(x))) - if(!is_main) stop("Not all entries in are in colData(x)") + if (!is_main) stop("Not all entries in are in colData(x)") - if(!is.null(other_covariates)){ + if (!is.null(other_covariates)) { is_other <- all(other_covariates %in% colnames(colData(x))) - if(!is_other) stop("Not all entries in are in colData(x)") + if (!is_other) stop("Not all entries in are in colData(x)") } - if(!is.null(aggregate_by)){ + if (!is.null(aggregate_by)) { is_agg <- all(aggregate_by %in% colnames(colData(x))) - if(!is_agg) stop("Not all entries in are in colData(x)") + if (!is_agg) stop("Not all entries in are in colData(x)") } is_pct <- min_pct >= 0 & is.numeric(min_pct) - if(!is_pct) stop("min_pct must be numeric and not negative") + if (!is_pct) stop("min_pct must be numeric and not negative") is_fc <- min_fc >= 1.0 & is.numeric(min_fc) - if(!is_fc) stop("is_fc must be numeric and >= 1.0") + if (!is_fc) stop("is_fc must be numeric and >= 1.0") - if(!is.logical(use_weights)) stop("use_weights must be logical") + if (!is.logical(use_weights)) stop("use_weights must be logical") invisible(match.arg(mode, c("pairwise", "average"))) mode <- match.arg(mode) - if(!is(delim, "character")) stop("delim must be a charcter") + if (!is(delim, "character")) stop("delim must be a charcter") # Aggregate to pseudobulk - if(!is.null(aggregate_by)){ - - pb <- aggregateAcrossCells(x, id=colData(x)[,aggregate_by], use.assay.type=use_assay) + if (!is.null(aggregate_by)) { + pb <- aggregateAcrossCells(x, id = colData(x)[, aggregate_by], use.assay.type = use_assay) pb <- DGEList(counts = assay(pb), samples = data.frame(colData(pb), check.names = FALSE)) run_norm <- TRUE - } else { - run_norm <- FALSE # means no calcNormFactors w <- which(assayNames(x) == use_assay) - pb <- DGEList(counts = assay(x, use_assay), samples = data.frame(colData(x), check.names = FALSE)) + pb <- DGEList(counts = as.matrix(assay(x, use_assay)), samples = data.frame(colData(x), check.names = FALSE)) # Normalization sf <- suppressWarnings(sizeFactors(x)) sf_null <- is.null(sf) - if(sf_null){ + if (sf_null) { use_existing_sf <- FALSE } - if(!use_existing_sf){ + if (!use_existing_sf) { sf <- librarySizeFactors(x) } # Convert Size factors to normalization factors based on scran::convertTo() - nf <- log(sf/pb$samples$lib.size) + nf <- log(sf / pb$samples$lib.size) nf <- exp(nf - mean(nf)) pb$samples$norm.factors <- nf - } pb$samples <- droplevels.data.frame(pb$samples) @@ -165,121 +173,135 @@ de_limma <- function(x, use_assay = "counts", aggregate_by=NULL, use_existing_sf # Require replication in at least one group max_n <- max(table(pb$samples[[main_covariate]])) - if(max_n == 1) stop("No replication in any group -- DE analysis not possible!") + if (max_n == 1) stop("No replication in any group -- DE analysis not possible!") # Design, allowing any additional covariates - others <- if(!is.null(other_covariates)) paste(other_covariates, collapse = " + ") else "" - f <- as.formula(paste("~ 0", main_covariate, others, sep = "+ ")) + others <- if (!is.null(other_covariates)) paste(other_covariates, collapse = " + ") else "" + strg <- paste("~ 0", main_covariate, others, sep = " + ") + f <- as.formula(gsub(" \\+ $", "", strg)) design <- model.matrix(f, pb$samples) colnames(design) <- gsub(paste0("^", main_covariate), "", colnames(design)) is_fr <- is.fullrank(design) - if(!is_fr) stop("Design is not full-rank, meaning you cannot adjust for these covariates!") + if (!is_fr) stop("Design is not full-rank, meaning you cannot adjust for these covariates!") # Optionally prefilter with edgeR - if(use_filterByExpr){ - + if (use_filterByExpr) { keep_fbe <- filterByExpr(pb, group = pb$samples[[main_covariate]]) - pb <- pb[keep_fbe,] - - + pb <- pb[keep_fbe, ] } # Prefilter to genes that in at least one group is expressed by, then normalize pexp <- get_pexpr(assay(x, use_assay), group = x[[main_covariate]]) - if(min_pct > 0){ - + if (min_pct > 0) { keep <- apply(pexp >= min_pct, 1, sum) > 0 - pb <- pb[keep,] - + pb <- pb[keep, ] } - rm(x) + # rm(x) - if(nrow(pb) == 0) return(NULL) - if(nrow(pb) < 10){ + if (nrow(pb) == 0) { + return(NULL) + } + if (nrow(pb) < 10) { warning("Fewer than 10 genes (arbitrary threshold to trigger this warning) left after expression filter.") } # Only run normalization if doing pseudobulk analysis - if(run_norm){ + if (run_norm) { pb <- calcNormFactors(pb, method = "TMMwsp") } - # For now we do voomLmFit - if(is.null(block)){ + if (is.null(block)) { blocker <- NULL } else { blocker <- pb$samples[[block]] } - v <- voomLmFit(counts = pb, design = design, sample.weights = use_weights, block = blocker) + # USe voom by default, or use trend + if (!use_trend) { + v <- voomLmFit(counts = pb, design = design, sample.weights = use_weights, block = blocker) + trend <- FALSE + } else { + lcpm <- cpm(pb, log = TRUE) + + aw <- NULL + if (use_weights) { + aw <- arrayWeights(object = lcpm, design = design) + } + + if (!is.null(block)) { + if (nrow(lcpm) > 2000) { + spl <- sample(x = 1:nrow(lcpm), size = 2000, replace = FALSE) + } + + dcor <- duplicateCorrelation(object = lcpm[spl, ], design = design, block = blocker, weights = aw) + + v <- lmFit(object = lcpm, design = design, weights = aw, block = blocker, correlation = dcor$consensus) + } else { + v <- lmFit(object = lcpm, design = design, weights = aw) + } + + trend <- TRUE + } # Make all pairwise comparisons and make the contrasts.fit ux <- pb$samples[[main_covariate]] - if(is(ux, "factor")){ + if (is(ux, "factor")) { u <- levels(droplevels(ux)) } else { u <- unique(as.character(ux)) } # -- Pairwise -- # ----------------------------------------------------------- - if(mode=="pairwise"){ - + if (mode == "pairwise") { contrasts <- .make_contrasts_pairwise(u, design, delim) v <- contrasts.fit(fit = v, contrasts = contrasts) - v <- treat(v, fc=min_fc) + v <- treat(v, fc = min_fc, trend = trend) # Extract results per contrast iter <- colnames(contrasts) - de_results <- lapply(iter, function(i){ - + de_results <- lapply(iter, function(i) { s <- strsplit(i, delim)[[1]] first <- s[1] second <- s[2] - tt <- topTreat(fit = v, coef = i, number=Inf) - tt <- tt[order(tt$t, decreasing = TRUE),] + tt <- topTreat(fit = v, coef = i, number = Inf) + tt <- tt[order(tt$t, decreasing = TRUE), ] pe <- pexp[rownames(tt), c(first, second)] colnames(pe) <- c("pct.1", "pct.2") tt <- cbind(tt, pe) - tt <- tt[tt$pct.1 >= min_pct | tt$pct.2 >= min_pct,] + tt <- tt[tt$pct.1 >= min_pct | tt$pct.2 >= min_pct, ] tt$adj.P.Val <- p.adjust(tt$P.Value, "BH") tt - }) names(de_results) <- iter - } - if(mode=="average"){ - + if (mode == "average") { contrasts <- .make_contrasts_average(u, design) v <- contrasts.fit(fit = v, contrasts = contrasts) - v <- treat(v, fc=min_fc) + v <- treat(v, fc = min_fc) # Extract significant genes per contrast iter <- colnames(contrasts) - de_results <- lapply(iter, function(i){ - - tt <- topTreat(fit = v, coef = i, number=Inf) - tt <- tt[order(tt$t, decreasing = TRUE),] - pct.1 <- round(as.numeric(pexp[rownames(tt), i, drop=TRUE]), 2) - pct.2 <- round(as.numeric(apply(pexp[rownames(tt), setdiff(colnames(pexp), i), drop=FALSE], 1, mean)), 2) + de_results <- lapply(iter, function(i) { + tt <- topTreat(fit = v, coef = i, number = Inf) + tt <- tt[order(tt$t, decreasing = TRUE), ] + pct.1 <- round(as.numeric(pexp[rownames(tt), i, drop = TRUE]), 2) + pct.2 <- round(as.numeric(apply(pexp[rownames(tt), setdiff(colnames(pexp), i), drop = FALSE], 1, mean)), 2) tt$pct.1 <- pct.1 tt$pct.2 <- pct.2 - tt <- tt[tt$pct.1 >= min_pct,] + tt <- tt[tt$pct.1 >= min_pct, ] tt$adj.P.Val <- p.adjust(tt$P.Value, "BH") tt - }) names(de_results) <- iter - } de_results <- sapply(de_results, function(x) data.frame(Gene = rownames(x), x), simplify = FALSE) @@ -287,10 +309,8 @@ de_limma <- function(x, use_assay = "counts", aggregate_by=NULL, use_existing_sf to_return <- SimpleList(results = de_results) - if(return_object){ - + if (return_object) { to_return[["DGEList"]] <- pb - } to_return[["params"]] <- list() @@ -301,5 +321,4 @@ de_limma <- function(x, use_assay = "counts", aggregate_by=NULL, use_existing_sf return(to_return) - } diff --git a/man/de_limma.Rd b/man/de_limma.Rd index 4ef17aa..bee2e2d 100644 --- a/man/de_limma.Rd +++ b/man/de_limma.Rd @@ -11,11 +11,13 @@ de_limma( use_existing_sf = TRUE, main_covariate, other_covariates = NULL, + test_method = c("trend", "voom"), block = NULL, min_pct = 0, min_fc = 1, use_weights = FALSE, mode = c("pairwise", "average"), + use_trend = FALSE, use_filterByExpr = FALSE, return_object = FALSE, delim = "_vs_" @@ -51,6 +53,8 @@ usually has already been performed. If TRUE and no size factors can be found wil \item{mode}{Either "pairwise" to run all pairwise comparisons between `main_covariate` levels, or "average" to test each level against the average of all other levels.} +\item{use_trend}{Logical, if TRUE will convert data to logCPM with edgeR first and then use limma-trend for testing.} + \item{use_filterByExpr}{Logical, whether to run \code{filterByExpr} from `edgeR` for prefiltering of counts. This makes sense when testing for example bulk RNA-seq data.} @@ -73,21 +77,29 @@ individual results for identical contrasts e.g. A-B / B-A can differ if \code{mi the gene with more than \code{min_pct} of cells. If {min_pct} is 0 then these two lists would be the same. } \examples{ +# Testing on pseudobulk level with limma-voom set.seed(1) -sce <- scuttle::mockSCE(ncells=1000, ngenes=1000, nspikes=0) -sce$group <- rep(LETTERS[1:4], each=250) +sce <- scuttle::mockSCE(ncells = 1000, ngenes = 1000, nspikes = 0) +sce$group <- rep(LETTERS[1:4], each = 250) sce$donor <- rep(LETTERS[1:4], 250) -res_pairwise <- de_limma(x = sce, aggregate_by = c("group", "donor"), - main_covariate = "group", other_covariates = c("donor")) +res_pairwise <- de_limma( + x = sce, aggregate_by = c("group", "donor"), + main_covariate = "group", other_covariates = c("donor") +) -res_pairwise_block <- de_limma(x = sce, aggregate_by = c("group", "donor"), - main_covariate = "group", block = "donor") +res_pairwise_block <- de_limma( + x = sce, aggregate_by = c("group", "donor"), + main_covariate = "group", block = "donor" +) -res_average <- de_limma(x = sce, aggregate_by = c("group", "donor"), - main_covariate = "group", other_covariates = c("donor"), - mode="average") +res_average <- de_limma( + x = sce, aggregate_by = c("group", "donor"), + main_covariate = "group", other_covariates = c("donor"), + mode = "average" +) -res_singlecell <- de_limma(x = sce, main_covariate = "group") +# Testing on single-cell level with limma-trend +res_singlecell <- de_limma(x = sce, main_covariate = "group", use_trend = TRUE) } \references{