From 5fb22f7c21e0ec974c7c8abb69df1f124ee2e460 Mon Sep 17 00:00:00 2001 From: const-ae Date: Wed, 12 Jun 2024 13:16:38 +0200 Subject: [PATCH] In pseudobulk detect if smart subset of matrix can be used --- R/pseudobulk.R | 17 +++++++++++++---- tests/testthat/test-pseudobulk.R | 11 +++++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/R/pseudobulk.R b/R/pseudobulk.R index 2b4659c..9cc6dba 100644 --- a/R/pseudobulk.R +++ b/R/pseudobulk.R @@ -87,7 +87,12 @@ pseudobulk <- function(data, group_by, ..., if(verbose) message("Aggregating assay '", assay_name, "' using '", aggr_fnc_res$label, "'.") data_mat <- SummarizedExperiment::assay(data, assay_name) new_data_mat <- do.call(cbind, lapply(group_split, function(idx){ - aggr_fnc(data_mat[,idx,drop=FALSE]) + if(aggr_fnc_res$smart_subset){ + # This optimization needs sparseMatrixStats >= 1.17.1 to be effective + aggr_fnc(data_mat, cols = idx) + }else{ + aggr_fnc(data_mat[,idx,drop=FALSE]) + } })) rownames(new_data_mat) <- rownames(data) new_data_mat @@ -110,7 +115,11 @@ pseudobulk <- function(data, group_by, ..., if(is(tdata_mat, "LinearEmbeddingMatrix")){ data_mat <- t(SingleCellExperiment::sampleFactors(tdata_mat)) new_data_mat <- do.call(cbind, lapply(group_split, function(idx){ - aggr_fnc(data_mat[,idx,drop=FALSE]) + if(aggr_fnc_res$smart_subset){ + aggr_fnc(data_mat, cols = idx) + }else{ + aggr_fnc(data_mat[,idx,drop=FALSE]) + } })) SingleCellExperiment::LinearEmbeddingMatrix(t(new_data_mat), SingleCellExperiment::featureLoadings(tdata_mat), factorData = SingleCellExperiment::factorData(tdata_mat)) @@ -191,8 +200,8 @@ get_aggregation_function <- function(assay_name, aggregation_functions){ }else{ label <- "custom function" } - - list(fnc = aggr_fnc, label = label) + smart_subset <- "cols" %in% names(formals(aggr_fnc)) + list(fnc = aggr_fnc, label = label, smart_subset = smart_subset) } #' Quote grouping variables diff --git a/tests/testthat/test-pseudobulk.R b/tests/testthat/test-pseudobulk.R index e010a7e..6e5288d 100644 --- a/tests/testthat/test-pseudobulk.R +++ b/tests/testthat/test-pseudobulk.R @@ -98,3 +98,14 @@ test_that("NA's don't mess up the results", { psce <- pseudobulk(sce, group_by = vars(fav_food), make_colnames = FALSE) expect_equal(SummarizedExperiment::colData(psce)$fav_food, unique(SummarizedExperiment::colData(sce)$fav_food)) }) + + +# Compare speed of complex aggregation with smart_subsetting +# mat <- as(matrix(rpois(n = 1000 * 1e5, lambda = 0.1), nrow = 1000, ncol = 1e5), "dgCMatrix") +# sce <- SingleCellExperiment::SingleCellExperiment(list(counts = mat)) +# grouping <- sample(1:3000, size = 1e5, replace = TRUE) +# bench::mark( +# smart = pseudobulk(sce, group_by = vars(grouping), aggregation_functions = list("counts" = sparseMatrixStats::rowMeans2)), +# not_smart = pseudobulk(sce, group_by = vars(grouping), aggregation_functions = list("counts" = Matrix::rowMeans)) +# ) +