Merge pull request #66 from LTLA/tatamize

Switch from the old beachmat API to the new tatami library.
const-ae · Nov 26, 2024 · 590440f · 590440f
2 parents 7566610 + 53254a4
commit 590440f
Show file tree

Hide file tree

Showing 11 changed files with 136 additions and 157 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -13,7 +13,7 @@ Description: Fit linear models to overdispersed count data.
      typically occur in single cell RNA-seq experiments.
 License: GPL-3
 Encoding: UTF-8
-SystemRequirements: C++11
+SystemRequirements: C++17
 Suggests: 
     testthat (>= 2.1.0),
     zoo,
@@ -37,7 +37,8 @@ Suggests:
 LinkingTo: 
     Rcpp,
     RcppArmadillo,
-    beachmat
+    beachmat,
+    assorthead
 Imports: 
     Rcpp,
     DelayedMatrixStats,
@@ -56,7 +57,7 @@ Imports:
     rlang,
     vctrs
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 URL: https://github.com/const-ae/glmGamPoi
 BugReports: https://github.com/const-ae/glmGamPoi/issues
 biocViews: Regression, RNASeq, Software, SingleCell

diff --git a/NAMESPACE b/NAMESPACE
@@ -20,6 +20,7 @@ import(stats)
 importFrom(BiocGenerics,end)
 importFrom(BiocGenerics,width)
 importFrom(SummarizedExperiment,assay)
+importFrom(beachmat,initializeCpp)
 importFrom(methods,as)
 importFrom(methods,canCoerce)
 importFrom(methods,is)

diff --git a/R/estimate_betas.R b/R/estimate_betas.R
@@ -39,6 +39,7 @@ estimate_betas_roughly <- function(Y, model_matrix, offset_matrix, pseudo_count
 #'   * `iterations` the number of iterations
 #'
 #' @keywords internal
+#' @importFrom beachmat initializeCpp
 estimate_betas_fisher_scoring <- function(Y, model_matrix, offset_matrix,
                                           dispersions, beta_mat_init, ridge_penalty,
                                           try_recovering_convergence_problems = TRUE){
@@ -58,7 +59,8 @@ estimate_betas_fisher_scoring <- function(Y, model_matrix, offset_matrix,
     attr(ridge_penalty, "target") <- ridge_target
   }
 
-  betaRes <- fitBeta_fisher_scoring(Y, model_matrix, exp(offset_matrix), dispersions, beta_mat_init,
+  exp_offset_matrix <- exp(offset_matrix)
+  betaRes <- fitBeta_fisher_scoring(initializeCpp(Y), model_matrix, initializeCpp(exp_offset_matrix), dispersions, beta_mat_init,
                                     ridge_penalty_nl = ridge_penalty, tolerance = 1e-8,
                                     max_rel_mu_change = 1e5, max_iter =  max_iter)
   not_converged <- betaRes$iter == max_iter
@@ -179,6 +181,7 @@ estimate_betas_roughly_group_wise <- function(Y, offset_matrix, groups){
 #'   * `deviances` the deviance for each gene (sum of the deviance per group)
 #'
 #' @keywords internal
+#' @importFrom beachmat initializeCpp
 estimate_betas_group_wise <- function(Y, offset_matrix,  dispersions, beta_group_init = NULL, beta_mat_init = NULL, groups, model_matrix){
   stopifnot(nrow(beta_group_init) == nrow(Y))
   stopifnot(ncol(beta_group_init) == length(unique(groups)))
@@ -192,8 +195,11 @@ estimate_betas_group_wise <- function(Y, offset_matrix,  dispersions, beta_group
   }
 
   Beta_res_list <- lapply(unique(groups), function(gr){
-    betaRes <- fitBeta_one_group(Y[, gr == groups, drop = FALSE],
-                                 offset_matrix[, gr == groups, drop = FALSE], thetas = dispersions,
+    chosen <- gr == groups
+    Y_gr <- Y[, chosen, drop = FALSE]
+    offset_gr <- offset_matrix[, chosen, drop = FALSE]
+    betaRes <- fitBeta_one_group(initializeCpp(Y_gr),
+                                 initializeCpp(offset_gr), thetas = dispersions,
                                  beta_start_values = beta_group_init[, gr == unique(groups),drop=TRUE],
                                  tolerance = 1e-8, maxIter = 100)
   })

diff --git a/R/overdispersion.R b/R/overdispersion.R
@@ -77,6 +77,7 @@
 #'
 #' @seealso [glm_gp()]
 #' @export
+#' @importFrom beachmat initializeCpp
 overdispersion_mle <- function(y, mean,
                            model_matrix = NULL,
                            do_cox_reid_adjustment = ! is.null(model_matrix),
@@ -110,7 +111,7 @@ overdispersion_mle <- function(y, mean,
       estimate_global_overdispersion(y, mean, model_matrix, do_cox_reid_adjustment)
     }else{
       # This function calls overdispersion_mle() for each row, but is faster than a vapply()
-      estimate_overdispersions_fast(y, mean, model_matrix, do_cox_reid_adjustment, n_subsamples, max_iter)
+      estimate_overdispersions_fast(initializeCpp(y), initializeCpp(mean), model_matrix, do_cox_reid_adjustment, n_subsamples, max_iter)
     }
   }else{
     overdispersion_mle_impl(as.numeric(y), mean, model_matrix, do_cox_reid_adjustment,
@@ -254,13 +255,13 @@ conventional_overdispersion_mle <- function(y, mean_vector,
 }
 
 
-
+#' @importFrom beachmat initializeCpp
 estimate_global_overdispersion <- function(Y, Mu, model_matrix, do_cox_reid_adjustment){
   # The idea to calculate the log-likelihood and than maximize the interpolation
   # is from edgeR.
   # The runtime is linear with the number of `log_thetas`
   log_thetas <- seq(-6, 1, length.out = 10)
-  log_likes <- estimate_global_overdispersions_fast(Y, Mu, model_matrix, do_cox_reid_adjustment, log_thetas)
+  log_likes <- estimate_global_overdispersions_fast(initializeCpp(Y), initializeCpp(Mu), model_matrix, do_cox_reid_adjustment, log_thetas)
   spl <- spline(log_thetas, log_likes, n = 1000)
   est <- exp(spl$x[which.max(spl$y)])
   list(estimate = est, iterations = 10, message = "global estimate using spline interpolation")

diff --git a/src/Makevars b/src/Makevars
@@ -1,3 +1,3 @@
-CXX_STD = CXX11
+CXX_STD = CXX17
 PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
 PKG_CPPFLAGS =  -I../inst/include/
diff --git a/src/Makevars.win b/src/Makevars.win
@@ -1,4 +1,4 @@
-CXX_STD = CXX11
+CXX_STD = CXX17
 PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
 PKG_LIBS = $(shell $(R_HOME)/bin${R_ARCH_BIN}/Rscript.exe -e "Rcpp:::LdFlags()") $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
 PKG_CPPFLAGS =  -I../inst/include/
diff --git a/src/beta_estimation.cpp b/src/beta_estimation.cpp
@@ -1,7 +1,6 @@
 // #include <Rcpp.h>
 #include <RcppArmadillo.h>
-#include "beachmat/numeric_matrix.h"
-#include "beachmat/integer_matrix.h"
+#include "Rtatami.h"
 
 #include <deviance.h>
 #include <fisher_scoring_steps.h>
@@ -188,14 +187,17 @@ double decrease_deviance_plus_ridge(/*In-Out Parameter*/ arma::vec& beta_hat,
 // fit the Negative Binomial GLM with Fisher scoring
 // note: the betas are on the natural log scale
 //
-template<class NumericType, class BMNumericType>
 List fitBeta_fisher_scoring_impl(RObject Y, const arma::mat& model_matrix, RObject exp_offset_matrix,
                                  NumericVector thetas, SEXP beta_matSEXP, Nullable<NumericMatrix> ridge_penalty_nl,
                                  double tolerance, double max_rel_mu_change, int max_iter, bool use_diagonal_approx) {
-  auto Y_bm = beachmat::create_matrix<BMNumericType>(Y);
-  auto exp_offsets_bm = beachmat::create_numeric_matrix(exp_offset_matrix);
-  int n_samples = Y_bm->get_ncol();
-  int n_genes = Y_bm->get_nrow();
+  Rtatami::BoundNumericPointer Y_bm_ptr(Y);
+  const auto& Y_bm = *(Y_bm_ptr->ptr);
+
+  Rtatami::BoundNumericPointer exp_offsets_bm_ptr(exp_offset_matrix);
+  const auto& exp_offsets_bm = *(exp_offsets_bm_ptr->ptr);
+
+  int n_samples = Y_bm.ncol();
+  int n_genes = Y_bm.nrow();
 
   // the ridge penalty
   bool apply_ridge_penalty = ridge_penalty_nl.isNotNull();
@@ -219,16 +221,24 @@ List fitBeta_fisher_scoring_impl(RObject Y, const arma::mat& model_matrix, RObje
   // The result
   arma::mat beta_mat = as<arma::mat>(beta_matSEXP);
 
+  auto Y_ext = tatami::consecutive_extractor<false>(&Y_bm, true, 0, n_genes);
+  auto exp_offsets_ext = tatami::consecutive_extractor<false>(&exp_offsets_bm, true, 0, n_genes);
+  arma::Col<double> counts(n_samples), exp_off(n_samples);
+
   // deviance, convergence and tolerance
   NumericVector iterations(n_genes);
   NumericVector deviance(n_genes);
+
   for (int gene_idx = 0; gene_idx < n_genes; gene_idx++) {
     if (gene_idx % 100 == 0) checkUserInterrupt();
-    // Fill count and offset vector from beachmat matrix
-    arma::Col<NumericType> counts(n_samples);
-    Y_bm->get_row(gene_idx, counts.begin());
-    arma::Col<double> exp_off(n_samples);
-    exp_offsets_bm->get_row(gene_idx, exp_off.begin());
+
+    // Fill count and offset vector from beachmat matrix. This requires copy_n
+    // to ensure that the arma vectors are actually filled.
+    auto cptr = Y_ext->fetch(counts.begin());
+    tatami::copy_n(cptr, n_samples, counts.begin());
+    auto eptr = exp_offsets_ext->fetch(exp_off.begin());
+    tatami::copy_n(eptr, n_samples, exp_off.begin());
+
     // Init beta and mu
     arma::vec beta_hat = beta_mat.row(gene_idx).t();
     arma::vec mu_hat = calculate_mu(model_matrix, beta_hat, exp_off);
@@ -296,22 +306,11 @@ List fitBeta_fisher_scoring_impl(RObject Y, const arma::mat& model_matrix, RObje
 List fitBeta_fisher_scoring(RObject Y, const arma::mat& model_matrix, RObject exp_offset_matrix,
                                   NumericVector thetas, SEXP beta_matSEXP, Nullable<NumericMatrix> ridge_penalty_nl,
                                   double tolerance, double max_rel_mu_change, int max_iter) {
-  auto mattype=beachmat::find_sexp_type(Y);
-  if (mattype==INTSXP) {
-    return fitBeta_fisher_scoring_impl<int, beachmat::integer_matrix>(Y, model_matrix, exp_offset_matrix,
-                                                                      thetas,  beta_matSEXP,
-                                                                      /*ridge_penalty=*/ ridge_penalty_nl,
-                                                                      tolerance, max_rel_mu_change, max_iter,
-                                                                      /*use_diagonal_approx=*/ false);
-  } else if (mattype==REALSXP) {
-    return fitBeta_fisher_scoring_impl<double, beachmat::numeric_matrix>(Y, model_matrix, exp_offset_matrix,
-                                                                         thetas,  beta_matSEXP,
-                                                                         /*ridge_penalty=*/ ridge_penalty_nl,
-                                                                         tolerance, max_rel_mu_change, max_iter,
-                                                                         /*use_diagonal_approx=*/ false);
-  } else {
-    throw std::runtime_error("unacceptable matrix type");
-  }
+  return fitBeta_fisher_scoring_impl(Y, model_matrix, exp_offset_matrix,
+                                     thetas,  beta_matSEXP,
+                                     /*ridge_penalty=*/ ridge_penalty_nl,
+                                     tolerance, max_rel_mu_change, max_iter,
+                                     /*use_diagonal_approx=*/ false);
 }
 
 
@@ -320,47 +319,51 @@ List fitBeta_fisher_scoring(RObject Y, const arma::mat& model_matrix, RObject ex
 List fitBeta_diagonal_fisher_scoring(RObject Y, const arma::mat& model_matrix, RObject exp_offset_matrix,
                                      NumericVector thetas, SEXP beta_matSEXP,
                                      double tolerance, double max_rel_mu_change, int max_iter) {
-  auto mattype=beachmat::find_sexp_type(Y);
-  if (mattype==INTSXP) {
-    return fitBeta_fisher_scoring_impl<int, beachmat::integer_matrix>(Y, model_matrix, exp_offset_matrix,
-                                                                      thetas,  beta_matSEXP,
-                                                                      /*ridge_penalty=*/ R_NilValue,
-                                                                      tolerance, max_rel_mu_change, max_iter,
-                                                                      /*use_diagonal_approx=*/ true);
-  } else if (mattype==REALSXP) {
-    return fitBeta_fisher_scoring_impl<double, beachmat::numeric_matrix>(Y, model_matrix, exp_offset_matrix,
-                                                                         thetas,  beta_matSEXP,
-                                                                         /*ridge_penalty=*/ R_NilValue,
-                                                                         tolerance, max_rel_mu_change, max_iter,
-                                                                         /*use_diagonal_approx=*/ true);
-  } else {
-    throw std::runtime_error("unacceptable matrix type");
-  }
+  return fitBeta_fisher_scoring_impl(Y, model_matrix, exp_offset_matrix,
+                                     thetas,  beta_matSEXP,
+                                     /*ridge_penalty=*/ R_NilValue,
+                                     tolerance, max_rel_mu_change, max_iter,
+                                     /*use_diagonal_approx=*/ true);
 }
 
 
 
 // If there is only one group, there is no need to do the full Fisher-scoring
 // Instead a simple Newton-Raphson algorithm will do
-template<class NumericType>
-List fitBeta_one_group_internal(SEXP Y_SEXP, SEXP offsets_SEXP,
-                       NumericVector thetas, NumericVector beta_start_values,
-                       double tolerance, int maxIter) {
-  auto Y_bm = beachmat::create_matrix<NumericType>(Y_SEXP);
-
-  auto offsets_bm = beachmat::create_numeric_matrix(offsets_SEXP);
-  int n_samples = Y_bm->get_ncol();
-  int n_genes = Y_bm->get_nrow();
+//
+//[[Rcpp::export(rng = false)]]
+List fitBeta_one_group(RObject Y, RObject offset_matrix, NumericVector thetas, NumericVector beta_start_values, double tolerance, int maxIter) {
+  Rtatami::BoundNumericPointer Y_bm_ptr(Y);
+  const auto& Y_bm = *(Y_bm_ptr->ptr);
+
+  Rtatami::BoundNumericPointer offsets_bm_ptr(offset_matrix);
+  const auto& offsets_bm = *(offsets_bm_ptr->ptr);
+
+  int n_samples = Y_bm.ncol();
+  int n_genes = Y_bm.nrow();
   NumericVector result(n_genes);
   IntegerVector iterations(n_genes);
   NumericVector deviance(n_genes);
 
   Environment glmGamPoiEnv = Environment::namespace_env("glmGamPoi");
   Function estimate_betas_group_wise_optimize_helper = glmGamPoiEnv["estimate_betas_group_wise_optimize_helper"];
 
+  auto Y_ext = tatami::consecutive_extractor<false>(&Y_bm, true, 0, n_genes);
+  auto offset_ext = tatami::consecutive_extractor<false>(&offsets_bm, true, 0, n_genes);
+  Rcpp::NumericVector counts_vec(n_samples), off_vec(n_samples);
+
   for(int gene_idx = 0; gene_idx < n_genes; gene_idx++){
     if (gene_idx % 100 == 0) checkUserInterrupt();
 
+    // This must be run before any continue statement, otherwise we're not respecting 
+    // our promise to extract consecutive genes in the consecutive_extractor() call.
+    //
+    // Also note that this function returns pointers that may not actually fill the
+    // *_vec vectors, e.g., row-major matrices where a pointer to the underlying
+    // array can be directly returned. If *_vec must be filled, use tatami::copy_n.
+    auto counts = Y_ext->fetch(gene_idx, counts_vec.begin());
+    auto off = offset_ext->fetch(gene_idx, off_vec.begin());
+
     double beta = beta_start_values(gene_idx);
     const double& theta = thetas(gene_idx);
     if(Rcpp::traits::is_na<REALSXP>(beta) || Rcpp::traits::is_na<REALSXP>(theta)){
@@ -371,10 +374,6 @@ List fitBeta_one_group_internal(SEXP Y_SEXP, SEXP offsets_SEXP,
       continue;
     }
 
-    typename NumericType::vector counts(n_samples);
-    Y_bm->get_row(gene_idx, counts.begin());
-    NumericVector off(n_samples);
-    offsets_bm->get_row(gene_idx, off.begin());
     // Newton-Raphson
     int iter = 0;
     for(; iter < maxIter; iter++){
@@ -403,8 +402,11 @@ List fitBeta_one_group_internal(SEXP Y_SEXP, SEXP offsets_SEXP,
       }
     }
     if(iter == maxIter || Rcpp::traits::is_nan<REALSXP>(beta)){
+      // Make sure we actually populate the vectors before sending them over to R.
+      tatami::copy_n(counts, n_samples, counts_vec.begin());
+      tatami::copy_n(off, n_samples, off_vec.begin());
       // Not converged -> try again with optimize()
-      beta =  Rcpp::as<double>(estimate_betas_group_wise_optimize_helper(counts, off, theta));
+      beta =  Rcpp::as<double>(estimate_betas_group_wise_optimize_helper(counts_vec, off_vec, theta));
     }
     result(gene_idx) = beta;
     iterations(gene_idx) = iter;
@@ -420,26 +422,3 @@ List fitBeta_one_group_internal(SEXP Y_SEXP, SEXP offsets_SEXP,
     Named("deviance", deviance)
   );
 }
-
-// [[Rcpp::export(rng = false)]]
-List fitBeta_one_group(RObject Y, RObject offset_matrix,
-                        NumericVector thetas, NumericVector beta_start_values,
-                        double tolerance, int maxIter) {
-  auto mattype=beachmat::find_sexp_type(Y);
-  if (mattype==INTSXP) {
-    return fitBeta_one_group_internal<beachmat::integer_matrix>(Y, offset_matrix, thetas, beta_start_values, tolerance, maxIter);
-  } else if (mattype==REALSXP) {
-    return fitBeta_one_group_internal<beachmat::numeric_matrix>(Y, offset_matrix, thetas, beta_start_values, tolerance, maxIter);
-  } else {
-    throw std::runtime_error("unacceptable matrix type");
-  }
-}
-
-
-
-
-
-
-
-
-