Skip to content

Commit

Permalink
Merge pull request #66 from LTLA/tatamize
Browse files Browse the repository at this point in the history
Switch from the old beachmat API to the new tatami library.
  • Loading branch information
const-ae authored Nov 26, 2024
2 parents 7566610 + 53254a4 commit 590440f
Show file tree
Hide file tree
Showing 11 changed files with 136 additions and 157 deletions.
7 changes: 4 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Description: Fit linear models to overdispersed count data.
typically occur in single cell RNA-seq experiments.
License: GPL-3
Encoding: UTF-8
SystemRequirements: C++11
SystemRequirements: C++17
Suggests:
testthat (>= 2.1.0),
zoo,
Expand All @@ -37,7 +37,8 @@ Suggests:
LinkingTo:
Rcpp,
RcppArmadillo,
beachmat
beachmat,
assorthead
Imports:
Rcpp,
DelayedMatrixStats,
Expand All @@ -56,7 +57,7 @@ Imports:
rlang,
vctrs
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
URL: https://github.com/const-ae/glmGamPoi
BugReports: https://github.com/const-ae/glmGamPoi/issues
biocViews: Regression, RNASeq, Software, SingleCell
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import(stats)
importFrom(BiocGenerics,end)
importFrom(BiocGenerics,width)
importFrom(SummarizedExperiment,assay)
importFrom(beachmat,initializeCpp)
importFrom(methods,as)
importFrom(methods,canCoerce)
importFrom(methods,is)
Expand Down
12 changes: 9 additions & 3 deletions R/estimate_betas.R
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ estimate_betas_roughly <- function(Y, model_matrix, offset_matrix, pseudo_count
#' * `iterations` the number of iterations
#'
#' @keywords internal
#' @importFrom beachmat initializeCpp
estimate_betas_fisher_scoring <- function(Y, model_matrix, offset_matrix,
dispersions, beta_mat_init, ridge_penalty,
try_recovering_convergence_problems = TRUE){
Expand All @@ -58,7 +59,8 @@ estimate_betas_fisher_scoring <- function(Y, model_matrix, offset_matrix,
attr(ridge_penalty, "target") <- ridge_target
}

betaRes <- fitBeta_fisher_scoring(Y, model_matrix, exp(offset_matrix), dispersions, beta_mat_init,
exp_offset_matrix <- exp(offset_matrix)
betaRes <- fitBeta_fisher_scoring(initializeCpp(Y), model_matrix, initializeCpp(exp_offset_matrix), dispersions, beta_mat_init,
ridge_penalty_nl = ridge_penalty, tolerance = 1e-8,
max_rel_mu_change = 1e5, max_iter = max_iter)
not_converged <- betaRes$iter == max_iter
Expand Down Expand Up @@ -179,6 +181,7 @@ estimate_betas_roughly_group_wise <- function(Y, offset_matrix, groups){
#' * `deviances` the deviance for each gene (sum of the deviance per group)
#'
#' @keywords internal
#' @importFrom beachmat initializeCpp
estimate_betas_group_wise <- function(Y, offset_matrix, dispersions, beta_group_init = NULL, beta_mat_init = NULL, groups, model_matrix){
stopifnot(nrow(beta_group_init) == nrow(Y))
stopifnot(ncol(beta_group_init) == length(unique(groups)))
Expand All @@ -192,8 +195,11 @@ estimate_betas_group_wise <- function(Y, offset_matrix, dispersions, beta_group
}

Beta_res_list <- lapply(unique(groups), function(gr){
betaRes <- fitBeta_one_group(Y[, gr == groups, drop = FALSE],
offset_matrix[, gr == groups, drop = FALSE], thetas = dispersions,
chosen <- gr == groups
Y_gr <- Y[, chosen, drop = FALSE]
offset_gr <- offset_matrix[, chosen, drop = FALSE]
betaRes <- fitBeta_one_group(initializeCpp(Y_gr),
initializeCpp(offset_gr), thetas = dispersions,
beta_start_values = beta_group_init[, gr == unique(groups),drop=TRUE],
tolerance = 1e-8, maxIter = 100)
})
Expand Down
7 changes: 4 additions & 3 deletions R/overdispersion.R
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
#'
#' @seealso [glm_gp()]
#' @export
#' @importFrom beachmat initializeCpp
overdispersion_mle <- function(y, mean,
model_matrix = NULL,
do_cox_reid_adjustment = ! is.null(model_matrix),
Expand Down Expand Up @@ -110,7 +111,7 @@ overdispersion_mle <- function(y, mean,
estimate_global_overdispersion(y, mean, model_matrix, do_cox_reid_adjustment)
}else{
# This function calls overdispersion_mle() for each row, but is faster than a vapply()
estimate_overdispersions_fast(y, mean, model_matrix, do_cox_reid_adjustment, n_subsamples, max_iter)
estimate_overdispersions_fast(initializeCpp(y), initializeCpp(mean), model_matrix, do_cox_reid_adjustment, n_subsamples, max_iter)
}
}else{
overdispersion_mle_impl(as.numeric(y), mean, model_matrix, do_cox_reid_adjustment,
Expand Down Expand Up @@ -254,13 +255,13 @@ conventional_overdispersion_mle <- function(y, mean_vector,
}



#' @importFrom beachmat initializeCpp
estimate_global_overdispersion <- function(Y, Mu, model_matrix, do_cox_reid_adjustment){
# The idea to calculate the log-likelihood and than maximize the interpolation
# is from edgeR.
# The runtime is linear with the number of `log_thetas`
log_thetas <- seq(-6, 1, length.out = 10)
log_likes <- estimate_global_overdispersions_fast(Y, Mu, model_matrix, do_cox_reid_adjustment, log_thetas)
log_likes <- estimate_global_overdispersions_fast(initializeCpp(Y), initializeCpp(Mu), model_matrix, do_cox_reid_adjustment, log_thetas)
spl <- spline(log_thetas, log_likes, n = 1000)
est <- exp(spl$x[which.max(spl$y)])
list(estimate = est, iterations = 10, message = "global estimate using spline interpolation")
Expand Down
2 changes: 1 addition & 1 deletion src/Makevars
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
CXX_STD = CXX11
CXX_STD = CXX17
PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
PKG_CPPFLAGS = -I../inst/include/
2 changes: 1 addition & 1 deletion src/Makevars.win
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
CXX_STD = CXX11
CXX_STD = CXX17
PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
PKG_LIBS = $(shell $(R_HOME)/bin${R_ARCH_BIN}/Rscript.exe -e "Rcpp:::LdFlags()") $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
PKG_CPPFLAGS = -I../inst/include/
141 changes: 60 additions & 81 deletions src/beta_estimation.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
// #include <Rcpp.h>
#include <RcppArmadillo.h>
#include "beachmat/numeric_matrix.h"
#include "beachmat/integer_matrix.h"
#include "Rtatami.h"

#include <deviance.h>
#include <fisher_scoring_steps.h>
Expand Down Expand Up @@ -188,14 +187,17 @@ double decrease_deviance_plus_ridge(/*In-Out Parameter*/ arma::vec& beta_hat,
// fit the Negative Binomial GLM with Fisher scoring
// note: the betas are on the natural log scale
//
template<class NumericType, class BMNumericType>
List fitBeta_fisher_scoring_impl(RObject Y, const arma::mat& model_matrix, RObject exp_offset_matrix,
NumericVector thetas, SEXP beta_matSEXP, Nullable<NumericMatrix> ridge_penalty_nl,
double tolerance, double max_rel_mu_change, int max_iter, bool use_diagonal_approx) {
auto Y_bm = beachmat::create_matrix<BMNumericType>(Y);
auto exp_offsets_bm = beachmat::create_numeric_matrix(exp_offset_matrix);
int n_samples = Y_bm->get_ncol();
int n_genes = Y_bm->get_nrow();
Rtatami::BoundNumericPointer Y_bm_ptr(Y);
const auto& Y_bm = *(Y_bm_ptr->ptr);

Rtatami::BoundNumericPointer exp_offsets_bm_ptr(exp_offset_matrix);
const auto& exp_offsets_bm = *(exp_offsets_bm_ptr->ptr);

int n_samples = Y_bm.ncol();
int n_genes = Y_bm.nrow();

// the ridge penalty
bool apply_ridge_penalty = ridge_penalty_nl.isNotNull();
Expand All @@ -219,16 +221,24 @@ List fitBeta_fisher_scoring_impl(RObject Y, const arma::mat& model_matrix, RObje
// The result
arma::mat beta_mat = as<arma::mat>(beta_matSEXP);

auto Y_ext = tatami::consecutive_extractor<false>(&Y_bm, true, 0, n_genes);
auto exp_offsets_ext = tatami::consecutive_extractor<false>(&exp_offsets_bm, true, 0, n_genes);
arma::Col<double> counts(n_samples), exp_off(n_samples);

// deviance, convergence and tolerance
NumericVector iterations(n_genes);
NumericVector deviance(n_genes);

for (int gene_idx = 0; gene_idx < n_genes; gene_idx++) {
if (gene_idx % 100 == 0) checkUserInterrupt();
// Fill count and offset vector from beachmat matrix
arma::Col<NumericType> counts(n_samples);
Y_bm->get_row(gene_idx, counts.begin());
arma::Col<double> exp_off(n_samples);
exp_offsets_bm->get_row(gene_idx, exp_off.begin());

// Fill count and offset vector from beachmat matrix. This requires copy_n
// to ensure that the arma vectors are actually filled.
auto cptr = Y_ext->fetch(counts.begin());
tatami::copy_n(cptr, n_samples, counts.begin());
auto eptr = exp_offsets_ext->fetch(exp_off.begin());
tatami::copy_n(eptr, n_samples, exp_off.begin());

// Init beta and mu
arma::vec beta_hat = beta_mat.row(gene_idx).t();
arma::vec mu_hat = calculate_mu(model_matrix, beta_hat, exp_off);
Expand Down Expand Up @@ -296,22 +306,11 @@ List fitBeta_fisher_scoring_impl(RObject Y, const arma::mat& model_matrix, RObje
List fitBeta_fisher_scoring(RObject Y, const arma::mat& model_matrix, RObject exp_offset_matrix,
NumericVector thetas, SEXP beta_matSEXP, Nullable<NumericMatrix> ridge_penalty_nl,
double tolerance, double max_rel_mu_change, int max_iter) {
auto mattype=beachmat::find_sexp_type(Y);
if (mattype==INTSXP) {
return fitBeta_fisher_scoring_impl<int, beachmat::integer_matrix>(Y, model_matrix, exp_offset_matrix,
thetas, beta_matSEXP,
/*ridge_penalty=*/ ridge_penalty_nl,
tolerance, max_rel_mu_change, max_iter,
/*use_diagonal_approx=*/ false);
} else if (mattype==REALSXP) {
return fitBeta_fisher_scoring_impl<double, beachmat::numeric_matrix>(Y, model_matrix, exp_offset_matrix,
thetas, beta_matSEXP,
/*ridge_penalty=*/ ridge_penalty_nl,
tolerance, max_rel_mu_change, max_iter,
/*use_diagonal_approx=*/ false);
} else {
throw std::runtime_error("unacceptable matrix type");
}
return fitBeta_fisher_scoring_impl(Y, model_matrix, exp_offset_matrix,
thetas, beta_matSEXP,
/*ridge_penalty=*/ ridge_penalty_nl,
tolerance, max_rel_mu_change, max_iter,
/*use_diagonal_approx=*/ false);
}


Expand All @@ -320,47 +319,51 @@ List fitBeta_fisher_scoring(RObject Y, const arma::mat& model_matrix, RObject ex
List fitBeta_diagonal_fisher_scoring(RObject Y, const arma::mat& model_matrix, RObject exp_offset_matrix,
NumericVector thetas, SEXP beta_matSEXP,
double tolerance, double max_rel_mu_change, int max_iter) {
auto mattype=beachmat::find_sexp_type(Y);
if (mattype==INTSXP) {
return fitBeta_fisher_scoring_impl<int, beachmat::integer_matrix>(Y, model_matrix, exp_offset_matrix,
thetas, beta_matSEXP,
/*ridge_penalty=*/ R_NilValue,
tolerance, max_rel_mu_change, max_iter,
/*use_diagonal_approx=*/ true);
} else if (mattype==REALSXP) {
return fitBeta_fisher_scoring_impl<double, beachmat::numeric_matrix>(Y, model_matrix, exp_offset_matrix,
thetas, beta_matSEXP,
/*ridge_penalty=*/ R_NilValue,
tolerance, max_rel_mu_change, max_iter,
/*use_diagonal_approx=*/ true);
} else {
throw std::runtime_error("unacceptable matrix type");
}
return fitBeta_fisher_scoring_impl(Y, model_matrix, exp_offset_matrix,
thetas, beta_matSEXP,
/*ridge_penalty=*/ R_NilValue,
tolerance, max_rel_mu_change, max_iter,
/*use_diagonal_approx=*/ true);
}



// If there is only one group, there is no need to do the full Fisher-scoring
// Instead a simple Newton-Raphson algorithm will do
template<class NumericType>
List fitBeta_one_group_internal(SEXP Y_SEXP, SEXP offsets_SEXP,
NumericVector thetas, NumericVector beta_start_values,
double tolerance, int maxIter) {
auto Y_bm = beachmat::create_matrix<NumericType>(Y_SEXP);

auto offsets_bm = beachmat::create_numeric_matrix(offsets_SEXP);
int n_samples = Y_bm->get_ncol();
int n_genes = Y_bm->get_nrow();
//
//[[Rcpp::export(rng = false)]]
List fitBeta_one_group(RObject Y, RObject offset_matrix, NumericVector thetas, NumericVector beta_start_values, double tolerance, int maxIter) {
Rtatami::BoundNumericPointer Y_bm_ptr(Y);
const auto& Y_bm = *(Y_bm_ptr->ptr);

Rtatami::BoundNumericPointer offsets_bm_ptr(offset_matrix);
const auto& offsets_bm = *(offsets_bm_ptr->ptr);

int n_samples = Y_bm.ncol();
int n_genes = Y_bm.nrow();
NumericVector result(n_genes);
IntegerVector iterations(n_genes);
NumericVector deviance(n_genes);

Environment glmGamPoiEnv = Environment::namespace_env("glmGamPoi");
Function estimate_betas_group_wise_optimize_helper = glmGamPoiEnv["estimate_betas_group_wise_optimize_helper"];

auto Y_ext = tatami::consecutive_extractor<false>(&Y_bm, true, 0, n_genes);
auto offset_ext = tatami::consecutive_extractor<false>(&offsets_bm, true, 0, n_genes);
Rcpp::NumericVector counts_vec(n_samples), off_vec(n_samples);

for(int gene_idx = 0; gene_idx < n_genes; gene_idx++){
if (gene_idx % 100 == 0) checkUserInterrupt();

// This must be run before any continue statement, otherwise we're not respecting
// our promise to extract consecutive genes in the consecutive_extractor() call.
//
// Also note that this function returns pointers that may not actually fill the
// *_vec vectors, e.g., row-major matrices where a pointer to the underlying
// array can be directly returned. If *_vec must be filled, use tatami::copy_n.
auto counts = Y_ext->fetch(gene_idx, counts_vec.begin());
auto off = offset_ext->fetch(gene_idx, off_vec.begin());

double beta = beta_start_values(gene_idx);
const double& theta = thetas(gene_idx);
if(Rcpp::traits::is_na<REALSXP>(beta) || Rcpp::traits::is_na<REALSXP>(theta)){
Expand All @@ -371,10 +374,6 @@ List fitBeta_one_group_internal(SEXP Y_SEXP, SEXP offsets_SEXP,
continue;
}

typename NumericType::vector counts(n_samples);
Y_bm->get_row(gene_idx, counts.begin());
NumericVector off(n_samples);
offsets_bm->get_row(gene_idx, off.begin());
// Newton-Raphson
int iter = 0;
for(; iter < maxIter; iter++){
Expand Down Expand Up @@ -403,8 +402,11 @@ List fitBeta_one_group_internal(SEXP Y_SEXP, SEXP offsets_SEXP,
}
}
if(iter == maxIter || Rcpp::traits::is_nan<REALSXP>(beta)){
// Make sure we actually populate the vectors before sending them over to R.
tatami::copy_n(counts, n_samples, counts_vec.begin());
tatami::copy_n(off, n_samples, off_vec.begin());
// Not converged -> try again with optimize()
beta = Rcpp::as<double>(estimate_betas_group_wise_optimize_helper(counts, off, theta));
beta = Rcpp::as<double>(estimate_betas_group_wise_optimize_helper(counts_vec, off_vec, theta));
}
result(gene_idx) = beta;
iterations(gene_idx) = iter;
Expand All @@ -420,26 +422,3 @@ List fitBeta_one_group_internal(SEXP Y_SEXP, SEXP offsets_SEXP,
Named("deviance", deviance)
);
}

// [[Rcpp::export(rng = false)]]
List fitBeta_one_group(RObject Y, RObject offset_matrix,
NumericVector thetas, NumericVector beta_start_values,
double tolerance, int maxIter) {
auto mattype=beachmat::find_sexp_type(Y);
if (mattype==INTSXP) {
return fitBeta_one_group_internal<beachmat::integer_matrix>(Y, offset_matrix, thetas, beta_start_values, tolerance, maxIter);
} else if (mattype==REALSXP) {
return fitBeta_one_group_internal<beachmat::numeric_matrix>(Y, offset_matrix, thetas, beta_start_values, tolerance, maxIter);
} else {
throw std::runtime_error("unacceptable matrix type");
}
}









Loading

0 comments on commit 590440f

Please sign in to comment.