From 58581eb89f2599c8e3567f6c00f90a5c2439c1ac Mon Sep 17 00:00:00 2001 From: Frank Weber <55132727+fweber144@users.noreply.github.com> Date: Mon, 11 Oct 2021 18:05:35 +0200 Subject: [PATCH] Miscellaneous modifications and fixes (again) (#234) * Remove `as.matrix.list()` (neither used nor needed). * Define the default `baseline` in the `vsel` methods directly (instead of using `NULL`) and remove the `is.null()` case in `.validate_baseline()`. * Docs: Mention that for argument `baseline`, the best submodel is decided in terms of `stats[1]`. * Docs of `summary.vsel()`: Minor wording improvements for argument `type`. * Docs of argument `baseline`: Mention the relevance. * Simplify `cvfun()` in `get_refmodel.stanreg()`. * Set a visible default for argument `K` of `cv_varsel()`. * Improve y-axis label in `plot.vsel()`. * There are no Gaussian-only `stats` (at least it isn't implemented this way). Therefore, adapt the tests. * Fix issue #210. * Adapt the tests to the fix for issue #210. * Harmonize definitions and usages of `%%` binary infix operators. * Unify the equivalent special binary (infix) operators `%||%` and `%ORifNULL%`. * Avoid function definitions without curly braces because they're not shown in RStudio's document outline. * Remove `.get_proj_handle()` (not necessary and had an incorrect default for `regul`). * `search_forward()`: Remove unused argument `increasing_order`. * `projfun.R`: Enhance comments. * `predict.subfit()`: Don't access `subfit$x` if not necessary. * Minor cleaning. * Remove more unused methods. * `get_refmodel.stanreg()`: Throw an error if **rstanarm** is not installed. * Replace "search path" by "solution path" (at least where relevant and easily possible) to avoid a (purely optical) confusion with "search part". --- NAMESPACE | 6 -- R/cv_varsel.R | 49 ++++++------- R/divergence_minimizers.R | 3 +- R/gamm4.R | 4 +- R/methods.R | 106 ++++++++++++++--------------- R/misc.R | 45 ++++++------ R/project.R | 6 +- R/projfun.R | 43 ++++-------- R/refmodel.R | 10 +-- R/search.R | 15 ++-- R/varsel.R | 4 +- man/cv_varsel.Rd | 6 +- man/plot.vsel.Rd | 6 +- man/suggest_size.Rd | 6 +- man/summary.vsel.Rd | 13 ++-- tests/testthat/helpers/testers.R | 80 +++++++++++++++++----- tests/testthat/setup.R | 4 +- tests/testthat/test_datafit.R | 2 +- tests/testthat/test_methods_vsel.R | 3 +- tests/testthat/test_varsel.R | 10 +-- 20 files changed, 218 insertions(+), 203 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 69f9e6470..9f1e39b4b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,11 +3,9 @@ S3method(as.matrix,gamm4) S3method(as.matrix,glm) S3method(as.matrix,glmerMod) -S3method(as.matrix,list) S3method(as.matrix,lm) S3method(as.matrix,lmerMod) S3method(as.matrix,projection) -S3method(as.matrix,ridgelm) S3method(as.matrix,subfit) S3method(coef,subfit) S3method(cv_varsel,default) @@ -24,10 +22,6 @@ S3method(solution_terms,projection) S3method(solution_terms,vsel) S3method(suggest_size,vsel) S3method(summary,vsel) -S3method(t,glm) -S3method(t,list) -S3method(t,lm) -S3method(t,ridgelm) S3method(varsel,default) S3method(varsel,refmodel) export("%>%") diff --git a/R/cv_varsel.R b/R/cv_varsel.R index 1c641cd08..5c8a9d8fb 100644 --- a/R/cv_varsel.R +++ b/R/cv_varsel.R @@ -24,9 +24,7 @@ #' `NULL`, all observations are used, but for faster experimentation, one can #' set this to a smaller value. #' @param K Only relevant if `cv_method == "kfold"`. Number of folds in the -#' \eqn{K}-fold CV. If `NULL`, then `5` is used for genuine reference models -#' (i.e., of class `refmodel`) and `10` for `datafit`s (that is, for penalized -#' maximum likelihood estimation). +#' \eqn{K}-fold CV. #' @param validate_search Only relevant if `cv_method == "LOO"`. A single #' logical value indicating whether to cross-validate also the search part, #' i.e., whether to run the search separately for each CV fold (`TRUE`) or not @@ -103,17 +101,27 @@ cv_varsel.default <- function(object, ...) { #' @rdname cv_varsel #' @export cv_varsel.refmodel <- function( - object, method = NULL, + object, + method = NULL, cv_method = if (!inherits(object, "datafit")) "LOO" else "kfold", - ndraws = 20, nclusters = NULL, - ndraws_pred = 400, nclusters_pred = NULL, + ndraws = 20, + nclusters = NULL, + ndraws_pred = 400, + nclusters_pred = NULL, cv_search = !inherits(object, "datafit"), - nterms_max = NULL, penalty = NULL, + nterms_max = NULL, + penalty = NULL, verbose = TRUE, - nloo = NULL, K = NULL, lambda_min_ratio = 1e-5, - nlambda = 150, thresh = 1e-6, regul = 1e-4, - validate_search = TRUE, seed = NULL, - search_terms = NULL, ... + nloo = NULL, + K = if (!inherits(object, "datafit")) 5 else 10, + lambda_min_ratio = 1e-5, + nlambda = 150, + thresh = 1e-6, + regul = 1e-4, + validate_search = TRUE, + seed = NULL, + search_terms = NULL, + ... ) { refmodel <- object ## resolve the arguments similar to varsel @@ -279,23 +287,16 @@ parse_args_cv_varsel <- function(refmodel, cv_method, K, cv_method <- "kfold" } - if (!is.null(K)) { - if (length(K) > 1 || !is.numeric(K) || K != round(K)) { - stop("K must be a single integer value.") + if (cv_method == "kfold") { + stopifnot(!is.null(K)) + if (length(K) > 1 || !is.numeric(K) || !.is.wholenumber(K)) { + stop("`K` must be a single integer value.") } if (K < 2) { - stop("K must be at least 2.") + stop("`K` must be at least 2.") } if (K > NROW(refmodel$y)) { - stop("K cannot exceed the number of observations.") - } - } - - if (cv_method == "kfold" && is.null(K)) { - if (inherits(refmodel, "datafit")) { - K <- 10 - } else { - K <- 5 + stop("`K` cannot exceed the number of observations.") } } diff --git a/R/divergence_minimizers.R b/R/divergence_minimizers.R index 9a5798f80..9c9fe27fd 100644 --- a/R/divergence_minimizers.R +++ b/R/divergence_minimizers.R @@ -272,12 +272,11 @@ subprd <- function(fit, newdata = NULL) { predict.subfit <- function(subfit, newdata = NULL) { beta <- subfit$beta alpha <- subfit$alpha - x <- subfit$x if (is.null(newdata)) { if (is.null(beta)) { return(as.matrix(rep(alpha, NROW(subfit$x)))) } else { - return(x %*% rbind(alpha, beta)) + return(subfit$x %*% rbind(alpha, beta)) } } else { contrasts_arg <- get_contrasts_arg_list(subfit$formula, newdata) diff --git a/R/gamm4.R b/R/gamm4.R index eee88029c..ee84b257d 100644 --- a/R/gamm4.R +++ b/R/gamm4.R @@ -7,7 +7,7 @@ gamm4.setup <- function(formula, pterms, data = NULL, knots = NULL) { formula$response <- 1 pterms$response <- 1 } - gam.setup <- `%:::%`("mgcv", "gam.setup") + gam.setup <- "mgcv" %:::% "gam.setup" G <- gam.setup(formula, pterms, data = data, knots = knots, sp = NULL, min.sp = NULL, @@ -160,7 +160,7 @@ model.matrix.gamm4 <- function(formula, random = NULL, data = NULL, names(dl) <- vars ## list of all variables needed ## summarize the input data - variable.summary <- `%:::%`("mgcv", "variable.summary") + variable.summary <- "mgcv" %:::% "variable.summary" var.summary <- variable.summary(gp$pf, dl, nrow(mf)) ## lmer offset handling work around... diff --git a/R/methods.R b/R/methods.R index 29e22d816..2347c1361 100644 --- a/R/methods.R +++ b/R/methods.R @@ -294,6 +294,9 @@ proj_predict_aux <- function(proj, mu, weights, ...) { #' #' @inheritParams summary.vsel #' @param x An object of class `vsel` (returned by [varsel()] or [cv_varsel()]). +#' @param baseline Either `"ref"` or `"best"` indicating whether the baseline is +#' the reference model or the best submodel (in terms of `stats[1]`), +#' respectively. #' #' @examples #' if (requireNamespace("rstanarm", quietly = TRUE)) { @@ -317,9 +320,15 @@ proj_predict_aux <- function(proj, mu, weights, ...) { #' } #' #' @export -plot.vsel <- function(x, nterms_max = NULL, stats = "elpd", - deltas = FALSE, alpha = 0.32, baseline = NULL, - ...) { +plot.vsel <- function( + x, + nterms_max = NULL, + stats = "elpd", + deltas = FALSE, + alpha = 0.32, + baseline = if (!inherits(x$refmodel, "datafit")) "ref" else "best", + ... +) { object <- x .validate_vsel_object_stats(object, stats) baseline <- .validate_baseline(object$refmodel, baseline, deltas) @@ -360,7 +369,16 @@ plot.vsel <- function(x, nterms_max = NULL, stats = "elpd", if (nterms_max < 1) { stop("nterms_max must be at least 1") } - ylab <- if (deltas) "Difference to the baseline" else "Value" + if (baseline == "ref") { + baseline_pretty <- "reference model" + } else { + baseline_pretty <- "best submodel" + } + if (deltas) { + ylab <- paste0("Difference vs.", baseline_pretty) + } else { + ylab <- "Value" + } # make sure that breaks on the x-axis are integers n_opts <- c(4, 5, 6) @@ -434,18 +452,18 @@ plot.vsel <- function(x, nterms_max = NULL, stats = "elpd", #' `"diff"`, and `"diff.se"` indicating which of these to compute (mean, #' standard error, lower and upper credible bounds, difference to the #' corresponding statistic of the reference model, and standard error of this -#' difference). The credible bounds are determined so that `1 - alpha` percent -#' of the probability mass falls between them. Items `"diff"` and `"diff.se"` -#' are only supported if `!deltas`. +#' difference, respectively). The credible bounds are determined so that `1 - +#' alpha` percent of the probability mass falls between them. Items `"diff"` +#' and `"diff.se"` are only supported if `deltas` is `FALSE`. #' @param deltas If `TRUE`, the submodel statistics are estimated relative to #' the baseline model (see argument `baseline`) instead of estimating the #' actual values of the statistics. #' @param alpha A number giving the desired coverage of the credible intervals. #' For example, `alpha = 0.32` corresponds to 68% probability mass within the #' intervals, that is, one-standard-error intervals. -#' @param baseline Either `"ref"` or `"best"` indicating whether the baseline is -#' the reference model or the best submodel found, respectively. If `NULL`, -#' then `"ref"` is used, except for `datafit`s for which `"best"` is used. +#' @param baseline Only relevant if `deltas` is `TRUE`. Either `"ref"` or +#' `"best"` indicating whether the baseline is the reference model or the best +#' submodel (in terms of `stats[1]`), respectively. #' @param ... Currently ignored. #' #' @examples @@ -470,9 +488,16 @@ plot.vsel <- function(x, nterms_max = NULL, stats = "elpd", #' } #' #' @export -summary.vsel <- function(object, nterms_max = NULL, stats = "elpd", - type = c("mean", "se", "diff", "diff.se"), - deltas = FALSE, alpha = 0.32, baseline = NULL, ...) { +summary.vsel <- function( + object, + nterms_max = NULL, + stats = "elpd", + type = c("mean", "se", "diff", "diff.se"), + deltas = FALSE, + alpha = 0.32, + baseline = if (!inherits(object$refmodel, "datafit")) "ref" else "best", + ... +) { .validate_vsel_object_stats(object, stats) baseline <- .validate_baseline(object$refmodel, baseline, deltas) @@ -658,7 +683,6 @@ print.vsel <- function(x, ...) { #' results via [plot.vsel()] and/or [summary.vsel()] and make the final decision #' based on what is most appropriate for the given problem. #' -#' @inheritParams summary.vsel #' @param object An object of class `vsel` (returned by [varsel()] or #' [cv_varsel()]). #' @param stat Statistic used for the decision. See [summary.vsel()] for @@ -673,6 +697,9 @@ print.vsel <- function(x, ...) { #' @param type Either `"upper"` or `"lower"` determining whether the decisions #' are based on the upper or lower credible bounds, respectively. See section #' "Details" for more information. +#' @param baseline Either `"ref"` or `"best"` indicating whether the baseline is +#' the reference model or the best submodel (in terms of `stats[1]`), +#' respectively. #' @param warnings Whether to give warnings if automatic suggestion fails, #' mainly for internal use. Usually there is no reason to set this to `FALSE`. #' @param ... Currently ignored. @@ -735,9 +762,16 @@ suggest_size <- function(object, ...) { #' @rdname suggest_size #' @export -suggest_size.vsel <- function(object, stat = "elpd", alpha = 0.32, pct = 0, - type = "upper", baseline = NULL, warnings = TRUE, - ...) { +suggest_size.vsel <- function( + object, + stat = "elpd", + alpha = 0.32, + pct = 0, + type = "upper", + baseline = if (!inherits(object$refmodel, "datafit")) "ref" else "best", + warnings = TRUE, + ... +) { .validate_vsel_object_stats(object, stat) if (length(stat) > 1) { stop("Only one statistic can be specified to suggest_size") @@ -824,13 +858,6 @@ as.matrix.lm <- function(x, ...) { replace_population_names()) } -#' @method as.matrix ridgelm -#' @keywords internal -#' @export -as.matrix.ridgelm <- function(x, ...) { - return(as.matrix.lm(x)) -} - #' @method as.matrix subfit #' @keywords internal #' @export @@ -979,37 +1006,6 @@ as.matrix.gamm4 <- function(x, ...) { return(as.matrix.lm(x, ...)) } -#' @method as.matrix list -#' @keywords internal -#' @export -as.matrix.list <- function(x, ...) { - return(do.call(cbind, lapply(x, as.matrix.glm, ...))) -} - -#' @keywords internal -#' @export -t.glm <- function(x, ...) { - return(t(as.matrix(x), ...)) -} - -#' @keywords internal -#' @export -t.lm <- function(x, ...) { - return(t(as.matrix(x), ...)) -} - -#' @keywords internal -#' @export -t.ridgelm <- function(x, ...) { - return(t(as.matrix(x), ...)) -} - -#' @keywords internal -#' @export -t.list <- function(x, ...) { - return(t(as.matrix(x), ...)) -} - #' Extract projected parameter draws #' #' This is the [as.matrix()] method for `projection` objects (returned by diff --git a/R/misc.R b/R/misc.R index c154377a3..e83ed05bb 100644 --- a/R/misc.R +++ b/R/misc.R @@ -87,10 +87,9 @@ bootstrap <- function(x, fun = mean, b = 1000, oobfun = NULL, seed = NULL, } } -# from rstanarm -`%ORifNULL%` <- function(a, b) if (is.null(a)) b else a - -.is.wholenumber <- function(x) abs(x - round(x)) < .Machine$double.eps^0.5 +.is.wholenumber <- function(x) { + abs(x - round(x)) < .Machine$double.eps^0.5 +} .validate_num_folds <- function(k, n) { if (!is.numeric(k) || length(k) != 1 || !.is.wholenumber(k)) { @@ -128,22 +127,15 @@ bootstrap <- function(x, fun = mean, b = 1000, oobfun = NULL, seed = NULL, } .validate_baseline <- function(refmodel, baseline, deltas) { - if (is.null(baseline)) { - if (inherits(refmodel, "datafit")) { - baseline <- "best" - } else { - baseline <- "ref" - } - } else { - if (!(baseline %in% c("ref", "best"))) { - stop("Argument 'baseline' must be either 'ref' or 'best'.") - } - if (baseline == "ref" && deltas == TRUE && inherits(refmodel, "datafit")) { - # no reference model (or the results missing for some other reason), - # so cannot compute differences between the reference model and submodels - stop("Cannot use deltas = TRUE and baseline = 'ref' when there is no ", - "reference model.") - } + stopifnot(!is.null(baseline)) + if (!(baseline %in% c("ref", "best"))) { + stop("Argument 'baseline' must be either 'ref' or 'best'.") + } + if (baseline == "ref" && deltas == TRUE && inherits(refmodel, "datafit")) { + # no reference model (or the results missing for some other reason), + # so cannot compute differences between the reference model and submodels + stop("Cannot use deltas = TRUE and baseline = 'ref' when there is no ", + "reference model.") } return(baseline) } @@ -344,8 +336,9 @@ nlist <- function(...) { dots } -## ifelse operator -"%||%" <- function(x, y) { +# The `%||%` special binary (infix) operator from brms (equivalent to the +# `%ORifNULL%` operator from rstanarm): +`%||%` <- function(x, y) { if (is.null(x)) x <- y x } @@ -405,7 +398,7 @@ eval2 <- function(expr, envir = parent.frame(), ...) { eval(expr, envir, ...) } -# coerce 'x' to a single character string +# coerce `x` to a single character string as_one_character <- function(x, allow_na = FALSE) { s <- substitute(x) x <- as.character(x) @@ -443,7 +436,7 @@ magrittr::`%>%` # Function where() is not exported by package tidyselect, so redefine it here to # avoid a note in R CMD check which would occur for usage of # tidyselect:::where(): -where <- `%:::%`("tidyselect", "where") +where <- "tidyselect" %:::% "where" get_as.matrix_cls_projpred <- function() { ### Only works when projpred is loaded via devtools::load_all(): @@ -463,4 +456,6 @@ get_as.matrix_cls_projpred <- function() { ## Helper function extract and combine mu and lppd from K lists with each ## n/K of the elements to one list with n elements -hf <- function(x) as.list(do.call(rbind, x)) +hf <- function(x) { + as.list(do.call(rbind, x)) +} diff --git a/R/project.R b/R/project.R index 5f11289fd..8731faa57 100644 --- a/R/project.R +++ b/R/project.R @@ -135,9 +135,9 @@ project <- function(object, nterms = NULL, solution_terms = NULL, any( solution_terms(object)[seq_along(solution_terms)] != solution_terms )) { - warning("The given `solution_terms` are not part of the search path (from ", - "`solution_terms(object)`), so `cv_search` is automatically set ", - "to `TRUE`.") + warning("The given `solution_terms` are not part of the solution path ", + "(from `solution_terms(object)`), so `cv_search` is automatically ", + "set to `TRUE`.") cv_search <- TRUE } diff --git a/R/projfun.R b/R/projfun.R index d0e1eba8f..7374b879b 100644 --- a/R/projfun.R +++ b/R/projfun.R @@ -1,6 +1,7 @@ -## Function handles for the projection -## - +# Function to project the reference model onto a single submodel with predictor +# terms given in `solution_terms`. Note that "single submodel" does not refer to +# a single fit (there are as many fits for this single submodel as there are +# projected draws). project_submodel <- function(solution_terms, p_ref, refmodel, family, intercept, regul = 1e-4) { validparams <- .validate_wobs_wsample(refmodel$wobs, p_ref$weights, p_ref$mu) @@ -28,27 +29,11 @@ project_submodel <- function(solution_terms, p_ref, refmodel, family, intercept, )) } -## function handle for the projection over samples -.get_proj_handle <- function(refmodel, p_ref, family, regul = 1e-9, - intercept = TRUE) { - return(function(solution_terms) { - project_submodel( - solution_terms = solution_terms, p_ref = p_ref, refmodel = refmodel, - family = family, intercept = intercept, regul = regul - ) - }) -} - +# Function to project the reference model onto the submodels of given model +# sizes `nterms`. Returns a list of submodels (each processed by +# .init_submodel()). .get_submodels <- function(search_path, nterms, family, p_ref, refmodel, intercept, regul, cv_search = FALSE) { - ## - ## - ## Project onto given model sizes nterms. Returns a list of submodels. If - ## cv_search=FALSE, submodels parameters will be as they were computed during - ## the search, so there is no need to project anything anymore, and this - ## function simply fetches the information from the search_path list, which - ## contains the parameter values. - varorder <- search_path$solution_terms if (!cv_search) { @@ -73,14 +58,15 @@ project_submodel <- function(solution_terms, p_ref, refmodel, family, intercept, } } else { ## need to project again for each submodel size - projfun <- .get_proj_handle(refmodel, p_ref, family, regul, intercept) fetch_submodel <- function(nterms) { - solution_terms <- utils::head(varorder, nterms) - return(projfun(solution_terms)) + project_submodel( + solution_terms = utils::head(varorder, nterms), p_ref = p_ref, + refmodel = refmodel, family = family, intercept = intercept, + regul = regul + ) } } - submodels <- lapply(nterms, fetch_submodel) - return(submodels) + return(lapply(nterms, fetch_submodel)) } .validate_wobs_wsample <- function(ref_wobs, ref_wsample, ref_mu) { @@ -120,6 +106,5 @@ project_submodel <- function(solution_terms, p_ref, refmodel, family, intercept, nlist(mu, dis) ), wsample) weights <- wsample - submodel <- nlist(dis, kl, weights, solution_terms, sub_fit) - return(submodel) + return(nlist(dis, kl, weights, solution_terms, sub_fit)) } diff --git a/R/refmodel.R b/R/refmodel.R index f32e8baaf..b5ba668d5 100644 --- a/R/refmodel.R +++ b/R/refmodel.R @@ -363,6 +363,10 @@ get_refmodel.default <- function(object, formula, family = NULL, ...) { #' @rdname refmodel-init-get #' @export get_refmodel.stanreg <- function(object, ...) { + if (!requireNamespace("rstanarm", quietly = TRUE)) { + stop("Please install the 'rstanarm' package.") + } + # Family ------------------------------------------------------------------ family <- family(object) @@ -495,10 +499,8 @@ get_refmodel.stanreg <- function(object, ...) { } cvfun <- function(folds, ...) { - cvres <- rstanarm::kfold(object, K = max(folds), save_fits = TRUE, - folds = folds, ...) - fits <- cvres$fits[, "fit"] - return(fits) + rstanarm::kfold(object, K = max(folds), save_fits = TRUE, + folds = folds, ...)$fits[, "fit"] } # Miscellaneous ----------------------------------------------------------- diff --git a/R/search.R b/R/search.R index 6cf4cacb1..866b478b3 100644 --- a/R/search.R +++ b/R/search.R @@ -1,9 +1,5 @@ search_forward <- function(p_ref, refmodel, family, intercept, nterms_max, - verbose = TRUE, opt, search_terms = NULL, - increasing_order = TRUE) { - # projfun() performs the projection: - projfun <- .get_proj_handle(refmodel, p_ref, family, opt$regul, intercept) - + verbose = TRUE, opt, search_terms = NULL) { formula <- refmodel$formula iq <- ceiling(quantile(seq_len(nterms_max), 1:10 / 10)) if (is.null(search_terms)) { @@ -22,7 +18,9 @@ search_forward <- function(p_ref, refmodel, family, intercept, nterms_max, if (is.null(cands)) next full_cands <- lapply(cands, function(cand) c(chosen, cand)) - subL <- lapply(full_cands, projfun) + subL <- lapply(full_cands, project_submodel, + p_ref = p_ref, refmodel = refmodel, family = family, + intercept = intercept, regul = opt$regul) ## select best candidate imin <- which.min(sapply(subL, "[[", "kl")) @@ -162,6 +160,7 @@ search_L1 <- function(p_ref, refmodel, family, intercept, nterms_max, penalty, if (nterms == 0) { formula <- make_formula(c("1")) beta <- NULL + x <- x[, numeric(), drop = FALSE] } else { formula <- make_formula(solution_terms[seq_len(nterms)]) variables <- unlist(lapply( @@ -180,6 +179,10 @@ search_L1 <- function(p_ref, refmodel, family, intercept, nterms_max, penalty, )) indices <- match(variables, colnames(x)[search_path$solution_terms]) beta <- search_path$beta[indices, max(indices) + 1, drop = FALSE] + # Also reduce `x` (important for coef.subfit(), for example); note that + # `x <- x[, variables, drop = FALSE]` should also be possible, but the + # re-use of `colnames(x)` should provide another sanity check: + x <- x[, colnames(x)[search_path$solution_terms[indices]], drop = FALSE] } sub <- nlist( alpha = search_path$alpha[nterms + 1], diff --git a/R/varsel.R b/R/varsel.R index c15791467..4aa4b8412 100644 --- a/R/varsel.R +++ b/R/varsel.R @@ -266,8 +266,8 @@ select <- function(method, p_sel, refmodel, family, intercept, nterms_max, ## last three ## are returned only if one cluster projection is used for selection): ## solution_terms: the variable ordering - ## beta: coefficients along the search path - ## alpha: intercepts along the search path + ## beta: coefficients along the solution path + ## alpha: intercepts along the solution path ## p_sel: the reference distribution used in the selection (the input ## argument p_sel) ## diff --git a/man/cv_varsel.Rd b/man/cv_varsel.Rd index a406f1ede..34f674f05 100644 --- a/man/cv_varsel.Rd +++ b/man/cv_varsel.Rd @@ -23,7 +23,7 @@ cv_varsel(object, ...) penalty = NULL, verbose = TRUE, nloo = NULL, - K = NULL, + K = if (!inherits(object, "datafit")) 5 else 10, lambda_min_ratio = 1e-05, nlambda = 150, thresh = 1e-06, @@ -100,9 +100,7 @@ faster computation but higher uncertainty in the evaluation part. If set this to a smaller value.} \item{K}{Only relevant if \code{cv_method == "kfold"}. Number of folds in the -\eqn{K}-fold CV. If \code{NULL}, then \code{5} is used for genuine reference models -(i.e., of class \code{refmodel}) and \code{10} for \code{datafit}s (that is, for penalized -maximum likelihood estimation).} +\eqn{K}-fold CV.} \item{lambda_min_ratio}{Only relevant if \code{method} turns out as \code{"L1"}. Ratio between the smallest and largest lambda in the L1-penalized search. This diff --git a/man/plot.vsel.Rd b/man/plot.vsel.Rd index 9891a3fa1..555dcc85b 100644 --- a/man/plot.vsel.Rd +++ b/man/plot.vsel.Rd @@ -10,7 +10,7 @@ stats = "elpd", deltas = FALSE, alpha = 0.32, - baseline = NULL, + baseline = if (!inherits(x$refmodel, "datafit")) "ref" else "best", ... ) } @@ -44,8 +44,8 @@ For example, \code{alpha = 0.32} corresponds to 68\% probability mass within the intervals, that is, one-standard-error intervals.} \item{baseline}{Either \code{"ref"} or \code{"best"} indicating whether the baseline is -the reference model or the best submodel found, respectively. If \code{NULL}, -then \code{"ref"} is used, except for \code{datafit}s for which \code{"best"} is used.} +the reference model or the best submodel (in terms of \code{stats[1]}), +respectively.} \item{...}{Currently ignored.} } diff --git a/man/suggest_size.Rd b/man/suggest_size.Rd index 869fef362..926eda589 100644 --- a/man/suggest_size.Rd +++ b/man/suggest_size.Rd @@ -13,7 +13,7 @@ suggest_size(object, ...) alpha = 0.32, pct = 0, type = "upper", - baseline = NULL, + baseline = if (!inherits(object$refmodel, "datafit")) "ref" else "best", warnings = TRUE, ... ) @@ -41,8 +41,8 @@ are based on the upper or lower credible bounds, respectively. See section "Details" for more information.} \item{baseline}{Either \code{"ref"} or \code{"best"} indicating whether the baseline is -the reference model or the best submodel found, respectively. If \code{NULL}, -then \code{"ref"} is used, except for \code{datafit}s for which \code{"best"} is used.} +the reference model or the best submodel (in terms of \code{stats[1]}), +respectively.} \item{warnings}{Whether to give warnings if automatic suggestion fails, mainly for internal use. Usually there is no reason to set this to \code{FALSE}.} diff --git a/man/summary.vsel.Rd b/man/summary.vsel.Rd index fd39aeaea..8ade10b4f 100644 --- a/man/summary.vsel.Rd +++ b/man/summary.vsel.Rd @@ -11,7 +11,7 @@ type = c("mean", "se", "diff", "diff.se"), deltas = FALSE, alpha = 0.32, - baseline = NULL, + baseline = if (!inherits(object$refmodel, "datafit")) "ref" else "best", ... ) } @@ -41,9 +41,8 @@ family only); \code{"diff"}, and \code{"diff.se"} indicating which of these to compute (mean, standard error, lower and upper credible bounds, difference to the corresponding statistic of the reference model, and standard error of this -difference). The credible bounds are determined so that \code{1 - alpha} percent -of the probability mass falls between them. Items \code{"diff"} and \code{"diff.se"} -are only supported if \code{!deltas}.} +difference, respectively). The credible bounds are determined so that \code{1 - alpha} percent of the probability mass falls between them. Items \code{"diff"} +and \code{"diff.se"} are only supported if \code{deltas} is \code{FALSE}.} \item{deltas}{If \code{TRUE}, the submodel statistics are estimated relative to the baseline model (see argument \code{baseline}) instead of estimating the @@ -53,9 +52,9 @@ actual values of the statistics.} For example, \code{alpha = 0.32} corresponds to 68\% probability mass within the intervals, that is, one-standard-error intervals.} -\item{baseline}{Either \code{"ref"} or \code{"best"} indicating whether the baseline is -the reference model or the best submodel found, respectively. If \code{NULL}, -then \code{"ref"} is used, except for \code{datafit}s for which \code{"best"} is used.} +\item{baseline}{Only relevant if \code{deltas} is \code{TRUE}. Either \code{"ref"} or +\code{"best"} indicating whether the baseline is the reference model or the best +submodel (in terms of \code{stats[1]}), respectively.} \item{...}{Currently ignored.} } diff --git a/tests/testthat/helpers/testers.R b/tests/testthat/helpers/testers.R index fcc6f2d97..7a7fd3b70 100644 --- a/tests/testthat/helpers/testers.R +++ b/tests/testthat/helpers/testers.R @@ -241,7 +241,7 @@ refmodel_tester <- function( drop = FALSE ] ### TODO (GAMs and GAMMs): Do this manually: - mm_s <- `%:::%`("rstanarm", "pp_data")(refmod$fit)$x + mm_s <- ("rstanarm" %:::% "pp_data")(refmod$fit)$x mm_s <- mm_s[, grep("^s\\(", colnames(mm_s), value = TRUE), drop = FALSE] ### @@ -423,9 +423,9 @@ refmodel_tester <- function( # expected to be of class `"gam"` or `"gamm4"` (depending on whether the # submodel is non-multilevel or multilevel, respectively). # @param wobs_expected The expected numeric vector of observation weights. -# @param ref_formul The formula of the reference model . Should only be needed -# if `sub_fit_totest` comes from the L1 `search_path` of an object of class -# `"vsel"`. Otherwise, use `NULL`. +# @param solterms_vsel_L1_search If `sub_fit_totest` comes from the L1 +# `search_path` of an object of class `"vsel"`, provide here the solution +# terms. Otherwise, use `NULL`. # @param with_offs A single logical value indicating whether `sub_fit_totest` is # expected to include offsets (`TRUE`) or not (`FALSE`). # @param info_str A single character string giving information to be printed in @@ -441,14 +441,14 @@ sub_fit_tester <- function( has_grp = formula_contains_group_terms(sub_formul[[1]]), has_add = formula_contains_additive_terms(sub_formul[[1]]), wobs_expected = wobs_tst, - ref_formul = NULL, + solterms_vsel_L1_search = NULL, with_offs = FALSE, info_str ) { expect_type(sub_fit_totest, "list") expect_length(sub_fit_totest, nprjdraws_expected) - from_vsel_L1_search <- !is.null(ref_formul) + from_vsel_L1_search <- !is.null(solterms_vsel_L1_search) seq_extensive_tests <- unique(round( seq(1, length(sub_fit_totest), @@ -473,11 +473,7 @@ sub_fit_tester <- function( } else { ncoefs <- 0L } - if (!from_vsel_L1_search) { - formul_for_mm <- sub_formul[[1]] - } else { - formul_for_mm <- ref_formul - } + formul_for_mm <- sub_formul[[1]] sub_trms_for_mm <- labels(terms(formul_for_mm)) if (length(sub_trms_for_mm) > 0) { if (any(grepl("xca\\.", sub_trms_for_mm))) { @@ -497,6 +493,56 @@ sub_fit_tester <- function( sub_x_expected <- model.matrix(update(formul_for_mm, NULL ~ . + 0), data = sub_data, contrasts.arg = sub_contr) + if (from_vsel_L1_search) { + attr(sub_x_expected, "assign") <- NULL + attr(sub_x_expected, "contrasts") <- NULL + # Unfortunately, model.matrix() uses terms() and there seems to be no way + # to set `keep.order = TRUE` in that internal terms() call. Thus, we have + # to reorder the columns manually: + if (length(solterms_vsel_L1_search) > 0) { + contr_list <- get_contrasts_arg_list( + as.formula(paste("~", + paste(solterms_vsel_L1_search, collapse = " + "))), + data = sub_data + ) + terms_contr <- names(contr_list) + terms_contr_expd <- lapply(solterms_vsel_L1_search, function(term_crr) { + if (!term_crr %in% terms_contr) { + return(term_crr) + } else { + if (sum(terms_contr == term_crr) != 1) { + stop("The following code is not general enough. It needs to be ", + "adapted.") + } + return(paste0( + terms_contr[terms_contr == term_crr], + rownames(contr_list[[which(terms_contr == term_crr)]]) + )) + } + }) + terms_contr_expd <- unlist(terms_contr_expd) + colnms_x <- colnames(sub_x_expected) + colnms_x <- unlist(lapply(colnms_x, function(colnm_x) { + if (!colnm_x %in% terms_contr_expd) { + colon_found <- gregexpr(":", colnm_x) + if (length(colon_found) != 1 || colon_found == -1) { + stop("The following code is not general enough. It needs to be ", + "adapted.") + } + return(paste(rev(strsplit(colnm_x, ":")[[1]]), collapse = ":")) + } else { + return(colnm_x) + } + })) + colnames(sub_x_expected) <- colnms_x + sub_x_expected <- sub_x_expected[ + , + colnames(sub_x_expected)[order(match(colnames(sub_x_expected), + terms_contr_expd))], + drop = FALSE + ] + } + } if (from_vsel_L1_search) { subfit_nms <- setdiff(subfit_nms, "y") } @@ -778,10 +824,10 @@ projection_tester <- function(p, } if (!from_vsel_L1_search) { y_nm <- as.character(p$refmodel$formula)[2] - ref_formul_crr <- NULL + solterms_vsel_L1_search_crr <- NULL } else { y_nm <- "" - ref_formul_crr <- p$refmodel$formula + solterms_vsel_L1_search_crr <- p$solution_terms } y_nms <- paste0(".", y_nm) # A preliminary check for `nprjdraws_expected` (doesn't work for "datafit"s @@ -826,7 +872,7 @@ projection_tester <- function(p, sub_data = sub_data_crr, sub_fam = p$family$family, wobs_expected = p$refmodel$wobs, - ref_formul = ref_formul_crr, + solterms_vsel_L1_search = solterms_vsel_L1_search_crr, info_str = info_str) # dis @@ -1088,10 +1134,10 @@ vsel_tester <- function( nprjdraws_expected <- ncol(clust_ref$mu) if (!from_vsel_L1_search) { y_nm <- as.character(vs$refmodel$formula)[2] - ref_formul_crr <- NULL + solterms_vsel_L1_search_crr <- NULL } else { y_nm <- "" - ref_formul_crr <- vs$refmodel$formula + solterms_vsel_L1_search_crr <- vs$solution_terms } y_nms <- paste0(".", y_nm) if (nprjdraws_expected > 1) { @@ -1120,7 +1166,7 @@ vsel_tester <- function( sub_data = sub_data_crr, sub_fam = vs$family$family, wobs_expected = vs$refmodel$wobs, - ref_formul = ref_formul_crr, + solterms_vsel_L1_search = solterms_vsel_L1_search_crr, info_str = paste(info_str, i, sep = "__") ) } diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R index 6645e95ac..55d942da9 100644 --- a/tests/testthat/setup.R +++ b/tests/testthat/setup.R @@ -537,11 +537,10 @@ cvmeth_tst <- list( ) vsel_funs <- nlist("summary.vsel", "plot.vsel", "suggest_size.vsel") -stats_common <- c("elpd", "mlpd") +stats_common <- c("elpd", "mlpd", "mse", "rmse") stats_tst <- list( default_stats = list(), common_stats = list(stats = stats_common), - gauss_stats = list(stats = c(stats_common, c("mse", "rmse"))), binom_stats = list(stats = c(stats_common, c("acc", "auc"))) ) type_tst <- c("mean", "lower", "upper", "se") @@ -851,7 +850,6 @@ cre_args_smmry_vsel <- function(tstsetups_smmry_vsel) { fam_crr <- args_obj[[tstsetup_vsel]]$fam_nm add_stats <- switch(mod_crr, "glm" = switch(fam_crr, - "gauss" = "gauss_stats", "brnll" = "binom_stats", "binom" = "binom_stats", "common_stats"), diff --git a/tests/testthat/test_datafit.R b/tests/testthat/test_datafit.R index 8ffa2ccff..11fd6ca30 100644 --- a/tests/testthat/test_datafit.R +++ b/tests/testthat/test_datafit.R @@ -557,7 +557,7 @@ test_that(paste( stats_expected = stats_common, type_expected = type_tst, cv_method_expected = - args_cvvs_datafit[[tstsetup]]$cv_method %ORifNULL% "LOO", + args_cvvs_datafit[[tstsetup]]$cv_method %||% "LOO", solterms_expected = cvvss_datafit[[tstsetup]]$solution_terms ) if (run_snaps) { diff --git a/tests/testthat/test_methods_vsel.R b/tests/testthat/test_methods_vsel.R index 09845bc49..5fb9f10a5 100644 --- a/tests/testthat/test_methods_vsel.R +++ b/tests/testthat/test_methods_vsel.R @@ -93,7 +93,7 @@ test_that(paste( type_expected = args_smmry_cvvs[[tstsetup]]$type, nterms_max_expected = args_smmry_cvvs[[tstsetup]]$nterms_max, cv_method_expected = - args_cvvs[[tstsetup_cvvs]]$cv_method %ORifNULL% "LOO", + args_cvvs[[tstsetup_cvvs]]$cv_method %||% "LOO", solterms_expected = cvvss[[tstsetup_cvvs]]$solution_terms ) } @@ -268,7 +268,6 @@ test_that("`stat` works", { tstsetup_vs <- args_smmry_vs[[tstsetup]]$tstsetup_vsel fam_crr <- args_vs[[tstsetup_vs]]$fam_nm stat_crr_nm <- switch(fam_crr, - "gauss" = "gauss_stats", "brnll" = "binom_stats", "binom" = "binom_stats", "common_stats") diff --git a/tests/testthat/test_varsel.R b/tests/testthat/test_varsel.R index 4350fcee2..522edec92 100644 --- a/tests/testthat/test_varsel.R +++ b/tests/testthat/test_varsel.R @@ -701,15 +701,15 @@ test_that("`validate_search` works", { test_that("invalid `K` fails", { expect_error(cv_varsel(refmods[[1]], cv_method = "kfold", K = 1), - "^K must be at least 2\\.$") + "^`K` must be at least 2\\.$") expect_error(cv_varsel(refmods[[1]], cv_method = "kfold", K = 1000), - "^K cannot exceed the number of observations\\.$") + "^`K` cannot exceed the number of observations\\.$") expect_error(cv_varsel(refmods[[1]], cv_method = "kfold", K = c(4, 9)), - "^K must be a single integer value\\.$") + "^`K` must be a single integer value\\.$") expect_error(cv_varsel(refmods[[1]], cv_method = "kfold", K = "a"), - "^K must be a single integer value\\.$") + "^`K` must be a single integer value\\.$") expect_error(cv_varsel(refmods[[1]], cv_method = "kfold", K = dat), - "^K must be a single integer value\\.$") + "^`K` must be a single integer value\\.$") }) test_that("`cvfits` (actually passed to init_refmodel()) works", {