Merge pull request #341 from fweber144/d_test_sub_fix
Fixes and other improvements for argument `d_test` of `varsel()`
fweber144 authored Jul 14, 2022
2 parents fc54766 + 3aba937 commit 3142d8d
Showing 9 changed files with 323 additions and 90 deletions.
3 changes: 3 additions & 0 deletions NEWS.md
@@ -5,6 +5,8 @@
* Several improvements in the documentation (especially in the explanation of the `suggest_size()` heuristic).
* At multiple places throughout the package, the numerical stability of some link functions has been improved by avoiding unnecessary back-and-forth transformations between latent space and response space. (GitHub: #337, #338)
* All arguments `seed` and `.seed` may now be `NA`, in which case `set.seed()` is not called internally at all.
* Argument `d_test` of `varsel()` is no longer considered an internal feature. This became possible after fixing a bug for `d_test` (see below).
* If `<vsel_object>` was created by a call to `cv_varsel([...], cv_method = "kfold")`, the order of the observations in the subelements of `<vsel_object>$summaries` and `<vsel_object>$d_test` now corresponds to the order of the observations in the original dataset (formerly, those subelements were ordered by fold in that case). Thus, the observations in those subelements now always follow the order of the original dataset, with one exception: if `<vsel_object>` was created by a call to `varsel([...], d_test = <non-NULL_d_test_object>)`, they follow the order of the observations in `<non-NULL_d_test_object>`.

## Bug fixes

@@ -16,6 +18,7 @@
* Fix GitHub issue #331.
* `plot.vsel()` now draws the dashed red horizontal line for the reference model (and---if present---the dotted black horizontal line for the baseline model) first (i.e., before the submodel-specific graphical elements), to avoid overplotting.
* Fix GitHub issue #339. (GitHub: #340)
* Fix argument `d_test` of `varsel()`: not only the predictive performance of the *reference model* but also that of the *submodels* needs to be evaluated on the test data.

# projpred 2.1.2

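As a quick illustration of the new ordering guarantee described in the NEWS entries above, here is a minimal sketch in which `refm_fit` stands in for a hypothetical reference model fit (e.g., from rstanarm):

```r
library(projpred)
refm <- get_refmodel(refm_fit)  # `refm_fit` is a hypothetical reference model fit
cvvs <- cv_varsel(refm, cv_method = "kfold", K = 2)
# After this PR, the subelements of `cvvs$summaries` and `cvvs$d_test` follow
# the original dataset's observation order (formerly, they were ordered by fold):
stopifnot(identical(cvvs$d_test$y, refm$y))
```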
40 changes: 23 additions & 17 deletions R/cv_varsel.R
@@ -481,9 +481,9 @@ loo_varsel <- function(refmodel, method, nterms_max, ndraws,
p_ref = p_pred, refmodel = refmodel, regul = opt$regul,
refit_prj = refit_prj, ...
)
summaries_sub <- .get_sub_summaries(
submodels = submodels, test_points = c(i), refmodel = refmodel
)
summaries_sub <- .get_sub_summaries(submodels = submodels,
refmodel = refmodel,
test_points = i)
for (k in seq_along(summaries_sub)) {
loo_sub[[k]][i] <- summaries_sub[[k]]$lppd
mu_sub[[k]][i] <- summaries_sub[[k]]$mu
@@ -516,12 +516,8 @@ loo_varsel <- function(refmodel, method, nterms_max, ndraws,
summ_ref <- list(lppd = loo_ref, mu = mu_ref)
summaries <- list(sub = summ_sub, ref = summ_ref)

d_test <- list(
y = refmodel$y, type = "LOO",
test_points = seq_along(refmodel$y),
weights = refmodel$wobs,
data = NULL, offset = refmodel$offset
)
d_test <- list(type = "LOO", data = NULL, offset = refmodel$offset,
weights = refmodel$wobs, y = refmodel$y)

out_list <- nlist(solution_terms_cv = solution_terms_mat, summaries, d_test)
if (!validate_search) {
@@ -606,10 +602,9 @@ kfold_varsel <- function(refmodel, method, nterms_max, ndraws,
# Perform the evaluation of the submodels for each fold (and make sure to
# combine the results from the K folds into a single results list):
get_summaries_submodel_cv <- function(submodels, fold) {
.get_sub_summaries(
submodels = submodels, test_points = fold$d_test$omitted,
refmodel = refmodel
)
.get_sub_summaries(submodels = submodels,
refmodel = refmodel,
test_points = fold$d_test$omitted)
}
sub_cv_summaries <- mapply(get_summaries_submodel_cv, submodels_cv, list_cv)
if (is.null(dim(sub_cv_summaries))) {
@@ -618,7 +613,14 @@ kfold_varsel <- function(refmodel, method, nterms_max, ndraws,
dim(sub_cv_summaries) <- rev(summ_dim)
}
sub <- apply(sub_cv_summaries, 1, rbind2list)
idxs_sorted_by_fold <- unlist(lapply(list_cv, function(fold) {
fold$d_test$omitted
}))
sub <- lapply(sub, function(summ) {
summ$mu <- summ$mu[order(idxs_sorted_by_fold)]
summ$lppd <- summ$lppd[order(idxs_sorted_by_fold)]

# Add weights (see GitHub issue #330 for why this needs to be clarified):
summ$w <- rep(1, length(summ$mu))
summ$w <- summ$w / sum(summ$w)
return(summ)
Expand All @@ -637,19 +639,23 @@ kfold_varsel <- function(refmodel, method, nterms_max, ndraws,
dis = fold$refmodel$dis
)
}))
ref$mu <- ref$mu[order(idxs_sorted_by_fold)]
ref$lppd <- ref$lppd[order(idxs_sorted_by_fold)]

# Combine the K separate test "datasets" (rather "information objects") into a
# single list:
d_cv <- rbind2list(lapply(list_cv, function(fold) {
list(y = fold$d_test$y,
list(offset = fold$d_test$offset,
weights = fold$d_test$weights,
test_points = fold$d_test$omitted,
offset = fold$d_test$offset)
y = fold$d_test$y)
}))
d_cv <- as.list(
as.data.frame(d_cv)[order(idxs_sorted_by_fold), , drop = FALSE]
)

return(nlist(solution_terms_cv,
summaries = nlist(sub, ref),
d_test = c(d_cv, type = "kfold")))
d_test = c(list(type = "kfold", data = NULL), d_cv)))
}

# Re-fit the reference model K times (once for each fold; `cvfun` case) or fetch
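A note on the reordering idiom introduced in the `kfold_varsel()` hunks above: `idxs_sorted_by_fold` holds the original row indices in fold order, so indexing a fold-ordered vector with `order(idxs_sorted_by_fold)` restores the original dataset's order. A toy sketch with hypothetical values:

```r
idxs_sorted_by_fold <- c(3, 5, 1, 4, 2)  # original row indices, in fold order
mu_fold_order <- c(30, 50, 10, 40, 20)   # per-observation summaries, in fold order
mu_fold_order[order(idxs_sorted_by_fold)]
#> [1] 10 20 30 40 50
```

The same trick is applied jointly to all elements of `d_cv` by converting the list of vectors to a `data.frame`, reordering its rows, and converting back to a list.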
4 changes: 4 additions & 0 deletions R/misc.R
@@ -4,6 +4,10 @@
packageStartupMessage(msg)
}

nms_d_test <- function() {
c("type", "data", "offset", "weights", "y")
}

weighted.sd <- function(x, w, na.rm = FALSE) {
if (na.rm) {
ind <- !is.na(w) & !is.na(x)
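The new helper `nms_d_test()` fixes the canonical element names (and their order) for `d_test` lists; `varsel()` uses it in the `R/varsel.R` hunk below to subset and reorder a user-supplied list in one step. A small sketch of that use:

```r
nms_d_test <- function() {
  c("type", "data", "offset", "weights", "y")
}
# A list with the expected elements in an arbitrary order:
d_test <- list(weights = rep(1, 3), y = c(0.2, 1.5, 0.7), type = "test",
               data = data.frame(x1 = 1:3), offset = rep(0, 3))
names(d_test[nms_d_test()])
#> [1] "type"    "data"    "offset"  "weights" "y"
```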
13 changes: 7 additions & 6 deletions R/summary_funs.R
@@ -1,13 +1,14 @@
.get_sub_summaries <- function(submodels, test_points, refmodel) {
.get_sub_summaries <- function(submodels, refmodel, test_points, newdata = NULL,
offset = refmodel$offset[test_points],
wobs = refmodel$wobs[test_points],
y = refmodel$y[test_points]) {
lapply(submodels, function(model) {
.weighted_summary_means(
y_test = list(y = refmodel$y[test_points],
weights = refmodel$wobs[test_points]),
y_test = list(y = y, weights = wobs),
family = refmodel$family,
wsample = model$weights,
mu = refmodel$family$mu_fun(model$submodl,
obs = test_points,
offset = refmodel$offset[test_points]),
mu = refmodel$family$mu_fun(model$submodl, obs = test_points,
newdata = newdata, offset = offset),
dis = model$dis
)
})
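The refactored `.get_sub_summaries()` relies on R's lazy evaluation of default arguments: `offset`, `wobs`, and `y` default to the training-data values at `test_points`, but `varsel()` can override them with test data (see the `R/varsel.R` hunk below). A minimal sketch of the pattern, using a hypothetical function `f()`:

```r
f <- function(refmodel, test_points, y = refmodel$y[test_points]) {
  # `y` defaults to the training response at `test_points`, but callers
  # may pass test-set values instead:
  y
}
refm <- list(y = c(10, 20, 30))
f(refm, test_points = c(1, 3))       #> [1] 10 30
f(refm, test_points = NULL, y = 99)  #> [1] 99
```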
44 changes: 29 additions & 15 deletions R/varsel.R
@@ -10,9 +10,10 @@
#' @param object An object of class `refmodel` (returned by [get_refmodel()] or
#' [init_refmodel()]) or an object that can be passed to argument `object` of
#' [get_refmodel()].
#' @param d_test For internal use only. A `list` providing information about the
#' test set which is used for evaluating the predictive performance of the
#' reference model. If not provided, the training set is used.
#' @param d_test A `list` of the structure outlined in section "Argument
#' `d_test`" below, providing test data for evaluating the predictive
#' performance of the submodels as well as of the reference model. If `NULL`,
#' the training data is used.
#' @param method The method for the search part. Possible options are `"L1"` for
#' L1 search and `"forward"` for forward search. If `NULL`, then internally,
#' `"L1"` is used, except if the reference model has multilevel or additive
Expand Down Expand Up @@ -82,6 +83,19 @@
#' minimizer (during a forward search and also during the evaluation part, but
#' the latter only if `refit_prj` is `TRUE`).
#'
#' @details
#'
#' # Argument `d_test`
#'
#' If not `NULL`, then `d_test` needs to be a `list` with the following
#' elements:
#' * `data`: a `data.frame` containing the predictor variables for the test set.
#' * `offset`: a numeric vector containing the offset values for the test set
#' (if there is no offset, use a vector of zeros).
#' * `weights`: a numeric vector containing the observation weights for the test
#' set (if there are no observation weights, use a vector of ones).
#' * `y`: a numeric vector containing the response values for the test set.
#'
#' @details Arguments `ndraws`, `nclusters`, `nclusters_pred`, and `ndraws_pred`
#' are automatically truncated at the number of posterior draws in the
#' reference model (which is `1` for `datafit`s). Using fewer draws or clusters
@@ -201,14 +215,11 @@ varsel.refmodel <- function(object, d_test = NULL, method = NULL,
search_terms <- args$search_terms

if (is.null(d_test)) {
d_type <- "train"
test_points <- seq_len(NROW(refmodel$y))
d_test <- nlist(
y = refmodel$y, test_points, data = NULL, weights = refmodel$wobs,
type = d_type, offset = refmodel$offset
)
d_test <- list(type = "train", data = NULL, offset = refmodel$offset,
weights = refmodel$wobs, y = refmodel$y)
} else {
d_type <- d_test$type
d_test$type <- "test"
d_test <- d_test[nms_d_test()]
}

## reference distributions for selection and prediction after selection
@@ -230,10 +241,13 @@
p_ref = p_pred, refmodel = refmodel, regul = regul, refit_prj = refit_prj,
...
)
sub <- .get_sub_summaries(
submodels = submodels, test_points = seq_along(refmodel$y),
refmodel = refmodel
)
sub <- .get_sub_summaries(submodels = submodels,
refmodel = refmodel,
test_points = NULL,
newdata = d_test$data,
offset = d_test$offset,
wobs = d_test$weights,
y = d_test$y)

## predictive statistics of the reference model on test data. if no test data
## are provided,
@@ -244,7 +258,7 @@ varsel.refmodel <- function(object, d_test = NULL, method = NULL,
ntest <- NROW(refmodel$y)
ref <- list(mu = rep(NA, ntest), lppd = rep(NA, ntest))
} else {
if (d_type == "train") {
if (d_test$type == "train") {
mu_test <- refmodel$mu
if (!all(refmodel$offset == 0)) {
mu_test <- refmodel$family$linkinv(
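Putting the new `d_test` documentation into practice, a minimal sketch of a `varsel()` call with test data (the objects `refm_fit` and `dat_test` are hypothetical):

```r
library(projpred)
d_test <- list(
  data = dat_test,                   # data.frame with the test-set predictors
  offset = rep(0, nrow(dat_test)),   # zeros, since there is no offset here
  weights = rep(1, nrow(dat_test)),  # ones, since there are no observation weights
  y = dat_test$y                     # test-set response values
)
vs <- varsel(refm_fit, d_test = d_test)
# The summaries now refer to `dat_test` (in its observation order), for the
# submodels as well as for the reference model:
stopifnot(length(vs$summaries$ref$mu) == nrow(dat_test))
```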
20 changes: 17 additions & 3 deletions man/varsel.Rd

(Generated file; diff not rendered by default.)

47 changes: 20 additions & 27 deletions tests/testthat/helpers/testers.R
@@ -1076,7 +1076,9 @@ pp_tester <- function(pp,
# @param with_cv A single logical value indicating whether `vs` was created by
# cv_varsel() (`TRUE`) or not (`FALSE`).
# @param refmod_expected The expected `vs$refmodel` object.
# @param dtest_expected The expected `vs$d_test` object.
# @param dtest_expected If `vs` was created with a non-`NULL` argument `d_test`
# (which is only possible for varsel()), then this needs to be the expected
# `vs$d_test` object. Otherwise, this needs to be `NULL`.
# @param solterms_len_expected A single numeric value giving the expected number
# of solution terms (not counting the intercept, even for the intercept-only
# model).
@@ -1122,8 +1124,6 @@ vsel_tester <- function(
info_str = ""
) {
# Preparations:
dtest_type <- "train"
dtest_nms <- c("y", "test_points", "data", "weights", "type", "offset")
if (with_cv) {
vsel_nms <- vsel_nms_cv
vsel_smmrs_sub_nms <- c("lppd", "mu", "w")
@@ -1136,13 +1136,7 @@
valsearch_expected <- TRUE
}

dtest_type <- cv_method_expected
if (cv_method_expected == "LOO") {
# Re-order:
dtest_nms <- dtest_nms[c(1, 5, 2, 4, 3, 6)]
} else if (cv_method_expected == "kfold") {
# Re-order and remove `"data"`:
dtest_nms <- dtest_nms[c(1, 4, 2, 6, 5)]
if (cv_method_expected == "kfold") {
# Re-order:
vsel_smmrs_sub_nms[1:2] <- vsel_smmrs_sub_nms[2:1]
vsel_smmrs_ref_nms[1:2] <- vsel_smmrs_ref_nms[2:1]
@@ -1263,21 +1257,16 @@
# d_test
if (is.null(dtest_expected)) {
expect_type(vs$d_test, "list")
expect_named(vs$d_test, dtest_nms, info = info_str)
if (identical(cv_method_expected, "kfold")) {
expect_identical(vs$d_test$y[order(vs$d_test$test_points)],
vs$refmodel$y, info = info_str)
expect_identical(vs$d_test$test_points[order(vs$d_test$test_points)],
seq_len(nobsv), info = info_str)
expect_identical(vs$d_test$weights[order(vs$d_test$test_points)],
vs$refmodel$wobs, info = info_str)
} else {
expect_identical(vs$d_test$y, vs$refmodel$y, info = info_str)
expect_identical(vs$d_test$test_points, seq_len(nobsv), info = info_str)
expect_identical(vs$d_test$weights, vs$refmodel$wobs, info = info_str)
expect_named(vs$d_test, nms_d_test(), info = info_str)
dtest_type <- cv_method_expected
if (length(dtest_type) == 0) {
dtest_type <- "train"
}
expect_null(vs$d_test$data, info = info_str)
expect_identical(vs$d_test$type, dtest_type, info = info_str)
expect_null(vs$d_test$data, info = info_str)
expect_identical(vs$d_test$offset, vs$refmodel$offset, info = info_str)
expect_identical(vs$d_test$weights, vs$refmodel$wobs, info = info_str)
expect_identical(vs$d_test$y, vs$refmodel$y, info = info_str)
} else {
expect_identical(vs$d_test, dtest_expected, info = info_str)
}
@@ -1292,18 +1281,22 @@
nloo_expected <- nobsv
}
}
nobsv_summ <- nobsv
if (!is.null(dtest_expected)) {
nobsv_summ <- nrow(dtest_expected$data)
}
for (j in seq_along(vs$summaries$sub)) {
expect_named(vs$summaries$sub[[!!j]], vsel_smmrs_sub_nms, info = info_str)
expect_type(vs$summaries$sub[[!!j]]$mu, "double")
expect_length(vs$summaries$sub[[!!j]]$mu, nobsv)
expect_length(vs$summaries$sub[[!!j]]$mu, nobsv_summ)
if (with_cv) {
expect_identical(sum(!is.na(vs$summaries$sub[[!!j]]$mu)),
nloo_expected, info = info_str)
} else {
expect_true(all(!is.na(vs$summaries$sub[[!!j]]$mu)), info = info_str)
}
expect_type(vs$summaries$sub[[!!j]]$lppd, "double")
expect_length(vs$summaries$sub[[!!j]]$lppd, nobsv)
expect_length(vs$summaries$sub[[!!j]]$lppd, nobsv_summ)
if (with_cv) {
expect_identical(sum(!is.na(vs$summaries$sub[[!!j]]$lppd)),
nloo_expected, info = info_str)
@@ -1325,13 +1318,13 @@
}
expect_type(vs$summaries$ref, "list")
expect_named(vs$summaries$ref, vsel_smmrs_ref_nms, info = info_str)
expect_length(vs$summaries$ref$mu, nobsv)
expect_length(vs$summaries$ref$mu, nobsv_summ)
if (!from_datafit) {
expect_true(all(!is.na(vs$summaries$ref$mu)), info = info_str)
} else {
expect_true(all(is.na(vs$summaries$ref$mu)), info = info_str)
}
expect_length(vs$summaries$ref$lppd, nobsv)
expect_length(vs$summaries$ref$lppd, nobsv_summ)
if (!from_datafit) {
expect_true(all(!is.na(vs$summaries$ref$lppd)), info = info_str)
} else {
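One detail of the updated tester worth spelling out: `cv_method_expected` may be `NULL` (for `varsel()` output), and since `length(NULL)` is `0`, the `length(dtest_type) == 0` check provides a `NULL`-safe fallback to `"train"`. A toy sketch:

```r
cv_method_expected <- NULL        # e.g., `vs` was created by varsel()
dtest_type <- cv_method_expected
if (length(dtest_type) == 0) {    # length(NULL) is 0
  dtest_type <- "train"
}
dtest_type
#> [1] "train"
```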