Merge pull request #341 from fweber144/d_test_sub_fix
Fixes and other improvements for argument `d_test` of `varsel()`
fweber144 authored Jul 14, 2022
2 parents fc54766 + 3aba937 commit 3142d8d
Showing 9 changed files with 323 additions and 90 deletions.
3 changes: 3 additions & 0 deletions NEWS.md
@@ -5,6 +5,8 @@
* Several improvements in the documentation (especially in the explanation of the `suggest_size()` heuristic).
* At multiple places throughout the package, the numerical stability of some link functions has been improved by avoiding unnecessary back-and-forth transformations between latent space and response space. (GitHub: #337, #338)
* All arguments `seed` and `.seed` may now be `NA`, in which case `set.seed()` is not called internally at all.
* Argument `d_test` of `varsel()` is no longer considered an internal feature. This became possible after fixing a bug for `d_test` (see below).
* If `<vsel_object>` was created by a call to `cv_varsel([...], cv_method = "kfold")`, the order of the observations in the subelements of `<vsel_object>$summaries` and `<vsel_object>$d_test` now corresponds to the order of the observations in the original dataset (formerly, those subelements were ordered by fold in that case). Thus, the observations in those subelements now always follow the order of the original dataset, with one exception: if `<vsel_object>` was created by a call to `varsel([...], d_test = <non-NULL_d_test_object>)`, they follow the order of the observations in `<non-NULL_d_test_object>`.

## Bug fixes

@@ -16,6 +18,7 @@
* Fix GitHub issue #331.
* `plot.vsel()` now draws the dashed red horizontal line for the reference model (and---if present---the dotted black horizontal line for the baseline model) first (i.e., before the submodel-specific graphical elements), to avoid overplotting.
* Fix GitHub issue #339. (GitHub: #340)
* Fix argument `d_test` of `varsel()`: not only the predictive performance of the *reference model* but also that of the *submodels* needs to be evaluated on the test data.

# projpred 2.1.2

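As a quick illustration of the new ordering guarantee described in the NEWS entries above, here is a minimal sketch in which `refm_fit` stands in for a hypothetical reference model fit (e.g., from rstanarm):

```r
library(projpred)
refm <- get_refmodel(refm_fit)  # `refm_fit` is a hypothetical reference model fit
cvvs <- cv_varsel(refm, cv_method = "kfold", K = 2)
# After this PR, the subelements of `cvvs$summaries` and `cvvs$d_test` follow
# the original dataset's observation order (formerly, they were ordered by fold):
stopifnot(identical(cvvs$d_test$y, refm$y))
```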
40 changes: 23 additions & 17 deletions R/cv_varsel.R
@@ -481,9 +481,9 @@ loo_varsel <- function(refmodel, method, nterms_max, ndraws,
p_ref = p_pred, refmodel = refmodel, regul = opt$regul,
refit_prj = refit_prj, ...
)
summaries_sub <- .get_sub_summaries(
submodels = submodels, test_points = c(i), refmodel = refmodel
)
summaries_sub <- .get_sub_summaries(submodels = submodels,
refmodel = refmodel,
test_points = i)
for (k in seq_along(summaries_sub)) {
loo_sub[[k]][i] <- summaries_sub[[k]]$lppd
mu_sub[[k]][i] <- summaries_sub[[k]]$mu
@@ -516,12 +516,8 @@ loo_varsel <- function(refmodel, method, nterms_max, ndraws,
summ_ref <- list(lppd = loo_ref, mu = mu_ref)
summaries <- list(sub = summ_sub, ref = summ_ref)

d_test <- list(
y = refmodel$y, type = "LOO",
test_points = seq_along(refmodel$y),
weights = refmodel$wobs,
data = NULL, offset = refmodel$offset
)
d_test <- list(type = "LOO", data = NULL, offset = refmodel$offset,
weights = refmodel$wobs, y = refmodel$y)

out_list <- nlist(solution_terms_cv = solution_terms_mat, summaries, d_test)
if (!validate_search) {
@@ -606,10 +602,9 @@ kfold_varsel <- function(refmodel, method, nterms_max, ndraws,
# Perform the evaluation of the submodels for each fold (and make sure to
# combine the results from the K folds into a single results list):
get_summaries_submodel_cv <- function(submodels, fold) {
.get_sub_summaries(
submodels = submodels, test_points = fold$d_test$omitted,
refmodel = refmodel
)
.get_sub_summaries(submodels = submodels,
refmodel = refmodel,
test_points = fold$d_test$omitted)
}
sub_cv_summaries <- mapply(get_summaries_submodel_cv, submodels_cv, list_cv)
if (is.null(dim(sub_cv_summaries))) {
@@ -618,7 +613,14 @@ kfold_varsel <- function(refmodel, method, nterms_max, ndraws,
dim(sub_cv_summaries) <- rev(summ_dim)
}
sub <- apply(sub_cv_summaries, 1, rbind2list)
idxs_sorted_by_fold <- unlist(lapply(list_cv, function(fold) {
fold$d_test$omitted
}))
sub <- lapply(sub, function(summ) {
summ$mu <- summ$mu[order(idxs_sorted_by_fold)]
summ$lppd <- summ$lppd[order(idxs_sorted_by_fold)]

# Add weights (see GitHub issue #330 for why this needs to be clarified):
summ$w <- rep(1, length(summ$mu))
summ$w <- summ$w / sum(summ$w)
return(summ)
Expand All @@ -637,19 +639,23 @@ kfold_varsel <- function(refmodel, method, nterms_max, ndraws,
dis = fold$refmodel$dis
)
}))
ref$mu <- ref$mu[order(idxs_sorted_by_fold)]
ref$lppd <- ref$lppd[order(idxs_sorted_by_fold)]

# Combine the K separate test "datasets" (rather "information objects") into a
# single list:
d_cv <- rbind2list(lapply(list_cv, function(fold) {
list(y = fold$d_test$y,
list(offset = fold$d_test$offset,
weights = fold$d_test$weights,
test_points = fold$d_test$omitted,
offset = fold$d_test$offset)
y = fold$d_test$y)
}))
d_cv <- as.list(
as.data.frame(d_cv)[order(idxs_sorted_by_fold), , drop = FALSE]
)

return(nlist(solution_terms_cv,
summaries = nlist(sub, ref),
d_test = c(d_cv, type = "kfold")))
d_test = c(list(type = "kfold", data = NULL), d_cv)))
}

# Re-fit the reference model K times (once for each fold; `cvfun` case) or fetch
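A note on the reordering idiom introduced in the `kfold_varsel()` hunks above: `idxs_sorted_by_fold` holds the original row indices in fold order, so indexing a fold-ordered vector with `order(idxs_sorted_by_fold)` restores the original dataset's order. A toy sketch with hypothetical values:

```r
idxs_sorted_by_fold <- c(3, 5, 1, 4, 2)  # original row indices, in fold order
mu_fold_order <- c(30, 50, 10, 40, 20)   # per-observation summaries, in fold order
mu_fold_order[order(idxs_sorted_by_fold)]
#> [1] 10 20 30 40 50
```

The same trick is applied jointly to all elements of `d_cv` by converting the list of vectors to a `data.frame`, reordering its rows, and converting back to a list.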
4 changes: 4 additions & 0 deletions R/misc.R
@@ -4,6 +4,10 @@
packageStartupMessage(msg)
}

nms_d_test <- function() {
c("type", "data", "offset", "weights", "y")
}

weighted.sd <- function(x, w, na.rm = FALSE) {
if (na.rm) {
ind <- !is.na(w) & !is.na(x)
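The new helper `nms_d_test()` fixes the canonical element names (and their order) for `d_test` lists; `varsel()` uses it in the `R/varsel.R` hunk below to subset and reorder a user-supplied list in one step. A small sketch of that use:

```r
nms_d_test <- function() {
  c("type", "data", "offset", "weights", "y")
}
# A list with the expected elements in an arbitrary order:
d_test <- list(weights = rep(1, 3), y = c(0.2, 1.5, 0.7), type = "test",
               data = data.frame(x1 = 1:3), offset = rep(0, 3))
names(d_test[nms_d_test()])
#> [1] "type"    "data"    "offset"  "weights" "y"
```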
13 changes: 7 additions & 6 deletions R/summary_funs.R
@@ -1,13 +1,14 @@
.get_sub_summaries <- function(submodels, test_points, refmodel) {
.get_sub_summaries <- function(submodels, refmodel, test_points, newdata = NULL,
offset = refmodel$offset[test_points],
wobs = refmodel$wobs[test_points],
y = refmodel$y[test_points]) {
lapply(submodels, function(model) {
.weighted_summary_means(
y_test = list(y = refmodel$y[test_points],
weights = refmodel$wobs[test_points]),
y_test = list(y = y, weights = wobs),
family = refmodel$family,
wsample = model$weights,
mu = refmodel$family$mu_fun(model$submodl,
obs = test_points,
offset = refmodel$offset[test_points]),
mu = refmodel$family$mu_fun(model$submodl, obs = test_points,
newdata = newdata, offset = offset),
dis = model$dis
)
})
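The refactored `.get_sub_summaries()` relies on R's lazy evaluation of default arguments: `offset`, `wobs`, and `y` default to the training-data values at `test_points`, but `varsel()` can override them with test data (see the `R/varsel.R` hunk below). A minimal sketch of the pattern, using a hypothetical function `f()`:

```r
f <- function(refmodel, test_points, y = refmodel$y[test_points]) {
  # `y` defaults to the training response at `test_points`, but callers
  # may pass test-set values instead:
  y
}
refm <- list(y = c(10, 20, 30))
f(refm, test_points = c(1, 3))       #> [1] 10 30
f(refm, test_points = NULL, y = 99)  #> [1] 99
```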
44 changes: 29 additions & 15 deletions R/varsel.R
@@ -10,9 +10,10 @@
#' @param object An object of class `refmodel` (returned by [get_refmodel()] or
#' [init_refmodel()]) or an object that can be passed to argument `object` of
#' [get_refmodel()].
#' @param d_test For internal use only. A `list` providing information about the
#' test set which is used for evaluating the predictive performance of the
#' reference model. If not provided, the training set is used.
#' @param d_test A `list` of the structure outlined in section "Argument
#' `d_test`" below, providing test data for evaluating the predictive
#' performance of the submodels as well as of the reference model. If `NULL`,
#' the training data is used.
#' @param method The method for the search part. Possible options are `"L1"` for
#' L1 search and `"forward"` for forward search. If `NULL`, then internally,
#' `"L1"` is used, except if the reference model has multilevel or additive
Expand Down Expand Up @@ -82,6 +83,19 @@
#' minimizer (during a forward search and also during the evaluation part, but
#' the latter only if `refit_prj` is `TRUE`).
#'
#' @details
#'
#' # Argument `d_test`
#'
#' If not `NULL`, then `d_test` needs to be a `list` with the following
#' elements:
#' * `data`: a `data.frame` containing the predictor variables for the test set.
#' * `offset`: a numeric vector containing the offset values for the test set
#' (if there is no offset, use a vector of zeros).
#' * `weights`: a numeric vector containing the observation weights for the test
#' set (if there are no observation weights, use a vector of ones).
#' * `y`: a numeric vector containing the response values for the test set.
#'
#' @details Arguments `ndraws`, `nclusters`, `nclusters_pred`, and `ndraws_pred`
#' are automatically truncated at the number of posterior draws in the
#' reference model (which is `1` for `datafit`s). Using fewer draws or clusters
@@ -201,14 +215,11 @@ varsel.refmodel <- function(object, d_test = NULL, method = NULL,
search_terms <- args$search_terms

if (is.null(d_test)) {
d_type <- "train"
test_points <- seq_len(NROW(refmodel$y))
d_test <- nlist(
y = refmodel$y, test_points, data = NULL, weights = refmodel$wobs,
type = d_type, offset = refmodel$offset
)
d_test <- list(type = "train", data = NULL, offset = refmodel$offset,
weights = refmodel$wobs, y = refmodel$y)
} else {
d_type <- d_test$type
d_test$type <- "test"
d_test <- d_test[nms_d_test()]
}

## reference distributions for selection and prediction after selection
@@ -230,10 +241,13 @@
p_ref = p_pred, refmodel = refmodel, regul = regul, refit_prj = refit_prj,
...
)
sub <- .get_sub_summaries(
submodels = submodels, test_points = seq_along(refmodel$y),
refmodel = refmodel
)
sub <- .get_sub_summaries(submodels = submodels,
refmodel = refmodel,
test_points = NULL,
newdata = d_test$data,
offset = d_test$offset,
wobs = d_test$weights,
y = d_test$y)

## predictive statistics of the reference model on test data. if no test data
## are provided,
@@ -244,7 +258,7 @@ varsel.refmodel <- function(object, d_test = NULL, method = NULL,
ntest <- NROW(refmodel$y)
ref <- list(mu = rep(NA, ntest), lppd = rep(NA, ntest))
} else {
if (d_type == "train") {
if (d_test$type == "train") {
mu_test <- refmodel$mu
if (!all(refmodel$offset == 0)) {
mu_test <- refmodel$family$linkinv(
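Putting the new `d_test` documentation into practice, a minimal sketch of a `varsel()` call with test data (the objects `refm_fit` and `dat_test` are hypothetical):

```r
library(projpred)
d_test <- list(
  data = dat_test,                   # data.frame with the test-set predictors
  offset = rep(0, nrow(dat_test)),   # zeros, since there is no offset here
  weights = rep(1, nrow(dat_test)),  # ones, since there are no observation weights
  y = dat_test$y                     # test-set response values
)
vs <- varsel(refm_fit, d_test = d_test)
# The summaries now refer to `dat_test` (in its observation order), for the
# submodels as well as for the reference model:
stopifnot(length(vs$summaries$ref$mu) == nrow(dat_test))
```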
20 changes: 17 additions & 3 deletions man/varsel.Rd

(Generated file; diff not rendered by default.)

47 changes: 20 additions & 27 deletions tests/testthat/helpers/testers.R
@@ -1076,7 +1076,9 @@ pp_tester <- function(pp,
# @param with_cv A single logical value indicating whether `vs` was created by
# cv_varsel() (`TRUE`) or not (`FALSE`).
# @param refmod_expected The expected `vs$refmodel` object.
# @param dtest_expected The expected `vs$d_test` object.
# @param dtest_expected If `vs` was created with a non-`NULL` argument `d_test`
# (which is only possible for varsel()), then this needs to be the expected
# `vs$d_test` object. Otherwise, this needs to be `NULL`.
# @param solterms_len_expected A single numeric value giving the expected number
# of solution terms (not counting the intercept, even for the intercept-only
# model).
@@ -1122,8 +1124,6 @@ vsel_tester <- function(
info_str = ""
) {
# Preparations:
dtest_type <- "train"
dtest_nms <- c("y", "test_points", "data", "weights", "type", "offset")
if (with_cv) {
vsel_nms <- vsel_nms_cv
vsel_smmrs_sub_nms <- c("lppd", "mu", "w")
@@ -1136,13 +1136,7 @@
valsearch_expected <- TRUE
}

dtest_type <- cv_method_expected
if (cv_method_expected == "LOO") {
# Re-order:
dtest_nms <- dtest_nms[c(1, 5, 2, 4, 3, 6)]
} else if (cv_method_expected == "kfold") {
# Re-order and remove `"data"`:
dtest_nms <- dtest_nms[c(1, 4, 2, 6, 5)]
if (cv_method_expected == "kfold") {
# Re-order:
vsel_smmrs_sub_nms[1:2] <- vsel_smmrs_sub_nms[2:1]
vsel_smmrs_ref_nms[1:2] <- vsel_smmrs_ref_nms[2:1]
@@ -1263,21 +1257,16 @@
# d_test
if (is.null(dtest_expected)) {
expect_type(vs$d_test, "list")
expect_named(vs$d_test, dtest_nms, info = info_str)
if (identical(cv_method_expected, "kfold")) {
expect_identical(vs$d_test$y[order(vs$d_test$test_points)],
vs$refmodel$y, info = info_str)
expect_identical(vs$d_test$test_points[order(vs$d_test$test_points)],
seq_len(nobsv), info = info_str)
expect_identical(vs$d_test$weights[order(vs$d_test$test_points)],
vs$refmodel$wobs, info = info_str)
} else {
expect_identical(vs$d_test$y, vs$refmodel$y, info = info_str)
expect_identical(vs$d_test$test_points, seq_len(nobsv), info = info_str)
expect_identical(vs$d_test$weights, vs$refmodel$wobs, info = info_str)
expect_named(vs$d_test, nms_d_test(), info = info_str)
dtest_type <- cv_method_expected
if (length(dtest_type) == 0) {
dtest_type <- "train"
}
expect_null(vs$d_test$data, info = info_str)
expect_identical(vs$d_test$type, dtest_type, info = info_str)
expect_null(vs$d_test$data, info = info_str)
expect_identical(vs$d_test$offset, vs$refmodel$offset, info = info_str)
expect_identical(vs$d_test$weights, vs$refmodel$wobs, info = info_str)
expect_identical(vs$d_test$y, vs$refmodel$y, info = info_str)
} else {
expect_identical(vs$d_test, dtest_expected, info = info_str)
}
@@ -1292,18 +1281,22 @@
nloo_expected <- nobsv
}
}
nobsv_summ <- nobsv
if (!is.null(dtest_expected)) {
nobsv_summ <- nrow(dtest_expected$data)
}
for (j in seq_along(vs$summaries$sub)) {
expect_named(vs$summaries$sub[[!!j]], vsel_smmrs_sub_nms, info = info_str)
expect_type(vs$summaries$sub[[!!j]]$mu, "double")
expect_length(vs$summaries$sub[[!!j]]$mu, nobsv)
expect_length(vs$summaries$sub[[!!j]]$mu, nobsv_summ)
if (with_cv) {
expect_identical(sum(!is.na(vs$summaries$sub[[!!j]]$mu)),
nloo_expected, info = info_str)
} else {
expect_true(all(!is.na(vs$summaries$sub[[!!j]]$mu)), info = info_str)
}
expect_type(vs$summaries$sub[[!!j]]$lppd, "double")
expect_length(vs$summaries$sub[[!!j]]$lppd, nobsv)
expect_length(vs$summaries$sub[[!!j]]$lppd, nobsv_summ)
if (with_cv) {
expect_identical(sum(!is.na(vs$summaries$sub[[!!j]]$lppd)),
nloo_expected, info = info_str)
@@ -1325,13 +1318,13 @@
}
expect_type(vs$summaries$ref, "list")
expect_named(vs$summaries$ref, vsel_smmrs_ref_nms, info = info_str)
expect_length(vs$summaries$ref$mu, nobsv)
expect_length(vs$summaries$ref$mu, nobsv_summ)
if (!from_datafit) {
expect_true(all(!is.na(vs$summaries$ref$mu)), info = info_str)
} else {
expect_true(all(is.na(vs$summaries$ref$mu)), info = info_str)
}
expect_length(vs$summaries$ref$lppd, nobsv)
expect_length(vs$summaries$ref$lppd, nobsv_summ)
if (!from_datafit) {
expect_true(all(!is.na(vs$summaries$ref$lppd)), info = info_str)
} else {
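One detail of the updated tester worth spelling out: `cv_method_expected` may be `NULL` (for `varsel()` output), and since `length(NULL)` is `0`, the `length(dtest_type) == 0` check provides a `NULL`-safe fallback to `"train"`. A toy sketch:

```r
cv_method_expected <- NULL        # e.g., `vs` was created by varsel()
dtest_type <- cv_method_expected
if (length(dtest_type) == 0) {    # length(NULL) is 0
  dtest_type <- "train"
}
dtest_type
#> [1] "train"
```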