From 4449b086f61d8cdb472938d5e47380460082ddd1 Mon Sep 17 00:00:00 2001
From: Yann McLatchie <yann@mclatchie.com>
Date: Fri, 22 Sep 2023 10:47:56 +0300
Subject: [PATCH 1/6] add warning if the difference in predictive performance
 of the models compared is potentially due to random noise

---
 R/loo_compare.R                          | 66 +++++++++++++++++++++++-
 man-roxygen/loo-and-compare-references.R | 14 +++++
 man/loo_compare.Rd                       | 13 +++++
 tests/testthat/test_compare.R            |  7 +++
 4 files changed, 99 insertions(+), 1 deletion(-)
 create mode 100644 man-roxygen/loo-and-compare-references.R

diff --git a/R/loo_compare.R b/R/loo_compare.R
index 38a4b7ee..c7fbbdc2 100644
--- a/R/loo_compare.R
+++ b/R/loo_compare.R
@@ -42,10 +42,19 @@
 #'   distribution, a practice derived for Gaussian linear models or
 #'   asymptotically, and which only applies to nested models in any case.
 #'
+#'   If more than \eqn{11} models are compared, then the worst model by elpd is
+#'   taken as the baseline model, and the risk of difference in predictive
+#'   performance being due to random noise is estimated as described by
+#'   McLatchie and Vehtari (2023). This will flag a warning if it is deemed that
+#'   there is a risk of over-fitting due to the selection process, and users
+#'   are recommended to either avoid model selection based on LOO-CV, and
+#'   instead to favour of model averaging/stacking or projection predictive
+#'   inference.
+#'
 #' @seealso
 #' * The [FAQ page](https://mc-stan.org/loo/articles/online-only/faq.html) on
 #'   the __loo__ website for answers to frequently asked questions.
-#' @template loo-and-psis-references
+#' @template loo-and-compare-references
 #'
 #' @examples
 #' # very artificial example, just for demonstration!
@@ -108,6 +117,9 @@ loo_compare.default <- function(x, ...) {
   comp <- cbind(elpd_diff = elpd_diff, se_diff = se_diff, comp)
   rownames(comp) <- rnms
 
+  # run order statistics-based checks on models
+  loo_order_stat_check(loos, ord)
+
   class(comp) <- c("compare.loo", class(comp))
   return(comp)
 }
@@ -270,3 +282,55 @@ loo_compare_order <- function(loos){
   ord <- order(tmp[grep("^elpd", rnms), ], decreasing = TRUE)
   ord
 }
+
+#' Perform checks on `"loo"` objects __after__ comparison
+#' @noRd
+#' @keywords internal
+#' @param loos List of `"loo"` objects.
+#' @param ord List of `"loo"` object orderings.
+#' @return Nothing, just possibly throws errors/warnings.
+loo_order_stat_check <- function(loos, ord) {
+
+  ## breaks
+
+  if (length(loos) <= 11L) {
+    # procedure cannot be diagnosed for fewer than ten candidate models
+    # (total models = worst model + ten candidates)
+    # break from function
+    return(NULL)
+  }
+
+  ## warnings
+
+  # compute the elpd differences
+  worst_idx <- tail(ord, n=1)
+  diffs <- mapply(FUN = elpd_diffs, loos[ord[worst_idx]], loos[ord])
+  elpd_diff <- apply(diffs, 2, sum)
+
+  # estimate the standard deviation of the upper-half-normal
+  diff_median <- median(elpd_diff)
+  elpd_diff_trunc <- elpd_diff[elpd_diff >= diff_median]
+  n_models <- sum(!is.na(elpd_diff_trunc))
+  candidate_sd <- sqrt(1 / n_models * sum(elpd_diff_trunc^2, na.rm = TRUE))
+
+  # estimate expected best diff under null hypothesis
+  K <- length(loos) - 1
+  order_stat <- order_stat_heuristic(K, candidate_sd)
+
+  if (max(elpd_diff) <= order_stat) {
+    # flag warning if we suspect no model is theoretically better than the baseline
+    warning("Difference in performance potentially due to random noise.",
+            call. = FALSE)
+  }
+}
+
+#' Computes maximum order statistic from K Gaussians
+#' @noRd
+#' @keywords internal
+#' @param K Number of Gaussians.
+#' @param c Scaling of the order statistic.
+#' @return Numeric expected maximum from K samples from a Gaussian with mean
+#' zero and scale `"c"`
+order_stat_heuristic <- function(K, c) {
+  qnorm(p = 1 - 1 / (K * 2), mean = 0, sd = c)
+}
diff --git a/man-roxygen/loo-and-compare-references.R b/man-roxygen/loo-and-compare-references.R
new file mode 100644
index 00000000..f8afce31
--- /dev/null
+++ b/man-roxygen/loo-and-compare-references.R
@@ -0,0 +1,14 @@
+#' @references
+#' Vehtari, A., Gelman, A., and Gabry, J. (2017). Practical Bayesian model
+#' evaluation using leave-one-out cross-validation and WAIC.
+#' *Statistics and Computing*. 27(5), 1413--1432. doi:10.1007/s11222-016-9696-4
+#' ([journal version](https://link.springer.com/article/10.1007/s11222-016-9696-4),
+#'  [preprint arXiv:1507.04544](https://arxiv.org/abs/1507.04544)).
+#'
+#' Vehtari, A., Simpson, D., Gelman, A., Yao, Y., and Gabry, J. (2019).
+#' Pareto smoothed importance sampling.
+#' [preprint arXiv:1507.02646](https://arxiv.org/abs/1507.02646)
+#'
+#' McLatchie, Y., and Vehtari, A. (2023).
+#' Efficient estimation and correction of selection-induced bias with order statistics.
+#' [preprint arXiv:2309.03742](https://arxiv.org/abs/2309.03742)
diff --git a/man/loo_compare.Rd b/man/loo_compare.Rd
index c35c2056..e9e905e7 100644
--- a/man/loo_compare.Rd
+++ b/man/loo_compare.Rd
@@ -66,6 +66,15 @@ better sense of uncertainty than what is obtained using the current
 standard approach of comparing differences of deviances to a Chi-squared
 distribution, a practice derived for Gaussian linear models or
 asymptotically, and which only applies to nested models in any case.
+
+If more than \eqn{11} models are compared, then the worst model by elpd is
+taken as the baseline model, and the risk of difference in predictive
+performance being due to random noise is estimated as described by
+McLatchie and Vehtari (2023). This will flag a warning if it is deemed that
+there is a risk of over-fitting due to the selection process, and users
+are recommended to either avoid model selection based on LOO-CV, and
+instead to favour of model averaging/stacking or projection predictive
+inference.
 }
 \examples{
 # very artificial example, just for demonstration!
@@ -101,6 +110,10 @@ evaluation using leave-one-out cross-validation and WAIC.
 Vehtari, A., Simpson, D., Gelman, A., Yao, Y., and Gabry, J. (2019).
 Pareto smoothed importance sampling.
 \href{https://arxiv.org/abs/1507.02646}{preprint arXiv:1507.02646}
+
+McLatchie, Y., and Vehtari, A. (2023).
+Efficient estimation and correction of selection-induced bias with order statistics.
+\href{https://arxiv.org/abs/2309.03742}{preprint arXiv:2309.03742}
 }
 \seealso{
 \itemize{
diff --git a/tests/testthat/test_compare.R b/tests/testthat/test_compare.R
index ffb99fa4..59c3f828 100644
--- a/tests/testthat/test_compare.R
+++ b/tests/testthat/test_compare.R
@@ -39,6 +39,13 @@ test_that("loo_compare throws appropriate warnings", {
   attr(w3, "yhash") <- "a"
   attr(w4, "yhash") <- "b"
   expect_warning(loo_compare(w3, w4), "Not all models have the same y variable")
+
+  w_list <- lapply(1:25, function(x) SW(waic(LLarr + rnorm(1, 0, 0.1))))
+  expect_warning(loo_compare(w_list),
+                 "Difference in performance potentially due to random noise.")
+
+  w_list_short <- lapply(1:4, function(x) SW(waic(LLarr + rnorm(1, 0, 0.1))))
+  expect_no_warning(loo_compare(w_list_short))
 })
 
 

From 5a65d667e39862f8a3dfd0e6a88f67cdee75afbf Mon Sep 17 00:00:00 2001
From: Yann McLatchie <yann@mclatchie.com>
Date: Fri, 22 Sep 2023 13:58:30 +0300
Subject: [PATCH 2/6] fix test for order statistic warning

---
 tests/testthat/test_compare.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/testthat/test_compare.R b/tests/testthat/test_compare.R
index 59c3f828..8ce3305b 100644
--- a/tests/testthat/test_compare.R
+++ b/tests/testthat/test_compare.R
@@ -40,9 +40,10 @@ test_that("loo_compare throws appropriate warnings", {
   attr(w4, "yhash") <- "b"
   expect_warning(loo_compare(w3, w4), "Not all models have the same y variable")
 
+  set.seed(123)
   w_list <- lapply(1:25, function(x) SW(waic(LLarr + rnorm(1, 0, 0.1))))
   expect_warning(loo_compare(w_list),
-                 "Difference in performance potentially due to random noise.")
+                 "Difference in performance potentially due to random noise")
 
   w_list_short <- lapply(1:4, function(x) SW(waic(LLarr + rnorm(1, 0, 0.1))))
   expect_no_warning(loo_compare(w_list_short))

From e79d6758c9d39fadd60c488858cb659a65bbfd57 Mon Sep 17 00:00:00 2001
From: Yann McLatchie <yann@mclatchie.com>
Date: Fri, 22 Sep 2023 14:11:47 +0300
Subject: [PATCH 3/6] update warning docs

---
 R/loo_compare.R    | 4 ++--
 man/loo_compare.Rd | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/R/loo_compare.R b/R/loo_compare.R
index c7fbbdc2..2c50f5d0 100644
--- a/R/loo_compare.R
+++ b/R/loo_compare.R
@@ -43,11 +43,11 @@
 #'   asymptotically, and which only applies to nested models in any case.
 #'
 #'   If more than \eqn{11} models are compared, then the worst model by elpd is
-#'   taken as the baseline model, and the risk of difference in predictive
+#'   taken as the baseline model, and the risk of the difference in predictive
 #'   performance being due to random noise is estimated as described by
 #'   McLatchie and Vehtari (2023). This will flag a warning if it is deemed that
 #'   there is a risk of over-fitting due to the selection process, and users
-#'   are recommended to either avoid model selection based on LOO-CV, and
+#'   are recommended to avoid model selection based on LOO-CV, and
 #'   instead to favour of model averaging/stacking or projection predictive
 #'   inference.
 #'
diff --git a/man/loo_compare.Rd b/man/loo_compare.Rd
index e9e905e7..f4caa378 100644
--- a/man/loo_compare.Rd
+++ b/man/loo_compare.Rd
@@ -68,11 +68,11 @@ distribution, a practice derived for Gaussian linear models or
 asymptotically, and which only applies to nested models in any case.
 
 If more than \eqn{11} models are compared, then the worst model by elpd is
-taken as the baseline model, and the risk of difference in predictive
+taken as the baseline model, and the risk of the difference in predictive
 performance being due to random noise is estimated as described by
 McLatchie and Vehtari (2023). This will flag a warning if it is deemed that
 there is a risk of over-fitting due to the selection process, and users
-are recommended to either avoid model selection based on LOO-CV, and
+are recommended to avoid model selection based on LOO-CV, and
 instead to favour of model averaging/stacking or projection predictive
 inference.
 }

From 9f31d312a81b6a3dc87377532ff95803de8595df Mon Sep 17 00:00:00 2001
From: Yann McLatchie <yann@mclatchie.com>
Date: Fri, 29 Sep 2023 10:53:03 +0100
Subject: [PATCH 4/6] update warning message

---
 R/loo_compare.R               | 4 ++--
 man/loo_compare.Rd            | 2 +-
 tests/testthat/test_compare.R | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/R/loo_compare.R b/R/loo_compare.R
index 2c50f5d0..65be92fa 100644
--- a/R/loo_compare.R
+++ b/R/loo_compare.R
@@ -44,7 +44,7 @@
 #'
 #'   If more than \eqn{11} models are compared, then the worst model by elpd is
 #'   taken as the baseline model, and the risk of the difference in predictive
-#'   performance being due to random noise is estimated as described by
+#'   performance being due to chance is estimated as described by
 #'   McLatchie and Vehtari (2023). This will flag a warning if it is deemed that
 #'   there is a risk of over-fitting due to the selection process, and users
 #'   are recommended to avoid model selection based on LOO-CV, and
@@ -319,7 +319,7 @@ loo_order_stat_check <- function(loos, ord) {
 
   if (max(elpd_diff) <= order_stat) {
     # flag warning if we suspect no model is theoretically better than the baseline
-    warning("Difference in performance potentially due to random noise.",
+    warning("Difference in performance potentially due to chance.",
             call. = FALSE)
   }
 }
diff --git a/man/loo_compare.Rd b/man/loo_compare.Rd
index f4caa378..e599a7ab 100644
--- a/man/loo_compare.Rd
+++ b/man/loo_compare.Rd
@@ -69,7 +69,7 @@ asymptotically, and which only applies to nested models in any case.
 
 If more than \eqn{11} models are compared, then the worst model by elpd is
 taken as the baseline model, and the risk of the difference in predictive
-performance being due to random noise is estimated as described by
+performance being due to chance is estimated as described by
 McLatchie and Vehtari (2023). This will flag a warning if it is deemed that
 there is a risk of over-fitting due to the selection process, and users
 are recommended to avoid model selection based on LOO-CV, and
diff --git a/tests/testthat/test_compare.R b/tests/testthat/test_compare.R
index 8ce3305b..8835a530 100644
--- a/tests/testthat/test_compare.R
+++ b/tests/testthat/test_compare.R
@@ -43,7 +43,7 @@ test_that("loo_compare throws appropriate warnings", {
   set.seed(123)
   w_list <- lapply(1:25, function(x) SW(waic(LLarr + rnorm(1, 0, 0.1))))
   expect_warning(loo_compare(w_list),
-                 "Difference in performance potentially due to random noise")
+                 "Difference in performance potentially due to chance")
 
   w_list_short <- lapply(1:4, function(x) SW(waic(LLarr + rnorm(1, 0, 0.1))))
   expect_no_warning(loo_compare(w_list_short))

From 06294197b402dcdcb02c9373e6df66514d2953a9 Mon Sep 17 00:00:00 2001
From: Yann McLatchie <yann@mclatchie.com>
Date: Mon, 2 Oct 2023 10:39:10 +0100
Subject: [PATCH 5/6] update documentation and fix the R cmd check NOTE

---
 R/loo_compare.R    | 23 ++++++++++++++++-------
 man/loo_compare.Rd |  7 ++++---
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/R/loo_compare.R b/R/loo_compare.R
index 65be92fa..585d7a90 100644
--- a/R/loo_compare.R
+++ b/R/loo_compare.R
@@ -42,9 +42,10 @@
 #'   distribution, a practice derived for Gaussian linear models or
 #'   asymptotically, and which only applies to nested models in any case.
 #'
-#'   If more than \eqn{11} models are compared, then the worst model by elpd is
-#'   taken as the baseline model, and the risk of the difference in predictive
-#'   performance being due to chance is estimated as described by
+#'   If more than \eqn{11} models are compared, then the median model by elpd is
+#'   taken as the baseline model, and we recompute (internally) the model
+#'   differences to this baseline. We then estimate whether the difference in
+#'   predictive performances is potentially due to chance as described by
 #'   McLatchie and Vehtari (2023). This will flag a warning if it is deemed that
 #'   there is a risk of over-fitting due to the selection process, and users
 #'   are recommended to avoid model selection based on LOO-CV, and
@@ -302,13 +303,13 @@ loo_order_stat_check <- function(loos, ord) {
 
   ## warnings
 
-  # compute the elpd differences
-  worst_idx <- tail(ord, n=1)
-  diffs <- mapply(FUN = elpd_diffs, loos[ord[worst_idx]], loos[ord])
+  # compute the elpd differences from the median model
+  baseline_idx <- middle_idx(ord)
+  diffs <- mapply(FUN = elpd_diffs, loos[ord[baseline_idx]], loos[ord])
   elpd_diff <- apply(diffs, 2, sum)
 
   # estimate the standard deviation of the upper-half-normal
-  diff_median <- median(elpd_diff)
+  diff_median <- stats::median(elpd_diff)
   elpd_diff_trunc <- elpd_diff[elpd_diff >= diff_median]
   n_models <- sum(!is.na(elpd_diff_trunc))
   candidate_sd <- sqrt(1 / n_models * sum(elpd_diff_trunc^2, na.rm = TRUE))
@@ -320,10 +321,18 @@ loo_order_stat_check <- function(loos, ord) {
   if (max(elpd_diff) <= order_stat) {
     # flag warning if we suspect no model is theoretically better than the baseline
     warning("Difference in performance potentially due to chance.",
+            "See McLatchie and Vehtari (2023) for details.",
             call. = FALSE)
   }
 }
 
+#' Returns the middle index of a vector
+#' @noRd
+#' @keywords internal
+#' @param vec A vector.
+#' @return Integer index value.
+middle_idx <- function(vec) floor(length(vec) / 2)
+
 #' Computes maximum order statistic from K Gaussians
 #' @noRd
 #' @keywords internal
diff --git a/man/loo_compare.Rd b/man/loo_compare.Rd
index e599a7ab..85f876b3 100644
--- a/man/loo_compare.Rd
+++ b/man/loo_compare.Rd
@@ -67,9 +67,10 @@ standard approach of comparing differences of deviances to a Chi-squared
 distribution, a practice derived for Gaussian linear models or
 asymptotically, and which only applies to nested models in any case.
 
-If more than \eqn{11} models are compared, then the worst model by elpd is
-taken as the baseline model, and the risk of the difference in predictive
-performance being due to chance is estimated as described by
+If more than \eqn{11} models are compared, then the median model by elpd is
+taken as the baseline model, and we recompute (internally) the model
+differences to this baseline. We then estimate whether the difference in
+predictive performances is potentially due to chance as described by
 McLatchie and Vehtari (2023). This will flag a warning if it is deemed that
 there is a risk of over-fitting due to the selection process, and users
 are recommended to avoid model selection based on LOO-CV, and

From 3526fbf82f4bc7c632119132b30e25b40a21a2ce Mon Sep 17 00:00:00 2001
From: Yann McLatchie <67298926+yannmclatchie@users.noreply.github.com>
Date: Fri, 10 Nov 2023 18:32:44 +0200
Subject: [PATCH 6/6] edit documentation

Co-authored-by: Jonah Gabry <jgabry@gmail.com>
---
 R/loo_compare.R | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/R/loo_compare.R b/R/loo_compare.R
index 585d7a90..b41801fa 100644
--- a/R/loo_compare.R
+++ b/R/loo_compare.R
@@ -42,16 +42,14 @@
 #'   distribution, a practice derived for Gaussian linear models or
 #'   asymptotically, and which only applies to nested models in any case.
 #'
-#'   If more than \eqn{11} models are compared, then the median model by elpd is
-#'   taken as the baseline model, and we recompute (internally) the model
-#'   differences to this baseline. We then estimate whether the difference in
-#'   predictive performances is potentially due to chance as described by
-#'   McLatchie and Vehtari (2023). This will flag a warning if it is deemed that
-#'   there is a risk of over-fitting due to the selection process, and users
-#'   are recommended to avoid model selection based on LOO-CV, and
-#'   instead to favour of model averaging/stacking or projection predictive
-#'   inference.
-#'
+#'   If more than \eqn{11} models are compared, we internally recompute the model
+#'   differences using the median model by ELPD as the baseline model. We then
+#'   estimate whether the differences in predictive performance are potentially
+#'   due to chance as described by McLatchie and Vehtari (2023). This will flag
+#'   a warning if it is deemed that there is a risk of over-fitting due to the
+#'   selection process. In that case users are recommended to avoid model
+#'   selection based on LOO-CV, and instead to favor model averaging/stacking or
+#'   projection predictive inference.
 #' @seealso
 #' * The [FAQ page](https://mc-stan.org/loo/articles/online-only/faq.html) on
 #'   the __loo__ website for answers to frequently asked questions.