diff --git a/NEWS.md b/NEWS.md index a55ae14..b4c9536 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,6 +10,8 @@ * When multiple quantiles are the same, santoku warns and returns the leftmost quantile interval. Before it would merge the intervals, creating labels that might be different to what the user asked for. +* `chop_quantiles()` gains a `use_ecdf` argument. `use_ecdf = TRUE` recalculates + probabilities using `ecdf(x)`, which may give more accurate interval labels. * `single = NULL` has been documented explicitly in `lbl_*` functions. diff --git a/R/breaks-by-group-size.R b/R/breaks-by-group-size.R index 5dd0a68..db43017 100644 --- a/R/breaks-by-group-size.R +++ b/R/breaks-by-group-size.R @@ -3,13 +3,14 @@ #' #' @export #' @order 2 -brk_quantiles <- function (probs, ..., weights = NULL) { +brk_quantiles <- function (probs, ..., weights = NULL, use_ecdf = FALSE) { assert_that( is.numeric(probs), noNA(probs), all(probs >= 0), all(probs <= 1), - is.null(weights) || is.numeric(weights) + is.null(weights) || is.numeric(weights), + is.flag(use_ecdf) ) probs <- sort(probs) @@ -32,7 +33,9 @@ brk_quantiles <- function (probs, ..., weights = NULL) { if (anyNA(qs)) return(empty_breaks()) # data was all NA if (any(duplicated(qs))) { - warning("`x` has non-unique quantiles: break labels may be misleading") + if (! use_ecdf) { + warning("`x` has duplicate quantiles: break labels may be misleading") + } # We use the left-most probabilities, so e.g. if 0%, 20% and 40% quantiles # are all the same number, we'll use the category [0%, 20%). # This means we always return intervals that the user asked for, though @@ -50,6 +53,11 @@ brk_quantiles <- function (probs, ..., weights = NULL) { breaks <- extend_and_close(breaks, x, extend, left, close_end) class(breaks) <- c("quantileBreaks", class(breaks)) + + if (use_ecdf) { + probs <- calculate_ecdf_probs(x, breaks, weights) + } + attr(breaks, "scaled_endpoints") <- probs names(breaks) <- names(probs) @@ -58,6 +66,40 @@ brk_quantiles <- function (probs, ..., weights = NULL) { } +#' Calculate the proportions of `x` that is strictly/weakly less than +#' each break +#' +#' @param x A numeric vector +#' @param breaks A breaks object +#' @param weights A vector of weights. Non-NULL weights are unimplemented +#' +#' @return A vector of proportions of `x` that are strictly less than +#' left-closed breaks, and weakly less than right-closed breaks. +#' +#' @noRd +calculate_ecdf_probs <- function (x, breaks, weights) { + if (! is.numeric(x)) { + stop("`use_ecdf = TRUE` can only be used with numeric `x`") + } + if (! is.null(weights)) { + stop("`use_ecdf = TRUE` cannot be used with non-null `weights`") + } + + brk_vec <- unclass_breaks(breaks) + left_vec <- attr(breaks, "left") + + # proportion of x that is weakly less than x + prop_lte_brk <- stats::ecdf(x)(brk_vec) + # proportion of x that is strictly less than x + prop_lt_brk <- 1 - stats::ecdf(-x)(-brk_vec) + probs <- ifelse(left_vec, prop_lt_brk, prop_lte_brk) + + # Suppose your breaks are [a, b]. + # You want to expand this? + probs +} + + #' @rdname chop_equally #' #' @export diff --git a/R/chop-by-group-size.R b/R/chop-by-group-size.R index f405b77..6b0fa85 100644 --- a/R/chop-by-group-size.R +++ b/R/chop-by-group-size.R @@ -10,6 +10,8 @@ #' passed to [stats::quantile()] or [Hmisc::wtd.quantile()]. #' @param weights `NULL` or numeric vector of same length as `x`. If not #' `NULL`, [Hmisc::wtd.quantile()] is used to calculate weighted quantiles. +#' @param use_ecdf Logical. Recalculate probabilities of quantiles using +#' [`ecdf(x)`][stats::ecdf()]? See below. #' #' @inheritParams chop #' @inherit chop-doc params return @@ -19,8 +21,15 @@ #' for calculating "type 1" quantiles, since they round down. See #' [stats::quantile()]. #' -#' If `x` contains duplicates, consecutive quantiles may be the same number. If -#' so, quantile labels may be misleading and a warning is emitted. +#' By default, `chop_quantiles()` shows the requested probabilities in the +#' labels. To show the numeric quantiles themselves, set `raw = TRUE`. +#' +#' When `x` contains duplicates, consecutive quantiles may be the same number. If +#' so, interval labels may be misleading, and if `use_ecdf = FALSE` a warning is +#' emitted. Set `use_ecdf = TRUE` to recalculate the probabilities of the quantiles +#' using the [empirical cumulative distribution function][stats::ecdf()] of `x`. +#' Doing so may give you different labels from what you expect, and will +#' remove any names from `probs`. See the example below. #' #' @family chopping functions #' @@ -40,8 +49,10 @@ #' chop_quantiles(1:10, 1:3/4, raw = TRUE) #' #' # duplicate quantiles: -#' tab_quantiles(c(1, 1, 1, 2, 3), 1:5/5) -#' +#' x <- c(1, 1, 1, 2, 3) +#' quantile(x, 1:5/5) +#' tab_quantiles(x, 1:5/5) +#' tab_quantiles(x, 1:5/5, use_ecdf = TRUE) chop_quantiles <- function( x, probs, @@ -50,10 +61,11 @@ chop_quantiles <- function( lbl_intervals(single = NULL), left = is.numeric(x), raw = FALSE, - weights = NULL + weights = NULL, + use_ecdf = FALSE ) { - chop(x, brk_quantiles(probs, weights = weights), labels = labels, ..., - left = left, raw = raw) + chop(x, brk_quantiles(probs, weights = weights, use_ecdf = use_ecdf), + labels = labels, ..., left = left, raw = raw) } diff --git a/man/chop_quantiles.Rd b/man/chop_quantiles.Rd index fb41a4d..de32dfb 100644 --- a/man/chop_quantiles.Rd +++ b/man/chop_quantiles.Rd @@ -16,12 +16,13 @@ chop_quantiles( labels = if (raw) lbl_intervals() else lbl_intervals(single = NULL), left = is.numeric(x), raw = FALSE, - weights = NULL + weights = NULL, + use_ecdf = FALSE ) chop_deciles(x, ...) -brk_quantiles(probs, ..., weights = NULL) +brk_quantiles(probs, ..., weights = NULL, use_ecdf = FALSE) tab_quantiles(x, probs, ..., left = is.numeric(x), raw = FALSE) @@ -44,6 +45,9 @@ passed to \code{\link[stats:quantile]{stats::quantile()}} or \code{\link[Hmisc:w \item{weights}{\code{NULL} or numeric vector of same length as \code{x}. If not \code{NULL}, \code{\link[Hmisc:wtd.stats]{Hmisc::wtd.quantile()}} is used to calculate weighted quantiles.} + +\item{use_ecdf}{Logical. Recalculate probabilities of quantiles using +\code{\link[stats:ecdf]{ecdf(x)}}? See below.} } \value{ \verb{chop_*} functions return a \code{\link{factor}} of the same length as \code{x}. @@ -61,8 +65,15 @@ For non-numeric \code{x}, \code{left} is set to \code{FALSE} by default. This wo for calculating "type 1" quantiles, since they round down. See \code{\link[stats:quantile]{stats::quantile()}}. -If \code{x} contains duplicates, consecutive quantiles may be the same number. If -so, quantile labels may be misleading and a warning is emitted. +By default, \code{chop_quantiles()} shows the requested probabilities in the +labels. To show the numeric quantiles themselves, set \code{raw = TRUE}. + +When \code{x} contains duplicates, consecutive quantiles may be the same number. If +so, interval labels may be misleading, and if \code{use_ecdf = FALSE} a warning is +emitted. Set \code{use_ecdf = TRUE} to recalculate the probabilities of the quantiles +using the \link[stats:ecdf]{empirical cumulative distribution function} of \code{x}. +Doing so may give you different labels from what you expect, and will +remove any names from \code{probs}. See the example below. } \examples{ chop_quantiles(1:10, 1:3/4) @@ -77,8 +88,10 @@ chop_deciles(1:10) chop_quantiles(1:10, 1:3/4, raw = TRUE) # duplicate quantiles: -tab_quantiles(c(1, 1, 1, 2, 3), 1:5/5) - +x <- c(1, 1, 1, 2, 3) +quantile(x, 1:5/5) +tab_quantiles(x, 1:5/5) +tab_quantiles(x, 1:5/5, use_ecdf = TRUE) set.seed(42) tab_quantiles(rnorm(100), probs = 1:3/4, raw = TRUE) diff --git a/tests/testthat/test-breaks.R b/tests/testthat/test-breaks.R index dfce7cc..23116d1 100644 --- a/tests/testthat/test-breaks.R +++ b/tests/testthat/test-breaks.R @@ -189,7 +189,7 @@ test_that("brk_quantiles", { }) -test_that("brk_quantiles should warn on duplicate quantiles", { +test_that("brk_quantiles with duplicate quantiles", { x <- rep(1, 5) expect_warning(brks <- brk_quantiles(1:3/4)(x, FALSE, TRUE, FALSE)) expect_equivalent(c(brks), c(1, 1)) @@ -201,6 +201,11 @@ test_that("brk_quantiles should warn on duplicate quantiles", { x <- c(1, 1, 1, 2, 3) expect_warning(brks <- brk_quantiles(0:5/5)(x, FALSE, TRUE, FALSE)) expect_equivalent(c(brks), c(1.0, 1.0, 1.4, 2.2, 3.0)) + + expect_silent( + brks <- brk_quantiles(0:5/5, use_ecdf = TRUE)(x, FALSE, TRUE, FALSE) + ) + expect_equivalent(c(brks), c(1.0, 1.0, 1.4, 2.2, 3.0)) }) diff --git a/tests/testthat/test-chop.R b/tests/testthat/test-chop.R index 8465daa..46a7ca8 100644 --- a/tests/testthat/test-chop.R +++ b/tests/testthat/test-chop.R @@ -247,6 +247,12 @@ test_that("chop_quantiles", { factor(c("Q1", "Q1", "Q2", "Q3", "Q4", "Q4")) ) + x <- c(1, 1, 1, 2, 3) + expect_equivalent( + chop_quantiles(x, 1:4/5, use_ecdf = TRUE), + factor(c("[0%, 60%]", "[0%, 60%]", "[0%, 60%]", "[60%, 80%)", "[80%, 100%]")) + ) + withr::local_options(lifecycle_verbosity = "quiet") expect_equivalent( chop_quantiles(1:6, c(.25, .5, .75), raw = TRUE),