Skip to content

Commit

Permalink
Add use_ecdf argument to chop_quantiles() and friends.
Browse files Browse the repository at this point in the history
  • Loading branch information
hughjonesd committed Jun 9, 2024
1 parent 867b731 commit ff3ea47
Show file tree
Hide file tree
Showing 6 changed files with 97 additions and 17 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
* When multiple quantiles are the same, santoku warns and returns the leftmost
quantile interval. Before it would merge the intervals, creating labels that
might be different to what the user asked for.
* `chop_quantiles()` gains a `use_ecdf` argument. `use_ecdf = TRUE` recalculates
probabilities using `ecdf(x)`, which may give more accurate interval labels.
* `single = NULL` has been documented explicitly in `lbl_*` functions.


Expand Down
48 changes: 45 additions & 3 deletions R/breaks-by-group-size.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@
#'
#' @export
#' @order 2
brk_quantiles <- function (probs, ..., weights = NULL) {
brk_quantiles <- function (probs, ..., weights = NULL, use_ecdf = FALSE) {
assert_that(
is.numeric(probs),
noNA(probs),
all(probs >= 0),
all(probs <= 1),
is.null(weights) || is.numeric(weights)
is.null(weights) || is.numeric(weights),
is.flag(use_ecdf)
)
probs <- sort(probs)

Expand All @@ -32,7 +33,9 @@ brk_quantiles <- function (probs, ..., weights = NULL) {
if (anyNA(qs)) return(empty_breaks()) # data was all NA

if (any(duplicated(qs))) {
warning("`x` has non-unique quantiles: break labels may be misleading")
if (! use_ecdf) {
warning("`x` has duplicate quantiles: break labels may be misleading")
}
# We use the left-most probabilities, so e.g. if 0%, 20% and 40% quantiles
# are all the same number, we'll use the category [0%, 20%).
# This means we always return intervals that the user asked for, though
Expand All @@ -50,6 +53,11 @@ brk_quantiles <- function (probs, ..., weights = NULL) {
breaks <- extend_and_close(breaks, x, extend, left, close_end)

class(breaks) <- c("quantileBreaks", class(breaks))

if (use_ecdf) {
probs <- calculate_ecdf_probs(x, breaks, weights)
}

attr(breaks, "scaled_endpoints") <- probs
names(breaks) <- names(probs)

Expand All @@ -58,6 +66,40 @@ brk_quantiles <- function (probs, ..., weights = NULL) {
}


#' Calculate the proportions of `x` that is strictly/weakly less than
#' each break
#'
#' @param x A numeric vector
#' @param breaks A breaks object
#' @param weights A vector of weights. Non-NULL weights are unimplemented
#'
#' @return A vector of proportions of `x` that are strictly less than
#' left-closed breaks, and weakly less than right-closed breaks.
#'
#' @noRd
calculate_ecdf_probs <- function (x, breaks, weights) {
if (! is.numeric(x)) {
stop("`use_ecdf = TRUE` can only be used with numeric `x`")
}
if (! is.null(weights)) {
stop("`use_ecdf = TRUE` cannot be used with non-null `weights`")
}

brk_vec <- unclass_breaks(breaks)
left_vec <- attr(breaks, "left")

# proportion of x that is weakly less than x
prop_lte_brk <- stats::ecdf(x)(brk_vec)
# proportion of x that is strictly less than x
prop_lt_brk <- 1 - stats::ecdf(-x)(-brk_vec)
probs <- ifelse(left_vec, prop_lt_brk, prop_lte_brk)

# Suppose your breaks are [a, b].
# You want to expand this?
probs
}


#' @rdname chop_equally
#'
#' @export
Expand Down
26 changes: 19 additions & 7 deletions R/chop-by-group-size.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
#' passed to [stats::quantile()] or [Hmisc::wtd.quantile()].
#' @param weights `NULL` or numeric vector of same length as `x`. If not
#' `NULL`, [Hmisc::wtd.quantile()] is used to calculate weighted quantiles.
#' @param use_ecdf Logical. Recalculate probabilities of quantiles using
#' [`ecdf(x)`][stats::ecdf()]? See below.
#'
#' @inheritParams chop
#' @inherit chop-doc params return
Expand All @@ -19,8 +21,15 @@
#' for calculating "type 1" quantiles, since they round down. See
#' [stats::quantile()].
#'
#' If `x` contains duplicates, consecutive quantiles may be the same number. If
#' so, quantile labels may be misleading and a warning is emitted.
#' By default, `chop_quantiles()` shows the requested probabilities in the
#' labels. To show the numeric quantiles themselves, set `raw = TRUE`.
#'
#' When `x` contains duplicates, consecutive quantiles may be the same number. If
#' so, interval labels may be misleading, and if `use_ecdf = FALSE` a warning is
#' emitted. Set `use_ecdf = TRUE` to recalculate the probabilities of the quantiles
#' using the [empirical cumulative distribution function][stats::ecdf()] of `x`.
#' Doing so may give you different labels from what you expect, and will
#' remove any names from `probs`. See the example below.
#'
#' @family chopping functions
#'
Expand All @@ -40,8 +49,10 @@
#' chop_quantiles(1:10, 1:3/4, raw = TRUE)
#'
#' # duplicate quantiles:
#' tab_quantiles(c(1, 1, 1, 2, 3), 1:5/5)
#'
#' x <- c(1, 1, 1, 2, 3)
#' quantile(x, 1:5/5)
#' tab_quantiles(x, 1:5/5)
#' tab_quantiles(x, 1:5/5, use_ecdf = TRUE)
chop_quantiles <- function(
x,
probs,
Expand All @@ -50,10 +61,11 @@ chop_quantiles <- function(
lbl_intervals(single = NULL),
left = is.numeric(x),
raw = FALSE,
weights = NULL
weights = NULL,
use_ecdf = FALSE
) {
chop(x, brk_quantiles(probs, weights = weights), labels = labels, ...,
left = left, raw = raw)
chop(x, brk_quantiles(probs, weights = weights, use_ecdf = use_ecdf),
labels = labels, ..., left = left, raw = raw)
}


Expand Down
25 changes: 19 additions & 6 deletions man/chop_quantiles.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 6 additions & 1 deletion tests/testthat/test-breaks.R
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ test_that("brk_quantiles", {
})


test_that("brk_quantiles should warn on duplicate quantiles", {
test_that("brk_quantiles with duplicate quantiles", {
x <- rep(1, 5)
expect_warning(brks <- brk_quantiles(1:3/4)(x, FALSE, TRUE, FALSE))
expect_equivalent(c(brks), c(1, 1))
Expand All @@ -201,6 +201,11 @@ test_that("brk_quantiles should warn on duplicate quantiles", {
x <- c(1, 1, 1, 2, 3)
expect_warning(brks <- brk_quantiles(0:5/5)(x, FALSE, TRUE, FALSE))
expect_equivalent(c(brks), c(1.0, 1.0, 1.4, 2.2, 3.0))

expect_silent(
brks <- brk_quantiles(0:5/5, use_ecdf = TRUE)(x, FALSE, TRUE, FALSE)
)
expect_equivalent(c(brks), c(1.0, 1.0, 1.4, 2.2, 3.0))
})


Expand Down
6 changes: 6 additions & 0 deletions tests/testthat/test-chop.R
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,12 @@ test_that("chop_quantiles", {
factor(c("Q1", "Q1", "Q2", "Q3", "Q4", "Q4"))
)

x <- c(1, 1, 1, 2, 3)
expect_equivalent(
chop_quantiles(x, 1:4/5, use_ecdf = TRUE),
factor(c("[0%, 60%]", "[0%, 60%]", "[0%, 60%]", "[60%, 80%)", "[80%, 100%]"))
)

withr::local_options(lifecycle_verbosity = "quiet")
expect_equivalent(
chop_quantiles(1:6, c(.25, .5, .75), raw = TRUE),
Expand Down

0 comments on commit ff3ea47

Please sign in to comment.