From 326a4082329c1893069325dd3524a70b86653995 Mon Sep 17 00:00:00 2001 From: David Hugh-Jones Date: Thu, 20 Jun 2024 12:05:06 +0100 Subject: [PATCH] Add `exclude_spikes` parameter to `dissect()`. --- R/chop-isolates.R | 53 +++++++++++++++++++++++++------------- man/dissect.Rd | 33 +++++++++++++++++------- tests/testthat/test-chop.R | 9 +++++++ 3 files changed, 68 insertions(+), 27 deletions(-) diff --git a/R/chop-isolates.R b/R/chop-isolates.R index eb28f41..394257f 100644 --- a/R/chop-isolates.R +++ b/R/chop-isolates.R @@ -40,18 +40,20 @@ chop_spikes <- function ( } -#' Cut data into intervals, then separate out common values +#' Cut data into intervals, separating out common values #' #' Sometimes it's useful to separate out common elements of `x`. -#' `dissect()` first chops `x`, then puts common elements of `x` ("spikes") +#' `dissect()` chops `x`, but puts common elements of `x` ("spikes") #' into separate categories. #' #' Unlike [chop_spikes()], `dissect()` doesn't break up -#' intervals which contain a spike. As a result, unlike other `chop_*` functions, +#' intervals which contain a spike. As a result, unlike `chop_*` functions, #' `dissect()` does not chop `x` into disjoint intervals. See the examples. #' #' If breaks are data-dependent, their labels may be misleading after common -#' elements have been removed. See the example below. +#' elements have been removed. See the example below. To get round this, +#' set `exclude_spikes` to `TRUE`. Then breaks will be calculated after +#' removing spikes from the data. #' #' Levels of the result are ordered by the minimum element in each level. As #' a result, if `drop = FALSE`, empty levels will be placed last. @@ -60,8 +62,10 @@ chop_spikes <- function ( #' #' @param x,breaks,... Passed to [chop()]. #' @inheritParams chop_spikes -#' @param spike_labels Glue string for spike labels. Use `"{l}"` for the spike -#' value. +#' @param spike_labels [Glue][glue::glue()] string for spike labels. Use `"{l}"` +#' for the spike value. +#' @param exclude_spikes Logical. Exclude spikes before chopping `x`? This +#' can affect the location of data-dependent breaks. #' #' @return #' `dissect()` returns the result of [chop()], but with common values put into @@ -89,18 +93,23 @@ chop_spikes <- function ( #' # Misleading data-dependent breaks: #' set.seed(42) #' x <- rnorm(99) -#' x[1:10] <- x[1] +#' x[1:9] <- x[1] #' tab_quantiles(x, 1:2/3) -#' tab_dissect(x, brk_quantiles(1:2/3), prop = 0.1) +#' tab_dissect(x, brk_quantiles(1:2/3), n = 9) +#' # Calculate quantiles excluding spikes: +#' tab_dissect(x, brk_quantiles(1:2/3), n = 9, exclude_spikes = TRUE) dissect <- function (x, - breaks, - ..., - n = NULL, - prop = NULL, - spike_labels = "{{{l}}}") { + breaks, + ..., + n = NULL, + prop = NULL, + spike_labels = "{{{l}}}", + exclude_spikes = FALSE) { assert_that( is.number(n) || is.number(prop), is.null(n) || is.null(prop), + is.string(spike_labels), + is.flag(exclude_spikes), msg = "exactly one of `n` and `prop` must be a scalar numeric" ) assert_that( @@ -108,9 +117,20 @@ dissect <- function (x, n >= 0 || prop >= 0 ) - chopped <- chop(x, breaks, ...) - spikes <- find_spikes(x, n, prop) + x_spikes <- match(x, spikes) + is_spike <- ! is.na(x_spikes) + x_spikes <- x_spikes[is_spike] + + if (exclude_spikes) { + x_not_spikes <- x[! is_spike] + chopped_not_spikes <- chop(x_not_spikes, breaks, ...) + chopped <- factor(rep(NA_integer_, length(x)), + levels = levels(chopped_not_spikes)) + chopped[! is_spike] <- chopped_not_spikes + } else { + chopped <- chop(x, breaks, ...) + } elabels <- endpoint_labels(spikes, raw = TRUE) glue_env <- new.env() @@ -120,9 +140,6 @@ dissect <- function (x, new_levels <- c(levels(chopped), spike_labels) levels(chopped) <- new_levels - x_spikes <- match(x, spikes) - is_spike <- ! is.na(x_spikes) - x_spikes <- x_spikes[is_spike] chopped[is_spike] <- spike_labels[x_spikes] # We reorder the levels of chopped in order of their smallest elements. diff --git a/man/dissect.Rd b/man/dissect.Rd index 411476a..eaca60d 100644 --- a/man/dissect.Rd +++ b/man/dissect.Rd @@ -3,9 +3,17 @@ \name{dissect} \alias{dissect} \alias{tab_dissect} -\title{Cut data into intervals, then separate out common values} +\title{Cut data into intervals, separating out common values} \usage{ -dissect(x, breaks, ..., n = NULL, prop = NULL, spike_labels = "{{{l}}}") +dissect( + x, + breaks, + ..., + n = NULL, + prop = NULL, + spike_labels = "{{{l}}}", + exclude_spikes = FALSE +) tab_dissect(x, breaks, ..., n = NULL, prop = NULL) } @@ -16,8 +24,11 @@ tab_dissect(x, breaks, ..., n = NULL, prop = NULL) a proportion of \code{length(x)}. Values of \code{x} which occur at least this often will get their own singleton break.} -\item{spike_labels}{Glue string for spike labels. Use \code{"{l}"} for the spike -value.} +\item{spike_labels}{\link[glue:glue]{Glue} string for spike labels. Use \code{"{l}"} +for the spike value.} + +\item{exclude_spikes}{Logical. Exclude spikes before chopping \code{x}? This +can affect the location of data-dependent breaks.} } \value{ \code{dissect()} returns the result of \code{\link[=chop]{chop()}}, but with common values put into @@ -27,16 +38,18 @@ separate factor levels. } \description{ Sometimes it's useful to separate out common elements of \code{x}. -\code{dissect()} first chops \code{x}, then puts common elements of \code{x} ("spikes") +\code{dissect()} chops \code{x}, but puts common elements of \code{x} ("spikes") into separate categories. } \details{ Unlike \code{\link[=chop_spikes]{chop_spikes()}}, \code{dissect()} doesn't break up -intervals which contain a spike. As a result, unlike other \verb{chop_*} functions, +intervals which contain a spike. As a result, unlike \verb{chop_*} functions, \code{dissect()} does not chop \code{x} into disjoint intervals. See the examples. If breaks are data-dependent, their labels may be misleading after common -elements have been removed. See the example below. +elements have been removed. See the example below. To get round this, +set \code{exclude_spikes} to \code{TRUE}. Then breaks will be calculated after +removing spikes from the data. Levels of the result are ordered by the minimum element in each level. As a result, if \code{drop = FALSE}, empty levels will be placed last. @@ -59,9 +72,11 @@ tab_spikes(x, brk_width(2, 0), prop = 0.05) # Misleading data-dependent breaks: set.seed(42) x <- rnorm(99) -x[1:10] <- x[1] +x[1:9] <- x[1] tab_quantiles(x, 1:2/3) -tab_dissect(x, brk_quantiles(1:2/3), prop = 0.1) +tab_dissect(x, brk_quantiles(1:2/3), n = 9) +# Calculate quantiles excluding spikes: +tab_dissect(x, brk_quantiles(1:2/3), n = 9, exclude_spikes = TRUE) } \seealso{ \code{\link[=chop_spikes]{chop_spikes()}} for a different approach. diff --git a/tests/testthat/test-chop.R b/tests/testthat/test-chop.R index 20188b8..ce73f6b 100644 --- a/tests/testthat/test-chop.R +++ b/tests/testthat/test-chop.R @@ -383,6 +383,15 @@ test_that("dissect", { expect_silent(res2 <- dissect(x, breaks = c(2, 5), prop = 0.25)) expect_equivalent(res, res2) + + x <- c(1, 2, 3, 4, 5, 5, 5, 5) + expect_silent(res3 <- dissect(x, breaks = brk_equally(2), n = 2, + exclude_spikes = TRUE)) + expect_equivalent( + res3, + factor(c("[0%, 50%)", "[0%, 50%)", "[50%, 100%]", "[50%, 100%]", + rep("{5}", 4))) + ) })