Add PRISM categorization #31

Draft
wants to merge 19 commits into base: main
7 changes: 5 additions & 2 deletions DESCRIPTION
@@ -20,6 +20,7 @@ Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.2
Imports:
checkmate,
cli,
copula,
dplyr,
@@ -51,9 +52,10 @@ Suggests:
tidyr,
roxygen2,
usethis,
testthat,
testthat (>= 3.0.0),
rcmdcheck,
httptest
httptest,
withr
VignetteBuilder:
knitr
URL: https://cdcgov.github.io/forecasttools, https://github.com/CDCgov/forecasttools
@@ -63,3 +65,4 @@ LazyData: true
Remotes:
hubverse-org/hubData
BugReports: https://github.com/CDCgov/forecasttools/issues
Config/testthat/edition: 3
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -1,6 +1,7 @@
# Generated by roxygen2: do not edit by hand

export(bottom_up_aggregation)
export(categorize_vector)
export(copula2tbl)
export(count_trajectories)
export(create_table_for_scoring)
@@ -13,6 +14,7 @@ export(gather_hub_forecast_data)
export(gather_location_data)
export(gather_target_data)
export(get_hubverse_table)
export(get_prism_cutpoints)
export(inferencedata_to_tidy_draws)
export(location_lookup)
export(nhsn_soda_query)
75 changes: 75 additions & 0 deletions R/categorize_prism.R
@@ -0,0 +1,75 @@
#' Get PRISM activity level cutpoints for given
#' diseases and locations
#'
#' @param locations location(s) for which to return the cutpoints.
#' A two-letter location abbreviation as in the `short_name`
#' column of [forecasttools::us_location_table], or a vector
#' of those abbreviations.
#' @param diseases disease(s) for which to return the cutpoints.
#' One of `"ARI"`, `"COVID-19"`, `"Influenza"`, or `"RSV"`, or
#' a vector of those values.
#' @return The cutpoints, as a list of numeric vectors, one per
#' location-disease pair, in the order given.
#'
#' @examples
#' get_prism_cutpoints("WA", "Influenza")
#'
#' get_prism_cutpoints(c("US", "WA"), "COVID-19")
#'
#' get_prism_cutpoints(c("US", "WA"), c("ARI", "RSV"))
#' @export
get_prism_cutpoints <- function(locations, diseases) {
checkmate::assert_names(
diseases,
subset.of =
dimnames(forecasttools::prism_thresholds)$disease,
what = "disease"
)
checkmate::assert_names(
locations,
subset.of =
dimnames(forecasttools::prism_thresholds)$location,
what = "location"
)

return(purrr::map2(locations, diseases, \(x, y) {
forecasttools::prism_thresholds[x, y, ]
}))
}
Comment on lines +34 to +37

Collaborator Author:
This feels inefficient, but equally it's readable, and premature optimization is bad.

Contributor:
I can't find any way to avoid it because of how R interprets array slicing.
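A minimal sketch of the array-slicing behavior mentioned above (illustrative, not part of the diff; it assumes `forecasttools::prism_thresholds` has the documented `location x disease x breaks` layout). Subsetting with both vectors at once returns the full cross product of locations and diseases, so pairwise selection needs an explicit iteration such as the `purrr::map2()` call above:

```r
arr <- forecasttools::prism_thresholds

## indexing with two vectors yields every location-disease combination:
## a 2 x 2 x 6 sub-array rather than two break vectors
dim(arr[c("US", "WA"), c("ARI", "RSV"), ])

## pairwise selection, (US, ARI) and (WA, RSV), via purrr::map2():
## a list of two length-6 break vectors
purrr::map2(
  c("US", "WA"),
  c("ARI", "RSV"),
  \(loc, dis) arr[loc, dis, ]
)
```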




#' Categorize a vector of values into PRISM
#' activity level bins.
#'
#' @param values values to categorize
#' @param locations vector of locations equal in length to
#' `values`, or a single location to use for all `values`.
#' @param diseases vector of diseases equal in length to
#' `values`, or a single disease to use for all `values`.
#' @param prism_bin_names Bin names for the PRISM bins,
#' in order from lowest to highest. Must be a vector of
#' length 5. `list(prism_bin_names)` will be passed as the
#' `label_sets` argument to [categorize_vector()].
#' Defaults to the standard PRISM bin names in title case:
#' `c("Very Low", "Low", "Moderate", "High", "Very High")`.
#' @return A factor vector of categories, equal in length to
#' `values`, as returned by [categorize_vector()].
#' @export
categorize_prism <- function(values,
locations,
diseases,
prism_bin_names = c(
"Very Low",
"Low",
"Moderate",
"High",
"Very High"
)) {
cutpoints <- get_prism_cutpoints(locations, diseases)

return(categorize_vector(
values,
break_sets = cutpoints,
label_sets = list(prism_bin_names)
))
}
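A hypothetical usage sketch of `categorize_prism()` (the input values are made up, and the resulting bins depend on the bundled Washington influenza cutpoints in `prism_thresholds`):

```r
categorize_prism(
  values = c(0.01, 0.08, 0.25),  # proportions between 0 and 1
  locations = "WA",
  diseases = "Influenza"
)
## a length-3 factor drawing its levels from the five PRISM bins,
## "Very Low" through "Very High"; which bin each value falls in
## depends on the WA influenza cutpoints
```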
58 changes: 58 additions & 0 deletions R/categorize_vector.R
@@ -0,0 +1,58 @@
#' Categorize entries in a numeric vector according
#' to a list of breakpoints and labels.
#'
#' Vectorized version of [base::cut()] that
#' allows each entry in the vector to use different
#' break points and labels.
#'
#' @param values Vector of values to categorize.
#' @param break_sets sets of category breakpoints,
#' with one set for each entry of `values`, as a list of
#' numeric vectors. Alternatively, a single set of category
#' breakpoints to use for all values, as a one-entry list
#' containing a single numeric vector.
#' @param label_sets sets of labels to associate
#' with the corresponding breakpoints in `break_sets`,
#' with one set for each entry of `values`, as a list of
#' character vectors. Alternatively, a single set of labels
#' to use for all values, as a one-entry list containing a
#' single character vector.
#' @param include.lowest Passed to [base::cut()]. Default `TRUE`.
#' @param order Passed to [base::cut()] as its `ordered_result` argument. Default `TRUE`.
#' @param right Passed to [base::cut()]. Default `TRUE`.
#' @param ... Additional keyword arguments passed to
#' [base::cut()].
#' @return A categorized vector, as a vector of ordered
#' factors.
#' @export
categorize_vector <- function(values,
break_sets,
label_sets,
include.lowest = TRUE, # nolint
order = TRUE,
right = TRUE,
...) {
n_vals <- length(values)

checkmate::assert_list(break_sets,
types = "numeric"
)
checkmate::assert_list(label_sets,
types = "character"
)
checkmate::qassert(break_sets, c("l1", glue::glue("l{n_vals}")))
checkmate::qassert(label_sets, c("l1", glue::glue("l{n_vals}")))

return(purrr::pmap_vec(
list(
x = values,
breaks = break_sets,
labels = label_sets,
include.lowest = include.lowest,
ordered_result = order,
right = right,
...
),
cut
))
}
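A hypothetical usage sketch of `categorize_vector()` (names and values are illustrative): each value is cut against its own break set, while a single label set, supplied as a one-entry list per the documentation above, is recycled across both values.

```r
categorize_vector(
  values = c(5, 50),
  break_sets = list(c(0, 10, 100), c(0, 40, 100)),
  label_sets = list(c("low", "high"))
)
## the first value is cut against breaks c(0, 10, 100) and falls in "low";
## the second is cut against c(0, 40, 100) and falls in "high"
```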
22 changes: 22 additions & 0 deletions R/data.R
@@ -41,3 +41,25 @@
#' A data frame with 40 rows and 20 columns:
#' @source <data-raw/ex_inferencedata_dataframe.R>
"ex_inferencedata_dataframe"

#' PRISM respiratory virus activity level thresholds
#'
#' A multi-dimensional array with PRISM
#' respiratory virus activity level thresholds.
#' Dimensions, in order, are `location`,
#' `disease`, and `breaks`.
#'
#' Values of `disease` are `Influenza`, `COVID-19`,
#' `RSV`, and `ARI` (acute respiratory infections).
#'
#' Values of `breaks` are `prop_lower_bound`,
#' `prop_low`, `prop_moderate`, `prop_high`,
#' `prop_very_high`, and `prop_upper_bound`.
#'
#' Values of `location` are US jurisdictions
#' and the United States as a whole, using
#' USPS two-letter codes (the values of `short_name`
#' in [us_location_table]).
#'
#' @source <data-raw/prism_thresholds.R>
"prism_thresholds"
53 changes: 53 additions & 0 deletions data-raw/prism_thresholds.R
@@ -0,0 +1,53 @@
#' When there is an appropriate online endpoint for these
#' thresholds, this script will be updated to point there.
#' For now, it reads from a bundled .tsv in inst/extdata.

thresholds <- readr::read_tsv(
fs::path(
"inst",
"extdata",
"prism_thresholds",
ext = "tsv"
),
show_col_types = FALSE
)

## Transform thresholds from percentage to proportion
prop_thresholds <- thresholds |>
dplyr::transmute(
disease,
location = state_abb,
prop_lower_bound = 0,
prop_low = perc_level_low / 100,
prop_moderate = perc_level_moderate / 100,
prop_high = perc_level_high / 100,
prop_very_high = perc_level_very_high / 100,
prop_upper_bound = 1
)

#' Transform thresholds from flat table to
#' multi-dimensional array, via a nested list.
#'
#' Method for conversion to multi-dim array:
#' 1. Transform the long-form tabular data to a
#' nested named list (via nest() and deframe())
#' 2. Transform the nested named list to a multi-dimensional
#' array with dimension names (via simplify2array() and unlist())
#' 3. Order and name the dimensions of that array.

thresholds_nested_list <- prop_thresholds |>
tidyr::nest(breaks = dplyr::starts_with("prop_")) |>
tidyr::nest(loc_breaks = c(location, breaks)) |>
tibble::deframe() |>
purrr::map(tibble::deframe) # yields a nested list of tibbles

prism_thresholds <- thresholds_nested_list |>
simplify2array() |> # yields a 2D array (location x disease) of one-row tibbles
apply(1:2, unlist) |> # yields a 3D array
aperm(c(2, 3, 1)) # orders 3D array dimensions as desired

names(dimnames(prism_thresholds)) <- c("location", "disease", "breaks")

usethis::use_data(prism_thresholds,
overwrite = TRUE
)
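A toy sketch of the flat-table-to-array conversion described in the comments above (the diseases, locations, and values are made up; only the shape of the pipeline matters):

```r
toy <- tibble::tibble(
  disease   = c("A", "A", "B", "B"),
  location  = c("US", "WA", "US", "WA"),
  prop_low  = c(0.1, 0.2, 0.3, 0.4),
  prop_high = c(0.5, 0.6, 0.7, 0.8)
)

toy |>
  tidyr::nest(breaks = dplyr::starts_with("prop_")) |>
  tidyr::nest(loc_breaks = c(location, breaks)) |>
  tibble::deframe() |>
  purrr::map(tibble::deframe) |>
  simplify2array() |>
  apply(1:2, unlist) |>
  aperm(c(2, 3, 1))
## a 2 x 2 x 2 array indexed location x disease x breaks,
## e.g. ["WA", "B", "prop_high"] is 0.8
```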
Binary file added data/prism_thresholds.rda
Binary file not shown.