Merge pull request #1 from ucsf-bhhi/sampling-tools

initial sampling tools
ucsf-bhhi · Feb 3, 2022 · 03a3549 · 03a3549
2 parents a4d13fe + c3fcfb5
commit 03a3549
Show file tree

Hide file tree

Showing 41 changed files with 1,125 additions and 60 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: statewide.survey.tools
 Title: Statewide Survey Tools
-Version: 0.1.1
+Version: 0.2.0
 Authors@R: 
     person("Eve", "Perry", , "[email protected]", role = c("aut", "cre"))
 Description: Helper tools for the Statewide Survey.
@@ -11,14 +11,24 @@ Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.1.2
 Imports: 
     airtabler (>= 0.1.6),
+    assertthat,
     dplyr,
+    glue,
+    httr,
+    jsonlite,
     lubridate,
     magrittr,
+    purrr,
     REDCapR,
+    rlang,
     RStata (>= 1.2.0),
-    tidyr
+    tidyr,
+    tidyselect
 Remotes: 
     eveyp/RStata,
     bergant/airtabler
 URL: https://github.com/ucsf-bhhi/statewide.survey.tools
 BugReports: https://github.com/ucsf-bhhi/statewide.survey.tools/issues
+Suggests: 
+    testthat (>= 3.0.0)
+Config/testthat/edition: 3
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,10 +1,19 @@
 # Generated by roxygen2: do not edit by hand
 
 export("%>%")
+export(adjust_new_sites)
+export(adjust_sheltered_unsheltered)
 export(connect_to_airtable)
+export(draw_sample)
 export(fetch_redcap_data)
 export(format_stata_date)
+export(insert_airtable_records)
+export(prepare_sample_for_airtable)
+export(pull_from_airtable)
 export(redcap_api_url)
 export(redcap_token)
 export(run_stata_cleaning)
+importFrom(assertthat,has_name)
 importFrom(magrittr,"%>%")
+importFrom(rlang,"!!")
+importFrom(rlang,abort)
diff --git a/R/adjust_new_sites.R b/R/adjust_new_sites.R
@@ -0,0 +1,33 @@
+#' Adjust New Sites' Sampling Weights
+#'
+#' Increases the sampling weights of new venues. New venues are ones have not
+#' been eligible to be selected in a previous sample.
+#'
+#' @param sampling_weight The name of the variable with the sampling weights.
+#' @param samples The name of the variable containing the samples the venue has been eligible for.
+#' @param new_venue_factor The scaling factor for new venues.
+#'
+#' @return A vector of adjusted sampling weights.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' venues %>%
+#'   mutate(sampling_weight = adjust_new_sites(sampling_weight, samples, 2))
+#' }
+adjust_new_sites = function(sampling_weight, samples, new_venue_factor) {
+  assertthat::assert_that(is.numeric(sampling_weight))
+
+  if (!is.list(samples))
+    abort("'samples' is not a list-column. Did you use the correct variable?")
+
+  if (length(sampling_weight) != length(samples))
+    abort("'sampling_weight' and 'samples' must be the same length.")
+
+  assertthat::assert_that(assertthat::is.number(new_venue_factor))
+
+  if (new_venue_factor <= 0)
+    abort("'new_venue_factor' must be positive.")
+
+  purrr::map2_dbl(sampling_weight, samples, ~ ifelse(is.null(.y), .x * new_venue_factor, .x))
+}
diff --git a/R/adjust_sheltered_unsheltered.R b/R/adjust_sheltered_unsheltered.R
@@ -0,0 +1,87 @@
+#' Adjust PEH Counts for Sheltered/Unsheltered Share
+#'
+#' Makes the adjustment to sampling weights so that the weighted average share
+#' of residential venue (ie. emergency shelters or encampments) PEH who are
+#' sheltered matches a target (usually from PIT counts). It's designed for use
+#' within a [dplyr::mutate()] pipeline when calculating the sampling weights.
+#'
+#' @param peh_count A numeric vector of venue PEH counts.
+#' @param site_category A character vector of venue site categories.
+#' @param sheltered_share The target sheltered share as a single numeric.
+#'
+#' @return A vector of adjusted PEH counts.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#'   venue_data %>%
+#'     mutate(
+#'       sampling_weight = adjust_sheltered_unsheltered(
+#'         final_peh_estimate,
+#'         site_category,
+#'         0.45
+#'       )
+#'     )
+#' }
+adjust_sheltered_unsheltered = function(
+  peh_count, site_category, sheltered_share
+) {
+  # check inputs
+  assertthat::assert_that(is.numeric(peh_count))
+  assertthat::assert_that(is.character(site_category))
+  assertthat::assert_that(
+    any(c("Emergency Shelter", "Encampment") %in% unique(site_category)),
+    msg = "'site_category' doesn't have any sheltered or unsheltered venues.
+    Is this the correct variable?"
+  )
+  assertthat::assert_that(assertthat::is.number(sheltered_share))
+  assertthat::assert_that(
+    sheltered_share >= 0 & sheltered_share <= 1,
+    msg = "'sheltered_share' must be between 0 and 1."
+  )
+
+  # turn site_category into a sheltered/unsheltered indicator
+  sheltered_indicator = sheltered_or_unsheltered(site_category)
+
+  # calculate observed sheltered share in the venue data
+  observed_sheltered_share = observed_sheltered_share(
+    sheltered_indicator,
+    peh_count
+  )
+
+  # calculate the adjuster
+  adjuster = calculate_adjuster(sheltered_share, observed_sheltered_share)
+
+  # apply the adjuster
+  purrr::map2_dbl(sheltered_indicator, peh_count, adjust_count, adjuster)
+}
+
+sheltered_or_unsheltered = function(site_category) {
+  dplyr::case_when(
+    site_category == "Emergency Shelter" ~ "Sheltered",
+    site_category == "Encampment" ~ "Unsheltered",
+    TRUE ~ NA_character_
+  )
+}
+
+observed_sheltered_share = function(sheltered_indicator, peh_count) {
+  sheltered = sum(
+    peh_count[sheltered_indicator == "Sheltered"],
+    na.rm = TRUE
+    )
+  sheltered_or_unsheltered = sum(peh_count[!is.na(sheltered_indicator)])
+
+  return(sheltered / sheltered_or_unsheltered)
+}
+
+calculate_adjuster = function(sheltered_share, observed_sheltered_share) {
+  sheltered_share / observed_sheltered_share
+}
+
+adjust_count = function(sheltered_indicator, peh_count, adjuster) {
+  dplyr::case_when(
+    sheltered_indicator == "Sheltered" ~ peh_count * adjuster,
+    sheltered_indicator == "Unsheltered" ~ peh_count * 1 / adjuster,
+    TRUE ~ as.numeric(peh_count)
+  )
+}
diff --git a/R/airtable.R → R/connect_to_airtable.R b/R/airtable.R → R/connect_to_airtable.R
diff --git a/R/draw_sample.R b/R/draw_sample.R
@@ -0,0 +1,116 @@
+#' Draw Venue Sample
+#'
+#' Draws a weighted, without replacement sample of venues. Supports both
+#' stratified and unstratified samples.
+#'
+#' @param data Data frame with venues.
+#' @param weights Name of the weighting variable.
+#' @param n Either a single number of venues to select, or a data frame with
+#'   stratification plan (see below).
+#'
+#' @section Stratification Plan:
+#' To draw a stratified sample, include a data frame with the stratification
+#' plan.
+#'
+#' The data frame should include a row for each stratum with the values of
+#' the stratification variables and the number of venues to sample. The
+#' stratification variables must have the same name and coding as in the
+#' venue data. The variable with the the number of venues to sample must be
+#' named `n`.
+#'
+#' For example:
+#'
+#' | **dv_shelter** | **zone**       | **n** |
+#' | -------------- | -------------- | ----- |
+#' | TRUE           | Eastern Placer | 1     |
+#' | TRUE           | Central Placer | 2     |
+#' | TRUE           | South Placer   | 3     |
+#' | FALSE          | Eastern Placer | 4     |
+#' | FALSE          | Central Placer | 5     |
+#' | FALSE          | South Placer   | 6     |
+#'
+#' This can be generated with the following R code:
+#' ```
+#' tibble::tribble(
+#'   ~dv_shelter, ~zone           , ~n,
+#'    TRUE      , "Eastern Placer",  1,
+#'    TRUE      , "Central Placer",  2,
+#'    TRUE      , "South Placer"  ,  3,
+#'    FALSE     , "Eastern Placer",  4,
+#'    FALSE     , "Central Placer",  5,
+#'    FALSE     , "South Placer"  ,  6
+#' )
+#' ```
+#'
+#' @return The venue data frame with the variable `sampled` added, which
+#'   indicates whether the venue was selected.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#'   draw_sample(venues, sampling_weight, 5)
+#'
+#'   strata = tibble::tribble(
+#'     ~zone          , ~n,
+#'     "Eastern Placer",  1,
+#'     "Central Placer",  3,
+#'     "South Placer"  ,  5
+#'   )
+#'   draw_sample(venues, sampling_weight, strata)
+#' }
+draw_sample = function(data, weights, n) {
+  # make sure the main df doesn't already have a variable called sampled
+  if (assertthat::has_name(data, "sampled"))
+    abort("Venue data already has a variable named 'sampled'. Please rename or remove this variable.")
+
+  # make sure that the main df has the weights variable
+  weighting_var_name = rlang::as_string(rlang::ensym(weights))
+  if (!assertthat::has_name(data, weighting_var_name)) {
+    abort(glue::glue("Venue data is missing the weighting variable: '{weighting_var_name}'."))
+  }
+
+  n_int = assertthat::is.count(n)
+  n_df = is.data.frame(n)
+
+  if (!(n_int | n_df))
+    abort("n must be a single positive integer or a data frame with strata.")
+  data = dplyr::mutate(data, .id = dplyr::row_number())
+
+  if (n_df) {
+    # make sure the strata df has a column named n
+    if (!assertthat::has_name(n, "n"))
+      abort("Strata data frame is missing 'n' column.")
+
+    # make sure the main df has all of the stratification variables
+    strata_vars = names(n)[names(n) != "n"]
+    purrr::walk(
+      strata_vars,
+      ~ if ((!assertthat::has_name(data, .x)))
+        abort(
+          glue::glue(
+            "Venue data is missing the stratification variable: '{.x}'."
+          )
+        )
+    )
+
+    results = vector("list", nrow(n))
+    for (i in seq_len(nrow(n))) {
+      results[[i]] = dplyr::semi_join(
+        data,
+        n[i,],
+        by = strata_vars
+      ) %>%
+        dplyr::slice_sample(n = n$n[i], weight_by = {{ weights }}) %>%
+        dplyr::pull(.id)
+    }
+    ids = purrr::flatten_chr(results)
+  }
+
+  if (n_int) {
+    ids = dplyr::slice_sample(data, n = n, weight_by = {{ weights }}) %>%
+      dplyr::pull(.id)
+  }
+
+  dplyr::mutate(data, sampled = .id %in% ids) %>%
+    dplyr::select(-.id)
+}
diff --git a/R/fetch_redcap_data.R b/R/fetch_redcap_data.R
@@ -0,0 +1,32 @@
+#' Downloads REDCap Data
+#'
+#' Downloads data from REDCap by wrapping [REDCapR::redcap_read()]. The API URL
+#' is filled automatically, but can be overridden. The token for the main
+#' project is also filled automatically, but can be overridden or replaced with
+#' the RDS project token.
+#'
+#' @param ... Options passed to [REDCapR::redcap_read()].
+#' @param redcap_uri REDCap API URL. Defaults to [redcap_api_url()].
+#' @param token REDCap API Token. Defaults to main project token. Use RDS
+#'   project token with `redcap_token("rds")`.
+#' @param verbose Should messages be printed to the R console during the
+#'   operation. The verbose output might contain sensitive information (e.g.
+#'   PHI), so turn this off if the output might be visible somewhere public.
+#'
+#' @return A tibble with the requested data.
+#' @export
+#' @seealso [redcap_token()]
+fetch_redcap_data = function(
+  ...,
+  redcap_uri = redcap_api_url(),
+  token = redcap_token(),
+  verbose = FALSE
+) {
+  REDCapR::redcap_read_oneshot(
+    ...,
+    redcap_uri = redcap_uri,
+    token = token,
+    verbose = verbose
+  )$data %>%
+    dplyr::as_tibble()
+}
diff --git a/R/format_stata_date.R b/R/format_stata_date.R
@@ -0,0 +1,15 @@
+#' Convert a Stata date to text
+#'
+#' Converts a Stata date to a character string.
+#'
+#' @param date Stata date.
+#' @param format Format string for character representation. See
+#'   [base::strptime()] for options. Defaults to formatting like January 1,
+#'   2022.
+#'
+#' @return A character string with the formatted date.
+#' @export
+format_stata_date = function(date, format = "%B %e, %Y") {
+  lubridate::dmy(date) %>%
+    as.character(format)
+}