Skip to content

Commit

Permalink
Merge pull request #1 from ucsf-bhhi/sampling-tools
Browse files Browse the repository at this point in the history
initial sampling tools
  • Loading branch information
eveyp authored Feb 3, 2022
2 parents a4d13fe + c3fcfb5 commit 03a3549
Show file tree
Hide file tree
Showing 41 changed files with 1,125 additions and 60 deletions.
14 changes: 12 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: statewide.survey.tools
Title: Statewide Survey Tools
Version: 0.1.1
Version: 0.2.0
Authors@R:
person("Eve", "Perry", , "[email protected]", role = c("aut", "cre"))
Description: Helper tools for the Statewide Survey.
Expand All @@ -11,14 +11,24 @@ Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.2
Imports:
airtabler (>= 0.1.6),
assertthat,
dplyr,
glue,
httr,
jsonlite,
lubridate,
magrittr,
purrr,
REDCapR,
rlang,
RStata (>= 1.2.0),
tidyr
tidyr,
tidyselect
Remotes:
eveyp/RStata,
bergant/airtabler
URL: https://github.com/ucsf-bhhi/statewide.survey.tools
BugReports: https://github.com/ucsf-bhhi/statewide.survey.tools/issues
Suggests:
testthat (>= 3.0.0)
Config/testthat/edition: 3
9 changes: 9 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
# Generated by roxygen2: do not edit by hand

export("%>%")
export(adjust_new_sites)
export(adjust_sheltered_unsheltered)
export(connect_to_airtable)
export(draw_sample)
export(fetch_redcap_data)
export(format_stata_date)
export(insert_airtable_records)
export(prepare_sample_for_airtable)
export(pull_from_airtable)
export(redcap_api_url)
export(redcap_token)
export(run_stata_cleaning)
importFrom(assertthat,has_name)
importFrom(magrittr,"%>%")
importFrom(rlang,"!!")
importFrom(rlang,abort)
33 changes: 33 additions & 0 deletions R/adjust_new_sites.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#' Adjust New Sites' Sampling Weights
#'
#' Increases the sampling weights of new venues. New venues are ones have not
#' been eligible to be selected in a previous sample.
#'
#' @param sampling_weight The name of the variable with the sampling weights.
#' @param samples The name of the variable containing the samples the venue has been eligible for.
#' @param new_venue_factor The scaling factor for new venues.
#'
#' @return A vector of adjusted sampling weights.
#' @export
#'
#' @examples
#' \dontrun{
#' venues %>%
#' mutate(sampling_weight = adjust_new_sites(sampling_weight, samples, 2))
#' }
adjust_new_sites = function(sampling_weight, samples, new_venue_factor) {
assertthat::assert_that(is.numeric(sampling_weight))

if (!is.list(samples))
abort("'samples' is not a list-column. Did you use the correct variable?")

if (length(sampling_weight) != length(samples))
abort("'sampling_weight' and 'samples' must be the same length.")

assertthat::assert_that(assertthat::is.number(new_venue_factor))

if (new_venue_factor <= 0)
abort("'new_venue_factor' must be positive.")

purrr::map2_dbl(sampling_weight, samples, ~ ifelse(is.null(.y), .x * new_venue_factor, .x))
}
87 changes: 87 additions & 0 deletions R/adjust_sheltered_unsheltered.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#' Adjust PEH Counts for Sheltered/Unsheltered Share
#'
#' Makes the adjustment to sampling weights so that the weighted average share
#' of residential venue (ie. emergency shelters or encampments) PEH who are
#' sheltered matches a target (usually from PIT counts). It's designed for use
#' within a [dplyr::mutate()] pipeline when calculating the sampling weights.
#'
#' @param peh_count A numeric vector of venue PEH counts.
#' @param site_category A character vector of venue site categories.
#' @param sheltered_share The target sheltered share as a single numeric.
#'
#' @return A vector of adjusted PEH counts.
#' @export
#'
#' @examples
#' \dontrun{
#' venue_data %>%
#' mutate(
#' sampling_weight = adjust_sheltered_unsheltered(
#' final_peh_estimate,
#' site_category,
#' 0.45
#' )
#' )
#' }
adjust_sheltered_unsheltered = function(
peh_count, site_category, sheltered_share
) {
# check inputs
assertthat::assert_that(is.numeric(peh_count))
assertthat::assert_that(is.character(site_category))
assertthat::assert_that(
any(c("Emergency Shelter", "Encampment") %in% unique(site_category)),
msg = "'site_category' doesn't have any sheltered or unsheltered venues.
Is this the correct variable?"
)
assertthat::assert_that(assertthat::is.number(sheltered_share))
assertthat::assert_that(
sheltered_share >= 0 & sheltered_share <= 1,
msg = "'sheltered_share' must be between 0 and 1."
)

# turn site_category into a sheltered/unsheltered indicator
sheltered_indicator = sheltered_or_unsheltered(site_category)

# calculate observed sheltered share in the venue data
observed_sheltered_share = observed_sheltered_share(
sheltered_indicator,
peh_count
)

# calculate the adjuster
adjuster = calculate_adjuster(sheltered_share, observed_sheltered_share)

# apply the adjuster
purrr::map2_dbl(sheltered_indicator, peh_count, adjust_count, adjuster)
}

sheltered_or_unsheltered = function(site_category) {
dplyr::case_when(
site_category == "Emergency Shelter" ~ "Sheltered",
site_category == "Encampment" ~ "Unsheltered",
TRUE ~ NA_character_
)
}

observed_sheltered_share = function(sheltered_indicator, peh_count) {
sheltered = sum(
peh_count[sheltered_indicator == "Sheltered"],
na.rm = TRUE
)
sheltered_or_unsheltered = sum(peh_count[!is.na(sheltered_indicator)])

return(sheltered / sheltered_or_unsheltered)
}

calculate_adjuster = function(sheltered_share, observed_sheltered_share) {
sheltered_share / observed_sheltered_share
}

adjust_count = function(sheltered_indicator, peh_count, adjuster) {
dplyr::case_when(
sheltered_indicator == "Sheltered" ~ peh_count * adjuster,
sheltered_indicator == "Unsheltered" ~ peh_count * 1 / adjuster,
TRUE ~ as.numeric(peh_count)
)
}
File renamed without changes.
116 changes: 116 additions & 0 deletions R/draw_sample.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#' Draw Venue Sample
#'
#' Draws a weighted, without replacement sample of venues. Supports both
#' stratified and unstratified samples.
#'
#' @param data Data frame with venues.
#' @param weights Name of the weighting variable.
#' @param n Either a single number of venues to select, or a data frame with
#' stratification plan (see below).
#'
#' @section Stratification Plan:
#' To draw a stratified sample, include a data frame with the stratification
#' plan.
#'
#' The data frame should include a row for each stratum with the values of
#' the stratification variables and the number of venues to sample. The
#' stratification variables must have the same name and coding as in the
#' venue data. The variable with the the number of venues to sample must be
#' named `n`.
#'
#' For example:
#'
#' | **dv_shelter** | **zone** | **n** |
#' | -------------- | -------------- | ----- |
#' | TRUE | Eastern Placer | 1 |
#' | TRUE | Central Placer | 2 |
#' | TRUE | South Placer | 3 |
#' | FALSE | Eastern Placer | 4 |
#' | FALSE | Central Placer | 5 |
#' | FALSE | South Placer | 6 |
#'
#' This can be generated with the following R code:
#' ```
#' tibble::tribble(
#' ~dv_shelter, ~zone , ~n,
#' TRUE , "Eastern Placer", 1,
#' TRUE , "Central Placer", 2,
#' TRUE , "South Placer" , 3,
#' FALSE , "Eastern Placer", 4,
#' FALSE , "Central Placer", 5,
#' FALSE , "South Placer" , 6
#' )
#' ```
#'
#' @return The venue data frame with the variable `sampled` added, which
#' indicates whether the venue was selected.
#' @export
#'
#' @examples
#' \dontrun{
#' draw_sample(venues, sampling_weight, 5)
#'
#' strata = tibble::tribble(
#' ~zone , ~n,
#' "Eastern Placer", 1,
#' "Central Placer", 3,
#' "South Placer" , 5
#' )
#' draw_sample(venues, sampling_weight, strata)
#' }
draw_sample = function(data, weights, n) {
# make sure the main df doesn't already have a variable called sampled
if (assertthat::has_name(data, "sampled"))
abort("Venue data already has a variable named 'sampled'. Please rename or remove this variable.")

# make sure that the main df has the weights variable
weighting_var_name = rlang::as_string(rlang::ensym(weights))
if (!assertthat::has_name(data, weighting_var_name)) {
abort(glue::glue("Venue data is missing the weighting variable: '{weighting_var_name}'."))
}

n_int = assertthat::is.count(n)
n_df = is.data.frame(n)

if (!(n_int | n_df))
abort("n must be a single positive integer or a data frame with strata.")
data = dplyr::mutate(data, .id = dplyr::row_number())

if (n_df) {
# make sure the strata df has a column named n
if (!assertthat::has_name(n, "n"))
abort("Strata data frame is missing 'n' column.")

# make sure the main df has all of the stratification variables
strata_vars = names(n)[names(n) != "n"]
purrr::walk(
strata_vars,
~ if ((!assertthat::has_name(data, .x)))
abort(
glue::glue(
"Venue data is missing the stratification variable: '{.x}'."
)
)
)

results = vector("list", nrow(n))
for (i in seq_len(nrow(n))) {
results[[i]] = dplyr::semi_join(
data,
n[i,],
by = strata_vars
) %>%
dplyr::slice_sample(n = n$n[i], weight_by = {{ weights }}) %>%
dplyr::pull(.id)
}
ids = purrr::flatten_chr(results)
}

if (n_int) {
ids = dplyr::slice_sample(data, n = n, weight_by = {{ weights }}) %>%
dplyr::pull(.id)
}

dplyr::mutate(data, sampled = .id %in% ids) %>%
dplyr::select(-.id)
}
32 changes: 32 additions & 0 deletions R/fetch_redcap_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#' Downloads REDCap Data
#'
#' Downloads data from REDCap by wrapping [REDCapR::redcap_read()]. The API URL
#' is filled automatically, but can be overridden. The token for the main
#' project is also filled automatically, but can be overridden or replaced with
#' the RDS project token.
#'
#' @param ... Options passed to [REDCapR::redcap_read()].
#' @param redcap_uri REDCap API URL. Defaults to [redcap_api_url()].
#' @param token REDCap API Token. Defaults to main project token. Use RDS
#' project token with `redcap_token("rds")`.
#' @param verbose Should messages be printed to the R console during the
#' operation. The verbose output might contain sensitive information (e.g.
#' PHI), so turn this off if the output might be visible somewhere public.
#'
#' @return A tibble with the requested data.
#' @export
#' @seealso [redcap_token()]
fetch_redcap_data = function(
...,
redcap_uri = redcap_api_url(),
token = redcap_token(),
verbose = FALSE
) {
REDCapR::redcap_read_oneshot(
...,
redcap_uri = redcap_uri,
token = token,
verbose = verbose
)$data %>%
dplyr::as_tibble()
}
15 changes: 15 additions & 0 deletions R/format_stata_date.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#' Convert a Stata date to text
#'
#' Converts a Stata date to a character string.
#'
#' @param date Stata date.
#' @param format Format string for character representation. See
#' [base::strptime()] for options. Defaults to formatting like January 1,
#' 2022.
#'
#' @return A character string with the formatted date.
#' @export
format_stata_date = function(date, format = "%B %e, %Y") {
lubridate::dmy(date) %>%
as.character(format)
}
Loading

0 comments on commit 03a3549

Please sign in to comment.