From 3c3df6f26518bc6649caa110759a1472ea299b2f Mon Sep 17 00:00:00 2001
From: szimmer
Date: Sun, 17 Mar 2024 22:12:49 +0000
Subject: [PATCH] szimmer published a site update

---
 404.html                                      |   59 +-
 anes-cb.html                                  |   59 +-
 ...hunk-275-1.png => unnamed-chunk-267-1.png} |  Bin
 c01-intro.html                                |   59 +-
 c02-overview-surveys.html                     |   63 +-
 ...derstanding-survey-data-documentation.html |   59 +-
 c04-getting-started.html                      |  271 ++--
 c05-descriptive-analysis.html                 | 1442 ++++++++---------
 c06-statistical-testing.html                  |  917 ++++++-----
 c07-modeling.html                             | 1435 ++++++++--------
 c08-communicating-results.html                | 1193 +++++++-------
 c09-reprex-data.html                          |   69 +-
 c10-specifying-sample-designs.html            |  539 +++---
 c11-missing-data.html                         |  317 ++--
 c12-pitfalls.html                             |   59 +-
 c13-ncvs-vignette.html                        |  889 +++++-----
 c14-ambarom-vignette.html                     |  647 ++++----
 importing-survey-data-into-r.html             |  297 ++--
 index.html                                    |   59 +-
 recs-cb.html                                  |   59 +-
 reference-keys.txt                            |    3 +-
 references.html                               |   59 +-
 search_index.json                             |    2 +-
 23 files changed, 4259 insertions(+), 4297 deletions(-)
 rename bookdown_files/figure-html/{unnamed-chunk-275-1.png => unnamed-chunk-267-1.png} (100%)

diff --git a/404.html b/404.html
index 14556d2c..6f7251a3 100644
--- a/404.html
+++ b/404.html
  • 4.3 Survey analysis process
  • 4.4 Similarities between {dplyr} and {srvyr} functions
  • 5 Descriptive analyses
  • 6 Statistical testing

      Prerequisites

      For this chapter, load the following packages:

library(tidyverse)
library(survey)
library(srvyr)
library(srvyrexploR)
library(broom)
library(gt)

      We will be using data from ANES and RECS described in Chapter 4. As a reminder, here is the code to create the design objects for each to use throughout this chapter. For ANES, we need to adjust the weight so it sums to the population instead of the sample (see the ANES documentation and Chapter 4 for more information).

targetpop <- 231592693
data(anes_2020)

anes_adjwgt <- anes_2020 %>%
  mutate(Weight = Weight / sum(Weight) * targetpop)

anes_des <- anes_adjwgt %>%
  as_survey_design(
    weights = Weight,
    strata = Stratum,
    ids = VarUnit,
    nest = TRUE
  )

      For RECS, details are included in the RECS documentation and Chapters 4 and 10.

data(recs_2020)

recs_des <- recs_2020 %>%
  as_survey_rep(
    weights = NWEIGHT,
    repweights = NWEIGHT1:NWEIGHT60,
    type = "JK1",
    scale = 59/60,
    mse = TRUE
  )

      6.1 Introduction

6.2 Dot Notation

svydata_des %>%
  svyttest(x ~ y, .)

      By default, the pipe places the left-hand object in the first argument spot. Placing the dot (.) in the second argument spot indicates that the survey design object svydata_des should be used in the second argument and not the first.

      Alternatively, named arguments could be used to place the dot first as named arguments can appear at any location, as in the following:

svydata_des %>%
  svyttest(design = ., x ~ y)

      However, the following code will not work as the svyttest() function expects the formula as the first argument when arguments are not named:

svydata_des %>%
  svyttest(., x ~ y)

      6.3 Comparison of Proportions and Means

6.3.1 Syntax

svyttest(formula,
         design,
         ...)

      The arguments are:

      • formula: Formula, outcome~group for two-sample, outcome~0 or outcome~1 for one-sample. The group variable must be a factor or character with two levels, or be coded 0/1 or 1/2. We give more details on formula set-up below for different types of tests.
• design: survey design object
• ...: Additional arguments, such as na.rm, passed on to the test
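To make these formula set-ups concrete before the worked examples below, here is a minimal sketch; the design object des and the variables y, y2, and grp are hypothetical placeholders, not variables from ANES or RECS:

des %>% svyttest(y ~ grp, .)      # two-sample: compare y across the two levels of grp
des %>% svyttest(y - 5 ~ 0, .)    # one-sample: test whether the mean of y equals 5
des %>% svyttest(y - y2 ~ 0, .)   # paired: test whether the mean difference of y and y2 is zero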

Example 1: One-sample t-test for Mean

We test whether the average temperature U.S. households set their thermostats to at night in the summer (SummerTempNight) differs from 68°F:

\(H_0: \mu = 68\) versus \(H_A: \mu \neq 68\)

      To conduct this in R, we use svyttest() and subtract the temperature on the left-hand side of the formula:

ttest_ex1 <- recs_des %>%
  svyttest(
    formula = SummerTempNight - 68 ~ 0,
    design = .,
    na.rm = TRUE
  )

ttest_ex1
      ## 
       ##  Design-based one-sample t-test
       ## 
The estimate displayed in the output is the mean difference \(\mu - 68\), not the mean itself. To extract the estimate of \(\mu - 68\), we run ttest_ex1$estimate.

      If we want the average, we take our t-test estimate and add it to 68:

ttest_ex1$estimate + 68
      ##  mean 
       ## 71.37

      Or, we can use the survey_mean() function described in Chapter 5:

recs_des %>%
  summarize(mu = survey_mean(SummerTempNight, na.rm = TRUE))
      ## # A tibble: 1 × 2
       ##      mu  mu_se
       ##   <dbl>  <dbl>
       ## 1  71.4 0.0397

      The result is the same in both methods, so we see that the average temperature U.S. households set their thermostat to in the summer at night is 71.4\(^\circ\)F. Looking at the output from svyttest(), the t-statistic is 84.8, and the p-value is \(<0.0001\), indicating that the average is statistically different from 68\(^\circ\)F at an \(\alpha\) level of \(0.05\).

      If we want an 80% confidence interval for the test statistic, we can use the function confint() to change the confidence level. Below, we print both the original 95% confidence interval and the 80% confidence interval:

confint(ttest_ex1, level = 0.95)
      ##                                  2.5 % 97.5 %
       ## as.numeric(SummerTempNight - 68) 3.288  3.447
       ## attr(,"conf.level")
       ## [1] 0.95
confint(ttest_ex1, level = 0.8)
      ## [1] 3.316 3.419
       ## attr(,"conf.level")
       ## [1] 0.8
Example 2: One-sample t-test for Proportion

RECS asked respondents if they use any air conditioning (AC) in their home. In our data, we call this variable ACUsed. Let's look at the proportion of U.S. households that use AC in their homes using the survey_prop() function we learned in Chapter 5.

acprop <- recs_des %>%
  group_by(ACUsed) %>%
  summarize(p = survey_prop())

acprop
      ## # A tibble: 2 × 3
       ##   ACUsed     p    p_se
       ##   <lgl>  <dbl>   <dbl>
We want to test whether the proportion of U.S. households that use AC differs from 0.90:

\(H_0: p = 0.90\) versus \(H_A: p \neq 0.90\)

    To conduct this in R, we use the svyttest() function as follows:

ttest_ex2 <- recs_des %>%
  svyttest(
    formula = (ACUsed == TRUE) - 0.90 ~ 0,
    design = .,
    na.rm = TRUE
  )

ttest_ex2
    ## 
     ##  Design-based one-sample t-test
     ## 
We can also output the results in a tidy tibble using tidy() from {broom}:

broom::tidy(ttest_ex2)
    ## # A tibble: 1 × 8
     ##   estimate statistic   p.value parameter conf.low conf.high method      
     ##      <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl> <chr>       
    @@ -734,21 +733,21 @@ 

    Example 3: Unpaired two-sample t-test\(H_A: \mu_{AC} \neq \mu_{noAC}\)

  • Let’s take a quick look at the data to see the format the data are in:

recs_des %>%
  group_by(ACUsed) %>%
  summarize(mean = survey_mean(DOLLAREL, na.rm = TRUE))
    ## # A tibble: 2 × 3
     ##   ACUsed  mean mean_se
     ##   <lgl>  <dbl>   <dbl>
     ## 1 FALSE  1056.   16.0 
     ## 2 TRUE   1422.    5.69

    To conduct this in R, we use svyttest():

ttest_ex3 <- recs_des %>%
  svyttest(formula = DOLLAREL ~ ACUsed,
           design = .,
           na.rm = TRUE)

broom::tidy(ttest_ex3)
    ## # A tibble: 1 × 8
     ##   estimate statistic  p.value parameter conf.low conf.high method       
     ##      <dbl>     <dbl>    <dbl>     <dbl>    <dbl>     <dbl> <chr>        
Example 4: Paired two-sample t-test

Do U.S. households set their thermostats to different temperatures at night in the summer (SummerTempNight) and in the winter (WinterTempNight)?

\(H_0: \mu_{summer} = \mu_{winter}\) versus \(H_A: \mu_{summer} \neq \mu_{winter}\)

    To conduct this in R, we use svyttest() by calculating the temperature difference on the left-hand side as follows:

ttest_ex4 <- recs_des %>%
  svyttest(
    design = .,
    formula = SummerTempNight - WinterTempNight ~ 0,
    na.rm = TRUE
  )

broom::tidy(ttest_ex4)
    ## # A tibble: 1 × 8
     ##   estimate statistic  p.value parameter conf.low conf.high method       
     ##      <dbl>     <dbl>    <dbl>     <dbl>    <dbl>     <dbl> <chr>        
6.4.1 Syntax

svygofchisq(formula,
            p,
            design,
            na.rm = TRUE,
            ...)

    The arguments are:

    • formula: Formula specifying a single factor variable
• p: Vector of expected probabilities, one for each category of the variable
• design: survey design object
• na.rm: Logical; remove missing values

As described in Section 6.2, the design object can be piped into the function using dot notation. For the goodness of fit tests, the formula will be a single variable formula = ~var as we compare the observed data from this variable to the expected data. The expected probabilities are then entered in the p argument and need to be a vector of the same length as the number of categories in the variable. For example, if we want to know if the proportion of males and females matches a distribution of 30/70, then the sex variable (with two categories) would be used formula = ~SEX, and the proportions would be included as p = c(.3, .7). It is important to note that the variable entered into the formula should be formatted as either a factor or a character. The examples below provide more detail and tips on how to make sure the levels match up correctly.
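As a minimal sketch of that 30/70 example (the design object des and the variable SEX are hypothetical placeholders, not variables in our data):

des %>%
  svygofchisq(
    formula = ~SEX,
    p = c(.3, .7),
    design = .,
    na.rm = TRUE
  )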

      For tests of homogeneity and independence, the svychisq() function should be used. The syntax is as follows:

svychisq(
  formula,
  design,
  statistic = c("F", "Chisq", "Wald", "adjWald",
                "lincom", "saddlepoint"),
  na.rm = TRUE
)

The arguments are:

• formula: Formula specifying the two factor variables of interest, in the form ~var1 + var2
• design: survey design object
• statistic: Type of test statistic to use, one of "F", "Chisq", "Wald", "adjWald", "lincom", or "saddlepoint"
• na.rm: Logical; remove missing values

Example 1: Goodness of Fit Test
      To conduct this in R, let’s first look at the education variable (Education) we have on the ANES data. Using the survey_mean() function discussed in Chapter 5, we can see the education levels and estimated proportions.

anes_des %>%
  drop_na(Education) %>%
  group_by(Education) %>%
  summarize(p = survey_mean())
      ## # A tibble: 5 × 3
       ##   Education         p    p_se
       ##   <fct>         <dbl>   <dbl>
To compare the ANES education distribution to the ACS, we collapse the "Bachelor's" and "Graduate" categories into a single "Bachelor or Higher" category and look at the new estimated proportions:

anes_des_educ <- anes_des %>%
  mutate(Education2 =
           fct_collapse(Education,
                        "Bachelor or Higher" = c("Bachelor's",
                                                 "Graduate")))

anes_des_educ %>%
  drop_na(Education2) %>%
  group_by(Education2) %>%
  summarize(p = survey_mean())
      ## # A tibble: 4 × 3
       ##   Education2              p    p_se
       ##   <fct>               <dbl>   <dbl>
      @@ -890,15 +889,15 @@ 

      Example 1: Goodness of Fit Test
      chi_ex1 <- anes_des_educ %>%
      -  svygofchisq(
      -    formula =  ~ Education2,
      -    p = c(0.11, 0.27, 0.29, 0.33),
      -    design = .,
      -    na.rm = TRUE
      -  )
      -
      -chi_ex1
      +
      chi_ex1 <- anes_des_educ %>%
      +  svygofchisq(
      +    formula =  ~ Education2,
      +    p = c(0.11, 0.27, 0.29, 0.33),
      +    design = .,
      +    na.rm = TRUE
      +  )
      +
      +chi_ex1
      ## 
       ##  Design-based chi-squared test for given probabilities
       ## 
The output indicates that the observed education distribution differs significantly from the expected distribution (\(\chi^2 =\) 2,172,220; p-value <0.0001). To get a better idea of the differences, we can use the expected output along with survey_mean() to create a comparison table:

ex1_table <- anes_des_educ %>%
  drop_na(Education2) %>%
  group_by(Education2) %>%
  summarize(Observed = survey_mean(vartype = "ci")) %>%
  rename(Education = Education2) %>%
  mutate(Expected = c(0.11, 0.27, 0.29, 0.33)) %>%
  select(Education, Expected, everything())

ex1_table
      ## # A tibble: 4 × 5
       ##   Education          Expected Observed Observed_low Observed_upp
       ##   <fct>                 <dbl>    <dbl>        <dbl>        <dbl>
      @@ -924,23 +923,23 @@ 

      Example 1: Goodness of Fit Test6.1.

      -
      ex1_table %>%
      -  pivot_longer(
      -    cols = c("Expected", "Observed"),
      -    names_to = "Names",
      -    values_to = "Proportion"
      -  ) %>%
      -  mutate(
      -    Observed_low = if_else(Names == "Observed", Observed_low, NA_real_),
      -    Observed_upp = if_else(Names == "Observed", Observed_upp, NA_real_),
      -    Names = if_else(Names == "Observed", "ANES (observed)", "ACS (expected)")
      -  ) %>%
      -  ggplot(aes(x = Education, y = Proportion, color = Names)) +
      -  geom_point(alpha = 0.75, size = 2) +
      -  geom_errorbar(aes(ymin = Observed_low, ymax = Observed_upp), width = 0.25) +
      -  theme_bw() +
      -  scale_color_manual(name = "Type", values = book_colors[c(4, 1)]) +
      -  theme(legend.position = "bottom", legend.title=element_blank())
      +
      ex1_table %>%
      +  pivot_longer(
      +    cols = c("Expected", "Observed"),
      +    names_to = "Names",
      +    values_to = "Proportion"
      +  ) %>%
      +  mutate(
      +    Observed_low = if_else(Names == "Observed", Observed_low, NA_real_),
      +    Observed_upp = if_else(Names == "Observed", Observed_upp, NA_real_),
      +    Names = if_else(Names == "Observed", "ANES (observed)", "ACS (expected)")
      +  ) %>%
      +  ggplot(aes(x = Education, y = Proportion, color = Names)) +
      +  geom_point(alpha = 0.75, size = 2) +
      +  geom_errorbar(aes(ymin = Observed_low, ymax = Observed_upp), width = 0.25) +
      +  theme_bw() +
      +  scale_color_manual(name = "Type", values = book_colors[c(4, 1)]) +
      +  theme(legend.position = "bottom", legend.title=element_blank())

Example 2: Test of Independence

ANES asked respondents about their level of trust in the federal government (TrustGovernment) and their level of trust in other people (TrustPeople). We want to test whether these two variables are independent. To conduct this in R, we use the svychisq() function to compare the two variables:

chi_ex2 <- anes_des %>%
  svychisq(
    formula =  ~ TrustGovernment + TrustPeople,
    design = .,
    statistic = "Wald",
    na.rm = TRUE
  )

chi_ex2
    ## 
     ##  Design-based Wald test of association
     ## 
     ## data:  NextMethod()
     ## F = 21, ndf = 16, ddf = 51, p-value <2e-16

    The output from svychisq() indicates that the distribution of people’s trust in the federal government and their trust in other people are not independent, meaning that they are related. Let’s output the distributions in a table to see the relationship. The observed output from the test provides a cross-tabulation of the counts for each category:

chi_ex2$observed
    ##                      TrustPeople
     ## TrustGovernment         Always Most of the time About half the time
     ##   Always                16.470           25.009              31.848
However, we often want to know about the proportions, not just the frequencies. Below, we calculate the proportions within each level of trust in people and display them in a {gt} table, shown in Table 6.1; in Chapter 8, we will discuss more on how to make publication-quality tables like this.

chi_ex2_table <- chi_ex2$observed %>% 
  as_tibble() %>%
  group_by(TrustPeople) %>%
  mutate(prop = round(n / sum(n), 3)) %>%
  select(-n) %>%
  pivot_wider(names_from = TrustPeople, values_from = prop) %>% 
  gt(rowname_col = "TrustGovernment") %>%
  tab_stubhead(label = "Trust in Government") %>%
  tab_spanner(label = "Trust in People",
              columns = everything()) %>%
  cols_label(`Most of the time` = md("Most of<br />the time"),
             `About half the time` = md("About half<br />the time"),
             `Some of the time` = md("Some of<br />the time"))

chi_ex2_table

In Table 6.1, each column sums to 1. For example, we can say that it is estimated that of people who always trust in people, 27.7% also always trust in government based on the top-left cell, but 5.3% never trust in government.
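As a quick check of that property, we can recompute the within-column proportions from the observed counts and confirm that each column sums to 1; this is a minimal sketch using the same chi_ex2$observed counts as above:

chi_ex2$observed %>%
  as_tibble() %>%
  group_by(TrustPeople) %>%
  summarize(total = sum(n / sum(n)))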

    The second option is to use group_by() and survey_mean() functions to calculate the proportions from the ANES design object. A reminder that with more than one variable listed in the group_by() statement, the proportions are within the first variable listed. As mentioned above, we are looking at the distribution of TrustGovernment for each level of TrustPeople.

chi_ex2_obs <- anes_des %>%
  drop_na(TrustPeople, TrustGovernment) %>%
  group_by(TrustPeople, TrustGovernment) %>%
  summarize(Observed = round(survey_mean(vartype = "ci"), 3),
            .groups = "drop") 

chi_ex2_obs_table <- chi_ex2_obs %>%
  mutate(prop = paste0(Observed, " (", Observed_low, ", ",
                       Observed_upp, ")")) %>%
  select(TrustGovernment, TrustPeople, prop) %>%
  pivot_wider(names_from = TrustPeople, values_from = prop) %>%
  gt(rowname_col = "TrustGovernment") %>%
  tab_stubhead(label = "Trust in Government") %>%
  tab_spanner(label = "Trust in People",
              columns = everything()) %>% 
  tab_options(page.orientation = "landscape")

chi_ex2_obs_table

We can also visualize these proportions with confidence intervals in a plot, as shown in Figure 6.2:

chi_ex2_obs %>%
  mutate(TrustPeople =
           fct_reorder(str_c("Trust in People:\n", TrustPeople), 
                       order(TrustPeople))) %>%
  ggplot(aes(x = TrustGovernment, y = Observed, color = TrustGovernment)) +
  facet_wrap( ~ TrustPeople, ncol = 5) +
  geom_point() +
  geom_errorbar(aes(ymin = Observed_low, ymax = Observed_upp)) +
  ylab("Proportion") +
  xlab("") +
  theme_bw() +
  scale_color_manual(name = "Trust in Government", values = book_colors) +
  theme(axis.text.x = element_blank(), 
        axis.ticks.x = element_blank(),
        legend.position = "bottom") +
  guides(col = guide_legend(nrow = 2))
Figure 6.2: Proportion of adults in the U.S. by levels of trust in people and government with confidence intervals, ANES 2020. This presents the same information as the previous table in graphical form.

Example 3: Test of Homogeneity

We want to test whether the distribution of age groups is the same across people who voted for Biden, Trump, or Other in 2020:

\(H_0: p_{i_{Biden}} = p_{i_{Trump}} = p_{i_{Other}}\) for each age group \(i\)

\(H_A:\) at least one category of \(p_{i_{Biden}}\) does not match \(p_{i_{Trump}}\) or \(p_{i_{Other}}\)

    To conduct this in R, we use the svychisq() function to compare the two variables:

chi_ex3 <- anes_des %>%
  drop_na(VotedPres2020_selection, AgeGroup) %>%
  svychisq(
    formula =  ~ AgeGroup + VotedPres2020_selection,
    design = .,
    statistic = "Chisq",
    na.rm = TRUE
  )

chi_ex3
    ## 
     ##  Pearson's X^2: Rao & Scott adjustment
     ## 
     ## data:  NextMethod()
     ## X-squared = 171, df = 10, p-value <2e-16

    The output from svychisq() indicates a difference in how each age group voted in the 2020 election. To get a better idea of the different distributions, let’s output proportions to see the relationship. As we learned in Example 2 above, we can use chi_ex3$observed, or if we want to get the variance information (which is crucial with survey data), we can use survey_mean(). Remember, when we have two variables in group_by(), we obtain the proportions within each level of the variable listed. In this case, we are looking at the distribution of AgeGroup for each level of VotedPres2020_selection.

chi_ex3_obs <- anes_des %>%
  filter(VotedPres2020 == "Yes") %>%
  drop_na(VotedPres2020_selection, AgeGroup) %>%
  group_by(VotedPres2020_selection, AgeGroup) %>%
  summarize(Observed = round(survey_mean(vartype = "ci"), 3)) 

chi_ex3_obs_table <- chi_ex3_obs %>%
  mutate(prop = paste0(Observed, " (", Observed_low, ", ",
                       Observed_upp, ")")) %>%
  select(AgeGroup, VotedPres2020_selection, prop) %>%
  pivot_wider(names_from = VotedPres2020_selection, 
              values_from = prop) %>%
  gt(rowname_col = "AgeGroup") %>%
  tab_stubhead(label = "Age Group")

chi_ex3_obs_table

    6.5 Exercises
  • Using the RECS data, do more than 50% of U.S. households use AC (ACUsed)?
ttest_solution1 <- recs_des %>%
  svyttest(design = .,
           formula = ((ACUsed == TRUE) - 0.5) ~ 0,
           na.rm = TRUE)

ttest_solution1
    ## 
     ##  Design-based one-sample t-test
     ## 
  • Using the RECS data, does the average temperature that U.S. households set their thermostats to differ between the day and night in the winter (WinterTempDay and WinterTempNight)?
ttest_solution2 <- recs_des %>%
  svyttest(
    design = .,
    formula = WinterTempDay - WinterTempNight ~ 0,
    na.rm = TRUE
  )

ttest_solution2
    ## 
     ##  Design-based one-sample t-test
     ## 
  • Using the ANES data, does the average age (Age) of those who voted for Joseph Biden in 2020 (VotedPres2020_selection) differ from those who voted for another candidate?
ttest_solution3 <- anes_des %>%
  svyttest(
    design = .,
    formula = Age ~ VotedPres2020_selection == "Biden",
    na.rm = TRUE
  )

ttest_solution3
    ## 
     ##  Design-based t-test
     ## 
chisq_solution1 <- "c. Test of homogeneity (`svychisq()`)"
chisq_solution1
    ## [1] "c. Test of homogeneity (`svychisq()`)"
    1. In the RECS data, is there a relationship between the type of housing unit (HousingUnitType) and the year the house was built (YearMade)?
chisq_solution2 <- recs_des %>%
  svychisq(
    formula =  ~ HousingUnitType + YearMade,
    design = .,
    statistic = "Wald",
    na.rm = TRUE
  )

chisq_solution2
    ## 
     ##  Design-based Wald test of association
     ## 
  • In the ANES data, is there a difference in the distribution of gender (Gender) across early voting status in 2020 (EarlyVote2020)?
chisq_solution3 <- anes_des %>%
  svychisq(
    formula =  ~ Gender + EarlyVote2020,
    design = .,
    statistic = "F",
    na.rm = TRUE
  )

chisq_solution3
    ## 
     ##  Pearson's X^2: Rao & Scott adjustment
     ## 
    diff --git a/c07-modeling.html b/c07-modeling.html
    index 2c2575cd..58baf498 100644
    --- a/c07-modeling.html
    +++ b/c07-modeling.html

      Prerequisites

      For this chapter, load the following packages:

library(tidyverse)
library(survey)
library(srvyr)
library(srvyrexploR)
library(broom)
library(gt)
library(prettyunits)
      We will be using data from ANES and RECS described in Chapter 4. As a reminder, here is the code to create the design objects for each to use throughout this chapter. For ANES, we need to adjust the weight so it sums to the population instead of the sample (see the ANES documentation and Chapter 4 for more information).

targetpop <- 231592693
data(anes_2020)

anes_adjwgt <- anes_2020 %>%
  mutate(Weight = Weight / sum(Weight) * targetpop)

anes_des <- anes_adjwgt %>%
  as_survey_design(
    weights = Weight,
    strata = Stratum,
    ids = VarUnit,
    nest = TRUE
  )

      For RECS, details are included in the RECS documentation and Chapters 4 and 10.

data(recs_2020)

recs_des <- recs_2020 %>%
  as_survey_rep(
    weights = NWEIGHT,
    repweights = NWEIGHT1:NWEIGHT60,
    type = "JK1",
    scale = 59/60,
    mse = TRUE
  )

      7.1 Introduction

7.2 Analysis of Variance (ANOVA)

      7.2.1 Syntax

      To perform this type of analysis in R, the general syntax is as follows:

des_obj %>%
  svyglm(
    formula = outcome ~ group,
    design = .,
    na.action = na.omit,
    df.resid = NULL
  )

      The arguments are:

      • formula: Formula in the form of outcome~group. The group variable must be a factor or character.
• design: survey design object
• na.action: Handling of missing data
• df.resid: Denominator degrees of freedom for Wald tests (optional; a design-based default is used when NULL)

        7.2.2 Example

        Looking at an example will help us discuss the output and how to interpret the results. In RECS, respondents are asked what temperature they set their thermostat to during the day and evening when using the air-conditioning during the summer. To analyze this data, we filter the respondents to only those using AC (ACUsed). Then if we want to see if there are differences by region, we can use group_by(). A descriptive analysis of the temperature at night (SummerTempNight) set by region and the sample sizes is displayed below.

recs_des %>%
  filter(ACUsed) %>%
  group_by(Region) %>%
  summarize(
    SMN = survey_mean(SummerTempNight, na.rm = TRUE),
    n = unweighted(n()),
    n_na = unweighted(sum(is.na(SummerTempNight)))
  )
        ## # A tibble: 4 × 5
         ##   Region      SMN SMN_se     n  n_na
         ##   <fct>     <dbl>  <dbl> <int> <int>
We fit the ANOVA model with svyglm():

anova_out <- recs_des %>%
  svyglm(design = .,
         formula = SummerTempNight ~ Region)

tidy(anova_out)
      ## # A tibble: 4 × 5
       ##   term          estimate std.error statistic   p.value
       ##   <chr>            <dbl>     <dbl>     <dbl>     <dbl>
To change the reference region, we can relevel the factor before fitting the model and display the results in a {gt} table, as discussed in Chapter 8.

anova_out_relevel <- recs_des %>%
  mutate(Region = fct_relevel(Region, "Midwest", after = 0)) %>% 
  svyglm(design = .,
         formula = SummerTempNight ~ Region)

tidy(anova_out_relevel) %>%
  mutate(p.value = pretty_p_value(p.value)) %>%
  gt() %>%
  fmt_number()

      7.3 Normal Linear Regression

      7.3.1 Syntax

      The syntax for this regression uses the same function as ANOVA, but can have more than one variable listed on the right-hand side of the formula:

des_obj %>%
  svyglm(
    formula = outcomevar ~ x1 + x2 + x3,
    design = .,
    na.action = na.omit,
    df.resid = NULL
  )

      The arguments are:

      • formula: Formula in the form of y~x
• design: survey design object
• na.action: Handling of missing data
• df.resid: Denominator degrees of freedom for Wald tests (optional; a design-based default is used when NULL)

        7.3.2 Examples

        Example 1: Linear Regression with Single Variable

        On RECS, we can obtain information on the square footage of homes and the electric bills. We assume that square footage is related to the amount of money spent on electricity and examine a model for this. Before any modeling, we first plot the data to determine whether it is reasonable to assume a linear relationship. In Figure 7.1, each hexagon represents the weighted count of households in the bin, and we can see a general positive linear trend (as the square footage increases so does the amount of money spent on electricity).

recs_2020 %>%
  ggplot(aes(
    x = TOTSQFT_EN,
    y = DOLLAREL,
    weight = NWEIGHT / 1000000
  )) +
  geom_hex() +
  scale_fill_gradientn(
    guide = "colorbar",
    name = "Housing Units\n(Millions)",
    labels = scales::comma,
    colors = book_colors[c(3, 2, 1)]
  ) +
  xlab("Total square footage") + ylab("Amount spent on electricity") +
  scale_y_continuous(labels = scales::dollar_format()) +
  scale_x_continuous(labels = scales::comma_format()) +
  theme_minimal()
Figure 7.1: Hex chart where each hexagon represents a number of housing units at a point. x-axis is 'Total square footage' ranging from 0 to 7,500 and y-axis is 'Amount spent on electricity' ranging from $0 to $8,000. The trend is relatively linear and positive. A high concentration of points have square footage between 0 and 2,500 square feet, as well as electricity expenditure between $0 and $2,000.

Given this apparent linear relationship, we fit the model using svyglm() and display the results with {gt}:

m_electric_sqft <- recs_des %>%
  svyglm(design = .,
         formula = DOLLAREL ~ TOTSQFT_EN,
         na.action = na.omit)

tidy(m_electric_sqft) %>%
  mutate(p.value = pretty_p_value(p.value)) %>%
  gt() %>%
  fmt_number()


        Example 2: Linear Regression with Multiple Variables and Interactions

        In the following example, a model is fit to predict electricity expenditure, including Census region (factor/categorical), urbanicity (factor/categorical), square footage (double/numeric), and whether air-conditioning is used (logical/categorical) with all two-way interactions also included. In this example, we are choosing to fit this model without an intercept (using -1 in the formula). This will result in an intercept estimate for each region instead of a single intercept for all data.

m_electric_multi <- recs_des %>%
  svyglm(
    design = .,
    formula = DOLLAREL ~ (Region + Urbanicity + TOTSQFT_EN + ACUsed)^2 - 1, 
    na.action = na.omit
  )

tidy(m_electric_multi) %>%
  mutate(p.value = pretty_p_value(p.value)) %>%
  gt() %>%
  fmt_number()


        As shown above, there are many terms in this model. To test whether coefficients for a term are different from zero, the function regTermTest() can be used. For example, in the above regression, we can test whether the interaction of region and urbanicity is significant as follows:

urb_reg_test <- regTermTest(m_electric_multi, ~Urbanicity:Region)
urb_reg_test
        ## Wald test for Urbanicity:Region
         ##  in svyglm(design = ., formula = DOLLAREL ~ (Region + Urbanicity + 
         ##     TOTSQFT_EN + ACUsed)^2 - 1, na.action = na.omit)
         ## F =  6.851  on  6  and  35  df: p= 7.2e-05

This output indicates there is a significant interaction between urbanicity and region (p-value < 0.0001).

To examine the predictions, residuals, and more from the model, the function augment() from {broom} can be used. The augment() function returns a tibble with the independent and dependent variables and other fit statistics. The augment() function has not been specifically written for objects of class svyglm, so a warning is displayed indicating this. Because it was not written exactly for this class of objects, a little tweaking needs to be done after using augment(). To obtain the standard error of the predicted values (.se.fit), we need to use the attr() function on the predicted values (.fitted) created by augment(). Additionally, the predicted values created are output as a svrep type of data. If we want to plot the predicted values, we need to use as.numeric() to get the predicted values into a numeric format to work with. However, it is important to note that this adjustment must be completed after the standard error adjustment.

fitstats <-
  augment(m_electric_multi) %>%
  mutate(.se.fit = sqrt(attr(.fitted, "var")), 
         .fitted = as.numeric(.fitted)) 

fitstats
        ## # A tibble: 18,496 × 13
         ##    DOLLAREL Region    Urbanicity   TOTSQFT_EN ACUsed `(weights)` .fitted
         ##       <dbl> <fct>     <fct>             <dbl> <lgl>        <dbl>   <dbl>
These results can then be used in a variety of ways, including examining residual plots as illustrated in the code below and Figure 7.2. In the residual plot, we look for any patterns in the data. If we do see patterns, this may indicate a violation of the homoscedasticity (constant variance) assumption, and the standard errors of the coefficients may be incorrect. In Figure 7.2, we do not see a strong pattern, indicating that our assumption of homoscedasticity may hold.

fitstats %>%
  ggplot(aes(x = .fitted, .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red") +
  theme_minimal() +
  xlab("Fitted value of electricity cost") +
  ylab("Residual of model") +
  scale_y_continuous(labels = scales::dollar_format()) +
  scale_x_continuous(labels = scales::dollar_format())
Figure 7.2: Residual scatter plot with an x-axis of 'Fitted value of electricity cost' ranging between approximately $0 and $4,000 and a y-axis with the 'Residual of model' ranging from approximately -$3,000 to $5,000. The points create a slight megaphone shape with the largest residuals in the middle of the x-range. A red line is drawn horizontally at y = 0.


        Additionally, augment() can be used to predict outcomes for data not used in modeling. Perhaps, we would like to predict the energy expenditure for a home in an urban area in the south that uses air-conditioning and is 2,500 square feet. To do this, we first make a tibble including that additional data and then use the newdata argument in the augment() function. As before, to obtain the standard error of the predicted values we need to use the attr() function.

add_data <- recs_2020 %>% 
  select(DOEID, Region, Urbanicity,
         TOTSQFT_EN, ACUsed,
         DOLLAREL) %>%
  rbind(
    tibble(
      DOEID = NA,
      Region = "South",
      Urbanicity = "Urban Area",
      TOTSQFT_EN = 2500,
      ACUsed = TRUE,
      DOLLAREL = NA
    )
  ) %>% 
  tail(1)

pred_data <- augment(m_electric_multi, newdata = add_data) %>%
  mutate(.se.fit = sqrt(attr(.fitted, "var")), 
         .fitted = as.numeric(.fitted)) 

pred_data
        ## # A tibble: 1 × 8
         ##   DOEID Region Urbanicity TOTSQFT_EN ACUsed DOLLAREL .fitted .se.fit
         ##   <dbl> <fct>  <fct>           <dbl> <lgl>     <dbl>   <dbl>   <dbl>
        @@ -2422,14 +2421,14 @@ 

        7.4 Logistic Regression

        7.4.1 Syntax

        The syntax for logistic regression is as follows:

des_obj %>%
  svyglm(
    formula = outcomevar ~ x1 + x2 + x3,
    design = .,
    na.action = na.omit,
    df.resid = NULL,
    family = quasibinomial
  )

        The arguments are:

        • formula: Formula in the form of y~x
• design: survey design object
• na.action: Handling of missing data
• df.resid: Denominator degrees of freedom for Wald tests (optional; a design-based default is used when NULL)
• family: Error distribution and link function; quasibinomial is used for logistic regression to avoid a warning about non-integer outcomes

          7.4.2 Examples

          Example 1: Logistic Regression with Single Variable

In the following example, the ANES data is used, and we are modeling whether someone usually has trust in the government by who they voted for president in 2020. As a reminder, the leading candidates were Biden and Trump, though people could vote for someone else not in the Democratic or Republican parties. Those votes are all grouped into an "Other" category. We first create a binary outcome for trusting in the government by collapsing "Always" and "Most of the time" into a single factor level, and the other response options ("About half the time", "Some of the time", and "Never") into a second factor level. Next, a scatter plot of the raw data is not useful as it is all 0 and 1 outcomes, so instead, we plot a summary of the data.

anes_des_der <- anes_des %>%
  mutate(TrustGovernmentUsually = case_when(
    is.na(TrustGovernment) ~ NA,
    TRUE ~ TrustGovernment %in% c("Always", "Most of the time")
  ))

anes_des_der %>%
  group_by(VotedPres2020_selection) %>%
  summarize(pct_trust = survey_mean(TrustGovernmentUsually,
                                    na.rm = TRUE,
                                    proportion = TRUE,
                                    vartype = "ci"),
    .groups = "drop") %>%
  filter(complete.cases(.)) %>%
  ggplot(aes(x = VotedPres2020_selection, y = pct_trust, 
             fill = VotedPres2020_selection)) +
  geom_bar(stat = "identity") +
  geom_errorbar(aes(ymin = pct_trust_low, ymax = pct_trust_upp), 
                width = .2) +
  scale_fill_manual(values = c("#0b3954", "#bfd7ea", "#8d6b94")) +
  xlab("Election choice (2020)") +
  ylab("Usually trust the government") +
  scale_y_continuous(labels = scales::percent) +
  guides(fill = "none") +
  theme_minimal()
Figure 7.3: Bar chart with x-axis of election choice with levels of Biden, Trump, and Other and y-axis of 'Usually trust the government' ranging from 0% to 20%. Bar 1 is centered at 1, and length is from 0 to 0.12 with fill color dark blue which maps to VotedPres2020_selection = Biden. Bar 2 is centered at 2, and length is from 0 to 0.17 with fill color very pale blue which maps to VotedPres2020_selection = Trump. Bar 3 is centered at 3, and length is from 0 to 0.06 with fill color moderate purple which maps to VotedPres2020_selection = Other. Error bars are drawn as well with the width of the Biden and Trump error bars being similar and the error bar for Other being significantly wider.

In Figure 7.3, it appears that people who voted for Trump are more likely to say that they usually have trust in the government compared to those who voted for Biden and Other candidates. To determine if this insight is accurate, we next fit the model.

logistic_trust_vote <- anes_des_der %>%
  svyglm(design = .,
         formula = TrustGovernmentUsually ~ VotedPres2020_selection,
         family = quasibinomial) 

tidy(logistic_trust_vote) %>%
  mutate(p.value = pretty_p_value(p.value)) %>%
  gt() %>%
  fmt_number()

To interpret the coefficients as odds ratios rather than on the log-odds scale, we can use the exponentiate = TRUE argument in tidy():

tidy(logistic_trust_vote, exponentiate = TRUE) %>% 
  select(term, estimate) %>%
  gt() %>%
  fmt_number()

We can also use augment() to calculate predicted probabilities, as demonstrated below:

logistic_trust_vote %>%
  augment(type.predict = "response") %>%
  mutate(.se.fit = sqrt(attr(.fitted, "var")), 
         .fitted = as.numeric(.fitted)) %>%
  select(TrustGovernmentUsually,
         VotedPres2020_selection,
         .fitted,
         .se.fit)
          ## # A tibble: 6,212 × 4
           ##    TrustGovernmentUsually VotedPres2020_selection .fitted .se.fit
           ##    <lgl>                  <fct>                     <dbl>   <dbl>
Example 2: Interaction Effects

          Let’s look at another example with interaction effects. If we’re interested in understanding the demographics of people who voted for Biden among all voters in 2020, we could include EarlyVote2020 and Gender in our model.

First, we need to subset the data to 2020 voters and then create an indicator for whether the respondent voted for Biden.

anes_des_ind <- anes_des %>% 
  filter(!is.na(VotedPres2020_selection)) %>%
  mutate(VoteBiden = case_when(VotedPres2020_selection == "Biden" ~ 1,
                               TRUE ~ 0))

          Let’s first look at the main effects of gender and early voting behavior.

log_biden_main <- anes_des_ind %>%
  mutate(EarlyVote2020 = fct_relevel(EarlyVote2020, "No", after = 0)) %>% 
  svyglm(design = .,
         formula = VoteBiden ~ EarlyVote2020 + Gender,
         family = quasibinomial) 

tidy(log_biden_main) %>%
  mutate(p.value = pretty_p_value(p.value)) %>%
  gt() %>%
  fmt_number()

Now we fit the model including the two-way interaction between early voting and gender:

log_biden_int <- anes_des_ind %>%
  mutate(EarlyVote2020 = fct_relevel(EarlyVote2020, "No", after = 0)) %>% 
  svyglm(design = .,
         formula = VoteBiden ~ (EarlyVote2020 + Gender)^2,
         family = quasibinomial) 

tidy(log_biden_int) %>%
  mutate(p.value = pretty_p_value(p.value)) %>%
  gt() %>%
  fmt_number()

To examine the interaction, we calculate predicted probabilities with augment():

log_biden_pred <- log_biden_int %>%
  augment(type.predict = "response") %>%
  mutate(.se.fit = sqrt(attr(.fitted, "var")), 
         .fitted = as.numeric(.fitted)) %>%
  select(VoteBiden, EarlyVote2020, Gender, .fitted, .se.fit)

          To create an interaction plot, the y-axis will be the predicted probabilities, and one of our x-variables will be on the x-axis and the other will be represented by multiple lines. Figure 7.4 shows the interaction plot with gender on the x-axis and early voting behavior represented by the lines.

log_biden_pred %>% 
  filter(VoteBiden == 1) %>% 
  distinct() %>% 
  arrange(Gender, EarlyVote2020) %>% 
  mutate(EarlyVote2020 = fct_reorder2(EarlyVote2020, Gender, .fitted)) %>%
  ggplot(aes(x = Gender, y = .fitted, group = EarlyVote2020,
             color = EarlyVote2020, linetype = EarlyVote2020)) +
  geom_line(linewidth = 1.1) +
  scale_color_manual(values = book_colors[c(2, 4)]) +
  ylab("Predicted Probability of Voting for Biden") +
  labs(color = "Voted Early",
       linetype = "Voted Early") +
  coord_cartesian(ylim = c(0, 1)) +
  guides(fill = "none") +
  theme_minimal()
          Line plot with x-axis as Male and Female (left to right) and y-axis as 'Predicted Probability of Voting for Biden'. There are two lines for early voting indicators with lines being from top to bottom: Did Not Early Vote and Did Early Vote. The line representing did not early vote is roughly parallel with similar predicted probabilities between males and females. For those who did early vote, females have higher predicted probability of voting for Biden than males.

          @@ -4478,11 +4477,11 @@

          7.5 Exercises
        • The type of housing unit may have an impact on energy expenses. Using the RECS data, is there any relationship between housing unit type (HousingUnitType) and total energy expenditure (TOTALDOL)? First, find the average energy expenditure by housing unit type as a descriptive analysis and then do the test. The reference level in the comparison should be the housing unit type that is most common.
        • -
          recs_des %>%
          -  group_by(HousingUnitType) %>%
          -  summarize(Expense = survey_mean(TOTALDOL, na.rm = TRUE),
          -            HUs = survey_total()) %>%
          -  arrange(desc(HUs))
          +
          recs_des %>%
          +  group_by(HousingUnitType) %>%
          +  summarize(Expense = survey_mean(TOTALDOL, na.rm = TRUE),
          +            HUs = survey_total()) %>%
          +  arrange(desc(HUs))
          ## # A tibble: 5 × 5
           ##   HousingUnitType            Expense Expense_se       HUs       HUs_se
           ##   <fct>                        <dbl>      <dbl>     <dbl>        <dbl>
          @@ -4491,15 +4490,15 @@ 

          7.5 Exercises
          exp_unit_out <- recs_des %>%
          -  mutate(HousingUnitType = fct_infreq(HousingUnitType, NWEIGHT)) %>%
          -  svyglm(
          -    design = .,
          -    formula = TOTALDOL ~ HousingUnitType,
          -    na.action = na.omit
          -  )
          -
          -tidy(exp_unit_out)

          +
          exp_unit_out <- recs_des %>%
          +  mutate(HousingUnitType = fct_infreq(HousingUnitType, NWEIGHT)) %>%
          +  svyglm(
          +    design = .,
          +    formula = TOTALDOL ~ HousingUnitType,
          +    na.action = na.omit
          +  )
          +
          +tidy(exp_unit_out)
          ## # A tibble: 5 × 5
           ##   term                             estimate std.error statistic  p.value
           ##   <chr>                               <dbl>     <dbl>     <dbl>    <dbl>
          @@ -4508,59 +4507,59 @@ 

          7.5 Exercises
          # Single-family detached units are most common
          -# There is a significant relationship between energy expenditure and housing unit type

          +
          # Single-family detached units are most common
          +# There is a significant relationship between energy expenditure and housing unit type
1. Using the RECS data, does temperature play a role in energy expenditure? Cooling degree days are a measure of how hot a place is. Variable CDD65 for a given day indicates the number of degrees Fahrenheit warmer than 65°F (18.3°C) it is in a location. On a day that averages 65°F or below, CDD65=0, while a day that averages 85°F would have CDD65=20 because it is 20 degrees warmer. For each day in the year, this is summed to give an indicator of how hot the place is throughout the year. Similarly, HDD65 indicates the days colder than 65°F (18.3°C). Can energy expenditure be predicted using these temperature indicators along with square footage? Is there a significant relationship? Include main effects and two-way interactions.
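Before fitting the model, here is a minimal sketch of the degree-day arithmetic itself, using hypothetical daily averages rather than RECS data:

daily_avg <- c(60, 65, 72, 85)      # hypothetical daily average temperatures (°F)
sum(pmax(daily_avg - 65, 0))        # cooling degree days: 0 + 0 + 7 + 20 = 27
sum(pmax(65 - daily_avg, 0))        # heating degree days: 5 + 0 + 0 + 0 = 5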
          -
          temps_sqft_exp <- recs_des %>%
          -  svyglm(
          -    design = .,
          -    formula = DOLLAREL ~ (TOTSQFT_EN + CDD65 + HDD65) ^ 2,
          -    na.action = na.omit
          -  )
          -
          -tidy(temps_sqft_exp)
          +
          temps_sqft_exp <- recs_des %>%
          +  svyglm(
          +    design = .,
          +    formula = DOLLAREL ~ (TOTSQFT_EN + CDD65 + HDD65) ^ 2,
          +    na.action = na.omit
          +  )
          +
          +tidy(temps_sqft_exp)
          ## # A tibble: 7 × 5
          -##   term                 estimate   std.error statistic  p.value
          -##   <chr>                   <dbl>       <dbl>     <dbl>    <dbl>
          -## 1 (Intercept)      741.         70.5           10.5   1.44e-14
          -## 2 TOTSQFT_EN         0.272       0.0471         5.77  4.27e- 7
          -## 3 CDD65              0.0293      0.0227         1.29  2.02e- 1
          -## 4 HDD65             -0.00111     0.0104        -0.107 9.15e- 1
          -## 5 TOTSQFT_EN:CDD65   0.0000459   0.0000154      2.97  4.43e- 3
          -## 6 TOTSQFT_EN:HDD65  -0.00000840  0.00000633    -1.33  1.90e- 1
          -## 7 CDD65:HDD65        0.00000533  0.00000355     1.50  1.39e- 1
+##   term             estimate    std.error  statistic p.value
+##   <chr>               <dbl>        <dbl>      <dbl>   <dbl>
+## 1 (Intercept)      741.        70.5          10.5    0.0000000000000144
+## 2 TOTSQFT_EN         0.272      0.0471        5.77   0.000000427
+## 3 CDD65              0.0293     0.0227        1.29   0.202
+## 4 HDD65             -0.00111    0.0104       -0.107  0.915
+## 5 TOTSQFT_EN:CDD65   0.0000459  0.0000154     2.97   0.00443
+## 6 TOTSQFT_EN:HDD65  -0.00000840 0.00000633   -1.33   0.190
+## 7 CDD65:HDD65        0.00000533 0.00000355    1.50   0.139

        1. Continuing with our results from question 2, create a plot between the actual and predicted expenditures and a residual plot for the predicted expenditures.
        -
        temps_sqft_exp_fit <- temps_sqft_exp %>%
        -  augment() %>%
        -  mutate(.se.fit = sqrt(attr(.fitted, "var")), 
        -         # extract the variance of the fitted value
        -         .fitted = as.numeric(.fitted)) 
        -
        temps_sqft_exp_fit %>%
        -  ggplot(aes(x = DOLLAREL, y = .fitted)) +
        -  geom_point() +
        -  geom_abline(intercept = 0,
        -              slope = 1,
        -              color = "red") +
        -  xlab("Actual expenditures") +
        -  ylab("Predicted expenditures") +
        -  theme_minimal()
        +
        temps_sqft_exp_fit <- temps_sqft_exp %>%
        +  augment() %>%
        +  mutate(.se.fit = sqrt(attr(.fitted, "var")), 
        +         # extract the variance of the fitted value
        +         .fitted = as.numeric(.fitted)) 
        +
        temps_sqft_exp_fit %>%
        +  ggplot(aes(x = DOLLAREL, y = .fitted)) +
        +  geom_point() +
        +  geom_abline(intercept = 0,
        +              slope = 1,
        +              color = "red") +
        +  xlab("Actual expenditures") +
        +  ylab("Predicted expenditures") +
        +  theme_minimal()
        Actual and predicted electricity expenditures

        FIGURE 7.5: Actual and predicted electricity expenditures

        -
        temps_sqft_exp_fit %>%
        -  ggplot(aes(x = .fitted, y = .resid)) +
        -  geom_point() +
        -  geom_hline(yintercept = 0, color = "red") +
        -  xlab("Predicted expenditure") +
        -  ylab("Residual value of expenditure") +
        -  theme_minimal()
        +
        temps_sqft_exp_fit %>%
        +  ggplot(aes(x = .fitted, y = .resid)) +
        +  geom_point() +
        +  geom_hline(yintercept = 0, color = "red") +
        +  xlab("Predicted expenditure") +
        +  ylab("Residual value of expenditure") +
        +  theme_minimal()
        Residual plot of electric cost model with covariates TOTSQFT_EN, CDD65, and HDD65

        @@ -4570,15 +4569,15 @@

        7.5 Exercises
• Early voting expanded in 2020. Using the ANES data, build a logistic model predicting early voting in 2020 (EarlyVote2020) using age (Age), education (Education), and party identification (PartyID). Include two-way interactions.
      • -
        earlyvote_mod <- anes_des %>%
        -  filter(!is.na(EarlyVote2020)) %>%
        -  svyglm(
        -    design = .,
        -    formula = EarlyVote2020 ~ (Age + Education + PartyID) ^ 2 ,
        -    family = quasibinomial
        -  )
        -
        -tidy(earlyvote_mod) %>% arrange(p.value)
        +
        earlyvote_mod <- anes_des %>%
        +  filter(!is.na(EarlyVote2020)) %>%
        +  svyglm(
        +    design = .,
        +    formula = EarlyVote2020 ~ (Age + Education + PartyID) ^ 2 ,
        +    family = quasibinomial
        +  )
        +
        +tidy(earlyvote_mod) %>% arrange(p.value)
        ## # A tibble: 46 × 5
         ##    term                             estimate std.error statistic p.value
         ##    <chr>                               <dbl>     <dbl>     <dbl>   <dbl>
        @@ -4596,21 +4595,21 @@ 

        7.5 Exercises
      • Continuing from Exercise 4, predict the probability of early voting for two people. Both are 28 years old and have a graduate degree, but one person is a strong Democrat, and the other is a strong Republican.
      • -
        add_vote_dat <- anes_2020 %>%
        -  select(EarlyVote2020, Age, Education, PartyID) %>%
        -  rbind(tibble(
        -    EarlyVote2020 = NA,
        -    Age = 28,
        -    Education = "Graduate",
        -    PartyID = c("Strong democrat", "Strong republican")
        -  )) %>%
        -  tail(2)
        -
        -log_ex_2_out <- earlyvote_mod %>%
        -  augment(newdata = add_vote_dat, type.predict = "response") %>%
        -  mutate(.se.fit = sqrt(attr(.fitted, "var")), 
        -         # extract the variance of the fitted value
        -         .fitted = as.numeric(.fitted))
        +
        add_vote_dat <- anes_2020 %>%
        +  select(EarlyVote2020, Age, Education, PartyID) %>%
        +  rbind(tibble(
        +    EarlyVote2020 = NA,
        +    Age = 28,
        +    Education = "Graduate",
        +    PartyID = c("Strong democrat", "Strong republican")
        +  )) %>%
        +  tail(2)
        +
        +log_ex_2_out <- earlyvote_mod %>%
        +  augment(newdata = add_vote_dat, type.predict = "response") %>%
        +  mutate(.se.fit = sqrt(attr(.fitted, "var")), 
        +         # extract the variance of the fitted value
        +         .fitted = as.numeric(.fitted))

        diff --git a/c08-communicating-results.html b/c08-communicating-results.html index 80a146c9..9d4bc773 100644 --- a/c08-communicating-results.html +++ b/c08-communicating-results.html @@ -230,55 +230,54 @@
      • 4.3 Survey analysis process
      • 4.4 Similarities between {dplyr} and {srvyr} functions
      -
    • 5 Descriptive Analyses in {srvyr} +
    • 5 Descriptive analyses
    • 6 Statistical testing
        @@ -497,26 +496,26 @@

        Prerequisites

        For this chapter, load the following packages:

        -
        library(tidyverse)
        -library(survey) 
        -library(srvyr) 
        -library(srvyrexploR)
        -library(gt)
        -library(gtsummary)
        +
        library(tidyverse)
        +library(survey) 
        +library(srvyr) 
        +library(srvyrexploR)
        +library(gt)
        +library(gtsummary)

We will be using data from ANES as described in Chapter 4. As a reminder, here is the code to create the design object to use throughout this chapter. For ANES, we need to adjust the weight so it sums to the population instead of the sample (see the ANES documentation and Chapter 4 for more information).

        -
        targetpop <- 231592693
        -data(anes_2020)
        -
        -anes_adjwgt <- anes_2020 %>%
        -  mutate(Weight = Weight / sum(Weight) * targetpop)
        -
        -anes_des <- anes_adjwgt %>%
        -  as_survey_design(
        -    weights = Weight,
        -    strata = Stratum,
        -    ids = VarUnit,
        -    nest = TRUE
        -  )
        +
        targetpop <- 231592693
        +data(anes_2020)
        +
        +anes_adjwgt <- anes_2020 %>%
        +  mutate(Weight = Weight / sum(Weight) * targetpop)
        +
        +anes_des <- anes_adjwgt %>%
        +  as_survey_design(
        +    weights = Weight,
        +    strata = Stratum,
        +    ids = VarUnit,
        +    nest = TRUE
        +  )

    • 8.1 Introduction

      @@ -572,12 +571,12 @@

      8.3.1 Tables

      8.3.1.1 Transitioning {srvyr} output to a {gt} table

      Let’s start by using some of the data we calculated earlier in this book. In Chapter 6, we looked at data on trust in government with the proportions calculated below:

      -
      trust_gov <- anes_des %>%
      -  drop_na(TrustGovernment) %>%
      -  group_by(TrustGovernment) %>%
      -  summarize(trust_gov_p = survey_prop())
      -
      -trust_gov
      +
      trust_gov <- anes_des %>%
      +  drop_na(TrustGovernment) %>%
      +  group_by(TrustGovernment) %>%
      +  summarize(trust_gov_p = survey_prop())
      +
      +trust_gov
      ## # A tibble: 5 × 3
       ##   TrustGovernment     trust_gov_p trust_gov_p_se
       ##   <fct>                     <dbl>          <dbl>
      @@ -589,33 +588,33 @@ 

      8.3.1.1 Transitioning {srvyr} out

      The default output generated by R may work for initial viewing inside RStudio or when creating basic output in an R Markdown or Quarto document. However, when presenting these results in other publications, such as the print version of this book or with other formal dissemination modes, modifying the display can improve our reader’s experience.

      Looking at the output from trust_gov, a couple of improvements are obvious: (1) switching to percentages instead of proportions and (2) using the variable names as column headers. The {gt} package is a good tool for implementing better labeling and creating publishable tables. Let’s walk through some code as we make a few changes to improve the table’s usefulness.

First, we initiate the table with the gt() function. Next, we use the rowname_col argument to designate the TrustGovernment column as the labels for each row (called the table “stub”). We apply the cols_label() function to create informative column labels instead of variable names, and then the tab_spanner() function to add a label across multiple columns. In this case, we label all columns except the stub with “Trust in Government, 2020”. We then format the proportions into percentages with the fmt_percent() function and reduce the number of decimals shown with decimals = 1. Finally, the tab_caption() function adds a table title for the HTML version of the book. We can use the caption for cross-referencing in R Markdown, Quarto, and bookdown, as well as adding it to the list of tables in the book.

      -
      trust_gov_gt <- trust_gov %>%
      -  gt(rowname_col = "TrustGovernment") %>%
      -  cols_label(trust_gov_p = "%",
      -             trust_gov_p_se = "s.e. (%)") %>%
      -  tab_spanner(label = "Trust in Government, 2020",
      -              columns = c(trust_gov_p, trust_gov_p_se)) %>%
      -  fmt_percent(decimals = 1)
      -
      trust_gov_gt %>% 
      -  tab_caption("Example of gt table with trust in government estimate")
@@ -1076,35 +1075,35 @@

      8.3.1.1 Transitioning {srvyr} out

      We can add a few more enhancements, such as a title, a data source note, and a footnote with the question information, using the functions tab_header(), tab_source_note(), and tab_footnote(). If having the percentage sign in both the header and the cells seems redundant, we can opt for fmt_number() instead of fmt_percent() and scale the number by 100 with scale_by = 100.

      -
      trust_gov_gt2 <- trust_gov_gt %>%
      -  tab_header("American voter's trust
      -             in the federal government, 2020") %>%
      -  tab_source_note("American National Election Studies, 2020") %>%
      -  tab_footnote(
      -    "Question text: How often can you trust the federal government
      -    in Washington to do what is right?"
      -  ) %>%
      -  fmt_number(scale_by = 100,
      -             decimals = 1)
      -
      trust_gov_gt2
@@ -1605,28 +1604,28 @@

      Expanding tables using {gtsummary}
      anes_des_gtsum <- anes_des %>%
      -  tbl_svysummary(include = TrustGovernment,
      -                 statistic = list(all_categorical() ~ "{p} ({p.std.error})")) 

      -
      anes_des_gtsum
      +
      anes_des_gtsum <- anes_des %>%
      +  tbl_svysummary(include = TrustGovernment,
      +                 statistic = list(all_categorical() ~ "{p} ({p.std.error})")) 
      +
      anes_des_gtsum
@@ -2084,33 +2083,33 @@

      Expanding tables using {gtsummary}
      anes_des_gtsum2 <- anes_des %>%
      -  tbl_svysummary(
      -    include = TrustGovernment,
      -    statistic = list(all_categorical() ~ "{p} ({p.std.error})"),
      -    missing = "no",
      -    digits = list(TrustGovernment ~ style_percent),
      -    label = list(TrustGovernment ~ "Trust in Government, 2020")
      -  )

      -
      anes_des_gtsum2
@@ -2566,45 +2565,45 @@

Expanding tables using {gtsummary}

      -
      anes_des_gtsum3 <- anes_des %>%
      -  tbl_svysummary(
      -    include = TrustGovernment,
      -    statistic = list(all_categorical() ~ "{p} ({p.std.error})"),
      -    missing = "no",
      -    digits = list(TrustGovernment ~ style_percent),
      -    label = list(TrustGovernment ~ "Trust in Government, 2020")
      -  ) %>%
      -  modify_footnote(update = everything() ~ NA) %>%
      -  modify_header(label = " ",
      -                stat_0 = "% (s.e.)") %>%
      -  as_gt() %>%
      -  tab_header("American voter's trust
      -             in the federal government, 2020") %>%
      -  tab_source_note("American National Election Studies, 2020") %>%
      -  tab_footnote(
      -    "Question text: How often can you trust
      -    the federal government in Washington
      -    to do what is right?"
      -  )
      -
      anes_des_gtsum3
@@ -3070,49 +3069,49 @@

      Expanding tables using {gtsummary}
      anes_des_gtsum4 <- anes_des %>%
      -  tbl_svysummary(
      -    include = c(TrustGovernment, Age),
      -    statistic = list(
      -      all_categorical() ~ "{p} ({p.std.error})",
      -      all_continuous() ~ "{mean} ({mean.std.error})"
      -    ),
      -    missing = "no",
      -    digits = list(TrustGovernment ~ style_percent,
      -                  Age ~ c(1, 2)),
      -    label = list(TrustGovernment ~ "Trust in Government, 2020")
      -  ) %>%
      -  modify_footnote(update = everything() ~ NA) %>%
      -  modify_header(label = " ",
      -                stat_0 = "% (s.e.)") %>%
      -  as_gt() %>%
      -  tab_header("American voter's trust in the federal government, 2020") %>%
      -  tab_source_note("American National Election Studies, 2020") %>%
      -  tab_footnote(
      -    "Question text: How often can you trust the federal government
      -    in Washington to do what is right?"
      -  ) %>%
      -  tab_caption("Example of gtsummary table with trust in government
      -              estimates and average age")

      -
      anes_des_gtsum4
@@ -3578,50 +3577,50 @@

      Expanding tables using {gtsummary}
      anes_des_gtsum5 <- anes_des %>%
      -  drop_na(VotedPres2020) %>%
      -  tbl_svysummary(
      -    include = TrustGovernment,
      -    statistic = list(all_categorical() ~ "{p} ({p.std.error})"),
      -    missing = "no",
      -    digits = list(TrustGovernment ~ style_percent),
      -    label = list(TrustGovernment ~ "Trust in Government, 2020"),
      -    by = VotedPres2020
      -  ) %>%
      -  modify_footnote(update = everything() ~ NA) %>%
      -  modify_header(label = " ",
      -                stat_1 = "Voted",
      -                stat_2 = "Didn't vote") %>%
      -  as_gt() %>%
      -  tab_header(
      -    "American voter's trust
      -             in the federal government by whether they voted
      -             in the 2020 presidential election"
      -  ) %>%
      -  tab_source_note("American National Election Studies, 2020") %>%
      -  tab_footnote(
      -    "Question text: How often can you trust the federal government
      -               in Washington to do what is right?"
      -  )

      -
      anes_des_gtsum5
@@ -4100,24 +4099,24 @@

      8.3.2 Charts and plots
      anes_des_der <- anes_des %>%
      -  mutate(TrustGovernmentUsually = case_when(
      -    is.na(TrustGovernment) ~ NA,
      -    TRUE ~ TrustGovernment %in% c("Always", "Most of the time")
      -  )) %>%
      -  drop_na(VotedPres2020_selection) %>%
      -  group_by(VotedPres2020_selection) %>%
      -  summarize(
      -    pct_trust = survey_mean(
      -      TrustGovernmentUsually,
      -      na.rm = TRUE,
      -      proportion = TRUE,
      -      vartype = "ci"
      -    ),
      -    .groups = "drop"
      -  )
      -
      -anes_des_der

      +
      anes_des_der <- anes_des %>%
      +  mutate(TrustGovernmentUsually = case_when(
      +    is.na(TrustGovernment) ~ NA,
      +    TRUE ~ TrustGovernment %in% c("Always", "Most of the time")
      +  )) %>%
      +  drop_na(VotedPres2020_selection) %>%
      +  group_by(VotedPres2020_selection) %>%
      +  summarize(
      +    pct_trust = survey_mean(
      +      TrustGovernmentUsually,
      +      na.rm = TRUE,
      +      proportion = TRUE,
      +      vartype = "ci"
      +    ),
      +    .groups = "drop"
      +  )
      +
      +anes_des_der
      ## # A tibble: 3 × 4
       ##   VotedPres2020_selection pct_trust pct_trust_low pct_trust_upp
       ##   <fct>                       <dbl>         <dbl>         <dbl>
      @@ -4125,12 +4124,12 @@ 

8.3.2 Charts and plots

      -
      p <- anes_des_der %>%
      -  ggplot(aes(x = VotedPres2020_selection,
      -             y = pct_trust)) +
      -  geom_bar(stat = "identity")
      -
      -p
      +
      p <- anes_des_der %>%
      +  ggplot(aes(x = VotedPres2020_selection,
      +             y = pct_trust)) +
      +  geom_bar(stat = "identity")
      +
      +p
      Bar chart with x-axis of 'VotedPres2020_selection' with labels Biden, Trump and Other. It has y-axis 'pct_trust' with labels 0.00, 0.05, 0.10 and 0.15. The chart is a bar chart with 3 vertical bars. Bar 1 (Biden) has a height of 0.12. Bar 2 (Trump) has a height of 0.17. Bar 3 (Other) has a height of 0.06.

      @@ -4138,13 +4137,13 @@

      8.3.2 Charts and plots
      pcolor <- anes_des_der %>%
      -  ggplot(aes(x = VotedPres2020_selection,
      -             y = pct_trust,
      -             fill = VotedPres2020_selection)) +
      -  geom_bar(stat = "identity")
      -
      -pcolor

      +
      pcolor <- anes_des_der %>%
      +  ggplot(aes(x = VotedPres2020_selection,
      +             y = pct_trust,
      +             fill = VotedPres2020_selection)) +
      +  geom_bar(stat = "identity")
      +
      +pcolor
      Bar chart with x-axis of 'VotedPres2020_selection' with labels Biden, Trump and Other. It has y-axis 'pct_trust' with labels 0.00, 0.05, 0.10 and 0.15. The chart is a bar chart with 3 vertical bars. Bar 1 (Biden) has a height of 0.12 and a color of strong reddish orange. Bar 2 (Trump) has a height of 0.17 and a color of vivid yellowish green. Bar 3 (Other) has a height of 0.06 and color of brilliant blue.

      @@ -4152,16 +4151,16 @@

      8.3.2 Charts and plots
      pcol_error <- anes_des_der %>%
      -  ggplot(aes(x = VotedPres2020_selection,
      -             y = pct_trust,
      -             fill = VotedPres2020_selection)) +
      -  geom_bar(stat = "identity") +
      -  geom_errorbar(aes(ymin = pct_trust_low,
      -                    ymax = pct_trust_upp),
      -                width = .2)
      -
      -pcol_error

      +
      pcol_error <- anes_des_der %>%
      +  ggplot(aes(x = VotedPres2020_selection,
      +             y = pct_trust,
      +             fill = VotedPres2020_selection)) +
      +  geom_bar(stat = "identity") +
      +  geom_errorbar(aes(ymin = pct_trust_low,
      +                    ymax = pct_trust_upp),
      +                width = .2)
      +
      +pcol_error
      Bar chart with x-axis of 'VotedPres2020_selection' with labels Biden, Trump and Other. It has y-axis 'pct_trust' with labels 0.00, 0.05, 0.10 and 0.15. The chart is a bar chart with 3 vertical bars. Bar 1 (Biden) has a height of 0.12 and a color of strong reddish orange. Bar 2 (Trump) has a height of 0.17 and a color of vivid yellowish green. Bar 3 (Other) has a height of 0.06 and color of brilliant blue. Error bars are added with the Bar 1 (Biden) error ranging from 0.11 to 0.14, Bar 2 (Trump) error ranging from 0.16 to 0.19, and the Bar 3 (Other) error ranging from 0.02 to 0.14.

      @@ -4169,25 +4168,25 @@

8.3.2 Charts and plots

We can specify custom colors for fill using scale_fill_manual(). Inside the function, we provide a vector of values corresponding to the colors in our plot. These values are hexadecimal (hex) color codes, denoted by a leading pound sign # followed by six letters or numbers. The hex code #0b3954 used below is a dark blue. There are many tools online that help pick hex codes, such as htmlcolorcodes.com/.

      -
      pfull <-
      -  anes_des_der %>%
      -  ggplot(aes(x = VotedPres2020_selection,
      -             y = pct_trust,
      -             fill = VotedPres2020_selection)) +
      -  geom_bar(stat = "identity") +
      -  geom_errorbar(aes(ymin = pct_trust_low,
      -                    ymax = pct_trust_upp),
      -                width = .2) +
      -  scale_fill_manual(values = c("#0b3954", "#bfd7ea", "#8d6b94")) +
      -  xlab("Election choice (2020)") +
      -  ylab("Usually trust the government") +
      -  scale_y_continuous(labels = scales::percent) +
      -  guides(fill = "none") +
      -  labs(title = "Percent of voters who usually trust the government
      -       by chosen 2020 presidential candidate",
      -       caption = "Source: American National Election Studies, 2020")
      -
      -pfull
      +
      pfull <-
      +  anes_des_der %>%
      +  ggplot(aes(x = VotedPres2020_selection,
      +             y = pct_trust,
      +             fill = VotedPres2020_selection)) +
      +  geom_bar(stat = "identity") +
      +  geom_errorbar(aes(ymin = pct_trust_low,
      +                    ymax = pct_trust_upp),
      +                width = .2) +
      +  scale_fill_manual(values = c("#0b3954", "#bfd7ea", "#8d6b94")) +
      +  xlab("Election choice (2020)") +
      +  ylab("Usually trust the government") +
      +  scale_y_continuous(labels = scales::percent) +
      +  guides(fill = "none") +
      +  labs(title = "Percent of voters who usually trust the government
      +       by chosen 2020 presidential candidate",
      +       caption = "Source: American National Election Studies, 2020")
      +
      +pfull
      Bar chart with x-axis of 'VotedPres2020_selection' with labels Biden, Trump and Other. It has y-axis 'pct_trust' with labels 0.00, 0.05, 0.10 and 0.15. The chart is a bar chart with 3 vertical bars. Bar 1 (Biden) has a height of 0.12 and a color of dark blue. Bar 2 (Trump) has a height of 0.17 and a color of very pale blue. Bar 3 (Other) has a height of 0.06 and color of moderate purple. Error bars are added with the Bar 1 (Biden) error ranging from 0.11 to 0.14, Bar 2 (Trump) error ranging from 0.16 to 0.19, and the Bar 3 (Other) error ranging from 0.02 to 0.14.

      diff --git a/c09-reprex-data.html b/c09-reprex-data.html index a93f247a..d8141563 100644 --- a/c09-reprex-data.html +++ b/c09-reprex-data.html @@ -230,55 +230,54 @@

    • 4.3 Survey analysis process
    • 4.4 Similarities between {dplyr} and {srvyr} functions
  • -
  • 5 Descriptive Analyses in {srvyr} +
  • 5 Descriptive analyses
  • 6 Statistical testing
      @@ -529,8 +528,8 @@

      9.2 Project-based workflows

• In a project-based workflow, all paths are relative and, by default, relative to the project’s folder. By using relative paths, others can open and run our files even if their directory configuration differs from ours. The {here} package enables easy file referencing, and we can start by using the here::here() function to build the path for loading or saving data. Below, we ask R to read the CSV file anes2020_clean.csv in the project directory’s data folder:

    -
    anes <- 
    -  read_csv(here::here("data", "anes2020_clean.csv"))
    +
    anes <- 
    +  read_csv(here::here("data", "anes2020_clean.csv"))

The combination of projects and the {here} package keeps all associated files organized. This workflow makes it more likely that our analyses can be reproduced by us or our colleagues.
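To round out the example, here is a minimal sketch of saving output the same way; the "output" folder and file name are hypothetical:

write_csv(anes, here::here("output", "anes2020_analysis.csv"))  # path relative to the project root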

    @@ -576,9 +575,9 @@

9.9 Other tips for reproducibility

    9.9.1 Random number seeds

    Some tasks in survey analysis require randomness, such as imputation, model training, or creating random samples. By default, the random numbers generated by R change each time we rerun the code, making it difficult to reproduce the same results. By “setting the seed,” we can control the randomness and ensure that the random numbers remain consistent whenever we rerun the code. Others can use the same seed value to reproduce our random numbers and achieve the same results.

    In R, we can use the set.seed() function to control the randomness in our code. Set a seed value by providing an integer to the function:

    -
    set.seed(999)
    -
    -runif(5)
    +
    set.seed(999)
    +
    +runif(5)

    The runif() function generates five random numbers from a uniform distribution. Since the seed is set to 999, running runif() multiple times will always produce the same sequence:

    [1] 0.38907138 0.58306072 0.09466569 0.85263123 0.78674676

The choice of the seed number is up to the analyst. For example, this could be the date (20240102) or time of day (1056) when the analysis was first conducted, a phone number (8675309), or the first few numbers that come to mind (369). As long as the same seed is used for a given analysis, the results will be reproducible. It is important to note that set.seed() should be used before random number generation. Run it once per program, and the seed will be applied to the entire script. We recommend setting the seed at the beginning of a script, where libraries are loaded.
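As a minimal sketch of this behavior, resetting the same seed reproduces the draws exactly:

set.seed(999)
first_draws <- runif(5)

set.seed(999)
second_draws <- runif(5)

identical(first_draws, second_draws)  # TRUE: same seed, same sequence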

    diff --git a/c10-specifying-sample-designs.html b/c10-specifying-sample-designs.html index aa174785..ed505f20 100644 --- a/c10-specifying-sample-designs.html +++ b/c10-specifying-sample-designs.html @@ -230,55 +230,54 @@
  • 4.3 Survey analysis process
  • 4.4 Similarities between {dplyr} and {srvyr} functions
  • -
  • 5 Descriptive Analyses in {srvyr} +
  • 5 Descriptive analyses
  • 6 Statistical testing
      @@ -497,16 +496,16 @@

      Prerequisites

      For this chapter, load the following packages:

      -
      library(tidyverse)
      -library(survey)
      -library(srvyr)
      -library(srvyrexploR)
      +
      library(tidyverse)
      +library(survey)
      +library(srvyr)
      +library(srvyrexploR)

      To help explain the different types of sample designs, this chapter will use the api and scd data that are included in the {survey} package:

      -
      data(api)
      -data(scd)
      +
      data(api)
      +data(scd)

This chapter uses data from the Residential Energy Consumption Survey (RECS) - both 2015 and 2020 - so we will use the following code to load the RECS data from the {srvyrexploR} package:

      -
      data(recs_2015)
      -data(recs_2020)
      +
      data(recs_2015)
      +data(recs_2020)

  • 10.1 Introduction

    @@ -556,27 +555,27 @@

    The math

    The syntax

    If a sample was drawn through SRS and had no nonresponse or other weighting adjustments, in R, specify this design as:

    -
    srs1_des <- dat %>%
    - as_survey_design(fpc = fpcvar)
    +
    srs1_des <- dat %>%
    + as_survey_design(fpc = fpcvar)

where dat is a tibble or data.frame with the survey data, and fpcvar is a variable in the data indicating the sampling frame’s size (this variable will have the same value for all cases in an SRS design). If the frame is very large, sometimes the frame size is not provided. In that case, the FPC is not needed, and we specify the design as:

    -
    srs2_des <- dat %>%
    - as_survey_design()
    +
    srs2_des <- dat %>%
    + as_survey_design()

    If some post-survey adjustments were implemented and the weights are not all equal, specify the design as:

    -
    srs3_des <- dat %>%
    - as_survey_design(weights = wtvar, 
    -                  fpc = fpcvar)
    +
    srs3_des <- dat %>%
    + as_survey_design(weights = wtvar, 
    +                  fpc = fpcvar)

    where wtvar is a variable in the data indicating the weight for each case. Again, the FPC can be omitted if it is unnecessary because the frame is large compared to the sample size.

    Example

    The {survey} package in R provides some example datasets that we will use throughout this chapter. The documentation provides detailed information about the variables. One of the example datasets we will use is from the Academic Performance Index (API). The API was a program administered by the California Department of Education, and the {survey} package includes a population file (sample frame) of all schools with at least 100 students and several different samples pulled from that data using different sampling methods. For this first example, we will use the apisrs dataset, which contains an SRS of 200 schools. For printing purposes, we create a new dataset called apisrs_slim, which sorts the data by the school district and school ID and subsets the data to only a few columns. The SRS sample data is illustrated below:

    -
    apisrs_slim <-
    - apisrs %>%
    - as_tibble() %>%
    - arrange(dnum, snum) %>%
    - select(cds, dnum, snum, dname, sname, fpc, pw)
    -
    -apisrs_slim
    +
    apisrs_slim <-
    + apisrs %>%
    + as_tibble() %>%
    + arrange(dnum, snum) %>%
    + select(cds, dnum, snum, dname, sname, fpc, pw)
    +
    +apisrs_slim
    ## # A tibble: 200 × 7
     ##    cds             dnum  snum dname                   sname    fpc    pw
     ##    <chr>          <int> <dbl> <chr>                   <chr>  <dbl> <dbl>
    @@ -632,11 +631,11 @@ 

    Example
    apisrs_des <- apisrs_slim %>%
    - as_survey_design(weights = pw, 
    -                  fpc = fpc)
    -
    -apisrs_des

    +
    apisrs_des <- apisrs_slim %>%
    + as_survey_design(weights = pw, 
    +                  fpc = fpc)
    +
    +apisrs_des
    ## Independent Sampling design
     ## Called via srvyr
     ## Sampling variables:
    @@ -647,7 +646,7 @@ 

Example

As discussed in Section 10.2.4, the FPC variable is indicated, and the weights are indicated. We can also look at the summary of the design object and see the distribution of the probabilities (the inverse of the weights) along with the population size and a list of the variables in the dataset.

    -
    summary(apisrs_des)
    +
    summary(apisrs_des)
    ## Independent Sampling design
     ## Called via srvyr
     ## Probabilities:
    @@ -688,28 +687,28 @@ 

    The math

    The syntax

    If we had a sample that was drawn through SRSWR and had no nonresponse or other weighting adjustments, in R, we should specify this design as:

    -
    srswr1_des <- dat %>%
    - as_survey_design()
    +
    srswr1_des <- dat %>%
    + as_survey_design()

where dat is a tibble or data.frame containing our survey data. This syntax is the same as an SRS design, except a finite population correction (FPC) is not included. This is because when we select a sample with replacement, the population pool to select from is no longer finite, so a correction is not needed. Therefore, with large populations where the FPC is negligible, the underlying formulas for SRS and SRSWR designs are the same.
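As a quick back-of-the-envelope check of that claim (the sample and frame sizes here are hypothetical):

n <- 200        # hypothetical sample size
N <- 1000000    # hypothetical frame size
1 - n / N       # FPC factor = 0.9998, so the correction is effectively negligible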

    If some post-survey adjustments were implemented and the weights are not all equal, specify the design as:

    -
    srswr2_des <- dat %>%
    - as_survey_design(weights = wtvar)
    +
    srswr2_des <- dat %>%
    + as_survey_design(weights = wtvar)

    where wtvar is the variable for the weight on the data.

    Example

The {survey} package does not include an example of SRSWR, so to illustrate this design we need to create an example. We use the api population data, apipop, provided by the {survey} package and select a sample of 200 cases using the slice_sample() function from the tidyverse. One of the arguments in the slice_sample() function is replace. If replace = TRUE, then we are conducting an SRSWR. We then calculate selection weights as the inverse of the probability of selection and call this new dataset apisrswr.

    -
    set.seed(409963)
    -
    -apisrswr <- apipop %>%
    - as_tibble() %>%
    - slice_sample(n = 200, replace = TRUE) %>%
    - select(cds, dnum, snum, dname, sname) %>%
    - mutate(
    -  weight = nrow(apipop)/200
    - )
    -
    -head(apisrswr)
    +
    set.seed(409963)
    +
    +apisrswr <- apipop %>%
    + as_tibble() %>%
    + slice_sample(n = 200, replace = TRUE) %>%
    + select(cds, dnum, snum, dname, sname) %>%
    + mutate(
    +  weight = nrow(apipop)/200
    + )
    +
    +head(apisrswr)
    ## # A tibble: 6 × 6
     ##   cds             dnum  snum dname                    sname       weight
     ##   <chr>          <int> <dbl> <chr>                    <chr>        <dbl>
    @@ -720,10 +719,10 @@ 

    Example
    apisrswr %>%
    - group_by(cds) %>%
    - filter(n()>1) %>%
    - arrange(cds)

    +
    apisrswr %>%
    + group_by(cds) %>%
    + filter(n()>1) %>%
    + arrange(cds)
    ## # A tibble: 4 × 6
     ## # Groups:   cds [2]
     ##   cds             dnum  snum dname                 sname          weight
    @@ -733,10 +732,10 @@ 

    Example
    apisrswr_des <- apisrswr %>%
    - as_survey_design(weights = weight)
    -
    -apisrswr_des

    +
    apisrswr_des <- apisrswr %>%
    + as_survey_design(weights = weight)
    +
    +apisrswr_des
    ## Independent Sampling design (with replacement)
     ## Called via srvyr
     ## Sampling variables:
    @@ -745,7 +744,7 @@ 

    Example
    summary(apisrswr_des)
    +
    summary(apisrswr_des)
    ## Independent Sampling design (with replacement)
     ## Called via srvyr
     ## Probabilities:
    @@ -791,25 +790,25 @@ 

    The math

    The syntax

In addition to the fpc and weights arguments discussed in the types above, stratified designs require the addition of the strata argument. For example, to specify a stratified SRS design in {srvyr} when using the FPC, that is, where the population sizes of the strata are known and not too large, specify the design as:

    -
    stsrs1_des <- dat %>%
    - as_survey_design(fpc = fpcvar, 
    -                  strata = stratvar)
    +
    stsrs1_des <- dat %>%
    + as_survey_design(fpc = fpcvar, 
    +                  strata = stratvar)

where fpcvar is a variable on our data that indicates \(N_h\) for each row, and stratvar is a variable indicating the stratum for each row. You can omit the FPC if it is not applicable. Additionally, we can indicate the weight variable if it is present, where wtvar is a variable on our data with a numeric weight.

    -
    stsrs2_des <- dat %>%
    - as_survey_design(weights = wtvar, 
    -                  strata = stratvar)
    +
    stsrs2_des <- dat %>%
    + as_survey_design(weights = wtvar, 
    +                  strata = stratvar)

    Example

    In the example API data, apistrat is a stratified random sample, stratified by school type (stype) with three levels: E for elementary school, M for middle school, and H for high school. As with the SRS example above, we sort and select specific variables for use in printing. The data are illustrated below, including a count of the number of cases per stratum:

    -
    apistrat_slim <-
    - apistrat %>%
    - as_tibble() %>%
    - arrange(dnum, snum) %>%
    - select(cds, dnum, snum, dname, sname, stype, fpc, pw)
    -
    -apistrat_slim %>%
    - count(stype, fpc)
    +
    apistrat_slim <-
    + apistrat %>%
    + as_tibble() %>%
    + arrange(dnum, snum) %>%
    + select(cds, dnum, snum, dname, sname, stype, fpc, pw)
    +
    +apistrat_slim %>%
    + count(stype, fpc)
    ## # A tibble: 3 × 3
     ##   stype   fpc     n
     ##   <fct> <dbl> <int>
    @@ -817,12 +816,12 @@ 

    Example
    apistrat_des <- apistrat_slim %>%
    -  as_survey_design(strata = stype,
    -                   weights = pw,
    -                   fpc = fpc)
    -
    -apistrat_des

    +
    apistrat_des <- apistrat_slim %>%
    +  as_survey_design(strata = stype,
    +                   weights = pw,
    +                   fpc = fpc)
    +
    +apistrat_des
    ## Stratified Independent Sampling design
     ## Called via srvyr
     ## Sampling variables:
    @@ -833,7 +832,7 @@ 

    Example
    summary(apistrat_des)
    +
    summary(apistrat_des)
    ## Stratified Independent Sampling design
     ## Called via srvyr
     ## Probabilities:
    @@ -886,31 +885,31 @@ 

    The math

    The syntax

Clustered sampling designs require the addition of the ids argument, which specifies which variables identify the cluster levels. To specify a two-stage clustered design without replacement, use the following syntax:

    -
    clus2_des <- dat %>%
    - as_survey_design(weights = wtvar, 
    -                  ids = c(PSU, SSU), 
    -                  fpc = c(A, B))
    +
    clus2_des <- dat %>%
    + as_survey_design(weights = wtvar, 
    +                  ids = c(PSU, SSU), 
    +                  fpc = c(A, B))

where PSU and SSU are the variables indicating the PSU and SSU identifiers, and A and B are the variables indicating the population sizes for each level (i.e., A is the number of clusters, and B is the number of units within each cluster). Note that A will be the same for all records within a stratum, and B will be the same for all records within the same cluster.

If clusters were sampled with replacement or from a very large population, an FPC is unnecessary. Additionally, only the first stage of selection is necessary, regardless of whether the units were selected with replacement at any stage. The subsequent stages of selection are ignored in computation, as their contribution to the variance is overpowered by the first stage (see Särndal, Swensson, and Wretman (2003) or Wolter (2007) for a more in-depth discussion). Therefore, the syntax below will yield the same estimates in the end:

    -
    clus2wra_des <- dat %>%
    - as_survey_design(weights = wtvar, 
    -                  ids = c(PSU, SSU))
    -
    -clus2wrb_des <- dat %>%
    - as_survey_design(weights = wtvar, 
    -                  ids = PSU)
    +
    clus2wra_des <- dat %>%
    + as_survey_design(weights = wtvar, 
    +                  ids = c(PSU, SSU))
    +
    +clus2wrb_des <- dat %>%
    + as_survey_design(weights = wtvar, 
    +                  ids = PSU)

Note that there is one additional argument that is sometimes necessary: nest = TRUE. This option relabels cluster IDs to enforce nesting within strata. For example, there may be a cluster 1 and a cluster 2 within each stratum, but these are actually different clusters. This option indicates that the repeated use of numbering does not mean it is the same cluster. If this option is not used and there are repeated cluster IDs across different strata, an error is generated.
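As a minimal sketch, with dat, wtvar, stratvar, and clustvar as hypothetical names and cluster IDs that restart at 1 within each stratum:

nested_des <- dat %>%
  as_survey_design(weights = wtvar,
                   strata = stratvar,
                   ids = clustvar,
                   # relabel IDs so "cluster 1" in different strata stays distinct
                   nest = TRUE)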

    Example

The {survey} package includes a two-stage cluster sample dataset, apiclus2, in which school districts were sampled, and then a random sample of five schools was selected within each district. For districts with fewer than five schools, all schools were sampled. School districts are identified by dnum, and schools are identified by snum. The variable fpc1 indicates how many districts there are in California (A), and fpc2 indicates how many schools were in a given district with at least 100 students (B). The data has a row for each school. In the data printed below, there are 757 school districts, as indicated by fpc1, and there are nine schools in District 731, one school in District 742, two schools in District 768, and so on, as indicated by fpc2. For illustration purposes, the object apiclus2_slim has been created from apiclus2, which subsets the data to only the necessary columns and sorts the data.

    -
    apiclus2_slim <-
    -  apiclus2 %>%
    -  as_tibble() %>%
    -  arrange(desc(dnum), snum) %>%
    -  select(cds, dnum, snum, fpc1, fpc2, pw)
    -
    -apiclus2_slim
    +
    apiclus2_slim <-
    +  apiclus2 %>%
    +  as_tibble() %>%
    +  arrange(desc(dnum), snum) %>%
    +  select(cds, dnum, snum, fpc1, fpc2, pw)
    +
    +apiclus2_slim
    ## # A tibble: 126 × 6
     ##    cds             dnum  snum  fpc1      fpc2    pw
     ##    <chr>          <int> <dbl> <dbl> <int[1d]> <dbl>
    @@ -926,12 +925,12 @@ 

    Example
    apiclus2_des <- apiclus2_slim %>%
    -  as_survey_design(ids = c(dnum, snum),
    -                   fpc = c(fpc1, fpc2),
    -                   weights = pw)
    -
    -apiclus2_des

    +
    apiclus2_des <- apiclus2_slim %>%
    +  as_survey_design(ids = c(dnum, snum),
    +                   fpc = c(fpc1, fpc2),
    +                   weights = pw)
    +
    +apiclus2_des
    ## 2 - level Cluster Sampling design
     ## With (40, 126) clusters.
     ## Called via srvyr
    @@ -942,7 +941,7 @@ 

    Example
    summary(apiclus2_des)
    +
    summary(apiclus2_des)
    ## 2 - level Cluster Sampling design
     ## With (40, 126) clusters.
     ## Called via srvyr
    @@ -968,10 +967,10 @@ 

    Example
    nsfg_des <- nsfgdata %>%
    -  as_survey_design(ids = SECU,
    -                   strata = SEST,
    -                   weights = WGT2017_2019)
    +
    nsfg_des <- nsfgdata %>%
    +  as_survey_design(ids = SECU,
    +                   strata = SEST,
    +                   weights = WGT2017_2019)
    @@ -1004,55 +1003,55 @@

    The math

Replicate weights generally come in groups and are sequentially numbered, such as PWGTP1, PWGTP2, …, PWGTP80 for the person weights in the American Community Survey (ACS) (U.S. Census Bureau 2021) or BRRWT1, BRRWT2, …, BRRWT96 in the 2015 Residential Energy Consumption Survey (RECS) (U.S. Energy Information Administration 2017). This makes it easy to use some of the tidy selection functions in R.

To specify a BRR design, we need to specify the weight variable (weights), the replicate weight variables (repweights), the type of replicate weights as BRR (type = "BRR"), and whether the mean squared error should be used (mse = TRUE) or not (mse = FALSE). For example, if a dataset had WT0 for the main weight and had 20 BRR weights indicated as WT1, WT2, …, WT20, we can use the following syntax (both are equivalent):

    -
    brr_des <- dat %>%
    -  as_survey_rep(weights = WT0,
    -                repweights = all_of(str_c("WT", 1:20)), 
    -                type = "BRR",
    -                mse = TRUE)
    -
    -brr_des <- dat %>%
    -  as_survey_rep(weights = WT0,
    -                repweights = num_range("WT", 1:20),
    -                type = "BRR",
    -                mse = TRUE)
    +
    brr_des <- dat %>%
    +  as_survey_rep(weights = WT0,
    +                repweights = all_of(str_c("WT", 1:20)), 
    +                type = "BRR",
    +                mse = TRUE)
    +
    +brr_des <- dat %>%
    +  as_survey_rep(weights = WT0,
    +                repweights = num_range("WT", 1:20),
    +                type = "BRR",
    +                mse = TRUE)

If a dataset had WT for the main weight and had 20 BRR weights indicated as REPWT1, REPWT2, …, REPWT20, the following syntax could be used (both are equivalent):

    -
    brr_des <- dat %>%
    -  as_survey_rep(weights = WT,
    -                repweights = all_of(str_c("REPWT", 1:20)),
    -                type = "BRR",
    -                mse = TRUE)
    -
    -brr_des <- dat %>%
    -  as_survey_rep(weights = WT,
    -                repweights = starts_with("REPWT"),
    -                type = "BRR",
    -                mse = TRUE)
    +
    brr_des <- dat %>%
    +  as_survey_rep(weights = WT,
    +                repweights = all_of(str_c("REPWT", 1:20)),
    +                type = "BRR",
    +                mse = TRUE)
    +
    +brr_des <- dat %>%
    +  as_survey_rep(weights = WT,
    +                repweights = starts_with("REPWT"),
    +                type = "BRR",
    +                mse = TRUE)

    If the replicate weight variables are in the file consecutively, the following syntax can also be used:

    -
    brr_des <- dat %>%
    -  as_survey_rep(weights = WT,
    -                repweights = REPWT1:REPWT20,
    -                type = "BRR",
    -                mse = TRUE)
    +
    brr_des <- dat %>%
    +  as_survey_rep(weights = WT,
    +                repweights = REPWT1:REPWT20,
    +                type = "BRR",
    +                mse = TRUE)

    Typically, each replicate weight sums to a value similar to the main weight, as both the replicate weights and the main weight are supposed to provide population estimates. Rarely, an alternative method will be used where the replicate weights have values of 0 or 2 in the case of BRR weights. This would be indicated in the documentation (see Chapter 3 for more information on how to understand the provided documentation). In this case, the replicate weights are not combined, and the option combined_weights = FALSE should be indicated, as the default value for this argument is TRUE. This specific syntax is shown below:

    -
    brr_des <- dat %>%
    -  as_survey_rep(weights = WT,
    -                repweights = starts_with("REPWT"),
    -                type = "BRR",
    -                combined_weights = FALSE,
    -                mse = TRUE)
    +
    brr_des <- dat %>%
    +  as_survey_rep(weights = WT,
    +                repweights = starts_with("REPWT"),
    +                type = "BRR",
    +                combined_weights = FALSE,
    +                mse = TRUE)

    Example

    The {survey} package includes a data example from Section 12.2 of Levy and Lemeshow (2013). In this fictional data, two out of five ambulance stations were sampled from each of three emergency service areas (ESAs), thus BRR weights are appropriate with 2 PSUs (stations) sampled in each stratum (ESA). In the code below, BRR weights are created as was done by Levy and Lemeshow (2013).

    -
    scdbrr <- scd %>%
    -  as_tibble() %>%
    -  mutate(wt = 5 / 2,
    -         rep1 = 2 * c(1, 0, 1, 0, 1, 0),
    -         rep2 = 2 * c(1, 0, 0, 1, 0, 1),
    -         rep3 = 2 * c(0, 1, 1, 0, 0, 1),
    -         rep4 = 2 * c(0, 1, 0, 1, 1, 0))
    -
    -scdbrr
    +
    scdbrr <- scd %>%
    +  as_tibble() %>%
    +  mutate(wt = 5 / 2,
    +         rep1 = 2 * c(1, 0, 1, 0, 1, 0),
    +         rep2 = 2 * c(1, 0, 0, 1, 0, 1),
    +         rep3 = 2 * c(0, 1, 1, 0, 0, 1),
    +         rep4 = 2 * c(0, 1, 0, 1, 1, 0))
    +
    +scdbrr
    ## # A tibble: 6 × 9
     ##     ESA ambulance arrests alive    wt  rep1  rep2  rep3  rep4
     ##   <int>     <int>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
    @@ -1063,13 +1062,13 @@ 

    Example
    scdbrr_des <- scdbrr %>%
    -  as_survey_rep(type = "BRR",
    -                repweights = starts_with("rep"),
    -                combined_weights = FALSE,  
    -                weight = wt)
    -
    -scdbrr_des

    +
    scdbrr_des <- scdbrr %>%
    +  as_survey_rep(type = "BRR",
    +                repweights = starts_with("rep"),
    +                combined_weights = FALSE,  
    +                weight = wt)
    +
    +scdbrr_des
    ## Call: Called via srvyr
     ## Balanced Repeated Replicates with 4 replicates.
     ## Sampling variables:
    @@ -1078,7 +1077,7 @@ 

    Example
    summary(scdbrr_des)
    +
    summary(scdbrr_des)
    ## Call: Called via srvyr
     ## Balanced Repeated Replicates with 4 replicates.
     ## Sampling variables:
    @@ -1104,25 +1103,25 @@ 

    The math

    The syntax

The syntax is very similar for BRR and Fay’s BRR. To specify a Fay’s BRR design, we need to specify the weight variable (weights), the replicate weight variables (repweights), the type of replicate weights as Fay’s BRR (type = "Fay"), whether the mean squared error should be used (mse = TRUE) or not (mse = FALSE), and Fay’s multiplier (rho). For example, if a dataset had WT0 for the main weight and had 20 BRR weights indicated as WT1, WT2, …, WT20, and Fay’s multiplier is 0.3, use the following syntax:

    -
    fay_des <- dat %>%
    -  as_survey_rep(weights = WT0,
    -                repweights = num_range("WT", 1:20),
    -                type = "Fay",
    -                mse = TRUE,
    -                rho = 0.3)
    +
    fay_des <- dat %>%
    +  as_survey_rep(weights = WT0,
    +                repweights = num_range("WT", 1:20),
    +                type = "Fay",
    +                mse = TRUE,
    +                rho = 0.3)

    Example

The 2015 RECS (U.S. Energy Information Administration 2017) uses Fay’s BRR weights with the final weight as NWEIGHT and replicate weights as BRRWT1 - BRRWT96, and the documentation specifies a Fay’s multiplier of 0.5. On the file, DOEID is a unique identifier for each respondent, TOTALDOL is the total cost of energy, TOTSQFT_EN is the total square footage of the residence, and REGIONC is the Census region. We have already pulled in the 2015 RECS data from the {srvyrexploR} package that provides data for this book. To specify the design for the recs_2015 data, use the following syntax:

    -
    recs_2015_des <- recs_2015 %>%
    -  as_survey_rep(weights = NWEIGHT,
    -                repweights = BRRWT1:BRRWT96,
    -                type = "Fay",
    -                rho = 0.5,
    -                mse = TRUE,
    -                variables = c(DOEID, TOTALDOL, TOTSQFT_EN, REGIONC))
    -
    -recs_2015_des 
    +
    recs_2015_des <- recs_2015 %>%
    +  as_survey_rep(weights = NWEIGHT,
    +                repweights = BRRWT1:BRRWT96,
    +                type = "Fay",
    +                rho = 0.5,
    +                mse = TRUE,
    +                variables = c(DOEID, TOTALDOL, TOTSQFT_EN, REGIONC))
    +
    +recs_2015_des 
    ## Call: Called via srvyr
     ## Fay's variance method (rho= 0.5 ) with 96 replicates and MSE variances.
     ## Sampling variables:
    @@ -1144,7 +1143,7 @@ 

    Example
    summary(recs_2015_des) 

    +
    summary(recs_2015_des) 
    ## Call: Called via srvyr
     ## Fay's variance method (rho= 0.5 ) with 96 replicates and MSE variances.
     ## Sampling variables:
    @@ -1185,35 +1184,35 @@ 

    The math

    The syntax

    To specify the jackknife method, we use the survey documentation to understand the type of jackknife (1, n, or 2) and the multiplier. In the syntax we need to specify the weight variable (weights), the replicate weight variables (repweights), the type of replicate weights as jackknife 1 (type = "JK1"), n (type = "JKN"), or 2 (type = "JK2"), whether the mean squared error should be used (mse = TRUE) or not (mse = FALSE), and the multiplier (scale). For example, if the survey is a jackknife 1 method with a multiplier of \(\alpha_r=(R-1)/R=19/20=0.95\), the dataset has WT0 for the main weight and 20 replicate weights indicated as WT1, WT2, …, WT20, use the following syntax:

    -
    jk1_des <- dat %>%
    - as_survey_rep(weights = WT0, 
    -               repweights= num_range("WT", 1:20),
    -               type="JK1", 
    -               mse=TRUE, 
    -               scale=0.95)
    +
    jk1_des <- dat %>%
    + as_survey_rep(weights = WT0, 
    +               repweights= num_range("WT", 1:20),
    +               type="JK1", 
    +               mse=TRUE, 
    +               scale=0.95)

    For a jackknife n method, we need to specify the multiplier for all replicates. In this case we use the rscales argument to specify each one. The documentation will provide details on what the multipliers (\(\alpha_r\)) are, and they may be the same for all replicates. For example, consider a case where \(\alpha_r=0.1\) for all replicates and the dataset had WT0 for the main weight and had 20 replicate weights indicated as WT1, WT2, …, WT20. We specify the type as type = "JKN", and the multiplier as rscales=rep(0.1,20):

    -
    jkn_des <- dat %>%
    - as_survey_rep(weights = WT0, 
    -               repweights= num_range("WT", 1:20),
    -               type="JKN", 
    -               mse=TRUE, 
    -               rscales=rep(0.1, 20))
    +
    jkn_des <- dat %>%
    + as_survey_rep(weights = WT0, 
    +               repweights= num_range("WT", 1:20),
    +               type="JKN", 
    +               mse=TRUE, 
    +               rscales=rep(0.1, 20))

    Example

The 2020 RECS (U.S. Energy Information Administration 2023b) uses jackknife weights, with the final weight as NWEIGHT and the replicate weights as NWEIGHT1 - NWEIGHT60, with a scale of \((R-1)/R=59/60\). On the file, DOEID is a unique identifier for each respondent, TOTALDOL is the total cost of energy, TOTSQFT_EN is the total square footage of the residence, and REGIONC is the Census region. We have already read in the RECS data and created a dataset called recs_2020 above in the prerequisites.

    To specify this design, use the following syntax:

    +
    recs_des <- recs_2020 %>%
    +  as_survey_rep(
    +    weights = NWEIGHT,
    +    repweights = NWEIGHT1:NWEIGHT60,
    +    type = "JK1",
    +    scale = 59/60,
    +    mse = TRUE,
    +    variables = c(DOEID, TOTALDOL, TOTSQFT_EN, REGIONC)
    +  )
    +
    +recs_des
    ## Call: Called via srvyr
     ## Unstratified cluster jacknife (JK1) with 60 replicates and MSE variances.
     ## Sampling variables:
    @@ -1232,7 +1231,7 @@ 

    Example
    +
    summary(recs_des)
    ## Call: Called via srvyr
     ## Unstratified cluster jacknife (JK1) with 60 replicates and MSE variances.
     ## Sampling variables:
    @@ -1268,40 +1267,40 @@ 

    The math
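The bootstrap uses the same replication form, with a single multiplier \(\alpha\) applied to all replicates:

\[\hat{V}\left(\hat{\theta}\right)=\alpha\sum_{r=1}^R \left(\hat{\theta}_r-\hat{\theta}\right)^2\]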

    The syntax

To specify a bootstrap method, we need to specify the weight variable (weights), the replicate weight variables (repweights), the type of replicate weights as bootstrap (type = "bootstrap"), whether the mean squared error should be used (mse = TRUE) or not (mse = FALSE), and the multiplier (scale). For example, if a dataset has WT0 for the main weight, 20 bootstrap weights indicated as WT1, WT2, …, WT20, and a multiplier of \(\alpha=.02\), use the following syntax:

    +
    bs_des <- dat %>%
    + as_survey_rep(weights = WT0, 
    +               repweights= num_range("WT", 1:20),
    +               type="bootstrap", 
    +               mse=TRUE, 
    +               scale=.02)

    Example

Returning to the api example, we are going to create a dataset with bootstrap weights to use as an example. Here, we construct a one-stage cluster design with fifty bootstrap replicate weights.33

    +
    apiclus1_slim <-
    +  apiclus1 %>%
    +  as_tibble() %>%
    +  arrange(dnum) %>%
    +  select(cds, dnum, fpc, pw)
    +
    +set.seed(662152)
    +apibw <-
    +  bootweights(psu = apiclus1_slim$dnum,
    +              strata = rep(1, nrow(apiclus1_slim)),
    +              fpc = apiclus1_slim$fpc,
    +              replicates = 50)
    +
    +bwmata <-
    +  apibw$repweights$weights[apibw$repweights$index,] * apiclus1_slim$pw
    +
    +apiclus1_slim <- bwmata %>%
    +  as.data.frame() %>%
    +  set_names(str_c("pw", 1:50)) %>%
    +  cbind(apiclus1_slim) %>%
    +  as_tibble() %>%
    +  select(cds, dnum, fpc, pw, everything())
    +
    +apiclus1_slim
    ## # A tibble: 183 × 54
     ##    cds        dnum   fpc    pw   pw1   pw2   pw3   pw4   pw5   pw6   pw7
     ##    <chr>     <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
    @@ -1325,14 +1324,14 @@ 

Example

The data look similar to the original dataset (Section 10.1) but now additionally include the bootstrap weights pw1, …, pw50. When creating the survey design object, we use the bootstrap weights as the replicate weights. Additionally, with replicate weights, we need to include the scale (\(\alpha\)). For this example, we created

\[\alpha=\frac{M}{(M-1)(R-1)}=\frac{15}{(15-1)\times(50-1)}=0.02186589\] where \(M\) is the average number of PSUs per stratum and \(R\) is the number of replicates. There is only 1 stratum, and the number of clusters/PSUs is 15, so \(M=15\).
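Rather than hard-coding the rounded value, we can compute this multiplier directly in R as a quick check of the arithmetic (the names M, R, and alpha below are ours, not variables on the file):

M <- 15 # average number of PSUs per stratum (15 PSUs, 1 stratum)
R <- 50 # number of bootstrap replicates
alpha <- M / ((M - 1) * (R - 1))
alpha
## [1] 0.02186589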

    +
    api1_bs_des <- apiclus1_slim %>%
    +  as_survey_rep(weights = pw,
    +                repweights = pw1:pw50,
    +                type = "bootstrap",
    +                scale = 0.02186589,
    +                mse = TRUE)
    +
    +api1_bs_des 
    ## Call: Called via srvyr
     ## Survey bootstrap with 50 replicates and MSE variances.
     ## Sampling variables:
    @@ -1354,7 +1353,7 @@ 

    Example
    +
    summary(api1_bs_des) 
    ## Call: Called via srvyr
     ## Survey bootstrap with 50 replicates and MSE variances.
     ## Sampling variables:
    @@ -1393,18 +1392,18 @@ 

    10.5 Exercises
• The National Health Interview Survey (NHIS) is an annual household survey conducted by the National Center for Health Statistics (NCHS). The NHIS includes a wide variety of health topics for adults, including health status and conditions, functioning and disability, health care access and health service utilization, health-related behaviors, health promotion, mental health, barriers to care, and community engagement. Like many national in-person surveys, the sampling design is a stratified clustered design, with details included in the Survey Description34. The Survey Description provides information on setting up syntax in SUDAAN, Stata, SPSS, SAS, and R ({survey} package implementation). How would you specify the design in {srvyr} using either as_survey_design() or as_survey_rep()?
    +
    nhis_adult_des <- nhis_adult_data %>%
    +  as_survey_design(ids=PPSU, 
    +  strata=PSTRAT, 
    +  nest=TRUE, 
    +  weights=WTFA_A)
    1. The General Social Survey is a survey that has been administered since 1972 on social, behavioral, and attitudinal topics. The 2016-2020 GSS Panel codebook35 provides examples of setting up syntax in SAS and Stata but not R. How would you specify the design in R?
    +
    gss_des <- gss_data %>%
    +  as_survey_design(ids = VPSU_2,
    +                   strata = VSTRAT_2,
    +                   weights = WTSSNR_2)
diff --git a/c11-missing-data.html b/c11-missing-data.html index 8e226f70..6d979a4b 100644 --- a/c11-missing-data.html +++ b/c11-missing-data.html
      @@ -497,38 +496,38 @@

      Prerequisites

      For this chapter, load the following packages:

      +
      library(tidyverse)
      +library(survey) 
      +library(srvyr) 
      +library(srvyrexploR)
      +library(naniar)
      +library(haven)
      +library(gt)

      We will be using data from ANES and RECS. Here is the code to create the design objects for each to use throughout this chapter. For ANES, we need to adjust the weight so it sums to the population instead of the sample (see the ANES documentation and Chapter 3 for more information).

      +
      targetpop <- 231592693
      +data(anes_2020)
      +
      +anes_adjwgt <- anes_2020 %>%
      +  mutate(Weight = Weight / sum(Weight) * targetpop)
      +
      +anes_des <- anes_adjwgt %>%
      +  as_survey_design(
      +    weights = Weight,
      +    strata = Stratum,
      +    ids = VarUnit,
      +    nest = TRUE
      +  )

      For RECS, details are included in the RECS documentation and Chapter 10.

      +
      data(recs_2020)
      +
      +recs_des <- recs_2020 %>%
      +  as_survey_rep(
      +    weights = NWEIGHT,
      +    repweights = NWEIGHT1:NWEIGHT60,
      +    type = "JK1",
      +    scale = 59/60,
      +    mse = TRUE
      +  )

      11.1 Introduction

      @@ -553,9 +552,9 @@

      11.3 Assessing missing data

      11.3.1 Summarize data

A very rudimentary first exploration is to use the summary() function, which illuminates NA values in the data. Let’s look at a few analytic variables on the ANES 2020 data using summary():

      +
      anes_2020 %>%
      +  select(V202051:EarlyVote2020) %>%
      +  summary()
      ##     V202051          V202066        V202072       VotedPres2020
       ##  Min.   :-9.000   Min.   :-9.0   Min.   :-9.000   Yes :5952    
       ##  1st Qu.:-1.000   1st Qu.: 4.0   1st Qu.: 1.000   No  :  77    
      @@ -578,8 +577,8 @@ 

      11.3.1 Summarize data

      +
      anes_2020 %>% 
      +  count(VotedPres2020,V202072)
      ## # A tibble: 5 × 3
       ##   VotedPres2020 V202072                                   n
       ##   <fct>         <dbl+lbl>                             <int>
      @@ -593,14 +592,14 @@ 

      11.3.1 Summarize data

      11.3.2 Visualization of missing data

It can be challenging to look at tables for every variable; instead, it may be more efficient to view missing data in a graphical format to help narrow in on patterns or unique variables. The {naniar} package is very useful for exploring missing data visually, providing quick graphics to explore missingness patterns. We can use the vis_miss() function, available in both the {visdat} and {naniar} packages, to view the amount of missing data by variable.

      +
      anes_2020_derived<-anes_2020 %>%
      +  select(!starts_with("V2"),-CaseID,-InterviewMode,-Weight,-Stratum,-VarUnit)
      +
      +anes_2020_derived %>%
      +  vis_miss(cluster= TRUE, show_perc = FALSE) +
      +  scale_fill_manual(values = book_colors[c(3,1)], 
      +                    labels = c("Present","Missing"),
      +                    name = "")
This chart shows the missingness of the selected variables, where missing values are highlighted in a dark color. Each row of the plot is an observation, and each column is a variable. Some patterns are observed, such as a large block of missing for `VotedPres2016_selection`, with many of the same respondents also missing `VotedPres2020_selection`.

      @@ -609,34 +608,34 @@

11.3.2 Visualization of missing data

From this visualization, we can start to get a picture of which questions may be related to each other in terms of missing data. Even if we did not have the informative variable names, we would be able to deduce that VotedPres2020, VotedPres2020_selection, and EarlyVote2020 are likely related, since their missing data patterns are similar.

Additionally, we can look at VotedPres2016_selection and see that there is a lot of missing data in that variable. Most likely this is due to a skip pattern, and we can look at further graphics to see how it might be related to other variables. The {naniar} package has multiple visualization functions that can help dive deeper, such as the gg_miss_fct() function, which looks at missing data for all variables by levels of another variable.

      +
      anes_2020_derived %>% 
      +  gg_miss_fct(VotedPres2016) +
      +  scale_fill_gradientn(
      +    guide = "colorbar",
      +    name = "% Miss",
      +    colors = book_colors[c(3, 2, 1)] 
      +  ) +
      +  ylab("Variable") +
      +  xlab("Voted for President in 2016")
      ## Scale for fill is already present.
       ## Adding another scale for fill, which will replace the existing scale.
      +This chart has x-axis 'Voted for President in 2016' with labels Yes, No and NA and has y-axis 'Variable' with labels Age, AgeGroup, CampaignInterest, EarlyVote2020, Education, Gender, Income, Income7, PartyID, RaceEth, TrustGovernment, TrustPeople, VotedPres2016_selection, VotedPres2020 and VotedPres2020_selection. There is a legend indicating fill is used to show pct_miss, ranging from 0 represented by fill very pale blue to 100 shown as fill dark blue. Among those that voted for president in 2016, they had little missing for other variables (light color) but those that did not vote have more missing data in their 2020 voting patterns and their 2016 president selection.

      FIGURE 11.2: Missingness in variables for each level of VotedPres2016 in the ANES 2020 data

In this case, we can see that if respondents did not vote for president in 2016 or did not answer that question, then they were not asked who they voted for in 2016 (the percentage of missing data is 100%). Additionally, we can see with this graphic that there is more missing data across all questions if they did not provide an answer to VotedPres2016.

There are other graphics that work well with numeric data. For example, in the RECS 2020 data, we can plot two continuous variables and the missing data associated with them to see if there are any patterns in the missingness. To do this, we can use the bind_shadow() function from the {naniar} package. This creates a nabular (a combination of “na” and “tabular”), which features the original columns followed by the same number of columns with a specific NA format. These NA columns indicate whether the value in the original data is missing or not. The example printed below shows how most levels of HeatingBehavior are not missing (!NA) in the shadow variable HeatingBehavior_NA, while those missing in HeatingBehavior are also coded NA in HeatingBehavior_NA.

      +
      recs_2020_shadow <- recs_2020 %>% 
      +  bind_shadow()
      +
      +ncol(recs_2020)
      ## [1] 118
      +
      ncol(recs_2020_shadow)
      ## [1] 236
      +
      recs_2020_shadow %>% 
      +  count(HeatingBehavior,HeatingBehavior_NA)
      ## # A tibble: 7 × 3
       ##   HeatingBehavior                               HeatingBehavior_NA     n
       ##   <fct>                                         <fct>              <int>
      @@ -648,17 +647,17 @@ 

##  6 Other                                         !NA                   46
##  7 <NA>                                          NA                   751

We can then use these new variables to plot the missing data alongside the actual data. For example, let’s plot a histogram of the total electric bill, grouped by whether HeatingBehavior is missing or present.

      +
      recs_2020_shadow %>% 
      +  filter(TOTALDOL < 5000) %>%
      +  ggplot(aes(x=TOTALDOL,fill=HeatingBehavior_NA)) +
      +  geom_histogram() +
      +  scale_fill_manual(values = book_colors[c(3, 1)],
      +                    labels = c("Present", "Missing"),
      +                    name = "Heating Behavior") +
      +  theme_minimal() +
      +  xlab("Total Energy Cost (Truncated at $5000)") +
      +  ylab("Number of Households") +
      +  labs(title = "Histogram of Energy Cost by Heating Behavior Missing Data")
      ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
This chart has title 'Histogram of Energy Cost by Heating Behavior Missing Data'. It has x-axis 'Total Energy Cost (Truncated at $5000)' with labels 0, 1000, 2000, 3000, 4000 and 5000. It has y-axis 'Number of Households' with labels 0, 500, 1000 and 1500. There is a legend indicating fill is used to show HeatingBehavior_NA, with 2 levels: !NA shown as very pale blue fill and NA shown as dark blue fill. The chart is a bar chart with 30 vertical bars. These are stacked, as sorted by HeatingBehavior_NA.

@@ -687,15 +686,15 @@

      11.4.1 Recoding missing data

When we created the derived variables for use in this book, we coded all negative values as NA and proceeded to analyze the data. In most cases this is an appropriate approach, as long as you filter the data appropriately to account for skip patterns (see the next section). However, the {naniar} package does have the option to code special missing values. For example, if we wanted two NA values, one indicating the question was missing by design (e.g., due to skip patterns) and one for the other missing categories, we can use the nabular format to incorporate these with the recode_shadow() function.

      +
      anes_2020_shadow<-anes_2020 %>%
      +  select(starts_with("V2")) %>%
      +  mutate(across(everything(),~case_when(.x < -1 ~ NA,
      +                                        TRUE~.x))) %>% 
      +  bind_shadow() %>%
      +  recode_shadow(V201103 = .where(V201103==-1~"skip"))
      +
      +anes_2020_shadow %>% 
      +  count(V201103,V201103_NA)
      ## # A tibble: 5 × 3
       ##   V201103                 V201103_NA     n
       ##   <dbl+lbl>               <fct>      <int>
      @@ -710,14 +709,14 @@ 

11.4.1 Recoding missing data

11.4.2 Accounting for skip patterns

When questions are skipped by design in a survey, it is meaningful that the data are later missing. For example, the RECS survey asks people how they control the heat in their home in the winter (HeatingBehavior). This is asked only of those who have heat in their home (SpaceHeatingUsed). If no heating equipment is used, the value of HeatingBehavior is missing. We have several choices when analyzing these data, which include 1) only including those with a valid value of HeatingBehavior and specifying the universe as those with heat, or 2) including those who do not have heat. It is important to specify what population an analysis generalizes to.

Here is example code where we only include those with a valid value of HeatingBehavior (choice 1). Note that we use the design object (recs_des) and then filter to those that are not missing on HeatingBehavior.

      +
      heat_cntl_1 <- recs_des %>%
      +  filter(!is.na(HeatingBehavior)) %>%
      +  group_by(HeatingBehavior) %>%
      +  summarize(
      +    p=survey_prop()
      +  )
      +
      +heat_cntl_1
      ## # A tibble: 6 × 3
       ##   HeatingBehavior                                              p    p_se
       ##   <fct>                                                    <dbl>   <dbl>
      @@ -728,13 +727,13 @@ 

##  5 No control                                             0.0333  1.70e-3
##  6 Other                                                  0.00208 3.59e-4

Here is example code where we include those that do not have heat (choice 2). To help understand what we are looking at, we have included the output showing both variables, SpaceHeatingUsed and HeatingBehavior.

      +
      heat_cntl_2 <- recs_des %>%
      +  group_by(interact(SpaceHeatingUsed, HeatingBehavior)) %>%
      +  summarize(
      +    p=survey_prop()
      +  )
      +
      +heat_cntl_2
      ## # A tibble: 7 × 4
       ##   SpaceHeatingUsed HeatingBehavior                             p    p_se
       ##   <lgl>            <fct>                                   <dbl>   <dbl>
      @@ -747,24 +746,24 @@ 

##  7 TRUE             Other                                 0.00198 3.41e-4

If we ran the first analysis, we would say that 16.8% of households with heat use a programmable or smart thermostat for the heating of their home, while from the second analysis we would say that 16% of all households use a programmable or smart thermostat for the heating of their home. The distinction between the two statements is the universe: households with heat versus all households. Skip patterns often change the universe that we are talking about and need to be carefully examined.

Filtering to the correct universe is important when handling these types of missing data. The nabular we created above can also help with this. If we have NA_skip values in the shadow, we can make sure that we filter out all of these values and include only the relevant missing values. To do this with survey data, we first create the nabular, then create the design object on that data, and then use the shadow variables to assist with filtering. Let’s use the nabular we created above for ANES 2020 (anes_2020_shadow) to create the design object.

      +
      anes_adjwgt_shadow <- anes_2020_shadow %>% 
      +  mutate(V200010b = V200010b/sum(V200010b)*targetpop)
      +
      +anes_des_shadow <- anes_adjwgt_shadow %>% 
      +  as_survey_design(
      +    weights = V200010b,
      +    strata = V200010d,
      +    ids = V200010c,
      +    nest = TRUE
      +  )

      Then we can use this design object to look at the percent of the population that voted for each candidate in 2016 (V201103). First, let’s look at the percentages without removing any cases:

      +
      pres16_select1<-anes_des_shadow %>% 
      +  group_by(V201103) %>%
      +  summarize(
      +    All_Missing=survey_prop()
      +  )
      +
      +pres16_select1
      ## # A tibble: 5 × 3
       ##   V201103                 All_Missing All_Missing_se
       ##   <dbl+lbl>                     <dbl>          <dbl>
      @@ -774,14 +773,14 @@ 

##  4 5 [5. Other {SPECIFY}]      0.0409        0.00230
##  5 NA                          0.00627       0.00121

Next, we look at the percentages after removing only those missing due to skip patterns (i.e., those who did not receive this question).

      +
      pres16_select2<-anes_des_shadow %>% 
      +  filter(V201103_NA!="NA_skip") %>% 
      +  group_by(V201103) %>%
      +  summarize(
      +    No_Skip_Missing=survey_prop()
      +  )
      +
      +pres16_select2
      ## # A tibble: 4 × 3
       ##   V201103                 No_Skip_Missing No_Skip_Missing_se
       ##   <dbl+lbl>                         <dbl>              <dbl>
      @@ -790,14 +789,14 @@ 

##  3 5 [5. Other {SPECIFY}]          0.0606            0.00330
##  4 NA                              0.00928           0.00178

Finally, we look at the percentages after removing all missing values, both those due to skip patterns and those due to refusals to answer the question.

      +
      pres16_select3<-anes_des_shadow %>% 
      +  filter(V201103_NA=="!NA") %>% 
      +  group_by(V201103) %>%
      +  summarize(
      +    No_Missing=survey_prop()
      +  )
      +
      +pres16_select3
      ## # A tibble: 3 × 3
       ##   V201103                No_Missing No_Missing_se
       ##   <dbl+lbl>                   <dbl>         <dbl>
      diff --git a/c12-pitfalls.html b/c12-pitfalls.html
      index 427ccd25..97f0d405 100644
      --- a/c12-pitfalls.html
      +++ b/c12-pitfalls.html
      @@ -497,15 +496,15 @@

      Prerequisites

      For this chapter, load the following packages:

      +
      library(tidyverse)
      +library(survey)
      +library(srvyr) 
      +library(srvyrexploR)
      +library(gt)

      We will use data from the United States National Crime Victimization Survey (NCVS). Here is the code to read in the three datasets from the {srvyrexploR} package:

      +
      data(ncvs_2021_incident)
      +data(ncvs_2021_household)
      +data(ncvs_2021_person)

      13.1 Introduction

      @@ -659,21 +658,21 @@

13.4.1 Preparing Files for Estimation

We want to create four variables to indicate if an incident is a series crime. First, we create a variable called series using V4017, V4018, and V4019, where an incident is considered a series crime if there are 6 or more incidents (V4017), the incidents are similar in detail (V4018), or there is not enough detail to distinguish the incidents (V4019). Next, we top-code the number of incidents (V4016) by creating a variable n10v4016, which is set to 10 if V4016 > 10. Finally, we create the series weight using our new top-coded variable and the existing weight.

      +
      inc_series <- ncvs_2021_incident %>%
      +  mutate(
      +    series = case_when(V4017 %in% c(1, 8) ~ 1,
      +                       V4018 %in% c(2, 8) ~ 1,
      +                       V4019 %in% c(1, 8) ~ 1,
      +                       TRUE ~ 2
      +    ),
      +    n10v4016 = case_when(V4016 %in% c(997, 998) ~ NA_real_,
      +                         V4016 > 10 ~ 10,
      +                         TRUE ~ V4016),
      +    serieswgt = case_when(series == 2 & is.na(n10v4016) ~ 6,
      +                          series == 2 ~ n10v4016,
      +                          TRUE ~ 1),
      +    NEWWGT = WGTVICCY * serieswgt
      +  )

The next step in preparing the files for estimation is to create indicators on the victimization file for characteristics of interest. Almost all BJS publications limit the analysis to records where the victimization occurred in the United States (where V4022 is not equal to 1), and we do this for all estimates as well. A brief codebook of variables for this task is located in Table 13.2.

      @@ -1067,32 +1066,32 @@

13.4.1 Preparing Files for Estimation
      +
      inc_ind <- inc_series %>%
      +  filter(V4022 != 1) %>%
      +  mutate(
      +    WeapCat = case_when(
      +      is.na(V4049) ~ NA_character_,
      +      V4049 == 2 ~ "NoWeap",
      +      V4049 == 3 ~ "UnkWeapUse",
      +      V4050 == 3 ~ "Other",
      +      V4051 == 1 | V4052 == 1 | V4050 == 7 ~ "Firearm",
      +      V4053 == 1 | V4054 == 1 ~ "Knife",
      +      TRUE ~ "Other"
      +    ),
      +    V4529_num = parse_number(as.character(V4529)),
      +    ReportPolice = V4399 == 1,
      +    Property = V4529_num >= 31,
      +    Violent = V4529_num <= 20,
      +    Property_ReportPolice = Property & ReportPolice,
      +    Violent_ReportPolice = Violent & ReportPolice,
      +    AAST = V4529_num %in% 11:13,
      +    AAST_NoWeap = AAST & WeapCat == "NoWeap",
      +    AAST_Firearm = AAST & WeapCat == "Firearm",
      +    AAST_Knife = AAST & WeapCat == "Knife",
      +    AAST_Other = AAST & WeapCat == "Other"
      +  )

      This is a good point to pause to look at the output of crosswalks between an original variable and a derived one to check that the logic was programmed correctly and that everything ends up in the expected category.

      +
      inc_series %>% count(V4022)
      ## # A tibble: 6 × 2
       ##   V4022     n
       ##   <fct> <int>
      @@ -1102,7 +1101,7 @@ 

## 4 4      1143
## 5 5        39
## 6 8         4

      +
      inc_ind %>% count(V4022)
      ## # A tibble: 5 × 2
       ##   V4022     n
       ##   <fct> <int>
      @@ -1111,8 +1110,8 @@ 

## 3 4      1143
## 4 5        39
## 5 8         4

      +
      inc_ind %>%
      +  count(WeapCat, V4049, V4050, V4051, V4052, V4052, V4053, V4054)
      ## # A tibble: 13 × 8
       ##    WeapCat    V4049 V4050 V4051 V4052 V4053 V4054     n
       ##    <chr>      <fct> <fct> <fct> <fct> <fct> <fct> <int>
      @@ -1129,7 +1128,7 @@ 

## 11 Other      1     3     0     0     0     0        26
## 12 UnkWeapUse 3     <NA>  <NA>  <NA>  <NA>  <NA>     519
## 13 <NA>       <NA>  <NA>  <NA>  <NA>  <NA>  <NA>    6228

      +
      inc_ind %>% count(V4529, Property, Violent, AAST) %>% print(n = 40)
      ## # A tibble: 34 × 5
       ##    V4529 Property Violent AAST      n
       ##    <fct> <lgl>    <lgl>   <lgl> <int>
      @@ -1167,7 +1166,7 @@ 

## 32 57    TRUE     FALSE   FALSE  1420
## 33 58    TRUE     FALSE   FALSE   798
## 34 59    TRUE     FALSE   FALSE   395

      +
      inc_ind %>% count(ReportPolice, V4399)
      ## # A tibble: 4 × 3
       ##   ReportPolice V4399     n
       ##   <lgl>        <fct> <int>
      @@ -1175,13 +1174,13 @@ 

## 2 FALSE        3       103
## 3 FALSE        8        12
## 4 TRUE         1      3163

      +
      inc_ind %>%
      +  count(AAST,
      +        WeapCat,
      +        AAST_NoWeap,
      +        AAST_Firearm,
      +        AAST_Knife,
      +        AAST_Other)
      ## # A tibble: 11 × 7
       ##    AAST  WeapCat    AAST_NoWeap AAST_Firearm AAST_Knife AAST_Other     n
       ##    <lgl> <chr>      <lgl>       <lgl>        <lgl>      <lgl>      <int>
      @@ -1197,43 +1196,43 @@ 

## 10 TRUE  Other      FALSE       FALSE        FALSE      TRUE         146
## 11 TRUE  UnkWeapUse FALSE       FALSE        FALSE      FALSE          3

      After creating indicators of victimization types and characteristics, the file is summarized, and crimes are summed across persons or households by YEARQ. Property crimes (i.e., crimes committed against households, such as household burglary or motor vehicle theft) are summed across households, and personal crimes (i.e., crimes committed against an individual, such as assault, robbery, and personal theft) are summed across persons. The indicators are summed using the serieswgt, and the variable WGTVICCY needs to be retained for later analysis.

      +
      inc_hh_sums <-
      +  inc_ind %>%
      +  filter(V4529_num > 23) %>% # restrict to household crimes
      +  group_by(YEARQ, IDHH) %>%
      +  summarize(WGTVICCY = WGTVICCY[1],
      +            across(starts_with("Property"), 
      +                   ~ sum(. * serieswgt),
      +                   .names = "{.col}"),
      +            .groups = "drop")
      +
      +inc_pers_sums <-
      +  inc_ind %>%
      +  filter(V4529_num <= 23) %>% # restrict to person crimes
      +  group_by(YEARQ, IDHH, IDPER) %>%
      +  summarize(WGTVICCY = WGTVICCY[1],
      +            across(c(starts_with("Violent"), starts_with("AAST")),
      +                   ~ sum(. * serieswgt), 
      +                   .names = "{.col}"),
      +            .groups = "drop")

Now, we merge the victimization summary files into the appropriate files. For any record on the household or person file that is not on the victimization file, the victimization counts are set to 0 after merging. In this step, we also create the victimization adjustment factor. See Section 2.2.4 of the User’s Guide for details on why this adjustment is created (Shook-Sa, Couzens, and Berzofsky 2015). It is calculated as follows:

      \[ A_{ijk}=\frac{v_{ijk}}{w_{ijk}}\]

      where \(w_{ijk}\) is the person weight (WGTPERCY) for personal crimes or the household weight (WGTHHCY) for household crimes, and \(v_{ijk}\) is the victimization weight (WGTVICCY) for household \(i\), respondent \(j\), in reporting period \(k\). The adjustment factor is set to 0 if no incidents are reported.

      +
      # Set up a list of 0s for each crime type/characteristic to replace NA's
      +hh_z_list <- rep(0, ncol(inc_hh_sums) - 3) %>% as.list() %>%
      +  setNames(names(inc_hh_sums)[-(1:3)])
      +pers_z_list <- rep(0, ncol(inc_pers_sums) - 4) %>% as.list() %>%
      +  setNames(names(inc_pers_sums)[-(1:4)])
      +
      +hh_vsum <- ncvs_2021_household %>%
      +  full_join(inc_hh_sums, by = c("YEARQ", "IDHH")) %>%
      +  replace_na(hh_z_list) %>%
      +  mutate(ADJINC_WT = if_else(is.na(WGTVICCY), 0, WGTVICCY / WGTHHCY))
      +
      +pers_vsum <- ncvs_2021_person %>%
      +  full_join(inc_pers_sums, by = c("YEARQ", "IDHH", "IDPER")) %>%
      +  replace_na(pers_z_list) %>%
      +  mutate(ADJINC_WT = if_else(is.na(WGTVICCY), 0, WGTVICCY / WGTPERCY))

      13.4.2 Derived Demographic Variables

      @@ -1476,41 +1475,41 @@

      13.4.2.1 Household Variables

      TABLE 13.2: Codebook for incident variables - crime type indicators and characteristics
      +
      hh_vsum_der <- hh_vsum %>%
      +  mutate(
      +    Tenure = factor(case_when(V2015 == 1 ~ "Owned", 
      +                              !is.na(V2015) ~ "Rented"),
      +                    levels = c("Owned", "Rented")),
      +    Urbanicity = factor(case_when(V2143 == 1 ~ "Urban",
      +                                  V2143 == 2 ~ "Suburban",
      +                                  V2143 == 3 ~ "Rural"),
      +                        levels = c("Urban", "Suburban", "Rural")),
      +    SC214A_num = as.numeric(as.character(SC214A)),
      +    Income = case_when(SC214A_num <= 8 ~ "Less than $25,000",
      +                       SC214A_num <= 12 ~ "$25,000-49,999",
      +                       SC214A_num <= 15 ~ "$50,000-99,999",
      +                       SC214A_num <= 17 ~ "$100,000-199,999",
      +                       SC214A_num <= 18 ~ "$200,000 or more"),
      +    Income = fct_reorder(Income, SC214A_num, .na_rm = FALSE),
      +    PlaceSize = case_match(as.numeric(as.character(V2126B)),
      +                           0 ~ "Not in a place",
      +                           13 ~ "Under 10,000",
      +                           16 ~ "10,000-49,999",
      +                           17 ~ "50,000-99,999",
      +                           18 ~ "100,000-249,999",
      +                           19 ~ "250,000-499,999",
      +                           20 ~ "500,000-999,999",
      +                           c(21, 22, 23) ~ "1,000,000 or more"),
      +    PlaceSize = fct_reorder(PlaceSize, as.numeric(V2126B)),
      +    Region = case_match(as.numeric(V2127B),
      +                        1 ~ "Northeast",
      +                        2 ~ "Midwest",
      +                        3 ~ "South",
      +                        4 ~ "West"),
      +    Region = fct_reorder(Region, as.numeric(V2127B))
      +  )

      As before, we want to check to make sure the recoded variables we create match the existing data as expected.

      +
      hh_vsum_der %>% count(Tenure, V2015)
      ## # A tibble: 4 × 3
       ##   Tenure V2015      n
       ##   <fct>  <fct>  <int>
      @@ -1518,14 +1517,14 @@ 

      13.4.2.1 Household Variables

      +
      hh_vsum_der %>% count(Urbanicity, V2143)
      ## # A tibble: 3 × 3
       ##   Urbanicity V2143      n
       ##   <fct>      <fct>  <int>
       ## 1 Urban      1      26878
       ## 2 Suburban   2     173491
       ## 3 Rural      3      56091
      +
      hh_vsum_der %>% count(Income, SC214A)
      ## # A tibble: 18 × 3
       ##    Income            SC214A     n
       ##    <fct>             <fct>  <int>
      @@ -1547,7 +1546,7 @@ 

      13.4.2.1 Household Variables

      +
      hh_vsum_der %>% count(PlaceSize, V2126B)
      ## # A tibble: 10 × 3
       ##    PlaceSize         V2126B     n
       ##    <fct>             <fct>  <int>
      @@ -1561,7 +1560,7 @@ 

      13.4.2.1 Household Variables

      +
      hh_vsum_der %>% count(Region, V2127B)
      ## # A tibble: 4 × 3
       ##   Region    V2127B     n
       ##   <fct>     <fct>  <int>
      @@ -1766,49 +1765,49 @@ 

13.4.2.2 Person Variables
      +
      # Set label for usage later
      +NHOPI <- "Native Hawaiian or Other Pacific Islander"
      +
      +pers_vsum_der <- pers_vsum %>%
      +  mutate(
      +    Sex = factor(case_when(V3018 == 1 ~ "Male",
      +                           V3018 == 2 ~ "Female")),
      +    RaceHispOrigin = factor(case_when(V3024 == 1 ~ "Hispanic",
      +                                      V3023A == 1 ~ "White",
      +                                      V3023A == 2 ~ "Black",
      +                                      V3023A == 4 ~ "Asian",
      +                                      V3023A == 5 ~ NHOPI,
      +                                      TRUE ~ "Other"),
      +                            levels = c("White", "Black", "Hispanic", 
      +                                       "Asian", NHOPI, "Other")),
      +    V3014_num = as.numeric(as.character(V3014)),
      +    AgeGroup = case_when(V3014_num <= 17 ~ "12-17",
      +                         V3014_num <= 24 ~ "18-24",
      +                         V3014_num <= 34 ~ "25-34",
      +                         V3014_num <= 49 ~ "35-49",
      +                         V3014_num <= 64 ~ "50-64",
      +                         V3014_num <= 90 ~ "65 or older"),
      +    AgeGroup = fct_reorder(AgeGroup, V3014_num),
      +    MaritalStatus = factor(case_when(V3015 == 1 ~ "Married",
      +                                     V3015 == 2 ~ "Widowed",
      +                                     V3015 == 3 ~ "Divorced",
      +                                     V3015 == 4 ~ "Separated",
      +                                     V3015 == 5 ~ "Never married"),
      +                           levels = c("Never married", "Married", 
      +                                      "Widowed","Divorced", 
      +                                      "Separated"))
      +  ) %>% 
      +  left_join(hh_vsum_der %>% select(YEARQ, IDHH, 
      +                                   V2117, V2118, Tenure:Region),
      +            by = c("YEARQ", "IDHH"))

      As before, we want to check to make sure the recoded variables we create match the existing data as expected.

      +
      pers_vsum_der %>% count(Sex, V3018)
      ## # A tibble: 2 × 3
       ##   Sex    V3018      n
       ##   <fct>  <fct>  <int>
       ## 1 Female 2     150956
       ## 2 Male   1     140922
      +
      pers_vsum_der %>% count(RaceHispOrigin, V3024)
      ## # A tibble: 11 × 3
       ##    RaceHispOrigin                            V3024      n
       ##    <fct>                                     <fct>  <int>
      @@ -1823,10 +1822,10 @@ 

      13.4.2.2 Person Variables

      +
      pers_vsum_der %>%
      +  filter(RaceHispOrigin != "Hispanic" | 
      +           is.na(RaceHispOrigin)) %>%
      +  count(RaceHispOrigin, V3023A)
      ## # A tibble: 20 × 3
       ##    RaceHispOrigin                            V3023A      n
       ##    <fct>                                     <fct>   <int>
      @@ -1850,10 +1849,10 @@ 

      13.4.2.2 Person Variables

      +
      pers_vsum_der %>% group_by(AgeGroup) %>%
      +  summarize(minAge = min(V3014),
      +            maxAge = max(V3014),
      +            .groups = "drop")
      ## # A tibble: 6 × 3
       ##   AgeGroup    minAge maxAge
       ##   <fct>        <dbl>  <dbl>
      @@ -1863,7 +1862,7 @@ 

      13.4.2.2 Person Variables

      +
      pers_vsum_der %>% count(MaritalStatus, V3015)
      ## # A tibble: 6 × 3
       ##   MaritalStatus V3015      n
       ##   <fct>         <fct>  <int>
      @@ -1874,36 +1873,36 @@ 

      13.4.2.2 Person Variables

We then create tibbles that contain only the variables we need, which makes the analyses easier.

      +
      hh_vsum_slim <- hh_vsum_der %>%
      +  select(YEARQ:V2118,
      +         WGTVICCY:ADJINC_WT,
      +         Tenure,
      +         Urbanicity,
      +         Income,
      +         PlaceSize,
      +         Region)
      +
      +pers_vsum_slim <- pers_vsum_der %>%
      +  select(YEARQ:WGTPERCY, WGTVICCY:ADJINC_WT, Sex:Region)

To calculate estimates about types of crime, such as what percentage of violent crimes are reported to the police, we must use the incident file. The incident file is not guaranteed to have every pseudostratum and half-sample code, so dummy records are created and appended before estimation. Finally, we merge demographic variables onto the incident tibble.

      +
      dummy_records <- hh_vsum_slim %>%
      +  distinct(V2117, V2118) %>%
      +  mutate(Dummy = 1,
      +         WGTVICCY = 1,
      +         NEWWGT = 1)
      +
      +inc_analysis <- inc_ind %>%
      +  mutate(Dummy = 0) %>%
      +  left_join(select(pers_vsum_slim, YEARQ, IDHH, IDPER, Sex:Region),
      +            by = c("YEARQ", "IDHH", "IDPER")) %>%
      +  bind_rows(dummy_records) %>%
      +  select(YEARQ:IDPER,
      +         WGTVICCY,
      +         NEWWGT,
      +         V4529,
      +         WeapCat,
      +         ReportPolice,
      +         Property:Region)

      The tibbles hh_vsum_slim, pers_vsum_slim, and inc_analysis can now be used to create design objects and calculate crime rate estimates.

      @@ -1911,29 +1910,29 @@

      13.4.2.2 Person Variables

      13.5 Survey Design Objects

All the data prep above is necessary to prepare the data for survey analysis. At this point, we can create the design objects and finally begin analysis. We create three design objects for different types of analysis, depending on which type of estimate we are creating. For the incident data, the analysis weight is NEWWGT, which we constructed previously. The household- and person-level data use WGTHHCY and WGTPERCY, respectively. For all analyses, V2117 is the strata variable and V2118 is the cluster/PSU variable.

      +
      inc_des <- inc_analysis %>%
      +  as_survey(
      +    weight = NEWWGT,
      +    strata = V2117,
      +    ids = V2118,
      +    nest = TRUE
      +  )
      +
      +hh_des <- hh_vsum_slim %>%
      +  as_survey(
      +    weight = WGTHHCY,
      +    strata = V2117,
      +    ids = V2118,
      +    nest = TRUE
      +  )
      +
      +pers_des <- pers_vsum_slim %>%
      +  as_survey(
      +    weight = WGTPERCY,
      +    strata = V2117,
      +    ids = V2118,
      +    nest = TRUE
      +  )

      13.6 Calculating Estimates


      13.6.1 Estimation 1: Victimization Totals

There are two ways to calculate victimization totals. Using the incident design object (inc_des) is the most straightforward method, but the person (pers_des) and household (hh_des) design objects can be used as well if the adjustment factor (ADJINC_WT) is incorporated. In the example below, the total number of property and violent victimizations is calculated first using the incident file and then using the household and person design objects. The incident file is smaller, so estimation is faster with it, but the estimates are the same, as illustrated below:

# Method 1: incident file
vt1 <- inc_des %>%
  summarize(Property_Vzn = survey_total(Property, na.rm = TRUE),
            Violent_Vzn = survey_total(Violent, na.rm = TRUE))

# Method 2: household and person files, incorporating ADJINC_WT
vt2a <- hh_des %>%
  summarize(Property_Vzn = survey_total(Property * ADJINC_WT,
                                        na.rm = TRUE))

vt2b <- pers_des %>%
  summarize(Violent_Vzn = survey_total(Violent * ADJINC_WT,
                                       na.rm = TRUE))

vt1
      ## # A tibble: 1 × 4
       ##   Property_Vzn Property_Vzn_se Violent_Vzn Violent_Vzn_se
       ##          <dbl>           <dbl>       <dbl>          <dbl>
       ## 1    11682056.         263844.    4598306.        198115.
vt2a
      ## # A tibble: 1 × 2
       ##   Property_Vzn Property_Vzn_se
       ##          <dbl>           <dbl>
       ## 1    11682056.         263844.
vt2b
## # A tibble: 1 × 2
##   Violent_Vzn Violent_Vzn_se
##         <dbl>          <dbl>
## 1    4598306.        198115.
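
Since the two approaches should agree, a quick programmatic check (a sketch, assuming the vt1, vt2a, and vt2b objects above) is:

# Sketch: incident-file and household/person-file totals should match
all.equal(vt1$Property_Vzn, vt2a$Property_Vzn)
all.equal(vt1$Violent_Vzn, vt2b$Violent_Vzn)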

      13.6.2 Estimation 2: Victimization Proportions

Victimization proportions are proportions describing features of a victimization. The key here is that these are estimates among victimizations, not among the population. These types of estimates can only be calculated using the incident design object (inc_des).

For example, we could be interested in the percentage of property victimizations reported to the police, as shown in the following code, which produces the estimate, its standard error, and a 95% confidence interval:

prop1 <- inc_des %>%
  filter(Property) %>%
  summarize(Pct = survey_mean(ReportPolice,
                              na.rm = TRUE,
                              proportion = TRUE,
                              vartype = c("se", "ci")) * 100)

prop1
      ## # A tibble: 1 × 4
       ##     Pct Pct_se Pct_low Pct_upp
       ##   <dbl>  <dbl>   <dbl>   <dbl>
       ## 1  30.8  0.798    29.2    32.4

Or we can estimate the percentage of violent victimizations that occur in urban areas:

prop2 <- inc_des %>%
  filter(Violent) %>%
  summarize(Pct = survey_mean(Urbanicity == "Urban", na.rm = TRUE) * 100)

prop2
      ## # A tibble: 1 × 2
       ##     Pct Pct_se
       ##   <dbl>  <dbl>
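
The same pattern extends to other characteristics of violent victimizations; for example, here is a sketch of the percentage reported to the police (using only variables defined above):

# Sketch: reporting rate among violent victimizations
inc_des %>%
  filter(Violent) %>%
  summarize(Pct = survey_mean(ReportPolice, na.rm = TRUE) * 100)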

      13.6.3 Estimation 3: Victimization Rates

Victimization rates measure the number of victimizations per population. They are not an estimate of the proportion of households or persons who are victimized, which is a prevalence rate described in section 13.6.4. Victimization rates are estimated using the household (hh_des) or person (pers_des) design objects, depending on the type of crime, and the adjustment factor (ADJINC_WT) must be incorporated. We return to the example of property and violent victimizations used for victimization totals (section 13.6.1). In the following example, the property victimization total is calculated as above, along with the property victimization rate (using survey_mean()) and the population size (using survey_total()).

As mentioned in the introduction, victimization rates use the incident weight in the numerator and the person or household weight in the denominator. This is accomplished by multiplying the variable of interest by the weight adjustment (ADJINC_WT) inside the estimation call. Let's look at an example of property victimization.

vr_prop <- hh_des %>%
  summarize(
    Property_Vzn = survey_total(Property * ADJINC_WT,
                                na.rm = TRUE),
    Property_Rate = survey_mean(Property * ADJINC_WT * 1000,
                                na.rm = TRUE),
    PopSize = survey_total(1, vartype = NULL)  # estimated number of households
  )

vr_prop
      ## # A tibble: 1 × 5
       ##   Property_Vzn Property_Vzn_se Property_Rate Property_Rate_se    PopSize
       ##          <dbl>           <dbl>         <dbl>            <dbl>      <dbl>
       ## 1    11682056.         263844.          90.3             1.95 129319232.

In the output above, we see that the estimated property victimization rate in 2021 was 90.3 per 1,000 households. This is consistent with computing the rate directly as the number of victimizations per 1,000 households, as demonstrated in the next chunk:

vr_prop %>%
  select(-ends_with("se")) %>%
  mutate(Property_Rate_manual = Property_Vzn / PopSize * 1000)
      ## # A tibble: 1 × 4
       ##   Property_Vzn Property_Rate    PopSize Property_Rate_manual
       ##          <dbl>         <dbl>      <dbl>                <dbl>
       ## 1    11682056.          90.3 129319232.                 90.3
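
The analogous rate for violent crime uses the person-level design object. Here is a sketch mirroring the property example (vr_viol is an illustrative name):

# Sketch: violent victimization rate per 1,000 persons age 12 or older
vr_viol <- pers_des %>%
  summarize(
    Violent_Vzn = survey_total(Violent * ADJINC_WT, na.rm = TRUE),
    Violent_Rate = survey_mean(Violent * ADJINC_WT * 1000,
                               na.rm = TRUE),
    PopSize = survey_total(1, vartype = NULL)
  )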

Victimization rates can also be calculated for particular characteristics of a victimization. In the following example, we calculate the rate of aggravated assault with no weapon, with a firearm, with a knife, and with another weapon.

pers_des %>%
  summarize(across(
    starts_with("AAST_"),
    ~ survey_mean(. * ADJINC_WT * 1000, na.rm = TRUE)
  ))
      ## # A tibble: 1 × 8
       ##   AAST_NoWeap AAST_NoWeap_se AAST_Firearm AAST_Firearm_se AAST_Knife
       ##         <dbl>          <dbl>        <dbl>           <dbl>      <dbl>
      @@ -2040,85 +2039,85 @@ 

      13.6.3 Estimation 3: Victimizatio ## # ℹ 3 more variables: AAST_Knife_se <dbl>, AAST_Other <dbl>, ## # AAST_Other_se <dbl>
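
The across() call expands to one survey_mean() per matching column. As a sketch, the no-weapon rate alone could be computed as:

# Sketch: single-column equivalent of the across() call above
pers_des %>%
  summarize(AAST_NoWeap = survey_mean(AAST_NoWeap * ADJINC_WT * 1000,
                                      na.rm = TRUE))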

A common task is to calculate victimization rates by several characteristics at once. For example, we may want the violent victimization rate and the aggravated assault rate by sex, race/Hispanic origin, age group, marital status, and household income. This requires a separate group_by() statement for each categorization, so we write a function to do the work and then use map() from the {purrr} package (part of the tidyverse) to loop through the variables. The function takes a demographic variable as its input (byvar) and calculates the violent and aggravated assault victimization rate for each of its levels. It also creates columns with the variable name, the level, and a numeric version of the level (LevelNum) for sorting later. The function is run across the variables using map(), and bind_rows() stacks the results into a single output.

pers_est_by <- function(byvar) {
  pers_des %>%
    rename(Level := {{byvar}}) %>%
    filter(!is.na(Level)) %>%
    group_by(Level) %>%
    summarize(
      Violent = survey_mean(Violent * ADJINC_WT * 1000, na.rm = TRUE),
      AAST = survey_mean(AAST * ADJINC_WT * 1000, na.rm = TRUE)
    ) %>%
    mutate(
      Variable = byvar,
      LevelNum = as.numeric(Level),
      Level = as.character(Level)
    ) %>%
    select(Variable, Level, LevelNum, everything())
}

pers_est_df <-
  c("Sex", "RaceHispOrigin", "AgeGroup", "MaritalStatus", "Income") %>%
  map(pers_est_by) %>%
  bind_rows()
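
Because Level is converted to character, LevelNum is what preserves the original level ordering. A sketch of sorting with it:

# Sketch: order levels within each demographic variable
pers_est_df %>%
  arrange(Variable, LevelNum)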

The output from all the estimates is cleaned to create better labels, such as going from "RaceHispOrigin" to "Race/Hispanic origin". Finally, the {gt} package is used to make a publishable table (Table 13.5). Using functions from the {gt} package, column labels and footnotes are added, and estimates are presented to the first decimal place.

# The full NHOPI label is long, so we store it once for reuse below
# (assumed to match the level label in RaceHispOrigin, per the footnotes)
NHOPI <- "Native Hawaiian or Other Pacific Islander"

vr_gt <- pers_est_df %>%
  mutate(
    Variable = case_when(
      Variable == "RaceHispOrigin" ~ "Race/Hispanic origin",
      Variable == "MaritalStatus" ~ "Marital status",
      Variable == "AgeGroup" ~ "Age",
      TRUE ~ Variable
    )
  ) %>%
  select(-LevelNum) %>%
  group_by(Variable) %>%
  gt(rowname_col = "Level") %>%
  tab_spanner(
    label = "Violent crime",
    id = "viol_span",
    columns = c("Violent", "Violent_se")
  ) %>%
  tab_spanner(label = "Aggravated assault",
              columns = c("AAST", "AAST_se")) %>%
  cols_label(
    Violent = "Rate",
    Violent_se = "SE",
    AAST = "Rate",
    AAST_se = "SE"
  ) %>%
  fmt_number(
    columns = c("Violent", "Violent_se", "AAST", "AAST_se"),
    decimals = 1
  ) %>%
  tab_footnote(
    footnote = "Includes rape or sexual assault, robbery,
    aggravated assault, and simple assault.",
    locations = cells_column_spanners(spanners = "viol_span")
  ) %>%
  tab_footnote(
    footnote = "Excludes persons of Hispanic origin.",
    locations =
      cells_stub(rows = Level %in%
                   c("White", "Black", "Asian", NHOPI, "Other"))
  ) %>%
  tab_footnote(
    footnote = "Includes persons who identified as
    Native Hawaiian or Other Pacific Islander only.",
    locations = cells_stub(rows = Level == NHOPI)
  ) %>%
  tab_footnote(
    footnote = "Includes persons who identified as American Indian or
    Alaska Native only or as two or more races.",
    locations = cells_stub(rows = Level == "Other")
  ) %>%
  tab_source_note(
    source_note = "Note: Rates per 1,000 persons age 12 or older.") %>%
  tab_source_note(source_note = "Source: Bureau of Justice Statistics,
                  National Crime Victimization Survey, 2021.") %>%
  tab_stubhead(label = "Victim demographic") %>%
  tab_caption("Rate and standard error of violent victimization,
             by type of crime and demographic characteristics, 2021")

vr_gt
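
To export the finished table, gt::gtsave() can write it to HTML or other formats. A sketch (the file name is illustrative):

# Sketch: save the table to a file in the working directory
gtsave(vr_gt, "victimization-rates.html")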