diff --git a/404.html b/404.html index fd614ffb..605d5842 100644 --- a/404.html +++ b/404.html @@ -23,7 +23,7 @@ - + @@ -57,7 +57,7 @@ @@ -1418,39 +1431,39 @@

Example 2: Test of Independence
chi_ex2_obs <- anes_des %>%
-  drop_na(TrustPeople, TrustGovernment) %>%
-  group_by(TrustPeople, TrustGovernment) %>%
-  summarize(Observed = round(survey_mean(vartype = "ci"), 3), .groups = "drop")
-
-chi_ex2_obs_table <- chi_ex2_obs %>%
-  mutate(prop = paste0(Observed, " (", Observed_low, ", ", Observed_upp,
-    ")")) %>%
-  select(TrustGovernment, TrustPeople, prop) %>%
-  pivot_wider(names_from = TrustPeople, values_from = prop) %>%
-  gt(rowname_col = "TrustGovernment") %>%
-  tab_stubhead(label = "Trust in Government") %>%
-  tab_spanner(label = "Trust in People", columns = everything()) %>%
-  tab_options(page.orientation = "landscape")
-
chi_ex2_obs_table
- -
- @@ -1931,14 +1944,14 @@

Example 2: Test of Independence
chi_ex2_obs %>%
-  mutate(TrustPeople = str_c("Trust in People:\n", TrustPeople)) %>%
-  ggplot(aes(x = TrustGovernment, y = Observed, color = TrustGovernment)) +
-  facet_wrap(~TrustPeople, ncol = 5) + geom_point() + geom_errorbar(aes(ymin = Observed_low,
-  ymax = Observed_upp)) + ylab("Proportion") + xlab("") + theme_bw() +
-  scale_color_manual(name = "Trust in Government", values = book_colors) +
-  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(),
-    legend.position = "bottom") + guides(col = guide_legend(nrow = 2))

+
chi_ex2_obs %>%
+  mutate(TrustPeople = str_c("Trust in People:\n", TrustPeople)) %>%
+  ggplot(aes(x = TrustGovernment, y = Observed, color = TrustGovernment)) +
+  facet_wrap(~TrustPeople, ncol = 5) + geom_point() + geom_errorbar(aes(ymin = Observed_low,
+  ymax = Observed_upp)) + ylab("Proportion") + xlab("") + theme_bw() +
+  scale_color_manual(name = "Trust in Government", values = book_colors) +
+  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(),
+    legend.position = "bottom") + guides(col = guide_legend(nrow = 2))
Estimates of proportion of people by levels of trust in people and government with confidence intervals, ANES 2020. This presents the same information as previous table in graphical form.

@@ -1963,50 +1976,50 @@

Example 3: Test of Homogeneity\(H_A:\) at least one category of \(p_{i_{Biden}}\) does not match \(p_{i_{Trump}}\) or \(p_{i_{Other}}\)

To conduct this in R, we use the svychisq() function to compare the two variables:

-
chi_ex3 <- anes_des %>%
-  drop_na(VotedPres2020_selection, AgeGroup) %>%
-  svychisq(formula = ~AgeGroup + VotedPres2020_selection, design = ., statistic = "Chisq",
-    na.rm = TRUE)
-
-chi_ex3
+
chi_ex3 <- anes_des %>%
+  drop_na(VotedPres2020_selection, AgeGroup) %>%
+  svychisq(formula = ~AgeGroup + VotedPres2020_selection, design = ., statistic = "Chisq",
+    na.rm = TRUE)
+
+chi_ex3
## 
 ##  Pearson's X^2: Rao & Scott adjustment
 ## 
 ## data:  NextMethod()
 ## X-squared = 171, df = 10, p-value <2e-16

The output from svychisq() indicates a difference in how each age group voted in the 2020 election. To get a better idea of the different distributions, let’s output proportions to see the relationship. As we learned in Example 2 above, we can use chi_ex3$observed, or if we want to get the variance information (which is crucial with survey data), we can use survey_mean(). Remember, when we have two variables in group_by(), we obtain the proportions within each level of the variable listed. In this case, we are looking at the distribution of AgeGroup for each level of VotedPres2020_selection.

-
chi_ex3_obs <- anes_des %>%
-  filter(VotedPres2020 == "Yes") %>%
-  drop_na(VotedPres2020_selection, AgeGroup) %>%
-  group_by(VotedPres2020_selection, AgeGroup) %>%
-  summarize(Observed = round(survey_mean(vartype = "ci"), 3))
-
-chi_ex3_obs_table <- chi_ex3_obs %>%
-  mutate(prop = paste0(Observed, " (", Observed_low, ", ", Observed_upp,
-    ")")) %>%
-  select(AgeGroup, VotedPres2020_selection, prop) %>%
-  pivot_wider(names_from = VotedPres2020_selection, values_from = prop) %>%
-  gt(rowname_col = "AgeGroup") %>%
-  tab_stubhead(label = "Age Group")
-
chi_ex3_obs_table
- -
- @@ -2481,10 +2494,10 @@

6.5 Exercises
  • Using the RECS data, do more than 50% of U.S. households use AC (ACUsed)?
  • -
    ttest_solution1 <- recs_des %>%
    -  svyttest(design = ., formula = ((ACUsed == TRUE) - 0.5) ~ 0, na.rm = TRUE)
    -
    -ttest_solution1
    +
    ttest_solution1 <- recs_des %>%
    +  svyttest(design = ., formula = ((ACUsed == TRUE) - 0.5) ~ 0, na.rm = TRUE)
    +
    +ttest_solution1
    ## 
     ##  Design-based one-sample t-test
     ## 
    @@ -2499,10 +2512,10 @@ 

    6.5 Exercises
  • Using the RECS data, does the average temperature that U.S. households set their thermostats to differ between the day and night in the winter (WinterTempDay and WinterTempNight)?
  • -
    ttest_solution2 <- recs_des %>%
    -  svyttest(design = ., formula = WinterTempDay - WinterTempNight ~ 0, na.rm = TRUE)
    -
    -ttest_solution2
    +
    ttest_solution2 <- recs_des %>%
    +  svyttest(design = ., formula = WinterTempDay - WinterTempNight ~ 0, na.rm = TRUE)
    +
    +ttest_solution2
    ## 
     ##  Design-based one-sample t-test
     ## 
    @@ -2517,11 +2530,11 @@ 

    6.5 Exercises
  • Using the ANES data, does the average age (Age) of those who voted for Biden in 2020 (VotedPres2020_selection) differ from those who voted for another candidate?
  • -
    ttest_solution3 <- anes_des %>%
    -  svyttest(design = ., formula = Age ~ VotedPres2020_selection == "Biden",
    -    na.rm = TRUE)
    -
    -ttest_solution3
    +
    ttest_solution3 <- anes_des %>%
    +  svyttest(design = ., formula = Age ~ VotedPres2020_selection == "Biden",
    +    na.rm = TRUE)
    +
    +ttest_solution3
    ## 
     ##  Design-based t-test
     ## 
    @@ -2541,17 +2554,17 @@ 

    6.5 Exercises
    chisq_solution1 <- "c. Test of homogeneity (`svychisq()`)"
    -chisq_solution1

    +
    chisq_solution1 <- "c. Test of homogeneity (`svychisq()`)"
    +chisq_solution1
    ## [1] "c. Test of homogeneity (`svychisq()`)"
    1. In the RECS data, is there a relationship between the type of housing unit (HousingUnitType) and the year the house was built (YearMade)?
    -
    chisq_solution2 <- recs_des %>%
    -  svychisq(formula = ~HousingUnitType + YearMade, design = ., statistic = "Wald",
    -    na.rm = TRUE)
    -
    -chisq_solution2
    +
    chisq_solution2 <- recs_des %>%
    +  svychisq(formula = ~HousingUnitType + YearMade, design = ., statistic = "Wald",
    +    na.rm = TRUE)
    +
    +chisq_solution2
    ## 
     ##  Design-based Wald test of association
     ## 
    @@ -2560,11 +2573,11 @@ 

    6.5 Exercises
  • In the ANES data, is there a difference in the distribution of gender (Gender) across early voting status in 2020 (EarlyVote2020)?
  • -
    chisq_solution3 <- anes_des %>%
    -  svychisq(formula = ~Gender + EarlyVote2020, design = ., statistic = "F",
    -    na.rm = TRUE)
    -
    -chisq_solution3
    +
    chisq_solution3 <- anes_des %>%
    +  svychisq(formula = ~Gender + EarlyVote2020, design = ., statistic = "F",
    +    na.rm = TRUE)
    +
    +chisq_solution3
    ## 
     ##  Pearson's X^2: Rao & Scott adjustment
     ## 
    @@ -2574,7 +2587,7 @@ 

    6.5 Exercises

    -
    +
    Lumley, Thomas. 2010. Complex Surveys: A Guide to Analysis Using r: A Guide to Analysis Using r. John Wiley; Sons.
    @@ -2619,12 +2632,12 @@

    ReferencesReferences - + @@ -57,7 +57,7 @@ @@ -1047,35 +1060,35 @@

    8.3.1.1 Transitioning {srvyr} out

    We can add a few more enhancements, such as a title, a data source note, and a footnote with the question information, using the functions tab_header(), tab_source_note(), and tab_footnote(). If having the percentage sign in both the header and the cells seems redundant, we can opt for fmt_number() instead of fmt_percent() and scale the number by 100 with scale_by = 100.

    -
    trust_gov_gt2 <- trust_gov_gt %>%
    -  tab_header("American voter's trust
    -             in the federal government, 2020") %>%
    -  tab_source_note("American National Election Studies, 2020") %>%
    -  tab_footnote(
    -    "Question text: How often can you trust the federal government
    -    in Washington to do what is right?"
    -  ) %>%
    -  fmt_number(scale_by = 100,
    -             decimals = 1)
    -
    trust_gov_gt2
    - -
    - @@ -1576,28 +1589,28 @@

    Expanding tables using {gtsummary}
    anes_des_gtsum <- anes_des %>%
    -  tbl_svysummary(include = TrustGovernment,
    -                 statistic = list(all_categorical() ~ "{p} ({p.std.error})")) 

    -
    anes_des_gtsum
    +
    anes_des_gtsum <- anes_des %>%
    +  tbl_svysummary(include = TrustGovernment,
    +                 statistic = list(all_categorical() ~ "{p} ({p.std.error})")) 
    +
    anes_des_gtsum
    -
    - @@ -2055,33 +2068,33 @@

    Expanding tables using {gtsummary}
    anes_des_gtsum2 <- anes_des %>%
    -  tbl_svysummary(
    -    include = TrustGovernment,
    -    statistic = list(all_categorical() ~ "{p} ({p.std.error})"),
    -    missing = "no",
    -    digits = list(TrustGovernment ~ style_percent),
    -    label = list(TrustGovernment ~ "Trust in Government, 2020")
    -  )

    -
    anes_des_gtsum2
    - -
    - @@ -2537,45 +2550,45 @@

    Expanding tables using {gtsummary}8.3.1.1.

    -
    anes_des_gtsum3 <- anes_des %>%
    -  tbl_svysummary(
    -    include = TrustGovernment,
    -    statistic = list(all_categorical() ~ "{p} ({p.std.error})"),
    -    missing = "no",
    -    digits = list(TrustGovernment ~ style_percent),
    -    label = list(TrustGovernment ~ "Trust in Government, 2020")
    -  ) %>%
    -  modify_footnote(update = everything() ~ NA) %>%
    -  modify_header(label = " ",
    -                stat_0 = "% (s.e.)") %>%
    -  as_gt() %>%
    -  tab_header("American voter's trust
    -             in the federal government, 2020") %>%
    -  tab_source_note("American National Election Studies, 2020") %>%
    -  tab_footnote(
    -    "Question text: How often can you trust
    -    the federal government in Washington
    -    to do what is right?"
    -  )
    -
    anes_des_gtsum3
    - -
    - @@ -3041,49 +3054,49 @@

    Expanding tables using {gtsummary}
    anes_des_gtsum4 <- anes_des %>%
    -  tbl_svysummary(
    -    include = c(TrustGovernment, Age),
    -    statistic = list(
    -      all_categorical() ~ "{p} ({p.std.error})",
    -      all_continuous() ~ "{mean} ({mean.std.error})"
    -    ),
    -    missing = "no",
    -    digits = list(TrustGovernment ~ style_percent,
    -                  Age ~ c(1, 2)),
    -    label = list(TrustGovernment ~ "Trust in Government, 2020")
    -  ) %>%
    -  modify_footnote(update = everything() ~ NA) %>%
    -  modify_header(label = " ",
    -                stat_0 = "% (s.e.)") %>%
    -  as_gt() %>%
    -  tab_header("American voter's trust in the federal government, 2020") %>%
    -  tab_source_note("American National Election Studies, 2020") %>%
    -  tab_footnote(
    -    "Question text: How often can you trust the federal government
    -    in Washington to do what is right?"
    -  ) %>%
    -  tab_caption("Example of gtsummary table with trust in government
    -              estimates and average age")

    -
    anes_des_gtsum4
    - -
    - @@ -3549,50 +3562,50 @@

    Expanding tables using {gtsummary}
    anes_des_gtsum5 <- anes_des %>%
    -  drop_na(VotedPres2020) %>%
    -  tbl_svysummary(
    -    include = TrustGovernment,
    -    statistic = list(all_categorical() ~ "{p} ({p.std.error})"),
    -    missing = "no",
    -    digits = list(TrustGovernment ~ style_percent),
    -    label = list(TrustGovernment ~ "Trust in Government, 2020"),
    -    by = VotedPres2020
    -  ) %>%
    -  modify_footnote(update = everything() ~ NA) %>%
    -  modify_header(label = " ",
    -                stat_1 = "Voted",
    -                stat_2 = "Didn't vote") %>%
    -  as_gt() %>%
    -  tab_header(
    -    "American voter's trust
    -             in the federal government by whether they voted
    -             in the 2020 presidential election"
    -  ) %>%
    -  tab_source_note("American National Election Studies, 2020") %>%
    -  tab_footnote(
    -    "Question text: How often can you trust the federal government
    -               in Washington to do what is right?"
    -  )

    -
    anes_des_gtsum5
    - -
    - @@ -4071,24 +4084,24 @@

    8.3.2 Charts and plots
    anes_des_der <- anes_des %>%
    -  mutate(TrustGovernmentUsually = case_when(
    -    is.na(TrustGovernment) ~ NA,
    -    TRUE ~ TrustGovernment %in% c("Always", "Most of the time")
    -  )) %>%
    -  drop_na(VotedPres2020_selection) %>%
    -  group_by(VotedPres2020_selection) %>%
    -  summarize(
    -    pct_trust = survey_mean(
    -      TrustGovernmentUsually,
    -      na.rm = TRUE,
    -      proportion = TRUE,
    -      vartype = "ci"
    -    ),
    -    .groups = "drop"
    -  )
    -
    -anes_des_der

    +
    anes_des_der <- anes_des %>%
    +  mutate(TrustGovernmentUsually = case_when(
    +    is.na(TrustGovernment) ~ NA,
    +    TRUE ~ TrustGovernment %in% c("Always", "Most of the time")
    +  )) %>%
    +  drop_na(VotedPres2020_selection) %>%
    +  group_by(VotedPres2020_selection) %>%
    +  summarize(
    +    pct_trust = survey_mean(
    +      TrustGovernmentUsually,
    +      na.rm = TRUE,
    +      proportion = TRUE,
    +      vartype = "ci"
    +    ),
    +    .groups = "drop"
    +  )
    +
    +anes_des_der
    ## # A tibble: 3 × 4
     ##   VotedPres2020_selection pct_trust pct_trust_low pct_trust_upp
     ##   <fct>                       <dbl>         <dbl>         <dbl>
    @@ -4096,12 +4109,12 @@ 

    8.3.2 Charts and plots8.1.

    -
    p <- anes_des_der %>%
    -  ggplot(aes(x = VotedPres2020_selection,
    -             y = pct_trust)) +
    -  geom_bar(stat = "identity")
    -
    -p
    +
    p <- anes_des_der %>%
    +  ggplot(aes(x = VotedPres2020_selection,
    +             y = pct_trust)) +
    +  geom_bar(stat = "identity")
    +
    +p
    Bar chart with x-axis of 'VotedPres2020_selection' with labels Biden, Trump and Other. It has y-axis 'pct_trust' with labels 0.00, 0.05, 0.10 and 0.15. The chart is a bar chart with 3 vertical bars. Bar 1 (Biden) has a height of 0.12. Bar 2 (Trump) has a height of 0.17. Bar 3 (Other) has a height of 0.06.

    @@ -4109,13 +4122,13 @@

    8.3.2 Charts and plots
    pcolor <- anes_des_der %>%
    -  ggplot(aes(x = VotedPres2020_selection,
    -             y = pct_trust,
    -             fill = VotedPres2020_selection)) +
    -  geom_bar(stat = "identity")
    -
    -pcolor

    +
    pcolor <- anes_des_der %>%
    +  ggplot(aes(x = VotedPres2020_selection,
    +             y = pct_trust,
    +             fill = VotedPres2020_selection)) +
    +  geom_bar(stat = "identity")
    +
    +pcolor
    Bar chart with x-axis of 'VotedPres2020_selection' with labels Biden, Trump and Other. It has y-axis 'pct_trust' with labels 0.00, 0.05, 0.10 and 0.15. The chart is a bar chart with 3 vertical bars. Bar 1 (Biden) has a height of 0.12 and a color of strong reddish orange. Bar 2 (Trump) has a height of 0.17 and a color of vivid yellowish green. Bar 3 (Other) has a height of 0.06 and color of brilliant blue.

    @@ -4123,16 +4136,16 @@

    8.3.2 Charts and plots
    pcol_error <- anes_des_der %>%
    -  ggplot(aes(x = VotedPres2020_selection,
    -             y = pct_trust,
    -             fill = VotedPres2020_selection)) +
    -  geom_bar(stat = "identity") +
    -  geom_errorbar(aes(ymin = pct_trust_low,
    -                    ymax = pct_trust_upp),
    -                width = .2)
    -
    -pcol_error

    +
    pcol_error <- anes_des_der %>%
    +  ggplot(aes(x = VotedPres2020_selection,
    +             y = pct_trust,
    +             fill = VotedPres2020_selection)) +
    +  geom_bar(stat = "identity") +
    +  geom_errorbar(aes(ymin = pct_trust_low,
    +                    ymax = pct_trust_upp),
    +                width = .2)
    +
    +pcol_error
    Bar chart with x-axis of 'VotedPres2020_selection' with labels Biden, Trump and Other. It has y-axis 'pct_trust' with labels 0.00, 0.05, 0.10 and 0.15. The chart is a bar chart with 3 vertical bars. Bar 1 (Biden) has a height of 0.12 and a color of strong reddish orange. Bar 2 (Trump) has a height of 0.17 and a color of vivid yellowish green. Bar 3 (Other) has a height of 0.06 and color of brilliant blue. Error bars are added with the Bar 1 (Biden) error ranging from 0.11 to 0.14, Bar 2 (Trump) error ranging from 0.16 to 0.19, and the Bar 3 (Other) error ranging from 0.02 to 0.14.

    @@ -4140,38 +4153,38 @@

    8.3.2 Charts and plots. We can specify specific colors for fill using scale_fill_manual(). Inside the function, we provide a vector of values corresponding to the colors in our plot. These values are hexadecimal (hex) color codes, denoted by a leading pound sign # followed by six letters or numbers. The hex code #0b3954 used below is a dark blue. There are many tools online that help pick hex codes, such as htmlcolorcodes.com/.

    -
    pfull <-
    -  anes_des_der %>%
    -  ggplot(aes(x = VotedPres2020_selection,
    -             y = pct_trust,
    -             fill = VotedPres2020_selection)) +
    -  geom_bar(stat = "identity") +
    -  geom_errorbar(aes(ymin = pct_trust_low,
    -                    ymax = pct_trust_upp),
    -                width = .2) +
    -  scale_fill_manual(values = c("#0b3954", "#bfd7ea", "#8d6b94")) +
    -  xlab("Election choice (2020)") +
    -  ylab("Usually trust the government") +
    -  scale_y_continuous(labels = scales::percent) +
    -  guides(fill = "none") +
    -  labs(title = "Percent of voters who usually trust the government
    -       by chosen 2020 presidential candidate",
    -       caption = "Source: American National Election Studies, 2020")
    -
    -pfull
    +
    pfull <-
    +  anes_des_der %>%
    +  ggplot(aes(x = VotedPres2020_selection,
    +             y = pct_trust,
    +             fill = VotedPres2020_selection)) +
    +  geom_bar(stat = "identity") +
    +  geom_errorbar(aes(ymin = pct_trust_low,
    +                    ymax = pct_trust_upp),
    +                width = .2) +
    +  scale_fill_manual(values = c("#0b3954", "#bfd7ea", "#8d6b94")) +
    +  xlab("Election choice (2020)") +
    +  ylab("Usually trust the government") +
    +  scale_y_continuous(labels = scales::percent) +
    +  guides(fill = "none") +
    +  labs(title = "Percent of voters who usually trust the government
    +       by chosen 2020 presidential candidate",
    +       caption = "Source: American National Election Studies, 2020")
    +
    +pfull
    Bar chart with x-axis of 'VotedPres2020_selection' with labels Biden, Trump and Other. It has y-axis 'pct_trust' with labels 0.00, 0.05, 0.10 and 0.15. The chart is a bar chart with 3 vertical bars. Bar 1 (Biden) has a height of 0.12 and a color of dark blue. Bar 2 (Trump) has a height of 0.17 and a color of very pale blue. Bar 3 (Other) has a height of 0.06 and color of moderate purple. Error bars are added with the Bar 1 (Biden) error ranging from 0.11 to 0.14, Bar 2 (Trump) error ranging from 0.16 to 0.19, and the Bar 3 (Other) error ranging from 0.02 to 0.14.

    FIGURE 8.4: Bar chart of trust in government by chosen 2020 presidential candidate with colors, labels, error bars, and title

    -

    What we’ve explored in this section are just the foundational aspects of {ggplot2}, and the capabilities of this package extend far beyond what we’ve covered. Advanced features such as annotation, faceting, and theming allow for more sophisticated and customized visualizations. The book Wickham (2023) is a comprehensive guide to learning more about this powerful tool.

    +

    What we’ve explored in this section are just the foundational aspects of {ggplot2}, and the capabilities of this package extend far beyond what we’ve covered. Advanced features such as annotation, faceting, and theming allow for more sophisticated and customized visualizations. The book Wickham (2023) is a comprehensive guide to learning more about this powerful tool.

    References

    -
    +
    ———. 2023. Ggplot2: Elegant Graphics for Data Analysis. 3rd Edition. https://ggplot2-book.org/; Springer.
    @@ -4200,12 +4213,12 @@

    ReferencesReferences - + @@ -57,7 +57,7 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    TABLE 11.1: Percentage of Votes by Candidate for Different Missing Data Inclusions
    Candidate + Including All Missing Data + + Removing Skip Patterns Only + + Removing All Missing Data +
    %s.e. (%)%s.e. (%)%s.e. (%)
    Did not Vote for President in 201632.4%0.9%NANANANA
    Hillary Clinton33.0%0.7%48.8%0.9%49.2%0.9%
    Donald Trump29.9%0.7%44.3%0.9%44.7%0.9%
    Other Candidate4.1%0.2%6.1%0.3%6.1%0.3%
    Missing0.6%0.1%0.9%0.2%NANA
    +

    +

    As Table 11.1 shows, the results can vary greatly depending on which type of missing data that are removed. If we remove only the skip patterns the margin between the Clinton and Trump is 4.5 percentage points, but if we include all data even including those that did not vote in 2016, the margin is 3.1 percentage points. How we handle the different types of missing values is important for interpretation of the data.

    + +
    +

    + +

    References

    +
    +
    +DeBell, Matthew. 2010. “How to Analyze ANES Survey Data.” ANES Technical Report Series nes012492. Palo Alto, CA: Stanford University; Ann Arbor, MI: the University of Michigan; https://electionstudies.org/wp-content/uploads/2018/05/HowToAnalyzeANESData.pdf. +
    +
    +Kim, Jae Kwang, and Jun Shao. 2021. Statistical Methods for Handling Incomplete Data. Chapman & Hall/CRC Press. +
    +
    +Mack, Christina, Zhaohui Su, and Daniel Westreich. 2018. “Types of Missing Data.” In Managing Missing Data in Patient Registries: Addendum to Registries for Evaluating Patient Outcomes: A User’s Guide, Third Edition [Internet]. Rockville (MD): Agency for Healthcare Research; Quality (US); https://www.ncbi.nlm.nih.gov/books/NBK493614/. +
    +
    +Schafer, Joseph L, and John W Graham. 2002. “Missing Data: Our View of the State of the Art.” Psychological Methods 7: 147–77. https://doi.org/10.1037//1082-989X.7.2.147. +
    +
    +Tierney, Nicholas, and Dianne Cook. 2023. “Expanding Tidy Data Principles to Facilitate Missing Data Exploration, Visualization and Assessment of Imputations.” Journal of Statistical Software 105 (7): 1–31. https://doi.org/10.18637/jss.v105.i07. +
    +
    +Valliant, Richard, and Jill A. Dever. 2018. Survey Weights: A Step-by-Step Guide to Calculation. Stata Press. +
    @@ -489,12 +1345,12 @@

    Chapter 11 Missing dataChapter 11 Missing data - + @@ -57,7 +57,7 @@ @@ -2719,24 +2732,24 @@

    13.6.3 Estimation 3: Victimizatio

    13.6.4 Estimation 4: Prevalence Rates

    Prevalence rates differ from victimization rates as the numerator is the number of people or households victimized rather than the number of victimizations. To calculate the prevalence rates, we must run another summary of the data by calculating an indicator for whether a person or household is a victim of a particular crime at any point in the year. Below is an example of calculating first the indicator and then the prevalence rate of violent crime and aggravated assault.

    -
    pers_prev_des <-
    -  pers_vsum_slim %>%
    -  mutate(Year = floor(YEARQ)) %>%
    -  mutate(Violent_Ind = sum(Violent) > 0,
    -         AAST_Ind = sum(AAST) > 0,
    -         .by = c("Year", "IDHH", "IDPER")) %>%
    -  as_survey(
    -    weight = WGTPERCY,
    -    strata = V2117,
    -    ids = V2118,
    -    nest = TRUE
    -  )
    -
    -pers_prev_ests <- pers_prev_des %>%
    -  summarize(Violent_Prev = survey_mean(Violent_Ind * 100),
    -            AAST_Prev = survey_mean(AAST_Ind * 100))
    -
    -pers_prev_ests
    +
    pers_prev_des <-
    +  pers_vsum_slim %>%
    +  mutate(Year = floor(YEARQ)) %>%
    +  mutate(Violent_Ind = sum(Violent) > 0,
    +         AAST_Ind = sum(AAST) > 0,
    +         .by = c("Year", "IDHH", "IDPER")) %>%
    +  as_survey(
    +    weight = WGTPERCY,
    +    strata = V2117,
    +    ids = V2118,
    +    nest = TRUE
    +  )
    +
    +pers_prev_ests <- pers_prev_des %>%
    +  summarize(Violent_Prev = survey_mean(Violent_Ind * 100),
    +            AAST_Prev = survey_mean(AAST_Ind * 100))
    +
    +pers_prev_ests
    ## # A tibble: 1 × 4
     ##   Violent_Prev Violent_Prev_se AAST_Prev AAST_Prev_se
     ##          <dbl>           <dbl>     <dbl>        <dbl>
    @@ -2749,10 +2762,10 @@ 

    13.7 Exercises
  • What proportion of completed motor vehicle thefts are not reported to the police? Hint: Use the codebook to look at the definition of Type of Crime (V4529).
  • -
    ans1 <- inc_des %>%
    -  filter(str_detect(V4529, "40|41")) %>%
    -  summarize(Pct = survey_mean(ReportPolice, na.rm = TRUE) * 100)
    -ans1
    +
    ans1 <- inc_des %>%
    +  filter(str_detect(V4529, "40|41")) %>%
    +  summarize(Pct = survey_mean(ReportPolice, na.rm = TRUE) * 100)
    +ans1
    ## # A tibble: 1 × 2
     ##     Pct Pct_se
     ##   <dbl>  <dbl>
    @@ -2760,9 +2773,9 @@ 

    13.7 Exercises
  • How many violent crimes occur in each region?
  • -
    inc_des %>%
    -  filter(Violent) %>%
    -  survey_count(Region)
    +
    inc_des %>%
    +  filter(Violent) %>%
    +  survey_count(Region)
    ## # A tibble: 4 × 3
     ##   Region           n    n_se
     ##   <fct>        <dbl>   <dbl>
    @@ -2773,10 +2786,10 @@ 

    13.7 Exercises
  • What is the property victimization rate among each income level?
  • -
    hh_des %>%
    -  group_by(Income) %>%
    -  summarize(Property_Rate = survey_mean(Property * ADJINC_WT * 1000, 
    -                                        na.rm = TRUE))
    +
    hh_des %>%
    +  group_by(Income) %>%
    +  summarize(Property_Rate = survey_mean(Property * ADJINC_WT * 1000, 
    +                                        na.rm = TRUE))
    ## # A tibble: 6 × 3
     ##   Income            Property_Rate Property_Rate_se
     ##   <fct>                     <dbl>            <dbl>
    @@ -2790,7 +2803,7 @@ 

    13.7 Exercises

    -
    +
    Bureau of Justice Statistics. 2017. “National Crime Victimization Survey, 2016: Technical Documentation.” https://bjs.ojp.gov/sites/g/files/xyckuh236/files/media/document/ncvstd16.pdf.
    @@ -2833,12 +2846,12 @@

    ReferencesReferences - + @@ -57,7 +57,7 @@ @@ -1156,11 +1169,11 @@

    14.5 Calculating estimates and ma

    Multiple-choice questions are interesting. If we want to look at how education was impacted only among those in school, we need to filter to the relevant responses, which is anyone that responded no to the first part. The variable Educ_NotInSchool in the dataset has values of 0 and 1. A value of 1 means that the respondent selected the first option in the question (none of the children are in school) and a value of 0 means that at least one of their children are in school. Using this variable, we can filter the data to only those with a value of 0.

    There are three additional variables that we can look at that correlate to the second option (Educ_NormalSchool), third option (Educ_VirtualSchool), and fourth option (Educ_Hybrid). An unweighted cross-tab for the responses is included below, and we can see there is a wide-range of impacts and that many combinations of effects on education are possible.

    -
    ambarom %>% filter(Educ_NotInSchool == 0) %>% 
    -  distinct(Educ_NormalSchool,
    -        Educ_VirtualSchool,
    -        Educ_Hybrid) %>% 
    -  print(n = 50)
    +
    ambarom %>% filter(Educ_NotInSchool == 0) %>% 
    +  distinct(Educ_NormalSchool,
    +        Educ_VirtualSchool,
    +        Educ_Hybrid) %>% 
    +  print(n = 50)
    ## # A tibble: 8 × 3
     ##   Educ_NormalSchool Educ_VirtualSchool Educ_Hybrid
     ##               <dbl>              <dbl>       <dbl>
    @@ -1178,58 +1191,58 @@ 

    14.5 Calculating estimates and ma
  • Indicator that the education medium was changed - either virtual or hybrid
  • In this next code chunk, we create these indicators, make national estimates, and display a summary table of the data.

    -
    ambarom_des_educ <- ambarom_des %>%
    -  filter(Educ_NotInSchool == 0) %>%
    -  mutate(Educ_OnlyNormal = (Educ_NormalSchool == 1 &
    -                              Educ_VirtualSchool == 0 & 
    -                              Educ_Hybrid == 0),
    -         Educ_MediumChange = (Educ_VirtualSchool == 1 | 
    -                                Educ_Hybrid == 1))
    -
    -covid_educ_ests <-
    -  ambarom_des_educ %>%
    -  group_by(Country) %>%
    -  summarize(
    -    p_onlynormal = survey_mean(Educ_OnlyNormal, na.rm = TRUE) * 100,
    -    p_mediumchange = survey_mean(Educ_MediumChange, na.rm = TRUE) * 100,
    -    p_noschool = survey_mean(Educ_NoSchool, na.rm = TRUE) * 100,
    -  ) 
    -
    -covid_educ_ests_gt<-covid_educ_ests %>%
    -  gt(rowname_col = "Country") %>%
    -  cols_label(p_onlynormal = "%",
    -             p_onlynormal_se = "SE",
    -             p_mediumchange = "%",
    -             p_mediumchange_se = "SE",
    -             p_noschool = "%",
    -             p_noschool_se = "SE") %>%
    -  tab_spanner(label = "Normal school only",
    -              columns = c("p_onlynormal", "p_onlynormal_se")) %>%
    -  tab_spanner(label = "Medium change",
    -              columns = c("p_mediumchange", "p_mediumchange_se")) %>%
    -  tab_spanner(label = "Cut ties with school",
    -              columns = c("p_noschool", "p_noschool_se")) %>%
    -  fmt_number(decimals = 1) %>%
    -  tab_source_note("AmericasBarometer Surveys, 2021")
    -
    covid_educ_ests_gt
    - -
    - @@ -1805,16 +1818,16 @@

    14.5 Calculating estimates and ma

    14.6 Mapping survey data

    While the table presents the data well, a map could also be used. To obtain maps of the countries, the package {rnaturalearth} is used, subsetting North and South America using the function ne_countries(). This returns an sf object with many columns but, most importantly soverignt (sovereignty), geounit (country or territory), and geometry (the shape). As an example of the difference between soverignty and country/territory, the United States, Puerto Rico, and the US Virgin Islands are all separate units with the same sovereignty. This map (without data) is plotted in Figure 14.1.

    -
    country_shape <-
    -  ne_countries(
    -    scale = "medium",
    -    returnclass = "sf",
    -    continent = c("North America", "South America")
    -  )
    -
    -country_shape %>%
    -  ggplot() + 
    -  geom_sf()
    +
    country_shape <-
    +  ne_countries(
    +    scale = "medium",
    +    returnclass = "sf",
    +    continent = c("North America", "South America")
    +  )
    +
    +country_shape %>%
    +  ggplot() + 
    +  geom_sf()
    Map of North and South America

    @@ -1822,25 +1835,25 @@

    14.6 Mapping survey data14.1 is very wide as the Aleutian islands in Alaska extend into the Eastern Hemisphere. We can crop the shape file to only the Western Hemisphere to remove some of the trailing islands of Alaska.

    -
    country_shape_crop <- country_shape %>%
    -  st_crop(c(xmin = -180,
    -            xmax = 0,
    -            ymin = -90,
    -            ymax = 90)) 
    +
    country_shape_crop <- country_shape %>%
    +  st_crop(c(xmin = -180,
    +            xmax = 0,
    +            ymin = -90,
    +            ymax = 90)) 

    Now that we have the shape files we need, our next step is to match our survey data to the map. Countries can be called by different names (e.g., “U.S”, “U.S.A”, “United States”). To make sure we can plot our survey data on the map, we will need to make sure the country in both datasets match. To do this, we can use the anti_join() function and check to see what countries are in the survey data but not in the map data. As shown below, the United States is referred to as “United States” in the survey data but “United States of America” in the map data. The code below shows countries in the survey but not the map data.

    -
    survey_country_list <- ambarom %>% distinct(Country)
    -survey_country_list %>% 
    -  anti_join(country_shape_crop, by = c("Country" = "geounit"))
    +
    survey_country_list <- ambarom %>% distinct(Country)
    +survey_country_list %>% 
    +  anti_join(country_shape_crop, by = c("Country" = "geounit"))
    ## # A tibble: 1 × 1
     ##   Country      
     ##   <fct>        
     ## 1 United States

    The code below shows countries in the map data but not hte survey data.

    -
    country_shape_crop %>% as_tibble() %>% 
    -  select(geounit, sovereignt) %>%
    -  anti_join(survey_country_list, by = c("geounit" = "Country")) %>%
    -  arrange(geounit) %>%
    -  print(n = 30)
    +
    country_shape_crop %>% as_tibble() %>% 
    +  select(geounit, sovereignt) %>%
    +  anti_join(survey_country_list, by = c("geounit" = "Country")) %>%
    +  arrange(geounit) %>%
    +  print(n = 30)
    ## # A tibble: 30 × 2
     ##    geounit                          sovereignt                      
     ##    <chr>                            <chr>                           
    @@ -1875,54 +1888,54 @@ 

    14.6 Mapping survey data
    country_shape_upd <- country_shape_crop %>%
    -  mutate(geounit = if_else(geounit == "United States of America", 
    -                           "United States", geounit))

    +
    country_shape_upd <- country_shape_crop %>%
    +  mutate(geounit = if_else(geounit == "United States of America", 
    +                           "United States", geounit))

    To merge the data and make a map, we begin with the map file, merge the estimates data, and then plot. Let’s use the outcomes we created in section 14.5 for the table output (covid_worry_country_ests and covid_educ_ests). Figures 14.2 and 14.3 display the maps for each measure.

    -
    covid_sf <- country_shape_upd %>%
    -  full_join(covid_worry_country_ests, 
    -            by = c("geounit" = "Country")) %>%
    -  full_join(covid_educ_ests,
    -            by = c("geounit" = "Country"))
    -
    -ggplot() +
    -  geom_sf(data = covid_sf, aes(fill = p, geometry = geometry)) +
    -  scale_fill_gradientn(
    -    guide = "colorbar",
    -    name = "Percent",
    -    labels = scales::comma,
    -    colors = c("#BFD7EA", "#087E8B", "#0B3954"),
    -    na.value = NA
    -  ) +
    -  geom_sf_pattern(
    -    data = filter(covid_sf, is.na(p)),
    -    pattern = "crosshatch",
    -    pattern_fill = "black",
    -    fill = NA
    -  ) +
    -  theme_minimal()
    +
    covid_sf <- country_shape_upd %>%
    +  full_join(covid_worry_country_ests, 
    +            by = c("geounit" = "Country")) %>%
    +  full_join(covid_educ_ests,
    +            by = c("geounit" = "Country"))
    +
    +ggplot() +
    +  geom_sf(data = covid_sf, aes(fill = p, geometry = geometry)) +
    +  scale_fill_gradientn(
    +    guide = "colorbar",
    +    name = "Percent",
    +    labels = scales::comma,
    +    colors = c("#BFD7EA", "#087E8B", "#0B3954"),
    +    na.value = NA
    +  ) +
    +  geom_sf_pattern(
    +    data = filter(covid_sf, is.na(p)),
    +    pattern = "crosshatch",
    +    pattern_fill = "black",
    +    fill = NA
    +  ) +
    +  theme_minimal()
    Percent of people worried someone in their household will get COVID-19 in the next 3 months by country

    FIGURE 14.2: Percent of people worried someone in their household will get COVID-19 in the next 3 months by country

    -
    ggplot() +
    -  geom_sf(data = covid_sf, aes(fill = p_mediumchange, geometry = geometry)) +
    -  scale_fill_gradientn(
    -    guide = "colorbar",
    -    name = "Percent",
    -    labels = scales::comma,
    -    colors = c("#BFD7EA", "#087E8B", "#0B3954"),
    -    na.value = NA
    -  ) +
    -  geom_sf_pattern(
    -    data = filter(covid_sf, is.na(p_mediumchange)),
    -    pattern = "crosshatch",
    -    pattern_fill = "black",
    -    fill = NA
    -  ) +
    -  theme_minimal()
    +
    ggplot() +
    +  geom_sf(data = covid_sf, aes(fill = p_mediumchange, geometry = geometry)) +
    +  scale_fill_gradientn(
    +    guide = "colorbar",
    +    name = "Percent",
    +    labels = scales::comma,
    +    colors = c("#BFD7EA", "#087E8B", "#0B3954"),
    +    na.value = NA
    +  ) +
    +  geom_sf_pattern(
    +    data = filter(covid_sf, is.na(p_mediumchange)),
    +    pattern = "crosshatch",
    +    pattern_fill = "black",
    +    fill = NA
    +  ) +
    +  theme_minimal()
    Percent of students who participated in virtual or hybrid learning

    @@ -1930,25 +1943,25 @@

    14.6 Mapping survey data14.3 we can see that Canada, Mexico, and the United States have missing data (the crosshatch pattern). Reviewing the questionnaires indicate that these three countries did not include the education question in the survey. To better see the differences in the data, it may make sense to remove North America from the map and focus on Central and South America. This is done below by restricting the shape files to Latin America and the Caribbean as seen in Figure 14.4

    -
    covid_c_s <- covid_sf %>%
    -  filter(region_wb == "Latin America & Caribbean")
    -
    -ggplot() +
    -  geom_sf(data = covid_c_s, aes(fill = p_mediumchange, geometry = geometry)) +
    -  scale_fill_gradientn(
    -    guide = "colorbar",
    -    name = "Percent",
    -    labels = scales::comma,
    -    colors = c("#BFD7EA", "#087E8B", "#0B3954"),
    -    na.value = NA
    -  ) +
    -  geom_sf_pattern(
    -    data = filter(covid_c_s, is.na(p_mediumchange)),
    -    pattern = "crosshatch",
    -    pattern_fill = "black",
    -    fill = NA
    -  ) + 
    -  theme_minimal()
    +
    covid_c_s <- covid_sf %>%
    +  filter(region_wb == "Latin America & Caribbean")
    +
    +ggplot() +
    +  geom_sf(data = covid_c_s, aes(fill = p_mediumchange, geometry = geometry)) +
    +  scale_fill_gradientn(
    +    guide = "colorbar",
    +    name = "Percent",
    +    labels = scales::comma,
    +    colors = c("#BFD7EA", "#087E8B", "#0B3954"),
    +    na.value = NA
    +  ) +
    +  geom_sf_pattern(
    +    data = filter(covid_c_s, is.na(p_mediumchange)),
    +    pattern = "crosshatch",
    +    pattern_fill = "black",
    +    fill = NA
    +  ) + 
    +  theme_minimal()
    Percent of students who participated in virtual or hybrid learning, Central and South America

    @@ -1961,17 +1974,17 @@

    14.7 Exercises
  • Calculate the percentage of households with broadband internet and those with any internet at home, including from phone or tablet. Hint: if you see countries with 0% Internet usage, you may want to filter by something first.
  • -
    int_ests <-
    -  ambarom_des %>%
    -  filter(!is.na(Internet) | !is.na(BroadbandInternet)) %>%
    -  group_by(Country) %>%
    -  summarize(
    -    p_broadband = survey_mean(BroadbandInternet, na.rm = TRUE) * 100,
    -    p_internet = survey_mean(Internet, na.rm = TRUE) * 100
    -  ) 
    -
    -int_ests %>%
    -  print(n = 30)
    +
    int_ests <-
    +  ambarom_des %>%
    +  filter(!is.na(Internet) | !is.na(BroadbandInternet)) %>%
    +  group_by(Country) %>%
    +  summarize(
    +    p_broadband = survey_mean(BroadbandInternet, na.rm = TRUE) * 100,
    +    p_internet = survey_mean(Internet, na.rm = TRUE) * 100
    +  ) 
    +
    +int_ests %>%
    +  print(n = 30)
    ## # A tibble: 20 × 5
     ##    Country           p_broadband p_broadband_se p_internet p_internet_se
     ##    <fct>                   <dbl>          <dbl>      <dbl>         <dbl>
    @@ -1998,34 +2011,34 @@ 

    14.7 Exercises
  • Make a faceted map showing both broadband internet and any internet usage.
  • -
    internet_sf <- country_shape_upd %>%
    -  full_join(select(int_ests, p = p_internet, geounit = Country), by = "geounit") %>%
    -  mutate(Type = "Internet")
    -broadband_sf <- country_shape_upd %>%
    -  full_join(select(int_ests, p = p_broadband, geounit = Country), by = "geounit") %>%
    -  mutate(Type = "Broadband")
    -b_int_sf <- internet_sf %>%
    -  bind_rows(broadband_sf) %>%
    -  filter(region_wb == "Latin America & Caribbean")
    -
    -b_int_sf %>%
    -  ggplot(aes(fill = p)) +
    -  geom_sf() +
    -  facet_wrap( ~ Type) +
    -  scale_fill_gradientn(
    -    guide = "colorbar",
    -    name = "Percent",
    -    labels = scales::comma,
    -    colors = c("#BFD7EA", "#087E8B", "#0B3954"),
    -    na.value = NA
    -  ) +
    -  geom_sf_pattern(
    -    data = filter(b_int_sf, is.na(p)),
    -    pattern = "crosshatch",
    -    pattern_fill = "black",
    -    fill = NA
    -  ) +
    -  theme_minimal()
    +
    internet_sf <- country_shape_upd %>%
    +  full_join(select(int_ests, p = p_internet, geounit = Country), by = "geounit") %>%
    +  mutate(Type = "Internet")
    +broadband_sf <- country_shape_upd %>%
    +  full_join(select(int_ests, p = p_broadband, geounit = Country), by = "geounit") %>%
    +  mutate(Type = "Broadband")
    +b_int_sf <- internet_sf %>%
    +  bind_rows(broadband_sf) %>%
    +  filter(region_wb == "Latin America & Caribbean")
    +
    +b_int_sf %>%
    +  ggplot(aes(fill = p)) +
    +  geom_sf() +
    +  facet_wrap( ~ Type) +
    +  scale_fill_gradientn(
    +    guide = "colorbar",
    +    name = "Percent",
    +    labels = scales::comma,
    +    colors = c("#BFD7EA", "#087E8B", "#0B3954"),
    +    na.value = NA
    +  ) +
    +  geom_sf_pattern(
    +    data = filter(b_int_sf, is.na(p)),
    +    pattern = "crosshatch",
    +    pattern_fill = "black",
    +    fill = NA
    +  ) +
    +  theme_minimal()
    Percent of broadband internet and any internet usage, Central and South America

    @@ -2039,7 +2052,7 @@

    14.7 Exercises

    -
    +
    LAPOP. 2021a. “AmericasBarometer 2021 - Canada: Technical Information.” Vanderbilt University; http://datasets.americasbarometer.org/database/files/ABCAN2021-Technical-Report-v1.0-FINAL-eng-110921.pdf.
    @@ -2062,7 +2075,7 @@

    References
      -
    1. See Table 2 in LAPOP (2021c) for dates by country↩︎

    2. +
    3. See Table 2 in LAPOP (2021c) for dates by country↩︎

    @@ -2089,12 +2102,12 @@

    ReferencesReferences - + @@ -57,7 +57,7 @@