diff --git a/404.html b/404.html index fd614ffb..605d5842 100644 --- a/404.html +++ b/404.html @@ -23,7 +23,7 @@ - + @@ -57,7 +57,7 @@ @@ -1418,39 +1431,39 @@
chi_ex2_obs <- anes_des %>%
- drop_na(TrustPeople, TrustGovernment) %>%
- group_by(TrustPeople, TrustGovernment) %>%
- summarize(Observed = round(survey_mean(vartype = "ci"), 3), .groups = "drop")
-
-chi_ex2_obs_table <- chi_ex2_obs %>%
- mutate(prop = paste0(Observed, " (", Observed_low, ", ", Observed_upp,
- ")")) %>%
- select(TrustGovernment, TrustPeople, prop) %>%
- pivot_wider(names_from = TrustPeople, values_from = prop) %>%
- gt(rowname_col = "TrustGovernment") %>%
- tab_stubhead(label = "Trust in Government") %>%
- tab_spanner(label = "Trust in People", columns = everything()) %>%
- tab_options(page.orientation = "landscape")
- chi_ex2_obs_table
chi_ex2_obs %>%
+ mutate(TrustPeople = str_c("Trust in People:\n", TrustPeople)) %>%
+ ggplot(aes(x = TrustGovernment, y = Observed, color = TrustGovernment)) +
+ facet_wrap(~TrustPeople, ncol = 5) + geom_point() + geom_errorbar(aes(ymin = Observed_low,
+ ymax = Observed_upp)) + ylab("Proportion") + xlab("") + theme_bw() +
+ scale_color_manual(name = "Trust in Government", values = book_colors) +
+ theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(),
+ legend.position = "bottom") + guides(col = guide_legend(nrow = 2))
To conduct this in R, we use the svychisq()
function to compare the two variables:
<- anes_des %>%
- chi_ex3 drop_na(VotedPres2020_selection, AgeGroup) %>%
- svychisq(formula = ~AgeGroup + VotedPres2020_selection, design = ., statistic = "Chisq",
- na.rm = TRUE)
-
- chi_ex3
chi_ex3 <- anes_des %>%
+ drop_na(VotedPres2020_selection, AgeGroup) %>%
+ svychisq(formula = ~AgeGroup + VotedPres2020_selection, design = ., statistic = "Chisq",
+ na.rm = TRUE)
+
+chi_ex3
##
## Pearson's X^2: Rao & Scott adjustment
##
## data: NextMethod()
## X-squared = 171, df = 10, p-value <2e-16
The output from svychisq()
indicates a difference in how each age group voted in the 2020 election. To get a better idea of the different distributions, let’s output proportions to see the relationship. As we learned in Example 2 above, we can use chi_ex3$observed
, or if we want to get the variance information (which is crucial with survey data), we can use survey_mean()
. Remember, when we have two variables in group_by()
, we obtain the proportions within each level of the variable listed. In this case, we are looking at the distribution of AgeGroup
for each level of VotedPres2020_selection
.
<- anes_des %>%
- chi_ex3_obs filter(VotedPres2020 == "Yes") %>%
- drop_na(VotedPres2020_selection, AgeGroup) %>%
- group_by(VotedPres2020_selection, AgeGroup) %>%
- summarize(Observed = round(survey_mean(vartype = "ci"), 3))
-
-<- chi_ex3_obs %>%
- chi_ex3_obs_table mutate(prop = paste0(Observed, " (", Observed_low, ", ", Observed_upp,
- ")")) %>%
- select(AgeGroup, VotedPres2020_selection, prop) %>%
- pivot_wider(names_from = VotedPres2020_selection, values_from = prop) %>%
- gt(rowname_col = "AgeGroup") %>%
- tab_stubhead(label = "Age Group")
chi_ex3_obs_table
ACUsed
)?<- recs_des %>%
- ttest_solution1 svyttest(design = ., formula = ((ACUsed == TRUE) - 0.5) ~ 0, na.rm = TRUE)
-
- ttest_solution1
ttest_solution1 <- recs_des %>%
+ svyttest(design = ., formula = ((ACUsed == TRUE) - 0.5) ~ 0, na.rm = TRUE)
+
+ttest_solution1
##
## Design-based one-sample t-test
##
@@ -2499,10 +2512,10 @@ 6.5 Exercises
Using the RECS data, does the average temperature that U.S. households set their thermostats to differ between the day and night in the winter (WinterTempDay
and WinterTempNight
)?
-<- recs_des %>%
- ttest_solution2 svyttest(design = ., formula = WinterTempDay - WinterTempNight ~ 0, na.rm = TRUE)
-
- ttest_solution2
+ttest_solution2 <- recs_des %>%
+ svyttest(design = ., formula = WinterTempDay - WinterTempNight ~ 0, na.rm = TRUE)
+
+ttest_solution2
##
## Design-based one-sample t-test
##
@@ -2517,11 +2530,11 @@ 6.5 Exercises
Using the ANES data, does the average age (Age
) of those who voted for Biden in 2020 (VotedPres2020_selection
) differ from those who voted for another candidate?
-<- anes_des %>%
- ttest_solution3 svyttest(design = ., formula = Age ~ VotedPres2020_selection == "Biden",
- na.rm = TRUE)
-
- ttest_solution3
+ttest_solution3 <- anes_des %>%
+ svyttest(design = ., formula = Age ~ VotedPres2020_selection == "Biden",
+ na.rm = TRUE)
+
+ttest_solution3
##
## Design-based t-test
##
@@ -2541,17 +2554,17 @@ 6.5 Exercises<- "c. Test of homogeneity (`svychisq()`)"
- chisq_solution1 chisq_solution1
## [1] "c. Test of homogeneity (`svychisq()`)"
HousingUnitType
) and the year the house was built (YearMade
)?<- recs_des %>%
- chisq_solution2 svychisq(formula = ~HousingUnitType + YearMade, design = ., statistic = "Wald",
- na.rm = TRUE)
-
- chisq_solution2
chisq_solution2 <- recs_des %>%
+ svychisq(formula = ~HousingUnitType + YearMade, design = ., statistic = "Wald",
+ na.rm = TRUE)
+
+chisq_solution2
##
## Design-based Wald test of association
##
@@ -2560,11 +2573,11 @@ 6.5 Exercises
In the ANES data, is there a difference in the distribution of gender (Gender
) across early voting status in 2020 (EarlyVote2020
)?
-<- anes_des %>%
- chisq_solution3 svychisq(formula = ~Gender + EarlyVote2020, design = ., statistic = "F",
- na.rm = TRUE)
-
- chisq_solution3
+chisq_solution3 <- anes_des %>%
+ svychisq(formula = ~Gender + EarlyVote2020, design = ., statistic = "F",
+ na.rm = TRUE)
+
+chisq_solution3
##
## Pearson's X^2: Rao & Scott adjustment
##
@@ -2574,7 +2587,7 @@ 6.5 Exercises
-
+
Lumley, Thomas. 2010. Complex Surveys: A Guide to Analysis Using r: A Guide to Analysis Using r. John Wiley; Sons.
@@ -2619,12 +2632,12 @@ ReferencesReferences
-
+
@@ -57,7 +57,7 @@
@@ -1047,35 +1060,35 @@ 8.3.1.1 Transitioning {srvyr} out
We can add a few more enhancements, such as a title, a data source note, and a footnote with the question information, using the functions tab_header()
, tab_source_note()
, and tab_footnote()
. If having the percentage sign in both the header and the cells seems redundant, we can opt for fmt_number()
instead of fmt_percent()
and scale the number by 100 with scale_by = 100
.
-<- trust_gov_gt %>%
- trust_gov_gt2 tab_header("American voter's trust
- in the federal government, 2020") %>%
-tab_source_note("American National Election Studies, 2020") %>%
- tab_footnote(
- "Question text: How often can you trust the federal government
- in Washington to do what is right?"
-%>%
- ) fmt_number(scale_by = 100,
- decimals = 1)
- trust_gov_gt2
-
-
-
@@ -1576,28 +1589,28 @@ Expanding tables using {gtsummary}<- anes_des %>%
- anes_des_gtsum tbl_svysummary(include = TrustGovernment,
- statistic = list(all_categorical() ~ "{p} ({p.std.error})"))
- anes_des_gtsum
+anes_des_gtsum <- anes_des %>%
+ tbl_svysummary(include = TrustGovernment,
+ statistic = list(all_categorical() ~ "{p} ({p.std.error})"))
+
-
-
@@ -2055,33 +2068,33 @@ Expanding tables using {gtsummary}<- anes_des %>%
- anes_des_gtsum2 tbl_svysummary(
- include = TrustGovernment,
- statistic = list(all_categorical() ~ "{p} ({p.std.error})"),
- missing = "no",
- digits = list(TrustGovernment ~ style_percent),
- label = list(TrustGovernment ~ "Trust in Government, 2020")
- )
- anes_des_gtsum2
-
-
-
@@ -2537,45 +2550,45 @@ Expanding tables using {gtsummary}8.3.1.1.
-<- anes_des %>%
- anes_des_gtsum3 tbl_svysummary(
- include = TrustGovernment,
- statistic = list(all_categorical() ~ "{p} ({p.std.error})"),
- missing = "no",
- digits = list(TrustGovernment ~ style_percent),
- label = list(TrustGovernment ~ "Trust in Government, 2020")
- %>%
- ) modify_footnote(update = everything() ~ NA) %>%
- modify_header(label = " ",
- stat_0 = "% (s.e.)") %>%
- as_gt() %>%
- tab_header("American voter's trust
- in the federal government, 2020") %>%
-tab_source_note("American National Election Studies, 2020") %>%
- tab_footnote(
- "Question text: How often can you trust
- the federal government in Washington
- to do what is right?"
- )
- anes_des_gtsum3
-
-
- anes_des_gtsum4
-
-
- anes_des_gtsum5
-
-
-
@@ -4071,24 +4084,24 @@ 8.3.2 Charts and plots<- anes_des %>%
- anes_des_der mutate(TrustGovernmentUsually = case_when(
- is.na(TrustGovernment) ~ NA,
- TRUE ~ TrustGovernment %in% c("Always", "Most of the time")
- %>%
- )) drop_na(VotedPres2020_selection) %>%
- group_by(VotedPres2020_selection) %>%
- summarize(
- pct_trust = survey_mean(
-
- TrustGovernmentUsually,na.rm = TRUE,
- proportion = TRUE,
- vartype = "ci"
-
- ),.groups = "drop"
-
- )
- anes_des_der
+anes_des_der <- anes_des %>%
+ mutate(TrustGovernmentUsually = case_when(
+ is.na(TrustGovernment) ~ NA,
+ TRUE ~ TrustGovernment %in% c("Always", "Most of the time")
+ )) %>%
+ drop_na(VotedPres2020_selection) %>%
+ group_by(VotedPres2020_selection) %>%
+ summarize(
+ pct_trust = survey_mean(
+ TrustGovernmentUsually,
+ na.rm = TRUE,
+ proportion = TRUE,
+ vartype = "ci"
+ ),
+ .groups = "drop"
+ )
+
+anes_des_der
## # A tibble: 3 × 4
## VotedPres2020_selection pct_trust pct_trust_low pct_trust_upp
## <fct> <dbl> <dbl> <dbl>
@@ -4096,12 +4109,12 @@ 8.3.2 Charts and plots8.1.
-<- anes_des_der %>%
- p ggplot(aes(x = VotedPres2020_selection,
- y = pct_trust)) +
- geom_bar(stat = "identity")
-
- p
+p <- anes_des_der %>%
+ ggplot(aes(x = VotedPres2020_selection,
+ y = pct_trust)) +
+ geom_bar(stat = "identity")
+
+p
8.3.2 Charts and plots<- anes_des_der %>%
- pcolor ggplot(aes(x = VotedPres2020_selection,
- y = pct_trust,
- fill = VotedPres2020_selection)) +
- geom_bar(stat = "identity")
-
- pcolor
+pcolor <- anes_des_der %>%
+ ggplot(aes(x = VotedPres2020_selection,
+ y = pct_trust,
+ fill = VotedPres2020_selection)) +
+ geom_bar(stat = "identity")
+
+pcolor
+pcol_error <- anes_des_der %>%
+ ggplot(aes(x = VotedPres2020_selection,
+ y = pct_trust,
+ fill = VotedPres2020_selection)) +
+ geom_bar(stat = "identity") +
+ geom_errorbar(aes(ymin = pct_trust_low,
+ ymax = pct_trust_upp),
+ width = .2)
+
+pcol_error
8.3.2 Charts and plots. We can specify specific colors for fill
using scale_fill_manual()
. Inside the function, we provide a vector of values corresponding to the colors in our plot. These values are hexadecimal (hex) color codes, denoted by a leading pound sign #
followed by six letters or numbers. The hex code #0b3954
used below is a dark blue. There are many tools online that help pick hex codes, such as htmlcolorcodes.com/.
-<-
- pfull %>%
- anes_des_der ggplot(aes(x = VotedPres2020_selection,
- y = pct_trust,
- fill = VotedPres2020_selection)) +
- geom_bar(stat = "identity") +
- geom_errorbar(aes(ymin = pct_trust_low,
- ymax = pct_trust_upp),
- width = .2) +
- scale_fill_manual(values = c("#0b3954", "#bfd7ea", "#8d6b94")) +
- xlab("Election choice (2020)") +
- ylab("Usually trust the government") +
- scale_y_continuous(labels = scales::percent) +
- guides(fill = "none") +
- labs(title = "Percent of voters who usually trust the government
- by chosen 2020 presidential candidate",
-caption = "Source: American National Election Studies, 2020")
-
- pfull
+pfull <-
+ anes_des_der %>%
+ ggplot(aes(x = VotedPres2020_selection,
+ y = pct_trust,
+ fill = VotedPres2020_selection)) +
+ geom_bar(stat = "identity") +
+ geom_errorbar(aes(ymin = pct_trust_low,
+ ymax = pct_trust_upp),
+ width = .2) +
+ scale_fill_manual(values = c("#0b3954", "#bfd7ea", "#8d6b94")) +
+ xlab("Election choice (2020)") +
+ ylab("Usually trust the government") +
+ scale_y_continuous(labels = scales::percent) +
+ guides(fill = "none") +
+ labs(title = "Percent of voters who usually trust the government
+ by chosen 2020 presidential candidate",
+ caption = "Source: American National Election Studies, 2020")
+
+pfull
-
What we’ve explored in this section are just the foundational aspects of {ggplot2}, and the capabilities of this package extend far beyond what we’ve covered. Advanced features such as annotation, faceting, and theming allow for more sophisticated and customized visualizations. The book Wickham (2023) is a comprehensive guide to learning more about this powerful tool.
+What we’ve explored in this section are just the foundational aspects of {ggplot2}, and the capabilities of this package extend far beyond what we’ve covered. Advanced features such as annotation, faceting, and theming allow for more sophisticated and customized visualizations. The book Wickham (2023) is a comprehensive guide to learning more about this powerful tool.
References
-
+
———. 2023. Ggplot2: Elegant Graphics for Data Analysis. 3rd Edition. https://ggplot2-book.org/; Springer.
@@ -4200,12 +4213,12 @@ ReferencesReferences
-
+
@@ -57,7 +57,7 @@
+
+ TABLE 11.1: Percentage of Votes by Candidate for Different Missing Data Inclusions
+
+
+
+ Candidate
+
+ Including All Missing Data
+
+
+ Removing Skip Patterns Only
+
+
+ Removing All Missing Data
+
+
+
+ %
+ s.e. (%)
+ %
+ s.e. (%)
+ %
+ s.e. (%)
+
+
+
+ Did not Vote for President in 2016
+32.4%
+0.9%
+NA
+NA
+NA
+NA
+ Hillary Clinton
+33.0%
+0.7%
+48.8%
+0.9%
+49.2%
+0.9%
+ Donald Trump
+29.9%
+0.7%
+44.3%
+0.9%
+44.7%
+0.9%
+ Other Candidate
+4.1%
+0.2%
+6.1%
+0.3%
+6.1%
+0.3%
+ Missing
+0.6%
+0.1%
+0.9%
+0.2%
+NA
+NA
+
+
+
+
+
+As Table 11.1 shows, the results can vary greatly depending on which type of missing data that are removed. If we remove only the skip patterns the margin between the Clinton and Trump is 4.5 percentage points, but if we include all data even including those that did not vote in 2016, the margin is 3.1 percentage points. How we handle the different types of missing values is important for interpretation of the data.
+
+
+
Prevalence rates differ from victimization rates as the numerator is the number of people or households victimized rather than the number of victimizations. To calculate the prevalence rates, we must run another summary of the data by calculating an indicator for whether a person or household is a victim of a particular crime at any point in the year. Below is an example of calculating first the indicator and then the prevalence rate of violent crime and aggravated assault.
-<-
- pers_prev_des %>%
- pers_vsum_slim mutate(Year = floor(YEARQ)) %>%
- mutate(Violent_Ind = sum(Violent) > 0,
- AAST_Ind = sum(AAST) > 0,
- .by = c("Year", "IDHH", "IDPER")) %>%
- as_survey(
- weight = WGTPERCY,
- strata = V2117,
- ids = V2118,
- nest = TRUE
-
- )
-<- pers_prev_des %>%
- pers_prev_ests summarize(Violent_Prev = survey_mean(Violent_Ind * 100),
- AAST_Prev = survey_mean(AAST_Ind * 100))
-
- pers_prev_ests
pers_prev_des <-
+ pers_vsum_slim %>%
+ mutate(Year = floor(YEARQ)) %>%
+ mutate(Violent_Ind = sum(Violent) > 0,
+ AAST_Ind = sum(AAST) > 0,
+ .by = c("Year", "IDHH", "IDPER")) %>%
+ as_survey(
+ weight = WGTPERCY,
+ strata = V2117,
+ ids = V2118,
+ nest = TRUE
+ )
+
+pers_prev_ests <- pers_prev_des %>%
+ summarize(Violent_Prev = survey_mean(Violent_Ind * 100),
+ AAST_Prev = survey_mean(AAST_Ind * 100))
+
+pers_prev_ests
## # A tibble: 1 × 4
## Violent_Prev Violent_Prev_se AAST_Prev AAST_Prev_se
## <dbl> <dbl> <dbl> <dbl>
@@ -2749,10 +2762,10 @@ 13.7 Exercises
What proportion of completed motor vehicle thefts are not reported to the police? Hint: Use the codebook to look at the definition of Type of Crime (V4529).
-<- inc_des %>%
- ans1 filter(str_detect(V4529, "40|41")) %>%
- summarize(Pct = survey_mean(ReportPolice, na.rm = TRUE) * 100)
- ans1
+ans1 <- inc_des %>%
+ filter(str_detect(V4529, "40|41")) %>%
+ summarize(Pct = survey_mean(ReportPolice, na.rm = TRUE) * 100)
+ans1
## # A tibble: 1 × 2
## Pct Pct_se
## <dbl> <dbl>
@@ -2760,9 +2773,9 @@ 13.7 Exercises
How many violent crimes occur in each region?
-%>%
- inc_des filter(Violent) %>%
- survey_count(Region)
+
## # A tibble: 4 × 3
## Region n n_se
## <fct> <dbl> <dbl>
@@ -2773,10 +2786,10 @@ 13.7 Exercises
What is the property victimization rate among each income level?
-%>%
- hh_des group_by(Income) %>%
- summarize(Property_Rate = survey_mean(Property * ADJINC_WT * 1000,
- na.rm = TRUE))
+hh_des %>%
+ group_by(Income) %>%
+ summarize(Property_Rate = survey_mean(Property * ADJINC_WT * 1000,
+ na.rm = TRUE))
## # A tibble: 6 × 3
## Income Property_Rate Property_Rate_se
## <fct> <dbl> <dbl>
@@ -2790,7 +2803,7 @@ 13.7 Exercises
-
+
Bureau of Justice Statistics. 2017. “National Crime Victimization Survey, 2016: Technical Documentation.” https://bjs.ojp.gov/sites/g/files/xyckuh236/files/media/document/ncvstd16.pdf.
@@ -2833,12 +2846,12 @@ ReferencesReferences
-
+
@@ -57,7 +57,7 @@
@@ -1156,11 +1169,11 @@ 14.5 Calculating estimates and ma
Multiple-choice questions are interesting. If we want to look at how education was impacted only among those in school, we need to filter to the relevant responses, which is anyone that responded no to the first part. The variable Educ_NotInSchool
in the dataset has values of 0 and 1. A value of 1 means that the respondent selected the first option in the question (none of the children are in school) and a value of 0 means that at least one of their children are in school. Using this variable, we can filter the data to only those with a value of 0.
There are three additional variables that we can look at that correlate to the second option (Educ_NormalSchool
), third option (Educ_VirtualSchool
), and fourth option (Educ_Hybrid
). An unweighted cross-tab for the responses is included below, and we can see there is a wide-range of impacts and that many combinations of effects on education are possible.
-%>% filter(Educ_NotInSchool == 0) %>%
- ambarom distinct(Educ_NormalSchool,
-
- Educ_VirtualSchool,%>%
- Educ_Hybrid) print(n = 50)
+ambarom %>% filter(Educ_NotInSchool == 0) %>%
+ distinct(Educ_NormalSchool,
+ Educ_VirtualSchool,
+ Educ_Hybrid) %>%
+ print(n = 50)
## # A tibble: 8 × 3
## Educ_NormalSchool Educ_VirtualSchool Educ_Hybrid
## <dbl> <dbl> <dbl>
@@ -1178,58 +1191,58 @@ 14.5 Calculating estimates and ma
Indicator that the education medium was changed - either virtual or hybrid
In this next code chunk, we create these indicators, make national estimates, and display a summary table of the data.
-<- ambarom_des %>%
- ambarom_des_educ filter(Educ_NotInSchool == 0) %>%
- mutate(Educ_OnlyNormal = (Educ_NormalSchool == 1 &
- == 0 &
- Educ_VirtualSchool == 0),
- Educ_Hybrid Educ_MediumChange = (Educ_VirtualSchool == 1 |
- == 1))
- Educ_Hybrid
-<-
- covid_educ_ests %>%
- ambarom_des_educ group_by(Country) %>%
- summarize(
- p_onlynormal = survey_mean(Educ_OnlyNormal, na.rm = TRUE) * 100,
- p_mediumchange = survey_mean(Educ_MediumChange, na.rm = TRUE) * 100,
- p_noschool = survey_mean(Educ_NoSchool, na.rm = TRUE) * 100,
-
- )
-<-covid_educ_ests %>%
- covid_educ_ests_gtgt(rowname_col = "Country") %>%
- cols_label(p_onlynormal = "%",
- p_onlynormal_se = "SE",
- p_mediumchange = "%",
- p_mediumchange_se = "SE",
- p_noschool = "%",
- p_noschool_se = "SE") %>%
- tab_spanner(label = "Normal school only",
- columns = c("p_onlynormal", "p_onlynormal_se")) %>%
- tab_spanner(label = "Medium change",
- columns = c("p_mediumchange", "p_mediumchange_se")) %>%
- tab_spanner(label = "Cut ties with school",
- columns = c("p_noschool", "p_noschool_se")) %>%
- fmt_number(decimals = 1) %>%
- tab_source_note("AmericasBarometer Surveys, 2021")
- covid_educ_ests_gt
-
-
-
@@ -1805,16 +1818,16 @@ 14.5 Calculating estimates and ma
14.6 Mapping survey data
While the table presents the data well, a map could also be used. To obtain maps of the countries, the package {rnaturalearth} is used, subsetting North and South America using the function ne_countries()
. This returns an sf object with many columns but, most importantly soverignt
(sovereignty), geounit
(country or territory), and geometry
(the shape). As an example of the difference between soverignty and country/territory, the United States, Puerto Rico, and the US Virgin Islands are all separate units with the same sovereignty. This map (without data) is plotted in Figure 14.1.
-<-
- country_shape ne_countries(
- scale = "medium",
- returnclass = "sf",
- continent = c("North America", "South America")
-
- )
-%>%
- country_shape ggplot() +
- geom_sf()
+country_shape <-
+ ne_countries(
+ scale = "medium",
+ returnclass = "sf",
+ continent = c("North America", "South America")
+ )
+
+country_shape %>%
+ ggplot() +
+ geom_sf()
14.6 Mapping survey data14.1 is very wide as the Aleutian islands in Alaska extend into the Eastern Hemisphere. We can crop the shape file to only the Western Hemisphere to remove some of the trailing islands of Alaska.
-<- country_shape %>%
- country_shape_crop st_crop(c(xmin = -180,
- xmax = 0,
- ymin = -90,
- ymax = 90))
+
Now that we have the shape files we need, our next step is to match our survey data to the map. Countries can be called by different names (e.g., “U.S”, “U.S.A”, “United States”). To make sure we can plot our survey data on the map, we will need to make sure the country in both datasets match. To do this, we can use the anti_join()
function and check to see what countries are in the survey data but not in the map data. As shown below, the United States is referred to as “United States” in the survey data but “United States of America” in the map data. The code below shows countries in the survey but not the map data.
-<- ambarom %>% distinct(Country)
- survey_country_list %>%
- survey_country_list anti_join(country_shape_crop, by = c("Country" = "geounit"))
+survey_country_list <- ambarom %>% distinct(Country)
+survey_country_list %>%
+ anti_join(country_shape_crop, by = c("Country" = "geounit"))
## # A tibble: 1 × 1
## Country
## <fct>
## 1 United States
The code below shows countries in the map data but not hte survey data.
-%>% as_tibble() %>%
- country_shape_crop select(geounit, sovereignt) %>%
- anti_join(survey_country_list, by = c("geounit" = "Country")) %>%
- arrange(geounit) %>%
- print(n = 30)
+country_shape_crop %>% as_tibble() %>%
+ select(geounit, sovereignt) %>%
+ anti_join(survey_country_list, by = c("geounit" = "Country")) %>%
+ arrange(geounit) %>%
+ print(n = 30)
## # A tibble: 30 × 2
## geounit sovereignt
## <chr> <chr>
@@ -1875,54 +1888,54 @@ 14.6 Mapping survey data<- country_shape_crop %>%
- country_shape_upd mutate(geounit = if_else(geounit == "United States of America",
- "United States", geounit))
+country_shape_upd <- country_shape_crop %>%
+ mutate(geounit = if_else(geounit == "United States of America",
+ "United States", geounit))
To merge the data and make a map, we begin with the map file, merge the estimates data, and then plot. Let’s use the outcomes we created in section 14.5 for the table output (covid_worry_country_ests
and covid_educ_ests
). Figures 14.2 and 14.3 display the maps for each measure.
-<- country_shape_upd %>%
- covid_sf full_join(covid_worry_country_ests,
- by = c("geounit" = "Country")) %>%
- full_join(covid_educ_ests,
- by = c("geounit" = "Country"))
-
-ggplot() +
-geom_sf(data = covid_sf, aes(fill = p, geometry = geometry)) +
- scale_fill_gradientn(
- guide = "colorbar",
- name = "Percent",
- labels = scales::comma,
- colors = c("#BFD7EA", "#087E8B", "#0B3954"),
- na.value = NA
- +
- ) geom_sf_pattern(
- data = filter(covid_sf, is.na(p)),
- pattern = "crosshatch",
- pattern_fill = "black",
- fill = NA
- +
- ) theme_minimal()
+covid_sf <- country_shape_upd %>%
+ full_join(covid_worry_country_ests,
+ by = c("geounit" = "Country")) %>%
+ full_join(covid_educ_ests,
+ by = c("geounit" = "Country"))
+
+ggplot() +
+ geom_sf(data = covid_sf, aes(fill = p, geometry = geometry)) +
+ scale_fill_gradientn(
+ guide = "colorbar",
+ name = "Percent",
+ labels = scales::comma,
+ colors = c("#BFD7EA", "#087E8B", "#0B3954"),
+ na.value = NA
+ ) +
+ geom_sf_pattern(
+ data = filter(covid_sf, is.na(p)),
+ pattern = "crosshatch",
+ pattern_fill = "black",
+ fill = NA
+ ) +
+ theme_minimal()
-
ggplot() +
-geom_sf(data = covid_sf, aes(fill = p_mediumchange, geometry = geometry)) +
- scale_fill_gradientn(
- guide = "colorbar",
- name = "Percent",
- labels = scales::comma,
- colors = c("#BFD7EA", "#087E8B", "#0B3954"),
- na.value = NA
- +
- ) geom_sf_pattern(
- data = filter(covid_sf, is.na(p_mediumchange)),
- pattern = "crosshatch",
- pattern_fill = "black",
- fill = NA
- +
- ) theme_minimal()
+ggplot() +
+ geom_sf(data = covid_sf, aes(fill = p_mediumchange, geometry = geometry)) +
+ scale_fill_gradientn(
+ guide = "colorbar",
+ name = "Percent",
+ labels = scales::comma,
+ colors = c("#BFD7EA", "#087E8B", "#0B3954"),
+ na.value = NA
+ ) +
+ geom_sf_pattern(
+ data = filter(covid_sf, is.na(p_mediumchange)),
+ pattern = "crosshatch",
+ pattern_fill = "black",
+ fill = NA
+ ) +
+ theme_minimal()
14.6 Mapping survey data14.3 we can see that Canada, Mexico, and the United States have missing data (the crosshatch pattern). Reviewing the questionnaires indicate that these three countries did not include the education question in the survey. To better see the differences in the data, it may make sense to remove North America from the map and focus on Central and South America. This is done below by restricting the shape files to Latin America and the Caribbean as seen in Figure 14.4
-<- covid_sf %>%
- covid_c_s filter(region_wb == "Latin America & Caribbean")
-
-ggplot() +
-geom_sf(data = covid_c_s, aes(fill = p_mediumchange, geometry = geometry)) +
- scale_fill_gradientn(
- guide = "colorbar",
- name = "Percent",
- labels = scales::comma,
- colors = c("#BFD7EA", "#087E8B", "#0B3954"),
- na.value = NA
- +
- ) geom_sf_pattern(
- data = filter(covid_c_s, is.na(p_mediumchange)),
- pattern = "crosshatch",
- pattern_fill = "black",
- fill = NA
- +
- ) theme_minimal()
+covid_c_s <- covid_sf %>%
+ filter(region_wb == "Latin America & Caribbean")
+
+ggplot() +
+ geom_sf(data = covid_c_s, aes(fill = p_mediumchange, geometry = geometry)) +
+ scale_fill_gradientn(
+ guide = "colorbar",
+ name = "Percent",
+ labels = scales::comma,
+ colors = c("#BFD7EA", "#087E8B", "#0B3954"),
+ na.value = NA
+ ) +
+ geom_sf_pattern(
+ data = filter(covid_c_s, is.na(p_mediumchange)),
+ pattern = "crosshatch",
+ pattern_fill = "black",
+ fill = NA
+ ) +
+ theme_minimal()
14.7 Exercises
Calculate the percentage of households with broadband internet and those with any internet at home, including from phone or tablet. Hint: if you see countries with 0% Internet usage, you may want to filter by something first.
-<-
- int_ests %>%
- ambarom_des filter(!is.na(Internet) | !is.na(BroadbandInternet)) %>%
- group_by(Country) %>%
- summarize(
- p_broadband = survey_mean(BroadbandInternet, na.rm = TRUE) * 100,
- p_internet = survey_mean(Internet, na.rm = TRUE) * 100
-
- )
-%>%
- int_ests print(n = 30)
+int_ests <-
+ ambarom_des %>%
+ filter(!is.na(Internet) | !is.na(BroadbandInternet)) %>%
+ group_by(Country) %>%
+ summarize(
+ p_broadband = survey_mean(BroadbandInternet, na.rm = TRUE) * 100,
+ p_internet = survey_mean(Internet, na.rm = TRUE) * 100
+ )
+
+int_ests %>%
+ print(n = 30)
## # A tibble: 20 × 5
## Country p_broadband p_broadband_se p_internet p_internet_se
## <fct> <dbl> <dbl> <dbl> <dbl>
@@ -1998,34 +2011,34 @@ 14.7 Exercises
Make a faceted map showing both broadband internet and any internet usage.
-<- country_shape_upd %>%
- internet_sf full_join(select(int_ests, p = p_internet, geounit = Country), by = "geounit") %>%
- mutate(Type = "Internet")
- <- country_shape_upd %>%
- broadband_sf full_join(select(int_ests, p = p_broadband, geounit = Country), by = "geounit") %>%
- mutate(Type = "Broadband")
- <- internet_sf %>%
- b_int_sf bind_rows(broadband_sf) %>%
- filter(region_wb == "Latin America & Caribbean")
-
-%>%
- b_int_sf ggplot(aes(fill = p)) +
- geom_sf() +
- facet_wrap( ~ Type) +
- scale_fill_gradientn(
- guide = "colorbar",
- name = "Percent",
- labels = scales::comma,
- colors = c("#BFD7EA", "#087E8B", "#0B3954"),
- na.value = NA
- +
- ) geom_sf_pattern(
- data = filter(b_int_sf, is.na(p)),
- pattern = "crosshatch",
- pattern_fill = "black",
- fill = NA
- +
- ) theme_minimal()
+internet_sf <- country_shape_upd %>%
+ full_join(select(int_ests, p = p_internet, geounit = Country), by = "geounit") %>%
+ mutate(Type = "Internet")
+broadband_sf <- country_shape_upd %>%
+ full_join(select(int_ests, p = p_broadband, geounit = Country), by = "geounit") %>%
+ mutate(Type = "Broadband")
+b_int_sf <- internet_sf %>%
+ bind_rows(broadband_sf) %>%
+ filter(region_wb == "Latin America & Caribbean")
+
+b_int_sf %>%
+ ggplot(aes(fill = p)) +
+ geom_sf() +
+ facet_wrap( ~ Type) +
+ scale_fill_gradientn(
+ guide = "colorbar",
+ name = "Percent",
+ labels = scales::comma,
+ colors = c("#BFD7EA", "#087E8B", "#0B3954"),
+ na.value = NA
+ ) +
+ geom_sf_pattern(
+ data = filter(b_int_sf, is.na(p)),
+ pattern = "crosshatch",
+ pattern_fill = "black",
+ fill = NA
+ ) +
+ theme_minimal()
14.7 Exercises
-
+
LAPOP. 2021a. “AmericasBarometer 2021 - Canada: Technical Information.” Vanderbilt University; http://datasets.americasbarometer.org/database/files/ABCAN2021-Technical-Report-v1.0-FINAL-eng-110921.pdf.
@@ -2062,7 +2075,7 @@ References
@@ -2089,12 +2102,12 @@ ReferencesReferences
-
+
@@ -57,7 +57,7 @@