From 1aa432f7d779ef08a7192f008f568412fedebea7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isabella=20Vel=C3=A1squez?= Date: Sun, 11 Feb 2024 13:08:45 -0800 Subject: [PATCH] Incorporate UnderstandingSurveyDocs Feedback (#93) * Chapters updates * Small edits * Add bit on methdology * Fix spelling * survey doc sz --------- Co-authored-by: Stephanie Zimmer --- 02-overview-surveys.Rmd | 10 +- ...nderstanding-survey-data-documentation.Rmd | 79 ++--- book.bib | 325 +++++++++--------- renv.lock | 19 +- 4 files changed, 218 insertions(+), 215 deletions(-) diff --git a/02-overview-surveys.Rmd b/02-overview-surveys.Rmd index 533bf1c5..5ab7ff66 100644 --- a/02-overview-surveys.Rmd +++ b/02-overview-surveys.Rmd @@ -6,10 +6,18 @@ Developing surveys to gather accurate information about populations involves a m While this book focuses on the analysis methods of complex surveys, understanding the entire survey life cycle can provide a better insight into what types of analyses should be conducted on the data. The *survey life cycle* consists of the necessary stages to execute a survey project successfully. Each stage influences the survey's timing, costs, and feasibility, consequently impacting the data collected and how we should analyze it. -The survey life cycle starts with a *research topic or question of interest* (e.g., what impact does childhood trauma have on health outcomes later in life). Researchers typically review existing data sources to determine if data are already available that can answer this question, as drawing from available resources can result in a reduced burden on respondents, cheaper research costs, and faster research outcomes. However, if existing data cannot answer the nuances of the research question, a survey can be used to capture the exact data that the researcher needs. +The survey life cycle starts with a *research topic or question of interest* (e.g., what impact does childhood trauma have on health outcomes later in life). Researchers typically review existing data sources to determine if data are already available that can answer this question, as drawing from available resources can result in a reduced burden on respondents, cheaper research costs, and faster research outcomes. However, if existing data cannot answer the nuances of the research question, a survey can be used to capture the exact data that the researcher needs through a questionnaire, or a set of questions. To gain a deeper understanding of survey design and implementation, we recommend reviewing several pieces of existing literature in detail [e.g., @dillman2014mode; @groves2009survey; @Tourangeau2000psych; @Bradburn2004; @valliant2013practical; @biemer2003survqual]. +## Searching for public-use survey data + +Throughout this book, we use public-use datasets from different surveys, including the American National Election Survey (ANES), the Residential Energy Consumption Survey (RECS), the National Crime Victimization Survey (NCVS), and the AmericasBarometer surveys. + +As mentioned above, researchers should look for existing data that can provide insights into their research questions before embarking on a new survey. One of the greatest sources of data is the government. For example, in the U.S., we can get data directly from the various statistical agencies like with RECS and NCVS. Other countries often have data available through official statistics offices, such as the Office for National Statistics in the United Kingdom. 
+ +In addition to government data, many researchers will make their data publicly available through repositories such as the [Inter-university Consortium for Political and Social Research (ICPSR) variable search](https://www.icpsr.umich.edu/web/pages/ICPSR/ssvd/) or the [Odum Institute Data Archive](https://odum.unc.edu/archive/). Searching these repositories or other compiled lists (e.g., [Analyze Survey Data for Free - asdfree.com](https://asdfree.com)) can be an efficient way to identify surveys with questions related to the researcher's topic of interest. + ## Pre-Survey Planning {#pre-survey-planning} There are multiple things to consider when starting a survey. *Errors* are the differences between the true values of the variables being studied and the values obtained through the survey. Each step and decision made before the launch of the survey impact the types of errors that are introduced into the data, which in turn impact how to interpret the results. diff --git a/03-understanding-survey-data-documentation.Rmd b/03-understanding-survey-data-documentation.Rmd index 0076708c..388a098c 100644 --- a/03-understanding-survey-data-documentation.Rmd +++ b/03-understanding-survey-data-documentation.Rmd @@ -11,28 +11,30 @@ library(tidyverse) ## Introduction -Before diving into survey analysis, it's crucial to review the survey documentation thoroughly. The documentation includes technical guides, questionnaires, codebooks, errata, and other useful resources. By taking the time to review these materials, we can gain a comprehensive understanding of the survey data (including research and design decisions discussed in Chapters \@ref(c02-overview-surveys) and \@ref(c10-specifying-sample-designs)) and effectively conduct our analysis. +Survey documentation helps us prepare before we look at the actual survey data. The documentation includes technical guides, questionnaires, codebooks, errata, and other useful resources. By taking the time to review these materials, we can gain a comprehensive understanding of the survey data (including research and design decisions discussed in Chapters \@ref(c02-overview-surveys) and \@ref(c10-specifying-sample-designs)) and conduct our analysis more effectively. -Survey documentation can vary in organization, type, and ease of use. The information may be stored in any format - PDFs, Excel spreadsheets, Word documents, etc. Some surveys save different documentation together, such as providing a single document containing both the codebook and the questionnaire. Others keep them in separate files. Despite these differences, it is important to know what kind of information is available in each documentation type and what to focus on in each one. +Survey documentation can vary in organization, type, and ease of use. The information may be stored in any format - PDFs, Excel spreadsheets, Word documents, and so on. Some surveys bundle documentation together, such as providing the codebook and questionnaire in a single document. Others keep them in separate files. Despite these variations, we can gain a general understanding of the documentation types and what aspects to focus on in each. -## Types of Survey Documentation +## Types of survey documentation -### Technical Documentation -The technical documentation, also known as user guides or methodology/analysis guides, highlights the variables necessary to specify the survey design. 
We recommend focusing on these key sections: +### Technical documentation + +The technical documentation, also known as user guides or methodology/analysis guides, highlights the variables necessary to specify the survey design. We recommend concentrating on these key sections: * **Introduction:** The introduction orients us to the survey. This section provides the project's background, the study's purpose, and the main research questions. * **Study design:** The study design section describes how researchers prepared and administered the survey. - * **Sample:** The sample section describes how researchers selected cases, any sampling error that occurred, and the limitations of the sample. This section can contain recommendations on how to use sampling weights. Look for weight information, whether the survey design contains strata, clusters/PSUs, or replicate weights. Also look for population sizes, finite population correction, or replicate weight scaling information. The sample documentation is critical in successfully running our analysis, and more detail on sample designs is available in Chapter \@ref(c10-specifying-sample-designs). + * **Sample:** The sample section describes the sample frame, any known sampling errors, and the limitations of the sample. This section can contain recommendations on how to use sampling weights. Look for weight information, whether the survey design contains strata, clusters/PSUs, or replicate weights. Also look for population sizes, finite population correction, or replicate weight scaling information. Additional detail on sample designs is available in Chapter \@ref(c10-specifying-sample-designs). + * **Notes on fielding:** Any additional notes on fielding, such as response rates, may be found in the technical documentation. -The technical documentation may include other helpful information. Some technical documentation includes syntax for SAS, SUDAAN, Stata, and/or R, meaning we do not have to create this code from scratch. +The technical documentation may include other helpful resources. Some technical documentation includes syntax for SAS, SUDAAN, Stata, and/or R, so we do not have to create this code from scratch. ### Questionnaires -A questionnaire is a series of questions used to collect information from people in a survey. It can ask about opinions, behaviors, demographics, or even just numbers like the count of lightbulbs, square footage, or farm size. Questionnaires can employ different types of questions, such as closed-ended (e.g., select one or check all that apply), open-ended (e.g., numeric or text), Likert scales, or ranking questions. It may randomize the display order of responses or include instructions to help respondents understand the questions. A survey may have one questionnaire or multiple, depending on its scale and scope. +A questionnaire is a series of questions used to collect information from people in a survey. It can ask about opinions, behaviors, demographics, or even just numbers like the count of lightbulbs, square footage, or farm size. Questionnaires can employ different types of questions, such as closed-ended (e.g., select one or check all that apply), open-ended (e.g., numeric or text), Likert scales (e.g., a 5- or 7-point scale specifying a respondent's level of agreement to a statement), or ranking questions (e.g., a list of options that a respondent ranks by preference). It may randomize the display order of responses or include instructions that help respondents understand the questions. 
A survey may have one questionnaire or multiple, depending on its scale and scope. -The questionnaire is another essential resource for understanding and interpreting the survey data (see Section \@ref(overview-design-questionnaire)), and we should use it alongside any analysis. It provides details about each of the questions asked in the survey, such as question name, question wording, response options, skip logic, randomizations, display specification, mode differences, and the universe (if only a subset of respondents were asked the question). +The questionnaire is another important resource for understanding and interpreting the survey data (see Section \@ref(overview-design-questionnaire)), and we should use it alongside any analysis. It provides details about each of the questions asked in the survey, such as question name, question wording, response options, skip logic, randomizations, display specification, mode differences, and the universe (the subset of respondents that were asked a question). -Below, in Figure \@ref(fig:understand-que-examp), we show a question from the ANES 2020 questionnaire [@anes-svy]. This figure shows a particular question's question name (`POSTVOTE_RVOTE`), description (Did R Vote?), full wording of the question and responses, response order, universe, question logic (if `vote_pre` = 0), and other specifications. The section also includes the variable name, which we can link to the codebook. +Below, in Figure \@ref(fig:understand-que-examp), we show an example from the ANES 2020 questionnaire [@anes-svy]. The figure shows a question's question name (`POSTVOTE_RVOTE`), description (Did R Vote?), full wording of the question and responses, response order, universe, question logic (this question was only asked if `vote_pre` = 0), and other specifications. The section also includes the variable name, which we can link to the codebook. ```{r} #| label: understand-que-examp @@ -40,10 +42,10 @@ Below, in Figure \@ref(fig:understand-que-examp), we show a question from the AN #| fig.cap: ANES 2020 Questionnaire Example #| fig.alt: Question information about the variable postvote_rvote from ANES 2020 questionnaire Survey question, Universe, Logic, Web Spec, Response Order, and Released Variable are included. -knitr::include_graphics(path="images/questionnaire-example.jpg") +knitr::include_graphics(path = "images/questionnaire-example.jpg") ``` -The content and structure of questionnaires vary depending on the specific survey. For instance, question names may be informative (like the ANES example), sequential, or denoted by a code. In some cases, surveys may not use separate names for questions and variables. Figure \@ref(fig:understand-que-examp-2) shows a question from the Behavioral Risk Factor Surveillance System (BRFSS) questionnaire that shows a sequential question number and a coded variable name (as opposed to a question name) [@brfss-svy]. +The content and structure of questionnaires vary depending on the specific survey. For instance, question names may be informative (like the ANES example above), sequential, or denoted by a code. In some cases, surveys may not use separate names for questions and variables. Figure \@ref(fig:understand-que-examp-2) shows an example from the Behavioral Risk Factor Surveillance System (BRFSS) questionnaire that shows a sequential question number and a coded variable name (as opposed to a question name) [@brfss-svy]. 
```{r} #| label: understand-que-examp-2 @@ -51,16 +53,16 @@ The content and structure of questionnaires vary depending on the specific surve #| fig.cap: BRFSS 2021 Questionnaire Example #| fig.alt: Question information about the variable BPHIGH6 from BRFSS 2021 questionnaire. Question number, question text, variable names, responses, skip info and CATI note, interviewer notes, and columns are included. -knitr::include_graphics(path="images/questionnaire-example-2.jpg") +knitr::include_graphics(path = "images/questionnaire-example-2.jpg") ``` -Given the variety in how the survey information is presented in documentation, we need to consider the details of a survey when running our analyses. For example, surveys that use different modes (e.g., web and mail) may have different question wording or skip logic, as a web survey can include fills or automate skip logic. This may result in separate scripts for each mode. Reviewing the questionnaire documentation for the specific survey is crucial in understanding how to interpret the data and findings. +We should factor in the details of a survey when conducting our analyses. For example, surveys that use various modes (e.g., web and mail) may have differences in question wording or skip logic, as web surveys can include fills or automate skip logic. These variations could warrant separate analyses for each mode. ### Codebooks -While a questionnaire provides information about the questions asked to respondents, the codebook explains how the survey data was coded and recorded. The codebook lists details such as variable names, variable labels, variable meanings, codes for missing data, value labels, and value types (whether categorical or continuous, etc.). In particular, the codebook (as opposed to the questionnaire) often includes information on missing data. The codebook enables us to understand and use the variables appropriately in our analysis. Note that some studies use the terminology *data dictionary* rather than codebook. +While a questionnaire provides information about the questions posed to respondents, the codebook explains how the survey data was coded and recorded. It lists details such as variable names, variable labels, variable meanings, codes for missing data, value labels, and value types (whether categorical or continuous, etc.). The codebook helps us understand and use the variables appropriately in our analysis. In particular, the codebook (as opposed to the questionnaire) often includes information on missing data. Note that the term *data dictionary* is sometimes used interchangeably with codebook, but a data dictionary may include more details on the structure and elements of the data. -Figure \@ref(fig:understand-codebook-examp) is a question from the ANES 2020 codebook [@anes-cb]. This part indicates a particular variable's name (`V202066`), question wording, value labels, universe, and associated survey question (`POSTVOTE_RVOTE`). +Figure \@ref(fig:understand-codebook-examp) is a question from the ANES 2020 codebook [@anes-cb]. This section indicates a particular variable's name (`V202066`), question wording, value labels, universe, and associated survey question (`POSTVOTE_RVOTE`). 
```{r} #| label: understand-codebook-examp @@ -71,39 +73,42 @@ Figure \@ref(fig:understand-codebook-examp) is a question from the ANES 2020 cod knitr::include_graphics(path="images/codebook-example.jpg") ``` -Reviewing both questionnaires and codebooks in parallel is important (Figures \@ref(fig:understand-que-examp) and \@ref(fig:understand-codebook-examp)), as questions and variables do not always correspond directly to each other in a one-to-one mapping. A single question may have multiple associated variables, or a single variable may summarize multiple questions. Reviewing the codebook clarifies how to interpret the variables. +Reviewing the questionnaires and codebooks in parallel can clarify how to interpret the variables (Figures \@ref(fig:understand-que-examp) and \@ref(fig:understand-codebook-examp)), as questions and variables do not always correspond directly to each other in a one-to-one mapping. A single question may have multiple associated variables, or a single variable may summarize multiple questions. ### Errata -An erratum (singular) or errata (plural) is a document that lists errors found in a publication or dataset, such as a survey questionnaire. The purpose of an erratum is to correct or update mistakes or inaccuracies in the original document. +An erratum (singular) or errata (plural) is a document that lists errors found in a publication or dataset. The purpose of an erratum is to correct or update inaccuracies in the original document. Examples of errata include: + +* Issuing a corrected data table after realizing a typo or mistake in a table cell +* Reporting incorrectly programmed skips in an electronic survey where questions are skipped by the respondent when they should not have been -For example, if a survey questionnaire contains an error, such as a typo or confusing wording, the researchers would release an erratum that provides a corrected version. Another type of erratum is amending incorrectly programmed skips in an electronic survey where questions are skipped by the respondent when they should not have been. Review these errata before conducting any analysis to ensure the accuracy and reliability of the survey data and analysis. +The 2004 ANES dataset released an erratum, notifying analysts to remove a specific row from the data file due to the inclusion of a respondent who should not have been part of the sample. Adhering to an issued erratum helps us increase the accuracy and reliability of analysis. -### Additional Resources +### Additional resources -Surveys may have additional resources, such as interviewer instructions or "show cards" provided to respondents during interviewer-administered surveys to help respondents answer questions. Explore the survey website to find out what resources were used and in what contexts. +Survey documentation may include additional material, such as interviewer instructions or "show cards" provided to respondents during interviewer-administered surveys to help respondents answer questions. Explore the survey website to find out what resources were used and in what contexts. ## Working with Missing Data Missing data in surveys refers to situations where participants do not provide complete responses to survey questions. Respondents may not have seen a question by design. Or, they may not respond to a question for various other reasons, such as not wanting to answer a particular question, not understanding the question, or simply forgetting to answer. 
-Missing data can be a significant problem in survey analysis, as it can introduce bias and reduce the representativeness of the data. Missing data typically falls into two main categories: missing by design or unintentional mechanisms. +Missing data can be a significant problem in survey analysis, as it can introduce bias and reduce the representativeness of the data. Missing data typically falls into two main categories: missing by design or unintentional missing data. 1. **Missing by design/questionnaire skip logic**: This type of missingness occurs when certain respondents are intentionally directed to skip specific questions based on their previous responses or characteristics. For example, in a survey about employment, if a respondent indicates that they are not employed, they may be directed to skip questions related to their job responsibilities. Additionally, some surveys randomize questions or modules so that not all participants respond to all questions. In these instances, respondents would have missing data for the modules not randomly assigned to them. 2. **Unintentional missing data**: This type of missingness occurs when researchers do not intend for there to be missing data on a particular question, for example, if respondents did not finish the survey or refused to answer individual questions. There are three main types of unintentional missing data that each should be considered and handled differently [@mack; @Schafer2002]: - a. **Missing completely at random (MCAR)**: The missing data is unrelated to both observed and unobserved data, and the probability of being missing is the same across all cases. For example, if a respondent missed a question because they had to leave the survey early due to an emergency. + a. **Missing completely at random (MCAR)**: The missing data are unrelated to both observed and unobserved data, and the probability of being missing is the same across all cases. For example, if a respondent missed a question because they had to leave the survey early due to an emergency. - b. **Missing at random (MAR)**: The missing data is related to observed data but not unobserved data, and the probability of being missing is the same within groups. For example, if older respondents choose not to answer specific questions but younger respondents do answer them and we know the respondent's age. + b. **Missing at random (MAR)**: The missing data are related to observed data but not unobserved data, and the probability of being missing is the same within groups. For example, if older respondents choose not to answer specific questions but younger respondents do answer them and we know the respondent's age. - c. **Missing not at random (MNAR)**: The missing data is related to unobserved data, and the probability of being missing varies for reasons we are not measuring. For example, if respondents with depression do not answer a question about depression severity. + c. **Missing not at random (MNAR)**: The missing data are related to unobserved data, and the probability of being missing varies for reasons we are not measuring. For example, if respondents with depression do not answer a question about depression severity. -The survey documentation, often the codebook, represents the missing data with a code. For example, a survey may have "Yes" responses coded to `1`, "No" responses coded to `2`, and missing responses coded to `-9`. Or, the codebook may list different codes depending on why certain data is missing. 
In the example of variable `V202066` from the ANES (Figure \@ref(fig:understand-codebook-examp)), `-9` represents "Refused," `-7` means that the response was deleted due to an incomplete interview, `-6` means that there is no response because there was no follow-up interview, and `-1` means "Inapplicable" (due to the designed skip pattern). +The survey documentation, often the codebook, represents the missing data with a code. For example, a survey may have "Yes" responses coded to `1`, "No" responses coded to `2`, and missing responses coded to `-9`. Or, the codebook may list different codes depending on why certain data are missing. In the example of variable `V202066` from the ANES (Figure \@ref(fig:understand-codebook-examp)), `-9` represents "Refused," `-7` means that the response was deleted due to an incomplete interview, `-6` means that there is no response because there was no follow-up interview, and `-1` means "Inapplicable" (due to the designed skip pattern). When running analysis in R, we must handle missing responses as missing data (i.e., `NA`) and not numeric data. If missing responses are treated as zeros or arbitrary values, they can artificially alter summary statistics or introduce spurious patterns in the analysis. Recoding these values to `NA` will allow us to handle missing data in different ways in R, such as using functions like `na.omit()`, `complete.cases()`, or specialized packages like {tidyimpute} or {mice}. These tools allow us to treat missing responses as missing data to conduct our analysis accurately and obtain valid results. -Visualizing the missing data can also inform the types of missing data that are present. The {naniar} package provides many valuable missing data visualizations, such as using `gg_miss_var()` to see the count or percent of missing data points by variable or `gg_miss_fct()` to see relationships in missing data across levels of a factor variable. Investigating the relationships and nature of the missing data before running models can ensure that the missing data is accurately accounted for. +Visualizing the missing data can also inform the types of missing data that are present. The {naniar} package provides many valuable missing data visualizations, such as using `gg_miss_var()` to see the count or percent of missing data points by variable or `gg_miss_fct()` to see relationships in missing data across levels of a factor variable. Investigating the relationships and nature of the missing data before running models can ensure that the missing data are accurately accounted for. ### Accounting for Questionnaire Skip Patterns @@ -144,7 +149,7 @@ dat %>% The `drop_na()` function works on `tbl_svy` objects as well and should only be applied after creating the design object. -If the data is not missing completely at random (MCAR), then listwise deletion may produce biased estimates if there is a pattern of respondents who do not respond to specific questions. In these circumstances, we should explore other options, such as multiple imputation or weighted estimation. However, imputation is not always appropriate and can introduce its own sources of bias. See @allison for more details. +If the data are not missing completely at random (MCAR), then listwise deletion may produce biased estimates if there is a pattern of respondents who do not respond to specific questions. In these circumstances, we should explore other options, such as multiple imputation or weighted estimation. 
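To make the multiple imputation option concrete, a minimal sketch with the {mice} package (mentioned earlier in this chapter) might look like the following, assuming `dat` is a respondent-level data frame whose missing codes have already been recoded to `NA`; the settings shown are illustrative rather than recommended defaults, and design-based (weighted) imputation requires additional care.

```r
# A minimal sketch, assuming `dat` is a respondent-level data frame with its
# missing values already recoded to NA; the number of imputations, method,
# and seed below are illustrative only, not recommended defaults.
library(mice)

dat_mice <- mice(dat, m = 5, method = "pmm", seed = 2020) # five imputations via predictive mean matching
dat_complete <- complete(dat_mice, action = 1)            # extract the first completed dataset for inspection
```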
However, imputation is not always appropriate and can introduce its own sources of bias. See @allison for more details. In summary, we need to deeply understand the types and reasons for missing data in our survey before running any analysis. The survey documentation is an important resource for understanding how to deal with missing data. Carefully review the documentation for guidance from the researchers. @@ -152,7 +157,7 @@ In summary, we need to deeply understand the types and reasons for missing data Let's look at the survey documentation for the American National Election Studies (ANES) 2020. The survey website is located at [https://electionstudies.org/data-center/2020-time-series-study/](https://electionstudies.org/data-center/2020-time-series-study/). -Navigating to "User Guide and Codebook" [@anes-cb], we can download the PDF that contains the survey documentation, titled "ANES 2020 Time Series Study Full Release: User Guide and Codebook". Do not be daunted by the 796-page PDF. We can focus on the most critical information. +Navigating to "User Guide and Codebook" [@anes-cb], we can download the PDF that contains the survey documentation, titled "ANES 2020 Time Series Study Full Release: User Guide and Codebook". Do not be daunted by the 796-page PDF. We will focus on the most critical information. #### Introduction {-} @@ -160,11 +165,11 @@ The first section in the User Guide explains that the ANES 2020 Times Series Stu #### Sample Design and Respondent Recruitment {-} -The section "Sample Design and Respondent Recruitment" provides more detail about how the survey was conducted in that it was a sequential mixed-mode design. This means that all three modes were conducted one after another and not at the same time. Additionally, it indicates that for the 2020 survey, they resampled all respondents who participated in 2016 ANES, along with a freshly-drawn cross-section: +The section "Sample Design and Respondent Recruitment" provides more detail about the survey's sequential mixed-mode design. All three modes were conducted one after another and not at the same time. Additionally, it indicates that for the 2020 survey, they resampled all respondents who participated in 2016 ANES, along with a newly-drawn cross-section: > The target population for the fresh cross-section was the 231 million non-institutional U.S. citizens aged 18 or older living in the 50 U.S. states or the District of Columbia. -The document continues with more details on the sample groups. +The document continues with more details on the sample groups. #### Data Analysis, Weights, and Variance Estimation {-} @@ -181,14 +186,10 @@ For weight | Use variance unit/PSU/cluster | and use variance stratum V200010a| V200010c| V200010d V200010b| V200010c| V200010d -The user guide references a supplemental document called "How to Analyze ANES Survey Data" [@debell] as a 'how-to guide' to help us with our analysis. In the how-to guide, we learn more about the weights, including that the weights sum to the sample size and not the population. If we want to create estimates at the population level instead of the sample level, we will need to adjust the weights to the population. Let's recall the "Sample Design and Respondent Recruitment" section: +### Methodology {-} -> The target population for the fresh cross-section was the 231 million non-institutional U.S. citizens aged 18 or older living in the 50 U.S. states or the District of Columbia. 
+The user guide mentions a supplemental document called "How to Analyze ANES Survey Data" [@debell] as a 'how-to guide' for analyzing the data. In this document, we learn more about the weights, where we learn that they sum to the sample size and not the population. If our goal is to calculate estimates for the entire U.S. population instead of just the sample, we must adjust the weights to the U.S. population. To create accurate weights for the population, we need to determine the total population size at the time of the survey. Let's review the "Sample Design and Respondent Recruitment" section for more details: -To create accurate weights for the population, we need to determine the total population size when the survey was conducted. This can be determined using the Current Population Survey (CPS) for March of 2020 as stated in the ANES documentation. Chapter \@ref(c04-set-up) goes into more detail about how to calculate this value and adjust the data. - -## Searching for Public-Use Survey Data -Throughout this book, we use public-use datasets from different surveys. Above, we provided an example from the American National Election Survey (ANES), and we will continue to use this dataset throughout the book. Additionally, we use the Residential Energy Consumption Survey (RECS), the National Crime Victimization Survey (NCVS), and the AmericasBarometer surveys. -As mentioned in Chapter \@ref(c02-overview-surveys), instead of creating a new survey, researchers should look for existing data that can provide insights into their research questions. One of the greatest sources of data is the government. For example, in the U.S., we can get data directly from the various statistical agencies like we have done with RECS and NCVS. Other countries often have data available through official statistics offices, such as the Office for National Statistics in the U.K. +> The target population for the fresh cross-section was the 231 million non-institutional U.S. citizens aged 18 or older living in the 50 U.S. states or the District of Columbia. -In addition to government data, many researchers will make their data publicly available through repositories such as the [Inter-university Consortium for Political and Social Research (ICPSR) variable search](https://www.icpsr.umich.edu/web/pages/ICPSR/ssvd/) or the [Odum Institute Data Archive](https://odum.unc.edu/archive/). Searching these repositories or other compiled lists (e.g., [Analyze Survey Data for Free - asdfree.com](https://asdfree.com)) can be an efficient way to identify surveys with questions related to the researcher's topic of interest. \ No newline at end of file +The documentation suggests that the population should equal around 231 million, but this is a very imprecise count. Upon further investigation in the available resources, we can find the methodology file titled "Methodology Report for the ANES 2020 Time Series Study" [@anes-2020-tech]. This file states that we can use the population total from the Current Population Survey (CPS), a monthly survey sponsored by the U.S. Census Bureau and the U.S. Bureau of Labor Statistics. The CPS provides a more accurate population estimate for a specific month. Therefore, we can use the CPS to get the total population number for March 2020, the time in which the ANES was conducted. Chapter \@ref(c04-set-up) goes into detailed instructions on how to calculate and adjust this value in the data. 
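As a rough preview of that adjustment, a minimal sketch using the full-sample post-election weight `V200010b` from the table above might look like the following; `anes_2020` is a placeholder object name, and the `targetpop` value shown is only approximate, with Chapter \@ref(c04-set-up) deriving the exact CPS-based total.

```r
# A sketch of scaling weights that sum to the sample size so they sum to a
# population total instead; `anes_2020` and `targetpop` are placeholders, and
# Chapter 4 derives the exact CPS-based March 2020 population figure.
library(dplyr)

targetpop <- 231000000 # approximate count noted above; replace with the CPS-based total

anes_adjusted <- anes_2020 %>%
  mutate(weight_pop = V200010b / sum(V200010b) * targetpop) # assumes no missing weights
```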
\ No newline at end of file diff --git a/book.bib b/book.bib index 0529d60b..0d0cfaad 100644 --- a/book.bib +++ b/book.bib @@ -4,8 +4,8 @@ @book{xie2015 year = 2015, publisher = {Chapman and Hall/CRC}, address = {Boca Raton, Florida}, - howpublished = {\url{http://yihui.name/knitr/}}, note = {ISBN 978-1498716963}, + howpublished = {\url{http://yihui.name/knitr/}}, edition = {2nd} } @book{lohr2021sampling, @@ -63,35 +63,34 @@ @misc{lapopdat year = 2023, howpublished = {\url{www.vanderbilt.edu/lapop}} } - @misc{lapop, - author = {{LAPOP}}, - title = {The AmericasBarometer by the LAPOP Lab}, - year = 2023, - howpublished = {\url{www.vanderbilt.edu/lapop}} + title = {The AmericasBarometer by the LAPOP Lab}, + author = {{LAPOP}}, + year = 2023, + howpublished = {\url{www.vanderbilt.edu/lapop}} } @misc{lapop-about, - author = {{LAPOP}}, - title = {About the AmericasBarometer}, - year = 2023, - howpublished = {\url{https://www.vanderbilt.edu/lapop/about-americasbarometer.php}} + title = {About the AmericasBarometer}, + author = {{LAPOP}}, + year = 2023, + howpublished = {\url{https://www.vanderbilt.edu/lapop/about-americasbarometer.php}} } @report{lapop-tech, - title = {AmericasBarometer 2021: Technical Information}, + title = {AmericasBarometer 2021: Technical Information}, author = {{LAPOP}}, year = 2021, howpublished = {\url{https://www.vanderbilt.edu/lapop/ab2021/AB2021-Technical-Report-v1.0-FINAL-eng-030722.pdf}}, institution = {Vanderbilt University} } @report{lapop-can, - title = {AmericasBarometer 2021 - Canada: Technical Information}, + title = {AmericasBarometer 2021 - Canada: Technical Information}, author = {{LAPOP}}, year = 2021, howpublished = {\url{http://datasets.americasbarometer.org/database/files/ABCAN2021-Technical-Report-v1.0-FINAL-eng-110921.pdf}}, institution = {Vanderbilt University} } @report{lapop-usa, - title = {AmericasBarometer 2021 - {U.S.}: Technical Information}, + title = {AmericasBarometer 2021 - {U.S.}: Technical Information}, author = {{LAPOP}}, year = 2021, howpublished = {\url{http://datasets.americasbarometer.org/database/files/ABUSA2021-Technical-Report-v1.0-FINAL-eng-110921.pdf}}, @@ -103,9 +102,6 @@ @misc{lapop-svy year = 2021, howpublished = {\url{https://www.vanderbilt.edu/lapop/ab2021/AB2021-Core-Questionnaire-v17.5-Eng-210514-W-v2.pdf}} } - - - @book{deming1991sample, title = {Sample design in business research}, author = {Deming, W Edwards}, @@ -175,64 +171,58 @@ @article{hansen1987 number = 2, pages = {180--190} } - @misc{anes-svy, title = {ANES 2020 Time Series Study: Pre-Election and Post-Election Survey Questionnaires}, author = {{American National Election Studies}}, year = 2021, howpublished = {\url{https://electionstudies.org/wp-content/uploads/2021/07/anes_timeseries_2020_questionnaire_20210719.pdf}} } - @misc{anes-cb, title = {ANES 2020 Time Series Study Full Release: User Guide and Codebook}, author = {{American National Election Studies}}, year = 2022, - howpublished = {\url{https://electionstudies.org/wp-content/uploads/2022/02/anes_timeseries_2020_userguidecodebook_20220210.pdf}} + howpublished = {\url{https://electionstudies.org/wp-content/uploads/2022/02/anes_timeseries_2020_userguidecodebook_20220210.pdf}} } - @misc{brfss-svy, - title = {Behavioral Risk Factor Surveillance System Survey Questionnaire}, + title = {Behavioral Risk Factor Surveillance System Survey Questionnaire}, author = {{Centers for Disease Control and Prevention (CDC)}}, year = 2021, howpublished = 
{\url{https://www.cdc.gov/brfss/questionnaires/pdf-ques/2021-BRFSS-Questionnaire-1-19-2022-508.pdf}}, institution = {U.S. Department of Health and Human Services, Centers for Disease Control and Prevention} } - @book{allison, - title = {Missing Data}, - author = {Allison, Paul}, - year = 2002, - publisher = {SAGE Publications}, - doi = {10.4135/9781412985079} + title = {Missing Data}, + author = {Allison, Paul}, + year = 2002, + publisher = {SAGE Publications}, + doi = {10.4135/9781412985079} } - @inbook{mack, title = {Types of Missing Data}, - booktitle = {Managing Missing Data in Patient Registries: Addendum to Registries for Evaluating Patient Outcomes: A User’s Guide, Third Edition [Internet]}, author = {Mack, Christina and Su, Zhaohui and Westreich, Daniel}, year = 2018, + booktitle = {Managing Missing Data in Patient Registries: Addendum to Registries for Evaluating Patient Outcomes: A User’s Guide, Third Edition [Internet]}, howpublished = {\url{https://www.ncbi.nlm.nih.gov/books/NBK493614/}}, institution = {Rockville (MD): Agency for Healthcare Research and Quality (US)} } - @report{debell, - title = {How to Analyze ANES Survey Data}, + title = {How to Analyze ANES Survey Data}, author = {DeBell, Matthew}, year = 2010, - type = {ANES Technical Report Series}, - number = {nes012492}, + number = {nes012492}, + type = {ANES Technical Report Series}, howpublished = {\url{https://electionstudies.org/wp-content/uploads/2018/05/HowToAnalyzeANESData.pdf}}, institution = {Palo Alto, CA: Stanford University and Ann Arbor, MI: the University of Michigan} } @article{Schafer2002, - author = {Joseph L Schafer and John W Graham}, - doi = {10.1037//1082-989X.7.2.147}, - issue = {2}, - journal = {Psychological Methods}, - pages = {147-177}, - title = {Missing Data: Our View of the State of the Art}, - volume = {7}, - year = {2002}, + title = {Missing Data: Our View of the State of the Art}, + author = {Joseph L Schafer and John W Graham}, + year = 2002, + journal = {Psychological Methods}, + volume = 7, + pages = {147--177}, + doi = {10.1037//1082-989X.7.2.147}, + issue = 2 } @article{kruskal1980, title = {Representative sampling, IV: The history of the concept in statistics, 1895-1939}, @@ -280,8 +270,12 @@ @misc{recs-2020-tech year = 2023, howpublished = {\url{https://www.eia.gov/consumption/residential/data/2020/pdf/2020%20RECS_Methodology%20Report.pdf}} } - - +@misc{anes-2020-tech, + title = {{Methodology Report for the ANES 2020 Time Series Study}}, + author = {{DeBell, Matthew and Amsbary, Michelle and Brader, Ted and Brock, Shelley and Good, Cindy and Kamens, Justin and Maisel, Natalya and Pinto, Sarah}}, + year = 2022, + howpublished = {\url{https://electionstudies.org/wp-content/uploads/2022/08/anes_timeseries_2020_methodology_report.pdf}} +} @misc{acs-5yr-doc, title = {{American Community Survey 2017-2021 5-Year: PUMS User Guide and Overview}}, author = {{U.S. 
Census Bureau}}, @@ -289,160 +283,157 @@ @misc{acs-5yr-doc howpublished = {\url{https://www2.census.gov/programs-surveys/acs/tech_docs/pums/2017_2021ACS_PUMS_User_Guide.pdf}} } @article{tse-doc, - author = {Biemer, Paul P.}, - title = {Total Survey Error: Design, Implementation, and Evaluation}, - journal = {Public Opinion Quarterly}, - volume = {74}, - number = {5}, - pages = {817-848}, - year = {2010}, - month = {01}, - issn = {0033-362X}, - doi = {10.1093/poq/nfq058}, - howpublished = {\url{https://doi.org/10.1093/poq/nfq058}}, - eprint = {\url{https://academic.oup.com/poq/article-pdf/74/5/817/5138301/nfq058.pdf}}, + title = {Total Survey Error: Design, Implementation, and Evaluation}, + author = {Biemer, Paul P.}, + year = 2010, + month = {01}, + journal = {Public Opinion Quarterly}, + volume = 74, + number = 5, + pages = {817--848}, + doi = {10.1093/poq/nfq058}, + issn = {0033-362X}, + howpublished = {\url{https://doi.org/10.1093/poq/nfq058}}, + eprint = {\url{https://academic.oup.com/poq/article-pdf/74/5/817/5138301/nfq058.pdf}} } @book{dillman2014mode, - title = {Internet, phone, mail, and mixed-mode surveys: The tailored design method}, - author = {Dillman, Don A and Smyth, Jolene D and Christian, Leah Melani}, - year = 2014, - publisher = {John Wiley \& Sons} + title = {Internet, phone, mail, and mixed-mode surveys: The tailored design method}, + author = {Dillman, Don A and Smyth, Jolene D and Christian, Leah Melani}, + year = 2014, + publisher = {John Wiley \& Sons} } @book{groves2009survey, - title = {Survey methodology}, - author = {Groves, Robert M and Fowler Jr, Floyd J and Couper, Mick P and Lepkowski, James M and Singer, Eleanor and Tourangeau, Roger}, - year = 2009, - publisher = {John Wiley \& Sons} + title = {Survey methodology}, + author = {Groves, Robert M and Fowler Jr, Floyd J and Couper, Mick P and Lepkowski, James M and Singer, Eleanor and Tourangeau, Roger}, + year = 2009, + publisher = {John Wiley \& Sons} } @book{biemer2003survqual, - title = {Introduction to survey quality}, - author = {Biemer, Paul P. and Lyberg, Lars E.}, - year = 2003, - publisher = {John Wiley \& Sons} + title = {Introduction to survey quality}, + author = {Biemer, Paul P. 
and Lyberg, Lars E.}, + year = 2003, + publisher = {John Wiley \& Sons} } @techreport{harter2016address, - title = {Address-based sampling}, - author = {Harter, Rachel and Battaglia, Michael P and Buskirk, Trent D and Dillman, Don A and English, Ned and Fahimi, Mansour and Frankel, Martin R and Kennel, Timothy and McMichael, Joseph P and McPhee, Cameron Brook and Montaquila, Jill and Yancey, Tracie and Zuckerberg, Andrew L.}, - year = 2016, - howpublished = {\url{https://aapor.org/wp-content/uploads/2022/11/AAPOR_Report_1_7_16_CLEAN-COPY-FINAL-2.pdf}}, - institution = {American Association for Public Opinion Research}, - type = {Task force report} + title = {Address-based sampling}, + author = {Harter, Rachel and Battaglia, Michael P and Buskirk, Trent D and Dillman, Don A and English, Ned and Fahimi, Mansour and Frankel, Martin R and Kennel, Timothy and McMichael, Joseph P and McPhee, Cameron Brook and Montaquila, Jill and Yancey, Tracie and Zuckerberg, Andrew L.}, + year = 2016, + howpublished = {\url{https://aapor.org/wp-content/uploads/2022/11/AAPOR_Report_1_7_16_CLEAN-COPY-FINAL-2.pdf}}, + institution = {American Association for Public Opinion Research}, + type = {Task force report} } @article{DeLeeuw_2018, - title = {Mixed-Mode: Past, Present, and Future}, - author = {DeLeeuw, Edith D.}, - year = 2018, - month = {Aug.}, - journal = {Survey Research Methods}, - volume = 12, - number = 2, - pages = {75–89}, - doi = {10.18148/srm/2018.v12i2.7402}, - howpublished = {\url{https://ojs.ub.uni-konstanz.de/srm/article/view/7402}} + title = {Mixed-Mode: Past, Present, and Future}, + author = {DeLeeuw, Edith D.}, + year = 2018, + month = {Aug.}, + journal = {Survey Research Methods}, + volume = 12, + number = 2, + pages = {75–89}, + doi = {10.18148/srm/2018.v12i2.7402}, + howpublished = {\url{https://ojs.ub.uni-konstanz.de/srm/article/view/7402}} } @article{biemer_choiceplus, - title = {{Using Bonus Monetary Incentives to Encourage Web Response in Mixed-Mode Household Surveys}}, - author = {Biemer, Paul P. and Murphy, Joe and Zimmer, Stephanie and Berry, Chip and Deng, Grace and Lewis, Katie}, - year = 2017, - month = {06}, - journal = {Journal of Survey Statistics and Methodology}, - volume = 6, - number = 2, - pages = {240-261}, - doi = {10.1093/jssam/smx015}, - issn = {2325-0984}, - howpublished = {\url{https://doi.org/10.1093/jssam/smx015}}, - eprint = {https://academic.oup.com/jssam/article-pdf/6/2/240/24807375/smx015.pdf} + title = {{Using Bonus Monetary Incentives to Encourage Web Response in Mixed-Mode Household Surveys}}, + author = {Biemer, Paul P. and Murphy, Joe and Zimmer, Stephanie and Berry, Chip and Deng, Grace and Lewis, Katie}, + year = 2017, + month = {06}, + journal = {Journal of Survey Statistics and Methodology}, + volume = 6, + number = 2, + pages = {240--261}, + doi = {10.1093/jssam/smx015}, + issn = {2325-0984}, + howpublished = {\url{https://doi.org/10.1093/jssam/smx015}}, + eprint = {https://academic.oup.com/jssam/article-pdf/6/2/240/24807375/smx015.pdf} } @book{Bradburn2004, - author = {Norman M. Bradburn and Seymour Sudman and Brian Wansink}, - edition = {2nd Edition}, - publisher = {Jossey-Bass}, - title = {Asking Questions: The Definitive Guide to Questionnaire Design}, - year = {2004}, + title = {Asking Questions: The Definitive Guide to Questionnaire Design}, + author = {Norman M. Bradburn and Seymour Sudman and Brian Wansink}, + year = 2004, + publisher = {Jossey-Bass}, + edition = {2nd Edition} } @book{Fowler1989, - author = {Floyd J Fowler and Thomas W. 
Mangione}, - publisher = {SAGE}, - title = {Standardized Survey Interviewing}, - year = {1989}, + title = {Standardized Survey Interviewing}, + author = {Floyd J Fowler and Thomas W. Mangione}, + year = 1989, + publisher = {SAGE} } @book{Kim2021, - author = {Jae Kwang Kim and Jun Shao}, - publisher = {Chapman \& Hall/CRC Press}, - title = {Statistical Methods for Handling Incomplete Data}, - year = {2021}, + title = {Statistical Methods for Handling Incomplete Data}, + author = {Jae Kwang Kim and Jun Shao}, + year = 2021, + publisher = {Chapman \& Hall/CRC Press} } @book{Schouten2018, - author = {Barry Schouten and Andy Peytchev and James Wagner}, - publisher = {Chapman \& Hall/CRC Press}, - title = {Adaptive Survey Design}, - year = {2018}, + title = {Adaptive Survey Design}, + author = {Barry Schouten and Andy Peytchev and James Wagner}, + year = 2018, + publisher = {Chapman \& Hall/CRC Press} } @book{Tourangeau2000psych, - author = {Roger Tourangeau and Lance J. Rips and Kenneth Rasinski}, - publisher = {Cambridge University Press}, - title = {Psychology of Survey Response}, - year = {2000}, + title = {Psychology of Survey Response}, + author = {Roger Tourangeau and Lance J. Rips and Kenneth Rasinski}, + year = 2000, + publisher = {Cambridge University Press} } @article{Tourangeau2004spacing, - author = {Roger Tourangeau and Mick P. Couper and Frederick Conrad}, - isbn = {0033-362X}, - issn = {0033362X}, - issue = {3}, - journal = {Public Opinion Quarterly}, - pages = {368-393}, - publisher = {Oxford University Press}, - title = {Sapcing, Position, and Order: Interpretive Heuristics for Visual Features of Survey Questions}, - volume = {68}, - howpublished = {\url{http://www.jstor.org/stable/3521676 http://www.jstor.org/page/info/about/policies/terms.jsp}}, - year = {2004}, + title = {Sapcing, Position, and Order: Interpretive Heuristics for Visual Features of Survey Questions}, + author = {Roger Tourangeau and Mick P. Couper and Frederick Conrad}, + year = 2004, + journal = {Public Opinion Quarterly}, + publisher = {Oxford University Press}, + volume = 68, + pages = {368--393}, + isbn = {0033-362X}, + issn = {0033362X}, + issue = 3, + howpublished = {\url{http://www.jstor.org/stable/3521676 http://www.jstor.org/page/info/about/policies/terms.jsp}} } @book{Valliant2018weights, - author = {Richard Valliant and Jill A. Dever}, - publisher = {Stata Press}, - title = {Survey Weights: A Step-by-step Guide to Calculation}, - year = {2018}, + title = {Survey Weights: A Step-by-step Guide to Calculation}, + author = {Richard Valliant and Jill A. Dever}, + year = 2018, + publisher = {Stata Press} } @article{deLeeuw2005, - author = {DeLeeuw, Edith D.}, - issue = {2}, - journal = {Journal of Official Statistics}, - pages = {233-255}, - title = {To Mix or Not to Mix Data Collection Modes in Surveys}, - volume = {21}, - year = {2005}, -} - + title = {To Mix or Not to Mix Data Collection Modes in Surveys}, + author = {DeLeeuw, Edith D.}, + year = 2005, + journal = {Journal of Official Statistics}, + volume = 21, + pages = {233--255}, + issue = 2 +} @inbook{Skinner2009, - author = {Chris Skinner}, - editor = {C.R. 
Rao}, - title = {Chapter 15: Statistical Disclosure Control for Survey Data}, - booktitle = {Handbook of Statistics: Sample Surveys: Design, Methods and Applications}, - pages = {381-396}, - publisher = {Elsevier B.V.}, - year = {2009}, -} - + title = {Chapter 15: Statistical Disclosure Control for Survey Data}, + author = {Chris Skinner}, + year = 2009, + booktitle = {Handbook of Statistics: Sample Surveys: Design, Methods and Applications}, + publisher = {Elsevier B.V.}, + pages = {381--396}, + editor = {C.R. Rao} +} @misc{pennstate506, - title = {STAT 506: Sampling Theory and Methods [Online Course]}, - author = {{Penn State}}, - year = 2019, - howpublished = {\url{https://online.stat.psu.edu/stat506/}} + title = {STAT 506: Sampling Theory and Methods [Online Course]}, + author = {{Penn State}}, + year = 2019, + howpublished = {\url{https://online.stat.psu.edu/stat506/}} } - @proceedings{Scott2007, - title = {Rao-Scott corrections and their impact}, - author = {Alastair Scott}, - series = {Section on Survey Research Methods}, - publisher = {ASA}, - pages = {3514-3518}, - year = {2007}, - howpublished = {\url{http://www.asasrms.org/Proceedings/y2007/Files/JSM2007-000874.pdf}} -} - + title = {Rao-Scott corrections and their impact}, + author = {Alastair Scott}, + year = 2007, + publisher = {ASA}, + series = {Section on Survey Research Methods}, + pages = {3514--3518}, + howpublished = {\url{http://www.asasrms.org/Proceedings/y2007/Files/JSM2007-000874.pdf}} +} @book{git-w-R, - title = {Happy Git and GitHub for the useR}, - author = {Jenny Bryan and Jim Hester}, - howpublished = {\url{https://happygitwithr.com/}} -} \ No newline at end of file + title = {Happy Git and GitHub for the useR}, + author = {Jenny Bryan and Jim Hester}, + year = {2023}, + howpublished = {\url{https://happygitwithr.com/}} +} diff --git a/renv.lock b/renv.lock index d2cab0b6..48319288 100644 --- a/renv.lock +++ b/renv.lock @@ -1410,10 +1410,13 @@ }, "prettyunits": { "Package": "prettyunits", - "Version": "1.1.1", + "Version": "1.2.0", "Source": "Repository", "Repository": "CRAN", - "Hash": "95ef9167b75dde9d2ccc3c7528393e7e" + "Requirements": [ + "R" + ], + "Hash": "6b01fc98b1e86c4f705ce9dcfd2f57c7" }, "processx": { "Package": "processx", @@ -1826,9 +1829,9 @@ "Source": "GitHub", "RemoteType": "github", "RemoteHost": "api.github.com", - "RemoteUsername": "gergness", "RemoteRepo": "srvyr", - "RemoteRef": "main", + "RemoteUsername": "gergness", + "RemoteRef": "HEAD", "RemoteSha": "1917f75487fa40f2ea6fd4e33323cd9278afb356", "Requirements": [ "R", @@ -1842,7 +1845,7 @@ "tidyselect", "vctrs" ], - "Hash": "932c30103619651286c6eba783e9a248" + "Hash": "c77ebba142d814788bab0092bf102f6d" }, "srvyr.data": { "Package": "srvyr.data", @@ -1852,12 +1855,12 @@ "RemoteHost": "api.github.com", "RemoteUsername": "tidy-survey-r", "RemoteRepo": "srvyr.data", - "RemoteRef": "main", - "RemoteSha": "1f84dfdd630dde7fecc1a26b44543cd45674d08d", + "RemoteRef": "4deed50c78bbbd9847da001724a7f2315d06e8f5", + "RemoteSha": "4deed50c78bbbd9847da001724a7f2315d06e8f5", "Requirements": [ "R" ], - "Hash": "5a90f75ff1373bd3a1906d6712576c14" + "Hash": "e5749bb48c61981ab533e63145cd305d" }, "stringi": { "Package": "stringi",