From 0f724f3f10130039a80f87d119008054dd738c12 Mon Sep 17 00:00:00 2001 From: Ben Schneider Date: Tue, 11 May 2021 18:50:48 -0400 Subject: [PATCH 1/3] Add filtering joins, with documentation and tests. --- NAMESPACE | 6 + R/join.R | 91 ++++++++++++++ man/dplyr_filter_joins.Rd | 22 ++++ tests/testthat/test_joins.R | 229 ++++++++++++++++++++++++++++++++++++ 4 files changed, 348 insertions(+) create mode 100644 R/join.R create mode 100644 man/dplyr_filter_joins.Rd create mode 100644 tests/testthat/test_joins.R diff --git a/NAMESPACE b/NAMESPACE index 38c204e..f425bae 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +S3method(anti_join,tbl_svy) S3method(as.character,survey_vars) S3method(as.data.frame,tbl_svy) S3method(as_survey,data.frame) @@ -58,6 +59,7 @@ S3method(rename_,tbl_svy) S3method(rename_with,tbl_svy) S3method(select,tbl_svy) S3method(select_,tbl_svy) +S3method(semi_join,tbl_svy) S3method(summarise,grouped_svy) S3method(summarise,tbl_svy) S3method(summarise_,grouped_svy) @@ -70,6 +72,7 @@ S3method(ungroup,tbl_svy) export("%>%") export(across) export(all_vars) +export(anti_join) export(any_vars) export(as_survey) export(as_survey_) @@ -149,6 +152,7 @@ export(select_) export(select_all) export(select_at) export(select_if) +export(semi_join) export(set_survey_vars) export(summarise) export(summarise_) @@ -186,6 +190,7 @@ export(vars) import(rlang) importFrom(dplyr,across) importFrom(dplyr,all_vars) +importFrom(dplyr,anti_join) importFrom(dplyr,any_vars) importFrom(dplyr,c_across) importFrom(dplyr,collect) @@ -241,6 +246,7 @@ importFrom(dplyr,select_) importFrom(dplyr,select_all) importFrom(dplyr,select_at) importFrom(dplyr,select_if) +importFrom(dplyr,semi_join) importFrom(dplyr,summarise) importFrom(dplyr,summarise_) importFrom(dplyr,summarise_all) diff --git a/R/join.R b/R/join.R new file mode 100644 index 0000000..51e7825 --- /dev/null +++ b/R/join.R @@ -0,0 +1,91 @@ +#' @export +semi_join.tbl_svy <- function( + x, + y, + by = NULL, + copy = FALSE, + ..., + na_matches = c("na", "never") +) { + + if (inherits(y, "tbl_svy")) { + y <- y$variables + } + + x <- mutate(x, `___row_number` = dplyr::row_number()) + + filtered_vars <- semi_join(x = x$variables, + y = y, + by = by, + copy = copy, + na_matches = na_matches, + ...) + + x <- mutate(x, `___retained` = `___row_number` %in% filtered_vars[['___row_number']]) + x <- filter(x, `___retained`) + x <- select(x, -`___retained`) + + x + +} + +#' @export +anti_join.tbl_svy <- function( + x, + y, + by = NULL, + copy = FALSE, + ..., + na_matches = c("na", "never") +) { + + if (inherits(y, "tbl_svy")) { + y <- y$variables + } + + x <- mutate(x, `___row_number` = dplyr::row_number()) + + filtered_vars <- anti_join(x = x$variables, + y = y, + by = by, + copy = copy, + na_matches = na_matches, + ...) + + x <- mutate(x, `___retained` = `___row_number` %in% filtered_vars[['___row_number']]) + x <- filter(x, `___retained`) + x <- select(x, -`___retained`) + + x + +} + +# Import + export generics from dplyr and tidyr +#' Filtering joins from dplyr +#' +#' These are data manipulation functions designed to work on a \code{tbl_svy} object +#' and another data frame or \code{tbl_svy} object. +#' +#' \code{semi_join} and \code{anti_join} filter certain observations from a \code{tbl_svy} +#' depending on the presence or absence of matches in another table. +#' See \code{\link[dplyr]{filter-joins}} for more details. +#' +#' Mutating joins (\code{full_join}, \code{left_join}, etc.) are not implemented +#' for any \code{tbl_svy} objects. These data manipulations +#' may require modifications to the survey variable specifications and so +#' cannot be done automatically. Instead, use dplyr to perform them while the +#' data is still stored in data.frames. +#' @name dplyr_filter_joins +NULL + +#' @name semi_join +#' @export +#' @importFrom dplyr semi_join +#' @rdname dplyr_filter_joins +NULL + +#' @name anti_join +#' @export +#' @importFrom dplyr anti_join +#' @rdname dplyr_filter_joins +NULL diff --git a/man/dplyr_filter_joins.Rd b/man/dplyr_filter_joins.Rd new file mode 100644 index 0000000..9bb3e4e --- /dev/null +++ b/man/dplyr_filter_joins.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/join.R +\name{dplyr_filter_joins} +\alias{dplyr_filter_joins} +\alias{semi_join} +\title{Filtering joins from dplyr and tidyr} + +\description{ +These are data manipulation functions designed to work on a \code{tbl_svy} object +and another data frame or \code{tbl_svy} object. +} +\details{ +\code{semi_join} and \code{anti_join} filter certain observations from a \code{tbl_svy} +depending on the presence or absence of matches in another table. +See \code{\link[dplyr]{filter-joins}} for more details. + +Mutating joins (\code{full_join}, \code{left_join}, etc.) are not implemented +for any \code{tbl_svy} objects. These data manipulations +may require modifications to the survey variable specifications and so +cannot be done automatically. Instead, use dplyr to perform them while the +data is still stored in data.frames. +} diff --git a/tests/testthat/test_joins.R b/tests/testthat/test_joins.R new file mode 100644 index 0000000..9813b95 --- /dev/null +++ b/tests/testthat/test_joins.R @@ -0,0 +1,229 @@ +context("filtering joins (semi_join and anti_join) work") + +suppressPackageStartupMessages({ + library(survey) + library(srvyr) + library(dplyr) +}) + +source("utilities.R") + +# Set up example data ---- + + data(api) + + ##_ Create simple stratified survey design object ---- + stratified_design <- apistrat %>% + as_survey_design(strata = stype, weights = pw) + + ##_ Create clustered survey design object ---- + cluster_design <- as_survey_design( + .data = apiclus1, + id = dnum, + weights = pw, + fpc = fpc + ) + + ##_ Create survey design object with calibration weights ---- + ##_ NOTE: The survey package uses special behavior when subsetting such survey designs. + ##_ Rows are never removed, the weights are simply set effectively to zero (technically, Inf) + + ### Add raking weights for school type + pop.types <- data.frame(stype=c("E","H","M"), Freq=c(4421,755,1018)) + pop.schwide <- data.frame(sch.wide=c("No","Yes"), Freq=c(1072,5122)) + + raked_design <- rake( + cluster_design, + sample.margins = list(~stype,~sch.wide), + population.margins = list(pop.types, pop.schwide) + ) + +# semi_join ---- + + test_that( + "semi_join works with `by = NULL`", { + # Stratified design + expect_equal( + ## Calculate statistic, after using a filtering join + object = stratified_design %>% + semi_join(y = filter(apistrat, stype == "E")) %>% + summarize(stat = survey_mean(pcttest)) %>% + pull("stat"), + ## Calculate statistic after manually filtering + expected = stratified_design %>% + filter(stype == "E") %>% + summarize(stat = survey_mean(pcttest)) %>% + pull("stat") + ) + + # Cluster design + expect_equal( + ## Calculate statistic, after using a filtering join + object = cluster_design %>% + semi_join(y = filter(apiclus1, stype == "E")) %>% + summarize(stat = survey_mean(pcttest)) %>% + pull("stat"), + ## Calculate statistic after manually filtering + expected = cluster_design %>% + filter(stype == "E") %>% + summarize(stat = survey_mean(pcttest)) %>% + pull("stat") + ) + + # Calibration weighted design + expect_equal( + ## Calculate statistic, after using a filtering join + object = raked_design %>% + semi_join(y = filter(apiclus1, stype == "E")) %>% + summarize(stat = survey_mean(pcttest)) %>% + pull("stat"), + ## Calculate statistic after manually filtering + expected = raked_design %>% + filter(stype == "E") %>% + summarize(stat = survey_mean(pcttest)) %>% + pull("stat") + ) + }) + + test_that( + "semi_join works with supplied `by` argument", { + # Stratified design + expect_equal( + ## Calculate statistic, after using a filtering join + object = stratified_design %>% + semi_join(y = filter(apistrat, stype == "E"), + by = "stype") %>% + summarize(stat = survey_mean(pcttest)) %>% + pull("stat"), + ## Calculate statistic after manually filtering + expected = stratified_design %>% + filter(stype == "E") %>% + summarize(stat = survey_mean(pcttest)) %>% + pull("stat") + ) + + # Cluster design + expect_equal( + ## Calculate statistic, after using a filtering join + object = cluster_design %>% + semi_join(y = filter(apiclus1, stype == "E"), + by = "stype") %>% + summarize(stat = survey_mean(pcttest)) %>% + pull("stat"), + ## Calculate statistic after manually filtering + expected = cluster_design %>% + filter(stype == "E") %>% + summarize(stat = survey_mean(pcttest)) %>% + pull("stat") + ) + + # Calibration weighted design + expect_equal( + ## Calculate statistic, after using a filtering join + object = raked_design %>% + semi_join(y = filter(apiclus1, stype == "E"), + by = "stype") %>% + summarize(stat = survey_mean(pcttest)) %>% + pull("stat"), + ## Calculate statistic after manually filtering + expected = raked_design %>% + filter(stype == "E") %>% + summarize(stat = survey_mean(pcttest)) %>% + pull("stat") + ) + }) + +# anti_join ---- + + test_that( + "anti_join works with `by = NULL`", { + # Stratified design + expect_equal( + ## Calculate statistic, after using a filtering join + object = stratified_design %>% + anti_join(y = filter(apistrat, stype == "E")) %>% + summarize(stat = survey_mean(pcttest)) %>% + pull("stat"), + ## Calculate statistic after manually filtering + expected = stratified_design %>% + filter(stype != "E") %>% + summarize(stat = survey_mean(pcttest)) %>% + pull("stat") + ) + + # Cluster design + expect_equal( + ## Calculate statistic, after using a filtering join + object = cluster_design %>% + anti_join(y = filter(apiclus1, stype == "E")) %>% + summarize(stat = survey_mean(pcttest)) %>% + pull("stat"), + ## Calculate statistic after manually filtering + expected = cluster_design %>% + filter(stype != "E") %>% + summarize(stat = survey_mean(pcttest)) %>% + pull("stat") + ) + + # Calibration weighted design + expect_equal( + ## Calculate statistic, after using a filtering join + object = raked_design %>% + anti_join(y = filter(apiclus1, stype == "E")) %>% + summarize(stat = survey_mean(pcttest)) %>% + pull("stat"), + ## Calculate statistic after manually filtering + expected = raked_design %>% + filter(stype != "E") %>% + summarize(stat = survey_mean(pcttest)) %>% + pull("stat") + ) + }) + + test_that( + "anti_join works with supplied `by` argument", { + # Stratified design + expect_equal( + ## Calculate statistic, after using a filtering join + object = stratified_design %>% + anti_join(y = filter(apistrat, stype == "E"), + by = "stype") %>% + summarize(stat = survey_mean(pcttest)) %>% + pull("stat"), + ## Calculate statistic after manually filtering + expected = stratified_design %>% + filter(stype != "E") %>% + summarize(stat = survey_mean(pcttest)) %>% + pull("stat") + ) + + # Cluster design + expect_equal( + ## Calculate statistic, after using a filtering join + object = cluster_design %>% + anti_join(y = filter(apiclus1, stype == "E"), + by = "stype") %>% + summarize(stat = survey_mean(pcttest)) %>% + pull("stat"), + ## Calculate statistic after manually filtering + expected = cluster_design %>% + filter(stype != "E") %>% + summarize(stat = survey_mean(pcttest)) %>% + pull("stat") + ) + + # Calibration weighted design + expect_equal( + ## Calculate statistic, after using a filtering join + object = raked_design %>% + anti_join(y = filter(apiclus1, stype == "E"), + by = "stype") %>% + summarize(stat = survey_mean(pcttest)) %>% + pull("stat"), + ## Calculate statistic after manually filtering + expected = raked_design %>% + filter(stype != "E") %>% + summarize(stat = survey_mean(pcttest)) %>% + pull("stat") + ) + }) From 7f2130c75282e624144f916fe10a4d7b0efe7deb Mon Sep 17 00:00:00 2001 From: Ben Schneider Date: Tue, 11 May 2021 21:12:54 -0400 Subject: [PATCH 2/3] Try to fix R CMD check warning about global variable. --- R/join.R | 4 ++-- man/dplyr_filter_joins.Rd | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/join.R b/R/join.R index 51e7825..da296b8 100644 --- a/R/join.R +++ b/R/join.R @@ -22,8 +22,8 @@ semi_join.tbl_svy <- function( ...) x <- mutate(x, `___retained` = `___row_number` %in% filtered_vars[['___row_number']]) - x <- filter(x, `___retained`) - x <- select(x, -`___retained`) + x <- filter(x, .data$`___retained`) + x <- select(x, -.data$`___retained`) x diff --git a/man/dplyr_filter_joins.Rd b/man/dplyr_filter_joins.Rd index 9bb3e4e..28c6f15 100644 --- a/man/dplyr_filter_joins.Rd +++ b/man/dplyr_filter_joins.Rd @@ -3,8 +3,8 @@ \name{dplyr_filter_joins} \alias{dplyr_filter_joins} \alias{semi_join} -\title{Filtering joins from dplyr and tidyr} - +\alias{anti_join} +\title{Filtering joins from dplyr} \description{ These are data manipulation functions designed to work on a \code{tbl_svy} object and another data frame or \code{tbl_svy} object. From f240676f07a95ff9251e548cf6abc37f03566b36 Mon Sep 17 00:00:00 2001 From: Ben Schneider Date: Wed, 12 May 2021 13:26:20 -0400 Subject: [PATCH 3/3] Skip the `___retained` variable and just filter on the expression. Also conditionally remove the `___row_number` variable if it's not already removed internally by `filter()`. --- R/join.R | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/R/join.R b/R/join.R index da296b8..6211c8f 100644 --- a/R/join.R +++ b/R/join.R @@ -21,9 +21,10 @@ semi_join.tbl_svy <- function( na_matches = na_matches, ...) - x <- mutate(x, `___retained` = `___row_number` %in% filtered_vars[['___row_number']]) - x <- filter(x, .data$`___retained`) - x <- select(x, -.data$`___retained`) + x <- filter(x, .data$`___row_number` %in% filtered_vars[['___row_number']]) + if ("___row_number" %in% tbl_vars(x)) { + x <- select(x, -`___row_number`) + } x @@ -52,9 +53,10 @@ anti_join.tbl_svy <- function( na_matches = na_matches, ...) - x <- mutate(x, `___retained` = `___row_number` %in% filtered_vars[['___row_number']]) - x <- filter(x, `___retained`) - x <- select(x, -`___retained`) + x <- filter(x, .data$`___row_number` %in% filtered_vars[['___row_number']]) + if ("___row_number" %in% tbl_vars(x)) { + x <- select(x, -`___row_number`) + } x