From 0f724f3f10130039a80f87d119008054dd738c12 Mon Sep 17 00:00:00 2001
From: Ben Schneider <benjamin.julius.schneider@gmail.com>
Date: Tue, 11 May 2021 18:50:48 -0400
Subject: [PATCH 1/3] Add filtering joins, with documentation and tests.

---
 NAMESPACE                   |   6 +
 R/join.R                    |  91 ++++++++++++++
 man/dplyr_filter_joins.Rd   |  22 ++++
 tests/testthat/test_joins.R | 229 ++++++++++++++++++++++++++++++++++++
 4 files changed, 348 insertions(+)
 create mode 100644 R/join.R
 create mode 100644 man/dplyr_filter_joins.Rd
 create mode 100644 tests/testthat/test_joins.R

diff --git a/NAMESPACE b/NAMESPACE
index 38c204e..f425bae 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
+S3method(anti_join,tbl_svy)
 S3method(as.character,survey_vars)
 S3method(as.data.frame,tbl_svy)
 S3method(as_survey,data.frame)
@@ -58,6 +59,7 @@ S3method(rename_,tbl_svy)
 S3method(rename_with,tbl_svy)
 S3method(select,tbl_svy)
 S3method(select_,tbl_svy)
+S3method(semi_join,tbl_svy)
 S3method(summarise,grouped_svy)
 S3method(summarise,tbl_svy)
 S3method(summarise_,grouped_svy)
@@ -70,6 +72,7 @@ S3method(ungroup,tbl_svy)
 export("%>%")
 export(across)
 export(all_vars)
+export(anti_join)
 export(any_vars)
 export(as_survey)
 export(as_survey_)
@@ -149,6 +152,7 @@ export(select_)
 export(select_all)
 export(select_at)
 export(select_if)
+export(semi_join)
 export(set_survey_vars)
 export(summarise)
 export(summarise_)
@@ -186,6 +190,7 @@ export(vars)
 import(rlang)
 importFrom(dplyr,across)
 importFrom(dplyr,all_vars)
+importFrom(dplyr,anti_join)
 importFrom(dplyr,any_vars)
 importFrom(dplyr,c_across)
 importFrom(dplyr,collect)
@@ -241,6 +246,7 @@ importFrom(dplyr,select_)
 importFrom(dplyr,select_all)
 importFrom(dplyr,select_at)
 importFrom(dplyr,select_if)
+importFrom(dplyr,semi_join)
 importFrom(dplyr,summarise)
 importFrom(dplyr,summarise_)
 importFrom(dplyr,summarise_all)
diff --git a/R/join.R b/R/join.R
new file mode 100644
index 0000000..51e7825
--- /dev/null
+++ b/R/join.R
@@ -0,0 +1,91 @@
+#' @export
+semi_join.tbl_svy <- function(
+  x,
+  y,
+  by = NULL,
+  copy = FALSE,
+  ...,
+  na_matches = c("na", "never")
+) {
+
+  if (inherits(y, "tbl_svy")) {
+    y <- y$variables
+  }
+
+  x <- mutate(x, `___row_number` = dplyr::row_number())
+
+  filtered_vars <- semi_join(x = x$variables,
+                             y = y,
+                             by = by,
+                             copy = copy,
+                             na_matches = na_matches,
+                             ...)
+
+  x <- mutate(x, `___retained` = `___row_number` %in% filtered_vars[['___row_number']])
+  x <- filter(x, `___retained`)
+  x <- select(x, -`___retained`)
+
+  x
+
+}
+
+#' @export
+anti_join.tbl_svy <- function(
+  x,
+  y,
+  by = NULL,
+  copy = FALSE,
+  ...,
+  na_matches = c("na", "never")
+) {
+
+  if (inherits(y, "tbl_svy")) {
+    y <- y$variables
+  }
+
+  x <- mutate(x, `___row_number` = dplyr::row_number())
+
+  filtered_vars <- anti_join(x = x$variables,
+                             y = y,
+                             by = by,
+                             copy = copy,
+                             na_matches = na_matches,
+                             ...)
+
+  x <- mutate(x, `___retained` = `___row_number` %in% filtered_vars[['___row_number']])
+  x <- filter(x, `___retained`)
+  x <- select(x, -`___retained`)
+
+  x
+
+}
+
+# Import + export generics from dplyr and tidyr
+#' Filtering joins from dplyr
+#'
+#' These are data manipulation functions designed to work on a \code{tbl_svy} object
+#' and another data frame or \code{tbl_svy} object.
+#'
+#' \code{semi_join} and \code{anti_join} filter certain observations from a \code{tbl_svy}
+#' depending on the presence or absence of matches in another table.
+#' See \code{\link[dplyr]{filter-joins}} for more details.
+#'
+#' Mutating joins (\code{full_join}, \code{left_join}, etc.) are not implemented
+#' for any \code{tbl_svy} objects. These data manipulations
+#' may require modifications to the survey variable specifications and so
+#' cannot be done automatically. Instead, use dplyr to perform them while the
+#' data is still stored in data.frames.
+#' @name dplyr_filter_joins
+NULL
+
+#' @name semi_join
+#' @export
+#' @importFrom dplyr semi_join
+#' @rdname dplyr_filter_joins
+NULL
+
+#' @name anti_join
+#' @export
+#' @importFrom dplyr anti_join
+#' @rdname dplyr_filter_joins
+NULL
diff --git a/man/dplyr_filter_joins.Rd b/man/dplyr_filter_joins.Rd
new file mode 100644
index 0000000..9bb3e4e
--- /dev/null
+++ b/man/dplyr_filter_joins.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/join.R
+\name{dplyr_filter_joins}
+\alias{dplyr_filter_joins}
+\alias{semi_join}
+\title{Filtering joins from dplyr and tidyr}
+
+\description{
+These are data manipulation functions designed to work on a \code{tbl_svy} object
+and another data frame or \code{tbl_svy} object.
+}
+\details{
+\code{semi_join} and \code{anti_join} filter certain observations from a \code{tbl_svy}
+depending on the presence or absence of matches in another table.
+See \code{\link[dplyr]{filter-joins}} for more details.
+
+Mutating joins (\code{full_join}, \code{left_join}, etc.) are not implemented
+for any \code{tbl_svy} objects. These data manipulations
+may require modifications to the survey variable specifications and so
+cannot be done automatically. Instead, use dplyr to perform them while the
+data is still stored in data.frames.
+}
diff --git a/tests/testthat/test_joins.R b/tests/testthat/test_joins.R
new file mode 100644
index 0000000..9813b95
--- /dev/null
+++ b/tests/testthat/test_joins.R
@@ -0,0 +1,229 @@
+context("filtering joins (semi_join and anti_join) work")
+
+suppressPackageStartupMessages({
+  library(survey)
+  library(srvyr)
+  library(dplyr)
+})
+
+source("utilities.R")
+
+# Set up example data ----
+
+  data(api)
+
+  ##_ Create simple stratified survey design object ----
+    stratified_design <- apistrat %>%
+      as_survey_design(strata = stype, weights = pw)
+
+  ##_ Create clustered survey design object ----
+    cluster_design <- as_survey_design(
+      .data = apiclus1,
+      id = dnum,
+      weights = pw,
+      fpc = fpc
+    )
+
+  ##_ Create survey design object with calibration weights ----
+  ##_ NOTE: The survey package uses special behavior when subsetting such survey designs.
+  ##_       Rows are never removed, the weights are simply set effectively to zero (technically, Inf)
+
+    ### Add raking weights for school type
+      pop.types <- data.frame(stype=c("E","H","M"), Freq=c(4421,755,1018))
+      pop.schwide <- data.frame(sch.wide=c("No","Yes"), Freq=c(1072,5122))
+
+      raked_design <- rake(
+        cluster_design,
+        sample.margins = list(~stype,~sch.wide),
+        population.margins = list(pop.types, pop.schwide)
+      )
+
+# semi_join ----
+
+      test_that(
+        "semi_join works with `by = NULL`", {
+        # Stratified design
+        expect_equal(
+          ## Calculate statistic, after using a filtering join
+          object = stratified_design %>%
+            semi_join(y = filter(apistrat, stype == "E")) %>%
+            summarize(stat = survey_mean(pcttest)) %>%
+            pull("stat"),
+          ## Calculate statistic after manually filtering
+          expected = stratified_design %>%
+            filter(stype == "E") %>%
+            summarize(stat = survey_mean(pcttest)) %>%
+            pull("stat")
+        )
+
+        # Cluster design
+        expect_equal(
+          ## Calculate statistic, after using a filtering join
+          object = cluster_design %>%
+            semi_join(y = filter(apiclus1, stype == "E")) %>%
+            summarize(stat = survey_mean(pcttest)) %>%
+            pull("stat"),
+          ## Calculate statistic after manually filtering
+          expected = cluster_design %>%
+            filter(stype == "E") %>%
+            summarize(stat = survey_mean(pcttest)) %>%
+            pull("stat")
+        )
+
+        # Calibration weighted design
+        expect_equal(
+          ## Calculate statistic, after using a filtering join
+          object = raked_design %>%
+            semi_join(y = filter(apiclus1, stype == "E")) %>%
+            summarize(stat = survey_mean(pcttest)) %>%
+            pull("stat"),
+          ## Calculate statistic after manually filtering
+          expected = raked_design %>%
+            filter(stype == "E") %>%
+            summarize(stat = survey_mean(pcttest)) %>%
+            pull("stat")
+        )
+      })
+
+      test_that(
+        "semi_join works with supplied `by` argument", {
+          # Stratified design
+          expect_equal(
+            ## Calculate statistic, after using a filtering join
+            object = stratified_design %>%
+              semi_join(y = filter(apistrat, stype == "E"),
+                        by = "stype") %>%
+              summarize(stat = survey_mean(pcttest)) %>%
+              pull("stat"),
+            ## Calculate statistic after manually filtering
+            expected = stratified_design %>%
+              filter(stype == "E") %>%
+              summarize(stat = survey_mean(pcttest)) %>%
+              pull("stat")
+          )
+
+          # Cluster design
+          expect_equal(
+            ## Calculate statistic, after using a filtering join
+            object = cluster_design %>%
+              semi_join(y = filter(apiclus1, stype == "E"),
+                        by = "stype") %>%
+              summarize(stat = survey_mean(pcttest)) %>%
+              pull("stat"),
+            ## Calculate statistic after manually filtering
+            expected = cluster_design %>%
+              filter(stype == "E") %>%
+              summarize(stat = survey_mean(pcttest)) %>%
+              pull("stat")
+          )
+
+          # Calibration weighted design
+          expect_equal(
+            ## Calculate statistic, after using a filtering join
+            object = raked_design %>%
+              semi_join(y = filter(apiclus1, stype == "E"),
+                        by = "stype") %>%
+              summarize(stat = survey_mean(pcttest)) %>%
+              pull("stat"),
+            ## Calculate statistic after manually filtering
+            expected = raked_design %>%
+              filter(stype == "E") %>%
+              summarize(stat = survey_mean(pcttest)) %>%
+              pull("stat")
+          )
+        })
+
+# anti_join ----
+
+      test_that(
+        "anti_join works with `by = NULL`", {
+          # Stratified design
+          expect_equal(
+            ## Calculate statistic, after using a filtering join
+            object = stratified_design %>%
+              anti_join(y = filter(apistrat, stype == "E")) %>%
+              summarize(stat = survey_mean(pcttest)) %>%
+              pull("stat"),
+            ## Calculate statistic after manually filtering
+            expected = stratified_design %>%
+              filter(stype != "E") %>%
+              summarize(stat = survey_mean(pcttest)) %>%
+              pull("stat")
+          )
+
+          # Cluster design
+          expect_equal(
+            ## Calculate statistic, after using a filtering join
+            object = cluster_design %>%
+              anti_join(y = filter(apiclus1, stype == "E")) %>%
+              summarize(stat = survey_mean(pcttest)) %>%
+              pull("stat"),
+            ## Calculate statistic after manually filtering
+            expected = cluster_design %>%
+              filter(stype != "E") %>%
+              summarize(stat = survey_mean(pcttest)) %>%
+              pull("stat")
+          )
+
+          # Calibration weighted design
+          expect_equal(
+            ## Calculate statistic, after using a filtering join
+            object = raked_design %>%
+              anti_join(y = filter(apiclus1, stype == "E")) %>%
+              summarize(stat = survey_mean(pcttest)) %>%
+              pull("stat"),
+            ## Calculate statistic after manually filtering
+            expected = raked_design %>%
+              filter(stype != "E") %>%
+              summarize(stat = survey_mean(pcttest)) %>%
+              pull("stat")
+          )
+        })
+
+      test_that(
+        "anti_join works with supplied `by` argument", {
+          # Stratified design
+          expect_equal(
+            ## Calculate statistic, after using a filtering join
+            object = stratified_design %>%
+              anti_join(y = filter(apistrat, stype == "E"),
+                        by = "stype") %>%
+              summarize(stat = survey_mean(pcttest)) %>%
+              pull("stat"),
+            ## Calculate statistic after manually filtering
+            expected = stratified_design %>%
+              filter(stype != "E") %>%
+              summarize(stat = survey_mean(pcttest)) %>%
+              pull("stat")
+          )
+
+          # Cluster design
+          expect_equal(
+            ## Calculate statistic, after using a filtering join
+            object = cluster_design %>%
+              anti_join(y = filter(apiclus1, stype == "E"),
+                        by = "stype") %>%
+              summarize(stat = survey_mean(pcttest)) %>%
+              pull("stat"),
+            ## Calculate statistic after manually filtering
+            expected = cluster_design %>%
+              filter(stype != "E") %>%
+              summarize(stat = survey_mean(pcttest)) %>%
+              pull("stat")
+          )
+
+          # Calibration weighted design
+          expect_equal(
+            ## Calculate statistic, after using a filtering join
+            object = raked_design %>%
+              anti_join(y = filter(apiclus1, stype == "E"),
+                        by = "stype") %>%
+              summarize(stat = survey_mean(pcttest)) %>%
+              pull("stat"),
+            ## Calculate statistic after manually filtering
+            expected = raked_design %>%
+              filter(stype != "E") %>%
+              summarize(stat = survey_mean(pcttest)) %>%
+              pull("stat")
+          )
+        })

From 7f2130c75282e624144f916fe10a4d7b0efe7deb Mon Sep 17 00:00:00 2001
From: Ben Schneider <benjamin.julius.schneider@gmail.com>
Date: Tue, 11 May 2021 21:12:54 -0400
Subject: [PATCH 2/3] Try to fix R CMD check warning about global variable.

---
 R/join.R                  | 4 ++--
 man/dplyr_filter_joins.Rd | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/R/join.R b/R/join.R
index 51e7825..da296b8 100644
--- a/R/join.R
+++ b/R/join.R
@@ -22,8 +22,8 @@ semi_join.tbl_svy <- function(
                              ...)
 
   x <- mutate(x, `___retained` = `___row_number` %in% filtered_vars[['___row_number']])
-  x <- filter(x, `___retained`)
-  x <- select(x, -`___retained`)
+  x <- filter(x, .data$`___retained`)
+  x <- select(x, -.data$`___retained`)
 
   x
 
diff --git a/man/dplyr_filter_joins.Rd b/man/dplyr_filter_joins.Rd
index 9bb3e4e..28c6f15 100644
--- a/man/dplyr_filter_joins.Rd
+++ b/man/dplyr_filter_joins.Rd
@@ -3,8 +3,8 @@
 \name{dplyr_filter_joins}
 \alias{dplyr_filter_joins}
 \alias{semi_join}
-\title{Filtering joins from dplyr and tidyr}
-
+\alias{anti_join}
+\title{Filtering joins from dplyr}
 \description{
 These are data manipulation functions designed to work on a \code{tbl_svy} object
 and another data frame or \code{tbl_svy} object.

From f240676f07a95ff9251e548cf6abc37f03566b36 Mon Sep 17 00:00:00 2001
From: Ben Schneider <benjamin.julius.schneider@gmail.com>
Date: Wed, 12 May 2021 13:26:20 -0400
Subject: [PATCH 3/3] Skip the `___retained` variable and just filter on the
 expression. Also conditionally remove the `___row_number` variable if it's
 not already removed internally by `filter()`.

---
 R/join.R | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/R/join.R b/R/join.R
index da296b8..6211c8f 100644
--- a/R/join.R
+++ b/R/join.R
@@ -21,9 +21,10 @@ semi_join.tbl_svy <- function(
                              na_matches = na_matches,
                              ...)
 
-  x <- mutate(x, `___retained` = `___row_number` %in% filtered_vars[['___row_number']])
-  x <- filter(x, .data$`___retained`)
-  x <- select(x, -.data$`___retained`)
+  x <- filter(x, .data$`___row_number` %in% filtered_vars[['___row_number']])
+  if ("___row_number" %in% tbl_vars(x)) {
+    x <- select(x, -`___row_number`)
+  }
 
   x
 
@@ -52,9 +53,10 @@ anti_join.tbl_svy <- function(
                              na_matches = na_matches,
                              ...)
 
-  x <- mutate(x, `___retained` = `___row_number` %in% filtered_vars[['___row_number']])
-  x <- filter(x, `___retained`)
-  x <- select(x, -`___retained`)
+  x <- filter(x, .data$`___row_number` %in% filtered_vars[['___row_number']])
+  if ("___row_number" %in% tbl_vars(x)) {
+    x <- select(x, -`___row_number`)
+  }
 
   x