From c3f12b2996084646fcf218495dfe8f7cd3183607 Mon Sep 17 00:00:00 2001
From: Benjamin Elbers <elbersb@gmail.com>
Date: Tue, 3 Oct 2023 15:30:53 +0200
Subject: [PATCH] allow multiple curves in `segcurve` function

---
 NEWS.md                     |  1 +
 R/plots.R                   | 51 ++++++++++++++++++++++++-------------
 man/segcurve.Rd             | 17 +++++++------
 tests/testthat/test_plots.R |  8 ++++++
 vignettes/plotting.Rmd      | 21 ++++++++++++---
 5 files changed, 70 insertions(+), 28 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 3f90765..daa109f 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -2,6 +2,7 @@
 
 - various improvements to compression algorithm
 - add dendrogram visualization
+- allow multiple curves in `segcurve` function
 
 # segregation 1.0.0
 
diff --git a/R/plots.R b/R/plots.R
index 9c2fadc..b9e69c1 100644
--- a/R/plots.R
+++ b/R/plots.R
@@ -170,50 +170,67 @@ segplot <- function(data, group, unit, weight, order = "segregation",
 
 #' A visual representation of two-group segregation
 #'
-#' Produces a segregation curve, as defined in Duncan and Duncan (1955)
+#' Produces one or several segregation curves, as defined in Duncan and Duncan (1955)
 #'
 #' @param data A data frame.
-#' @param group A categorical variable or a vector of variables
-#'   contained in \code{data}. Defines the first dimension
-#'   over which segregation is computed.
-#' @param unit A categorical variable or a vector of variables
-#'   contained in \code{data}. Defines the second dimension
-#'   over which segregation is computed.
+#' @param group A categorical variable contained in \code{data}.
+#'   Defines the first dimension over which segregation is computed.
+#' @param unit A categorical variable contained in \code{data}.
+#'   Defines the second dimension over which segregation is computed.
 #' @param weight Numeric. (Default \code{NULL})
+#' @param segment A categorical variable contained in \code{data}. (Default \code{NULL})
+#'   If given, several segregation curves will be shown, one for each segment.
 #' @return Returns a ggplot2 object.
 #' @import data.table
 #' @export
-segcurve <- function(data, group, unit, weight) {
+segcurve <- function(data, group, unit, weight = NULL, segment = NULL) {
     if (!requireNamespace("ggplot2", quietly = TRUE)) {
         stop("Please install ggplot2 to use this function")
     }
 
     stopifnot(length(group) == 1)
     stopifnot(length(unit) == 1)
-    d <- prepare_data(data, group, unit, weight)
+    d <- prepare_data(data, group, unit, weight, within = segment)
     # easier if renamed
     setnames(d, group, "group")
     setnames(d, unit, "unit")
+    if (is.null(segment)) {
+        d[["segment"]] <- 1
+    } else {
+        stopifnot(length(segment) == 1)
+        setnames(d, segment, "segment")
+        d[["segment"]] <- as.factor(d[["segment"]])
+    }
 
     if (d[, uniqueN(group)] != 2) {
         stop("requires exactly two groups")
     }
 
-    wide <- dcast(d, unit ~ group, value.var = "freq", fill = 0)
-    group_names <- names(wide)[2:3]
+    wide <- dcast(d, segment + unit ~ group, value.var = "freq", fill = 0)
+    group_names <- names(wide)[3:4]
     setnames(wide, group_names, c("group1", "group2"))
     wide[, pct_group_1 := group1 / (group1 + group2)]
-    setorder(wide, pct_group_1)
-    wide[, cumul_prob_1 := cumsum(group1) / sum(group1)]
-    wide[, cumul_prob_2 := cumsum(group2) / sum(group2)]
+    setorder(wide, segment, pct_group_1)
+    wide[, cumul_prob_1 := cumsum(group1) / sum(group1), by = .(segment)]
+    wide[, cumul_prob_2 := cumsum(group2) / sum(group2), by = .(segment)]
 
-    ggplot2::ggplot(wide, ggplot2::aes(x = cumul_prob_2, y = cumul_prob_1)) +
+    p <- ggplot2::ggplot(wide, ggplot2::aes(x = cumul_prob_2, y = cumul_prob_1)) +
         ggplot2::annotate(geom = "segment", x = 0, y = 0, xend = 1, yend = 1, colour = "darkgray") +
-        ggplot2::geom_line() +
         ggplot2::scale_x_continuous(labels = scales::percent_format(accuracy = 1)) +
         ggplot2::scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
         ggplot2::labs(
             x = paste("Cumulative % ", group_names[2]),
             y = paste("Cumulative % ", group_names[1])
-        )
+        ) +
+        ggplot2::coord_fixed()
+
+    if (is.null(segment)) {
+        p <- p + ggplot2::geom_line()
+    } else {
+        p <- p +
+            ggplot2::geom_line(ggplot2::aes(color = segment)) +
+            ggplot2::labs(color = segment)
+    }
+
+    return(p)
 }
diff --git a/man/segcurve.Rd b/man/segcurve.Rd
index c169f06..6bac546 100644
--- a/man/segcurve.Rd
+++ b/man/segcurve.Rd
@@ -4,24 +4,25 @@
 \alias{segcurve}
 \title{A visual representation of two-group segregation}
 \usage{
-segcurve(data, group, unit, weight)
+segcurve(data, group, unit, weight = NULL, segment = NULL)
 }
 \arguments{
 \item{data}{A data frame.}
 
-\item{group}{A categorical variable or a vector of variables
-contained in \code{data}. Defines the first dimension
-over which segregation is computed.}
+\item{group}{A categorical variable contained in \code{data}.
+Defines the first dimension over which segregation is computed.}
 
-\item{unit}{A categorical variable or a vector of variables
-contained in \code{data}. Defines the second dimension
-over which segregation is computed.}
+\item{unit}{A categorical variable contained in \code{data}.
+Defines the second dimension over which segregation is computed.}
 
 \item{weight}{Numeric. (Default \code{NULL})}
+
+\item{segment}{A categorical variable contained in \code{data}. (Default \code{NULL})
+If given, several segregation curves will be shown, one for each segment.}
 }
 \value{
 Returns a ggplot2 object.
 }
 \description{
-Produces a segregation curve, as defined in Duncan and Duncan (1955)
+Produces one or several segregation curves, as defined in Duncan and Duncan (1955)
 }
diff --git a/tests/testthat/test_plots.R b/tests/testthat/test_plots.R
index 08bd34d..931cc4c 100644
--- a/tests/testthat/test_plots.R
+++ b/tests/testthat/test_plots.R
@@ -60,6 +60,8 @@ test_that("axis_labels", {
 
 test_that("segcurve", {
     expect_error(segcurve(schools00, "race", "school", weight = "n"))
+    expect_error(segcurve(schools00, "race", "school", weight = "n", segment = c("a", "b")))
+    expect_error(segcurve(schools00, "race", "school", weight = "n", segment = c("state", "school")))
 
     p1 <- segcurve(subset(schools00, race %in% c("white", "black")),
         "race", "school",
@@ -69,4 +71,10 @@ test_that("segcurve", {
         "race", "school",
         weight = "n"
     )
+    p3 <- segcurve(subset(schools00, race %in% c("white", "asian")),
+        "race", "school",
+        weight = "n",
+        segment = "state"
+    )
+    expect_equal(p3$labels$colour, "state")
 })
diff --git a/vignettes/plotting.Rmd b/vignettes/plotting.Rmd
index feb344f..5df6a8a 100644
--- a/vignettes/plotting.Rmd
+++ b/vignettes/plotting.Rmd
@@ -32,15 +32,30 @@ detail [in this working paper](https://osf.io/preprints/socarxiv/ruw4g/).
 ## Segregation curve
 
 The segregation curve was first introduced by [Duncan and Duncan (1955)](https://www.jstor.org/stable/2088328).
-The function `segcurve()` provides a simple way of plotting a segregation curve:
+The function `segcurve()` provides a simple way of plotting one or several segregation curves:
 
 ```{r}
-segcurve(subset(schools00, race %in% c("white", "black")),
+segcurve(subset(schools00, race %in% c("white", "asian")),
   "race", "school",
-  weight = "n"
+  weight = "n",
+  segment = "state" # leave this out to produce a single curve
 )
 ```
 
+In this case, state `A` is the most segregated, while states `B` and `C` are similarly segregated,
+but at a lower level. Segregation curves are closely related to the index of dissimilarity, and
+here this corresponds to the following index values:
+
+```{r}
+# converting to data.table makes this easier
+data.table::as.data.table(schools00)[
+  race %in% c("white", "asian"),
+  dissimilarity(.SD, "race", "school", weight = "n"),
+  by = .(state)
+]
+```
+
+
 ## Segplot
 
 The function `segplot()` is provided to generate segplots. Segplots are described in more