From c3f12b2996084646fcf218495dfe8f7cd3183607 Mon Sep 17 00:00:00 2001 From: Benjamin Elbers Date: Tue, 3 Oct 2023 15:30:53 +0200 Subject: [PATCH] allow multiple curves in `segcurve` function --- NEWS.md | 1 + R/plots.R | 51 ++++++++++++++++++++++++------------- man/segcurve.Rd | 17 +++++++------ tests/testthat/test_plots.R | 8 ++++++ vignettes/plotting.Rmd | 21 ++++++++++++--- 5 files changed, 70 insertions(+), 28 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3f90765..daa109f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,7 @@ - various improvements to compression algorithm - add dendrogram visualization +- allow multiple curves in `segcurve` function # segregation 1.0.0 diff --git a/R/plots.R b/R/plots.R index 9c2fadc..b9e69c1 100644 --- a/R/plots.R +++ b/R/plots.R @@ -170,50 +170,67 @@ segplot <- function(data, group, unit, weight, order = "segregation", #' A visual representation of two-group segregation #' -#' Produces a segregation curve, as defined in Duncan and Duncan (1955) +#' Produces one or several segregation curves, as defined in Duncan and Duncan (1955) #' #' @param data A data frame. -#' @param group A categorical variable or a vector of variables -#' contained in \code{data}. Defines the first dimension -#' over which segregation is computed. -#' @param unit A categorical variable or a vector of variables -#' contained in \code{data}. Defines the second dimension -#' over which segregation is computed. +#' @param group A categorical variable contained in \code{data}. +#' Defines the first dimension over which segregation is computed. +#' @param unit A categorical variable contained in \code{data}. +#' Defines the second dimension over which segregation is computed. #' @param weight Numeric. (Default \code{NULL}) +#' @param segment A categorical variable contained in \code{data}. (Default \code{NULL}) +#' If given, several segregation curves will be shown, one for each segment. #' @return Returns a ggplot2 object. 
#' @import data.table #' @export -segcurve <- function(data, group, unit, weight) { +segcurve <- function(data, group, unit, weight = NULL, segment = NULL) { if (!requireNamespace("ggplot2", quietly = TRUE)) { stop("Please install ggplot2 to use this function") } stopifnot(length(group) == 1) stopifnot(length(unit) == 1) - d <- prepare_data(data, group, unit, weight) + d <- prepare_data(data, group, unit, weight, within = segment) # easier if renamed setnames(d, group, "group") setnames(d, unit, "unit") + if (is.null(segment)) { + d[["segment"]] <- 1 + } else { + stopifnot(length(segment) == 1) + setnames(d, segment, "segment") + d[["segment"]] <- as.factor(d[["segment"]]) + } if (d[, uniqueN(group)] != 2) { stop("requires exactly two groups") } - wide <- dcast(d, unit ~ group, value.var = "freq", fill = 0) - group_names <- names(wide)[2:3] + wide <- dcast(d, segment + unit ~ group, value.var = "freq", fill = 0) + group_names <- names(wide)[3:4] setnames(wide, group_names, c("group1", "group2")) wide[, pct_group_1 := group1 / (group1 + group2)] - setorder(wide, pct_group_1) - wide[, cumul_prob_1 := cumsum(group1) / sum(group1)] - wide[, cumul_prob_2 := cumsum(group2) / sum(group2)] + setorder(wide, segment, pct_group_1) + wide[, cumul_prob_1 := cumsum(group1) / sum(group1), by = .(segment)] + wide[, cumul_prob_2 := cumsum(group2) / sum(group2), by = .(segment)] - ggplot2::ggplot(wide, ggplot2::aes(x = cumul_prob_2, y = cumul_prob_1)) + + p <- ggplot2::ggplot(wide, ggplot2::aes(x = cumul_prob_2, y = cumul_prob_1)) + ggplot2::annotate(geom = "segment", x = 0, y = 0, xend = 1, yend = 1, colour = "darkgray") + - ggplot2::geom_line() + ggplot2::scale_x_continuous(labels = scales::percent_format(accuracy = 1)) + ggplot2::scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + ggplot2::labs( x = paste("Cumulative % ", group_names[2]), y = paste("Cumulative % ", group_names[1]) - ) + ) + + ggplot2::coord_fixed() + + if (is.null(segment)) { + p <- p + 
ggplot2::geom_line() + } else { + p <- p + + ggplot2::geom_line(ggplot2::aes(color = segment)) + + ggplot2::labs(color = segment) + } + + return(p) } diff --git a/man/segcurve.Rd b/man/segcurve.Rd index c169f06..6bac546 100644 --- a/man/segcurve.Rd +++ b/man/segcurve.Rd @@ -4,24 +4,25 @@ \alias{segcurve} \title{A visual representation of two-group segregation} \usage{ -segcurve(data, group, unit, weight) +segcurve(data, group, unit, weight = NULL, segment = NULL) } \arguments{ \item{data}{A data frame.} -\item{group}{A categorical variable or a vector of variables -contained in \code{data}. Defines the first dimension -over which segregation is computed.} +\item{group}{A categorical variable contained in \code{data}. +Defines the first dimension over which segregation is computed.} -\item{unit}{A categorical variable or a vector of variables -contained in \code{data}. Defines the second dimension -over which segregation is computed.} +\item{unit}{A categorical variable contained in \code{data}. +Defines the second dimension over which segregation is computed.} \item{weight}{Numeric. (Default \code{NULL})} + +\item{segment}{A categorical variable contained in \code{data}. (Default \code{NULL}) +If given, several segregation curves will be shown, one for each segment.} } \value{ Returns a ggplot2 object. 
} \description{ -Produces a segregation curve, as defined in Duncan and Duncan (1955) +Produces one or several segregation curves, as defined in Duncan and Duncan (1955) } diff --git a/tests/testthat/test_plots.R b/tests/testthat/test_plots.R index 08bd34d..931cc4c 100644 --- a/tests/testthat/test_plots.R +++ b/tests/testthat/test_plots.R @@ -60,6 +60,8 @@ test_that("axis_labels", { test_that("segcurve", { expect_error(segcurve(schools00, "race", "school", weight = "n")) + expect_error(segcurve(schools00, "race", "school", weight = "n", segment = c("a", "b"))) + expect_error(segcurve(schools00, "race", "school", weight = "n", segment = c("state", "school"))) p1 <- segcurve(subset(schools00, race %in% c("white", "black")), "race", "school", @@ -69,4 +71,10 @@ test_that("segcurve", { "race", "school", weight = "n" ) + p3 <- segcurve(subset(schools00, race %in% c("white", "asian")), + "race", "school", + weight = "n", + segment = "state" + ) + expect_equal(p3$labels$colour, "state") }) diff --git a/vignettes/plotting.Rmd b/vignettes/plotting.Rmd index feb344f..5df6a8a 100644 --- a/vignettes/plotting.Rmd +++ b/vignettes/plotting.Rmd @@ -32,15 +32,30 @@ detail [in this working paper](https://osf.io/preprints/socarxiv/ruw4g/). ## Segregation curve The segregation curve was first introduced by [Duncan and Duncan (1955)](https://www.jstor.org/stable/2088328). -The function `segcurve()` provides a simple way of plotting a segregation curve: +The function `segcurve()` provides a simple way of plotting one or several segregation curves: ```{r} -segcurve(subset(schools00, race %in% c("white", "black")), +segcurve(subset(schools00, race %in% c("white", "asian")), "race", "school", - weight = "n" + weight = "n", + segment = "state" # leave this out to produce a single curve ) ``` +In this case, state `A` is the most segregated, while states `B` and `C` are similarly segregated, +but at a lower level. 
Segregation curves are closely related to the index of dissimilarity, and +here this corresponds to the following index values: + +```{r} +# converting to data.table makes this easier +data.table::as.data.table(schools00)[ + race %in% c("white", "asian"), + dissimilarity(.SD, "race", "school", weight = "n"), + by = .(state) +] +``` + + ## Segplot The function `segplot()` is provided to generate segplots. Segplots are described in more