From d7651c82d7af3ef65835c48137fe492c4e95ae8b Mon Sep 17 00:00:00 2001 From: Benjamin Elbers Date: Wed, 20 Sep 2023 13:23:17 +0200 Subject: [PATCH] add dendrogram visualization --- DESCRIPTION | 4 +- NAMESPACE | 2 + NEWS.md | 3 + R/compression.R | 152 ++++++++++++++++++++++++++++++ tests/testthat/test_compression.R | 9 ++ vignettes/plotting.Rmd | 12 ++- 6 files changed, 179 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 681fc88..a1bedfc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -34,7 +34,9 @@ Suggests: ggplot2, scales, tidycensus, - tigris + tigris, + rrapply, + dendextend URL: https://elbersb.github.io/segregation/ BugReports: https://github.com/elbersb/segregation/issues RoxygenNote: 7.2.3 diff --git a/NAMESPACE b/NAMESPACE index 6c5aa92..ca17fde 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +S3method(as.dendrogram,segcompression) S3method(print,segcompression) export(compress) export(dissimilarity) @@ -23,4 +24,5 @@ export(segplot) import(RcppProgress) import(data.table) importFrom(Rcpp,sourceCpp) +importFrom(stats,as.dendrogram) useDynLib(segregation, .registration = TRUE) diff --git a/NEWS.md b/NEWS.md index d4803a3..3f90765 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,8 @@ # segregation (development version) +- various improvements to compression algorithm +- add dendrogram visualization + # segregation 1.0.0 - add mutual_total_nested diff --git a/R/compression.R b/R/compression.R index cd64fe2..0df212e 100644 --- a/R/compression.R +++ b/R/compression.R @@ -238,3 +238,155 @@ get_crosswalk <- function(compression, n_units = NULL, percent = NULL, parts = F setnames(combined, "unit", compression$unit) combined } + +#' @importFrom stats as.dendrogram +#' @export +as.dendrogram.segcompression <- function(object, ...) { + if (utils::tail(object$iterations$N_units, 1) != 1) { + stop("Compression algorithm needs to be run on complete dataset (max_iter = Inf)") + } + if (!requireNamespace("rrapply", quietly = TRUE)) { + stop("Please install rrapply to use this function") + } + if (!requireNamespace("dendextend", quietly = TRUE)) { + stop("Please install dendextend to use this function") + } + + reduction <- 0 + tree <- list() + + for (i in seq_len(nrow(object$iterations))) { + old_unit <- object$iterations[i, ][["old_unit"]] + new_unit <- object$iterations[i, ][["new_unit"]] + M_wgt <- object$iterations[i, ][["M_wgt"]] + reduction <- reduction + M_wgt + + if (length(tree) > 0) { + find_old <- rrapply::rrapply(tree, + classes = "list", + condition = function(x, .xname) x == old_unit, + f = function(x, .xpos) .xpos, + how = "flatten" + ) + find_new <- rrapply::rrapply(tree, + classes = "list", + condition = function(x, .xname) x == new_unit, + f = function(x, .xpos) .xpos, + how = "flatten" + ) + } else { + find_old <- list() + find_new <- list() + } + + if (length(find_old) > 0 && length(find_new) > 0) { + indices <- sort(c(find_old[[1]][1], find_new[[1]][1])) + combined <- list(tree[[indices[1]]], tree[[indices[2]]]) + attr(combined, "height") <- reduction + + tree[[indices[1]]] <- combined + tree[[indices[2]]] <- NULL + } else if (length(find_old) > 0 || length(find_new) > 0) { + if (length(find_old) > 0) { + index <- find_old[[1]][1] + new_item <- list(new_unit) + attr(new_item, "label") <- new_unit + } else { + index <- find_new[[1]][1] + new_item <- list(old_unit) + attr(new_item, "label") <- old_unit + } + attr(new_item, "height") <- 0 + attr(new_item, "members") <- 1 + attr(new_item, "leaf") <- TRUE + + if (M_wgt == attr(tree[[index]], "height")) { + tree[[index]][[length(tree[[index]]) + 1]] <- new_item + } else { + combined <- list(tree[[index]], new_item) + attr(combined, "height") <- reduction + tree[[index]] <- combined + } + } else { + lhs <- list(old_unit) + attr(lhs, "label") <- old_unit + attr(lhs, "height") <- 0 + attr(lhs, "members") <- 1 + attr(lhs, "leaf") <- TRUE + rhs <- list(new_unit) + attr(rhs, "label") <- new_unit + attr(rhs, "height") <- 0 + attr(rhs, "members") <- 1 + attr(rhs, "leaf") <- TRUE + combined <- list(lhs, rhs) + attr(combined, "height") <- M_wgt + attr(combined, "members") <- 2 + tree[[length(tree) + 1]] <- combined + } + } + + dend <- tree[[1]] + class(dend) <- "dendrogram" + dend <- dendextend::fix_members_attr.dendrogram(dend) + dend <- midcache.dendrogram(dend) + return(dend) +} + +# copied from stats -- not exported +midcache.dendrogram <- function(x) { + stopifnot(inherits(x, "dendrogram")) + setmid <- function(d, type) { + depth <- 0L + kk <- integer() + jj <- integer() + dd <- list() + repeat { + if (!stats::is.leaf(d)) { + k <- length(d) + if (k < 1) { + stop("dendrogram node with non-positive #{branches}") + } + depth <- depth + 1L + kk[depth] <- k + if (storage.mode(jj) != storage.mode(kk)) { + storage.mode(jj) <- storage.mode(kk) + } + dd[[depth]] <- d + d <- d[[jj[depth] <- 1L]] + next + } + while (depth) { + k <- kk[depth] + j <- jj[depth] + r <- dd[[depth]] + r[[j]] <- unclass(d) + if (j < k) { + break + } + depth <- depth - 1L + midS <- sum(vapply(r, .midDend, 0)) + attr(r, "midpoint") <- (.memberDend(r[[1L]]) + midS) / 2 + d <- r + } + if (!depth) { + break + } + dd[[depth]] <- r + d <- r[[jj[depth] <- j + 1L]] + } + d + } + setmid(x) +} + +.midDend <- function(x) { + attr(x, "midpoint") %||% 0 +} + +.memberDend <- function(x) { + attr(x, "x.member") %||% (attr(x, "members") %||% 1L) +} + +`%||%` <- function(L, R) { + if (is.null(L)) R else L +} diff --git a/tests/testthat/test_compression.R b/tests/testthat/test_compression.R index a53bd70..06b9626 100644 --- a/tests/testthat/test_compression.R +++ b/tests/testthat/test_compression.R @@ -147,3 +147,12 @@ test_that("local neighbors", { res_local$iterations[pct_M > 0.99][.N][["N_units"]] ) }) + +test_that("dendrogram", { + dend <- as.dendrogram(res_all) + expect_equal(attr(dend, "height"), res_all$iterations$M[[1]]) + expect_equal(attr(dend, "members"), length(unique(res_all$data$school))) + + res_limited <- compress(subset, "race", "school", weight = "n", max_iter = 5) + expect_error(as.dendrogram(res_limited)) +}) diff --git a/vignettes/plotting.Rmd b/vignettes/plotting.Rmd index 824bdbf..fa0df46 100644 --- a/vignettes/plotting.Rmd +++ b/vignettes/plotting.Rmd @@ -1,8 +1,8 @@ --- -title: "Visualizing segregation" +title: "Visualizing and compressing segregation" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Visualizing segregation} + %\VignetteIndexEntry{Visualizing and compressing segregation} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -134,6 +134,14 @@ between the number of merges and the loss in segregation information: scree_plot(comp) ``` +Another way to learn more about the compression is to visualize the information as a dendrogram: + +```{r} +dend <- as.dendrogram(comp) +dendextend::labels(dend) <- NULL # remove the labels +plot(dend) +``` + The third step is to create a new dataset based on the desired level of compression. This can be achieved using the function `merge_units()`, and either `n_units` or `percent` can be specified to indicate the desired level of compression.