Resolve #80

Part of this work was done live during today's LIBD rstats club, although I didn't find the solution to this bug during that time.
LieberInstitute · Jul 12, 2024 · 08c2b2d · 08c2b2d
1 parent 8cc5dd0
commit 08c2b2d
Show file tree

Hide file tree

Showing 5 changed files with 128 additions and 15 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,14 @@
+# spatialLIBD 1.17.6
+
+BUG FIXES
+
+* Fixed the bug reported by @lahuuki about `vis_grid_clus()` not handling
+`logical()` cluster variables.
+See <https://github.com/LieberInstitute/spatialLIBD/issues/80>. To resolve this,
+`sort_clusters()` and `get_colors()` had to change internally. Examples and
+documentation for both functions have now been updated to showcase what happens
+when you provide a `logical()` vector as an input.
+
 # spatialLIBD 1.17.5
 
 NEW FEATURES

diff --git a/R/get_colors.R b/R/get_colors.R
@@ -31,6 +31,17 @@
 #'
 #' ## Example where Polychrome::palette36.colors() gets used
 #' get_colors(clusters = letters[seq_len(13)])
+#'
+#' ## What happens if you have a logical variable with NAs?
+#' set.seed(20240712)
+#' log_var <- sample(c(TRUE, FALSE, NA),
+#'     1000,
+#'     replace = TRUE,
+#'     prob = c(0.3, 0.15, 0.55))
+#' log_var_sorted <- sort_clusters(log_var)
+#' ## A color does get assigned to 'NA', but will be overwritten by
+#' ## 'na_color' passed to `vis_clus_p()` and related functions.
+#' get_colors(colors = NULL, clusters = log_var_sorted)
 get_colors <- function(colors = NULL, clusters) {
     n_clus <- length(unique(clusters))
 
@@ -67,7 +78,12 @@ get_colors <- function(colors = NULL, clusters) {
                     "purple"
                 )
             }
-        names(colors) <- seq_len(length(colors))
+        ## Subset to the actual number of values if we are working with < 12
+        colors <- colors[seq_len(n_clus)]
+
+        ## Set the names of the colors in a way compatible with how names
+        ## are set in vis_clus_p().
+        names(colors) <- levels(factor(clusters))
     } else if (all(unique(as.character(clusters)) %in% c(gsub("ayer", "", names(colors)), NA))) {
         names(colors) <- gsub("ayer", "", names(colors))
     }

diff --git a/R/sort_clusters.R b/R/sort_clusters.R
@@ -1,15 +1,15 @@
 #' Sort clusters by frequency
 #'
-#' This function takes a vector with cluster labels and sorts it by frequency
-#' such that the most frequent cluster is the first one and so on.
+#' This function takes a vector with cluster labels, recasts it as a `factor()`,
+#' and sorts the `factor()` levels by frequency such that the most frequent
+#' cluster is the first level and so on.
 #'
 #' @param clusters A vector with cluster labels.
 #' @param map_subset A logical vector of length equal to `clusters` specifying
 #' which elements of `clusters` to use to determine the ranking of the clusters.
 #'
-#' @return A factor of length equal to `clusters` where the levels are the new
-#' ordered clusters and the names of the factor are the original values from
-#' `clusters`.
+#' @return A `factor()` version of `clusters` where the levels are ordered by
+#' frequency.
 #'
 #' @export
 #'
@@ -21,9 +21,49 @@
 #' ## In this case, it's a character vector
 #' class(clus)
 #'
-#' ## Sort them and obtain a factor
+#' ## We see that we have 10 elements in this vector, which is
+#' ## an unnamed character vector
+#' clus
+#'
+#' ## letter 'd' is the most frequent
+#' table(clus)
+#'
+#' ## Sort them and obtain a factor. Notice that the values are the
+#' ## original cluster labels; only the order of the factor levels
+#' ## changes, so that it reflects how frequent each cluster is.
 #' sort_clusters(clus)
+#'
+#' ## Since 'd' was the most frequent, it gets assigned to the first level
+#' ## in the factor variable.
+#' table(sort_clusters(clus))
+#'
+#' ## If we skip the first 3 values of clus (which are all 'd'), we can
+#' ## change the most frequent cluster. And thus the ordering of the
+#' ## factor levels.
+#' sort_clusters(clus, map_subset = seq_len(length(clus)) > 3)
+#'
+#' ## Let's try with a factor variable
+#' clus_factor <- factor(clus)
+#' ## sort_clusters() returns an identical result in this case
+#' stopifnot(identical(sort_clusters(clus), sort_clusters(clus_factor)))
+#'
+#' ## What happens if you have a logical variable with NAs?
+#' set.seed(20240712)
+#' log_var <- sample(c(TRUE, FALSE, NA),
+#'     1000,
+#'     replace = TRUE,
+#'     prob = c(0.3, 0.15, 0.55))
+#' ## Here, the NAs are the most frequent group.
+#' table(log_var, useNA = "ifany")
+#'
+#' ## The NAs are not used for sorting. Since we have more 'TRUE' than 'FALSE'
+#' ## values, 'TRUE' becomes the first level.
+#' table(sort_clusters(log_var), useNA = "ifany")
 sort_clusters <- function(clusters, map_subset = NULL) {
+    if (is.logical(clusters)) {
+        clusters <- as.character(clusters)
+    }
+
     if (is.null(map_subset)) {
         map_subset <- rep(TRUE, length(clusters))
     } else {
@@ -36,6 +76,5 @@ sort_clusters <- function(clusters, map_subset = NULL) {
     }
     map <-
         rank(length(clusters[map_subset]) - table(clusters[map_subset]), ties.method = "first")
-    res <- map[clusters]
-    factor(res)
+    factor(clusters, levels = names(sort(map)))
 }
diff --git a/man/get_colors.Rd b/man/get_colors.Rd
diff --git a/man/sort_clusters.Rd b/man/sort_clusters.Rd