From 08c2b2d80a29b1decee578d01f7230b595a9bf5a Mon Sep 17 00:00:00 2001
From: lcolladotor <lcolladotor@gmail.com>
Date: Fri, 12 Jul 2024 10:56:10 -0400
Subject: [PATCH] Resolve #80

Part of this work was done live during today's LIBD rstats club, although I didn't find the solution to this bug during that time.
---
 NEWS.md              | 11 +++++++++
 R/get_colors.R       | 18 ++++++++++++++-
 R/sort_clusters.R    | 55 +++++++++++++++++++++++++++++++++++++-------
 man/get_colors.Rd    | 11 +++++++++
 man/sort_clusters.Rd | 48 +++++++++++++++++++++++++++++++++-----
 5 files changed, 128 insertions(+), 15 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 328c9750..0d9e48e1 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,14 @@
+# spatialLIBD 1.17.6
+
+BUG FIXES
+
+* Fixed the bug reported by @lahuuki about `vis_grid_clus()` not handling
+`logical()` cluster variables.
+See <https://github.com/LieberInstitute/spatialLIBD/issues/80>. To resolve this,
+`sort_clusters()` and `get_colors()` had to change internally. Examples and
+documentation for both functions have now been updated to showcase what happens
+when you provide a `logical()` vector as an input.
+
 # spatialLIBD 1.17.5
 
 NEW FEATURES
diff --git a/R/get_colors.R b/R/get_colors.R
index 7d601839..93b53351 100644
--- a/R/get_colors.R
+++ b/R/get_colors.R
@@ -31,6 +31,17 @@
 #'
 #' ## Example where Polychrome::palette36.colors() gets used
 #' get_colors(clusters = letters[seq_len(13)])
+#'
+#' ## What happens if you have a logical variable with NAs?
+#' set.seed(20240712)
+#' log_var <- sample(c(TRUE, FALSE, NA),
+#'     1000,
+#'     replace = TRUE,
+#'     prob = c(0.3, 0.15, 0.55))
+#' log_var_sorted <- sort_clusters(log_var)
+#' ## A color does get assigned to 'NA', but will be overwritten by
+#' ## 'na_color' passed to `vis_clus_p()` and related functions.
+#' get_colors(colors = NULL, clusters = log_var_sorted)
 get_colors <- function(colors = NULL, clusters) {
     n_clus <- length(unique(clusters))
 
@@ -67,7 +78,12 @@ get_colors <- function(colors = NULL, clusters) {
                     "purple"
                 )
             }
-        names(colors) <- seq_len(length(colors))
+        ## Subset to the actual number of values if we are working with < 12
+        colors <- colors[seq_len(n_clus)]
+
+        ## Set the names of the colors in a way compatible with how names
+        ## are set in vis_clus_p().
+        names(colors) <- levels(factor(clusters))
     } else if (all(unique(as.character(clusters)) %in% c(gsub("ayer", "", names(colors)), NA))) {
         names(colors) <- gsub("ayer", "", names(colors))
     }
diff --git a/R/sort_clusters.R b/R/sort_clusters.R
index 7d906690..f4b8563c 100644
--- a/R/sort_clusters.R
+++ b/R/sort_clusters.R
@@ -1,15 +1,15 @@
 #' Sort clusters by frequency
 #'
-#' This function takes a vector with cluster labels and sorts it by frequency
-#' such that the most frequent cluster is the first one and so on.
+#' This function takes a vector with cluster labels, recasts it as a `factor()`,
+#' and sorts the `factor()` levels by frequency such that the most frequent
+#' cluster is the first level and so on.
 #'
 #' @param clusters A vector with cluster labels.
 #' @param map_subset A logical vector of length equal to `clusters` specifying
 #' which elements of `clusters` to use to determine the ranking of the clusters.
 #'
-#' @return A factor of length equal to `clusters` where the levels are the new
-#' ordered clusters and the names of the factor are the original values from
-#' `clusters`.
+#' @return A `factor()` version of `clusters` where the levels are ordered by
+#' frequency.
 #'
 #' @export
 #'
@@ -21,9 +21,49 @@
 #' ## In this case, it's a character vector
 #' class(clus)
 #'
-#' ## Sort them and obtain a factor
+#' ## We see that we have 10 elements in this vector, which is
+#' ## an unnamed character vector
+#' clus
+#'
+#' ## letter 'd' is the most frequent
+#' table(clus)
+#'
+#' ## Sort them and obtain a factor. Notice that the factor keeps the
+#' ## original values from the character vector; only the order of its
+#' ## levels is set by frequency.
 #' sort_clusters(clus)
+#'
+#' ## Since 'd' was the most frequent, it gets assigned to the first level
+#' ## in the factor variable.
+#' table(sort_clusters(clus))
+#'
+#' ## If we skip the first 3 values of clus (which are all 'd'), we can
+#' ## change the most frequent cluster, and thus the ordering of the
+#' ## factor levels.
+#' sort_clusters(clus, map_subset = seq_len(length(clus)) > 3)
+#'
+#' ## Let's try with a factor variable
+#' clus_factor <- factor(clus)
+#' ## sort_clusters() returns an identical result in this case
+#' stopifnot(identical(sort_clusters(clus), sort_clusters(clus_factor)))
+#'
+#' ## What happens if you have a logical variable with NAs?
+#' set.seed(20240712)
+#' log_var <- sample(c(TRUE, FALSE, NA),
+#'     1000,
+#'     replace = TRUE,
+#'     prob = c(0.3, 0.15, 0.55))
+#' ## Here, the NAs are the most frequent group.
+#' table(log_var, useNA = "ifany")
+#'
+#' ## The NAs are not used for sorting. Since we have more 'TRUE' than 'FALSE',
+#' ## 'TRUE' becomes the first level.
+#' table(sort_clusters(log_var), useNA = "ifany")
 sort_clusters <- function(clusters, map_subset = NULL) {
+    if (is.logical(clusters)) {
+        clusters <- as.character(clusters)
+    }
+
     if (is.null(map_subset)) {
         map_subset <- rep(TRUE, length(clusters))
     } else {
@@ -36,6 +76,5 @@ sort_clusters <- function(clusters, map_subset = NULL) {
     }
     map <-
         rank(length(clusters[map_subset]) - table(clusters[map_subset]), ties.method = "first")
-    res <- map[clusters]
-    factor(res)
+    factor(clusters, levels = names(sort(map)))
 }
diff --git a/man/get_colors.Rd b/man/get_colors.Rd
index ec9a4673..82d96a6d 100644
--- a/man/get_colors.Rd
+++ b/man/get_colors.Rd
@@ -39,4 +39,15 @@ get_colors(clusters = sce_layer$kmeans_k7)
 
 ## Example where Polychrome::palette36.colors() gets used
 get_colors(clusters = letters[seq_len(13)])
+
+## What happens if you have a logical variable with NAs?
+set.seed(20240712)
+log_var <- sample(c(TRUE, FALSE, NA),
+    1000,
+    replace = TRUE,
+    prob = c(0.3, 0.15, 0.55))
+log_var_sorted <- sort_clusters(log_var)
+## A color does get assigned to 'NA', but will be overwritten by
+## 'na_color' passed to `vis_clus_p()` and related functions.
+get_colors(colors = NULL, clusters = log_var_sorted)
 }
diff --git a/man/sort_clusters.Rd b/man/sort_clusters.Rd
index 93d05aa4..66ec783c 100644
--- a/man/sort_clusters.Rd
+++ b/man/sort_clusters.Rd
@@ -13,13 +13,13 @@ sort_clusters(clusters, map_subset = NULL)
 which elements of \code{clusters} to use to determine the ranking of the clusters.}
 }
 \value{
-A factor of length equal to \code{clusters} where the levels are the new
-ordered clusters and the names of the factor are the original values from
-\code{clusters}.
+A \code{factor()} version of \code{clusters} where the levels are ordered by
+frequency.
 }
 \description{
-This function takes a vector with cluster labels and sorts it by frequency
-such that the most frequent cluster is the first one and so on.
+This function takes a vector with cluster labels, recasts it as a \code{factor()},
+and sorts the \code{factor()} levels by frequency such that the most frequent
+cluster is the first level and so on.
 }
 \examples{
 
@@ -29,6 +29,42 @@ clus <- letters[unlist(lapply(4:1, function(x) rep(x, x)))]
 ## In this case, it's a character vector
 class(clus)
 
-## Sort them and obtain a factor
+## We see that we have 10 elements in this vector, which is
+## an unnamed character vector
+clus
+
+## letter 'd' is the most frequent
+table(clus)
+
+## Sort them and obtain a factor. Notice that the factor keeps the
+## original values from the character vector; only the order of its
+## levels is set by frequency.
 sort_clusters(clus)
+
+## Since 'd' was the most frequent, it gets assigned to the first level
+## in the factor variable.
+table(sort_clusters(clus))
+
+## If we skip the first 3 values of clus (which are all 'd'), we can
+## change the most frequent cluster, and thus the ordering of the
+## factor levels.
+sort_clusters(clus, map_subset = seq_len(length(clus)) > 3)
+
+## Let's try with a factor variable
+clus_factor <- factor(clus)
+## sort_clusters() returns an identical result in this case
+stopifnot(identical(sort_clusters(clus), sort_clusters(clus_factor)))
+
+## What happens if you have a logical variable with NAs?
+set.seed(20240712)
+log_var <- sample(c(TRUE, FALSE, NA),
+    1000,
+    replace = TRUE,
+    prob = c(0.3, 0.15, 0.55))
+## Here, the NAs are the most frequent group.
+table(log_var, useNA = "ifany")
+
+## The NAs are not used for sorting. Since we have more 'TRUE' than 'FALSE',
+## 'TRUE' becomes the first level.
+table(sort_clusters(log_var), useNA = "ifany")
 }