From 08c2b2d80a29b1decee578d01f7230b595a9bf5a Mon Sep 17 00:00:00 2001 From: lcolladotor Date: Fri, 12 Jul 2024 10:56:10 -0400 Subject: [PATCH] Resolve #80 Part of this work was done live during today's LIBD rstats club. Although I didn't find the solution to this bug during that time. --- NEWS.md | 11 +++++++++ R/get_colors.R | 18 ++++++++++++++- R/sort_clusters.R | 55 +++++++++++++++++++++++++++++++++++++------- man/get_colors.Rd | 11 +++++++++ man/sort_clusters.Rd | 48 +++++++++++++++++++++++++++++++++----- 5 files changed, 128 insertions(+), 15 deletions(-) diff --git a/NEWS.md b/NEWS.md index 328c9750..0d9e48e1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,14 @@ +# spatialLIBD 1.17.6 + +BUG FIXES + +* Fixed the bug reported by @lahuuki about `vis_grid_clus()` not handling +`logical()` cluster variables. +See <https://github.com/LieberInstitute/spatialLIBD/issues/80>. To resolve this, +`sort_clusters()` and `get_colors()` had to change internally. Examples and +documentation for both functions have now been updated to showcase what happens +when you provide a `logical()` vector as an input. + # spatialLIBD 1.17.5 NEW FEATURES diff --git a/R/get_colors.R b/R/get_colors.R index 7d601839..93b53351 100644 --- a/R/get_colors.R +++ b/R/get_colors.R @@ -31,6 +31,17 @@ #' #' ## Example where Polychrome::palette36.colors() gets used #' get_colors(clusters = letters[seq_len(13)]) +#' +#' ## What happens if you have a logical variable with NAs? +#' set.seed(20240712) +#' log_var <- sample(c(TRUE, FALSE, NA), +#' 1000, +#' replace = TRUE, +#' prob = c(0.3, 0.15, 0.55)) +#' log_var_sorted <- sort_clusters(log_var) +#' ## A color does get assigned to 'NA', but will be overwritten by +#' ## 'na_color' passed to `vis_clus_p()` and related functions. 
+#' get_colors(colors = NULL, clusters = log_var_sorted) get_colors <- function(colors = NULL, clusters) { n_clus <- length(unique(clusters)) @@ -67,7 +78,12 @@ get_colors <- function(colors = NULL, clusters) { "purple" ) } - names(colors) <- seq_len(length(colors)) + ## Subset to the actual number of values if we are working with < 12 + colors <- colors[seq_len(n_clus)] + + ## Set the names of the colors in a way compatible with how names + ## are set in vis_clus_p(). + names(colors) <- levels(factor(clusters)) } else if (all(unique(as.character(clusters)) %in% c(gsub("ayer", "", names(colors)), NA))) { names(colors) <- gsub("ayer", "", names(colors)) } diff --git a/R/sort_clusters.R b/R/sort_clusters.R index 7d906690..f4b8563c 100644 --- a/R/sort_clusters.R +++ b/R/sort_clusters.R @@ -1,15 +1,15 @@ #' Sort clusters by frequency #' -#' This function takes a vector with cluster labels and sorts it by frequency -#' such that the most frequent cluster is the first one and so on. +#' This function takes a vector with cluster labels, recasts it as a `factor()`, +#' and sorts the `factor()` levels by frequency such that the most frequent +#' cluster is the first level and so on. #' #' @param clusters A vector with cluster labels. #' @param map_subset A logical vector of length equal to `clusters` specifying #' which elements of `clusters` to use to determine the ranking of the clusters. #' -#' @return A factor of length equal to `clusters` where the levels are the new -#' ordered clusters and the names of the factor are the original values from -#' `clusters`. +#' @return A `factor()` version of `clusters` where the levels are ordered by +#' frequency. 
#' #' @export #' @@ -21,9 +21,49 @@ #' ## In this case, it's a character vector #' class(clus) #' -#' ## Sort them and obtain a factor +#' ## We see that we have 10 elements in this vector, which is +#' ## an unnamed character vector +#' clus +#' +#' ## letter 'd' is the most frequent +#' table(clus) +#' +#' ## Sort them and obtain a factor. Notice that the factor keeps +#' ## the original values from the character vector, and that its +#' ## levels are ordered from most to least frequent. #' sort_clusters(clus) +#' +#' ## Since 'd' was the most frequent, it gets assigned to the first level +#' ## in the factor variable. +#' table(sort_clusters(clus)) +#' +#' ## If we skip the first 3 values of clus (which are all 'd'), we can +#' ## change the most frequent cluster, and thus the ordering of the +#' ## factor levels. +#' sort_clusters(clus, map_subset = seq_len(length(clus)) > 3) +#' +#' ## Let's try with a factor variable +#' clus_factor <- factor(clus) +#' ## sort_clusters() returns an identical result in this case +#' stopifnot(identical(sort_clusters(clus), sort_clusters(clus_factor))) +#' +#' ## What happens if you have a logical variable with NAs? +#' set.seed(20240712) +#' log_var <- sample(c(TRUE, FALSE, NA), +#' 1000, +#' replace = TRUE, +#' prob = c(0.3, 0.15, 0.55)) +#' ## Here, the NAs are the most frequent group. +#' table(log_var, useNA = "ifany") +#' +#' ## The NAs are not used for sorting. Since we have more 'TRUE' than 'FALSE' +#' ## values, 'TRUE' becomes the first level. 
+#' table(sort_clusters(log_var), useNA = "ifany") sort_clusters <- function(clusters, map_subset = NULL) { + if (is.logical(clusters)) { + clusters <- as.character(clusters) + } + if (is.null(map_subset)) { map_subset <- rep(TRUE, length(clusters)) } else { @@ -36,6 +76,5 @@ sort_clusters <- function(clusters, map_subset = NULL) { } map <- rank(length(clusters[map_subset]) - table(clusters[map_subset]), ties.method = "first") - res <- map[clusters] - factor(res) + factor(clusters, levels = names(sort(map))) } diff --git a/man/get_colors.Rd b/man/get_colors.Rd index ec9a4673..82d96a6d 100644 --- a/man/get_colors.Rd +++ b/man/get_colors.Rd @@ -39,4 +39,15 @@ get_colors(clusters = sce_layer$kmeans_k7) ## Example where Polychrome::palette36.colors() gets used get_colors(clusters = letters[seq_len(13)]) + +## What happens if you have a logical variable with NAs? +set.seed(20240712) +log_var <- sample(c(TRUE, FALSE, NA), + 1000, + replace = TRUE, + prob = c(0.3, 0.15, 0.55)) +log_var_sorted <- sort_clusters(log_var) +## A color does get assigned to 'NA', but will be overwritten by +## 'na_color' passed to `vis_clus_p()` and related functions. +get_colors(colors = NULL, clusters = log_var_sorted) } diff --git a/man/sort_clusters.Rd b/man/sort_clusters.Rd index 93d05aa4..66ec783c 100644 --- a/man/sort_clusters.Rd +++ b/man/sort_clusters.Rd @@ -13,13 +13,13 @@ sort_clusters(clusters, map_subset = NULL) which elements of \code{clusters} to use to determine the ranking of the clusters.} } \value{ -A factor of length equal to \code{clusters} where the levels are the new -ordered clusters and the names of the factor are the original values from -\code{clusters}. +A \code{factor()} version of \code{clusters} where the levels are ordered by +frequency. } \description{ -This function takes a vector with cluster labels and sorts it by frequency -such that the most frequent cluster is the first one and so on. 
+This function takes a vector with cluster labels, recasts it as a \code{factor()}, +and sorts the \code{factor()} levels by frequency such that the most frequent +cluster is the first level and so on. } \examples{ @@ -29,6 +29,42 @@ clus <- letters[unlist(lapply(4:1, function(x) rep(x, x)))] ## In this case, it's a character vector class(clus) -## Sort them and obtain a factor +## We see that we have 10 elements in this vector, which is +## an unnamed character vector +clus + +## letter 'd' is the most frequent +table(clus) + +## Sort them and obtain a factor. Notice that the factor keeps +## the original values from the character vector, and that its +## levels are ordered from most to least frequent. sort_clusters(clus) + +## Since 'd' was the most frequent, it gets assigned to the first level +## in the factor variable. +table(sort_clusters(clus)) + +## If we skip the first 3 values of clus (which are all 'd'), we can +## change the most frequent cluster, and thus the ordering of the +## factor levels. +sort_clusters(clus, map_subset = seq_len(length(clus)) > 3) + +## Let's try with a factor variable +clus_factor <- factor(clus) +## sort_clusters() returns an identical result in this case +stopifnot(identical(sort_clusters(clus), sort_clusters(clus_factor))) + +## What happens if you have a logical variable with NAs? +set.seed(20240712) +log_var <- sample(c(TRUE, FALSE, NA), + 1000, + replace = TRUE, + prob = c(0.3, 0.15, 0.55)) +## Here, the NAs are the most frequent group. +table(log_var, useNA = "ifany") + +## The NAs are not used for sorting. Since we have more 'TRUE' than 'FALSE' +## values, 'TRUE' becomes the first level. +table(sort_clusters(log_var), useNA = "ifany") }