Added visualization functions

almeidasilvaf · Feb 3, 2024 · 16ac02e · 16ac02e
1 parent e409176
commit 16ac02e
Show file tree

Hide file tree

Showing 25 changed files with 823 additions and 58 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -2,6 +2,7 @@
 
 export(classify_gene_pairs)
 export(classify_genes)
+export(duplicates2counts)
 export(find_ks_peaks)
 export(get_anchors_list)
 export(get_intron_counts)
@@ -10,7 +11,10 @@ export(get_tandem_proximal)
 export(get_transposed)
 export(get_transposed_classes)
 export(pairs2kaks)
+export(plot_duplicate_freqs)
+export(plot_ks_distro)
 export(plot_ks_peaks)
+export(plot_rates_by_species)
 export(split_pairs_by_peak)
 importFrom(AnnotationDbi,select)
 importFrom(BiocParallel,SerialParam)
@@ -20,15 +24,29 @@ importFrom(GenomicFeatures,intronsByTranscript)
 importFrom(GenomicRanges,GRangesList)
 importFrom(MSA2dist,dnastring2kaks)
 importFrom(ggplot2,aes)
+importFrom(ggplot2,after_stat)
+importFrom(ggplot2,element_blank)
+importFrom(ggplot2,facet_grid)
+importFrom(ggplot2,facet_wrap)
+importFrom(ggplot2,geom_bar)
+importFrom(ggplot2,geom_boxplot)
+importFrom(ggplot2,geom_density)
 importFrom(ggplot2,geom_histogram)
+importFrom(ggplot2,geom_violin)
 importFrom(ggplot2,geom_vline)
 importFrom(ggplot2,ggplot)
 importFrom(ggplot2,ggplot_build)
 importFrom(ggplot2,labs)
+importFrom(ggplot2,scale_fill_manual)
+importFrom(ggplot2,scale_x_continuous)
+importFrom(ggplot2,scale_y_continuous)
 importFrom(ggplot2,stat_function)
+importFrom(ggplot2,theme)
 importFrom(ggplot2,theme_bw)
+importFrom(ggplot2,vars)
 importFrom(mclust,densityMclust)
 importFrom(rlang,.data)
+importFrom(stats,density)
 importFrom(stats,dnorm)
 importFrom(syntenet,interspecies_synteny)
 importFrom(syntenet,intraspecies_synteny)

diff --git a/R/data.R b/R/data.R
@@ -67,13 +67,16 @@
 "cds_scerevisiae"
 
 
-#' Duplicate pairs and Ka, Ks, and Ka/Ks values for S. cerevisiae
+#' Duplicate pairs and Ka, Ks, and Ka/Ks values for fungi species
 #'
 #' This data set was obtained with \code{classify_gene_pairs()} followed
 #' by \code{pairs2kaks()}.
 #' 
-#' @name scerevisiae_kaks
-#' @format A data frame with the following variables:
+#' @name fungi_kaks
+#' @format A list of data frames with elements 
+#' named \strong{saccharomyces_cerevisiae}, \strong{candida_glabrata},
+#' and \strong{schizosaccharomyces_pombe}. Each data frame contains 
+#' the following variables:
 #' \describe{
 #'   \item{dup1}{Character, duplicated gene 1.}
 #'   \item{dup2}{Character, duplicated gene 2.}
@@ -83,9 +86,9 @@
 #'   \item{type}{Character, mode of duplication}
 #' }
 #' @examples 
-#' data(scerevisiae_kaks)
-#' @usage data(scerevisiae_kaks)
-"scerevisiae_kaks"
+#' data(fungi_kaks)
+#' @usage data(fungi_kaks)
+"fungi_kaks"
 
 
 #' Duplicate pairs and Ks values for Glycine max

diff --git a/R/duplicate_classification.R b/R/duplicate_classification.R
@@ -174,7 +174,8 @@ classify_gene_pairs <- function(
 #' @export
 #' @importFrom GenomicRanges GRangesList
 #' @examples
-#' data(scerevisiae_kaks)
+#' data(fungi_kaks)
+#' scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae
 #' 
 #' cols <- c("dup1", "dup2", "type")
 #' gene_pairs_list <- list(Scerevisiae = scerevisiae_kaks[, cols])

diff --git a/R/ka_ks_analyses.R b/R/ka_ks_analyses.R
@@ -133,7 +133,8 @@ pairs2kaks <- function(
 #' @export
 #' @rdname find_ks_peaks
 #' @examples 
-#' data(scerevisiae_kaks)
+#' data(fungi_kaks)
+#' scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae
 #' ks <- scerevisiae_kaks$Ks
 #' 
 #' # Find 2 peaks in Ks distribution
@@ -205,7 +206,8 @@ find_ks_peaks <- function(ks, npeaks = 2, min_ks = 0.01, max_ks = 4,
 #' @export
 #' @rdname split_pairs_by_peak
 #' @examples
-#' data(scerevisiae_kaks)
+#' data(fungi_kaks)
+#' scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae
 #'
 #' # Create a data frame of duplicate pairs and Ks values
 #' ks_df <- scerevisiae_kaks[, c("dup1", "dup2", "Ks")]

diff --git a/R/utils.R b/R/utils.R
@@ -197,7 +197,8 @@ get_intron_counts <- function(txdb) {
 #' @noRd
 #' @rdname find_intersect_mixtures
 #' @examples
-#' data(scerevisiae_kaks)
+#' data(fungi_kaks)
+#' scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae
 #' ks <- scerevisiae_kaks$Ks
 #' 
 #' # Find 2 peaks in Ks distribution
@@ -239,4 +240,64 @@ find_intersect_mixtures <- function(peaks) {
 }
 
 
+#' Get a duplicate count matrix for each genome
+#'
+#' @param duplicate_list A list of data frames with the duplicated genes or
+#' gene pairs and their modes of duplication as returned 
+#' by \code{classify_gene_pairs()} or \code{classify_genes()}.
+#' @param shape Character specifying the shape of the output data frame.
+#' One of "long" (data frame in the long shape, in the tidyverse sense),
+#' or "wide" (data frame in the wide shape, in the tidyverse sense).
+#' Default: "long".
+#' 
+#' @return If \strong{shape = "wide"}, a data frame of counts containing the 
+#' frequency of duplicated genes (or gene pairs) by mode for each species, 
+#' with one row per species (identified in column \strong{species}) and
+#' one column per duplication mode.
+#' If \strong{shape = "long"}, a data frame in long format with the following
+#' variables:
+#' \describe{
+#'   \item{type}{Factor, type of duplication.}
+#'   \item{n}{Numeric, number of duplicates.}
+#'   \item{species}{Character, species name}
+#' }
+#' 
+#' @export
+#' @rdname duplicates2counts
+#' @examples
+#' data(fungi_kaks)
+#' 
+#' # Get unique duplicates
+#' duplicate_list <- classify_genes(fungi_kaks)
+#' 
+#' # Get count table
+#' counts <- duplicates2counts(duplicate_list)
+duplicates2counts <- function(duplicate_list, shape = "long") {
+
+    # Validate arguments once, before any work is done, so that problems
+    # are reported with a clear message instead of a subscript error
+    if(!is.character(shape) || length(shape) != 1 ||
+       !shape %in% c("long", "wide")) {
+        stop("Argument 'shape' must be one of 'long' or 'wide'.")
+    }
+    if(length(duplicate_list) == 0) {
+        stop("Argument 'duplicate_list' must be a non-empty list.")
+    }
+
+    # Get factor levels for variable `type`: take the largest set of levels
+    # found in any element, so that every species is tabulated against the
+    # same levels. The element is selected by position (first maximum), so
+    # unnamed lists and lists with duplicated names are handled correctly.
+    tlevels <- lapply(duplicate_list, function(x) levels(x$type))
+    tlevels <- tlevels[[which.max(lengths(tlevels))]]
+
+    # Build one count table per species
+    counts <- lapply(seq_along(duplicate_list), function(x) {
+
+        species <- names(duplicate_list)[x]
+
+        # Re-level `type` so that modes absent in this species get count 0
+        dup_table <- duplicate_list[[x]]
+        dup_table$type <- factor(dup_table$type, levels = tlevels)
+
+        if(shape == "long") {
+            # One row per duplication mode: type, n, species
+            final_dups <- as.data.frame(table(dup_table$type))
+            names(final_dups) <- c("type", "n")
+            final_dups$species <- species
+        } else {
+            # One row per species, one column per duplication mode
+            final_dups <- t(as.matrix(table(dup_table$type)))
+            final_dups <- cbind(species, as.data.frame(final_dups))
+        }
+
+        return(final_dups)
+    })
+
+    # Stack the per-species tables in a single rbind() call
+    counts <- do.call(rbind, counts)
+
+    return(counts)
+}
 
diff --git a/R/utils_duplicate_classification.R b/R/utils_duplicate_classification.R
@@ -90,7 +90,8 @@ get_segmental <- function(anchor_pairs = NULL, pairs = NULL) {
 #' @examples
 #' data(yeast_annot)
 #' data(yeast_seq)
-#' data(scerevisiae_kaks)
+#' data(fungi_kaks)
+#' scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae
 #' 
 #' # Get processed annotation for S. cerevisiae
 #' pdata <- annotation <- syntenet::process_input(yeast_seq, yeast_annot)
@@ -191,7 +192,8 @@ get_tandem_proximal <- function(
 #' data(diamond_intra)
 #' data(yeast_seq)
 #' data(yeast_annot)
-#' data(scerevisiae_kaks)
+#' data(fungi_kaks)
+#' scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae
 #' 
 #' # Get processed annotation
 #' pdata <- syntenet::process_input(yeast_seq, yeast_annot)
@@ -307,7 +309,8 @@ get_transposed <- function(
 #' data(diamond_intra)
 #' data(yeast_seq)
 #' data(yeast_annot)
-#' data(scerevisiae_kaks)
+#' data(fungi_kaks)
+#' scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae
 #' 
 #' # Get processed annotation
 #' pdata <- syntenet::process_input(yeast_seq, yeast_annot)