diff --git a/NAMESPACE b/NAMESPACE index eb5f242..f1f8743 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ export(classify_gene_pairs) export(classify_genes) +export(duplicates2counts) export(find_ks_peaks) export(get_anchors_list) export(get_intron_counts) @@ -10,7 +11,10 @@ export(get_tandem_proximal) export(get_transposed) export(get_transposed_classes) export(pairs2kaks) +export(plot_duplicate_freqs) +export(plot_ks_distro) export(plot_ks_peaks) +export(plot_rates_by_species) export(split_pairs_by_peak) importFrom(AnnotationDbi,select) importFrom(BiocParallel,SerialParam) @@ -20,15 +24,29 @@ importFrom(GenomicFeatures,intronsByTranscript) importFrom(GenomicRanges,GRangesList) importFrom(MSA2dist,dnastring2kaks) importFrom(ggplot2,aes) +importFrom(ggplot2,after_stat) +importFrom(ggplot2,element_blank) +importFrom(ggplot2,facet_grid) +importFrom(ggplot2,facet_wrap) +importFrom(ggplot2,geom_bar) +importFrom(ggplot2,geom_boxplot) +importFrom(ggplot2,geom_density) importFrom(ggplot2,geom_histogram) +importFrom(ggplot2,geom_violin) importFrom(ggplot2,geom_vline) importFrom(ggplot2,ggplot) importFrom(ggplot2,ggplot_build) importFrom(ggplot2,labs) +importFrom(ggplot2,scale_fill_manual) +importFrom(ggplot2,scale_x_continuous) +importFrom(ggplot2,scale_y_continuous) importFrom(ggplot2,stat_function) +importFrom(ggplot2,theme) importFrom(ggplot2,theme_bw) +importFrom(ggplot2,vars) importFrom(mclust,densityMclust) importFrom(rlang,.data) +importFrom(stats,density) importFrom(stats,dnorm) importFrom(syntenet,interspecies_synteny) importFrom(syntenet,intraspecies_synteny) diff --git a/R/data.R b/R/data.R index a05bae2..5b418b8 100644 --- a/R/data.R +++ b/R/data.R @@ -67,13 +67,16 @@ "cds_scerevisiae" -#' Duplicate pairs and Ka, Ks, and Ka/Ks values for S. cerevisiae +#' Duplicate pairs and Ka, Ks, and Ka/Ks values for fungi species #' #' This data set was obtained with \code{classify_gene_pairs()} followed #' by \code{pairs2kaks()}. #' -#' @name scerevisiae_kaks -#' @format A data frame with the following variables: +#' @name fungi_kaks +#' @format A list of data frame with elements +#' named \strong{saccharomyces_cerevisiae}, \strong{candida_glabrata}, +#' and \strong{schizosaccharomyces_pombe}. Each data frame contains +#' the following variables: #' \describe{ #' \item{dup1}{Character, duplicated gene 1.} #' \item{dup2}{Character, duplicated gene 2.} @@ -83,9 +86,9 @@ #' \item{type}{Character, mode of duplication} #' } #' @examples -#' data(scerevisiae_kaks) -#' @usage data(scerevisiae_kaks) -"scerevisiae_kaks" +#' data(fungi_kaks) +#' @usage data(fungi_kaks) +"fungi_kaks" #' Duplicate pairs and Ks values for Glycine max diff --git a/R/duplicate_classification.R b/R/duplicate_classification.R index e2651f3..15167d1 100644 --- a/R/duplicate_classification.R +++ b/R/duplicate_classification.R @@ -174,7 +174,8 @@ classify_gene_pairs <- function( #' @export #' @importFrom GenomicRanges GRangesList #' @examples -#' data(scerevisiae_kaks) +#' data(fungi_kaks) +#' scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae #' #' cols <- c("dup1", "dup2", "type") #' gene_pairs_list <- list(Scerevisiae = scerevisiae_kaks[, cols]) diff --git a/R/ka_ks_analyses.R b/R/ka_ks_analyses.R index 43642d3..b882058 100644 --- a/R/ka_ks_analyses.R +++ b/R/ka_ks_analyses.R @@ -133,7 +133,8 @@ pairs2kaks <- function( #' @export #' @rdname find_ks_peaks #' @examples -#' data(scerevisiae_kaks) +#' data(fungi_kaks) +#' scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae #' ks <- scerevisiae_kaks$Ks #' #' # Find 2 peaks in Ks distribution @@ -205,7 +206,8 @@ find_ks_peaks <- function(ks, npeaks = 2, min_ks = 0.01, max_ks = 4, #' @export #' @rdname split_pairs_by_peak #' @examples -#' data(scerevisiae_kaks) +#' data(fungi_kaks) +#' scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae #' #' # Create a data frame of duplicate pairs and Ks values #' ks_df <- scerevisiae_kaks[, c("dup1", "dup2", "Ks")] diff --git a/R/utils.R b/R/utils.R index a239f8a..36b77e9 100644 --- a/R/utils.R +++ b/R/utils.R @@ -197,7 +197,8 @@ get_intron_counts <- function(txdb) { #' @noRd #' @rdname find_intersect_mixtures #' @examples -#' data(scerevisiae_kaks) +#' data(fungi_kaks) +#' scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae #' ks <- scerevisiae_kaks$Ks #' #' # Find 2 peaks in Ks distribution @@ -239,4 +240,64 @@ find_intersect_mixtures <- function(peaks) { } +#' Get a duplicate count matrix for each genome +#' +#' @param duplicate_list A list of data frames with the duplicated genes or +#' gene pairs and their modes of duplication as returned +#' by \code{classify_gene_pairs()} or \code{classify_genes()}. +#' @param shape Character specifying the shape of the output data frame. +#' One of "long" (data frame in the long shape, in the tidyverse sense), +#' or "wide" (data frame in the wide shape, in the tidyverse sense). +#' Default: "long". +#' +#' @return If \strong{shape = "wide"}, a count matrix containing the +#' frequency of duplicated genes (or gene pairs) by mode for each species, +#' with species in rows and duplication modes in columns. +#' If \strong{shape = "long"}, a data frame in long format with the following +#' variables: +#' \describe{ +#' \item{type}{Factor, type of duplication.} +#' \item{n}{Numeric, number of duplicates.} +#' \item{species}{Character, species name} +#' } +#' +#' @export +#' @rdname duplicates2counts +#' @examples +#' data(fungi_kaks) +#' +#' # Get unique duplicates +#' duplicate_list <- classify_genes(fungi_kaks) +#' +#' # Get count table +#' counts <- duplicates2counts(duplicate_list) +duplicates2counts <- function(duplicate_list, shape = "long") { + + # Get factor levels for variable `type` + tlevels <- lapply(duplicate_list, function(x) return(levels(x$type))) + tlevels <- tlevels[[names(sort(lengths(tlevels), decreasing = TRUE)[1])]] + + counts <- Reduce(rbind, lapply(seq_along(duplicate_list), function(x) { + + species <- names(duplicate_list)[x] + + dup_table <- duplicate_list[[x]] + dup_table$type <- factor(dup_table$type, levels = tlevels) + + if(shape == "long") { + final_dups <- as.data.frame(table(dup_table$type)) + names(final_dups) <- c("type", "n") + final_dups$species <- species + } else if(shape == "wide") { + final_dups <- t(as.matrix(table(dup_table$type))) + final_dups <- cbind(species, as.data.frame(final_dups)) + } else { + stop("Argument 'format' must be one of 'long' or 'wide'.") + } + + return(final_dups) + })) + + return(counts) +} diff --git a/R/utils_duplicate_classification.R b/R/utils_duplicate_classification.R index cbcceea..e424d70 100644 --- a/R/utils_duplicate_classification.R +++ b/R/utils_duplicate_classification.R @@ -90,7 +90,8 @@ get_segmental <- function(anchor_pairs = NULL, pairs = NULL) { #' @examples #' data(yeast_annot) #' data(yeast_seq) -#' data(scerevisiae_kaks) +#' data(fungi_kaks) +#' scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae #' #' # Get processed annotation for S. cerevisiae #' pdata <- annotation <- syntenet::process_input(yeast_seq, yeast_annot) @@ -191,7 +192,8 @@ get_tandem_proximal <- function( #' data(diamond_intra) #' data(yeast_seq) #' data(yeast_annot) -#' data(scerevisiae_kaks) +#' data(fungi_kaks) +#' scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae #' #' # Get processed annotation #' pdata <- syntenet::process_input(yeast_seq, yeast_annot) @@ -307,7 +309,8 @@ get_transposed <- function( #' data(diamond_intra) #' data(yeast_seq) #' data(yeast_annot) -#' data(scerevisiae_kaks) +#' data(fungi_kaks) +#' scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae #' #' # Get processed annotation #' pdata <- syntenet::process_input(yeast_seq, yeast_annot) diff --git a/R/visualization.R b/R/visualization.R index 5718a23..9a582bf 100644 --- a/R/visualization.R +++ b/R/visualization.R @@ -1,4 +1,299 @@ +#' Create a named vector with a color palette for duplication modes +#' +#' @return A named character vector with colors for each duplication mode. +#' @noRd +#' +dup_palette <- function() { + + pal <- c( + All = "gray20", + SD = "#EFC000FF", + TD = "#CD534CFF", + PD = "#79AF97FF", + TRD = "#7AA6DCFF", + rTRD = "#7AA6DCFF", + dTRD = "#003C67FF", + DD = "#6A6599FF", + SSD = "#7AA6DCFF" + ) + + return(pal) +} + + +#' Plot frequency of duplicates per mode for each species +#' +#' @param dup_counts A data frame in long format with the number of +#' duplicates per mode for each species, as returned by +#' the function \code{duplicates2counts}. +#' @param plot_type Character indicating how to plot frequencies. One of +#' 'facet' (facets for each level of the variable \strong{type}), +#' 'stack' (levels of the variable \strong{type} as stacked bars), or +#' 'stack_percent' (levels of the variable \strong{type} as stacked bars, +#' with x-axis representing relative frequencies). Default: 'facet'. +#' @param remove_zero Logical indicating whether or not to remove rows +#' with zero values. Default: TRUE. +#' +#' @return A ggplot object. +#' +#' @importFrom ggplot2 ggplot aes geom_bar facet_wrap theme_bw theme labs +#' scale_fill_manual element_blank +#' @importFrom rlang .data +#' @export +#' @rdname plot_duplicate_freqs +#' @examples +#' data(fungi_kaks) +#' +#' # Get unique duplicates +#' duplicate_list <- classify_genes(fungi_kaks) +#' +#' # Get count table +#' dup_counts <- duplicates2counts(duplicate_list) +#' +#' # Plot counts +#' plot_duplicate_freqs(dup_counts, plot_type = "stack_percent") +plot_duplicate_freqs <- function( + dup_counts, plot_type = "facet", remove_zero = TRUE +) { + + # Define palette + pal <- dup_palette() + + # Remove zeros + if(remove_zero) { dup_counts <- dup_counts[dup_counts$n != 0, ] } + + if(plot_type == "facet") { + p <- ggplot(dup_counts, aes(x = .data$n, y = .data$species)) + + geom_bar( + aes(fill = .data$type), stat = "identity", color = "grey20", + show.legend = FALSE + ) + + facet_wrap("type", nrow = 1, scales = "free_x") + + labs(y = "", x = "Absolute frequency") + + } else if(plot_type == "stack") { + p <- ggplot( + dup_counts, aes(x = .data$n, y = .data$species, fill = .data$type) + ) + + geom_bar(color = "gray20", position = "stack", stat = "identity") + + labs(fill = "Type", y = "", x = "Absolute frequency") + + } else if(plot_type == "stack_percent") { + p <- ggplot( + dup_counts, aes(x = .data$n, y = .data$species, fill = .data$type) + ) + + geom_bar(position = "fill", stat = "identity", color = "gray20") + + labs(fill = "Type", y = "", x = "Relative frequency") + + } else { + stop("Input to argument 'plot_type' must be one of 'facet', 'stack', or 'stack_percent'.") + } + + p <- p + + scale_fill_manual(values = pal) + + theme_bw() + + theme(panel.grid = element_blank()) + + return(p) +} + + +#' Plot distribution of synonymous substitution rates (Ks) +#' +#' @param ks_df A data frame with Ks values for each gene pair +#' as returned by \code{pairs2kaks()}. +#' @param min_ks Numeric indicating the minimum Ks value to keep. +#' Default: 0.01. +#' @param max_ks Numeric indicating the maximum Ks value to keep. +#' Default: 2. +#' @param bytype Logical indicating whether or not to plot the distribution +#' by type of duplication (requires a column named `type`). +#' @param type_levels (Only valid if \strong{bytype} is not NULL) Character +#' indicating which levels of the variable specified in +#' parameter \strong{group_by} should be kept. By default, all levels are kept. +#' @param plot_type Character indicating the type of plot to create. +#' If \strong{bytype = TRUE}, possible types are "histogram" or "violin". +#' If \strong{bytype = FALSE}, possible types are "histogram", "density", +#' or "density_histogram". Default: "histogram". +#' @param binwidth (Only valid if \strong{plot_type = "histogram"}) +#' Numeric indicating the bin width. Default: 0.03. +#' +#' @return A ggplot object. +#' +#' @importFrom ggplot2 ggplot aes geom_density geom_histogram facet_grid +#' theme theme_bw geom_violin geom_boxplot scale_x_continuous vars +#' scale_y_continuous after_stat +#' @importFrom stats density +#' @export +#' @rdname plot_ks_distro +#' @examples +#' data(fungi_kaks) +#' ks_df <- fungi_kaks$saccharomyces_cerevisiae +#' +#' # Plot distro +#' plot_ks_distro(ks_df, bytype = TRUE) +plot_ks_distro <- function( + ks_df, min_ks = 0.01, max_ks = 2, + bytype = FALSE, type_levels = NULL, + plot_type = "histogram", + binwidth = 0.03 +) { + + pal <- dup_palette() + + # Filter Ks values + filt_ks <- ks_df[ks_df$Ks >= min_ks & ks_df$Ks <= max_ks, ] + filt_ks <- filt_ks[!is.na(filt_ks$Ks), ] + + if(bytype) { + # Add level for all combined + ks_all <- filt_ks + ks_all$type <- factor("All", levels = "All") + filt_ks <- rbind(ks_all, filt_ks) + + # Keep only desired levels (optional) + if(!is.null(type_levels)) { + filt_ks <- filt_ks[filt_ks$type %in% type_levels, ] + filt_ks$type <- droplevels(filt_ks$type) + } + + # Plot + if(plot_type == "histogram") { + p <- ggplot(filt_ks, aes(x = .data$Ks)) + + geom_histogram( + aes(fill = .data$type), + color = "gray30", binwidth = binwidth, show.legend = FALSE + ) + + scale_fill_manual(values = pal) + + facet_grid(rows = vars(.data$type), scales = "free_y") + + labs(y = "Count") + } else if(plot_type == "violin") { + p <- ggplot(filt_ks, aes(x = .data$Ks, y = .data$type)) + + geom_violin(aes(fill = .data$type), show.legend = FALSE) + + scale_fill_manual(values = pal) + + labs(y = "Type") + } else { + stop("When plotting by groups, `plot_type` must be either 'histogram' or 'violin'.") + } + p <- p + labs(title = "Ks distribution for gene pairs by mode") + } else { + + if(plot_type == "histogram") { + p <- ggplot(filt_ks, aes(x = .data$Ks)) + + geom_histogram( + fill = "#9196ca", color = "#3e57a7", binwidth = binwidth + ) + + labs(y = "Count") + } else if(plot_type == "density") { + p <- ggplot(filt_ks, aes(x = .data$Ks)) + + geom_density( + fill = "#9196ca", color = "#3e57a7" + ) + + labs(y = "Density") + } else if(plot_type == "density_histogram") { + p <- ggplot(filt_ks, aes(x = .data$Ks)) + + geom_histogram( + aes(y = after_stat(density)), alpha = 0.5, + fill = "#9196ca", color = "#3e57a7", binwidth = binwidth + ) + + geom_density(color = "gray30", linewidth = 1) + + labs(y = "Density") + } else { + stop("Without groups, `plot_type` must be one of 'histogram', 'density', or 'density_histogram'.") + } + p <- p + scale_y_continuous(expand = c(1e-2, 1e-2)) + + labs(title = "Ks distribution for gene pairs") + } + + # Polish plot + p <- p + + theme_bw() + + theme(panel.grid = element_blank()) + + scale_x_continuous(expand = c(1e-2, 1e-2)) + + labs(x = expression(K[s])) + + return(p) +} + + +#' Plot distributions of substitution rates (Ka, Ks, or Ka/Ks) per species +#' +#' @param kaks_list A list of data frames with substitution rates per gene +#' pair in each species as returned by \code{pairs2kaks()}. +#' @param rate_column Character indicating the name of the column to plot. +#' Default: "Ks". +#' @param bytype Logical indicating whether or not to show distributions by +#' type of duplication. Default: FALSE. +#' @param range Numeric vector of length 2 indicating the minimum and maximum +#' values to plot. Default: \code{c(0, 2)}. +#' @param fill Character with color to use for the fill aesthetic. Ignored +#' if \strong{bytype = TRUE}. Default: "deepskyblue3". +#' @param color Character with color to use for the color aesthetic. Ignored +#' if \strong{bytype = FALSE}. Default: "deepskyblue4". +#' +#' @return A ggplot object. +#' +#' @details +#' Data will be plotted using the species order of the list. To change the +#' order of the species to plot, reorder the list elements +#' in \strong{kaks_list}. +#' +#' @importFrom ggplot2 geom_violin scale_fill_manual facet_wrap +#' @export +#' @rdname plot_rates_by_species +#' @examples +#' data(fungi_kaks) +#' +#' # Plot rates +#' plot_rates_by_species(fungi_kaks, rate_column = "Ka_Ks") +plot_rates_by_species <- function( + kaks_list, rate_column = "Ks", bytype = FALSE, range = c(0, 2), + fill = "deepskyblue3", color = "deepskyblue4" +) { + + xl <- switch( + rate_column, + "Ks" = expression(K[s]), + "Ka" = expression(K[a]), + "Ka_Ks" = expression(K[a] / K[s]), + stop("Input to 'rate_column' must be one of 'Ka', 'Ks', or 'Ka_Ks'.") + ) + + # From list to data frame + rate_df <- Reduce(rbind, lapply(seq_along(kaks_list), function(x) { + + df <- kaks_list[[x]] + df$species <- names(kaks_list)[x] + + return(df) + })) + rate_df <- rate_df[!is.na(rate_df[[rate_column]]), ] + rate_df <- rate_df[rate_df[[rate_column]] >= range[1] & + rate_df[[rate_column]] <= range[2], ] + rate_df$species <- factor(rate_df$species, levels = names(kaks_list)) + + # Create plot + if(bytype) { + p <- ggplot(rate_df, aes(x = .data[[rate_column]], y = .data$species)) + + geom_violin(aes(fill = .data$type), show.legend = FALSE) + + scale_fill_manual(values = dup_palette()) + + facet_wrap("type", nrow = 1) + + } else { + p <- ggplot(rate_df, aes(x = .data[[rate_column]], y = .data$species)) + + geom_violin(fill = fill, color = color) + } + + # Polish the plot + p <- p + + theme_bw() + + labs(x = xl, y = "") + + theme(panel.grid = element_blank()) + + return(p) +} + #' Plot histogram of Ks distribution with peaks #' @@ -17,7 +312,8 @@ #' @rdname plot_ks_peaks #' @export #' @examples -#' data(scerevisiae_kaks) +#' data(fungi_kaks) +#' scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae #' ks <- scerevisiae_kaks$Ks #' #' # Find 2 peaks in Ks distribution @@ -49,6 +345,7 @@ plot_ks_peaks <- function(peaks = NULL, binwidth = 0.05) { n = length(ks_df$ks), binwidth = binwidth, color = pal) + theme_bw() + + theme(panel.grid = element_blank()) + labs(title = "Ks distribution with peaks", y = "Frequency", x = "Ks values") diff --git a/data/fungi_kaks.rda b/data/fungi_kaks.rda new file mode 100644 index 0000000..c0648f4 Binary files /dev/null and b/data/fungi_kaks.rda differ diff --git a/data/scerevisiae_kaks.rda b/data/scerevisiae_kaks.rda deleted file mode 100644 index 3605050..0000000 Binary files a/data/scerevisiae_kaks.rda and /dev/null differ diff --git a/man/classify_genes.Rd b/man/classify_genes.Rd index c718a24..d6dd3a3 100644 --- a/man/classify_genes.Rd +++ b/man/classify_genes.Rd @@ -28,7 +28,8 @@ For scheme "extended", the order is SD > TD > PD > TRD > DD. For scheme "full", the order is SD > TD > PD > rTRD > dTRD > DD. } \examples{ -data(scerevisiae_kaks) +data(fungi_kaks) +scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae cols <- c("dup1", "dup2", "type") gene_pairs_list <- list(Scerevisiae = scerevisiae_kaks[, cols]) diff --git a/man/duplicates2counts.Rd b/man/duplicates2counts.Rd new file mode 100644 index 0000000..72e0859 --- /dev/null +++ b/man/duplicates2counts.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{duplicates2counts} +\alias{duplicates2counts} +\title{Get a duplicate count matrix for each genome} +\usage{ +duplicates2counts(duplicate_list, shape = "long") +} +\arguments{ +\item{duplicate_list}{A list of data frames with the duplicated genes or +gene pairs and their modes of duplication as returned +by \code{classify_gene_pairs()} or \code{classify_genes()}.} + +\item{shape}{Character specifying the shape of the output data frame. +One of "long" (data frame in the long shape, in the tidyverse sense), +or "wide" (data frame in the wide shape, in the tidyverse sense). +Default: "long".} +} +\value{ +If \strong{shape = "wide"}, a count matrix containing the +frequency of duplicated genes (or gene pairs) by mode for each species, +with species in rows and duplication modes in columns. +If \strong{shape = "long"}, a data frame in long format with the following +variables: +\describe{ +\item{type}{Factor, type of duplication.} +\item{n}{Numeric, number of duplicates.} +\item{species}{Character, species name} +} +} +\description{ +Get a duplicate count matrix for each genome +} +\examples{ +data(fungi_kaks) + +# Get unique duplicates +duplicate_list <- classify_genes(fungi_kaks) + +# Get count table +counts <- duplicates2counts(duplicate_list) +} diff --git a/man/find_ks_peaks.Rd b/man/find_ks_peaks.Rd index 8e8fe5d..77b1576 100644 --- a/man/find_ks_peaks.Rd +++ b/man/find_ks_peaks.Rd @@ -41,7 +41,8 @@ arguments passed to min_ks and max_ks.} Find peaks in a Ks distribution with Gaussian Mixture Models } \examples{ -data(scerevisiae_kaks) +data(fungi_kaks) +scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae ks <- scerevisiae_kaks$Ks # Find 2 peaks in Ks distribution diff --git a/man/scerevisiae_kaks.Rd b/man/fungi_kaks.Rd similarity index 60% rename from man/scerevisiae_kaks.Rd rename to man/fungi_kaks.Rd index f5f43e3..b6be0f5 100644 --- a/man/scerevisiae_kaks.Rd +++ b/man/fungi_kaks.Rd @@ -1,11 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data.R \docType{data} -\name{scerevisiae_kaks} -\alias{scerevisiae_kaks} -\title{Duplicate pairs and Ka, Ks, and Ka/Ks values for S. cerevisiae} +\name{fungi_kaks} +\alias{fungi_kaks} +\title{Duplicate pairs and Ka, Ks, and Ka/Ks values for fungi species} \format{ -A data frame with the following variables: +A list of data frame with elements +named \strong{saccharomyces_cerevisiae}, \strong{candida_glabrata}, +and \strong{schizosaccharomyces_pombe}. Each data frame contains +the following variables: \describe{ \item{dup1}{Character, duplicated gene 1.} \item{dup2}{Character, duplicated gene 2.} @@ -16,13 +19,13 @@ A data frame with the following variables: } } \usage{ -data(scerevisiae_kaks) +data(fungi_kaks) } \description{ This data set was obtained with \code{classify_gene_pairs()} followed by \code{pairs2kaks()}. } \examples{ -data(scerevisiae_kaks) +data(fungi_kaks) } \keyword{datasets} diff --git a/man/get_tandem_proximal.Rd b/man/get_tandem_proximal.Rd index 3a24c5e..dc870e8 100644 --- a/man/get_tandem_proximal.Rd +++ b/man/get_tandem_proximal.Rd @@ -37,7 +37,8 @@ Classify gene pairs derived from tandem and proximal duplications \examples{ data(yeast_annot) data(yeast_seq) -data(scerevisiae_kaks) +data(fungi_kaks) +scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae # Get processed annotation for S. cerevisiae pdata <- annotation <- syntenet::process_input(yeast_seq, yeast_annot) diff --git a/man/get_transposed.Rd b/man/get_transposed.Rd index d6c068d..d060974 100644 --- a/man/get_transposed.Rd +++ b/man/get_transposed.Rd @@ -66,7 +66,8 @@ data(diamond_inter) data(diamond_intra) data(yeast_seq) data(yeast_annot) -data(scerevisiae_kaks) +data(fungi_kaks) +scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae # Get processed annotation pdata <- syntenet::process_input(yeast_seq, yeast_annot) diff --git a/man/get_transposed_classes.Rd b/man/get_transposed_classes.Rd index 1c2b770..58fbf4a 100644 --- a/man/get_transposed_classes.Rd +++ b/man/get_transposed_classes.Rd @@ -38,7 +38,8 @@ data(diamond_inter) data(diamond_intra) data(yeast_seq) data(yeast_annot) -data(scerevisiae_kaks) +data(fungi_kaks) +scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae # Get processed annotation pdata <- syntenet::process_input(yeast_seq, yeast_annot) diff --git a/man/plot_duplicate_freqs.Rd b/man/plot_duplicate_freqs.Rd new file mode 100644 index 0000000..0e4f34c --- /dev/null +++ b/man/plot_duplicate_freqs.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/visualization.R +\name{plot_duplicate_freqs} +\alias{plot_duplicate_freqs} +\title{Plot frequency of duplicates per mode for each species} +\usage{ +plot_duplicate_freqs(dup_counts, plot_type = "facet", remove_zero = TRUE) +} +\arguments{ +\item{dup_counts}{A data frame in long format with the number of +duplicates per mode for each species, as returned by +the function \code{duplicates2counts}.} + +\item{plot_type}{Character indicating how to plot frequencies. One of +'facet' (facets for each level of the variable \strong{type}), +'stack' (levels of the variable \strong{type} as stacked bars), or +'stack_percent' (levels of the variable \strong{type} as stacked bars, +with x-axis representing relative frequencies). Default: 'facet'.} + +\item{remove_zero}{Logical indicating whether or not to remove rows +with zero values. Default: TRUE.} +} +\value{ +A ggplot object. +} +\description{ +Plot frequency of duplicates per mode for each species +} +\examples{ +data(fungi_kaks) + +# Get unique duplicates +duplicate_list <- classify_genes(fungi_kaks) + +# Get count table +dup_counts <- duplicates2counts(duplicate_list) + +# Plot counts +plot_duplicate_freqs(dup_counts, plot_type = "stack_percent") +} diff --git a/man/plot_ks_distro.Rd b/man/plot_ks_distro.Rd new file mode 100644 index 0000000..7f21f5f --- /dev/null +++ b/man/plot_ks_distro.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/visualization.R +\name{plot_ks_distro} +\alias{plot_ks_distro} +\title{Plot distribution of synonymous substitution rates (Ks)} +\usage{ +plot_ks_distro( + ks_df, + min_ks = 0.01, + max_ks = 2, + bytype = FALSE, + type_levels = NULL, + plot_type = "histogram", + binwidth = 0.03 +) +} +\arguments{ +\item{ks_df}{A data frame with Ks values for each gene pair +as returned by \code{pairs2kaks()}.} + +\item{min_ks}{Numeric indicating the minimum Ks value to keep. +Default: 0.01.} + +\item{max_ks}{Numeric indicating the maximum Ks value to keep. +Default: 2.} + +\item{bytype}{Logical indicating whether or not to plot the distribution +by type of duplication (requires a column named \code{type}).} + +\item{type_levels}{(Only valid if \strong{bytype} is not NULL) Character +indicating which levels of the variable specified in +parameter \strong{group_by} should be kept. By default, all levels are kept.} + +\item{plot_type}{Character indicating the type of plot to create. +If \strong{bytype = TRUE}, possible types are "histogram" or "violin". +If \strong{bytype = FALSE}, possible types are "histogram", "density", +or "density_histogram". Default: "histogram".} + +\item{binwidth}{(Only valid if \strong{plot_type = "histogram"}) +Numeric indicating the bin width. Default: 0.03.} +} +\value{ +A ggplot object. +} +\description{ +Plot distribution of synonymous substitution rates (Ks) +} +\examples{ +data(fungi_kaks) +ks_df <- fungi_kaks$saccharomyces_cerevisiae + +# Plot distro +plot_ks_distro(ks_df, bytype = TRUE) +} diff --git a/man/plot_ks_peaks.Rd b/man/plot_ks_peaks.Rd index ab60022..243a4e2 100644 --- a/man/plot_ks_peaks.Rd +++ b/man/plot_ks_peaks.Rd @@ -21,7 +21,8 @@ A ggplot object with a histogram and lines for each Ks peak. Plot histogram of Ks distribution with peaks } \examples{ -data(scerevisiae_kaks) +data(fungi_kaks) +scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae ks <- scerevisiae_kaks$Ks # Find 2 peaks in Ks distribution diff --git a/man/plot_rates_by_species.Rd b/man/plot_rates_by_species.Rd new file mode 100644 index 0000000..9ac8f8f --- /dev/null +++ b/man/plot_rates_by_species.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/visualization.R +\name{plot_rates_by_species} +\alias{plot_rates_by_species} +\title{Plot distributions of substitution rates (Ka, Ks, or Ka/Ks) per species} +\usage{ +plot_rates_by_species( + kaks_list, + rate_column = "Ks", + bytype = FALSE, + range = c(0, 2), + fill = "deepskyblue3", + color = "deepskyblue4" +) +} +\arguments{ +\item{kaks_list}{A list of data frames with substitution rates per gene +pair in each species as returned by \code{pairs2kaks()}.} + +\item{rate_column}{Character indicating the name of the column to plot. +Default: "Ks".} + +\item{bytype}{Logical indicating whether or not to show distributions by +type of duplication. Default: FALSE.} + +\item{range}{Numeric vector of length 2 indicating the minimum and maximum +values to plot. Default: \code{c(0, 2)}.} + +\item{fill}{Character with color to use for the fill aesthetic. Ignored +if \strong{bytype = TRUE}. Default: "deepskyblue3".} + +\item{color}{Character with color to use for the color aesthetic. Ignored +if \strong{bytype = FALSE}. Default: "deepskyblue4".} +} +\value{ +A ggplot object. +} +\description{ +Plot distributions of substitution rates (Ka, Ks, or Ka/Ks) per species +} +\details{ +Data will be plotted using the species order of the list. To change the +order of the species to plot, reorder the list elements +in \strong{kaks_list}. +} +\examples{ +data(fungi_kaks) + +# Plot rates +plot_rates_by_species(fungi_kaks, rate_column = "Ka_Ks") +} diff --git a/man/split_pairs_by_peak.Rd b/man/split_pairs_by_peak.Rd index a5557f7..b4934e6 100644 --- a/man/split_pairs_by_peak.Rd +++ b/man/split_pairs_by_peak.Rd @@ -39,7 +39,8 @@ certain number of standard deviations from the highest peak, and older genes are found close within smaller peaks. } \examples{ -data(scerevisiae_kaks) +data(fungi_kaks) +scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae # Create a data frame of duplicate pairs and Ks values ks_df <- scerevisiae_kaks[, c("dup1", "dup2", "Ks")] diff --git a/tests/testthat/test-duplicate_classification.R b/tests/testthat/test-duplicate_classification.R index b7e89a2..99465ee 100644 --- a/tests/testthat/test-duplicate_classification.R +++ b/tests/testthat/test-duplicate_classification.R @@ -1,13 +1,15 @@ #----Load data------------------------------------------------------------------ data(diamond_intra) -data(scerevisiae_kaks) +data(fungi_kaks) data(diamond_inter) data(yeast_annot) data(yeast_seq) blast_list <- diamond_intra blast_inter <- diamond_inter +scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae + pdata <- syntenet::process_input(yeast_seq, yeast_annot) annotation <- pdata$annotation annotation_granges <- pdata$annotation[["Scerevisiae"]] diff --git a/tests/testthat/test-ka_ks_analyses.R b/tests/testthat/test-ka_ks_analyses.R index e581a23..2a398f7 100644 --- a/tests/testthat/test-ka_ks_analyses.R +++ b/tests/testthat/test-ka_ks_analyses.R @@ -1,6 +1,6 @@ #----Load data------------------------------------------------------------------ -data(scerevisiae_kaks) +data(fungi_kaks) data(diamond_intra) data(diamond_inter) data(yeast_annot) @@ -8,7 +8,9 @@ data(yeast_seq) data(cds_scerevisiae) blast_list <- diamond_intra blast_inter <- diamond_inter - + +scerevisiae_kaks <- fungi_kaks$saccharomyces_cerevisiae + pdata <- syntenet::process_input(yeast_seq, yeast_annot) annot <- pdata$annotation["Scerevisiae"] diff --git a/tests/testthat/test-visualization.R b/tests/testthat/test-visualization.R new file mode 100644 index 0000000..98d7c34 --- /dev/null +++ b/tests/testthat/test-visualization.R @@ -0,0 +1,78 @@ + +# Load data ---- +data(fungi_kaks) + +# Start tests ---- +test_that("duplicates2counts() returns counts", { + + duplicate_list <- classify_genes(fungi_kaks) + + # Get count table + d1 <- duplicates2counts(duplicate_list, shape = "wide") + d2 <- duplicates2counts(duplicate_list, shape = "long") + + expect_true(is.data.frame(d1)) + expect_true(is.data.frame(d2)) + + expect_error(duplicates2counts(duplicate_list, shape = "error")) +}) + + +test_that("plot_duplicate_freqs() returns a ggplot object", { + + duplicate_list <- classify_genes(fungi_kaks) + + # Get count table + dup_counts <- duplicates2counts(duplicate_list) + + # Plot + p1 <- plot_duplicate_freqs(dup_counts, plot_type = "facet") + p2 <- plot_duplicate_freqs(dup_counts, plot_type = "stack") + p3 <- plot_duplicate_freqs(dup_counts, plot_type = "stack_percent") + + expect_true("ggplot" %in% class(p1)) + expect_true("ggplot" %in% class(p2)) + expect_true("ggplot" %in% class(p3)) + + expect_error(plot_duplicate_freqs(dup_counts, plot_type = "error")) +}) + + +test_that("plot_ks_distro() returns a ggplot object", { + + df <- fungi_kaks$saccharomyces_cerevisiae + + p1 <- plot_ks_distro(df, bytype = TRUE) + p2 <- plot_ks_distro( + df, bytype = TRUE, plot_type = "violin", type_levels = c("SD", "All") + ) + p3 <- plot_ks_distro(df, bytype = FALSE, plot_type = "histogram") + p4 <- plot_ks_distro(df, bytype = FALSE, plot_type = "density") + p5 <- plot_ks_distro(df, bytype = FALSE, plot_type = "density_histogram") + + expect_true("ggplot" %in% class(p1)) + expect_true("ggplot" %in% class(p2)) + expect_true("ggplot" %in% class(p3)) + expect_true("ggplot" %in% class(p4)) + expect_true("ggplot" %in% class(p5)) + + expect_error(plot_ks_distro(df, bytype = TRUE, plot_type = "error")) + expect_error(plot_ks_distro(df, bytype = FALSE, plot_type = "error")) +}) + + +test_that("plot_rates_by_species() returns a ggplot object", { + + p1 <- plot_rates_by_species(fungi_kaks, rate_column = "Ka_Ks") + p2 <- plot_rates_by_species(fungi_kaks, rate_column = "Ks", bytype = TRUE) + p3 <- plot_rates_by_species(fungi_kaks, rate_column = "Ka") + + expect_true("ggplot" %in% class(p1)) + expect_true("ggplot" %in% class(p2)) + expect_true("ggplot" %in% class(p3)) + + expect_error(plot_rates_by_species(fungi_kaks, rate_column = "Kw")) +}) + + + diff --git a/vignettes/doubletrouble_vignette.Rmd b/vignettes/doubletrouble_vignette.Rmd index 00ae8d9..ed0b645 100644 --- a/vignettes/doubletrouble_vignette.Rmd +++ b/vignettes/doubletrouble_vignette.Rmd @@ -37,7 +37,7 @@ help you explore the different contributions of WGD and SSD to evolution, we developed `r BiocStyle::Githubpkg("doubletrouble")`, a package that can be used to identify and classify duplicated genes from whole-genome protein sequences, calculate substitution rates per substitution site (i.e., -Ka and Ks) for gene pairs, find peaks in Ks distributions, and classify +$K_a$ and $K_s$) for gene pairs, find peaks in $K_s$ distributions, and classify gene pairs by age groups. # Installation @@ -436,18 +436,18 @@ head(c_genes$Scerevisiae) table(c_genes$Scerevisiae$type) ``` -# Calculating Ka, Ks, and Ka/Ks for duplicated gene pairs +# Calculating substitution rates for duplicated gene pairs You can use the function `pairs2kaks()` to calculate rates of nonsynonymous -substitutions per nonsynonymous site (Ka), synonymouys substitutions per -synonymous site (Ks), and their ratios (Ka/Ks). These rates are calculated +substitutions per nonsynonymous site ($K_a$), synonymouys substitutions per +synonymous site ($K_s$), and their ratios ($K_a/K_s$). These rates are calculated using the Bioconductor package `r BiocStyle::Biocpkg("MSA2dist")`, which implements all codon models in KaKs_Calculator 2.0 [@wang2010kaks_calculator]. -For the purpose of demonstration, we will only calculate Ka, Ks, and Ka/Ks -for 5 WGD-derived gene pairs. The CDS for WGD-derived genes were obtained -from Ensembl Fungi [@yates2022ensembl], and +For the purpose of demonstration, we will only calculate $K_a$, $K_s$, +and $K_a/K_s$ for 5 SD-derived gene pairs. The CDS for SD-derived +genes were obtained from Ensembl Fungi [@yates2022ensembl], and they are stored in `cds_scerevisiae`. ```{r kaks_calculation} @@ -468,35 +468,37 @@ head(kaks) ``` -# Identifying and visualizing Ks peaks +# Identifying and visualizing $K_s$ peaks -Peaks in Ks distributions typically indicate whole-genome duplication (WGD) +Peaks in $K_s$ distributions typically indicate whole-genome duplication (WGD) events, and they can be identified by fitting Gaussian mixture models (GMMs) to -Ks distributions. In `r BiocStyle::Githubpkg("doubletrouble")`, this can be +$K_s$ distributions. In `r BiocStyle::Githubpkg("doubletrouble")`, this can be performed with the function `find_ks_peaks()`. -However, because of saturation at higher Ks values, only **recent WGD** -events can be reliably identified from Ks +However, because of saturation at higher $K_s$ values, only **recent WGD** +events can be reliably identified from $K_s$ distributions [@vanneste2013inference]. Recent WGD events are commonly found in plant species, such as maize, soybean, apple, etc. Although the genomes of yeast species have signatures of WGD, these events are ancient, so it is very hard to find evidence for them -using Ks distributions. [^4] +using $K_s$ distributions. [^4] [^4]: **Tip:** You might be asking yourself: "How does one identify ancient WGD, then?". A common approach is to look for syntenic blocks (i.e., regions with conserved gene content and order) within genomes. This is what -`classify_gene_pairs()` does under the hood to find WGD-derived gene pairs. +`classify_gene_pairs()` does under the hood to find SD-derived gene pairs. -To demonstrate how you can find peaks in Ks distributions -with `find_ks_peaks()`, we will use a data frame containing Ks values for +To demonstrate how you can find peaks in $K_s$ distributions +with `find_ks_peaks()`, we will use a data frame containing $K_s$ values for duplicate pairs in the soybean (*Glycine max*) genome, which has undergone 2 WGDs events ~13 and ~58 million years ago [@schmutz2010genome]. -Then, we will visualize Ks distributions with peaks using the function +Then, we will visualize $K_s$ distributions with peaks using the function `plot_ks_peaks()`. -First of all, let's look at the data and have a quick look at the distribution. +First of all, let's look at the data and have a quick look at the distribution +with the function `plot_ks_distro()` (more details on this function in the +data visualization section). ```{r ks_eda} # Load data and inspect it @@ -504,16 +506,13 @@ data(gmax_ks) head(gmax_ks) # Plot distribution -library(ggplot2) -ggplot(gmax_ks, aes(x = Ks)) + - geom_histogram(color = "black", fill = "grey90", bins = 50) + - theme_bw() +plot_ks_distro(gmax_ks) ``` By visual inspection, we can see 2 or 3 peaks. Based on our prior knowledge, we know that 2 WGD events have occurred in the ancestral of the *Glycine* genus and in the ancestral of all Fabaceae, which seem to correspond to the -peaks we see at Ks values around 0.1 and 0.5, respectively. There could be +peaks we see at $K_s$ values around 0.1 and 0.5, respectively. There could be a third, flattened peak at around 1.6, which would represent the WGD shared by all eudicots. Let's test which number of peaks has more support: 2 or 3. @@ -529,7 +528,7 @@ plot_ks_peaks(peaks) As we can see, the presence of 3 peaks is more supported (lowest BIC). The function returns a list with the mean, variance and amplitude -of mixture components (i.e., peaks), as well as the Ks distribution itself. +of mixture components (i.e., peaks), as well as the $K_s$ distribution itself. Now, suppose you just want to get the first 2 peaks. You can do that by explictly saying to `find_ks_peaks()` how many peaks there are. @@ -540,9 +539,9 @@ peaks <- find_ks_peaks(gmax_ks$Ks, npeaks = 2, max_ks = 1) plot_ks_peaks(peaks) ``` -**Important consideration on GMMs and Ks distributions:** +**Important consideration on GMMs and $K_s$ distributions:** Peaks identified with GMMs should not be blindly regarded as "the truth". -Using GMMs to find peaks in Ks distributions can lead to problems such as +Using GMMs to find peaks in $K_s$ distributions can lead to problems such as overfitting and overclustering [@tiley2018assessing]. Some general recommendations are: @@ -574,12 +573,12 @@ of the curve, which support the 2 peaks we found. # Classifying genes by age groups Finally, you can use the peaks you obtained before to classify gene pairs -by age group. Age groups are defined based on the Ks peak to which pairs belong. +by age group. Age groups are defined based on the $K_s$ peak to which pairs belong. This is useful if you want to analyze duplicate pairs from a specific WGD event, for instance. You can do this with the function `split_pairs_by_peak()`. This function returns a list containing the classified pairs in a data frame, and a ggplot object with the -age boundaries highlighted in the histogram of Ks values. +age boundaries highlighted in the histogram of $K_s$ values. ```{r split_by_peak} # Gene pairs without age-based classification @@ -598,6 +597,108 @@ head(pairs_age_group$pairs) pairs_age_group$plot ``` +# Data visualization + +Last but not least, `r BiocStyle::Biocpkg("doubletrouble")` provides users +with graphical functions to produce publication-ready plots from the output +of `classify_gene_pairs()`, `classify_genes()`, and `pairs2kaks()`. +Let's take a look at them one by one. + +## Visualizing the frequency of duplicates per mode + +To visualize the frequency of duplicated gene pairs or genes by duplication +type (as returned by `classify_gene_pairs()` and `classify_genes()`, +respectively), you will first need to create a data frame of counts with +`duplicates2counts()`. To demonstrate how this works, we will use an +example data set with duplicate pairs for 3 fungi species (and substitution +rates, which will be ignored by `duplicates2counts()`). + +```{r} +# Load data set with pre-computed duplicates for 3 fungi species +data(fungi_kaks) +names(fungi_kaks) +head(fungi_kaks$saccharomyces_cerevisiae) + +# Get a data frame of counts per mode in all species +counts_table <- duplicates2counts(fungi_kaks |> classify_genes()) + +counts_table +``` + +Now, let's visualize the frequency of duplicate gene pairs by duplication +type with the function `plot_duplicate_freqs()`. You can visualize frequencies +in three different ways, as demonstrated below. + +```{r} +# 1) Facets +plot_duplicate_freqs(counts_table) + +# 2) Stacked barplot, absolute frequencies +plot_duplicate_freqs(counts_table, plot_type = "stack") + +# 3) Stacked barplot, relative frequencies +plot_duplicate_freqs(counts_table, plot_type = "stack_percent") +``` + +If you want to visually the frequency of duplicated **genes** (not gene pairs), +you'd first need to classify genes into unique modes of duplication +with `classify_genes()`, and then repeat the code above. For example: + +```{r} +# Frequency of duplicated genes by mode +classify_genes(fungi_kaks) |> # classify genes into unique duplication types + duplicates2counts() |> # get a data frame of counts (long format) + plot_duplicate_freqs() # plot frequencies +``` + +## Visualizing $K_s$ distributions + +As briefly demonstrated before, to plot a $K_s$ distribution for the +whole paranome, you will use the function `plot_ks_distro()`. + +```{r} +ks_df <- fungi_kaks$saccharomyces_cerevisiae + +# 1) Histogram, whole paranome +plot_ks_distro(ks_df, plot_type = "histogram") + +# 2) Density, whole paranome +plot_ks_distro(ks_df, plot_type = "density") + +# 3) Histogram with density lines, whole paranome +plot_ks_distro(ks_df, plot_type = "density_histogram") +``` + +However, visualizing the distribution for the whole paranome can mask patterns +that only happen for duplicates originating from particular duplication types. +For instance, when looking for evidence of WGD events, +visualizing the $K_s$ distribution for SD-derived pairs only can reveal +whether syntenic genes cluster together, suggesting the presence of WGD history. +To visualize the distribution by duplication type, use `bytype = TRUE` in +`plot_ks_distro()`. + +```{r} +# 1) Duplicates by type, histogram +plot_ks_distro(ks_df, bytype = TRUE, plot_type = "histogram") + +# 2) Duplicates by type, violin +plot_ks_distro(ks_df, bytype = TRUE, plot_type = "violin") +``` + +## Visualizing substitution rates by species + +The function `plot_rates_by_species()` can be used to show distributions of +substitution rates ($K_s$, $K_a$, or their ratio $K_a/K_s$) by species. +You can choose which rate you want to visualize, and whether or not to +group gene pairs by duplication mode, as demonstrated below. + +```{r} +# Ks for each species +plot_rates_by_species(fungi_kaks) + +# Ka/Ks by duplication type for each species +plot_rates_by_species(fungi_kaks, rate_column = "Ka_Ks", bytype = TRUE) +``` # Session information {.unnumbered}