diff --git a/DESCRIPTION b/DESCRIPTION index 60c4cc4..ce957f7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -47,7 +47,7 @@ biocViews: DifferentialExpression Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.0 Imports: ggplot2, patchwork, @@ -55,6 +55,7 @@ Imports: DESeq2, SummarizedExperiment, stats, + methods, RColorBrewer, ComplexHeatmap, grDevices, diff --git a/NAMESPACE b/NAMESPACE index 94c9d9f..df0a9d8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand export(add_midparent_expression) +export(add_size_factors) export(expression_partitioning) export(get_deg_counts) export(get_deg_list) @@ -46,6 +47,7 @@ importFrom(ggplot2,theme_void) importFrom(ggplot2,unit) importFrom(ggplot2,ylim) importFrom(grDevices,colorRampPalette) +importFrom(methods,as) importFrom(patchwork,wrap_plots) importFrom(rlang,.data) importFrom(stats,as.formula) diff --git a/R/01_data_processing.R b/R/01_data_processing.R index 0131f67..1e11f8c 100644 --- a/R/01_data_processing.R +++ b/R/01_data_processing.R @@ -78,4 +78,63 @@ add_midparent_expression <- function( ) return(new_se) -} \ No newline at end of file +} + + +#' Add size factors to normalize count data by library size or by biomass +#' +#' @param se A `SummarizedExperiment` object with a count matrix and sample +#' metadata. +#' @param spikein Logical indicating whether or not to normalize data +#' using spike-ins. If FALSE, data will be normalized by library size. +#' Default: FALSE. +#' @param spikein_pattern Character with the pattern (regex) to use +#' to identify spike-in features in the count matrix. Only valid +#' if \code{spikein_norm = TRUE}. +#' +#' @return A `SummarizedExperiment` object as in \strong{se}, but with an extra +#' column in the colData slot named "sizeFactor". This column contains size +#' factors that will be used by DESeq2 when performing differential expression +#' analyses. +#' +#' @importFrom DESeq2 DESeqDataSet estimateSizeFactors sizeFactors +#' @importFrom methods as +#' @export +#' @rdname add_size_factors +#' @examples +#' data(se_chlamy) +#' se_norm <- add_size_factors(se_chlamy) +add_size_factors <- function( + se, spikein = FALSE, spikein_pattern = "ERCC" +) { + + # Create a DESeqDataSet with no design + deseq <- suppressMessages(DESeq2::DESeqDataSet(se, design = ~1)) + + # Estimate size factors + ## Option 1: Using library size + sf <- estimateSizeFactors(deseq) + sf <- DESeq2::sizeFactors(sf) + + ## Option 2: Using spike-in controls + if(spikein) { + se_spikein <- se[grepl(spikein_pattern, rownames(se)), ] + deseq_spikein <- DESeqDataSet(se_spikein, design = ~1) + deseq_spikein <- estimateSizeFactors(deseq_spikein) + + sf <- sizeFactors(deseq_spikein) + + ## Remove rows with spike-in controls (no longer needed) + deseq <- deseq[!grepl(spikein_pattern, rownames(se)), ] + } + + # Add size factors to colData of DESeqDataSet + DESeq2::sizeFactors(deseq) <- sf + + # Create SummarizedExperiment object from DESeqDataSet + final_se <- as(deseq, "SummarizedExperiment") + rownames(final_se) <- rownames(deseq) + + return(final_se) +} + diff --git a/R/02_de_analyses.R b/R/02_de_analyses.R index 726ce16..fe160d3 100644 --- a/R/02_de_analyses.R +++ b/R/02_de_analyses.R @@ -16,11 +16,6 @@ #' @param midparent Character indicating which level of the variable #' \strong{coldata_column} represents the midparent value. Default: #' "midparent", as returned by \code{add_midparent_expression()}. -#' @param spikein_norm Logical indicating whether or not to normalize data -#' using spike-ins. Default: FALSE. -#' @param spikein_pattern Character with the pattern (regex) to use -#' to identify spike-in features in the count matrix. Only valid -#' if \code{spikein_norm = TRUE}. #' @param lfcThreshold Numeric indicating the log2 fold-change threshold #' to use to consider differentially expressed genes. Default: 0. #' @param alpha Numeric indicating the adjusted P-value threshold to use @@ -66,7 +61,6 @@ get_deg_list <- function( se, coldata_column = "Generation", parent1 = "P1", parent2 = "P2", offspring = "F1", midparent = "midparent", - spikein_norm = FALSE, spikein_pattern = "ERCC", lfcThreshold = 0, alpha = 0.01, ... @@ -77,18 +71,6 @@ get_deg_list <- function( # Create DESeq object form <- as.formula(paste0("~", coldata_column)) deseq <- suppressMessages(DESeq2::DESeqDataSet(se, design = form)) - if(spikein_norm) { - spikein_rows <- grepl(spikein_pattern, rownames(se)) - spikein_se <- se[spikein_rows, ] - spikein_deseq <- DESeqDataSet(spikein_se, design = form) - spikein_deseq <- estimateSizeFactors(spikein_deseq) - - spikein_size_factors <- sizeFactors(spikein_deseq) - - deseq <- deseq[!spikein_rows, ] - DESeq2::sizeFactors(deseq) <- spikein_size_factors - ngenes <- nrow(deseq) - } # Perform DGE res <- DESeq2::DESeq(deseq) diff --git a/man/add_size_factors.Rd b/man/add_size_factors.Rd new file mode 100644 index 0000000..b30a3a1 --- /dev/null +++ b/man/add_size_factors.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/01_data_processing.R +\name{add_size_factors} +\alias{add_size_factors} +\title{Add size factors to normalize count data by library size or by biomass} +\usage{ +add_size_factors(se, spikein = FALSE, spikein_pattern = "ERCC") +} +\arguments{ +\item{se}{A \code{SummarizedExperiment} object with a count matrix and sample +metadata.} + +\item{spikein}{Logical indicating whether or not to normalize data +using spike-ins. If FALSE, data will be normalized by library size. +Default: FALSE.} + +\item{spikein_pattern}{Character with the pattern (regex) to use +to identify spike-in features in the count matrix. Only valid +if \code{spikein_norm = TRUE}.} +} +\value{ +A \code{SummarizedExperiment} object as in \strong{se}, but with an extra +column in the colData slot named "sizeFactor". This column contains size +factors that will be used by DESeq2 when performing differential expression +analyses. +} +\description{ +Add size factors to normalize count data by library size or by biomass +} +\examples{ +data(se_chlamy) +se_norm <- add_size_factors(se_chlamy) +} diff --git a/man/get_deg_list.Rd b/man/get_deg_list.Rd index 83e2bc5..7e60728 100644 --- a/man/get_deg_list.Rd +++ b/man/get_deg_list.Rd @@ -11,8 +11,6 @@ get_deg_list( parent2 = "P2", offspring = "F1", midparent = "midparent", - spikein_norm = FALSE, - spikein_pattern = "ERCC", lfcThreshold = 0, alpha = 0.01, ... @@ -40,13 +38,6 @@ or allopolyploid). Default: "F1"} \strong{coldata_column} represents the midparent value. Default: "midparent", as returned by \code{add_midparent_expression()}.} -\item{spikein_norm}{Logical indicating whether or not to normalize data -using spike-ins. Default: FALSE.} - -\item{spikein_pattern}{Character with the pattern (regex) to use -to identify spike-in features in the count matrix. Only valid -if \code{spikein_norm = TRUE}.} - \item{lfcThreshold}{Numeric indicating the log2 fold-change threshold to use to consider differentially expressed genes. Default: 0.} diff --git a/tests/testthat/test-data_processing.R b/tests/testthat/test-data_processing.R index 94e9983..7f88dfa 100644 --- a/tests/testthat/test-data_processing.R +++ b/tests/testthat/test-data_processing.R @@ -18,3 +18,12 @@ test_that("add_midparent_expression() adds columns with midparent expression", { expect_error(add_midparent_expression(se_chlamy, method = "error")) }) +test_that("add_size_factors() adds a column named 'sizeFactor' for DESeq2", { + + se_norm1 <- add_size_factors(se_chlamy, spikein = FALSE) + se_norm2 <- add_size_factors(se_chlamy, spikein = TRUE) + + expect_true("sizeFactor" %in% colnames(colData(se_norm1))) + expect_true("sizeFactor" %in% colnames(colData(se_norm2))) + expect_true(!identical(se_norm1$sizeFactor, se_norm2$sizeFactor)) +}) diff --git a/tests/testthat/test-de_analyses.R b/tests/testthat/test-de_analyses.R index 79ac8f5..6c83d53 100644 --- a/tests/testthat/test-de_analyses.R +++ b/tests/testthat/test-de_analyses.R @@ -5,12 +5,13 @@ data(deg_list) ## SummarizedExperiment object with midparent expression se <- add_midparent_expression(se_chlamy) +se <- add_size_factors(se, spikein = TRUE) # Start tests ---- test_that("get_deg_list() and get_deg_counts() returns list and data frames", { # 1) Get list of DEGs - degs <- get_deg_list(se, spikein_norm = TRUE) + degs <- get_deg_list(se) # 2) Get data frame of DEG counts deg_counts <- get_deg_counts(degs) diff --git a/vignettes/HybridExpress.Rmd b/vignettes/HybridExpress.Rmd index 5b76d9e..4e7c315 100644 --- a/vignettes/HybridExpress.Rmd +++ b/vignettes/HybridExpress.Rmd @@ -51,7 +51,7 @@ By offering these capabilities, __HybridExpress__ provides a robust toolset for # Installation -`r BiocStyle::Biocpkg("HybridExpress")` can be installed from Bioconductor +`r BiocStyle::Githubpkg("HybridExpress")` can be installed from Bioconductor with the following code: ```{r installation, eval=FALSE} @@ -147,6 +147,57 @@ add_midparent_expression(se_chlamy, method = "weightedmean", weights = w) |> We will proceed our analyses with the midparent expression values obtained from the mean of the counts, stored in the `se` object. +# Normalizing count data + +To normalize count data, the function `add_size_factors()` calculates +size factors (used by `r BiocStyle::Biocpkg("DESeq2")` +for normalization) and adds them as an extra column in the colData slot +of your `SummarizedExperiment` object. Such size factors are calculated using +one of two methods: + +1. By library size (default) using the 'median of +ratios' method implemented in `r BiocStyle::Biocpkg("DESeq2")`. + +2. By cell size/biomass using spike-in controls (if available). If spike-in +controls are present in the count matrix, you can use them +for normalization by setting `spikein = TRUE` and specifying the +pattern used to indicate rows that contain spike-ins (usually they start +with *ERCC*)[^1]. Normalization with spike-in controls is +particularly useful if the amount of mRNA per cell is not equal +between generations (due to, for instance, different ploidy levels, which +in turn can lead to different cell sizes and/or biomass). + +[^1]: **Note:** if you use spike-in normalization, the +function `add_size_factors()` automatically removes rows containing +spike-in counts from the `SummarizedExperiment` object after normalization. +However, if you have counts for spike-in controls in your count matrix, +but do not want to use spike-in normalization, you should remove them +before creating the `SummarizedExperiment` object. + +In our example data set, spike-in controls are available +in the last rows of the count matrix. Let's take a look at them. + +```{r} +# Show last rows of the count matrix +assay(se) |> + tail() +``` + +As you can see, rows with spike-in counts start with "ERCC". Let's add +size factors to our `SummarizedExperiment` object using +spike-in controls. + +```{r} +# Take a look at the original colData +colData(se) + +# Add size factors estimated from spike-in controls +se <- add_size_factors(se, spikein = TRUE, spikein_pattern = "ERCC") + +# Take a look at the new colData +colData(se) +``` + # Exploratory data analyses Next, you can perform exploratory analyses of sample clustering to verify if @@ -180,7 +231,7 @@ Now, let's plot the heatmap of sample correlations: ```{r} # Plot a heatmap of sample correlations -plot_samplecor(se) +plot_samplecor(se, coldata_cols = c("Generation", "Ploidy")) ``` We can see that samples group well together both in the PCA plot and in @@ -207,33 +258,13 @@ for the following contrasts: 3. `F1_vs_P2`: hybrid (numerator) versus parent 2 (denominator). 4. `F1_vs_midparent`: hybrid (numerator) vs midparent (denominator). -By default, count data are normalized by library size using the standard -normalization process in `r BiocStyle::Biocpkg("DESeq2")`. However, -if spike-in standards are included in the count matrix, you can use them -for normalization by setting `spikein_norm = TRUE` and specifying the -pattern used to indicate rows that contain spike-ins (usually they start -with *ERCC*)[^1]. In our example data set, spike-in standards are available -in the last rows of the count matrix. - -[^1]: **Note:** if you use spike-in normalization, the -function `get_deg_list()` automatically removes rows containing spike-in counts -from the `SummarizedExperiment` object after normalization. However, -if you have counts for spike-in standards in your count matrix, -but do not want to use spike-in normalization, you should remove them -before creating the `SummarizedExperiment` object. - -```{r} -# Show last rows of the count matrix -assay(se) |> - tail() -``` - -Now, we will use `get_deg_list()` to get differentially expressed genes (DEGs) -for each contrast while using spike-in normalization. +The size factors estimated with `add_size_factors()` are used for normalization +before differential expression testing. Let's use `get_deg_list()` to get +differentially expressed genes (DEGs) for each contrast. ```{r} # Get a list of differentially expressed genes for each contrast -deg_list <- get_deg_list(se, spikein_norm = TRUE, spikein_pattern = "ERCC") +deg_list <- get_deg_list(se) # Inspecting the output ## Getting contrast names @@ -338,7 +369,6 @@ in the hybrid is the same as in parent 1, but different from parent 2. in the hybrid is the same as in parent 2, but different from parent 1. - ```{r} # Classify genes in expression partitions exp_partitions <- expression_partitioning(deg_list) @@ -545,8 +575,3 @@ sessioninfo::session_info() # References {.unnumbered} - - - - -