Moved the size factor calculation to a dedicated function named `add_…

…size_factors()` and updated vignettes and tests accordingly
almeidasilvaf · Feb 2, 2024 · a698140 · a698140
1 parent 81e81ae
commit a698140
Show file tree

Hide file tree

Showing 9 changed files with 165 additions and 62 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -47,14 +47,15 @@ biocViews:
     DifferentialExpression
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.0
 Imports:
     ggplot2,
     patchwork,
     rlang,
     DESeq2,
     SummarizedExperiment,
     stats,
+    methods,
     RColorBrewer,
     ComplexHeatmap,
     grDevices,

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export(add_midparent_expression)
+export(add_size_factors)
 export(expression_partitioning)
 export(get_deg_counts)
 export(get_deg_list)
@@ -46,6 +47,7 @@ importFrom(ggplot2,theme_void)
 importFrom(ggplot2,unit)
 importFrom(ggplot2,ylim)
 importFrom(grDevices,colorRampPalette)
+importFrom(methods,as)
 importFrom(patchwork,wrap_plots)
 importFrom(rlang,.data)
 importFrom(stats,as.formula)

diff --git a/R/01_data_processing.R b/R/01_data_processing.R
@@ -78,4 +78,63 @@ add_midparent_expression <- function(
     )
 
     return(new_se)
-}
+}
+
+
+#' Add size factors to normalize count data by library size or by biomass
+#'
+#' @param se A `SummarizedExperiment` object with a count matrix and sample
+#' metadata.
+#' @param spikein Logical indicating whether or not to normalize data
+#' using spike-ins. If FALSE, data will be normalized by library size.
+#' Default: FALSE.
+#' @param spikein_pattern Character with the pattern (regex) to use
+#' to identify spike-in features in the count matrix. Only valid 
+#' if \code{spikein_norm = TRUE}.
+#' 
+#' @return A `SummarizedExperiment` object as in \strong{se}, but with an extra
+#' column in the colData slot named "sizeFactor". This column contains size
+#' factors that will be used by DESeq2 when performing differential expression
+#' analyses.
+#'
+#' @importFrom DESeq2 DESeqDataSet estimateSizeFactors sizeFactors
+#' @importFrom methods as
+#' @export
+#' @rdname add_size_factors
+#' @examples
+#' data(se_chlamy)
+#' se_norm <- add_size_factors(se_chlamy)
+add_size_factors <- function(
+        se, spikein = FALSE, spikein_pattern = "ERCC"
+) {
+
+    # Create a DESeqDataSet with no design
+    deseq <- suppressMessages(DESeq2::DESeqDataSet(se, design = ~1))
+
+    # Estimate size factors
+    ## Option 1: Using library size
+    sf <- estimateSizeFactors(deseq)
+    sf <- DESeq2::sizeFactors(sf)
+
+    ## Option 2: Using spike-in controls
+    if(spikein) {
+        se_spikein <- se[grepl(spikein_pattern, rownames(se)), ]
+        deseq_spikein <- DESeqDataSet(se_spikein, design = ~1)
+        deseq_spikein <- estimateSizeFactors(deseq_spikein)
+
+        sf <- sizeFactors(deseq_spikein)
+
+        ## Remove rows with spike-in controls (no longer needed)
+        deseq <- deseq[!grepl(spikein_pattern, rownames(se)), ]
+    }
+
+    # Add size factors to colData of DESeqDataSet
+    DESeq2::sizeFactors(deseq) <- sf
+
+    # Create SummarizedExperiment object from DESeqDataSet
+    final_se <- as(deseq, "SummarizedExperiment")
+    rownames(final_se) <- rownames(deseq)
+
+    return(final_se)
+}
+
diff --git a/R/02_de_analyses.R b/R/02_de_analyses.R
@@ -16,11 +16,6 @@
 #' @param midparent Character indicating which level of the variable
 #' \strong{coldata_column} represents the midparent value. Default:
 #' "midparent", as returned by \code{add_midparent_expression()}.
-#' @param spikein_norm Logical indicating whether or not to normalize data
-#' using spike-ins. Default: FALSE.
-#' @param spikein_pattern Character with the pattern (regex) to use
-#' to identify spike-in features in the count matrix. Only valid 
-#' if \code{spikein_norm = TRUE}.
 #' @param lfcThreshold Numeric indicating the log2 fold-change threshold
 #' to use to consider differentially expressed genes. Default: 0.
 #' @param alpha Numeric indicating the adjusted P-value threshold to use
@@ -66,7 +61,6 @@ get_deg_list <- function(
         se, coldata_column = "Generation", 
         parent1 = "P1", parent2 = "P2", offspring = "F1", 
         midparent = "midparent",
-        spikein_norm = FALSE, spikein_pattern = "ERCC",
         lfcThreshold = 0,
         alpha = 0.01,
         ...
@@ -77,18 +71,6 @@ get_deg_list <- function(
     # Create DESeq object
     form <- as.formula(paste0("~", coldata_column))
     deseq <- suppressMessages(DESeq2::DESeqDataSet(se, design = form))
-    if(spikein_norm) {
-        spikein_rows <- grepl(spikein_pattern, rownames(se))
-        spikein_se <- se[spikein_rows, ]
-        spikein_deseq <- DESeqDataSet(spikein_se, design = form)
-        spikein_deseq <- estimateSizeFactors(spikein_deseq)
-
-        spikein_size_factors <- sizeFactors(spikein_deseq)
-
-        deseq <- deseq[!spikein_rows, ]
-        DESeq2::sizeFactors(deseq) <- spikein_size_factors
-        ngenes <- nrow(deseq)
-    }
 
     # Perform DGE
     res <- DESeq2::DESeq(deseq)

diff --git a/man/add_size_factors.Rd b/man/add_size_factors.Rd
diff --git a/man/get_deg_list.Rd b/man/get_deg_list.Rd
diff --git a/tests/testthat/test-data_processing.R b/tests/testthat/test-data_processing.R
@@ -18,3 +18,12 @@ test_that("add_midparent_expression() adds columns with midparent expression", {
     expect_error(add_midparent_expression(se_chlamy, method = "error"))
 })
 
+test_that("add_size_factors() adds a column named 'sizeFactor' for DESeq2", {
+
+    se_norm1 <- add_size_factors(se_chlamy, spikein = FALSE)
+    se_norm2 <- add_size_factors(se_chlamy, spikein = TRUE)
+
+    expect_true("sizeFactor" %in% colnames(colData(se_norm1)))
+    expect_true("sizeFactor" %in% colnames(colData(se_norm2)))
+    expect_true(!identical(se_norm1$sizeFactor, se_norm2$sizeFactor))
+})
diff --git a/tests/testthat/test-de_analyses.R b/tests/testthat/test-de_analyses.R
@@ -5,12 +5,13 @@ data(deg_list)
 
 ## SummarizedExperiment object with midparent expression
 se <- add_midparent_expression(se_chlamy)
+se <- add_size_factors(se, spikein = TRUE)
 
 # Start tests ----
 test_that("get_deg_list() and get_deg_counts() returns list and data frames", {
 
     # 1) Get list of DEGs
-    degs <- get_deg_list(se, spikein_norm = TRUE)
+    degs <- get_deg_list(se)
 
     # 2) Get data frame of DEG counts
     deg_counts <- get_deg_counts(degs) 

diff --git a/vignettes/HybridExpress.Rmd b/vignettes/HybridExpress.Rmd
@@ -51,7 +51,7 @@ By offering these capabilities, __HybridExpress__ provides a robust toolset for
 
 # Installation
 
-`r BiocStyle::Biocpkg("HybridExpress")` can be installed from Bioconductor
+`r BiocStyle::Githubpkg("HybridExpress")` can be installed from Bioconductor
 with the following code:
 
 ```{r installation, eval=FALSE}
@@ -147,6 +147,57 @@ add_midparent_expression(se_chlamy, method = "weightedmean", weights = w) |>
 We will proceed our analyses with the midparent expression values obtained
 from the mean of the counts, stored in the `se` object.
 
+# Normalizing count data
+
+To normalize count data, the function `add_size_factors()` calculates
+size factors (used by `r BiocStyle::Biocpkg("DESeq2")` 
+for normalization) and adds them as an extra column in the colData slot
+of your `SummarizedExperiment` object. Such size factors are calculated using
+one of two methods:
+
+1. By library size (default) using the 'median of
+ratios' method implemented in `r BiocStyle::Biocpkg("DESeq2")`.
+
+2. By cell size/biomass using spike-in controls (if available). If spike-in 
+controls are present in the count matrix, you can use them
+for normalization by setting `spikein = TRUE` and specifying the
+pattern used to indicate rows that contain spike-ins (usually they start
+with *ERCC*)[^1]. Normalization with spike-in controls is
+particularly useful if the amount of mRNA per cell is not equal
+between generations (due to, for instance, different ploidy levels, which
+in turn can lead to different cell sizes and/or biomass). 
+
+[^1]: **Note:** if you use spike-in normalization, the 
+function `add_size_factors()` automatically removes rows containing 
+spike-in counts from the `SummarizedExperiment` object after normalization. 
+However, if you have counts for spike-in controls in your count matrix,
+but do not want to use spike-in normalization, you should remove them 
+before creating the `SummarizedExperiment` object.
+
+In our example data set, spike-in controls are available 
+in the last rows of the count matrix. Let's take a look at them.
+
+```{r}
+# Show last rows of the count matrix
+assay(se) |> 
+    tail()
+```
+
+As you can see, rows with spike-in counts start with "ERCC". Let's add 
+size factors to our `SummarizedExperiment` object using 
+spike-in controls.
+
+```{r}
+# Take a look at the original colData
+colData(se)
+
+# Add size factors estimated from spike-in controls
+se <- add_size_factors(se, spikein = TRUE, spikein_pattern = "ERCC")
+
+# Take a look at the new colData
+colData(se)
+```
+
 # Exploratory data analyses
 
 Next, you can perform exploratory analyses of sample clustering to verify if
@@ -180,7 +231,7 @@ Now, let's plot the heatmap of sample correlations:
 
 ```{r}
 # Plot a heatmap of sample correlations
-plot_samplecor(se)
+plot_samplecor(se, coldata_cols = c("Generation", "Ploidy"))
 ```
 
 We can see that samples group well together both in the PCA plot and in
@@ -207,33 +258,13 @@ for the following contrasts:
 3. `F1_vs_P2`: hybrid (numerator) versus parent 2 (denominator).
 4. `F1_vs_midparent`: hybrid (numerator) vs midparent (denominator).
 
-By default, count data are normalized by library size using the standard
-normalization process in `r BiocStyle::Biocpkg("DESeq2")`. However,
-if spike-in standards are included in the count matrix, you can use them
-for normalization by setting `spikein_norm = TRUE` and specifying the
-pattern used to indicate rows that contain spike-ins (usually they start
-with *ERCC*)[^1]. In our example data set, spike-in standards are available 
-in the last rows of the count matrix.
-
-[^1]: **Note:** if you use spike-in normalization, the 
-function `get_deg_list()` automatically removes rows containing spike-in counts
-from the `SummarizedExperiment` object after normalization. However,
-if you have counts for spike-in standards in your count matrix,
-but do not want to use spike-in normalization, you should remove them 
-before creating the `SummarizedExperiment` object.
-
-```{r}
-# Show last rows of the count matrix
-assay(se) |> 
-    tail()
-```
-
-Now, we will use `get_deg_list()` to get differentially expressed genes (DEGs)
-for each contrast while using spike-in normalization.
+The size factors estimated with `add_size_factors()` are used for normalization
+before differential expression testing. Let's use `get_deg_list()` to get 
+differentially expressed genes (DEGs) for each contrast.
 
 ```{r}
 # Get a list of differentially expressed genes for each contrast
-deg_list <- get_deg_list(se, spikein_norm = TRUE, spikein_pattern = "ERCC")
+deg_list <- get_deg_list(se)
 
 # Inspecting the output
 ## Getting contrast names
@@ -338,7 +369,6 @@ in the hybrid is the same as in parent 1, but different from parent 2.
 in the hybrid is the same as in parent 2, but different from parent 1.
 
 
-
 ```{r}
 # Classify genes in expression partitions
 exp_partitions <- expression_partitioning(deg_list)
@@ -545,8 +575,3 @@ sessioninfo::session_info()
 # References {.unnumbered}
 
 
-
-
-
-
-