Skip to content

Commit

Permalink
Moved the size factor calculation to a dedicated function named `add_…
Browse files Browse the repository at this point in the history
…size_factors()` and updated vignettes and tests accordingly
  • Loading branch information
almeidasilvaf committed Feb 2, 2024
1 parent 81e81ae commit a698140
Show file tree
Hide file tree
Showing 9 changed files with 165 additions and 62 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,15 @@ biocViews:
DifferentialExpression
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
RoxygenNote: 7.3.0
Imports:
ggplot2,
patchwork,
rlang,
DESeq2,
SummarizedExperiment,
stats,
methods,
RColorBrewer,
ComplexHeatmap,
grDevices,
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Generated by roxygen2: do not edit by hand

export(add_midparent_expression)
export(add_size_factors)
export(expression_partitioning)
export(get_deg_counts)
export(get_deg_list)
Expand Down Expand Up @@ -46,6 +47,7 @@ importFrom(ggplot2,theme_void)
importFrom(ggplot2,unit)
importFrom(ggplot2,ylim)
importFrom(grDevices,colorRampPalette)
importFrom(methods,as)
importFrom(patchwork,wrap_plots)
importFrom(rlang,.data)
importFrom(stats,as.formula)
Expand Down
61 changes: 60 additions & 1 deletion R/01_data_processing.R
Original file line number Diff line number Diff line change
Expand Up @@ -78,4 +78,63 @@ add_midparent_expression <- function(
)

return(new_se)
}
}


#' Add size factors to normalize count data by library size or by biomass
#'
#' @param se A `SummarizedExperiment` object with a count matrix and sample
#' metadata.
#' @param spikein Logical indicating whether or not to normalize data
#' using spike-ins. If FALSE, data will be normalized by library size.
#' Default: FALSE.
#' @param spikein_pattern Character with the pattern (regex) to use
#' to identify spike-in features in the count matrix. Only valid
#' if \code{spikein_norm = TRUE}.
#'
#' @return A `SummarizedExperiment` object as in \strong{se}, but with an extra
#' column in the colData slot named "sizeFactor". This column contains size
#' factors that will be used by DESeq2 when performing differential expression
#' analyses.
#'
#' @importFrom DESeq2 DESeqDataSet estimateSizeFactors sizeFactors
#' @importFrom methods as
#' @export
#' @rdname add_size_factors
#' @examples
#' data(se_chlamy)
#' se_norm <- add_size_factors(se_chlamy)
add_size_factors <- function(
se, spikein = FALSE, spikein_pattern = "ERCC"
) {

# Create a DESeqDataSet with no design
deseq <- suppressMessages(DESeq2::DESeqDataSet(se, design = ~1))

# Estimate size factors
## Option 1: Using library size
sf <- estimateSizeFactors(deseq)
sf <- DESeq2::sizeFactors(sf)

## Option 2: Using spike-in controls
if(spikein) {
se_spikein <- se[grepl(spikein_pattern, rownames(se)), ]
deseq_spikein <- DESeqDataSet(se_spikein, design = ~1)
deseq_spikein <- estimateSizeFactors(deseq_spikein)

sf <- sizeFactors(deseq_spikein)

## Remove rows with spike-in controls (no longer needed)
deseq <- deseq[!grepl(spikein_pattern, rownames(se)), ]
}

# Add size factors to colData of DESeqDataSet
DESeq2::sizeFactors(deseq) <- sf

# Create SummarizedExperiment object from DESeqDataSet
final_se <- as(deseq, "SummarizedExperiment")
rownames(final_se) <- rownames(deseq)

return(final_se)
}

18 changes: 0 additions & 18 deletions R/02_de_analyses.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,6 @@
#' @param midparent Character indicating which level of the variable
#' \strong{coldata_column} represents the midparent value. Default:
#' "midparent", as returned by \code{add_midparent_expression()}.
#' @param spikein_norm Logical indicating whether or not to normalize data
#' using spike-ins. Default: FALSE.
#' @param spikein_pattern Character with the pattern (regex) to use
#' to identify spike-in features in the count matrix. Only valid
#' if \code{spikein_norm = TRUE}.
#' @param lfcThreshold Numeric indicating the log2 fold-change threshold
#' to use to consider differentially expressed genes. Default: 0.
#' @param alpha Numeric indicating the adjusted P-value threshold to use
Expand Down Expand Up @@ -66,7 +61,6 @@ get_deg_list <- function(
se, coldata_column = "Generation",
parent1 = "P1", parent2 = "P2", offspring = "F1",
midparent = "midparent",
spikein_norm = FALSE, spikein_pattern = "ERCC",
lfcThreshold = 0,
alpha = 0.01,
...
Expand All @@ -77,18 +71,6 @@ get_deg_list <- function(
# Create DESeq object
form <- as.formula(paste0("~", coldata_column))
deseq <- suppressMessages(DESeq2::DESeqDataSet(se, design = form))
if(spikein_norm) {
spikein_rows <- grepl(spikein_pattern, rownames(se))
spikein_se <- se[spikein_rows, ]
spikein_deseq <- DESeqDataSet(spikein_se, design = form)
spikein_deseq <- estimateSizeFactors(spikein_deseq)

spikein_size_factors <- sizeFactors(spikein_deseq)

deseq <- deseq[!spikein_rows, ]
DESeq2::sizeFactors(deseq) <- spikein_size_factors
ngenes <- nrow(deseq)
}

# Perform DGE
res <- DESeq2::DESeq(deseq)
Expand Down
33 changes: 33 additions & 0 deletions man/add_size_factors.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 0 additions & 9 deletions man/get_deg_list.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions tests/testthat/test-data_processing.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,12 @@ test_that("add_midparent_expression() adds columns with midparent expression", {
expect_error(add_midparent_expression(se_chlamy, method = "error"))
})

test_that("add_size_factors() adds a column named 'sizeFactor' for DESeq2", {

se_norm1 <- add_size_factors(se_chlamy, spikein = FALSE)
se_norm2 <- add_size_factors(se_chlamy, spikein = TRUE)

expect_true("sizeFactor" %in% colnames(colData(se_norm1)))
expect_true("sizeFactor" %in% colnames(colData(se_norm2)))
expect_true(!identical(se_norm1$sizeFactor, se_norm2$sizeFactor))
})
3 changes: 2 additions & 1 deletion tests/testthat/test-de_analyses.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@ data(deg_list)

## SummarizedExperiment object with midparent expression
se <- add_midparent_expression(se_chlamy)
se <- add_size_factors(se, spikein = TRUE)

# Start tests ----
test_that("get_deg_list() and get_deg_counts() returns list and data frames", {

# 1) Get list of DEGs
degs <- get_deg_list(se, spikein_norm = TRUE)
degs <- get_deg_list(se)

# 2) Get data frame of DEG counts
deg_counts <- get_deg_counts(degs)
Expand Down
89 changes: 57 additions & 32 deletions vignettes/HybridExpress.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ By offering these capabilities, __HybridExpress__ provides a robust toolset for

# Installation

`r BiocStyle::Biocpkg("HybridExpress")` can be installed from Bioconductor
`r BiocStyle::Githubpkg("HybridExpress")` can be installed from Bioconductor
with the following code:

```{r installation, eval=FALSE}
Expand Down Expand Up @@ -147,6 +147,57 @@ add_midparent_expression(se_chlamy, method = "weightedmean", weights = w) |>
We will proceed our analyses with the midparent expression values obtained
from the mean of the counts, stored in the `se` object.

# Normalizing count data

To normalize count data, the function `add_size_factors()` calculates
size factors (used by `r BiocStyle::Biocpkg("DESeq2")`
for normalization) and adds them as an extra column in the colData slot
of your `SummarizedExperiment` object. Such size factors are calculated using
one of two methods:

1. By library size (default) using the 'median of
ratios' method implemented in `r BiocStyle::Biocpkg("DESeq2")`.

2. By cell size/biomass using spike-in controls (if available). If spike-in
controls are present in the count matrix, you can use them
for normalization by setting `spikein = TRUE` and specifying the
pattern used to indicate rows that contain spike-ins (usually they start
with *ERCC*)[^1]. Normalization with spike-in controls is
particularly useful if the amount of mRNA per cell is not equal
between generations (due to, for instance, different ploidy levels, which
in turn can lead to different cell sizes and/or biomass).

[^1]: **Note:** if you use spike-in normalization, the
function `add_size_factors()` automatically removes rows containing
spike-in counts from the `SummarizedExperiment` object after normalization.
However, if you have counts for spike-in controls in your count matrix,
but do not want to use spike-in normalization, you should remove them
before creating the `SummarizedExperiment` object.

In our example data set, spike-in controls are available
in the last rows of the count matrix. Let's take a look at them.

```{r}
# Show last rows of the count matrix
assay(se) |>
tail()
```

As you can see, rows with spike-in counts start with "ERCC". Let's add
size factors to our `SummarizedExperiment` object using
spike-in controls.

```{r}
# Take a look at the original colData
colData(se)
# Add size factors estimated from spike-in controls
se <- add_size_factors(se, spikein = TRUE, spikein_pattern = "ERCC")
# Take a look at the new colData
colData(se)
```

# Exploratory data analyses

Next, you can perform exploratory analyses of sample clustering to verify if
Expand Down Expand Up @@ -180,7 +231,7 @@ Now, let's plot the heatmap of sample correlations:

```{r}
# Plot a heatmap of sample correlations
plot_samplecor(se)
plot_samplecor(se, coldata_cols = c("Generation", "Ploidy"))
```

We can see that samples group well together both in the PCA plot and in
Expand All @@ -207,33 +258,13 @@ for the following contrasts:
3. `F1_vs_P2`: hybrid (numerator) versus parent 2 (denominator).
4. `F1_vs_midparent`: hybrid (numerator) vs midparent (denominator).

By default, count data are normalized by library size using the standard
normalization process in `r BiocStyle::Biocpkg("DESeq2")`. However,
if spike-in standards are included in the count matrix, you can use them
for normalization by setting `spikein_norm = TRUE` and specifying the
pattern used to indicate rows that contain spike-ins (usually they start
with *ERCC*)[^1]. In our example data set, spike-in standards are available
in the last rows of the count matrix.

[^1]: **Note:** if you use spike-in normalization, the
function `get_deg_list()` automatically removes rows containing spike-in counts
from the `SummarizedExperiment` object after normalization. However,
if you have counts for spike-in standards in your count matrix,
but do not want to use spike-in normalization, you should remove them
before creating the `SummarizedExperiment` object.

```{r}
# Show last rows of the count matrix
assay(se) |>
tail()
```

Now, we will use `get_deg_list()` to get differentially expressed genes (DEGs)
for each contrast while using spike-in normalization.
The size factors estimated with `add_size_factors()` are used for normalization
before differential expression testing. Let's use `get_deg_list()` to get
differentially expressed genes (DEGs) for each contrast.

```{r}
# Get a list of differentially expressed genes for each contrast
deg_list <- get_deg_list(se, spikein_norm = TRUE, spikein_pattern = "ERCC")
deg_list <- get_deg_list(se)
# Inspecting the output
## Getting contrast names
Expand Down Expand Up @@ -338,7 +369,6 @@ in the hybrid is the same as in parent 1, but different from parent 2.
in the hybrid is the same as in parent 2, but different from parent 1.



```{r}
# Classify genes in expression partitions
exp_partitions <- expression_partitioning(deg_list)
Expand Down Expand Up @@ -545,8 +575,3 @@ sessioninfo::session_info()
# References {.unnumbered}







0 comments on commit a698140

Please sign in to comment.