Included a plotdata attribute to the output of get_deg_list() wit…

…h log2 fold changes for all genes
almeidasilvaf · Nov 28, 2023 · 5246079 · 5246079
1 parent a46b1c5
commit 5246079
Show file tree

Hide file tree

Showing 6 changed files with 78 additions and 19 deletions.
diff --git a/R/02_de_analyses.R b/R/02_de_analyses.R
@@ -47,6 +47,12 @@
 #'   \item{pvalue}{Numeric, p-value.}
 #'   \item{padj}{Numeric, P-value adjusted for multiple testing.}
 #' }
+#' 
+#' The list contains two additional attributes named \strong{ngenes} (numeric,
+#' total number of genes), and \strong{plotdata}, which is a 3-column data
+#' frame with variables "gene" (character, gene ID), "lFC_F1_vs_P1" (numeric,
+#' log2 fold change between F1 and P1), and "lFC_F1_vs_P2" (numeric, 
+#' log2 fold change between F1 and P2).
 #'
 #' @export
 #' @rdname get_deg_list
@@ -99,14 +105,39 @@ get_deg_list <- function(
         res_df <- as.data.frame(DESeq2::results(
             res, contrast = c(coldata_column, x[1], x[2]), ...
         ))
-        res_df <- res_df[!is.na(res_df$padj), ]
-        res_df <- res_df[res_df$padj < alpha & 
-                             abs(res_df$log2FoldChange) > lfcThreshold, ]
 
         return(res_df)
     })
+
+    # Extract log2foldChange for contrast F1_P1 and F1_P2
+    cont <- c("F1_vs_P1", "F1_vs_P2")
+    dge_f1 <- dge_list[cont]
+    log2fc_df <- lapply(seq_along(dge_f1), function(x) {
+
+        stats_df <- data.frame(
+            Gene = rownames(dge_f1[[x]]),
+            log2FoldChange = dge_f1[[x]]$log2FoldChange
+        )
+        names(stats_df) <- c("Gene", paste0("lFC_", cont[x]))
+
+        return(stats_df)
+    })
+    log2fc_df <- Reduce(
+        function(x, y) merge(x, y, by = "Gene", all.x = TRUE), 
+        log2fc_df
+    )
+    log2fc_df[is.na(log2fc_df)] <- 0
 
+    # Filter DGE list based on user-defined thresholds
+    dge_list <- lapply(dge_list, function(x) {
+
+        res_df <- x[!is.na(x$padj), ]
+        res_df <- res_df[res_df$padj < alpha &
+                             abs(res_df$log2FoldChange) > lfcThreshold, ]
+        return(res_df)
+    })
     attributes(dge_list)$ngenes <- ngenes
+    attributes(dge_list)$plotdata <- log2fc_df
 
     return(dge_list)
 }

diff --git a/R/03_expression_partitioning.R b/R/03_expression_partitioning.R
@@ -62,18 +62,7 @@ expression_partitioning <- function(deg_list) {
     )
 
     # Get a table with log2foldChange for contrasts `F1_vs_P1` and `F1_vs_P2`
-    log2fc_df <- merge(
-        data.frame(
-            Gene = rownames(deg_list$F1_vs_P1),
-            lFC_F1_vs_P1 = deg_list$F1_vs_P1$log2FoldChange
-        ),
-        data.frame(
-            Gene = rownames(deg_list$F1_vs_P2),
-            lFC_F1_vs_P2 = deg_list$F1_vs_P2$log2FoldChange
-        ),
-        all = TRUE
-    )
-    log2fc_df[is.na(log2fc_df)] <- 0
+    log2fc_df <- attributes(deg_list)$plotdata
 
     # Combine classification data frame with log2foldchange
     class_df <- merge(

diff --git a/data/deg_list.rda b/data/deg_list.rda
diff --git a/inst/script/DATA_ACQUISITION.md b/inst/script/DATA_ACQUISITION.md
@@ -11,8 +11,11 @@ this package.
 ``` r
 library(tidyverse)
 
+coldata_path <- "~/meta_data.csv"
+counts_path <- "~/TS_count.txt"
+
 # Create colData
-coldata <- read_csv("~/meta_data.csv", show_col_types = FALSE) |>
+coldata <- read_csv(coldata_path, show_col_types = FALSE) |>
     tibble::column_to_rownames("sample") |>
     mutate(ploidy_gen = str_c(ploidy, generation, sep = "_")) |>
     filter(ploidy_gen %in% c("haploid_Anc", "diploid_Anc", "triploid_Anc")) |>
@@ -25,7 +28,7 @@ coldata <- read_csv("~/meta_data.csv", show_col_types = FALSE) |>
     mutate(Generation = factor(Generation, levels = c("F1", "P1", "P2")))
 
 # Create assay
-counts <- read_tsv("~/TS_count.txt", show_col_types = FALSE) |>
+counts <- read_tsv(counts_path, show_col_types = FALSE) |>
     select(-transcript_id) |>
     filter(
         !gene_id %in% c(
@@ -75,7 +78,7 @@ se <- add_midparent_expression(se_chlamy)
 deg_list <- get_deg_list(se, spikein_norm = TRUE)
 
 # Save object
-usethis::use_data(deg_list, compress = "xz")
+usethis::use_data(deg_list, compress = "xz", overwrite = TRUE)
 ```
 
 ## `data/deg_counts.rda`
@@ -90,3 +93,33 @@ deg_counts <- get_deg_counts(deg_list)
 # Save data
 usethis::use_data(deg_counts, compress = "xz", overwrite = TRUE)
 ```
+
+## `data/`
+
+``` r
+# Read table with functional annotation (from Phytozome)
+annotation <- readr::read_tsv("~/functional_annotation.txt.gz")
+
+# Get a table with genes and GO IDs
+go_table <- annotation |>
+    select(gene = locusName, GO) |>
+    mutate(gene = str_replace_all(gene, "_4532", "")) |>
+    separate_longer_delim(GO, delim = " ") |>
+    dplyr::distinct()
+
+# Replace GO IDs with names
+go_plaza <- readr::read_tsv(
+    "https://ftp.psb.ugent.be/pub/plaza/plaza_public_dicots_05/GO/go.cre.csv.gz",
+    skip = 8, show_col_types = FALSE
+) |>
+    select(GO = go, description) |>
+    distinct()
+
+go_chlamy <- go_table |>
+    left_join(go_plaza) |>
+    select(gene, GO = description) |>
+    as.data.frame()
+
+# Save data set
+usethis::use_data(go_chlamy, compress = "xz")
+```
diff --git a/man/get_deg_list.Rd b/man/get_deg_list.Rd
diff --git a/vignettes/HybridExpress.Rmd b/vignettes/HybridExpress.Rmd
@@ -263,7 +263,7 @@ in the count matrix. The total number of genes in the count matrix
 is stored in the `ngenes` attribute of the list returned by `get_deg_list()`:
 
 ```{r}
-attributes(deg_list)
+attributes(deg_list)$ngenes
 ```
 
 However, since the count matrix usually does not include all genes in the