filtering and functional analysis fix [A

hds-sandbox · Jan 12, 2023 · bdfa7aa · bdfa7aa
1 parent 856c2ae
commit bdfa7aa
Show file tree

Hide file tree

Showing 7 changed files with 44 additions and 24 deletions.
diff --git a/Notebooks/06_exploratory_analysis.Rmd b/Notebooks/06_exploratory_analysis.Rmd
@@ -50,6 +50,8 @@ txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "le
 dds <- DESeqDataSetFromTximport(txi,
                                    colData = meta %>% column_to_rownames("samplename"), 
                               design = ~ sampletype)
+keep <- rowSums(counts(dds)) >= 10
+dds <- dds[keep,]
 ```
 
 Approximate time: 80 minutes

diff --git a/Notebooks/07a_DEA.Rmd b/Notebooks/07a_DEA.Rmd
@@ -50,6 +50,8 @@ txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "le
 dds <- DESeqDataSetFromTximport(txi,
                                    colData = meta %>% column_to_rownames("samplename"), 
                               design = ~ sampletype)
+keep <- rowSums(counts(dds)) >= 10
+dds <- dds[keep,]
 ```
 
 Approximate time: 60 minutes

diff --git a/Notebooks/07b_hypothesis_testing.Rmd b/Notebooks/07b_hypothesis_testing.Rmd
@@ -38,19 +38,20 @@ knitr::opts_chunk$set(autodep        = TRUE,
 # This chunk is ONLY necessary if you want to knit this document into a pdf!!
 library(tidyverse)
 library(DESeq2)
+library(tximport)
 
-# And with this last line of code, we set our working directory to the folder with this notebook.
-# This way, the relative paths will work without issues
 setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
-
-# Load objects so it can be knitted
-data <- read_table("../Data/Mov10_full_counts.txt") 
 meta <- read_table("../Data/Mov10_full_meta.txt")
-
-# Create dds object
-dds <- DESeqDataSetFromMatrix(countData = data %>% column_to_rownames("GeneSymbol"), 
-                              colData = meta %>% column_to_rownames("samplename"), 
+dir <- "/work/sequencing_data/Preprocessing_backup/results_salmon/salmon"
+tx2gene <- read_table(file.path(dir,"salmon_tx2gene.tsv"), col_names = c("transcript_ID","gene_ID","gene_symbol"))
+files <- file.path(dir, meta$samplename, "quant.sf")
+names(files) <- meta$samplename
+txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "lengthScaledTPM", ignoreTxVersion	= TRUE)
+dds <- DESeqDataSetFromTximport(txi,
+                                   colData = meta %>% column_to_rownames("samplename"), 
                               design = ~ sampletype)
+keep <- rowSums(counts(dds)) >= 10
+dds <- dds[keep,]
 dds <- DESeq(dds)
 ```
 
@@ -158,6 +159,7 @@ Define contrasts for MOV10 overexpression using one of the two methods above.
 
 ```{r}
 ## Your code here 
+contrast_oe <- 
 ```
 
 ## The results table

diff --git a/Notebooks/07c_DEA_visualization.Rmd b/Notebooks/07c_DEA_visualization.Rmd
@@ -54,6 +54,10 @@ txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "le
 dds <- DESeqDataSetFromTximport(txi,
                                    colData = meta %>% column_to_rownames("samplename"), 
                               design = ~ sampletype)
+
+keep <- rowSums(counts(dds)) >= 10
+dds <- dds[keep,]
+
 dds <- DESeq(dds)
 
 contrast_oe <- c("sampletype", "MOV10_overexpression","control")
@@ -163,11 +167,11 @@ One way to visualize results would be to simply plot the expression data for a h
 
 **Using DESeq2 `plotCounts()` to plot expression of a single gene**
 
-To pick out a specific gene of interest to plot, for example MOV10, we can use the `plotCounts()` from DESeq2. `plotCounts()` requires that the gene specified matches the original input to DESeq2.
+To pick out a specific gene of interest to plot, for example MOV10 (ID ENSG00000155363), we can use the `plotCounts()` from DESeq2. `plotCounts()` requires that the gene specified matches the original input to DESeq2.
 
 ```{r countPlot}
 # Plot expression for single gene
-plotCounts(dds, gene="MOV10", intgroup="sampletype") 
+plotCounts(dds, gene="ENSG00000155363", intgroup="sampletype") 
 ```
 
 **Using ggplot2 to plot expression of a single gene**
@@ -176,7 +180,7 @@ If you wish to change the appearance of this plot, we can save the output of `pl
 
 ```{r}
 # Save plotcounts to a data frame object
-d <- plotCounts(dds, gene="MOV10", intgroup="sampletype", returnData=TRUE)
+d <- plotCounts(dds, gene="ENSG00000155363", intgroup="sampletype", returnData=TRUE)
 
 # What is the data output of plotCounts()?
 d %>% head()

diff --git a/Notebooks/08a_FA_genomic_annotation.Rmd b/Notebooks/08a_FA_genomic_annotation.Rmd
@@ -50,6 +50,10 @@ txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "le
 dds <- DESeqDataSetFromTximport(txi,
                                    colData = meta %>% column_to_rownames("samplename"), 
                               design = ~ sampletype)
+
+keep <- rowSums(counts(dds)) >= 10
+dds <- dds[keep,]
+
 dds <- DESeq(dds)
 
 res_tableOE <- lfcShrink(dds, coef = "sampletype_MOV10_overexpression_vs_control")
@@ -191,7 +195,7 @@ To **obtain an annotation data frame** using AnnotationHub, we'll use the `genes
 # Create a gene-level dataframe 
 annotations_ahb <- genes(human_ens, return.type = "data.frame")  %>%
   dplyr::select(gene_id, gene_name, entrezid, gene_biotype, description) %>% 
-  dplyr::filter(gene_name %in% res_tableOE_tb$gene)
+  dplyr::filter(gene_id %in% res_tableOE_tb$gene)
 ```
 
 This dataframe looks like it should be fine as it is, but we look a little closer we will notice that the column containing Entrez identifiers is a list, and in fact there are many Ensembl identifiers that map to more than one Entrez identifier!
@@ -241,7 +245,7 @@ For the different steps in the functional analysis, we require Ensembl and Entre
 
 ```{r}
 ## Merge the AnnotationHub dataframe with the results 
-res_ids_ahb <- left_join(res_tableOE_tb, annotations_ahb, by=c("gene"="gene_name"))    
+res_ids_ahb <- left_join(res_tableOE_tb, annotations_ahb, by=c("gene"="gene_id"))    
 ```
 
 
@@ -288,14 +292,10 @@ res_tableOE_tb <- res_tableOE %>%
 
 ```{r}
 ## Return the IDs for the gene symbols in the DE results
-ids <- grch37 %>% dplyr::filter(symbol %in% rownames(res_tableOE))
-
-## The gene names can map to more than one Ensembl ID (some genes change ID over time), 
-## so we need to remove duplicate IDs prior to assessing enriched GO terms
-ids <- ids %>% dplyr::filter(!duplicated(symbol))
+ids <- grch37 %>% dplyr::filter(ensgene %in% rownames(res_tableOE))
 
 ## Merge the IDs with the results 
-res_ids <- inner_join(res_tableOE_tb, ids, by=c("gene"="symbol"))
+res_ids <- inner_join(res_tableOE_tb, ids, by=c("gene"="ensgene"))
 
 head(res_ids)
 ```

diff --git a/Notebooks/08b_FA_overrepresentation.Rmd b/Notebooks/08b_FA_overrepresentation.Rmd
@@ -53,14 +53,19 @@ dds <- DESeqDataSetFromTximport(txi,
                                    colData = meta %>% column_to_rownames("samplename"), 
                               design = ~ sampletype)
 
+keep <- rowSums(counts(dds)) >= 10
+dds <- dds[keep,]
+
+dds <- DESeq(dds)
+
 res_tableOE <- lfcShrink(dds, coef = "sampletype_MOV10_overexpression_vs_control")
 res_tableOE_tb <- res_tableOE %>%
     data.frame() %>%
     rownames_to_column(var="gene") %>% 
     as_tibble()
 
-ids <- grch37 %>% dplyr::filter(symbol %in% rownames(res_tableOE)) %>% dplyr::filter(!duplicated(symbol))
-res_ids <- inner_join(res_tableOE_tb, ids, by=c("gene"="symbol"))
+ids <- grch37 %>% dplyr::filter(ensgene %in% rownames(res_tableOE)) 
+res_ids <- inner_join(res_tableOE_tb, ids, by=c("gene"="ensgene"))
 ```
 
 Approximate time: 60 minutes

diff --git a/Notebooks/08c_FA_GSEA.Rmd b/Notebooks/08c_FA_GSEA.Rmd
@@ -55,14 +55,19 @@ dds <- DESeqDataSetFromTximport(txi,
                                    colData = meta %>% column_to_rownames("samplename"), 
                               design = ~ sampletype)
 
+keep <- rowSums(counts(dds)) >= 10
+dds <- dds[keep,]
+
+dds <- DESeq(dds)
+
 res_tableOE <- lfcShrink(dds, coef = "sampletype_MOV10_overexpression_vs_control")
 res_tableOE_tb <- res_tableOE %>%
     data.frame() %>%
     rownames_to_column(var="gene") %>% 
     as_tibble()
 
-ids <- grch37 %>% dplyr::filter(symbol %in% rownames(res_tableOE)) %>% dplyr::filter(!duplicated(symbol))
-res_ids <- inner_join(res_tableOE_tb, ids, by=c("gene"="symbol"))
+ids <- grch37 %>% dplyr::filter(ensgene %in% rownames(res_tableOE)) 
+res_ids <- inner_join(res_tableOE_tb, ids, by=c("gene"="ensgene"))
 ```