From bdfa7aa360d819771ab312c7970bc8ae94298491 Mon Sep 17 00:00:00 2001 From: joseale2310 Date: Thu, 12 Jan 2023 22:52:36 +0100 Subject: [PATCH] filtering and functional analysis fix [A --- Notebooks/06_exploratory_analysis.Rmd | 2 ++ Notebooks/07a_DEA.Rmd | 2 ++ Notebooks/07b_hypothesis_testing.Rmd | 20 +++++++++++--------- Notebooks/07c_DEA_visualization.Rmd | 10 +++++++--- Notebooks/08a_FA_genomic_annotation.Rmd | 16 ++++++++-------- Notebooks/08b_FA_overrepresentation.Rmd | 9 +++++++-- Notebooks/08c_FA_GSEA.Rmd | 9 +++++++-- 7 files changed, 44 insertions(+), 24 deletions(-) diff --git a/Notebooks/06_exploratory_analysis.Rmd b/Notebooks/06_exploratory_analysis.Rmd index bb4acf6b..f359b3fd 100644 --- a/Notebooks/06_exploratory_analysis.Rmd +++ b/Notebooks/06_exploratory_analysis.Rmd @@ -50,6 +50,8 @@ txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "le dds <- DESeqDataSetFromTximport(txi, colData = meta %>% column_to_rownames("samplename"), design = ~ sampletype) +keep <- rowSums(counts(dds)) >= 10 +dds <- dds[keep,] ``` Approximate time: 80 minutes diff --git a/Notebooks/07a_DEA.Rmd b/Notebooks/07a_DEA.Rmd index 641af794..b62d89f9 100644 --- a/Notebooks/07a_DEA.Rmd +++ b/Notebooks/07a_DEA.Rmd @@ -50,6 +50,8 @@ txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "le dds <- DESeqDataSetFromTximport(txi, colData = meta %>% column_to_rownames("samplename"), design = ~ sampletype) +keep <- rowSums(counts(dds)) >= 10 +dds <- dds[keep,] ``` Approximate time: 60 minutes diff --git a/Notebooks/07b_hypothesis_testing.Rmd b/Notebooks/07b_hypothesis_testing.Rmd index bc2e6c1b..baf7c9d8 100644 --- a/Notebooks/07b_hypothesis_testing.Rmd +++ b/Notebooks/07b_hypothesis_testing.Rmd @@ -38,19 +38,20 @@ knitr::opts_chunk$set(autodep = TRUE, # This chunk is ONLY necessary if you want to knit this document into a pdf!! library(tidyverse) library(DESeq2) +library(tximport) -# And with this last line of code, we set our working directory to the folder with this notebook. -# This way, the relative paths will work without issues setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) - -# Load objects so it can be knitted -data <- read_table("../Data/Mov10_full_counts.txt") meta <- read_table("../Data/Mov10_full_meta.txt") - -# Create dds object -dds <- DESeqDataSetFromMatrix(countData = data %>% column_to_rownames("GeneSymbol"), - colData = meta %>% column_to_rownames("samplename"), +dir <- "/work/sequencing_data/Preprocessing_backup/results_salmon/salmon" +tx2gene <- read_table(file.path(dir,"salmon_tx2gene.tsv"), col_names = c("transcript_ID","gene_ID","gene_symbol")) +files <- file.path(dir, meta$samplename, "quant.sf") +names(files) <- meta$samplename +txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "lengthScaledTPM", ignoreTxVersion = TRUE) +dds <- DESeqDataSetFromTximport(txi, + colData = meta %>% column_to_rownames("samplename"), design = ~ sampletype) +keep <- rowSums(counts(dds)) >= 10 +dds <- dds[keep,] dds <- DESeq(dds) ``` @@ -158,6 +159,7 @@ Define contrasts for MOV10 overexpression using one of the two methods above. ```{r} ## Your code here +contrast_oe <- ``` ## The results table diff --git a/Notebooks/07c_DEA_visualization.Rmd b/Notebooks/07c_DEA_visualization.Rmd index 07ed3883..affda526 100644 --- a/Notebooks/07c_DEA_visualization.Rmd +++ b/Notebooks/07c_DEA_visualization.Rmd @@ -54,6 +54,10 @@ txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "le dds <- DESeqDataSetFromTximport(txi, colData = meta %>% column_to_rownames("samplename"), design = ~ sampletype) + +keep <- rowSums(counts(dds)) >= 10 +dds <- dds[keep,] + dds <- DESeq(dds) contrast_oe <- c("sampletype", "MOV10_overexpression","control") @@ -163,11 +167,11 @@ One way to visualize results would be to simply plot the expression data for a h **Using DESeq2 `plotCounts()` to plot expression of a single gene** -To pick out a specific gene of interest to plot, for example MOV10, we can use the `plotCounts()` from DESeq2. `plotCounts()` requires that the gene specified matches the original input to DESeq2. +To pick out a specific gene of interest to plot, for example MOV10 (ID ENSG00000155363), we can use the `plotCounts()` from DESeq2. `plotCounts()` requires that the gene specified matches the original input to DESeq2. ```{r countPlot} # Plot expression for single gene -plotCounts(dds, gene="MOV10", intgroup="sampletype") +plotCounts(dds, gene="ENSG00000155363", intgroup="sampletype") ``` **Using ggplot2 to plot expression of a single gene** @@ -176,7 +180,7 @@ If you wish to change the appearance of this plot, we can save the output of `pl ```{r} # Save plotcounts to a data frame object -d <- plotCounts(dds, gene="MOV10", intgroup="sampletype", returnData=TRUE) +d <- plotCounts(dds, gene="ENSG00000155363", intgroup="sampletype", returnData=TRUE) # What is the data output of plotCounts()? d %>% head() diff --git a/Notebooks/08a_FA_genomic_annotation.Rmd b/Notebooks/08a_FA_genomic_annotation.Rmd index c0933086..bc51ec2e 100644 --- a/Notebooks/08a_FA_genomic_annotation.Rmd +++ b/Notebooks/08a_FA_genomic_annotation.Rmd @@ -50,6 +50,10 @@ txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "le dds <- DESeqDataSetFromTximport(txi, colData = meta %>% column_to_rownames("samplename"), design = ~ sampletype) + +keep <- rowSums(counts(dds)) >= 10 +dds <- dds[keep,] + dds <- DESeq(dds) res_tableOE <- lfcShrink(dds, coef = "sampletype_MOV10_overexpression_vs_control") @@ -191,7 +195,7 @@ To **obtain an annotation data frame** using AnnotationHub, we'll use the `genes # Create a gene-level dataframe annotations_ahb <- genes(human_ens, return.type = "data.frame") %>% dplyr::select(gene_id, gene_name, entrezid, gene_biotype, description) %>% - dplyr::filter(gene_name %in% res_tableOE_tb$gene) + dplyr::filter(gene_id %in% res_tableOE_tb$gene) ``` This dataframe looks like it should be fine as it is, but we look a little closer we will notice that the column containing Entrez identifiers is a list, and in fact there are many Ensembl identifiers that map to more than one Entrez identifier! @@ -241,7 +245,7 @@ For the different steps in the functional analysis, we require Ensembl and Entre ```{r} ## Merge the AnnotationHub dataframe with the results -res_ids_ahb <- left_join(res_tableOE_tb, annotations_ahb, by=c("gene"="gene_name")) +res_ids_ahb <- left_join(res_tableOE_tb, annotations_ahb, by=c("gene"="gene_id")) ``` @@ -288,14 +292,10 @@ res_tableOE_tb <- res_tableOE %>% ```{r} ## Return the IDs for the gene symbols in the DE results -ids <- grch37 %>% dplyr::filter(symbol %in% rownames(res_tableOE)) - -## The gene names can map to more than one Ensembl ID (some genes change ID over time), -## so we need to remove duplicate IDs prior to assessing enriched GO terms -ids <- ids %>% dplyr::filter(!duplicated(symbol)) +ids <- grch37 %>% dplyr::filter(ensgene %in% rownames(res_tableOE)) ## Merge the IDs with the results -res_ids <- inner_join(res_tableOE_tb, ids, by=c("gene"="symbol")) +res_ids <- inner_join(res_tableOE_tb, ids, by=c("gene"="ensgene")) head(res_ids) ``` diff --git a/Notebooks/08b_FA_overrepresentation.Rmd b/Notebooks/08b_FA_overrepresentation.Rmd index 3adae5a4..619711e8 100644 --- a/Notebooks/08b_FA_overrepresentation.Rmd +++ b/Notebooks/08b_FA_overrepresentation.Rmd @@ -53,14 +53,19 @@ dds <- DESeqDataSetFromTximport(txi, colData = meta %>% column_to_rownames("samplename"), design = ~ sampletype) +keep <- rowSums(counts(dds)) >= 10 +dds <- dds[keep,] + +dds <- DESeq(dds) + res_tableOE <- lfcShrink(dds, coef = "sampletype_MOV10_overexpression_vs_control") res_tableOE_tb <- res_tableOE %>% data.frame() %>% rownames_to_column(var="gene") %>% as_tibble() -ids <- grch37 %>% dplyr::filter(symbol %in% rownames(res_tableOE)) %>% dplyr::filter(!duplicated(symbol)) -res_ids <- inner_join(res_tableOE_tb, ids, by=c("gene"="symbol")) +ids <- grch37 %>% dplyr::filter(ensgene %in% rownames(res_tableOE)) +res_ids <- inner_join(res_tableOE_tb, ids, by=c("gene"="ensgene")) ``` Approximate time: 60 minutes diff --git a/Notebooks/08c_FA_GSEA.Rmd b/Notebooks/08c_FA_GSEA.Rmd index 2bdd572a..2f3fa31f 100644 --- a/Notebooks/08c_FA_GSEA.Rmd +++ b/Notebooks/08c_FA_GSEA.Rmd @@ -55,14 +55,19 @@ dds <- DESeqDataSetFromTximport(txi, colData = meta %>% column_to_rownames("samplename"), design = ~ sampletype) +keep <- rowSums(counts(dds)) >= 10 +dds <- dds[keep,] + +dds <- DESeq(dds) + res_tableOE <- lfcShrink(dds, coef = "sampletype_MOV10_overexpression_vs_control") res_tableOE_tb <- res_tableOE %>% data.frame() %>% rownames_to_column(var="gene") %>% as_tibble() -ids <- grch37 %>% dplyr::filter(symbol %in% rownames(res_tableOE)) %>% dplyr::filter(!duplicated(symbol)) -res_ids <- inner_join(res_tableOE_tb, ids, by=c("gene"="symbol")) +ids <- grch37 %>% dplyr::filter(ensgene %in% rownames(res_tableOE)) +res_ids <- inner_join(res_tableOE_tb, ids, by=c("gene"="ensgene")) ```