From 8874820af2c2ac34638487d34f2fc8cdddbf8c2f Mon Sep 17 00:00:00 2001 From: joseale2310 Date: Tue, 24 Jan 2023 16:32:25 +0100 Subject: [PATCH] refined notebooks after feedback --- Data/Mov10_full_meta.txt | 18 ++-- Data/samplesheet.csv | 18 ++-- Notebooks/05b_count_matrix.Rmd | 59 ++++++++---- Notebooks/05c_count_normalization.Rmd | 30 +++--- Notebooks/06_exploratory_analysis.Rmd | 63 ++++++++++--- Notebooks/07a_DEA.Rmd | 26 ++--- Notebooks/07b_hypothesis_testing.Rmd | 60 ++++++------ Notebooks/07c_DEA_visualization.Rmd | 77 +++++++++------ Notebooks/08a_FA_genomic_annotation.Rmd | 17 ++-- Notebooks/08b_FA_overrepresentation.Rmd | 14 +-- Notebooks/08c_FA_GSEA.Rmd | 23 ++--- Notebooks/09_summarized_workflow.Rmd | 120 +++++++++++++++--------- 12 files changed, 316 insertions(+), 209 deletions(-) diff --git a/Data/Mov10_full_meta.txt b/Data/Mov10_full_meta.txt index 81ae79f0..238a10e9 100644 --- a/Data/Mov10_full_meta.txt +++ b/Data/Mov10_full_meta.txt @@ -1,9 +1,9 @@ -samplename sampletype MOVexpr -Mov10_kd_2 MOV10_knockdown low -Mov10_kd_3 MOV10_knockdown low -Mov10_oe_1 MOV10_overexpression high -Mov10_oe_2 MOV10_overexpression high -Mov10_oe_3 MOV10_overexpression high -Control_1 control normal -Control_2 control normal -Control_3 control normal +sample condition +Mov10_kd_2 MOV10_knockdown +Mov10_kd_3 MOV10_knockdown +Mov10_oe_1 MOV10_overexpression +Mov10_oe_2 MOV10_overexpression +Mov10_oe_3 MOV10_overexpression +Control_1 control +Control_2 control +Control_3 control diff --git a/Data/samplesheet.csv b/Data/samplesheet.csv index 28ccbe67..26a28cd3 100644 --- a/Data/samplesheet.csv +++ b/Data/samplesheet.csv @@ -1,9 +1,9 @@ -sample,fastq_1,fastq_2,strandedness -Control_3,Irrel_kd_3.fastq.gz,,unstranded -Control_2,Irrel_kd_2.fastq.gz,,unstranded -Control_1,Irrel_kd_1.fastq.gz,,unstranded -Mov10_oe_3,Mov10_oe_3.fastq.gz,,unstranded -Mov10_oe_2,Mov10_oe_2.fastq.gz,,unstranded -Mov10_oe_1,Mov10_oe_1.fastq.gz,,unstranded -Mov10_kd_3,Mov10_kd_3.fastq.gz,,unstranded -Mov10_kd_2,Mov10_kd_2.fastq.gz,,unstranded \ No newline at end of file +sample,fastq_1,fastq_2,strandedness,condition +Control_3,Irrel_kd_3.fastq.gz,NA,unstranded,control +Control_2,Irrel_kd_2.fastq.gz,NA,unstranded,control +Control_1,Irrel_kd_1.fastq.gz,NA,unstranded,control +Mov10_oe_3,Mov10_oe_3.fastq.gz,NA,unstranded,MOV10_overexpression +Mov10_oe_2,Mov10_oe_2.fastq.gz,NA,unstranded,MOV10_overexpression +Mov10_oe_1,Mov10_oe_1.fastq.gz,NA,unstranded,MOV10_overexpression +Mov10_kd_3,Mov10_kd_3.fastq.gz,NA,unstranded,MOV10_knockdown +Mov10_kd_2,Mov10_kd_2.fastq.gz,NA,unstranded,MOV10_knockdown diff --git a/Notebooks/05b_count_matrix.Rmd b/Notebooks/05b_count_matrix.Rmd index 44e303a5..88c847f2 100644 --- a/Notebooks/05b_count_matrix.Rmd +++ b/Notebooks/05b_count_matrix.Rmd @@ -58,6 +58,8 @@ setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) The directories of output from the mapping/quantification step of the workflow (Salmon) is the data that we will be using. These transcript abundance estimates, often referred to as **'pseudocounts', will be the starting point for our differential gene expression analysis**. The main output of Salmon is a `quant.sf` file, and we have one of these for each individual sample in our dataset. +For the sake of reproducibility, we will be using the backup results from our preprocessing pipeline. You are welcome to use your own results! + ```{r} # Tabulated separated files can be opened using the read_table() function. read_table("/work/sequencing_data/Preprocessing_backup/results_salmon/salmon/Control_1/quant.sf", ) %>% head() @@ -77,27 +79,27 @@ For each transcript that was assayed in the reference, we have: We will be using the R Bioconductor package `tximport` to prepare the `quant.sf` files for DESeq2. The first thing we need to do is create a variable that contains the paths to each of our `quant.sf` files. Then we will **add names to our quant files which will allow us to easily distinguish between samples in the final output matrix**. -We will use a metadata file that already contains all the information we need to run our analysis. +We will use the `samplesheet.csv` file that we use to process our raw reads, since it already contains all the information we need to run our analysis. ```{r} # Load metadata -meta <- read_table("/work/bulk_RNAseq_course/Data/Mov10_full_meta.txt") +meta <- read_csv("../Data/samplesheet.csv") # View metadata meta ``` -Using the samplenames, we can create all the paths needed: +Using the samples column, we can create all the paths needed: ```{r} # Directory where salmon files are. You can change this path to the results of your own analysis dir <- "/work/sequencing_data/Preprocessing_backup/results_salmon" # List all directories containing quant.sf files using the samplename column of metadata -files <- file.path(dir,"salmon", meta$samplename, "quant.sf") +files <- file.path(dir,"salmon", meta$sample, "quant.sf") # Name the file list with the samplenames -names(files) <- meta$samplename +names(files) <- meta$sample files ``` @@ -134,7 +136,7 @@ The `txi` object is a simple list containing matrices of the abundance, counts, attributes(txi) ``` -We will be using the `txi` object as is, for input into DESeq2 but will save it until the next lesson. **For now let's take a look at the count matrix.** You will notice that there are decimal values, so let's round to the nearest whole number and convert it into a dataframe. We will save it to a variable called `data` that we can play with. +We will be using the `txi` object as is for input into DESeq2, but will save it until the next lesson. **For now let's take a look at the count matrix.** You will notice that there are decimal values, so let's round to the nearest whole number and convert it into a dataframe. We will save it to a variable called `data` that we can play with. ```{r} # Look at the counts @@ -148,30 +150,47 @@ data <- txi$counts %>% data.frame() ``` +There are a lot of rows with no gene expression at all. +```{r} +sum(rowSums(data) == 0) +``` + +Let's take them out +```{r} +keep <- rowSums(data) > 0 +data <- data[keep,] +``` + + ## RNA-seq count distribution To determine the appropriate statistical model, we need information about the distribution of counts. To get an idea about how RNA-seq counts are distributed, let's plot the counts of all the samples: -```{r count_distribution} +```{r} +# Here we format the data into long format instead of wide format pdata <- data %>% gather(key = Sample, value = Count) pdata +``` +And we plot our count distribution using all our samples: + +```{r count_distribution} ggplot(pdata) + - geom_histogram(aes(x = Count), stat = "bin", bins = 200) + + geom_density(aes(x = Count, color = Sample)) + xlab("Raw expression counts") + ylab("Number of genes") ``` -If we zoom in close to zero, we can see a large number of genes with counts of zero: +If we zoom in close to zero, we can see a large number of genes with counts close to zero: ```{r count_distribution_zoom} ggplot(pdata) + - geom_histogram(aes(x = Count), stat = "bin", bins = 200) + - xlim(-5, 500) + - xlab("Raw expression counts") + - ylab("Number of genes") + geom_density(aes(x = Count, color = Sample)) + + xlim(-5, 500) + + xlab("Raw expression counts") + + ylab("Number of genes") ``` These images illustrate some common features of RNA-seq count data, including a **low number of counts associated with a large proportion of genes**, and a long right tail due to the **lack of any upper limit for expression**. Unlike microarray data, which has a dynamic range maximum limited due to when the probes max out, there is no limit of maximum expression for RNA-seq data. Due to the differences in these technologies, the statistical models used to fit the data are different between the two methods. @@ -184,18 +203,20 @@ With RNA-Seq data, **a very large number of RNAs are represented and the probabi The model that fits best, given this type of variability observed for replicates, is the **Negative Binomial (NB) model**. Essentially, **the NB model is a good approximation for data where the mean \< variance**, as is the case with RNA-Seq count data. -Run the following code to plot the *mean versus variance* for our data: - -```{r mean_vs_variance} +Here we calculate the mean and the variance per gene for all columns and genes: +```{r} df <- data %>% rowwise() %>% - summarise(mean_counts = mean(Mov10_kd_2:Control_3), - variance_counts = var(Mov10_kd_2:Control_3)) - + summarise(mean_counts = mean(c_across(everything())), + variance_counts = var(c_across(everything()))) +``` +Run the following code to plot the *mean versus variance* of each gene for our data: +```{r mean_vs_variance} ggplot(df) + geom_point(aes(x=mean_counts, y=variance_counts)) + geom_abline(intercept = 0, slope = 1, color="red") + scale_y_log10() + scale_x_log10() ``` +If the mean would be equal to the variance, the cloud of points would follow the straight red line. diff --git a/Notebooks/05c_count_normalization.Rmd b/Notebooks/05c_count_normalization.Rmd index ea6e621f..2786b886 100644 --- a/Notebooks/05c_count_normalization.Rmd +++ b/Notebooks/05c_count_normalization.Rmd @@ -41,11 +41,11 @@ library(DESeq2) library(tximport) setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) -meta <- read_table("../Data/Mov10_full_meta.txt") +meta <- read_csv("../Data/samplesheet.csv") dir <- "/work/sequencing_data/Preprocessing_backup/results_salmon/salmon" tx2gene <- read_table(file.path(dir,"salmon_tx2gene.tsv"), col_names = c("transcript_ID","gene_ID","gene_symbol")) -files <- file.path(dir, meta$samplename, "quant.sf") -names(files) <- meta$samplename +files <- file.path(dir, meta$sample, "quant.sf") +names(files) <- meta$sample txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "lengthScaledTPM", ignoreTxVersion = TRUE) ``` @@ -54,13 +54,12 @@ Approximate time: 40 minutes ## Learning Objectives -* Explore different types of normalization methods * Become familiar with the `DESeqDataSet` object * Understand how to normalize counts using DESeq2 ## Normalization -The first step in the DE analysis workflow is count normalization, which is necessary to make accurate comparisons of gene expression between samples. +The first step in the DE analysis workflow is count normalization, which is necessary to make accurate comparisons of gene expression between samples. Let's try to run an easy example! *** @@ -103,8 +102,8 @@ We should always make sure that we have sample names that match between the two ```{r} ### Check that sample names match in both files -all(colnames(txi$counts) %in% meta$samplename) -all(colnames(txi$counts) == meta$samplename) +all(colnames(txi$counts) %in% meta$sample) +all(colnames(txi$counts) == meta$sample) ``` If your data did not match, you could use the `match()` function to rearrange them to be matching. `match()` function will take two arguments and find in which order the indexes of the second argument match the first argument. @@ -123,7 +122,7 @@ b[reorder] **Exercise 2** -Suppose we had sample names matching in the txi object and metadata file, but they were out of order. Write the line(s) of code required make the `meta_random` dataframe with columns ordered such that they were identical to the row names of the `txi`. +Suppose we had sample names matching in the txi object and metadata file, but they were out of order. Write the line(s) of code required make the `meta_random` dataframe with rows ordered such that they were identical to the column names of the `txi`. ```{r} # randomize metadata rownames @@ -138,7 +137,7 @@ meta_random <- meta[sample(1:nrow(meta)),] ### 2. Create DESEq2 object -Let's start by creating the `DESeqDataSet` object, and then we can talk a bit more about what is stored inside it. To create the object, we will need the *txi** object and the **metadata** table as input (`colData` argument). We will also need to specify a **design formula**. The design formula specifies the column(s) in the metadata table and how they should be used in the analysis. For our dataset we only have one column we are interested in, which is `~sampletype`. This column has three factor levels, which tells DESeq2 that for each gene we want to evaluate gene expression change with respect to these different levels. +Let's start by creating the `DESeqDataSet` object, and then we can talk a bit more about what is stored inside it. To create the object, we will need the **txi** object and the **metadata** table as input (`colData` argument). We will also need to specify a **design formula**. The design formula specifies which column(s) of our metadata we want to use for statistical testing and modeling (more about that later!). For our dataset we only have one column we are interested in, which is `condition`. This column has three factor levels, which tells DESeq2 that for each gene we want to evaluate gene expression change with respect to these different levels. **Our count matrix input is stored in the `txi` list object**. So we need to specify that using the `DESeqDataSetFromTximport()` function, which will extract the counts component and round the values to the nearest whole number. @@ -148,8 +147,8 @@ Let's start by creating the `DESeqDataSet` object, and then we can talk a bit mo # If you do not do this and the samples do not match, you will add wrong info! dds <- DESeqDataSetFromTximport(txi, - colData = meta %>% column_to_rownames("samplename"), - design = ~ sampletype) + colData = meta %>% column_to_rownames("sample"), + design = ~ condition) ``` > **NOTE:** If you did not create pseudocounts, but a count matrix from aligned BAM files and tools such as `featurecounts`, you would want to use the `DESeqDataSetFromMatrix()` function. @@ -157,8 +156,8 @@ dds <- DESeqDataSetFromTximport(txi, ```{r, eval=FALSE} ## DO NOT RUN! ## Create DESeq2Dataset object from traditional count matrix -dds <- DESeqDataSetFromMatrix(countData = "/path/to/count/matrix.txt", - colData = meta %>% column_to_rownames("samplename"), +dds <- DESeqDataSetFromMatrix(countData = "../Data/Mov10_full_counts.txt", + colData = meta %>% column_to_rownames("sample"), design = ~ sampletype) ``` @@ -172,7 +171,10 @@ As we go through the workflow we will use the relevant functions to check what i #### Pre-filtering -While it is not necessary to pre-filter low count genes before running the DESeq2 functions, there are two reasons which make pre-filtering useful: by removing rows in which there are very few reads, we reduce the memory size of the dds data object, and we increase the speed of the transformation and testing functions within DESeq2. It can also improve visualizations, as features with no information for differential expression are not plotted. +While it is not necessary to pre-filter low count genes before running the DESeq2 functions, there are two reasons which make pre-filtering useful: + +- By removing rows in which there are very few reads, we reduce the memory size of the dds data object, and we increase the speed of the transformation and testing functions within DESeq2. +- It can also improve visualizations, as features with no information for differential expression are not plotted. Here we perform a minimal pre-filtering to keep only rows that have at least 10 reads total. ```{r} diff --git a/Notebooks/06_exploratory_analysis.Rmd b/Notebooks/06_exploratory_analysis.Rmd index f359b3fd..1138d17e 100644 --- a/Notebooks/06_exploratory_analysis.Rmd +++ b/Notebooks/06_exploratory_analysis.Rmd @@ -41,16 +41,16 @@ library(DESeq2) library(tximport) setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) -meta <- read_table("../Data/Mov10_full_meta.txt") +meta <- read_csv("../Data/samplesheet.csv") dir <- "/work/sequencing_data/Preprocessing_backup/results_salmon/salmon" tx2gene <- read_table(file.path(dir,"salmon_tx2gene.tsv"), col_names = c("transcript_ID","gene_ID","gene_symbol")) -files <- file.path(dir, meta$samplename, "quant.sf") -names(files) <- meta$samplename +files <- file.path(dir, meta$sample, "quant.sf") +names(files) <- meta$sample txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "lengthScaledTPM", ignoreTxVersion = TRUE) dds <- DESeqDataSetFromTximport(txi, - colData = meta %>% column_to_rownames("samplename"), - design = ~ sampletype) -keep <- rowSums(counts(dds)) >= 10 + colData = meta %>% column_to_rownames("sample"), + design = ~ condition) +keep <- rowSums(counts(dds)) > 0 dds <- dds[keep,] ``` @@ -83,7 +83,6 @@ The `blind=TRUE` argument is to make sure that the `rlog()` function does not ta The `rlog()` function returns a `DESeqTransform` object, another type of DESeq-specific object. The reason you don't just get a matrix of transformed values is because all of the parameters (i.e. size factors) that went into computing the rlog transform are stored in that object. We use this object to plot the PCA and hierarchical clustering figures for quality assessment. - ### Principal component analysis (PCA) for the MOV10 dataset We are now ready for the QC steps, let's start with PCA! @@ -94,10 +93,14 @@ The function `plotPCA()` requires two arguments as input: a `DESeqTransform` obj ```{r DESeq_PCA} ### Plot PCA -plotPCA(rld, intgroup="sampletype") +plotPCA(rld, intgroup="condition") ``` -By default `plotPCA()` uses the *top 500 most variable genes*. You can change this by adding the `ntop=` argument and specifying how many of the genes you want the function to consider. For example, try 1000 genes. +*** + +**Exercise 1**: + +By default `plotPCA()` uses the *top 500 most variable genes*. You can change this by adding the `ntop=` argument and specifying how many of the genes you want the function to consider. For example, try 1000 genes. Did the plot change a lot? ```{r} # your code here @@ -105,7 +108,7 @@ By default `plotPCA()` uses the *top 500 most variable genes*. You can change th *** -**Exercise 1**: +**Exercise 2**: 1. What does the above plot tell you about the similarity of samples? 2. Does it fit the expectation from the experimental design? @@ -120,14 +123,43 @@ The `plotPCA()` function will only return the values for PC1 and PC2. If you wou ```{r} # Input is a matrix of log transformed values rld_mat <- assay(rld) # extract rlog count matrix -pca <- prcomp(t(rld_mat)) # perform PCA +pca <- prcomp(t(rld_mat)) # perform PCA on the transposed (t) matrix of data +``` +To see what the PCA object contains we can use again the `attributes()` function +```{r} +attributes(pca) +``` + +You can check the `?prcomp()` for more information. The most important variables are: +- sdev: standard deviation explained by each PC. +- rotation: contribution of each gene to each PC. +- x: PC values for each sample (we use this values for our plots) + +We can create a new object that contains all our metadata information and the PC values +```{r} df <- cbind(meta, pca$x) # Create data frame with metadata and PC3 and PC4 values for input to ggplot ``` ```{r custom_PCA} # ggplot with info for all PCAs -ggplot(df) + geom_point(aes(x=PC3, y=PC4, color = sampletype)) +ggplot(df) + geom_point(aes(x=PC3, y=PC4, color = condition)) +``` + +If you want to add PC variation information to the plot we can fetch it using the summary() function and take the second row + +```{r} +summary(pca) +pca_var <- summary(pca)$importance[2,] # second row is stored in the object "importance" +pca_var <- round(pca_var * 100, digits = 2) # make it percentage and round to 2 digits ``` +Finally, we can add it to our plot + +```{r} +ggplot(df) + geom_point(aes(x=PC3, y=PC4, color = condition)) + + xlab(paste0("PC3: ",pca_var["PC3"], "% variance")) + + ylab(paste0("PC4: ",pca_var["PC4"], "% variance")) +``` + ### Hierarchical Clustering for the MOV10 dataset @@ -141,7 +173,7 @@ rld_mat <- assay(rld) Next, we need to compute the distances values for all the samples. We can do this using the `dist` function: ```{r} -sampleDists <- dist(t(assay(rld))) #distances are computed by rows, so we need to transpose the matrix +sampleDists <- dist(t(rld_mat)) #distances are computed by rows, so we need to transpose (t) the matrix sampleDistMatrix <- as.matrix(sampleDists) ``` @@ -161,7 +193,8 @@ Now, let's plot the heatmap! ### Load pheatmap package library(pheatmap) -pheatmap(sampleDistMatrix, annotation_col = meta %>% column_to_rownames("samplename")) +pheatmap(sampleDistMatrix, annotation_col = meta %>% column_to_rownames("sample") %>% + select(condition)) # we only want to use the condition column as an annotation ``` When you plot using `pheatmap()` the hierarchical clustering information is used to place similar samples together and this information is represented by the tree structure along the axes. The `annotation` argument accepts a dataframe as input, in our case it is the `meta` data frame. @@ -186,7 +219,7 @@ heat.colors <- colorRampPalette(heat.colors)(100) # Interpolate 100 colors ```{r custom_heatmap} -pheatmap(sampleDistMatrix, annotation = meta %>% column_to_rownames("samplename"), +pheatmap(sampleDistMatrix, annotation = meta %>% column_to_rownames("sample") %>% select("condition"), color = heat.colors, border_color=NA, fontsize = 10, fontsize_row = 10, height=20) ``` diff --git a/Notebooks/07a_DEA.Rmd b/Notebooks/07a_DEA.Rmd index b62d89f9..74507e5a 100644 --- a/Notebooks/07a_DEA.Rmd +++ b/Notebooks/07a_DEA.Rmd @@ -1,6 +1,6 @@ --- title: "Gene-level differential expression analysis with DESeq2" -author: "Jose Alejandro Romero Herrera" +author: "You!" date: '`r Sys.Date()`' knit: (function(inputFile, encoding) { rmarkdown::render(inputFile, @@ -41,16 +41,16 @@ library(DESeq2) library(tximport) setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) -meta <- read_table("../Data/Mov10_full_meta.txt") +meta <- read_csv("../Data/samplesheet.csv") dir <- "/work/sequencing_data/Preprocessing_backup/results_salmon/salmon" tx2gene <- read_table(file.path(dir,"salmon_tx2gene.tsv"), col_names = c("transcript_ID","gene_ID","gene_symbol")) -files <- file.path(dir, meta$samplename, "quant.sf") -names(files) <- meta$samplename +files <- file.path(dir, meta$sample, "quant.sf") +names(files) <- meta$sample txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "lengthScaledTPM", ignoreTxVersion = TRUE) dds <- DESeqDataSetFromTximport(txi, - colData = meta %>% column_to_rownames("samplename"), - design = ~ sampletype) -keep <- rowSums(counts(dds)) >= 10 + colData = meta %>% column_to_rownames("sample"), + design = ~ condition) +keep <- rowSums(counts(dds)) > 0 dds <- dds[keep,] ``` @@ -59,7 +59,7 @@ Approximate time: 60 minutes ## Learning Objectives * Explain the different steps involved in running `DESeq()` -* Examine size factors and undertand the source of differences +* Examine size factors and understand the source of differences * Inspect gene-level dispersion estimates * Recognize the importance of dispersion during differential expression analysis @@ -72,9 +72,13 @@ Previously, we created the DESeq2 object using the appropriate design formula. # DO NOT RUN # Create dds object -dds <- DESeqDataSetFromMatrix(countData = data %>% column_to_rownames("GeneSymbol"), - colData = meta %>% column_to_rownames("samplename"), - design = ~ sampletype) +dds <- DESeqDataSetFromTximport(txi, + colData = meta %>% column_to_rownames("sample"), + design = ~ condition) + +# Filter genes with 0 counts +keep <- rowSums(counts(dds)) > 0 +dds <- dds[keep,] ``` Then, to run the entire differential expression analysis workflow, we use a single call to the function `DESeq()`. diff --git a/Notebooks/07b_hypothesis_testing.Rmd b/Notebooks/07b_hypothesis_testing.Rmd index 87c8f361..f2912312 100644 --- a/Notebooks/07b_hypothesis_testing.Rmd +++ b/Notebooks/07b_hypothesis_testing.Rmd @@ -41,26 +41,25 @@ library(DESeq2) library(tximport) setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) -meta <- read_table("../Data/Mov10_full_meta.txt") +meta <- read_csv("../Data/samplesheet.csv") dir <- "/work/sequencing_data/Preprocessing_backup/results_salmon/salmon" tx2gene <- read_table(file.path(dir,"salmon_tx2gene.tsv"), col_names = c("transcript_ID","gene_ID","gene_symbol")) -files <- file.path(dir, meta$samplename, "quant.sf") -names(files) <- meta$samplename +files <- file.path(dir, meta$sample, "quant.sf") +names(files) <- meta$sample txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "lengthScaledTPM", ignoreTxVersion = TRUE) dds <- DESeqDataSetFromTximport(txi, - colData = meta %>% column_to_rownames("samplename"), - design = ~ sampletype) -keep <- rowSums(counts(dds)) >= 10 + colData = meta %>% column_to_rownames("sample"), + design = ~ condition) +keep <- rowSums(counts(dds)) > 0 dds <- dds[keep,] -dds <- DESeq(dds) +dds <- estimateSizeFactors(dds) +dds <- estimateDispersions(dds) ``` Approximate time: 90 minutes ## Learning Objectives -- Describe the process of model fitting -- Compare two methods for hypothesis testing (Wald test vs. LRT) - Discuss the steps required to generate a results table for pairwise comparisons (Wald test) - Recognize the importance of multiple test correction - Identify different methods for multiple test correction @@ -82,13 +81,11 @@ knitr::include_graphics("./img/07b_hypothesis_testing/NB_model_formula.png") The two parameters required are the **size factor, and the dispersion estimate**. Next, a generalized linear model (GLM) of the NB family is used to fit the data. Modeling is a mathematically formalized way to approximate how the data behaves given a set of parameters. -After the model is fit, coefficients are estimated for each sample group along with their standard error. The coefficents are the estimates for the **log2 fold changes**, and will be used as input for hypothesis testing. +After the model is fit, coefficients are estimated for each sample group along with their standard error. The coefficients are the estimates for the **log2 fold changes**, and will be used as input for hypothesis testing. ### Hypothesis testing -The first step in hypothesis testing is to set up a **null hypothesis** for each gene. In our case, the null hypothesis is that there is **no differential expression across the two sample groups (LFC == 0)**. Second, we use a statistical test to determine if based on the observed data, the null hypothesis can be rejected. - -In DESeq2, the **Wald test is the default used for hypothesis testing when comparing two groups**. In our case we are testing each gene model coefficient (LFC) which was derived using parameters like dispersion, which were estimated using maximum likelihood. +The first step in hypothesis testing is to set up a **null hypothesis** for each gene. In our case, the null hypothesis is that there is **no differential expression across the two sample groups (LFC == 0)**. Second, we use a statistical test to determine if based on the observed data, the null hypothesis can be rejected. In DESeq2, the **Wald test is the default used for hypothesis testing when comparing two groups**. The GLM and the statistical testing is done using the function `nbinomWaldTest()` @@ -96,16 +93,15 @@ The GLM and the statistical testing is done using the function `nbinomWaldTest() dds <- nbinomWaldTest(dds) ``` - The **model fit and Wald test were already run previously as part of the `DESeq()` function**: ```{r, eval = FALSE} ## DO NOT RUN THIS CODE ## Create DESeq2Dataset object -dds <- DESeqDataSetFromMatrix(countData = data.frame(data[,-1], row.names = data$GeneSymbol), - colData = data.frame(meta[,-1], row.names = meta$samplename), - design = ~ sampletype) +dds <- DESeqDataSetFromTximport(txi, + colData = meta %>% column_to_rownames("sample"), + design = ~ condition) ## Run analysis dds <- DESeq(dds) ``` @@ -126,7 +122,7 @@ In our dataset, we have three sample groups so we can make three possible pairwi To indicate which two sample groups we are interested in comparing, we need to specify **contrasts**. The contrasts are used as input to the DESeq2 `results()` function to extract the desired results. -Contrasts can be specified in two different ways (with the first method more commonly used): +Contrasts can be specified in three different ways (with the first method more commonly used): 1. Contrasts can be supplied as a **character vector with exactly three elements**: @@ -145,6 +141,14 @@ contrast <- list(resultsNames(dds)[1], resultsNames(dds)[2]) results(dds, contrast = contrast) ``` +3. One of the results from `resultsNames(dds)` and the `name` argument. This one is the simplest but it can also be very restricted: + +```{r, eval = FALSE} +# DO NOT RUN! +resultsNames(dds) # to see what names to use +results(dds, name = resultsNames(dds)[2]) +``` + Alternatively, if you **only had two factor levels you could do nothing**: ```{r} @@ -177,7 +181,7 @@ Now that we have our contrast created, we can use it as input to the `results()` You will see we have the option to provide a wide array of arguments and tweak things from the defaults as needed. For example: ```{r} -## Extract results for MOV10 overexpression vs control +## Extract results for MOV10 overexpression vs control with a pvalue < 0.05 res_tableOE <- results(dds, contrast=contrast_oe, alpha = 0.05) ``` @@ -204,9 +208,9 @@ We can use the `mcols()` function to extract information on what the values stor data.frame(mcols(res_tableOE, use.names=T)) ``` -- `baseMean`: mean of normalized counts for all samples +- `baseMean`: mean of normalized counts for all samples in your count matrix - `log2FoldChange`: log2 fold change -- `lfcSE`: standard error +- `lfcSE`: standard error for the lfc calculation - `stat`: Wald statistic - `pvalue`: Wald test p-value - `padj`: BH adjusted p-values @@ -219,35 +223,35 @@ Let's take a closer look at our results table. As we scroll through it, you will knitr::include_graphics("./img/07b_hypothesis_testing/gene_filtering.png") ``` -The missing values represent genes that have undergone filtering as part of the `DESeq()` function. Prior to differential expression analysis it is **beneficial to omit genes that have little or no chance of being detected as differentially expressed.** This will increase the power to detect differentially expressed genes. DESeq2 does not physically remove any genes from the original counts matrix, and so all genes will be present in your results table. The genes omitted by DESeq2 meet one of the **three filtering criteria outlined below**: +The missing values represent genes that have undergone filtering as part of the `DESeq()` function. Prior to differential expression analysis it is **beneficial to omit genes that have little or no chance of being detected as differentially expressed.** This will increase the power to detect differentially expressed genes. DESeq2 **does not physically remove** any genes from the original counts matrix, and so all genes will be present in your results table. The genes omitted by DESeq2 meet one of the **three filtering criteria outlined below**: **1. Genes with zero counts in all samples** -If within a row, all samples have zero counts there is no expression information and therefore these genes are not tested. +If within a row, all samples have zero counts there is no expression information and therefore these genes are not tested. Since we have already filtered out these genes ourselves when we created our `dds` object. ```{r} -# Filter genes by zero expression +# Show genes with zero expression res_tableOE %>% as_tibble(rownames = "gene") %>% dplyr::filter(baseMean==0) %>% head() ``` -> **The baseMean column for these genes will be zero, and the log2 fold change estimates, p-value and adjusted p-value will all be set to NA.** +> **If there would be any genes meeting this criteria, the baseMean column for these genes will be zero, and the log2 fold change estimates, p-value and adjusted p-value will all be set to NA.** **2. Genes with an extreme count outlier** The `DESeq()` function calculates, for every gene and for every sample, a diagnostic test for outliers called Cook's distance. If several samples are flagged for a certain gene, the gene is filtered out. ```{r} -# Filter genes that have an extreme outlier +# Show genes that have an extreme outlier res_tableOE %>% as_tibble(rownames = "gene") %>% dplyr::filter(is.na(pvalue) & is.na(padj) & baseMean > 0) %>% head() ``` -It seems that our dataset does not contain genes with an extreme count outlier! +It seems that we have some genes with outliers! > **If a gene contains a sample with an extreme count outlier then the p-value and adjusted p-value will be set to NA.** @@ -256,7 +260,7 @@ It seems that our dataset does not contain genes with an extreme count outlier! DESeq2 defines a low mean threshold, that is empirically determined from your data, in which the fraction of significant genes can be increased by reducing the number of genes that are considered for multiple testing. This is based on the notion that genes with very low counts are not likely to see significant differences typically due to high dispersion. ```{r} -# Filter genes below the low mean threshold +# Show genes below the low mean threshold res_tableOE %>% as_tibble(rownames = "gene") %>% dplyr::filter(!is.na(pvalue) & is.na(padj) & baseMean > 0) %>% diff --git a/Notebooks/07c_DEA_visualization.Rmd b/Notebooks/07c_DEA_visualization.Rmd index affda526..3d91025a 100644 --- a/Notebooks/07c_DEA_visualization.Rmd +++ b/Notebooks/07c_DEA_visualization.Rmd @@ -45,22 +45,21 @@ library(ggrepel) library(tximport) setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) -meta <- read_table("../Data/Mov10_full_meta.txt") +meta <- read_csv("../Data/samplesheet.csv") dir <- "/work/sequencing_data/Preprocessing_backup/results_salmon/salmon" tx2gene <- read_table(file.path(dir,"salmon_tx2gene.tsv"), col_names = c("transcript_ID","gene_ID","gene_symbol")) -files <- file.path(dir, meta$samplename, "quant.sf") -names(files) <- meta$samplename +files <- file.path(dir, meta$sample, "quant.sf") +names(files) <- meta$sample txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "lengthScaledTPM", ignoreTxVersion = TRUE) dds <- DESeqDataSetFromTximport(txi, - colData = meta %>% column_to_rownames("samplename"), - design = ~ sampletype) - -keep <- rowSums(counts(dds)) >= 10 + colData = meta %>% column_to_rownames("sample"), + design = ~ condition) +keep <- rowSums(counts(dds)) > 0 dds <- dds[keep,] dds <- DESeq(dds) -contrast_oe <- c("sampletype", "MOV10_overexpression","control") +contrast_oe <- c("condition", "MOV10_overexpression","control") res_tableOE <- results(dds, contrast=contrast_oe, alpha = 0.05) res_tableOE_tb <- res_tableOE %>% as_tibble(rownames = "gene") %>% @@ -100,7 +99,7 @@ To generate the shrunken log2 fold change estimates, you have to run an addition res_tableOE_unshrunken <- res_tableOE # Apply fold change shrinkage -res_tableOE <- lfcShrink(dds, coef="sampletype_MOV10_overexpression_vs_control", type="apeglm") +res_tableOE <- lfcShrink(dds, coef="condition_MOV10_overexpression_vs_control", type="apeglm") ``` > For more information on shrinkage, the DESeq2 vignette has an [Extended section on shrinkage estimators](http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#extended-section-on-shrinkage-estimators) that is quite useful. @@ -171,7 +170,7 @@ To pick out a specific gene of interest to plot, for example MOV10 (ID ENSG00000 ```{r countPlot} # Plot expression for single gene -plotCounts(dds, gene="ENSG00000155363", intgroup="sampletype") +plotCounts(dds, gene="ENSG00000155363", intgroup="condition") ``` **Using ggplot2 to plot expression of a single gene** @@ -180,15 +179,15 @@ If you wish to change the appearance of this plot, we can save the output of `pl ```{r} # Save plotcounts to a data frame object -d <- plotCounts(dds, gene="ENSG00000155363", intgroup="sampletype", returnData=TRUE) +d <- plotCounts(dds, gene="ENSG00000155363", intgroup="condition", returnData=TRUE) # What is the data output of plotCounts()? d %>% head() ``` ```{r custom_countPlot} -# Plot the MOV10 normalized counts, using the samplenames (rownames(d) as labels) -ggplot(d, aes(x = sampletype, y = count, color = sampletype)) + +# Plot the MOV10 normalized counts, using the samples (rownames(d) as labels) +ggplot(d, aes(x = condition, y = count, color = condition)) + geom_point(position=position_jitter(w = 0.1,h = 0)) + geom_text_repel(aes(label = rownames(d))) + theme_bw() + @@ -196,14 +195,30 @@ ggtitle("MOV10") + theme(plot.title = element_text(hjust = 0.5)) ``` +**Create a translator from gene names to gene IDs** + +While gene IDs are unique and traceable, it is hard for us humans to memorize a bunch of numbers. Let's try to make a translator function that will give you possible gene IDs for a gene name. Then you can use this table to select one of the possible gene_IDs. + +The function will take as input a vector of gene names of interest, the tx2gene dataframe and the dds object that you analyzed +```{r} +lookup <- function(gene_name, tx2gene, dds){ + hits <- tx2gene %>% select(gene_symbol, gene_ID) %>% distinct() %>% + filter(gene_symbol %in% gene_name & gene_ID %in% rownames(dds)) + return(hits) +} + +lookup(gene_name = "MOV10", tx2gene = tx2gene, dds = dds) +``` + + ### Heatmap In addition to plotting subsets, we could also extract the normalized values of *all* the significant genes and plot a heatmap of their expression using `pheatmap()`. ```{r} -### Extract normalized expression for significant genes from the OE and control samples (columns 4 to 9) -### also get gene name (column 1) -norm_OEsig <- normalized_counts[,c(1,4:9)] %>% +### Extract normalized expression for significant genes from the OE and control samples +### also get gene name +norm_OEsig <- normalized_counts %>% select(gene, starts_with("Control"), starts_with("Mov10_oe")) dplyr::filter(gene %in% sigOE$gene) ``` @@ -211,13 +226,13 @@ Now let's draw the heatmap using `pheatmap`: ```{r sigOE_heatmap} ### Run pheatmap using the metadata data frame for the annotation -pheatmap(log(norm_OEsig[2:7]+1), +pheatmap(norm_OEsig %>% column_to_rownames("gene"), cluster_rows = T, show_rownames = F, - annotation = meta %>% column_to_rownames(var = "samplename"), + annotation = meta %>% column_to_rownames(var = "sample") %>% select("condition"), border_color = NA, fontsize = 10, - #scale = "row", + scale = "row", fontsize_row = 10, height = 20) ``` @@ -241,15 +256,15 @@ Now we can start plotting. The `geom_point` object is most applicable, as this i ```{r volcano_plot} ## Volcano plot -ggplot(res_tableOE_tb) + -geom_point(aes(x = log2FoldChange, y = -log10(padj), colour = threshold_OE)) + -ggtitle("Mov10 overexpression") + -xlab("log2 fold change") + -ylab("-log10 adjusted p-value") + -#scale_y_continuous(limits = c(0,50)) + -theme(legend.position = "none", -plot.title = element_text(size = rel(1.5), hjust = 0.5), -axis.title = element_text(size = rel(1.25))) +ggplot(res_tableOE_tb) + + geom_point(aes(x = log2FoldChange, y = -log10(padj), colour = threshold_OE)) + + ggtitle("Mov10 overexpression") + + xlab("log2 fold change") + + ylab("-log10 adjusted p-value") + + #scale_y_continuous(limits = c(0,50)) + + theme(legend.position = "none", + plot.title = element_text(size = rel(1.5), hjust = 0.5), + axis.title = element_text(size = rel(1.25))) ``` **Checking the top DE genes** @@ -265,7 +280,7 @@ res_tableOE_tb <- res_tableOE_tb %>% mutate(genelabels = "") ## Sort by padj values res_tableOE_tb <- res_tableOE_tb %>% arrange(padj) -## Populate the genelabels column with contents of the gene symbols column for the first 10 rows, i.e. the top 10 most significantly expressed genes +## Populate the gene labels column with contents of the gene symbols column for the first 10 rows, i.e. the top 10 most significantly expressed genes res_tableOE_tb$genelabels[1:10] <- as.character(res_tableOE_tb$gene[1:10]) head(res_tableOE_tb) @@ -292,12 +307,12 @@ ggplot(res_tableOE_tb, aes(x = log2FoldChange, y = -log10(padj))) + ***NOTE:** If using the DESeq2 tool for differential expression analysis, the package 'DEGreport' can use the DESeq2 results output to make the top20 genes and the volcano plots generated above by writing a few lines of simple code. While you can customize the plots above, you may be interested in using the easier code. Below are examples of the code to create these plots:* ```{r degPlot, fig.height=10} -DEGreport::degPlot(dds = dds, res = res_tableOE_unshrunken, n = 20, xs = "sampletype", group = "sampletype") + +DEGreport::degPlot(dds = dds, res = res_tableOE_unshrunken, n = 20, xs = "condition", group = "condition") + theme(axis.text.x = element_text(angle = 45, hjust = 1))# dds object is output from DESeq2 ``` ```{r degPlotWide} -DEGreport::degPlotWide(counts = dds, genes = row.names(res_tableOE)[1:5], group = "sampletype") +DEGreport::degPlotWide(counts = dds, genes = row.names(res_tableOE)[1:5], group = "condition") ``` ```{r degVolcano} diff --git a/Notebooks/08a_FA_genomic_annotation.Rmd b/Notebooks/08a_FA_genomic_annotation.Rmd index bc51ec2e..346e7dc9 100644 --- a/Notebooks/08a_FA_genomic_annotation.Rmd +++ b/Notebooks/08a_FA_genomic_annotation.Rmd @@ -41,22 +41,22 @@ library(DESeq2) library(tximport) setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) -meta <- read_table("../Data/Mov10_full_meta.txt") +meta <- read_csv("../Data/samplesheet.csv") dir <- "/work/sequencing_data/Preprocessing_backup/results_salmon/salmon" tx2gene <- read_table(file.path(dir,"salmon_tx2gene.tsv"), col_names = c("transcript_ID","gene_ID","gene_symbol")) -files <- file.path(dir, meta$samplename, "quant.sf") -names(files) <- meta$samplename +files <- file.path(dir, meta$sample, "quant.sf") +names(files) <- meta$sample txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "lengthScaledTPM", ignoreTxVersion = TRUE) dds <- DESeqDataSetFromTximport(txi, - colData = meta %>% column_to_rownames("samplename"), - design = ~ sampletype) + colData = meta %>% column_to_rownames("sample"), + design = ~ condition) keep <- rowSums(counts(dds)) >= 10 dds <- dds[keep,] dds <- DESeq(dds) -res_tableOE <- lfcShrink(dds, coef = "sampletype_MOV10_overexpression_vs_control") +res_tableOE <- lfcShrink(dds, coef = "condition_MOV10_overexpression_vs_control") res_tableOE_tb <- res_tableOE %>% data.frame() %>% rownames_to_column(var="gene") %>% @@ -120,6 +120,7 @@ AnnotationHub is a wonderful resource for accessing genomic data or querying lar To get started with AnnotationHub, we first load the library and connect to the database: +**The script will ask you to create a cache directory, type yes! ** ```{r} # Load libraries library(AnnotationHub) @@ -169,7 +170,7 @@ If we are looking for the latest Ensembl release, so that the annotations are th ```{r} # Extract annotations of interest -human_ens <- human_ens[["AH75011"]] +human_ens <- human_ens[[length(human_ens)]] ``` Now we can use `ensembldb` functions to extract the information at the gene, transcript, or exon levels. We are interested in the gene-level annotations, so we can extract that information as follows: @@ -251,7 +252,7 @@ res_ids_ahb <- left_join(res_tableOE_tb, annotations_ahb, by=c("gene"="gene_id") ### Using AnnotationHub to create a tx2gene file -If you wish to easily translate between genes and transcripts (this depends on how the count matrix was computed), it would be wise to create a 'transcript to gene' translation file (or `tx2gene` file). +If you wish to easily translate between genes and transcripts (this depends on how the count matrix was computed), it would be wise to create a 'transcript to gene' translation file (or `tx2gene` file). In our case, we already have one tx2gene table generated by our pipeline, but sometimes you may not have access to an tx2gene file already, so it is useful to learn how to create such a file To create it, we would need to use a combination of the methods above and merge two dataframes together. For example: ```{r, eval = FALSE} diff --git a/Notebooks/08b_FA_overrepresentation.Rmd b/Notebooks/08b_FA_overrepresentation.Rmd index 290b67b2..7772a652 100644 --- a/Notebooks/08b_FA_overrepresentation.Rmd +++ b/Notebooks/08b_FA_overrepresentation.Rmd @@ -43,22 +43,22 @@ library(annotables) library(tximport) setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) -meta <- read_table("../Data/Mov10_full_meta.txt") +meta <- read_csv("../Data/samplesheet.csv") dir <- "/work/sequencing_data/Preprocessing_backup/results_salmon/salmon" tx2gene <- read_table(file.path(dir,"salmon_tx2gene.tsv"), col_names = c("transcript_ID","gene_ID","gene_symbol")) -files <- file.path(dir, meta$samplename, "quant.sf") -names(files) <- meta$samplename +files <- file.path(dir, meta$sample, "quant.sf") +names(files) <- meta$sample txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "lengthScaledTPM", ignoreTxVersion = TRUE) dds <- DESeqDataSetFromTximport(txi, - colData = meta %>% column_to_rownames("samplename"), - design = ~ sampletype) + colData = meta %>% column_to_rownames("sample"), + design = ~ condition) keep <- rowSums(counts(dds)) >= 10 dds <- dds[keep,] dds <- DESeq(dds) -res_tableOE <- lfcShrink(dds, coef = "sampletype_MOV10_overexpression_vs_control") +res_tableOE <- lfcShrink(dds, coef = "condition_MOV10_overexpression_vs_control") res_tableOE_tb <- res_tableOE %>% data.frame() %>% rownames_to_column(var="gene") %>% @@ -135,7 +135,7 @@ write.csv(cluster_summary, "../Results/clusterProfiler_Mov10oe.csv") *** **Exercise 1** -Create new separate GO enrichment analyses with UP and DOWN regulated genes for MOV10 overexpression. +Create two new GO enrichment analyses one with UP and another for DOWN regulated genes for MOV10 overexpression. *** diff --git a/Notebooks/08c_FA_GSEA.Rmd b/Notebooks/08c_FA_GSEA.Rmd index d969d7e9..6dded6aa 100644 --- a/Notebooks/08c_FA_GSEA.Rmd +++ b/Notebooks/08c_FA_GSEA.Rmd @@ -40,29 +40,25 @@ library(tidyverse) library(DESeq2) library(ggrepel) library(annotables) -library(clusterProfiler) -library(DOSE) -library(pathview) -library(org.Hs.eg.db) library(tximport) setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) -meta <- read_table("../Data/Mov10_full_meta.txt") +meta <- read_csv("../Data/samplesheet.csv") dir <- "/work/sequencing_data/Preprocessing_backup/results_salmon/salmon" tx2gene <- read_table(file.path(dir,"salmon_tx2gene.tsv"), col_names = c("transcript_ID","gene_ID","gene_symbol")) -files <- file.path(dir, meta$samplename, "quant.sf") -names(files) <- meta$samplename +files <- file.path(dir, meta$sample, "quant.sf") +names(files) <- meta$sample txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "lengthScaledTPM", ignoreTxVersion = TRUE) dds <- DESeqDataSetFromTximport(txi, - colData = meta %>% column_to_rownames("samplename"), - design = ~ sampletype) + colData = meta %>% column_to_rownames("sample"), + design = ~ condition) keep <- rowSums(counts(dds)) >= 10 dds <- dds[keep,] dds <- DESeq(dds) -res_tableOE <- lfcShrink(dds, coef = "sampletype_MOV10_overexpression_vs_control") +res_tableOE <- lfcShrink(dds, coef = "condition_MOV10_overexpression_vs_control") res_tableOE_tb <- res_tableOE %>% data.frame() %>% rownames_to_column(var="gene") %>% @@ -95,7 +91,6 @@ clusterProfiler offers several functions to perform GSEA using different genes s ## Remove any NA values (reduces the data by quite a bit) and duplicates res_entrez <- dplyr::filter(res_ids, entrez != "NA" & duplicated(entrez)==F) - ``` Finally, extract and name the fold changes: @@ -148,7 +143,7 @@ Explore the GSEA plot of enrichment of one of the pathways in the ranked list: ```{r gseaplot_KEGG} ## Plot the GSEA plot for a single enriched pathway: -gseaplot(gseaKEGG, geneSetID = 'hsa04060') +gseaplot(gseaKEGG, geneSetID = gseaKEGG_results$ID[1], title = gseaKEGG_results$Description[1]) ``` In this plot, the lines in plot represent the genes in the gene set, and where they occur among the log2 fold changes. The largest positive log2 fold changes are on the left-hand side of the plot, while the largest negative log2 fold changes are on the right. The top plot shows the magnitude of the log2 fold changes for each gene, while the bottom plot shows the running sum, with the enrichment score peaking at the red dotted line (which is among the negative log2 fold changes). @@ -158,7 +153,7 @@ Use the [Pathview R package](http://bioconductor.org/packages/release/bioc/html/ ```{r pathview} ## Output images for a single significant KEGG pathway pathview(gene.data = foldchanges, - pathway.id = "hsa04060", + pathway.id = gseaKEGG_results$ID[1], species = "hsa", limit = list(gene = 2, # value gives the max/min limit for foldchanges cpd = 1)) @@ -195,7 +190,7 @@ head(gseaGO_results) ``` ```{r gseaplotGO} -gseaplot(gseaGO, geneSetID = 'GO:0006397') +gseaplot(gseaGO, geneSetID = gseaGO_results$ID[1], title = gseaGO_results$Description[1]) ``` There are other gene sets available for GSEA analysis in clusterProfiler (Disease Ontology, Reactome pathways, etc.) You can check out this [link](https://yulab-smu.top/biomedical-knowledge-mining-book/) for more! diff --git a/Notebooks/09_summarized_workflow.Rmd b/Notebooks/09_summarized_workflow.Rmd index 4fb4356b..f9891814 100644 --- a/Notebooks/09_summarized_workflow.Rmd +++ b/Notebooks/09_summarized_workflow.Rmd @@ -52,6 +52,7 @@ library(DOSE) library(pathview) library(org.Hs.eg.db) library(tximport) +library(RColorBrewer) ``` We have detailed the various steps in a differential expression analysis workflow, providing theory with example code. To provide a more succinct reference for the code needed to run a DGE analysis, we have summarized the steps in an analysis below: @@ -60,47 +61,53 @@ We have detailed the various steps in a differential expression analysis workflo ### If you have a traditional raw count matrix +Load data and metadata ```{r} -# Load data and metadata data <- read_table("../Data/Mov10_full_counts.txt") meta <- read_table("../Data/Mov10_full_meta.txt") ``` - +Check that the row names of the metadata equal the column names of the **raw counts** data ```{r} -# Check that the row names of the metadata equal the column names of the **raw counts** data -all(colnames(data)[-1] == meta$samplename) +all(colnames(data)[-1] == meta$sample) +``` -# Create DESeq2Dataset object +Create DESeq2Dataset object +```{r} dds <- DESeqDataSetFromMatrix(countData = data %>% column_to_rownames("GeneSymbol"), - colData = meta %>% column_to_rownames("samplename"), - design = ~ sampletype) + colData = meta %>% column_to_rownames("sample"), + design = ~ condition) ``` ### If you have pseudocounts +Load samplesheet with all our metadata from our pipeline ```{r} # Load data, metadata and tx2gene and create a txi object -meta <- read_table("../Data/Mov10_full_meta.txt") +meta <- read_csv("../Data/samplesheet.csv") +``` +Create a list of salmon results +```{r} dir <- "/work/sequencing_data/Preprocessing_backup/results_salmon/salmon" tx2gene <- read_table(file.path(dir,"salmon_tx2gene.tsv"), col_names = c("transcript_ID","gene_ID","gene_symbol")) # Get all salmon results files -files <- file.path(dir, meta$samplename, "quant.sf") -names(files) <- meta$samplename +files <- file.path(dir, meta$sample, "quant.sf") +names(files) <- meta$sample +``` -# Create txi object +Create txi object +```{r} txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "lengthScaledTPM", ignoreTxVersion = TRUE) ``` - +Create dds object ```{r} -# Create dds object dds <- DESeqDataSetFromTximport(txi, - colData = meta %>% column_to_rownames("samplename"), - design = ~ sampletype) + colData = meta %>% column_to_rownames("sample"), + design = ~ condition) ``` ## Exploratory data analysis @@ -109,7 +116,7 @@ Prefiltering low count genes + PCA & hierarchical clustering - identifying outli ### Prefiltering low count genes ```{r} -keep <- rowSums(counts(dds)) >= 10 +keep <- rowSums(counts(dds)) > 0 dds <- dds[keep,] ``` @@ -121,74 +128,86 @@ rld <- rlog(dds, ``` ### PCA + +Plot PCA ```{r PCA_plot} -# Plot PCA plotPCA(rld, - intgroup="sampletype") + intgroup="condition") ``` ### Heatmaps + +Extract the rlog matrix from the object ```{r} -# Extract the rlog matrix from the object rld_mat <- assay(rld) rld_cor <- cor(rld_mat) # Pearson correlation betweeen samples rld_dist <- as.matrix(dist(t(assay(rld)))) #distances are computed by rows, so we need to transponse the matrix ``` - +Plot heatmap of correlations ```{r cor_heatmap} -# Plot heatmap of correlations pheatmap(rld_cor, - annotation = meta %>% column_to_rownames("samplename")) + annotation = meta %>% column_to_rownames("sample") %>% select("condition")) ``` +Plot heatmap of distances with a new color range ```{r dist_heatmap} -# Plot heatmap of distances -library(RColorBrewer) heat.colors <- brewer.pal(6, "Blues") # Colors from the RColorBrewer package (only 6) heat.colors <- colorRampPalette(heat.colors)(100) # Interpolate 100 colors pheatmap(rld_dist, - annotation = meta %>% column_to_rownames("samplename"), color = heat.colors) + annotation = meta %>% column_to_rownames("sample") %>% select("condition"), + color = heat.colors) ``` ## Run DESeq2: +**Optional step** - Re-create DESeq2 dataset if the design formula has changed after QC analysis in include other sources of variation using + ```{r} -# **Optional step** - Re-create DESeq2 dataset if the design formula has changed after QC analysis in include other sources of variation using -#"dds <- DESeqDataSetFromMatrix(data, colData = metadata, design = ~ covariate + condition)" +# dds <- DESeqDataSetFromMatrix(data, colData = meta, design = ~ covariate + condition) +``` +Run DEseq2 +```{r} # Run DESeq2 differential expression analysis dds <- DESeq(dds) ``` +**Optional step** - Output normalized counts to save as a file to access outside RStudio using ```{r} -# **Optional step** - Output normalized counts to save as a file to access outside RStudio using normalized_counts <- counts(dds, normalized=TRUE) ``` ## Check the fit of the dispersion estimates +Plot dispersion estimates ```{r DispEsts_plot} -# Plot dispersion estimates plotDispEsts(dds) ``` ## Create contrasts to perform Wald testing or the shrunken log2 fold changes between specific conditions +Formal LFC calculation ```{r} # Specify contrast for comparison of interest -contrast <- c("sampletype", "MOV10_overexpression", "control") +contrast <- c("condition", "MOV10_overexpression", "control") # Output results of Wald test for contrast res <- results(dds, contrast = contrast, alpha = 0.05) +``` + +Shrinkage +```{r} +# Get name of the contrast you would like to use +resultsNames(dds) # Shrink the log2 fold changes to be more accurate res <- lfcShrink(dds, - contrast = contrast, - type = "normal") + coef = "condition_MOV10_overexpression_vs_control", + type = "apeglm") ``` ## Output significant results: @@ -210,9 +229,21 @@ sig_res <- filter(res_tbl, ## Visualize results: volcano plots, heatmaps, normalized counts plots of top genes, etc. +Function to get gene_IDs based on gene names. The function will take as input a vector of gene names of interest, the tx2gene dataframe and the dds object that you analyzed. + +```{r} +lookup <- function(gene_name, tx2gene, dds){ + hits <- tx2gene %>% select(gene_symbol, gene_ID) %>% distinct() %>% + filter(gene_symbol %in% gene_name & gene_ID %in% rownames(dds)) + return(hits) +} + +lookup(gene_name = "MOV10", tx2gene = tx2gene, dds = dds) +``` + Plot expression for single gene ```{r counts_plot} -plotCounts(dds, gene="MOV10", intgroup="sampletype") +plotCounts(dds, gene="ENSG00000155363", intgroup="condition") ``` ### MAplot @@ -265,7 +296,7 @@ pheatmap(norm_sig, scale = "row", # scale by gene so expression pattern is visible treeheight_row = 0, # dont show the row dendogram show_rownames = F, # remove rownames so it is more clear - annotation = meta %>% column_to_rownames(var = "samplename") %>% dplyr::select(MOVexpr) + annotation = meta %>% column_to_rownames(var = "sample") %>% dplyr::select("condition") ) ``` @@ -274,22 +305,22 @@ pheatmap(norm_sig, ### Annotate with `annotables` ```{r} -ids <- grch37 %>% dplyr::filter(ensgene %in% rownames(res_tableOE)) -res_ids <- inner_join(res_tableOE_tb, ids, by=c("gene"="ensgene")) +ids <- grch37 %>% dplyr::filter(ensgene %in% res_tbl$gene) +res_ids <- inner_join(res_tbl, ids, by=c("gene"="ensgene")) ``` ### Perform enrichment analysis of GO terms (can be done as well with KEGG pathways) ```{r} # Create background dataset for hypergeometric testing using all genes tested for significance in the results -all_genes <- dplyr::filter(res_ids, !is.na(ensgene)) %>% - pull(ensgene) %>% +all_genes <- dplyr::filter(res_ids, !is.na(gene)) %>% + pull(gene) %>% as.character() # Extract significant results -sig <- dplyr::filter(res_ids, padj < 0.05 & !is.na(ensgene)) +sig <- dplyr::filter(res_ids, padj < 0.05 & !is.na(gene)) sig_genes <- sig %>% - pull(ensgene) %>% + pull(gene) %>% as.character() ``` @@ -310,6 +341,7 @@ ego <- enrichplot::pairwise_termsim(ego) ```{r enrichGO, fig.height=10, fig.width=5} dotplot(ego, showCategory=50) ``` + ```{r emapplot, fig.height=10, fig.width=10} emapplot(ego, showCategory = 50) ``` @@ -332,7 +364,7 @@ cnetplot(ego, vertex.label.font=6) ``` -### Perform GSEA analysis of KEGG pathways *can be done as well with GO terms) +### Perform GSEA analysis of KEGG pathways (can be done as well with GO terms) ```{r} # Extract entrez IDs. IDs should not be duplicated or NA res_entrez <- dplyr::filter(res_ids, entrez != "NA" & entrez != "NULL" & duplicated(entrez)==F) @@ -360,20 +392,20 @@ head(gseaKEGG_results) ```{r gseaplot_KEGG} ## Plot the GSEA plot for a single enriched pathway: -gseaplot(gseaKEGG, geneSetID = 'hsa04060') +gseaplot(gseaKEGG, geneSetID = gseaKEGG_results$ID[1]) ``` ```{r pathview} ## Output images for a single significant KEGG pathway pathview(gene.data = foldchanges, - pathway.id = "hsa04060", + pathway.id = gseaKEGG_results$ID[1], species = "hsa", limit = list(gene = 2, # value gives the max/min limit for foldchanges cpd = 1)) ``` ```{r pathway_plot} -knitr::include_graphics("./hsa04060.png") +knitr::include_graphics(paste0("./",gseaKEGG_results$ID[1],".png")) ``` ## Make sure to output the versions of all tools used in the DE analysis: