updated notebooks paths

hds-sandbox · Apr 14, 2023 · 4f94411 · 4f94411
1 parent f540c55
commit 4f94411
Show file tree

Hide file tree

Showing 10 changed files with 39 additions and 41 deletions.
diff --git a/Notebooks/05b_count_matrix.Rmd b/Notebooks/05b_count_matrix.Rmd
@@ -62,7 +62,7 @@ For the sake of reproducibility, we will be using the backup results from our pr
 
 ```{r}
 # Tab-separated files can be opened using the read_table() function.
-read_table("/work/sequencing_data/Preprocessing_backup/results_salmon/salmon/Control_1/quant.sf", ) %>% head()
+read_table("/work/Intro_to_bulkRNAseq/Data/salmon/Control_1/quant.sf", ) %>% head()
 ```
 
 For each transcript that was assayed in the reference, we have:
@@ -83,7 +83,7 @@ We will use the `samplesheet.csv` file that we use to process our raw reads, sin
 
 ```{r}
 # Load metadata
-meta <- read_csv("../Data/samplesheet.csv")
+meta <- read_csv("/work/Intro_to_bulkRNAseq/Data/samplesheet.csv")
 
 # View metadata
 meta
@@ -93,7 +93,7 @@ Using the samples column, we can create all the paths needed:
 
 ```{r}
 # Directory where salmon files are. You can change this path to the results of your own analysis
-dir <- "/work/sequencing_data/Preprocessing_backup/results_salmon"
+dir <- "/work/Intro_to_bulkRNAseq/Data"
 
 # List all directories containing quant.sf files using the samplename column of metadata
 files <- file.path(dir,"salmon", meta$sample, "quant.sf")
@@ -106,7 +106,7 @@ files
 Our Salmon files were generated with transcript sequences listed by Ensembl IDs, but `tximport` needs to know **which genes these transcripts came from**. We will use the annotation table that was created in our workflow, called `tx2gene.txt`.
 
 ```{r}
-tx2gene <- read_table("/work/sequencing_data/Preprocessing_backup/results_salmon/salmon/salmon_tx2gene.tsv", col_names = c("transcript_ID","gene_ID","gene_symbol"))
+tx2gene <- read_table("/work/Intro_to_bulkRNAseq/Data/salmon_tx2gene.tsv", col_names = c("transcript_ID","gene_ID","gene_symbol"))
 tx2gene %>% head()
 ```
 

diff --git a/Notebooks/05c_count_normalization.Rmd b/Notebooks/05c_count_normalization.Rmd
@@ -41,9 +41,9 @@ library(DESeq2)
 library(tximport)
 
 setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
-meta <- read_csv("../Data/samplesheet.csv")
-dir <- "/work/sequencing_data/Preprocessing_backup/results_salmon/salmon"
-tx2gene <- read_table(file.path(dir,"salmon_tx2gene.tsv"), col_names = c("transcript_ID","gene_ID","gene_symbol"))
+meta <- read_csv("/work/Intro_to_bulkRNAseq/Data/samplesheet.csv")
+dir <- "/work/Intro_to_bulkRNAseq/Data/salmon"
+tx2gene <- read_table("/work/Intro_to_bulkRNAseq/Data/salmon_tx2gene.tsv", col_names = c("transcript_ID","gene_ID","gene_symbol"))
 files <- file.path(dir, meta$sample, "quant.sf")
 names(files) <- meta$sample
 txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "lengthScaledTPM", ignoreTxVersion	= TRUE)
@@ -156,7 +156,7 @@ dds <- DESeqDataSetFromTximport(txi,
 ```{r, eval=FALSE}
 ## DO NOT RUN!
 ## Create DESeq2Dataset object from traditional count matrix
-dds <- DESeqDataSetFromMatrix(countData = "../Data/Mov10_full_counts.txt", 
+dds <- DESeqDataSetFromMatrix(countData = "/work/Intro_to_bulkRNAseq/Data/Mov10_counts_traditional.txt", 
                               colData = meta %>% column_to_rownames("sample"), 
                               design = ~ sampletype)
 ```
@@ -209,5 +209,5 @@ head(normalized_counts)
 We can save this normalized data matrix to file for later use:
 
 ```{r}
-write.table(normalized_counts, file="../Results/normalized_counts.txt", sep="\t", quote=F)
+write.table(normalized_counts, file="/work/Intro_to_bulkRNAseq/Results/normalized_counts.txt", sep="\t", quote=F)
 ```
diff --git a/Notebooks/06_exploratory_analysis.Rmd b/Notebooks/06_exploratory_analysis.Rmd
@@ -41,9 +41,9 @@ library(DESeq2)
 library(tximport)
 
 setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
-meta <- read_csv("../Data/samplesheet.csv")
-dir <- "/work/sequencing_data/Preprocessing_backup/results_salmon/salmon"
-tx2gene <- read_table(file.path(dir,"salmon_tx2gene.tsv"), col_names = c("transcript_ID","gene_ID","gene_symbol"))
+meta <- read_csv("/work/Intro_to_bulkRNAseq/Data/samplesheet.csv")
+dir <- "/work/Intro_to_bulkRNAseq/Data/salmon"
+tx2gene <- read_table("/work/Intro_to_bulkRNAseq/Data/salmon_tx2gene.tsv", col_names = c("transcript_ID","gene_ID","gene_symbol"))
 files <- file.path(dir, meta$sample, "quant.sf")
 names(files) <- meta$sample
 txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "lengthScaledTPM", ignoreTxVersion	= TRUE)

diff --git a/Notebooks/07a_DEA.Rmd b/Notebooks/07a_DEA.Rmd
@@ -41,9 +41,9 @@ library(DESeq2)
 library(tximport)
 
 setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
-meta <- read_csv("../Data/samplesheet.csv")
-dir <- "/work/sequencing_data/Preprocessing_backup/results_salmon/salmon"
-tx2gene <- read_table(file.path(dir,"salmon_tx2gene.tsv"), col_names = c("transcript_ID","gene_ID","gene_symbol"))
+meta <- read_csv("/work/Intro_to_bulkRNAseq/Data/samplesheet.csv")
+dir <- "/work/Intro_to_bulkRNAseq/Data/salmon"
+tx2gene <- read_table("/work/Intro_to_bulkRNAseq/Data/salmon_tx2gene.tsv", col_names = c("transcript_ID","gene_ID","gene_symbol"))
 files <- file.path(dir, meta$sample, "quant.sf")
 names(files) <- meta$sample
 txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "lengthScaledTPM", ignoreTxVersion	= TRUE)

diff --git a/Notebooks/07b_hypothesis_testing.Rmd b/Notebooks/07b_hypothesis_testing.Rmd
@@ -41,9 +41,9 @@ library(DESeq2)
 library(tximport)
 
 setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
-meta <- read_csv("../Data/samplesheet.csv")
-dir <- "/work/sequencing_data/Preprocessing_backup/results_salmon/salmon"
-tx2gene <- read_table(file.path(dir,"salmon_tx2gene.tsv"), col_names = c("transcript_ID","gene_ID","gene_symbol"))
+meta <- read_csv("/work/Intro_to_bulkRNAseq/Data/samplesheet.csv")
+dir <- "/work/Intro_to_bulkRNAseq/Data/salmon"
+tx2gene <- read_table("/work/Intro_to_bulkRNAseq/Data/salmon_tx2gene.tsv", col_names = c("transcript_ID","gene_ID","gene_symbol"))
 files <- file.path(dir, meta$sample, "quant.sf")
 names(files) <- meta$sample
 txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "lengthScaledTPM", ignoreTxVersion	= TRUE)

diff --git a/Notebooks/07c_DEA_visualization.Rmd b/Notebooks/07c_DEA_visualization.Rmd
@@ -45,9 +45,9 @@ library(ggrepel)
 library(tximport)
 
 setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
-meta <- read_csv("../Data/samplesheet.csv")
-dir <- "/work/sequencing_data/Preprocessing_backup/results_salmon/salmon"
-tx2gene <- read_table(file.path(dir,"salmon_tx2gene.tsv"), col_names = c("transcript_ID","gene_ID","gene_symbol"))
+meta <- read_csv("/work/Intro_to_bulkRNAseq/Data/samplesheet.csv")
+dir <- "/work/Intro_to_bulkRNAseq/Data/salmon"
+tx2gene <- read_table("/work/Intro_to_bulkRNAseq/Data/salmon_tx2gene.tsv", col_names = c("transcript_ID","gene_ID","gene_symbol"))
 files <- file.path(dir, meta$sample, "quant.sf")
 names(files) <- meta$sample
 txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "lengthScaledTPM", ignoreTxVersion	= TRUE)
@@ -218,7 +218,7 @@ In addition to plotting subsets, we could also extract the normalized values of
 ```{r}
 ### Extract normalized expression for significant genes from the OE and control samples
 ### also get gene name
-norm_OEsig <- normalized_counts %>% select(gene, starts_with("Control"), starts_with("Mov10_oe")) 
+norm_OEsig <- normalized_counts %>% select(gene, starts_with("Control"), starts_with("Mov10_oe"))  %>%
   dplyr::filter(gene %in% sigOE$gene)  
 ```
 

diff --git a/Notebooks/08a_FA_genomic_annotation.Rmd b/Notebooks/08a_FA_genomic_annotation.Rmd
@@ -41,17 +41,16 @@ library(DESeq2)
 library(tximport)
 
 setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
-meta <- read_csv("../Data/samplesheet.csv")
-dir <- "/work/sequencing_data/Preprocessing_backup/results_salmon/salmon"
-tx2gene <- read_table(file.path(dir,"salmon_tx2gene.tsv"), col_names = c("transcript_ID","gene_ID","gene_symbol"))
+meta <- read_csv("/work/Intro_to_bulkRNAseq/Data/samplesheet.csv")
+dir <- "/work/Intro_to_bulkRNAseq/Data/salmon"
+tx2gene <- read_table("/work/Intro_to_bulkRNAseq/Data/salmon_tx2gene.tsv", col_names = c("transcript_ID","gene_ID","gene_symbol"))
 files <- file.path(dir, meta$sample, "quant.sf")
 names(files) <- meta$sample
 txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "lengthScaledTPM", ignoreTxVersion	= TRUE)
 dds <- DESeqDataSetFromTximport(txi,
                                    colData = meta %>% column_to_rownames("sample"), 
                               design = ~ condition)
-
-keep <- rowSums(counts(dds)) >= 10
+keep <- rowSums(counts(dds)) > 0
 dds <- dds[keep,]
 
 dds <- DESeq(dds)

diff --git a/Notebooks/08b_FA_overrepresentation.Rmd b/Notebooks/08b_FA_overrepresentation.Rmd
@@ -43,19 +43,19 @@ library(annotables)
 library(tximport)
 
 setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
-meta <- read_csv("../Data/samplesheet.csv")
-dir <- "/work/sequencing_data/Preprocessing_backup/results_salmon/salmon"
-tx2gene <- read_table(file.path(dir,"salmon_tx2gene.tsv"), col_names = c("transcript_ID","gene_ID","gene_symbol"))
+meta <- read_csv("/work/Intro_to_bulkRNAseq/Data/samplesheet.csv")
+dir <- "/work/Intro_to_bulkRNAseq/Data/salmon"
+tx2gene <- read_table("/work/Intro_to_bulkRNAseq/Data/salmon_tx2gene.tsv", col_names = c("transcript_ID","gene_ID","gene_symbol"))
 files <- file.path(dir, meta$sample, "quant.sf")
 names(files) <- meta$sample
 txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "lengthScaledTPM", ignoreTxVersion	= TRUE)
 dds <- DESeqDataSetFromTximport(txi,
                                    colData = meta %>% column_to_rownames("sample"), 
                               design = ~ condition)
-
-keep <- rowSums(counts(dds)) >= 10
+keep <- rowSums(counts(dds)) > 0
 dds <- dds[keep,]
 
+
 dds <- DESeq(dds)
 
 res_tableOE <- lfcShrink(dds, coef = "condition_MOV10_overexpression_vs_control")
@@ -129,7 +129,7 @@ Let's check the results:
 cluster_summary <- data.frame(ego)
 cluster_summary
 
-write.csv(cluster_summary, "../Results/clusterProfiler_Mov10oe.csv")
+write.csv(cluster_summary, "/work/Intro_to_bulkRNAseq/Results/clusterProfiler_Mov10oe.csv")
 ```
 
 ***

diff --git a/Notebooks/08c_FA_GSEA.Rmd b/Notebooks/08c_FA_GSEA.Rmd
@@ -43,17 +43,16 @@ library(annotables)
 library(tximport)
 
 setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
-meta <- read_csv("../Data/samplesheet.csv")
-dir <- "/work/sequencing_data/Preprocessing_backup/results_salmon/salmon"
-tx2gene <- read_table(file.path(dir,"salmon_tx2gene.tsv"), col_names = c("transcript_ID","gene_ID","gene_symbol"))
+meta <- read_csv("/work/Intro_to_bulkRNAseq/Data/samplesheet.csv")
+dir <- "/work/Intro_to_bulkRNAseq/Data/salmon"
+tx2gene <- read_table("/work/Intro_to_bulkRNAseq/Data/salmon_tx2gene.tsv", col_names = c("transcript_ID","gene_ID","gene_symbol"))
 files <- file.path(dir, meta$sample, "quant.sf")
 names(files) <- meta$sample
 txi <- tximport(files, type="salmon", tx2gene=tx2gene, countsFromAbundance = "lengthScaledTPM", ignoreTxVersion	= TRUE)
 dds <- DESeqDataSetFromTximport(txi,
                                    colData = meta %>% column_to_rownames("sample"), 
                               design = ~ condition)
-
-keep <- rowSums(counts(dds)) >= 10
+keep <- rowSums(counts(dds)) > 0
 dds <- dds[keep,]
 
 dds <- DESeq(dds)
@@ -134,7 +133,7 @@ head(gseaKEGG_results)
 
 ```{r}
 ## Write GSEA results to file
-write.csv(gseaKEGG_results, "../Results/gseaOE_kegg.csv", quote=F)
+write.csv(gseaKEGG_results, "/work/Intro_to_bulkRNAseq/Results/gseaOE_kegg.csv", quote=F)
 ```
 
 > ***NOTE:** We will all get different results for the GSEA because the permutations are performed using random reordering. If we would like to use the same permutations every time we run a function (i.e. we would like the same results every time we run the function), then we could call `set.seed(123456)` prior to running it. The input to `set.seed()` can be any number, but if you want the same results, you need to use the same number as input.*

diff --git a/Notebooks/09_summarized_workflow.Rmd b/Notebooks/09_summarized_workflow.Rmd
@@ -85,12 +85,12 @@ dds <- DESeqDataSetFromMatrix(countData = data %>% column_to_rownames("GeneSymbo
 Load samplesheet with all our metadata from our pipeline
 ```{r}
 # Load data, metadata and tx2gene and create a txi object
-meta <- read_csv("../Data/samplesheet.csv")
+meta <- read_csv("/work/Intro_to_bulkRNAseq/Data/samplesheet.csv")
 ```
 
 Create a list of salmon results
 ```{r}
-dir <- "/work/sequencing_data/Preprocessing_backup/results_salmon/salmon"
+dir <- "/work/Intro_to_bulkRNAseq/Data/salmon"
 tx2gene <- read_table(file.path(dir,"salmon_tx2gene.tsv"), col_names = c("transcript_ID","gene_ID","gene_symbol"))
 
 # Get all salmon results files