-
Notifications
You must be signed in to change notification settings - Fork 0
/
scRNAanalysisworkflow.R
119 lines (99 loc) · 4.19 KB
/
scRNAanalysisworkflow.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#scRNA-seq analysis workflow
# Before starting the analysis itself, some comments on experimental design may
# be helpful. The most obvious question is the choice of technology, which can
# be roughly divided into:
#
# Droplet-based: 10X Genomics, inDrop, Drop-seq
# Plate-based with unique molecular identifiers (UMIs): CEL-seq, MARS-seq
# Plate-based with reads: Smart-seq2
# Other: sci-RNA-seq, Seq-Well
# sequencing data from scRNA-seq experiments must be converted into a matrix of
# expression values that can be used for statistical analysis
# discrete data => Unique molecular identifiers or reads mapped to each gene in
# each cell.
# after quantification the count matrix is imported into R and converted to a
# SingleCellExperiment object.
# Data processing and downstream analysis:
# 1. Compute quality control metrics to remove low quality cells
# => Total counts per cell, proportion of spike-in or mitochondrial reads and
# number of detected features.
# 2. Convert counts into normalized expression values to eliminate cell specific
# biases like in capture efficiency. This allows for explicit comparisons across
# cells in downstream steps like clustering.
# Also apply a transformation (typically log transformation) to adjust for the
# mean-variance relationship.
# 3. Perform feature selection to pick a subset of interesting features for
# downstream analysis.
# => Model Variance across cells for each gene and retain genes that are highly
# variable. The aim is to reduce computational overhead and noise from
# uninteresting genes.
# 4. Apply dimensionality reduction to compact the data and further reduce noise.
# PCA is usually used initially followed by more aggressive methods like tSNE
# for visualization.
# 5. Cluster cells into groups according to similarities in their normalized
# expression profiles. This aims to obtain groupings that serve as empirical
# proxies for distinct biological states.
# Example: Dataset from Macosko et al 2015
BiocManager::install("SingleCellExperiment")
library(SingleCellExperiment)
BiocManager::install("scRNAseq")
library(scRNAseq)
# this will download a bunch of data, create a new directory if possible
sce <- MacoskoRetinaData()
# Quality control.
BiocManager::install("scater")
library(scater)
is.mito <- grepl("^MT-", rownames(sce))
qcstats <- perCellQCMetrics(sce, subsets=list(Mito=is.mito))
filtered <- quickPerCellQC(qcstats, percent_subsets="subsets_Mito_percent")
sce <- sce[, !filtered$discard]
# Normalization.
sce <- logNormCounts(sce)
# Feature selection.
library(scran)
dec <- modelGeneVar(sce)
hvg <- getTopHVGs(dec, prop=0.1)
# Dimensionality reduction.
set.seed(1234)
sce <- runPCA(sce, ncomponents=25, subset_row=hvg)
sce <- runUMAP(sce, dimred = 'PCA', external_neighbors=TRUE)
plotUMAP(sce, colour_by = "cluster")
# Clustering.
g <- buildSNNGraph(sce, use.dimred = 'PCA')
colData(sce)$clust <- factor(igraph::cluster_louvain(g)$membership)
plotUMAP(sce, colour_by = "clust")
plotPCA(sce, colour_by = "clust")
# colnames(sce) <- factor(igraph::cluster_louvain(g)$membership)
# does not work as of yet
# Visualization.
plotUMAP(sce, colour_by="cell.id")
plotPCA(sce, colour_by = "label")
# installing a mirror that exports the function colLabels
devtools::install_github("drisso/SingleCellExperiment")
# fails
# need to update to R version 3.6.3
# BiocManager::install("SingleCellExperiment")
library(SingleCellExperiment)
library(installr)
installr::updateR()
devtools::install_github("drisso/SingleCellExperiment")
library(SingleCellExperiment)
sce
# lets try the coloring example from plotUMAP
example_sce <- mockSCE()
example_sce <- logNormCounts(example_sce)
example_sce <- runPCA(example_sce)
plotPCA(example_sce)
plotPCA(example_sce, colour_by = "Mutation_Status")
plotPCA(example_sce, colour_by = "Cell_Cycle", shape_by = "Treatment",
size_by = "Mutation_Status")
plotPCA(example_sce, ncomponents = 4, colour_by = "Treatment",
shape_by = "Mutation_Status")
example_sce <- runTSNE(example_sce)
plotTSNE(example_sce, run_args=list(perplexity = 10))
## Same for DiffusionMaps:
example_sce <- runDiffusionMap(example_sce)
plotDiffusionMap(example_sce)
## Same for MDS plots:
example_sce <- runMDS(example_sce)
plotMDS(example_sce)