-
Notifications
You must be signed in to change notification settings - Fork 0
/
featureselection.R
227 lines (191 loc) · 9.96 KB
/
featureselection.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
# feature selection
# scRNA-seq data is often used in exploratory analyses to characterize heterogenity
# across cells.
# clustering and dim reduction compares cels based on their gene expression profiles
# which involves aggregating per gene differences into a single (dis)similarity metric
# between a pair of cells.
# => we want to select genes that contain useful information about the biology of
# the system while removeing the genes that contain random noise.
# => preserve interesting biological structure without the variance that obscures
# that structure
# also reduces size of the data set and thus improves computational efficiency in
# later steps
# => simplest approach is to select the most variable genes based on their
# expression across the population. This assumes that genuine biological differences
# will manifest as increased variation in the affected genes, compared to other
# genes that are only affected by technical noise or a baseline level of uninteresting
# biological variation.
# => Highly variable genes (HVGs)
# data import: pbmc dataset
#--- loading ---#
library(BiocFileCache)
bfc <- BiocFileCache("raw_data", ask = FALSE)
raw.path <- bfcrpath(bfc, file.path("http://cf.10xgenomics.com/samples",
"cell-exp/2.1.0/pbmc4k/pbmc4k_raw_gene_bc_matrices.tar.gz"))
untar(raw.path, exdir=file.path(tempdir(), "pbmc4k"))
library(DropletUtils)
fname <- file.path(tempdir(), "pbmc4k/raw_gene_bc_matrices/GRCh38")
sce.pbmc <- read10xCounts(fname, col.names=TRUE)
#--- gene-annotation ---#
library(scater)
rownames(sce.pbmc) <- uniquifyFeatureNames(
rowData(sce.pbmc)$ID, rowData(sce.pbmc)$Symbol)
# BiocManager::install("EnsDb.Hsapiens.v86")
library(EnsDb.Hsapiens.v86)
location <- mapIds(EnsDb.Hsapiens.v86, keys=rowData(sce.pbmc)$ID,
column="SEQNAME", keytype="GENEID")
#--- cell-detection ---#
set.seed(100)
e.out <- emptyDrops(counts(sce.pbmc))
sce.pbmc <- sce.pbmc[,which(e.out$FDR <= 0.001)]
#--- quality-control ---#
stats <- perCellQCMetrics(sce.pbmc, subsets=list(Mito=which(location=="MT")))
high.mito <- isOutlier(stats$subsets_Mito_percent, type="higher")
sce.pbmc <- sce.pbmc[,!high.mito]
#--- normalization ---#
library(scran)
set.seed(1000)
clusters <- quickCluster(sce.pbmc)
sce.pbmc <- computeSumFactors(sce.pbmc, cluster=clusters)
sce.pbmc <- logNormCounts(sce.pbmc)
# data import: 416b dataset
#--- loading ---#
library(scRNAseq)
sce.416b <- LunSpikeInData(which="416b")
sce.416b$block <- factor(sce.416b$block)
#--- gene-annotation ---#
library(AnnotationHub)
ens.mm.v97 <- AnnotationHub()[["AH73905"]]
rowData(sce.416b)$ENSEMBL <- rownames(sce.416b)
rowData(sce.416b)$SYMBOL <- mapIds(ens.mm.v97, keys=rownames(sce.416b),
keytype="GENEID", column="SYMBOL")
rowData(sce.416b)$SEQNAME <- mapIds(ens.mm.v97, keys=rownames(sce.416b),
keytype="GENEID", column="SEQNAME")
library(scater)
rownames(sce.416b) <- uniquifyFeatureNames(rowData(sce.416b)$ENSEMBL,
rowData(sce.416b)$SYMBOL)
#--- quality-control ---#
mito <- which(rowData(sce.416b)$SEQNAME=="MT")
stats <- perCellQCMetrics(sce.416b, subsets=list(Mt=mito))
qc <- quickPerCellQC(stats, percent_subsets=c("subsets_Mt_percent",
"altexps_ERCC_percent"), batch=sce.416b$block)
sce.416b <- sce.416b[,!qc$discard]
#--- normalization ---#
library(scran)
sce.416b <- computeSumFactors(sce.416b)
sce.416b <- logNormCounts(sce.416b)
# variance of the log-counts
# simplest approach to quantifying per gene variation
# compute variance of log normalized expression values
# this has the advantage of basing the feature selection on the same log values
# that are used for later steps. In particular, genes with the largest variances
# in log values will contribute the most to the euclidian distances between cells.
# simply calculating the per-gene variance is not enough, feature selection
# requires modeling of the mean-variance relationship.
# The log transformation does not achieve perfecet variance stabilization, which
# means that the variance of a gene is driven more by its abundance than its
# underlying biological heterogenity. To account fo this effect we use modelGeneVar()
# to fit a trend to the variance with respect to abundance across all genes.
library(scran)
dec.pbmc <- modelGeneVar(sce.pbmc)
# Visualizing the fit:
fit.pbmc <- metadata(dec.pbmc)
plot(fit.pbmc$mean, fit.pbmc$var, xlab="Mean of log-expression",
ylab="Variance of log-expression")
curve(fit.pbmc$trend(x), col="dodgerblue", add=TRUE, lwd=2)
# we assume that the expression proviles of most genes are dominated by random
# technical noise. Under this assumption our trend represents an estimate of the
# technical noise as a function of abundance.
# We then break down the total variance of each gene into the technical component
# => the fitted value of the trend at that genes abundance
# and the biological component
# => difference between total variance and technical component.
# => this biological component represents the interesting variation for each gene
# and is used as the metric for high variance gene selection.
# Ordering by most interesting genes for inspection.
dec.pbmc[order(dec.pbmc$bio, decreasing=TRUE),]
# alternative approach to quantification uses squared coeff of variation
# of normalized expression values prior to log transform
dec.cv2.pbmc <- modelGeneCV2(sce.pbmc)
# this allows us the model the mean variance relationship when considering the
# relevance of each gene
# => core assumption is that most genes contain random noise and that the trend
# captures mostly technical variation.
# => large cv^2 values that deviate from the trend are likely to represent genes
# affected by biological structure.
fit.cv2.pbmc <- metadata(dec.cv2.pbmc)
plot(fit.cv2.pbmc$mean, fit.cv2.pbmc$cv2, log="xy")
curve(fit.cv2.pbmc$trend(x), col="dodgerblue", add=TRUE, lwd=2)
# for each gene we quantify the deviation from the trend in terms of the ratio
# of its cv^2 to the fitted value of trend at its abundance
# this is more appropriate than directly subtracting the trend from the cv^2, as
# the magnitude of the ratio is not affected by the mean.
dec.cv2.pbmc[order(dec.cv2.pbmc$ratio, decreasing=TRUE),]
# both cv^2 and the variance of log counts are effective metrics for quantifying
# variation in gene expression.
# cv^2 tends to give higher rank to low abundance hvgs driven by upregulation in
# rare subpopulations for which increase in variance on the raw scale is stronger
# than that on the log scale
# however cv2 variance is less directly related to downstream procedures operating
# on the log counts and the reliance on the ratio can assign high rank to
# uninteresting genes with low absolute variance.
# quantifying technical noise:
# strictly speaking the noise we are not interested in is a compound of technical
# noise and "uninteresting" biological noise caused by events like
# transcriptional bursting
# this revised assumption is generally reasonable, but may be problematic in some
# scenarios where many genes at a particular abundance are affected by a
# biological process.
# => strong upregulation may result in an enrichment of high variance genes at
# high abundances
# => skews the fitted trend in abundance interval and compromises detection of
# relevant genes.
# avoid this problem by fitting a mean dependent trend to the variance of the
# spike in transcripts if they are available
# spike ins should not be affected by biological variation
dec.spike.416b <- modelGeneVarWithSpikes(sce.416b, "ERCC")
dec.spike.416b[order(dec.spike.416b$bio, decreasing=TRUE),]
plot(dec.spike.416b$mean, dec.spike.416b$total, xlab="Mean of log-expression",
ylab="Variance of log-expression")
fit.spike.416b <- metadata(dec.spike.416b)
points(fit.spike.416b$mean, fit.spike.416b$var, col="red", pch=16)
curve(fit.spike.416b$trend(x), col="dodgerblue", add=TRUE, lwd=2)
# if spike in data is not available one can make a few distributional assumptions about
# the data
# for example, unique molecular identifiers usually exhibit poisson variation
set.seed(0010101)
dec.pois.pbmc <- modelGeneVarByPoisson(sce.pbmc)
dec.pois.pbmc <- dec.pois.pbmc[order(dec.pois.pbmc$bio, decreasing=TRUE),]
head(dec.pois.pbmc)
plot(dec.pois.pbmc$mean, dec.pois.pbmc$total, pch=16, xlab="Mean of log-expression",
ylab="Variance of log-expression")
curve(metadata(dec.pois.pbmc)$trend(x), col="dodgerblue", add=TRUE)
# trends based purely on technical noise tend to yield large biological components
# for highly expressed genes.
# this often includes "house-keeping" genes for essential cellular components
# such as ribosomal proteins which are usually considered to be uninteresting for
# characterizing cellular heterogenity.
# These observations suggest that a more accurate noise model does not necessarily
# yield a better ranking of high variance genes.
### controlling for batch effects
# use modelgenevarwithspikes function to treat every plate (block) in the 416b
# data as a different batch
dec.block.416b <- modelGeneVarWithSpikes(sce.416b, "ERCC", block=sce.416b$block)
head(dec.block.416b[order(dec.block.416b$bio, decreasing=TRUE),1:6])
# take advantage of the information provided in every batch
par(mfrow=c(1,2))
blocked.stats <- dec.block.416b$per.block
for (i in colnames(blocked.stats)) {
current <- blocked.stats[[i]]
plot(current$mean, current$total, main=i, pch=16, cex=0.5,
xlab="Mean of log-expression", ylab="Variance of log-expression")
curfit <- metadata(current)
points(curfit$mean, curfit$var, col="red", pch=16)
curve(curfit$trend(x), col='dodgerblue', add=TRUE, lwd=2)
}
# one may also use a design matrix to block specific genes like so
design <- model.matrix(~factor(block) + phenotype, colData(sce.416b))
dec.design.416b <- modelGeneVarWithSpikes(sce.416b, "ERCC", design=design)
dec.design.416b[order(dec.design.416b$bio, decreasing=TRUE),]
# in the end we always select by a given criterium
# and proceed with care