---
title: "How to use the functions in this package"
author: "Amrit Singh"
date: "December 3, 2016"
output:
  html_document:
    keep_md: yes
    toc: true
    toc_depth: 5
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, cache = TRUE, warning = FALSE, message = FALSE)
```
```{r loadLibraries}
## install from GitHub if needed:
# library(devtools)
# install_github("singha53/amritr")
library(amritr)
library(mixOmics)
library(dplyr); library(tidyr)
```
# biomarkerPipeline()
## y ~ X, y = binary response, X = n x p dataset
### Description of classification algorithms
* Enet panels (alpha = 0 to 0.9 in steps of 0.1), with and without a p-value filter (topranked = number of top-ranked significant features to use); a sketch of this filter follows the list
* random forest panels, with and without a p-value filter (topranked = number of top-ranked features to use)
* support vector machine panels, with and without a p-value filter (topranked = number of top-ranked features to use)
* single (p) biomarkers (one glm model per feature)
* pathway biomarkers based on gene sets from BioCarta_2016, KEGG_2016, Reactome_2016, and WikiPathways_2016 (Enrichr); each panel is one gene set from a given database
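The p-value filter referenced above can be sketched as follows; `pvalFilter` is an illustrative helper (not a function exported by amritr), and the filter used inside `biomarkerPipeline()` may rely on a different test.
```{r pvalFilterSketch}
## Minimal sketch of the p-value filter: rank features by a two-sample t-test
## against the binary response and keep the top `topranked` columns.
## (Illustration only; the internal filter of biomarkerPipeline() may differ.)
pvalFilter <- function(X, Y, topranked = 30) {
  pvals <- apply(X, 2, function(x) t.test(x ~ Y)$p.value)
  X[, order(pvals)[seq_len(min(topranked, ncol(X)))], drop = FALSE]
}
```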
### Inputs and outputs
* Input:
    + X = n x p dataset
    + y = n-vector of class labels
    + topranked = 30 (number of significant features fed into each classifier)
    + validation = "Mfold" or "loo"
    + M = 2 (number of folds in the cross-validation scheme)
    + iter = number of times to repeat the cross-validation
    + threads = number of cores, each running one cross-validation scheme
    + pathways = a list of lists, one per database; each element is a gene set consisting of gene symbols (see the toy example after this list)
* Output:
    + a data frame with the mean and SD of the CV-AUC, the Panel_label, the Genes within each panel, and the Type of panel (Enet, RF, SVM, GLM, Pathways: BioCarta, KEGG, Reactome, WikiPathways)
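To make the `pathways` argument concrete, here is a toy object with the same list-of-lists layout; the gene sets below are made up for illustration, while the packaged `data("pathways")` object used in the next chunk follows this structure.
```{r pathwaysToy}
## Toy example of the expected pathways structure: one list per database,
## each element a gene set (character vector of gene symbols).
toyPathways <- list(
  KEGG_2016 = list(
    `p53 signaling pathway` = c("TP53", "MDM2", "CDKN1A"),
    `Cell cycle` = c("CCND1", "CDK4", "RB1")
  ),
  Reactome_2016 = list(
    Apoptosis = c("CASP3", "CASP9", "BAX")
  )
)
str(toyPathways, max.level = 2)
```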
```{r biomarkerPipeline}
data("pathways"); data("breast.TCGA")
Y = breast.TCGA$data.train$subtype
names(Y) <- rownames(breast.TCGA$data.train$mrna)
Y = c(Y[Y == "Basal"][1:10], Y[Y == "Her2"][1:10])
Y[Y == 1] <- "Basal"
Y[Y == 2] <- "Her2"
Y <- factor(Y)
set.seed(123)
X = breast.TCGA$data.train$mrna[names(Y), sample(1:200, 30)]
## run biomarker pipeline for a binary response
allPanels <- amritr::biomarkerPipeline(X = X, Y = Y, topranked = 30, validation = "Mfold", M = 2,
iter = 2, threads = 2, progressBar = TRUE, pathways = pathways)
allPanels %>% arrange(desc(Mean)) %>% head
```
### Plot AUC (Mean +/- SD) of all panels
```{r, fig.height=5, fig.width=8}
allPanels %>% arrange(Mean) %>% mutate(Panel = 1:nrow(.)) %>%
ggplot(aes(x = Panel, y = Mean, color = Type)) + geom_point() +
geom_errorbar(aes(ymin = Mean-SD, ymax = Mean+SD)) +
customTheme(sizeStripFont = 10, xAngle = 0, hjust = 0.5, vjust = 0.5,
xSize = 10, ySize = 10, xAxisSize = 10, yAxisSize = 10) +
ylab("AUC - 2x2-fold CV") + xlab("Panels") +
geom_hline(yintercept = 0.5, linetype = "dashed")
```
### Plot best performing panel
```{r}
(topPanel <- allPanels %>% arrange(desc(Mean)) %>% slice(1))
## fit a PLS-DA to the genes of the top-ranked panel and plot the component scores
pcaX <- plsda(X[, unlist(strsplit(topPanel$Genes, "_"))], Y)
plotIndiv(pcaX, group = Y, ellipse = TRUE, star = TRUE)
```
# Integrative methods (Concatenation, Ensemble and DIABLO)
## Concatenation-Enet biomarker panel
### Description of algorithm
* all X datasets are combined column-wise and a single-dataset classifier (Enet) is applied to the concatenated matrix; see the toy sketch below
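As a toy illustration of this step (made-up data; the real blocks are concatenated the same way in the chunk that follows):
```{r concatToy}
## Two blocks measured on the same samples are bound column-wise:
## the number of rows (samples) stays fixed and the feature columns are appended.
blockA <- matrix(rnorm(4 * 3), nrow = 4,
                 dimnames = list(paste0("sample", 1:4), paste0("gene", 1:3)))
blockB <- matrix(rnorm(4 * 2), nrow = 4,
                 dimnames = list(paste0("sample", 1:4), paste0("mir", 1:2)))
Xcat <- do.call(cbind, list(mrna = blockA, mirna = blockB))
dim(Xcat)  # 4 samples x 5 concatenated features
```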
```{r}
X.train <- breast.TCGA$data.train[c("mrna", "mirna")]
Y.train <- breast.TCGA$data.train$subtype
X.test <- breast.TCGA$data.test[c("mrna", "mirna")]
Y.test <- breast.TCGA$data.test$subtype
M = 2; iter = 2; cpus = 2;
## Build panel and estimate test performance
result <- amritr::enet(X = do.call(cbind, X.train), Y = Y.train, alpha=1, family="multinomial", lambda = NULL,
X.test = do.call(cbind, X.test), Y.test = Y.test, filter = "none", topranked = 50)
lapply(X.train, function(i){
intersect(colnames(i), result$enet.panel)
})
## Estimate panel performance using cross-validation
cv <- amritr::perf.enet(result, validation = "Mfold", M = M, iter = iter, threads = cpus, progressBar = FALSE)
concat_enetErrTrain_tuneConcat <- filter(cv$perf, ErrName == "BER")[-1]
concat_enetErrTest_tuneConcat <- c(result$perfTest["BER"], NA)
names(concat_enetErrTest_tuneConcat) <- names(concat_enetErrTrain_tuneConcat)
## Concatenation panel error rate
rbind(concat_enetErrTrain_tuneConcat, concat_enetErrTest_tuneConcat) %>%
mutate(Set = c("Train", "Test"))
```
## Ensemble classifier
### Description of classifier
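* a separate Enet panel is built on each dataset (one element of `alphaList`/`lambdaList` per block), and the per-dataset predictions are then combined into a single ensemble call (e.g. by majority vote); the sketch below only illustrates the voting idea, see `ensembleEnet()` and `perfEnsemble()` for the actual implementation
```{r majorityVoteSketch}
## Hedged sketch of combining per-block class votes by majority vote
## (illustration only; not the amritr implementation).
majorityVote <- function(votes) {
  ## votes: samples x blocks matrix of predicted class labels
  apply(votes, 1, function(z) names(which.max(table(z))))
}
majorityVote(cbind(mrna  = c("Basal", "Her2", "Her2"),
                   mirna = c("Basal", "Basal", "Her2")))
```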
```{r}
alphaList = list(1, 1)
lambdaList = list(NULL, NULL)
## Build panel and estimate test performance
ensembleMod <- ensembleEnet(X.train, Y.train, alphaList = alphaList, lambdaList = lambdaList, X.test = X.test, Y.test = Y.test, filter = "none", topranked = 50)
ensembleResult <- ensembleMod$result %>% zip_nPure()
ensemblePanel <- ensembleResult$enet.panel
ensemblePanel
## Estimate panel performance using cross-validation
ensembleTrain <- perfEnsemble(object=ensembleMod, validation = "Mfold", M = M, iter = iter, threads = cpus, progressBar = TRUE)
ensemble_enetLength <- lapply(ensemblePanel, length)
ensemble_enetErrTrain_tuneEnsemble <- filter(ensembleTrain$perf, ErrName == "BER")[-1]
ensemble_enetErrTest_tuneEnsemble <- c(ensembleMod$perfTest["BER"], NA)
names(ensemble_enetErrTest_tuneEnsemble) <- names(ensemble_enetErrTrain_tuneEnsemble)
## Ensemble panel error rate
rbind(ensemble_enetErrTrain_tuneEnsemble, ensemble_enetErrTest_tuneEnsemble) %>%
mutate(Set = c("Train", "Test"))
```
## DIABLO classifier
### Description of classifier
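* DIABLO (`block.splsda` in mixOmics) fits a sparse PLS-DA jointly across the datasets: a design matrix specifies how strongly each pair of blocks is connected (how much their latent components are made to covary), and `keepX` controls how many features are selected per block and per component

A hedged sketch of what a two-block design matrix looks like (blocks x blocks, zero diagonal, off-diagonal connection weights between 0 and 1); `setDesign()` in the chunk below builds these weights from between-block correlations (the `corCutOff` argument) rather than fixing them:
```{r designSketch}
## Illustrative full-connection design matrix for the two training blocks
designSketch <- matrix(c(0, 1,
                         1, 0),
                       nrow = 2, byrow = TRUE,
                       dimnames = list(c("mrna", "mirna"), c("mrna", "mirna")))
designSketch
```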
#### tune DIABLO classifier
```{r}
design <- setDesign(X.train, corCutOff = 0.6, plotMat = FALSE)
ncomp <- nlevels(Y.train) - 1
list.keepX <- lapply(ensemble_enetLength, function(i){
rep(round(i/ncomp, 0), ncomp)
})
TCGA.block.splsda = block.splsda(X = X.train, Y = Y.train,
ncomp = ncomp, keepX = list.keepX, design = design,
scheme = "centroid")
diabloFeat <- lapply(TCGA.block.splsda$loadings[-(length(X.train)+1)], function(x)
unique(as.character(as.matrix(apply(x, 2, function(i) names(i)[which(i != 0)])))))
diabloFeat
## training error
cv <- perf(TCGA.block.splsda, validation = "Mfold", folds = M, cpus = cpus, nrepeat = iter)
err <- extractErr(cv)
err$Comp <- factor(err$Comp, levels = paste("comp", 1:ncomp, sep = "."))
diablo_enetErrTrain <- err %>% filter(Type == "centroids.dist", Class == "Overall.BER",
EnsembleMode == "wMajVote", Dataset == "DIABLO", Comp == paste("comp", ncomp, sep = "."))
diablo_enetErrTrain <- diablo_enetErrTrain[c("meanErr", "sdErr")]
## test error
diabloTest <- predict(TCGA.block.splsda, X.test, method = "all")
diabloTestConsensus <- lapply(diabloTest$WeightedVote, function(i){
predY <- apply(i, 2, function(z){
temp <- table(factor(z, levels = levels(Y.test)), Y.test)
diag(temp) <- 0
error = c(colSums(temp)/summary(Y.test), sum(temp)/length(Y.test), mean(colSums(temp)/summary(Y.test)))
names(error) <- c(names(error)[1:nlevels(Y.test)], "ER", "BER")
error
})
})
diablo_enetErrTest <- c(diabloTestConsensus$centroids.dist["BER", paste("comp", ncomp, sep = "")], NA)
names(diablo_enetErrTest) <- names(diablo_enetErrTrain)
## DIABLO panel error rate
rbind(diablo_enetErrTrain, diablo_enetErrTest) %>%
mutate(Set = c("Train", "Test"))
```
## Overlap between panels
### Ensemble and DIABLO
```{r}
datList = list(ensemble = as.character(unlist(ensemblePanel)), diablo=as.character(unlist(diabloFeat)))
amritr::venndiagram(datList = datList, circleNames = c("Ensemble", "DIABLO"))
```