---
title: "How to use the functions in this package"
author: "Amrit Singh"
date: "December 3, 2016"
output:
  html_document:
    keep_md: yes
    toc: true
    toc_depth: 5
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, cache = TRUE, warning = FALSE, message = FALSE)
```
```{r loadLibraries}
## install from GitHub if needed:
# library(devtools)
# install_github("singha53/amritr")
library(amritr)
library(mixOmics)
library(dplyr); library(tidyr)
```
# biomarkerPipeline()
## y ~ X, y = binary response, X = n x p dataset
### Description of classification algorithms
* Enet panels (alpha = 0 to 0.9 in steps of 0.1), with and without a p-value filter (topranked = number of top-ranked significant features to use); a sketch of this filter follows the list
* random forest panels, with and without a p-value filter (topranked = number of top-ranked features to use)
* support vector machine panels, with and without a p-value filter (topranked = number of top-ranked features to use)
* single (p) biomarkers (one glm model per feature)
* pathway biomarkers based on gene sets from BioCarta_2016, KEGG_2016, Reactome_2016, and WikiPathways_2016 (Enrichr); each panel is one gene set from a given database
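The p-value filter referenced above can be sketched as follows; `pvalFilter` is an illustrative helper (not a function exported by amritr), and the filter used inside `biomarkerPipeline()` may rely on a different test.
```{r pvalFilterSketch}
## Minimal sketch of the p-value filter: rank features by a two-sample t-test
## against the binary response and keep the top `topranked` columns.
## (Illustration only; the internal filter of biomarkerPipeline() may differ.)
pvalFilter <- function(X, Y, topranked = 30) {
  pvals <- apply(X, 2, function(x) t.test(x ~ Y)$p.value)
  X[, order(pvals)[seq_len(min(topranked, ncol(X)))], drop = FALSE]
}
```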
### Inputs and outputs
* Input:
    + X = n x p dataset
    + y = n-vector of class labels
    + topranked = 30 (number of significant features fed into each classifier)
    + validation = "Mfold" or "loo"
    + M = 2 (number of folds in the cross-validation scheme)
    + iter = number of times to repeat the cross-validation
    + threads = number of cores, each running one cross-validation scheme
    + pathways = a list of lists, one per database; each element is a gene set consisting of gene symbols (see the toy example after this list)
* Output:
    + a data frame with the mean and SD of the CV-AUC, the Panel_label, the Genes within each panel, and the Type of panel (Enet, RF, SVM, GLM, Pathways: BioCarta, KEGG, Reactome, WikiPathways)
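To make the `pathways` argument concrete, here is a toy object with the same list-of-lists layout; the gene sets below are made up for illustration, while the packaged `data("pathways")` object used in the next chunk follows this structure.
```{r pathwaysToy}
## Toy example of the expected pathways structure: one list per database,
## each element a gene set (character vector of gene symbols).
toyPathways <- list(
  KEGG_2016 = list(
    `p53 signaling pathway` = c("TP53", "MDM2", "CDKN1A"),
    `Cell cycle` = c("CCND1", "CDK4", "RB1")
  ),
  Reactome_2016 = list(
    Apoptosis = c("CASP3", "CASP9", "BAX")
  )
)
str(toyPathways, max.level = 2)
```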
```{r biomarkerPipeline}
data("pathways"); data("breast.TCGA")
Y = breast.TCGA$data.train$subtype
names(Y) <- rownames(breast.TCGA$data.train$mrna)
Y = c(Y[Y == "Basal"][1:10], Y[Y == "Her2"][1:10])
Y[Y == 1] <- "Basal"
Y[Y == 2] <- "Her2"
Y <- factor(Y)
set.seed(123)
X = breast.TCGA$data.train$mrna[names(Y), sample(1:200, 30)]
## run biomarker pipeline for a binary response
allPanels <- amritr::biomarkerPipeline(X = X, Y = Y, topranked = 30, validation = "Mfold", M = 2,
iter = 2, threads = 2, progressBar = TRUE, pathways = pathways)
allPanels %>% arrange(desc(Mean)) %>% head
```
### Plot AUC (Mean +/- SD) of all panels
```{r, fig.height=5, fig.width=8}
allPanels %>% arrange(Mean) %>% mutate(Panel = 1:nrow(.)) %>%
ggplot(aes(x = Panel, y = Mean, color = Type)) + geom_point() +
geom_errorbar(aes(ymin = Mean-SD, ymax = Mean+SD)) +
customTheme(sizeStripFont = 10, xAngle = 0, hjust = 0.5, vjust = 0.5,
xSize = 10, ySize = 10, xAxisSize = 10, yAxisSize = 10) +
ylab("AUC - 2x2-fold CV") + xlab("Panels") +
geom_hline(yintercept = 0.5, linetype = "dashed")
```
### Plot best performing panel
```{r}
(topPanel <- allPanels %>% arrange(desc(Mean)) %>% slice(1))
## fit a PLS-DA to the genes of the top-ranked panel and plot the component scores
pcaX <- plsda(X[, unlist(strsplit(topPanel$Genes, "_"))], Y)
plotIndiv(pcaX, group = Y, ellipse = TRUE, star = TRUE)
```
# Integrative methods (Concatenation, Ensemble and DIABLO)
## Concatenation-Enet biomarker panel
### Description of algorithm
* all X datasets are combined column-wise and a single-dataset classifier (Enet) is applied to the concatenated matrix; see the toy sketch below
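As a toy illustration of this step (made-up data; the real blocks are concatenated the same way in the chunk that follows):
```{r concatToy}
## Two blocks measured on the same samples are bound column-wise:
## the number of rows (samples) stays fixed and the feature columns are appended.
blockA <- matrix(rnorm(4 * 3), nrow = 4,
                 dimnames = list(paste0("sample", 1:4), paste0("gene", 1:3)))
blockB <- matrix(rnorm(4 * 2), nrow = 4,
                 dimnames = list(paste0("sample", 1:4), paste0("mir", 1:2)))
Xcat <- do.call(cbind, list(mrna = blockA, mirna = blockB))
dim(Xcat)  # 4 samples x 5 concatenated features
```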
```{r}
X.train <- breast.TCGA$data.train[c("mrna", "mirna")]
Y.train <- breast.TCGA$data.train$subtype
X.test <- breast.TCGA$data.test[c("mrna", "mirna")]
Y.test <- breast.TCGA$data.test$subtype
M = 2; iter = 2; cpus = 2;
## Build panel and estimate test performance
result <- amritr::enet(X = do.call(cbind, X.train), Y = Y.train, alpha=1, family="multinomial", lambda = NULL,
X.test = do.call(cbind, X.test), Y.test = Y.test, filter = "none", topranked = 50)
lapply(X.train, function(i){
intersect(colnames(i), result$enet.panel)
})
## Estimate panel performance using cross-validation
cv <- amritr::perf.enet(result, validation = "Mfold", M = M, iter = iter, threads = cpus, progressBar = FALSE)
concat_enetErrTrain_tuneConcat <- filter(cv$perf, ErrName == "BER")[-1]
concat_enetErrTest_tuneConcat <- c(result$perfTest["BER"], NA)
names(concat_enetErrTest_tuneConcat) <- names(concat_enetErrTrain_tuneConcat)
## Concatenation panel error rate
rbind(concat_enetErrTrain_tuneConcat, concat_enetErrTest_tuneConcat) %>%
mutate(Set = c("Train", "Test"))
```
## Ensemble classifier
### Description of classifier
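* a separate Enet panel is built on each dataset (one element of `alphaList`/`lambdaList` per block), and the per-dataset predictions are then combined into a single ensemble call (e.g. by majority vote); the sketch below only illustrates the voting idea, see `ensembleEnet()` and `perfEnsemble()` for the actual implementation
```{r majorityVoteSketch}
## Hedged sketch of combining per-block class votes by majority vote
## (illustration only; not the amritr implementation).
majorityVote <- function(votes) {
  ## votes: samples x blocks matrix of predicted class labels
  apply(votes, 1, function(z) names(which.max(table(z))))
}
majorityVote(cbind(mrna  = c("Basal", "Her2", "Her2"),
                   mirna = c("Basal", "Basal", "Her2")))
```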
```{r}
alphaList = list(1, 1)
lambdaList = list(NULL, NULL)
## Build panel and estimate test performance
ensembleMod <- ensembleEnet(X.train, Y.train, alphaList = alphaList, lambdaList = lambdaList, X.test = X.test, Y.test = Y.test, filter = "none", topranked = 50)
ensembleResult <- ensembleMod$result %>% zip_nPure()
ensemblePanel <- ensembleResult$enet.panel
ensemblePanel
## Estimate panel performance using cross-validation
ensembleTrain <- perfEnsemble(object=ensembleMod, validation = "Mfold", M = M, iter = iter, threads = cpus, progressBar = TRUE)
ensemble_enetLength <- lapply(ensemblePanel, length)
ensemble_enetErrTrain_tuneEnsemble <- filter(ensembleTrain$perf, ErrName == "BER")[-1]
ensemble_enetErrTest_tuneEnsemble <- c(ensembleMod$perfTest["BER"], NA)
names(ensemble_enetErrTest_tuneEnsemble) <- names(ensemble_enetErrTrain_tuneEnsemble)
## Ensemble panel error rate
rbind(ensemble_enetErrTrain_tuneEnsemble, ensemble_enetErrTest_tuneEnsemble) %>%
mutate(Set = c("Train", "Test"))
```
## DIABLO classifier
### Description of classifier
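* DIABLO (`block.splsda` in mixOmics) fits a sparse PLS-DA jointly across the datasets: a design matrix specifies how strongly each pair of blocks is connected (how much their latent components are made to covary), and `keepX` controls how many features are selected per block and per component

A hedged sketch of what a two-block design matrix looks like (blocks x blocks, zero diagonal, off-diagonal connection weights between 0 and 1); `setDesign()` in the chunk below builds these weights from between-block correlations (the `corCutOff` argument) rather than fixing them:
```{r designSketch}
## Illustrative full-connection design matrix for the two training blocks
designSketch <- matrix(c(0, 1,
                         1, 0),
                       nrow = 2, byrow = TRUE,
                       dimnames = list(c("mrna", "mirna"), c("mrna", "mirna")))
designSketch
```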
#### tune DIABLO classifier
```{r}
design <- setDesign(X.train, corCutOff = 0.6, plotMat = FALSE)
ncomp <- nlevels(Y.train) - 1
list.keepX <- lapply(ensemble_enetLength, function(i){
rep(round(i/ncomp, 0), ncomp)
})
TCGA.block.splsda = block.splsda(X = X.train, Y = Y.train,
ncomp = ncomp, keepX = list.keepX, design = design,
scheme = "centroid")
diabloFeat <- lapply(TCGA.block.splsda$loadings[-(length(X.train)+1)], function(x)
unique(as.character(as.matrix(apply(x, 2, function(i) names(i)[which(i != 0)])))))
diabloFeat
## training error
cv <- perf(TCGA.block.splsda, validation = "Mfold", folds = M, cpus = cpus, nrepeat = iter)
err <- extractErr(cv)
err$Comp <- factor(err$Comp, levels = paste("comp", 1:ncomp, sep = "."))
diablo_enetErrTrain <- err %>% filter(Type == "centroids.dist", Class == "Overall.BER",
EnsembleMode == "wMajVote", Dataset == "DIABLO", Comp == paste("comp", ncomp, sep = "."))
diablo_enetErrTrain <- diablo_enetErrTrain[c("meanErr", "sdErr")]
## test error
diabloTest <- predict(TCGA.block.splsda, X.test, method = "all")
diabloTestConsensus <- lapply(diabloTest$WeightedVote, function(i){
predY <- apply(i, 2, function(z){
temp <- table(factor(z, levels = levels(Y.test)), Y.test)
diag(temp) <- 0
error = c(colSums(temp)/summary(Y.test), sum(temp)/length(Y.test), mean(colSums(temp)/summary(Y.test)))
names(error) <- c(names(error)[1:nlevels(Y.test)], "ER", "BER")
error
})
})
diablo_enetErrTest <- c(diabloTestConsensus$centroids.dist["BER", paste("comp", ncomp, sep = "")], NA)
names(diablo_enetErrTest) <- names(diablo_enetErrTrain)
## DIABLO panel error rate
rbind(diablo_enetErrTrain, diablo_enetErrTest) %>%
mutate(Set = c("Train", "Test"))
```
## Overlap between panels
### Ensemble and DIABLO
```{r}
datList = list(ensemble = as.character(unlist(ensemblePanel)), diablo=as.character(unlist(diabloFeat)))
amritr::venndiagram(datList = datList, circleNames = c("Ensemble", "DIABLO"))
```