From 21961648479b7f1cb872da012755620dc425864c Mon Sep 17 00:00:00 2001 From: Shraddha Pai Date: Thu, 1 Feb 2018 19:27:16 -0500 Subject: [PATCH 001/124] add sparsifier test --- netDx/DESCRIPTION | 2 +- netDx/R/sparsifyNet.R | 130 ++++++++++++++++++++++-------------------- 2 files changed, 70 insertions(+), 62 deletions(-) diff --git a/netDx/DESCRIPTION b/netDx/DESCRIPTION index 47cde6b7..3c443b01 100644 --- a/netDx/DESCRIPTION +++ b/netDx/DESCRIPTION @@ -1,6 +1,6 @@ Package: netDx Title: Learns Patient Binary Classification based on Similarity Networks -Version: 1.0.2 +Version: 1.0.21 Authors@R: c(person("Shraddha", "Pai", email = "shraddha.pai@utoronto.ca", role = c("aut", "cre")), person("Ahmad","Shah", role="aut"), person("Shirley","Hui",role="aut"), diff --git a/netDx/R/sparsifyNet.R b/netDx/R/sparsifyNet.R index 6457d811..8cf4fadd 100644 --- a/netDx/R/sparsifyNet.R +++ b/netDx/R/sparsifyNet.R @@ -37,6 +37,8 @@ #' @param numPatients (integer) number of patients in the network. See #' Details. #' @param keepTies (logical) keep edge ties. See Details +#' @param useExtLib (logical) if TRUE, uses NetPreProc::Sparsify.matrix() +#' with user-supplied k; else uses the implementation in this file #' @param verbose (logical) print messages #' @return No value. Writes sparsified matrix to \code{outFile} #' @export @@ -46,7 +48,7 @@ #' x <- melt(cor(xpr)) # patient 1, patient 2, edge weight #' sparsifyNet(x,outFile="tmp.txt") sparsifyNet <- function(net,outFile,k=50L,MAX_INT=600L,MAX_PCT=0.02, - numPatients,keepTies=TRUE,verbose=TRUE){ + numPatients,keepTies=TRUE,useExtLib=FALSE,verbose=TRUE){ if (class(net)=="data.frame") { dat <- net } else if (class(net)=="character"){ @@ -55,71 +57,77 @@ if (class(net)=="data.frame") { } dat <- dat[order(dat[,1]),] -curPat <- dat[1,1] # initialize -sidx <- 1; eidx <- NA; -ctr <- 1 -newCt <- 0 -t0 <- Sys.time() -system(sprintf("cat /dev/null > %s",outFile)) -while ((ctr < nrow(dat))) { - nextPat <- dat[ctr+1,1] - - if (nextPat != curPat) { - eidx <- ctr - if (verbose) cat(sprintf("%s: %i-%i:", curPat,sidx,eidx)) - # process cur pat's interactions. write to file - totalInter <- dat[sidx:eidx,3] - names(totalInter) <- dat[sidx:eidx,2] - - if (!keepTies) { - totalInter <- totalInter[!duplicated(totalInter)] +if (useExtLib) { + browser() + +} else { + curPat <- dat[1,1] # initialize + sidx <- 1; eidx <- NA; + ctr <- 1 + + newCt <- 0 + t0 <- Sys.time() + system(sprintf("cat /dev/null > %s",outFile)) + while ((ctr < nrow(dat))) { + nextPat <- dat[ctr+1,1] + + if (nextPat != curPat) { + eidx <- ctr + if (verbose) cat(sprintf("%s: %i-%i:", curPat,sidx,eidx)) + # process cur pat's interactions. write to file + totalInter <- dat[sidx:eidx,3] + names(totalInter) <- dat[sidx:eidx,2] + + if (!keepTies) { + totalInter <- totalInter[!duplicated(totalInter)] + } + totalInter <- sort(totalInter,decreasing=TRUE) + n <- length(totalInter) + + tokeep <- max(k,min(round(MAX_PCT*numPatients),600)); + tokeep <- min(k,n); + + if (verbose) { + n1 <- length(totalInter) + cat(sprintf("%i -> %i ",n1, tokeep)) + if (tokeep < n1) cat("*** trimmed") + cat("\n") + } + + outInter <- totalInter[1:tokeep] + df <- data.frame(P1=curPat,P2=names(outInter),x=outInter) + write.table(df,file=outFile,sep="\t", + append=TRUE,col=F,row=F,quote=F) + newCt <- newCt + nrow(df) + + curPat <- nextPat; + sidx <- ctr+1 } - totalInter <- sort(totalInter,decreasing=TRUE) - n <- length(totalInter) - - tokeep <- max(k,min(round(MAX_PCT*numPatients),600)); - tokeep <- min(k,n); - - if (verbose) { - n1 <- length(totalInter) - cat(sprintf("%i -> %i ",n1, tokeep)) - if (tokeep < n1) cat("*** trimmed") - cat("\n") - } - - outInter <- totalInter[1:tokeep] - df <- data.frame(P1=curPat,P2=names(outInter),x=outInter) - write.table(df,file=outFile,sep="\t", - append=TRUE,col=F,row=F,quote=F) - newCt <- newCt + nrow(df) - - curPat <- nextPat; - sidx <- ctr+1 + ctr <- ctr+1 } - ctr <- ctr+1 -} -# last patient -eidx <- nrow(dat) -# process cur pat's interactions. write to file -totalInter <- dat[sidx:eidx,3] -names(totalInter) <- dat[sidx:eidx,2] -n1 <- length(totalInter) - -if (!keepTies) { - totalInter <- totalInter[!duplicated(totalInter)] -} -totalInter <- sort(totalInter,decreasing=TRUE) -n <- length(totalInter) - -tokeep <- max(k,min(MAX_PCT*numPatients,600)); -tokeep <- min(k,n); - -if (verbose) cat(sprintf("%i -> %i\n",n1, tokeep)) - -outInter <- totalInter[1:tokeep] + # last patient + eidx <- nrow(dat) + # process cur pat's interactions. write to file + totalInter <- dat[sidx:eidx,3] + names(totalInter) <- dat[sidx:eidx,2] + n1 <- length(totalInter) + + if (!keepTies) { + totalInter <- totalInter[!duplicated(totalInter)] + } + totalInter <- sort(totalInter,decreasing=TRUE) + n <- length(totalInter) + + tokeep <- max(k,min(MAX_PCT*numPatients,600)); + tokeep <- min(k,n); + + if (verbose) cat(sprintf("%i -> %i\n",n1, tokeep)) + + outInter <- totalInter[1:tokeep] df <- data.frame(P1=curPat,P2=names(outInter),x=outInter) write.table(df,file=outFile,sep="\t",append=TRUE,col=F,row=F,quote=F) +} cat(sprintf("Interactions trimmed from %i to %i (sparse factor= %1.2f%%)\n", nrow(dat), newCt,(newCt/nrow(dat))*100)) From d876eecab63eba3fe22c9ce2921286402313a7eb Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 6 Feb 2018 11:32:44 -0500 Subject: [PATCH 002/124] removed use of netpreproc. does filter by cutoff --- netDx/R/sparsifyNet.R | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/netDx/R/sparsifyNet.R b/netDx/R/sparsifyNet.R index 8cf4fadd..f2fef020 100644 --- a/netDx/R/sparsifyNet.R +++ b/netDx/R/sparsifyNet.R @@ -37,9 +37,8 @@ #' @param numPatients (integer) number of patients in the network. See #' Details. #' @param keepTies (logical) keep edge ties. See Details -#' @param useExtLib (logical) if TRUE, uses NetPreProc::Sparsify.matrix() -#' with user-supplied k; else uses the implementation in this file #' @param verbose (logical) print messages +#' @param cutoff (value between 0 and 1) min edge value to keep #' @return No value. Writes sparsified matrix to \code{outFile} #' @export #' @examples @@ -48,20 +47,19 @@ #' x <- melt(cor(xpr)) # patient 1, patient 2, edge weight #' sparsifyNet(x,outFile="tmp.txt") sparsifyNet <- function(net,outFile,k=50L,MAX_INT=600L,MAX_PCT=0.02, - numPatients,keepTies=TRUE,useExtLib=FALSE,verbose=TRUE){ + numPatients,keepTies=TRUE,verbose=TRUE,cutoff=0.3){ if (class(net)=="data.frame") { dat <- net } else if (class(net)=="character"){ netFile <- net dat <- read.delim(netFile,sep="\t",as.is=T,h=F) } -dat <- dat[order(dat[,1]),] + dat <- dat[order(dat[,1]),] + idx <- which(dat[,3] Date: Tue, 6 Feb 2018 11:33:04 -0500 Subject: [PATCH 003/124] added useSparsify2 option --- netDx/R/makePSN_NamedMatrix.R | 44 +++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/netDx/R/makePSN_NamedMatrix.R b/netDx/R/makePSN_NamedMatrix.R index 08d31839..0d1f6f83 100644 --- a/netDx/R/makePSN_NamedMatrix.R +++ b/netDx/R/makePSN_NamedMatrix.R @@ -33,6 +33,8 @@ #' are not included in the corresponding interaction network #' @param verbose (logical) print detailed messages #' @param numCores (integer) number of cores for parallel network generation +#' @param useExtLib (logical) if TRUE, uses NetPreProc::Sparsify.matrix() +#' with user-supplied k; else uses the implementation in this file #' @param writeProfiles (logical) use GeneMANIA's ProfileToNetworkDriver to #' create interaction networks. If TRUE, this function writes subsets #' of the original data corresponding to networks to file (profiles). @@ -53,7 +55,7 @@ makePSN_NamedMatrix <- function(xpr, nm, namedSets, outDir, simMetric="pearson", cutoff=0.3,verbose=TRUE, numCores=1L,writeProfiles=FALSE, - sparsify=FALSE,append=FALSE,...){ + sparsify=FALSE,useSparsify2=FALSE,append=FALSE,...){ if (!append) { if (file.exists(outDir)) unlink(outDir,recursive=TRUE) dir.create(outDir) @@ -67,6 +69,8 @@ makePSN_NamedMatrix <- function(xpr, nm, namedSets, outDir, if (simMetric!="pearson" & writeProfiles==TRUE) { stop("writeProfiles must only be TRUE with simMetric is set to pearson. For all other metrics, set writeProfiles=FALSE") } + + if (!sparsify & useSparsify2) { stop("if useSparsify=TRUE then sparsify must also be set to TRUE\n")} cl <- makeCluster(numCores) registerDoParallel(cl) @@ -91,26 +95,36 @@ makePSN_NamedMatrix <- function(xpr, nm, namedSets, outDir, outFile <- sprintf("%s/%s_cont.txt", outDir, curSet) sim <- getSimilarity(xpr[idx,,drop=FALSE], type=simMetric,...) - idx <- which(upper.tri(sim,diag=F)) - ij <- matrix_getIJ(dim(sim),idx) + if (!useSparsify2) {# prepare for internal sparsifier + idx <- which(upper.tri(sim,diag=F)) + ij <- matrix_getIJ(dim(sim),idx) + + # make interaction network + pat_pairs <- data.frame(p1=rownames(sim)[ij[,1]], + p2=colnames(sim)[ij[,2]], + similarity=sim[idx]) - # make interaction network - pat_pairs <- data.frame(p1=rownames(sim)[ij[,1]], - p2=colnames(sim)[ij[,2]], - similarity=sim[idx]) - - too_weak <- which(pat_pairs[,3] < cutoff | - is.na(pat_pairs[,3])) - if (any(too_weak)) { - if (verbose) - cat(sprintf("\t%i weak connections\n", - length(too_weak))) - pat_pairs <- pat_pairs[-too_weak,] + too_weak <- which(pat_pairs[,3] < cutoff | + is.na(pat_pairs[,3])) + if (any(too_weak)) { + if (verbose) + cat(sprintf("\t%i weak connections\n", + length(too_weak))) + pat_pairs <- pat_pairs[-too_weak,] + } + } else { # stick to sim matrix + pat_pairs <- sim } if (sparsify) { + if (useSparsify2) { + cat("using sparsify2\n") + sparsify2(pat_pairs,outFile) + } else { + cat("using original sparsifier method\n") sparsifyNet(pat_pairs,outFile,numPatients=nrow(sim), verbose=FALSE) + } } else { write.table(pat_pairs, file=outFile,sep="\t", col=FALSE,row=FALSE,quote=FALSE) From f49a2c01146c3c400143f6055049581277f1b1cb Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 6 Feb 2018 11:34:01 -0500 Subject: [PATCH 004/124] formatting --- netDx/R/runPredictor_nestedCV.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/netDx/R/runPredictor_nestedCV.R b/netDx/R/runPredictor_nestedCV.R index ced1bd2d..08161339 100644 --- a/netDx/R/runPredictor_nestedCV.R +++ b/netDx/R/runPredictor_nestedCV.R @@ -44,7 +44,7 @@ #' @export runPredictor_nestedCV <- function(pheno,dataList,groupList,outDir,makeNetFunc, nFoldCV=10L,trainProp=0.8,numSplits=10L,numCores,CVmemory=4L,CVcutoff=9L, - keepAllData=FALSE) { + keepAllData=FALSE) { ### tests# pheno$ID and $status must exist if (missing(dataList)) stop("dataList must be supplied.\n") From 3df7fad340abda5064ad561c6111f7f1f5fc00dd Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 6 Feb 2018 11:34:15 -0500 Subject: [PATCH 005/124] updated doc --- netDx/man/makePSN_NamedMatrix.Rd | 5 ++++- netDx/man/sparsifyNet.Rd | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/netDx/man/makePSN_NamedMatrix.Rd b/netDx/man/makePSN_NamedMatrix.Rd index 3f148cf5..f716815c 100644 --- a/netDx/man/makePSN_NamedMatrix.Rd +++ b/netDx/man/makePSN_NamedMatrix.Rd @@ -6,7 +6,7 @@ \usage{ makePSN_NamedMatrix(xpr, nm, namedSets, outDir, simMetric = "pearson", cutoff = 0.3, verbose = TRUE, numCores = 1L, writeProfiles = FALSE, - sparsify = FALSE, append = FALSE, ...) + sparsify = FALSE, useSparsify2 = FALSE, append = FALSE, ...) } \arguments{ \item{xpr}{(matrix) rows are measurements, columns are samples. Columns @@ -44,6 +44,9 @@ with default parameters. Only used when writeProfiles=FALSE} \item{append}{(logical) if TRUE does not overwrite netDir.} \item{...}{passed to \code{getSimilarity()}} + +\item{useExtLib}{(logical) if TRUE, uses NetPreProc::Sparsify.matrix() +with user-supplied k; else uses the implementation in this file} } \value{ (char) Basename of files to which networks are written. diff --git a/netDx/man/sparsifyNet.Rd b/netDx/man/sparsifyNet.Rd index e6f7f177..605972ac 100644 --- a/netDx/man/sparsifyNet.Rd +++ b/netDx/man/sparsifyNet.Rd @@ -5,7 +5,7 @@ \title{Sparsifies patient similarity network} \usage{ sparsifyNet(net, outFile, k = 50L, MAX_INT = 600L, MAX_PCT = 0.02, - numPatients = 100L, keepTies = TRUE, verbose = TRUE) + numPatients, keepTies = TRUE, verbose = TRUE) } \arguments{ \item{net}{(char or data.frame) If of type char, should path to From cc9e0565cde7560a80a641fb83bf8a4261c0e90c Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 6 Feb 2018 11:34:23 -0500 Subject: [PATCH 006/124] updated doc --- netDx/man/sparsify2.Rd | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 netDx/man/sparsify2.Rd diff --git a/netDx/man/sparsify2.Rd b/netDx/man/sparsify2.Rd new file mode 100644 index 00000000..eee93753 --- /dev/null +++ b/netDx/man/sparsify2.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sparsify2.R +\name{sparsify2} +\alias{sparsify2} +\title{cleaner sparsification routine} +\usage{ +sparsify2(W, outFile = "tmp.txt", cutoff = 0.3, maxInt = 50, + EDGE_MAX = 1000) +} +\arguments{ +\item{W}{(matrix) similarity matrix} + +\item{outFile}{(char) path to file to write sparsified network} + +\item{cutoff}{(numeric) edges with weight smaller than this are set to NA} + +\item{maxInt}{(numeric) max num edges per node.} + +\item{EDGE_MAX}{(numeric) max num edges in network} +} +\value{ +writes SIF content to text file (node1,node2,edge weight) +} +\description{ +cleaner sparsification routine +} +\details{ +Sparsifies similarity matrix to keep strongest edges. +Sets diagonal and edges < cutoff to NA. Keeps strongest maxInt edges +per node. Ties are ignored. Keeps a max of EDGE_MAX edges in the network. +} From e9b70f35877889c544dea536106808592734926b Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 6 Feb 2018 11:34:36 -0500 Subject: [PATCH 007/124] new func --- netDx/NAMESPACE | 2 ++ 1 file changed, 2 insertions(+) diff --git a/netDx/NAMESPACE b/netDx/NAMESPACE index eb4850fe..a59aa805 100644 --- a/netDx/NAMESPACE +++ b/netDx/NAMESPACE @@ -56,6 +56,7 @@ export(resampling_predTest_CNV) export(runGeneMANIA) export(runPredictor_nestedCV) export(simpleCap) +export(sparsify2) export(sparsifyNet) export(splitTestTrain) export(splitTestTrain_partition) @@ -75,6 +76,7 @@ import(httr) import(parallel) import(pracma) import(r2cytoscape) +import(reshape2) importFrom(RJSONIO,fromJSON) importFrom(igraph,E) importFrom(igraph,delete.edges) From 500ff16ce2f21c1b12d827ca08cd3405850e7560 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 6 Feb 2018 11:35:09 -0500 Subject: [PATCH 008/124] plotSim now handles sif input --- misc/PanCancer/simFuns.R | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/misc/PanCancer/simFuns.R b/misc/PanCancer/simFuns.R index 971679eb..e3599466 100644 --- a/misc/PanCancer/simFuns.R +++ b/misc/PanCancer/simFuns.R @@ -102,11 +102,18 @@ plotrix::color2D.matplot(x,xrange=c(0,1), # given psn plot intra- and inter-class similarity # matrix must have upper populated, lower can be empty -#' @param s1 (matrix) similarity matrix +#' @param s1 (matrix) similarity matrix. If 3-column table provided, assumes +#' it's a SIF #' @param c1,c2 (char) vector of patients in each of the two groups plotSim <- function(s1,name="simfun",c1,c2) { - s1[lower.tri(s1,diag=TRUE)] <- NA - s1 <- na.omit(melt(s1)) + if (ncol(s1) == 3) { + cat("assuming SIF provided\n") + colnames(s1) <- c("Var1","Var2","value") + # do nothing + } else { + s1[lower.tri(s1,diag=TRUE)] <- NA + s1 <- na.omit(melt(s1)) + } out <- list( pp=s1$value[which(s1$Var1 %in% c1 & s1$Var2 %in% c1)], mm=s1$value[which(s1$Var1 %in% c2 & s1$Var2 %in% c2)], From e4df02a26af19534aba589dc9c8579bc371c1e8e Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 6 Feb 2018 11:36:41 -0500 Subject: [PATCH 009/124] computes pp,mm,pm similarity for SIF files generated as part of GM database --- misc/PanCancer/plotSim_inputNet.R | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 misc/PanCancer/plotSim_inputNet.R diff --git a/misc/PanCancer/plotSim_inputNet.R b/misc/PanCancer/plotSim_inputNet.R new file mode 100644 index 00000000..32f22204 --- /dev/null +++ b/misc/PanCancer/plotSim_inputNet.R @@ -0,0 +1,26 @@ +#' plot similarity for intra- and inter-class for netDx input net +rm(list=ls()) + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input" +inFiles <- list( + clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), + survival=sprintf("%s/LUSC_binary_survival.txt",inDir) + ) +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL + +netDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/pruneRBFsigma25_180201/networks" +netList <- dir(netDir,pattern="_cont.txt") +source("simFuns.R") +cur <- read.delim(sprintf("%s/%s",netDir,netList[1]),sep="\t",h=F,as.is=T) + + From fd35a65626f94c9d2c83c5739bf1fcc03a7dff7a Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 6 Feb 2018 11:37:07 -0500 Subject: [PATCH 010/124] runs full set --- misc/SCZ/netDx.R | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/misc/SCZ/netDx.R b/misc/SCZ/netDx.R index e2adc54f..8dba58e3 100644 --- a/misc/SCZ/netDx.R +++ b/misc/SCZ/netDx.R @@ -54,11 +54,11 @@ idx <- which(pheno$Dx %in% c("Control","SCZ")) xpr <- xpr[,idx] pheno <- pheno[idx,] -set.seed(123) -idx1 <- sample(which(pheno$Dx %in% "Control"),70,F) -idx2 <- sample(which(pheno$Dx %in% "SCZ"),70,F) -idx <- c(idx1,idx2) -pheno <- pheno[idx,]; xpr <- xpr[,idx] +#set.seed(123) +#idx1 <- sample(which(pheno$Dx %in% "Control"),100,F) +#idx2 <- sample(which(pheno$Dx %in% "SCZ"),100,F) +#idx <- c(idx1,idx2) +#pheno <- pheno[idx,]; xpr <- xpr[,idx] pheno$STATUS <- pheno$Dx pheno$ID <- pheno$DLPFC_RNA_Sequencing_Sample_ID @@ -82,7 +82,7 @@ makeNets <- function(dataList, groupList, netDir,...) { } dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/SCZ_%s",outDir,dt) +megaDir <- sprintf("%s/SCZfull_%s",outDir,dt) if (!file.exists(megaDir)) dir.create(megaDir) gps <- list(rna=pathwayList) @@ -92,4 +92,5 @@ runPredictor_nestedCV(pheno, dataList=dats,groupList=gps, makeNetFunc=makeNets, ### custom network creation function outDir=sprintf("%s/pred",megaDir), - numCores=4L,nFoldCV=10L, CVcutoff=9L,numSplits=10L) + numCores=2L,nFoldCV=10L, CVcutoff=9L,numSplits=25L, + CVmemory=26L) From 61a07286b80808f2fe506c6ec1e3e8e59a5be7e3 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 6 Feb 2018 11:37:22 -0500 Subject: [PATCH 011/124] will be script with 20% set aside --- misc/SCZ/netDx_blindTest.R | 100 +++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 misc/SCZ/netDx_blindTest.R diff --git a/misc/SCZ/netDx_blindTest.R b/misc/SCZ/netDx_blindTest.R new file mode 100644 index 00000000..a82755b2 --- /dev/null +++ b/misc/SCZ/netDx_blindTest.R @@ -0,0 +1,100 @@ +# Ependymoma +rm(list=ls()) + +require(netDx) +require(netDx.examples) + +rootDir <- "/home/shraddhapai/BaderLab/2018_CommonMind" +inDir <- sprintf("%s/input",rootDir) +outDir <- sprintf("%s/output",rootDir) +pathFile <-sprintf("%s/anno/Human_AllPathways_November_01_2017_symbol.gmt", + rootDir) + +geneFile <- sprintf("%s/anno/gencode.v27lift37.annotation.gtf.geneidsymbols.txt", + rootDir) +genes <- read.delim(geneFile,sep="\t",h=T,as.is=T) +gene_id <- genes[,4] +dpos <- regexpr("\\.",gene_id) +gene_id <- substr(gene_id,1,dpos-1) +gene_name <- genes[,5] + +options(StringsAsFactors=FALSE) +pheno <- read.delim(sprintf("%s/CMC_MSSM-Penn-Pitt_Clinical.csv", + inDir),sep=",",h=T,as.is=T) +xpr <- read.delim(sprintf("%s/CMC_MSSM-Penn-Pitt_DLPFC_mRNA_IlluminaHiSeq2500_gene-adjustedSVA-dataNormalization-includeAncestry-adjustedLogCPM.tsv.gz",inDir),sep="\t",h=T,as.is=T) + +not_in <- which(!xpr[,1] %in% gene_id) +cat(sprintf("%i ids not in dictionary; excluding\n", length(not_in))) +if (length(not_in)>0) xpr <- xpr[-not_in,] + +midx <- match(xpr[,1],gene_id) +if (all.equal(gene_id[midx],xpr[,1])!=TRUE) { + cat("don't match\n") + browser() +} +xpr_genes <- gene_name[midx] +idx <- which(duplicated(xpr_genes)) +cat(sprintf("\t%i gene names duplicated; excluding\n", length(idx))) +xpr_genes <- xpr_genes[-idx] +xpr <- xpr[-idx,] +rownames(xpr)<- xpr_genes; rm(gene_id,gene_name) +xpr <- xpr[,-1] +common <- intersect(colnames(xpr),pheno$DLPFC_RNA_Sequencing_Sample_ID) +xpr <- xpr[,which(colnames(xpr) %in% common)] +pheno <- pheno[which(pheno$DLPFC_RNA_Sequencing_Sample_ID %in% common),] + +midx <- match(pheno$DLPFC_RNA_Sequencing_Sample_ID,colnames(xpr)) +if (all.equal(colnames(xpr)[midx],pheno$DLPFC_RNA_Sequencing_Sample_ID)!=TRUE) { + cat("pheno don't match"); browser() +} +xpr <- xpr[,midx] + +# limit by dx +idx <- which(pheno$Dx %in% c("Control","SCZ")) + +xpr <- xpr[,idx] +pheno <- pheno[idx,] + +ctrlIdx <- which(pheno$Dx %in% "Control") +caseIdx <- which(pheno$Dx %in% "SCZ") + +set.seed(123) # make reproducible +idx1 <- sample(ctrlIdx,length(ctrlIdx)/2,F) +idx2 <- sample(caseIdx,length(caseIdx)/2,F) +idx <- c(idx1,idx2) +pheno <- pheno[idx,]; xpr <- xpr[,idx] + +pheno$STATUS <- pheno$Dx +pheno$ID <- pheno$DLPFC_RNA_Sequencing_Sample_ID + +pathwayList <- readPathways(pathFile) +head(pathwayList) + +makeNets <- function(dataList, groupList, netDir,...) { + netList <- c() + # make RNA nets: group by pathway + if (!is.null(groupList[["rna"]])) { + netList <- makePSN_NamedMatrix(dataList$rna, + rownames(dataList$rna), + groupList[["rna"]],netDir,verbose=FALSE, + writeProfiles=TRUE,...) + netList <- unlist(netList) + cat(sprintf("Made %i RNA pathway nets\n", length(netList))) + } + cat(sprintf("Total of %i nets\n", length(netList))) + return(netList) +} + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/SCZfull_%s",outDir,dt) +if (!file.exists(megaDir)) dir.create(megaDir) + +gps <- list(rna=pathwayList) +dats <- list(rna=xpr) + +runPredictor_nestedCV(pheno, + dataList=dats,groupList=gps, + makeNetFunc=makeNets, ### custom network creation function + outDir=sprintf("%s/pred",megaDir), + numCores=2L,nFoldCV=10L, CVcutoff=9L,numSplits=25L, + CVmemory=26L) From 87ed42917db917dd4afe0464afaa4e04d045fe7c Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 6 Feb 2018 11:37:40 -0500 Subject: [PATCH 012/124] univariate filtering test --- .../pruneVersion/LUSC_oneNetPer_LMprune.R | 375 ++++++++++++++++++ 1 file changed, 375 insertions(+) create mode 100644 misc/PanCancer/pruneVersion/LUSC_oneNetPer_LMprune.R diff --git a/misc/PanCancer/pruneVersion/LUSC_oneNetPer_LMprune.R b/misc/PanCancer/pruneVersion/LUSC_oneNetPer_LMprune.R new file mode 100644 index 00000000..4cc2a182 --- /dev/null +++ b/misc/PanCancer/pruneVersion/LUSC_oneNetPer_LMprune.R @@ -0,0 +1,375 @@ +#' PanCancer binarized survival: LUSC: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design + +rm(list=ls()) +require(netDx) +require(netDx.examples) +source("../runLM.R") + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +#inDir <- "/mnt/data2/BaderLab/PanCancer_LUSC/input" +#outRoot <- "/mnt/data2/BaderLab/PanCancer_LUSC/output" + +#inDir <- "/home/netdx/BaderLab/PanCancer_LUSC/input" +#outRoot <- "/home/netdx/BaderLab/PanCancer_LUSC/output" + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/pruned_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions + +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), + survival=sprintf("%s/LUSC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), + prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), + mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), + cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno +rownames(clinical) <- clinical[,1]; +# ======================= +# LUSC-specific variables +clinical$stage <- as.vector(clinical$stage) +clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" +clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" +clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" +clinical$stage <- as.factor(clinical$stage) +clinical <- clinical[, -which(colnames(clinical)=="gender")] +clinical <- t(clinical[,c("age","stage")]) +clinical[1,] <- as.integer(clinical[1,]) +clinical[2,] <- as.integer(as.factor(clinical[2,])) +class(clinical) <- "numeric" +# ======================= +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + if (nm == "rna") tmp <- log(tmp+1) + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +#### ----------------------------------------------------- +### BEGIN PRUNING CODE +# apply pruning to proteomic data +curwd <- getwd() +setwd("..") +source("LMprune.R") +source("runLM.R") +source("silh.R") +require(cluster) +setwd(curwd) +for (nm in setdiff(names(dats),"clinical")) { +print(nm) + if (nrow(dats[[nm]])>10000) topVar <- 50 else topVar <- 100 + pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) + prune <- LMprune(dats[[nm]],pheno_all$STATUS,topVar=topVar) + dev.off() + if (!is.na(prune)) { + if (prune$bestThresh < 1) { + res <- prune$res + res <- subset(res, adj.P.Val < prune$bestThresh) + tmp <- dats[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] + dats[[nm]] <- tmp + netSets[[nm]] <- rownames(tmp) + cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) + cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) + } + } else { + cat(sprintf("%s: not pruning\n",nm)) + } +} +#### ---------------------------------------------------------- + +## Create the mega database with all patients and all nets. +## This will be used to predict test samples by subsetting just for feature +## selected nets in a given round +## Note that this is useful for all train/test splits because we can always +## change which samples are query and can always subset based on which nets +## are feature selected in a given round. +netDir <- sprintf("%s/networks",megaDir) +nonclin <- setdiff(names(netSets),"clinical") +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) +netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets["clinical"], + netDir,simMetric="custom",customFunc=normDiff2, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) +netList <- c(netList,netList2) +cat(sprintf("Total of %i nets\n", length(netList))) + +# now create database +megadbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) + +# first loop - over train/test splits +for (rngNum in 1:100) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + alldat_train <- alldat[,which(colnames(alldat) %in% pheno$ID)] + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets["clinical"], + netDir,simMetric="custom",customFunc=normDiff2, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + # only include the nets that were feature selected + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(megadbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + } else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } +} + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) From 89e613f30c1df69500c1da092a55f32174227892 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 6 Feb 2018 11:38:05 -0500 Subject: [PATCH 013/124] new sparsifier --- netDx/R/sparsify2.R | 47 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 netDx/R/sparsify2.R diff --git a/netDx/R/sparsify2.R b/netDx/R/sparsify2.R new file mode 100644 index 00000000..454ee733 --- /dev/null +++ b/netDx/R/sparsify2.R @@ -0,0 +1,47 @@ +#' cleaner sparsification routine +#' +#' @details Sparsifies similarity matrix to keep strongest edges. +#' Sets diagonal and edges < cutoff to NA. Keeps strongest maxInt edges +#' per node. Ties are ignored. Keeps a max of EDGE_MAX edges in the network. +#' @param W (matrix) similarity matrix +#' @param outFile (char) path to file to write sparsified network +#' @param cutoff (numeric) edges with weight smaller than this are set to NA +#' @param maxInt (numeric) max num edges per node. +#' @param EDGE_MAX (numeric) max num edges in network +#' @return writes SIF content to text file (node1,node2,edge weight) +#' @import reshape2 +#' @export +sparsify2 <- function(W, outFile="tmp.txt",cutoff=0.3,maxInt=50,EDGE_MAX=1000) { + if (maxInt > ncol(W)) maxInt <- ncol(W) + + diag(W) <- 0; + W[W < cutoff] <- NA + x <- apply(W,1,sort,decreasing=TRUE,na.last=NA); + for (k in 1:length(x)) { + cur <- x[[k]] + tokeep <- names(cur)[1:min(length(cur),maxInt)] + W[k,which(!colnames(W)%in% tokeep)] <- NA + } + tmp <- na.omit(melt(W)) + tmp <- tmp[order(tmp[,3],decreasing=TRUE),] + #maxEdge <- 0.02*ncol(W); + + maxEdge <- nrow(tmp) + if (maxEdge>EDGE_MAX) maxEdge <- EDGE_MAX + + tmp <- tmp[1:maxEdge,] + write.table(tmp,file=outFile,sep="\t",col=F,row=F,quote=F) + +### the code below converts the SIF format back to a matrix,potentially +### for debugging. +### W2 <- dcast(tmp,Var2~Var1,value.var="value") +### rownames(W2) <- W2[,1]; W2 <- W2[,-1] +### W2 <- W2[,colnames(W)] +### W2 <- W2[colnames(W),] +### n <- ncol(W); +### sp <- nrow(tmp)/(n*(n-1))/2 +### cat(sprintf("%i -> %i edges (%i%% sparsity)\n", +### sum(!is.na(W)), nrow(tmp), round(sp*100))) +### return(W2); +} + From 85cfbd50437988b42c16abb2db6a46889738ec45 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 6 Feb 2018 11:38:29 -0500 Subject: [PATCH 014/124] uses sparsify2 --- .../featSel_oneNetPer/LUSC_oneNetPer_LMprune_RBF.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/misc/PanCancer/featSel_oneNetPer/LUSC_oneNetPer_LMprune_RBF.R b/misc/PanCancer/featSel_oneNetPer/LUSC_oneNetPer_LMprune_RBF.R index aa24b02c..26135a92 100644 --- a/misc/PanCancer/featSel_oneNetPer/LUSC_oneNetPer_LMprune_RBF.R +++ b/misc/PanCancer/featSel_oneNetPer/LUSC_oneNetPer_LMprune_RBF.R @@ -81,9 +81,12 @@ clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "I clinical$stage <- as.factor(clinical$stage) clinical <- clinical[, -which(colnames(clinical)=="gender")] clinical <- t(clinical[,c("age","stage")]) + clinical[1,] <- as.integer(clinical[1,]) clinical[2,] <- as.integer(as.factor(clinical[2,])) class(clinical) <- "numeric" +ztrans <- function(x) (x-mean(x,na.rm=TRUE))/(sd(x,na.rm=TRUE)) +clinical[1,] <- ztrans(clinical[1,]) # ======================= dats$clinical <- clinical; rm(clinical) @@ -194,7 +197,7 @@ netList2 <- makePSN_NamedMatrix(alldat, rownames(alldat),netSets["clinical"], netDir,simMetric="custom",customFunc=sim.rbf, verbose=FALSE,numCores=numCores, - sparsify=TRUE,append=TRUE) + sparsify=TRUE,useSparsify2=TRUE,append=TRUE) netList <- c(netList,netList2) cat(sprintf("Total of %i nets\n", length(netList))) @@ -229,7 +232,7 @@ for (rngNum in 1:10) { rownames(alldat_train),netSets["clinical"], netDir,simMetric="custom",customFunc=sim.rbf, verbose=FALSE,numCores=numCores, - sparsify=TRUE,append=TRUE) + sparsify=TRUE,useSparsify2=TRUE,append=TRUE) netList <- c(netList,netList2) cat(sprintf("Total of %i nets\n", length(netList))) From cddd163dcbd81fd8f1ed54d6e909b98cd64afc1e Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 6 Feb 2018 11:44:40 -0500 Subject: [PATCH 015/124] moved --- .../LUSC_oneNetPer_LMprune.R | 370 ------------------ 1 file changed, 370 deletions(-) delete mode 100644 misc/PanCancer/featSel_oneNetPer/LUSC_oneNetPer_LMprune.R diff --git a/misc/PanCancer/featSel_oneNetPer/LUSC_oneNetPer_LMprune.R b/misc/PanCancer/featSel_oneNetPer/LUSC_oneNetPer_LMprune.R deleted file mode 100644 index 391fed9c..00000000 --- a/misc/PanCancer/featSel_oneNetPer/LUSC_oneNetPer_LMprune.R +++ /dev/null @@ -1,370 +0,0 @@ -#' PanCancer binarized survival: LUSC: Feature selection with one net per -#' datatype -#' 10-fold CV predictor design - -rm(list=ls()) -require(netDx) -require(netDx.examples) -source("../runLM.R") - -numCores <- 8L -GMmemory <- 4L -trainProp <- 0.8 - -#inDir <- "/mnt/data2/BaderLab/PanCancer_LUSC/input" -#outRoot <- "/mnt/data2/BaderLab/PanCancer_LUSC/output" - -#inDir <- "/home/netdx/BaderLab/PanCancer_LUSC/input" -#outRoot <- "/home/netdx/BaderLab/PanCancer_LUSC/output" - -inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" -outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" - -dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/pruneCheckIntegr_%s",outRoot,dt) - -# ---------------------------------------------------------------- -# helper functions - -# takes average of normdiff of each row in x -normDiff2 <- function(x) { - # normalized difference - # x is vector of values, one per patient (e.g. ages) - normDiff <- function(x) { - #if (nrow(x)>=1) x <- x[1,] - nm <- colnames(x) - x <- as.numeric(x) - n <- length(x) - rngX <- max(x,na.rm=T)-min(x,na.rm=T) - - out <- matrix(NA,nrow=n,ncol=n); - # weight between i and j is - # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) - for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) - rownames(out) <- nm; colnames(out)<- nm - out - } - - sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) - for (k in 1:nrow(x)) { - tmp <- normDiff(x[k,,drop=FALSE]) - sim <- sim + tmp - rownames(sim) <- rownames(tmp) - colnames(sim) <- colnames(tmp) - } - sim <- sim/nrow(x) - sim -} - -# ----------------------------------------------------------- -# process input -inFiles <- list( - clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), - survival=sprintf("%s/LUSC_binary_survival.txt",inDir) - ) -datFiles <- list( - rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), - prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), - mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), - cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) -) - -pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) -colnames(pheno)[1] <- "ID" - -surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) -colnames(surv)[1:2] <- c("ID","STATUS_INT") -survStr <- rep(NA,nrow(surv)) -survStr[surv$STATUS_INT<1] <- "SURVIVENO" -survStr[surv$STATUS_INT>0] <- "SURVIVEYES" -surv$STATUS <- survStr -pheno <- merge(x=pheno,y=surv,by="ID") -pheno$X <- NULL - -cat("Collecting patient data:\n") -dats <- list() #input data in different slots -cat("\t* Clinical\n") -clinical <- pheno -rownames(clinical) <- clinical[,1]; -# ======================= -# LUSC-specific variables -clinical$stage <- as.vector(clinical$stage) -clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" -clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" -clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" -clinical$stage <- as.factor(clinical$stage) -clinical <- clinical[, -which(colnames(clinical)=="gender")] -clinical <- t(clinical[,c("age","stage")]) -clinical[1,] <- as.integer(clinical[1,]) -clinical[2,] <- as.integer(as.factor(clinical[2,])) -class(clinical) <- "numeric" -# ======================= -dats$clinical <- clinical; rm(clinical) - -# create master input net -for (nm in names(datFiles)) { - cat(sprintf("\t* %s\n",nm)) - tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) - if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] - rownames(tmp) <- tmp[,1] - tmp <- t(tmp[,-1]) - class(tmp) <- "numeric" - if (nm == "rna") tmp <- log(tmp+1) - dats[[nm]] <- tmp -} - -cat("\t Ordering column names\n") -# include only data for patients in classifier -dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) -dats <- lapply(dats, function(x) { - midx <- match(pheno$ID,colnames(x)) - x <- x[,midx] - x -}) - -# confirm patient order the same for all input nets -pname <- colnames(dats[[1]]) -for (k in 2:length(dats)) { - if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { - cat(sprintf("Patient order doesn't match for %s\n", - names(dats)[k])) - browser() - } -} - -# input nets for each category -netSets <- lapply(dats, function(x) rownames(x)) - -# compile data -alldat <- do.call("rbind",dats) -pheno_all <- pheno - -combList <- list( - clinicalArna=c("clinical_cont","rna.profile"), - clinicalAprot=c("clinical_cont","prot.profile"), - clinical="clinical_cont", - mir="mir.profile", - rna="rna.profile", - prot="prot.profile", - cnv="cnv.profile", - clinicalAmir=c("clinical_cont","mir.profile"), - clinicalAcnv=c("clinical_cont","cnv.profile"), - all="all" -) - -cat(sprintf("Clinical variables are: { %s }\n", - paste(rownames(dats$clinical),sep=",",collapse=","))) -rm(pheno) - -# ---------------------------------------------------------- -# build classifier -if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) -dir.create(megaDir) - -logFile <- sprintf("%s/log.txt",megaDir) -sink(logFile,split=TRUE) -tryCatch({ - -# apply pruning to proteomic data -curwd <- getwd() -setwd("..") -source("LMprune.R") -source("runLM.R") -source("silh.R") -require(cluster) -setwd(curwd) -for (nm in setdiff(names(dats),"clinical")) { -print(nm) - if (nrow(dats[[nm]])>10000) topVar <- 50 else topVar <- 100 - pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) - prune <- LMprune(dats[[nm]],pheno_all$STATUS,topVar=topVar) - dev.off() - if (!is.na(prune)) { - res <- prune$res - res <- subset(res, adj.P.Val < prune$bestThresh) - tmp <- dats[[nm]];orig_ct <- nrow(tmp) - tmp <- tmp[which(rownames(tmp)%in% rownames(res))] - dats[[nm]] <- tmp - netSets[[nm]] <- rownames(tmp) - cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) - cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) - } else { - cat(sprintf("%s: not pruning\n",nm)) - } -} - -## Create the mega database with all patients and all nets. -## This will be used to predict test samples by subsetting just for feature -## selected nets in a given round -## Note that this is useful for all train/test splits because we can always -## change which samples are query and can always subset based on which nets -## are feature selected in a given round. -netDir <- sprintf("%s/networks",megaDir) -nonclin <- setdiff(names(netSets),"clinical") -netList <- makePSN_NamedMatrix(alldat, - rownames(alldat),netSets[nonclin],netDir, - verbose=FALSE,numCores=numCores,writeProfiles=TRUE) -netList2 <- makePSN_NamedMatrix(alldat, - rownames(alldat),netSets["clinical"], - netDir,simMetric="custom",customFunc=normDiff2, - verbose=FALSE,numCores=numCores, - sparsify=TRUE,append=TRUE) -netList <- c(netList,netList2) -cat(sprintf("Total of %i nets\n", length(netList))) - -# now create database -megadbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) - -# first loop - over train/test splits -for (rngNum in 1:25) { - rng_t0 <- Sys.time() - cat(sprintf("-------------------------------\n")) - cat(sprintf("RNG seed = %i\n", rngNum)) - cat(sprintf("-------------------------------\n")) - outDir <- sprintf("%s/rng%i",megaDir,rngNum) - dir.create(outDir) - - pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, - setSeed=rngNum*5) - write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", - col=T,row=F,quote=F) - # -------------------------------------------- - # feature selection - train only - pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") - alldat_train <- alldat[,which(colnames(alldat) %in% pheno$ID)] - - netDir <- sprintf("%s/networks",outDir) - nonclin <- setdiff(names(netSets),"clinical") - netList <- makePSN_NamedMatrix(alldat_train, - rownames(alldat_train),netSets[nonclin], - netDir,verbose=FALSE,numCores=numCores, - writeProfiles=TRUE) - netList2 <- makePSN_NamedMatrix(alldat_train, - rownames(alldat_train),netSets["clinical"], - netDir,simMetric="custom",customFunc=normDiff2, - verbose=FALSE,numCores=numCores, - sparsify=TRUE,append=TRUE) - netList <- c(netList,netList2) - cat(sprintf("Total of %i nets\n", length(netList))) - - # now create database - dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) - - # second loop - over combinations of input data - for (cur in names(combList)) { - t0 <- Sys.time() - cat(sprintf("%s\n",cur)) - pDir <- sprintf("%s/%s",outDir, cur) - dir.create(pDir) - - # run featsel once per subtype - subtypes <- unique(pheno$STATUS) - # run 10-fold cv per subtype - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) - dir.create(pDir2) - - cat(sprintf("\n******\nSubtype %s\n",g)) - pheno_subtype <- pheno - ## label patients not in the current class as residual - nong <- which(!pheno_subtype$STATUS %in% g) - pheno_subtype$STATUS[nong] <- "nonpred" - ## sanity check - print(table(pheno_subtype$STATUS,useNA="always")) - resDir <- sprintf("%s/GM_results",pDir2) - ## query for feature selection comprises of training - ## samples from the class of interest - trainPred <- pheno_subtype$ID[ - which(pheno_subtype$STATUS %in% g)] - - # Cross validation - GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, - nrow(pheno_subtype),incNets=combList[[cur]], - verbose=T, numCores=numCores, - GMmemory=GMmemory) - - # patient similarity ranks - prank <- dir(path=resDir,pattern="PRANK$") - # network ranks - nrank <- dir(path=resDir,pattern="NRANK$") - cat(sprintf("Got %i prank files\n",length(prank))) - - # Compute network score - pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) - head(pTally) - # write to file - tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) - write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) - } - - for (cutoff in 7:9) { - predRes <- list() - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - # get feature selected net names - pTally <- read.delim( - sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), - sep="\t",h=T,as.is=T) - - # feature selected nets pass cutoff threshold - pTally <- pTally[which(pTally[,2]>=cutoff),1] - cat(sprintf("%s: %i pathways\n",g,length(pTally))) - - if (length(pTally)>=1) { - curD <- sprintf("%s/cutoff%i",pDir2,cutoff) - dir.create(curD) - # query of all training samples for this class - qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & - pheno_all$TT_STATUS%in%"TRAIN")] - - qFile <- sprintf("%s/%s_query",curD,g) - # only include the nets that were feature selected - GM_writeQueryFile(qSamps,incNets=pTally, - nrow(pheno_all),qFile) - resFile <- runGeneMANIA(megadbDir$dbDir,qFile,resDir=curD) - predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), - pheno_all,g) - } else { - predRes[[g]] <- NA - } - } - oD <- sprintf("%s/cutoff%i",pDir,cutoff) - dir.create(oD) - outFile <- sprintf("%s/predictionResults.txt",oD) - if (any(is.na(predRes))) { - cat("One or more groups had zero feature selected nets\n") - cat("# no feature-selected nets.\n",file=outFile) - } else { - predClass <- GM_OneVAll_getClass(predRes) - out <- merge(x=pheno_all,y=predClass,by="ID") - write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) - - acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) - cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", - nrow(out), acc*100)) - - require(ROCR) - ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, - out$STATUS=="SURVIVEYES") - save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) - } - } -} - #cleanup to save disk space - system(sprintf("rm -r %s/dataset %s/tmp %s/networks", - outDir,outDir,outDir)) - system(sprintf("rm -r %s/dataset %s/networks", - outDir,outDir)) - -} - pheno_all$TT_STATUS <- NA - rng_t1 <- Sys.time() - cat(sprintf("Time for one train/test split:")) - print(rng_t1-rng_t0) - -}, error=function(ex){ - print(ex) -}, finally={ - sink(NULL) -}) From d3c00838c2a71cf71ef95a7aeccda53d5605b0c3 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Mon, 12 Feb 2018 16:03:57 -0500 Subject: [PATCH 016/124] variations to improve perf --- misc/PanCancer/pruneVersion/GBM_pruned.R | 372 +++++++++++++++++ misc/PanCancer/pruneVersion/KIRC_pruned.R | 355 ++++++++++++++++ misc/PanCancer/pruneVersion/LUSC_MI.R | 380 +++++++++++++++++ misc/PanCancer/pruneVersion/LUSC_corAuto.R | 382 ++++++++++++++++++ .../pruneVersion/LUSC_oneNetPer_LMprune.R | 24 +- misc/PanCancer/pruneVersion/LUSC_sparse2.R | 381 +++++++++++++++++ misc/PanCancer/pruneVersion/OV_pruned.R | 348 ++++++++++++++++ 7 files changed, 2233 insertions(+), 9 deletions(-) create mode 100644 misc/PanCancer/pruneVersion/GBM_pruned.R create mode 100644 misc/PanCancer/pruneVersion/KIRC_pruned.R create mode 100644 misc/PanCancer/pruneVersion/LUSC_MI.R create mode 100644 misc/PanCancer/pruneVersion/LUSC_corAuto.R create mode 100644 misc/PanCancer/pruneVersion/LUSC_sparse2.R create mode 100644 misc/PanCancer/pruneVersion/OV_pruned.R diff --git a/misc/PanCancer/pruneVersion/GBM_pruned.R b/misc/PanCancer/pruneVersion/GBM_pruned.R new file mode 100644 index 00000000..d25fd847 --- /dev/null +++ b/misc/PanCancer/pruneVersion/GBM_pruned.R @@ -0,0 +1,372 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) +require(netDx) +require(netDx.examples) +source("../runLM.R") + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/prune_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + + + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +#### ----------------------------------------------------- +### BEGIN PRUNING CODE +# apply pruning to proteomic data +curwd <- getwd() +setwd("..") +source("LMprune.R") +source("runLM.R") +source("silh.R") +require(cluster) +setwd(curwd) +for (nm in setdiff(names(dats),"clinical")) { +print(nm) + if (nrow(dats[[nm]])>10000) topVar <- 50 else topVar <- 100 + pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) + prune <- LMprune(dats[[nm]],pheno_all$STATUS,topVar=topVar) + dev.off() + if (!is.na(prune)) { + if (prune$bestThresh < 1) { + res <- prune$res + res <- subset(res, adj.P.Val < prune$bestThresh) + tmp <- dats[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] + dats[[nm]] <- tmp + netSets[[nm]] <- rownames(tmp) + cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) + cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) + } + } else { + cat(sprintf("%s: not pruning\n",nm)) + } +} +browser() +#### ---------------------------------------------------------- + + +## Create the mega database with all patients and all nets. +## This will be used to predict test samples by subsetting just for feature +## selected nets in a given round +## Note that this is useful for all train/test splits because we can always +## change which samples are query and can always subset based on which nets +## are feature selected in a given round. +netDir <- sprintf("%s/networks",megaDir) +nonclin <- setdiff(names(netSets),"clinical") +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) +netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets["clinical"], + netDir,simMetric="custom",customFunc=normDiff2, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) +netList <- c(netList,netList2) +cat(sprintf("Total of %i nets\n", length(netList))) +browser() + +# now create database +megadbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) + +# first loop - over train/test splits +for (rngNum in 1:100) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + alldat_train <- alldat[,which(colnames(alldat) %in% pheno$ID)] + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets["clinical"], + netDir,simMetric="custom",customFunc=normDiff2, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(megadbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/KIRC_pruned.R b/misc/PanCancer/pruneVersion/KIRC_pruned.R new file mode 100644 index 00000000..20462974 --- /dev/null +++ b/misc/PanCancer/pruneVersion/KIRC_pruned.R @@ -0,0 +1,355 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) +require(netDx) +require(netDx.examples) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +cutoff <- 9 + +inDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/input" +outRoot <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/pruned_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/KIRC_clinical_core.txt",inDir), + survival=sprintf("%s/KIRC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/KIRC_mRNA_core.txt",inDir), + prot=sprintf("%s/KIRC_RPPA_core.txt",inDir), + mir=sprintf("%s/KIRC_miRNA_core.txt",inDir), + dnam=sprintf("%s/KIRC_methylation_core.txt",inDir), + cnv=sprintf("%s/KIRC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +#======transform clinical data========= +pheno$grade <- as.vector(pheno$grade) +pheno$grade[pheno$grade=="G1"] <- "G2" +pheno$grade[pheno$grade=="GX"] <- "G2" +pheno$grade <- as.factor(pheno$grade) +pheno <- pheno[, -which(colnames(pheno)=="gender")] +#====================================== + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +clinical$grade <- as.numeric(factor(clinical$grade)) +clinical$stage <- as.numeric(factor(clinical$stage)) +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) + + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +#### ----------------------------------------------------- +### BEGIN PRUNING CODE +# apply pruning to proteomic data +curwd <- getwd() +setwd("..") +source("LMprune.R") +source("runLM.R") +source("silh.R") +require(cluster) +setwd(curwd) +for (nm in setdiff(names(dats),"clinical")) { +print(nm) + if (nrow(dats[[nm]])>10000) topVar <- 50 else topVar <- 100 + pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) + prune <- LMprune(dats[[nm]],pheno_all$STATUS,topVar=topVar) + dev.off() + if (!is.na(prune)) { + if (prune$bestThresh < 1) { + res <- prune$res + res <- subset(res, adj.P.Val < prune$bestThresh) + tmp <- dats[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] + dats[[nm]] <- tmp + netSets[[nm]] <- rownames(tmp) + cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) + cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) + } + } else { + cat(sprintf("%s: not pruning\n",nm)) + } +} +#### ---------------------------------------------------------- + + +## Create the mega database with all patients and all nets. +## This will be used to predict test samples by subsetting just for feature +## selected nets in a given round +## Note that this is useful for all train/test splits because we can always +## change which samples are query and can always subset based on which nets +## are feature selected in a given round. +netDir <- sprintf("%s/networks",megaDir) +nonclin <- setdiff(names(netSets),"clinical") + +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) +netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets["clinical"], + netDir,simMetric="custom",customFunc=normDiff2, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) +netList <- c(netList,netList2) +cat(sprintf("Total of %i nets\n", length(netList))) + +# now create database +megadbDir <- GM_createDB(netDir, pheno_all$ID, + megaDir,numCores=numCores) + +# first loop - over train/test splits +for (rngNum in 1:100) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + alldat_train <- alldat[,which(colnames(alldat) %in% pheno$ID)] + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets["clinical"], + netDir,simMetric="custom",customFunc=normDiff2, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",pDir2,g) + GM_writeQueryFile(qSamps,incNets=pTally,nrow(pheno_all),qFile) + resFile <- runGeneMANIA(megadbDir$dbDir,qFile,resDir=pDir2) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } + + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + outFile <- sprintf("%s/predictionResults.txt",pDir) + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/LUSC_MI.R b/misc/PanCancer/pruneVersion/LUSC_MI.R new file mode 100644 index 00000000..a89828aa --- /dev/null +++ b/misc/PanCancer/pruneVersion/LUSC_MI.R @@ -0,0 +1,380 @@ +#' PanCancer binarized survival: LUSC: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design + +rm(list=ls()) +require(netDx) +require(netDx.examples) +source("../runLM.R") + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +#inDir <- "/mnt/data2/BaderLab/PanCancer_LUSC/input" +#outRoot <- "/mnt/data2/BaderLab/PanCancer_LUSC/output" + +#inDir <- "/home/netdx/BaderLab/PanCancer_LUSC/input" +#outRoot <- "/home/netdx/BaderLab/PanCancer_LUSC/output" + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/prunedMI_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions + +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), + survival=sprintf("%s/LUSC_binary_survival.txt",inDir) + ) +datFiles <- list( +# rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), + prot=sprintf("%s/LUSC_RPPA_core.txt",inDir) +# mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), +# cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno +rownames(clinical) <- clinical[,1]; +# ======================= +# LUSC-specific variables +clinical$stage <- as.vector(clinical$stage) +clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" +clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" +clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" +clinical$stage <- as.factor(clinical$stage) +clinical <- clinical[, -which(colnames(clinical)=="gender")] +clinical <- t(clinical[,c("age","stage")]) +clinical[1,] <- as.integer(clinical[1,]) +clinical[2,] <- as.integer(as.factor(clinical[2,])) +class(clinical) <- "numeric" +# ======================= +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + if (nm == "rna") tmp <- log(tmp+1) + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + #clinicalArna=c("clinical_cont","rna.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + #clinical="clinical_cont", + #mir="mir.profile", + #rna="rna.profile", + prot="prot.profile" + #cnv="cnv.profile", + #clinicalAmir=c("clinical_cont","mir.profile"), + #clinicalAcnv=c("clinical_cont","cnv.profile"), + #all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +#### ----------------------------------------------------- +### BEGIN PRUNING CODE +# apply pruning to proteomic data +curwd <- getwd() +setwd("..") +source("LMprune.R") +source("runLM.R") +source("silh.R") +require(cluster) +setwd(curwd) +for (nm in setdiff(names(dats),"clinical")) { +print(nm) + if (nrow(dats[[nm]])>10000 | nm == "prot") topVar <- 50 else topVar <- 100 + #topVar <- 50 + pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) + prune <- LMprune(dats[[nm]],pheno_all$STATUS,topVar=topVar) + dev.off() + if (!is.na(prune)) { + if (prune$bestThresh < 1) { + res <- prune$res + if (nm == "prot") prune$bestThresh <- 0.6 + res <- subset(res, adj.P.Val < prune$bestThresh) + tmp <- dats[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] + dats[[nm]] <- tmp + netSets[[nm]] <- rownames(tmp) + cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) + cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) + } + } else { + cat(sprintf("%s: not pruning\n",nm)) + } +} +#### ---------------------------------------------------------- + +## Create the mega database with all patients and all nets. +## This will be used to predict test samples by subsetting just for feature +## selected nets in a given round +## Note that this is useful for all train/test splits because we can always +## change which samples are query and can always subset based on which nets +## are feature selected in a given round. +netDir <- sprintf("%s/networks",megaDir) +nonclin <- setdiff(names(netSets),"clinical") +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets[nonclin],netDir, + simMetric="MI", + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) +netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets["clinical"], + netDir,simMetric="custom",customFunc=normDiff2, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE,writeProfiles=FALSE) +netList <- c(netList,netList2) +cat(sprintf("Total of %i nets\n", length(netList))) + +# now create database +megadbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores, + simMetric="MI") + +# first loop - over train/test splits +for (rngNum in 1:10) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + alldat_train <- alldat[,which(colnames(alldat) %in% pheno$ID)] + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE,simMetric="MI") + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets["clinical"], + netDir,simMetric="custom",customFunc=normDiff2, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE,writeProfiles=FALSE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores, + simMetric="MI") + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + # only include the nets that were feature selected + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(megadbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + } else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } +} + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/LUSC_corAuto.R b/misc/PanCancer/pruneVersion/LUSC_corAuto.R new file mode 100644 index 00000000..15219173 --- /dev/null +++ b/misc/PanCancer/pruneVersion/LUSC_corAuto.R @@ -0,0 +1,382 @@ +#' correlation but threshold=auto. +#' PanCancer binarized survival: LUSC: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design + +rm(list=ls()) +require(netDx) +require(netDx.examples) +source("../runLM.R") + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +#inDir <- "/mnt/data2/BaderLab/PanCancer_LUSC/input" +#outRoot <- "/mnt/data2/BaderLab/PanCancer_LUSC/output" + +#inDir <- "/home/netdx/BaderLab/PanCancer_LUSC/input" +#outRoot <- "/home/netdx/BaderLab/PanCancer_LUSC/output" + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/pruneAuto_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions + +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), + survival=sprintf("%s/LUSC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), + prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), + mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), + cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno +rownames(clinical) <- clinical[,1]; +# ======================= +# LUSC-specific variables +clinical$stage <- as.vector(clinical$stage) +clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" +clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" +clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" +clinical$stage <- as.factor(clinical$stage) +clinical <- clinical[, -which(colnames(clinical)=="gender")] +clinical <- t(clinical[,c("age","stage")]) +clinical[1,] <- as.integer(clinical[1,]) +clinical[2,] <- as.integer(as.factor(clinical[2,])) +class(clinical) <- "numeric" +# ======================= +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + if (nm == "rna") tmp <- log(tmp+1) + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +#### ----------------------------------------------------- +### BEGIN PRUNING CODE +# apply pruning to proteomic data +curwd <- getwd() +setwd("..") +source("LMprune.R") +source("runLM.R") +source("silh.R") +require(cluster) +setwd(curwd) +for (nm in setdiff(names(dats),"clinical")) { +print(nm) + if (nrow(dats[[nm]])>10000 | nm == "prot") + topVar <- 50 else topVar <- 100 + #topVar <- 50 + pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) + prune <- LMprune(dats[[nm]],pheno_all$STATUS,topVar=topVar) + dev.off() + if (!is.na(prune)) { + if (prune$bestThresh < 1) { + res <- prune$res + if (nm == "prot") prune$bestThresh <- 0.6 + res <- subset(res, adj.P.Val < prune$bestThresh) + tmp <- dats[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] + dats[[nm]] <- tmp + netSets[[nm]] <- rownames(tmp) + cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) + cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) + } + } else { + cat(sprintf("%s: not pruning\n",nm)) + } +} +#### ---------------------------------------------------------- + +## Create the mega database with all patients and all nets. +## This will be used to predict test samples by subsetting just for feature +## selected nets in a given round +## Note that this is useful for all train/test splits because we can always +## change which samples are query and can always subset based on which nets +## are feature selected in a given round. +netDir <- sprintf("%s/networks",megaDir) +nonclin <- setdiff(names(netSets),"clinical") +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE, + simMetric="pearson") +netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) +netList <- c(netList,netList2) +cat(sprintf("Total of %i nets\n", length(netList))) + +# now create database +megadbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores, + simMetric="pearson",P2N_threshType="auto") + +# first loop - over train/test splits +for (rngNum in 1:25) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + alldat_train <- alldat[,which(colnames(alldat) %in% pheno$ID)] + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE,simMetric="pearson") + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores, + simMetric="pearson",P2N_threshType="auto") + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + # only include the nets that were feature selected + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(megadbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + } else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } +} + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/LUSC_oneNetPer_LMprune.R b/misc/PanCancer/pruneVersion/LUSC_oneNetPer_LMprune.R index 4cc2a182..9b660169 100644 --- a/misc/PanCancer/pruneVersion/LUSC_oneNetPer_LMprune.R +++ b/misc/PanCancer/pruneVersion/LUSC_oneNetPer_LMprune.R @@ -21,7 +21,7 @@ inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/pruned_%s",outRoot,dt) +megaDir <- sprintf("%s/prunedPearson_%s",outRoot,dt) # ---------------------------------------------------------------- # helper functions @@ -65,7 +65,7 @@ inFiles <- list( datFiles <- list( rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), - mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), + mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) ) @@ -177,13 +177,16 @@ require(cluster) setwd(curwd) for (nm in setdiff(names(dats),"clinical")) { print(nm) - if (nrow(dats[[nm]])>10000) topVar <- 50 else topVar <- 100 + if (nrow(dats[[nm]])>10000 | nm == "prot") + topVar <- 50 else topVar <- 100 + #topVar <- 50 pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) prune <- LMprune(dats[[nm]],pheno_all$STATUS,topVar=topVar) dev.off() if (!is.na(prune)) { if (prune$bestThresh < 1) { res <- prune$res + if (nm == "prot") prune$bestThresh <- 0.6 res <- subset(res, adj.P.Val < prune$bestThresh) tmp <- dats[[nm]];orig_ct <- nrow(tmp) tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] @@ -208,17 +211,19 @@ netDir <- sprintf("%s/networks",megaDir) nonclin <- setdiff(names(netSets),"clinical") netList <- makePSN_NamedMatrix(alldat, rownames(alldat),netSets[nonclin],netDir, - verbose=FALSE,numCores=numCores,writeProfiles=TRUE) + verbose=FALSE,numCores=numCores,writeProfiles=TRUE, + simMetric="pearson") netList2 <- makePSN_NamedMatrix(alldat, rownames(alldat),netSets["clinical"], - netDir,simMetric="custom",customFunc=normDiff2, + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, verbose=FALSE,numCores=numCores, sparsify=TRUE,append=TRUE) netList <- c(netList,netList2) cat(sprintf("Total of %i nets\n", length(netList))) # now create database -megadbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) +megadbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores, + simMetric="pearson") # first loop - over train/test splits for (rngNum in 1:100) { @@ -243,17 +248,18 @@ for (rngNum in 1:100) { netList <- makePSN_NamedMatrix(alldat_train, rownames(alldat_train),netSets[nonclin], netDir,verbose=FALSE,numCores=numCores, - writeProfiles=TRUE) + writeProfiles=TRUE,simMetric="pearson") netList2 <- makePSN_NamedMatrix(alldat_train, rownames(alldat_train),netSets["clinical"], - netDir,simMetric="custom",customFunc=normDiff2, + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, verbose=FALSE,numCores=numCores, sparsify=TRUE,append=TRUE) netList <- c(netList,netList2) cat(sprintf("Total of %i nets\n", length(netList))) # now create database - dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores, + simMetric="pearson") # second loop - over combinations of input data for (cur in names(combList)) { diff --git a/misc/PanCancer/pruneVersion/LUSC_sparse2.R b/misc/PanCancer/pruneVersion/LUSC_sparse2.R new file mode 100644 index 00000000..5f74bf49 --- /dev/null +++ b/misc/PanCancer/pruneVersion/LUSC_sparse2.R @@ -0,0 +1,381 @@ +#' PanCancer binarized survival: LUSC: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design + +rm(list=ls()) +require(netDx) +require(netDx.examples) +source("../runLM.R") + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +#inDir <- "/mnt/data2/BaderLab/PanCancer_LUSC/input" +#outRoot <- "/mnt/data2/BaderLab/PanCancer_LUSC/output" + +#inDir <- "/home/netdx/BaderLab/PanCancer_LUSC/input" +#outRoot <- "/home/netdx/BaderLab/PanCancer_LUSC/output" + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/prunedSparse2_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions + +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), + survival=sprintf("%s/LUSC_binary_survival.txt",inDir) + ) +datFiles <- list( + #rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), + prot=sprintf("%s/LUSC_RPPA_core.txt",inDir) + #mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), + #cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno +rownames(clinical) <- clinical[,1]; +# ======================= +# LUSC-specific variables +clinical$stage <- as.vector(clinical$stage) +clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" +clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" +clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" +clinical$stage <- as.factor(clinical$stage) +clinical <- clinical[, -which(colnames(clinical)=="gender")] +clinical <- t(clinical[,c("age","stage")]) +clinical[1,] <- as.integer(clinical[1,]) +clinical[2,] <- as.integer(as.factor(clinical[2,])) +class(clinical) <- "numeric" +# ======================= +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + if (nm == "rna") tmp <- log(tmp+1) + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( +# clinicalArna=c("clinical_cont","rna.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), +# clinical="clinical_cont", +# mir="mir.profile", +# rna="rna.profile", + prot="prot.profile" +# cnv="cnv.profile", +# clinicalAmir=c("clinical_cont","mir.profile"), +# clinicalAcnv=c("clinical_cont","cnv.profile"), +# all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +#### ----------------------------------------------------- +### BEGIN PRUNING CODE +# apply pruning to proteomic data +curwd <- getwd() +setwd("..") +source("LMprune.R") +source("runLM.R") +source("silh.R") +require(cluster) +setwd(curwd) +for (nm in setdiff(names(dats),"clinical")) { +print(nm) + if (nrow(dats[[nm]])>10000 | nm == "prot") + topVar <- 50 else topVar <- 100 + #topVar <- 50 + pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) + prune <- LMprune(dats[[nm]],pheno_all$STATUS,topVar=topVar) + dev.off() + if (!is.na(prune)) { + if (prune$bestThresh < 1) { + res <- prune$res + if (nm == "prot") prune$bestThresh <- 0.6 + res <- subset(res, adj.P.Val < prune$bestThresh) + tmp <- dats[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] + dats[[nm]] <- tmp + netSets[[nm]] <- rownames(tmp) + cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) + cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) + } + } else { + cat(sprintf("%s: not pruning\n",nm)) + } +} +#### ---------------------------------------------------------- + +## Create the mega database with all patients and all nets. +## This will be used to predict test samples by subsetting just for feature +## selected nets in a given round +## Note that this is useful for all train/test splits because we can always +## change which samples are query and can always subset based on which nets +## are feature selected in a given round. +netDir <- sprintf("%s/networks",megaDir) +nonclin <- setdiff(names(netSets),"clinical") +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE, + simMetric="pearson") +netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,useSparsify2=TRUE,append=TRUE) +netList <- c(netList,netList2) +cat(sprintf("Total of %i nets\n", length(netList))) + +# now create database +megadbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores, + simMetric="pearson") + +# first loop - over train/test splits +for (rngNum in 1:25) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + alldat_train <- alldat[,which(colnames(alldat) %in% pheno$ID)] + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE,simMetric="pearson") + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,useSparsify2=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores, + simMetric="pearson") + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + # only include the nets that were feature selected + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(megadbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + } else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } +} + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/OV_pruned.R b/misc/PanCancer/pruneVersion/OV_pruned.R new file mode 100644 index 00000000..b9977ff7 --- /dev/null +++ b/misc/PanCancer/pruneVersion/OV_pruned.R @@ -0,0 +1,348 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) +require(netDx) +require(netDx.examples) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +cutoff <- 9 + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/input" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/pruned_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions + +# normalized difference +# x is vector of values, one per patient (e.g. ages) +normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/OV_clinical_core.txt",inDir), + survival=sprintf("%s/OV_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/OV_mRNA_core.txt",inDir), + prot=sprintf("%s/OV_RPPA_core.txt",inDir), + mir=sprintf("%s/OV_miRNA_core.txt",inDir), + dnam=sprintf("%s/OV_methylation_core.txt",inDir), + cnv=sprintf("%s/OV_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clin <- pheno +rownames(clin) <- clin[,1]; +clin <- t(clin[,2,drop=FALSE]) +dats$clinical <- clin; rm(clin) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID), drop = FALSE]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx, drop = FALSE] + x +}) + + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) + + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +#### ----------------------------------------------------- +### BEGIN PRUNING CODE +# apply pruning to proteomic data +curwd <- getwd() +setwd("..") +source("LMprune.R") +source("runLM.R") +source("silh.R") +require(cluster) +setwd(curwd) +for (nm in setdiff(names(dats),"clinical")) { +print(nm) + if (nrow(dats[[nm]])>10000) topVar <- 50 else topVar <- 100 + pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) + prune <- LMprune(dats[[nm]],pheno_all$STATUS,topVar=topVar) + dev.off() + if (!is.na(prune)) { + if (prune$bestThresh < 1) { + res <- prune$res + res <- subset(res, adj.P.Val < prune$bestThresh) + tmp <- dats[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] + dats[[nm]] <- tmp + netSets[[nm]] <- rownames(tmp) + cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) + cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) + } + } else { + cat(sprintf("%s: not pruning\n",nm)) + } +} + +## Create the mega database with all patients and all nets. +## This will be used to predict test samples by subsetting just for feature +## selected nets in a given round +## Note that this is useful for all train/test splits because we can always +## change which samples are query and can always subset based on which nets +## are feature selected in a given round. +netDir <- sprintf("%s/networks",megaDir) +# netList <- makePSN_NamedMatrix(alldat, + # rownames(alldat),netSets,netDir, + # verbose=FALSE,numCores=numCores,writeProfiles=TRUE) + +nonclin <- setdiff(names(netSets),"clinical") + +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) + +netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets["clinical"], + netDir,simMetric="custom",customFunc=normDiff, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + +netList <- c(netList,netList2) + +cat(sprintf("Total of %i nets\n", length(netList))) + +# now create database +megadbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) + +# first loop - over train/test splits +for (rngNum in 1:100) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + alldat_train <- alldat[,which(colnames(alldat) %in% pheno$ID)] + + netDir <- sprintf("%s/networks",outDir) + # netList <- makePSN_NamedMatrix(alldat_train, + # rownames(alldat_train),netSets, + # netDir,verbose=FALSE,numCores=numCores, + # writeProfiles=TRUE) + + nonclin <- setdiff(names(netSets),"clinical") + + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) + + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets["clinical"], + netDir,simMetric="custom",customFunc=normDiff, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + # pTally <- sub(".profile","",pTally) + # pTally <- sub("_cont","",pTally) + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",pDir2,g) + GM_writeQueryFile(qSamps,incNets=pTally + ,nrow(pheno_all),qFile) + resFile <- runGeneMANIA(megadbDir$dbDir,qFile,resDir=pDir2) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } + + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + outFile <- sprintf("%s/predictionResults.txt",pDir) + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) From fe2024884e955dd0c4c40d4a2e42f6db49a0264b Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Mon, 12 Feb 2018 16:04:38 -0500 Subject: [PATCH 017/124] passes params to P2NDriver. --- netDx/R/GM_createDB.R | 27 +++++++++++++++++++++------ netDx/R/makePSN_NamedMatrix.R | 16 ++++++++++------ netDx/man/GM_createDB.Rd | 11 ++++++++--- netDx/man/makePSN_NamedMatrix.Rd | 8 ++++++-- netDx/man/sparsifyNet.Rd | 4 +++- 5 files changed, 48 insertions(+), 18 deletions(-) diff --git a/netDx/R/GM_createDB.R b/netDx/R/GM_createDB.R index e2547d29..0a761bdf 100644 --- a/netDx/R/GM_createDB.R +++ b/netDx/R/GM_createDB.R @@ -19,6 +19,9 @@ #' @param netSfx (char) pattern for finding network files in \code{netDir}. #' @param verbose (logical) print messages #' @param numCores (integer) num cores for parallel processing +#' @param P2N_threshType (char) Most users shouldn't have to change this. +#' ProfileToNetworkDriver's threshold option. One of "off|auto". +#' @param P2N_maxMissing (integer 5-100) #' @param GMmemory (integer) Memory for GeneMANIA (in Gb) #' @param ... params for \code{GM_writeBatchFile()} #' @return (list). "dbDir": path to GeneMANIA database @@ -35,15 +38,19 @@ #' writeProfiles=TRUE); #' db <- GM_createDB("/tmp/nets/",pheno$ID,"/tmp") #' @export -GM_createDB <- function(netDir,patientID,outDir,simMetric="cor_pearson", - netSfx="_cont.txt$",verbose=TRUE,numCores=1L, - GMmemory=4L, ...) { +GM_createDB <- function(netDir,patientID,outDir,simMetric="pearson", + netSfx="_cont.txt$",verbose=TRUE,numCores=1L, P2N_threshType="off", + P2N_maxMissing=100,GMmemory=4L, ...) { # tmpDir/ is where all the prepared files are stored. # GeneMANIA uses tmpDir as input to create the generic database. # The database itself will be in outDir/ tmpDir <- sprintf("%s/tmp",outDir) dataDir <- sprintf("%s/dataset",outDir) + if (P2N_maxMissing < 5) PSN_maxMissing <- 5 + if (P2N_maxMissing >100) PSN_maxMissing <- 100 + if (!P2N_threshType %in% c("off","auto")) P2N_threshType <- "off" + if (file.exists(tmpDir)) unlink(tmpDir,recursive=TRUE) if (file.exists(dataDir)) unlink(dataDir,recursive=TRUE) dir.create(dataDir) @@ -109,12 +116,19 @@ GM_createDB <- function(netDir,patientID,outDir,simMetric="cor_pearson", if (length(netList1)>0) { cat("\t* Converting profiles to interaction networks\n") - cl <- makeCluster(numCores) + cl <- makeCluster(numCores,outfile=sprintf("%s/P2N_log.txt",tmpDir)) registerDoParallel(cl) + + if (simMetric=="pearson") { + corType <- "PEARSON" + } else if (simMetric == "MI") { + corType <- "MUTUAL_INFORMATION" + } cmd1 <- sprintf("java -Xmx%iG -cp %s org.genemania.engine.core.evaluation.ProfileToNetworkDriver", GMmemory,GM_jar) - cmd3 <- "-proftype continuous -cor PEARSON" - cmd5 <- "-threshold off -maxmissing 100.0" + cmd3 <- sprintf("-proftype continuous -cor %s",corType) + cmd5 <- sprintf("-threshold %s -maxmissing %1.1f", P2N_threshType, + P2N_maxMissing) profDir <- sprintf("%s/profiles",tmpDir) netOutDir <- sprintf("%s/INTERACTIONS",tmpDir) tmpsfx <- sub("\\$","",netSfx) @@ -125,6 +139,7 @@ GM_createDB <- function(netDir,patientID,outDir,simMetric="cor_pearson", cmd4 <- sprintf("-syn %s/1.synonyms -keepAllTies -limitTies", tmpDir) cmd <- sprintf("%s %s %s %s %s", cmd1,cmd2,cmd3,cmd4,cmd5) + print(cmd) system(cmd) } )) diff --git a/netDx/R/makePSN_NamedMatrix.R b/netDx/R/makePSN_NamedMatrix.R index 0d1f6f83..ede58fdc 100644 --- a/netDx/R/makePSN_NamedMatrix.R +++ b/netDx/R/makePSN_NamedMatrix.R @@ -28,7 +28,8 @@ #' that are input to network generation #' @param outDir (char) path to directory where networks are written #' @param simMetric (char) measure of similarity. See \code{getSimilarity()} -#' for details +#' for details. If writeProfiles is set to TRUE, must be one of pearson +#' (Pearson correlation) or MI (correlation by mutual information). #' @param cutoff (numeric) patients with similarity smaller than this value #' are not included in the corresponding interaction network #' @param verbose (logical) print detailed messages @@ -41,6 +42,8 @@ #' If FALSE, uses getSimilarity() and writes interaction networks. #' @param sparsify (logical) sparsify networks by calling sparsifyNets() #' with default parameters. Only used when writeProfiles=FALSE +#' @param useSparsify2 (logical). Currently for testing only. A cleaner +#' sparsification routine. #' @param append (logical) if TRUE does not overwrite netDir. #' @param ... passed to \code{getSimilarity()} #' @return (char) Basename of files to which networks are written. @@ -54,7 +57,7 @@ #' @export makePSN_NamedMatrix <- function(xpr, nm, namedSets, outDir, simMetric="pearson", cutoff=0.3,verbose=TRUE, - numCores=1L,writeProfiles=FALSE, + numCores=1L,writeProfiles=TRUE, sparsify=FALSE,useSparsify2=FALSE,append=FALSE,...){ if (!append) { if (file.exists(outDir)) unlink(outDir,recursive=TRUE) @@ -66,14 +69,15 @@ makePSN_NamedMatrix <- function(xpr, nm, namedSets, outDir, } } - if (simMetric!="pearson" & writeProfiles==TRUE) { - stop("writeProfiles must only be TRUE with simMetric is set to pearson. For all other metrics, set writeProfiles=FALSE") + if ((!simMetric %in% c("pearson","MI")) & writeProfiles==TRUE) { + print(simMetric) + stop("writeProfiles must only be TRUE with simMetric set to pearson or MI. For all other metrics, set writeProfiles=FALSE") } if (!sparsify & useSparsify2) { stop("if useSparsify=TRUE then sparsify must also be set to TRUE\n")} cl <- makeCluster(numCores) - registerDoParallel(cl) + registerDoParallel(cl,outfile=sprintf("%s/makePSN_log.txt",outDir)) # process pathways in parallel outFiles <- foreach (curSet=names(namedSets)) %dopar% { @@ -82,7 +86,7 @@ makePSN_NamedMatrix <- function(xpr, nm, namedSets, outDir, if (verbose) cat(sprintf("%i members\n", length(idx))) minMembers <- 1 - if (simMetric=="pearson") minMembers <- 3; + if (simMetric=="pearson") minMembers <- 5; oFile <- NULL # has sufficient connections to make network diff --git a/netDx/man/GM_createDB.Rd b/netDx/man/GM_createDB.Rd index 70e3add6..e41ef7e8 100644 --- a/netDx/man/GM_createDB.Rd +++ b/netDx/man/GM_createDB.Rd @@ -4,9 +4,9 @@ \alias{GM_createDB} \title{Create GeneMANIA database} \usage{ -GM_createDB(netDir, patientID, outDir, simMetric = "cor_pearson", - netSfx = "_cont.txt$", verbose = TRUE, numCores = 1L, GMmemory = 4L, - ...) +GM_createDB(netDir, patientID, outDir, simMetric = "pearson", + netSfx = "_cont.txt$", verbose = TRUE, numCores = 1L, + P2N_threshType = "off", P2N_maxMissing = 100, GMmemory = 4L, ...) } \arguments{ \item{netDir}{(char) path to dir with input networks/profiles. All @@ -27,6 +27,11 @@ profiles to interaction networks.} \item{numCores}{(integer) num cores for parallel processing} +\item{P2N_threshType}{(char) Most users shouldn't have to change this. +ProfileToNetworkDriver's threshold option. One of "off|auto".} + +\item{P2N_maxMissing}{(integer 5-100)} + \item{GMmemory}{(integer) Memory for GeneMANIA (in Gb)} \item{...}{params for \code{GM_writeBatchFile()}} diff --git a/netDx/man/makePSN_NamedMatrix.Rd b/netDx/man/makePSN_NamedMatrix.Rd index f716815c..ce868e60 100644 --- a/netDx/man/makePSN_NamedMatrix.Rd +++ b/netDx/man/makePSN_NamedMatrix.Rd @@ -5,7 +5,7 @@ \title{Create patient networks from full matrix of named measurements} \usage{ makePSN_NamedMatrix(xpr, nm, namedSets, outDir, simMetric = "pearson", - cutoff = 0.3, verbose = TRUE, numCores = 1L, writeProfiles = FALSE, + cutoff = 0.3, verbose = TRUE, numCores = 1L, writeProfiles = TRUE, sparsify = FALSE, useSparsify2 = FALSE, append = FALSE, ...) } \arguments{ @@ -24,7 +24,8 @@ that are input to network generation} \item{outDir}{(char) path to directory where networks are written} \item{simMetric}{(char) measure of similarity. See \code{getSimilarity()} -for details} +for details. If writeProfiles is set to TRUE, must be one of pearson +(Pearson correlation) or MI (correlation by mutual information).} \item{cutoff}{(numeric) patients with similarity smaller than this value are not included in the corresponding interaction network} @@ -41,6 +42,9 @@ If FALSE, uses getSimilarity() and writes interaction networks.} \item{sparsify}{(logical) sparsify networks by calling sparsifyNets() with default parameters. Only used when writeProfiles=FALSE} +\item{useSparsify2}{(logical). Currently for testing only. A cleaner +sparsification routine.} + \item{append}{(logical) if TRUE does not overwrite netDir.} \item{...}{passed to \code{getSimilarity()}} diff --git a/netDx/man/sparsifyNet.Rd b/netDx/man/sparsifyNet.Rd index 605972ac..0fdec27e 100644 --- a/netDx/man/sparsifyNet.Rd +++ b/netDx/man/sparsifyNet.Rd @@ -5,7 +5,7 @@ \title{Sparsifies patient similarity network} \usage{ sparsifyNet(net, outFile, k = 50L, MAX_INT = 600L, MAX_PCT = 0.02, - numPatients, keepTies = TRUE, verbose = TRUE) + numPatients, keepTies = TRUE, verbose = TRUE, cutoff = 0.3) } \arguments{ \item{net}{(char or data.frame) If of type char, should path to @@ -29,6 +29,8 @@ Details.} \item{keepTies}{(logical) keep edge ties. See Details} \item{verbose}{(logical) print messages} + +\item{cutoff}{(value between 0 and 1) min edge value to keep} } \value{ No value. Writes sparsified matrix to \code{outFile} From 0a885b7a09e972e1d4491e761efd56bc714c7e16 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Mon, 12 Feb 2018 16:05:28 -0500 Subject: [PATCH 018/124] updated for clean runs and testing variations --- misc/PanCancer/multiCutoff/GBM_parseMulti.R | 23 ++++---- misc/PanCancer/multiCutoff/KIRC_parseMulti.R | 57 ++++++++++++++++++++ misc/PanCancer/multiCutoff/LUSC_parseMulti.R | 52 +++++++++++++----- misc/PanCancer/multiCutoff/OV_parseMulti.R | 57 ++++++++++++++++++++ 4 files changed, 164 insertions(+), 25 deletions(-) create mode 100644 misc/PanCancer/multiCutoff/KIRC_parseMulti.R create mode 100644 misc/PanCancer/multiCutoff/OV_parseMulti.R diff --git a/misc/PanCancer/multiCutoff/GBM_parseMulti.R b/misc/PanCancer/multiCutoff/GBM_parseMulti.R index 60a8ffdf..8c59fb70 100644 --- a/misc/PanCancer/multiCutoff/GBM_parseMulti.R +++ b/misc/PanCancer/multiCutoff/GBM_parseMulti.R @@ -3,27 +3,28 @@ rm(list=ls()) require(netDx) require(reshape2) -dataDir_each <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pruneClinRNA_alone_180125" +#dataDir_each <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pruneClinRNA_alone_180125" -dataDir_both <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/prune_180124" +dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/prune_180204" #dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/PCA1net_180126" #dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/PCAmultinet_180126" -outmat <- matrix(NA,nrow=3,ncol=9) -settypes <- c("clinical","rna","clinicalArna") +settypes <- c("clinical","mir","rna","cnv","dnam", + "clinicalArna","clinicalAmir","clinicalAdnam","clinicalAcnv","all") +outmat <- matrix(NA,nrow=length(settypes),ncol=9) meas <- paste(rep(7:9,each=3),c("auroc","aupr","accuracy"),sep="_") rownames(outmat)<- settypes colnames(outmat) <- meas ctr <- 1 -outD <- sprintf("GBM_%s",basename(dataDir_each)) +outD <- sprintf("GBM_%s",basename(dataDir)) if (!file.exists(outD)) dir.create(outD) for (settype in settypes) { - if (settype %in% "clinicalArna") - dataDir <- dataDir_both - else - dataDir <- dataDir_each - rngDir <- paste(sprintf("%s/rng",dataDir), 1:10,sep="") +### if (settype %in% "clinicalArna") +### dataDir <- dataDir_both +### else +### dataDir <- dataDir_each + rngDir <- paste(sprintf("%s/rng",dataDir), 1:100,sep="") colctr <- 1 for (cutoff in 7:9) { @@ -53,7 +54,7 @@ ctr <- ctr+1 } print(round(outmat,digits=2)) -write.table(outmat,file=sprintf("%s/perf.txt",outD),sep="\t", +write.table(round(outmat,digits=2),file=sprintf("%s/perf.txt",outD),sep="\t", col=T,row=T,quote=F) diff --git a/misc/PanCancer/multiCutoff/KIRC_parseMulti.R b/misc/PanCancer/multiCutoff/KIRC_parseMulti.R new file mode 100644 index 00000000..e344348f --- /dev/null +++ b/misc/PanCancer/multiCutoff/KIRC_parseMulti.R @@ -0,0 +1,57 @@ +#' plot GBM results with multiple CV cutoffs +rm(list=ls()) +require(netDx) +require(reshape2) + +dataDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output/pruned_180204" + +settypes <- c("clinical","mir","rna","prot","cnv","dnam", + "clinicalArna","clinicalAmir","clinicalAprot","clinicalAdnam", + "clinicalAcnv","all") +outmat <- matrix(NA,nrow=length(settypes),ncol=3) +meas <- paste(rep(9,each=3),c("auroc","aupr","accuracy"),sep="_") +rownames(outmat)<- settypes +colnames(outmat) <- meas +ctr <- 1 +outD <- sprintf("KIRC_%s",basename(dataDir)) +if (!file.exists(outD)) dir.create(outD) + +for (settype in settypes) { +### if (settype %in% "clinicalArna") +### dataDir <- dataDir_both +### else +### dataDir <- dataDir_each + rngDir <- paste(sprintf("%s/rng",dataDir), 1:100,sep="") + +colctr <- 1 +for (cutoff in 9) { + c7 <- sprintf("%s/%s/predictionResults.txt", + rngDir,settype,cutoff) + torm <- c() + for (idx in 1:length(c7)) { + dat <- read.delim(c7[idx],sep="\t",h=T,as.is=T) + x1 <- sum(dat$STATUS=="SURVIVEYES") + x2 <- sum(dat$STATUS=="SURVIVENO") + if (x1<1 & x2<1) torm <- c(torm, idx) + } + cat(sprintf("%i: removing %i\n", cutoff,length(torm))) + if (length(torm)>0) c7 <- c7[-torm] + postscript(sprintf("%s/%s_cutoff%i.eps",outD,settype,cutoff)); + x <- plotPerf(c7,c("SURVIVEYES","SURVIVENO")) + dev.off() + + y1 <- unlist(lapply(x,function(i) i$auroc)) + y2 <- unlist(lapply(x,function(i) i$aupr)) + y3 <- unlist(lapply(x,function(i) i$accuracy)) + outmat[ctr,colctr+(0:2)] <- c(mean(y1),mean(y2),mean(y3)) + + colctr <- colctr+3 +} +ctr <- ctr+1 +} +print(round(outmat,digits=2)) + +write.table(round(outmat,digits=2),file=sprintf("%s/perf.txt",outD),sep="\t", + col=T,row=T,quote=F) + + diff --git a/misc/PanCancer/multiCutoff/LUSC_parseMulti.R b/misc/PanCancer/multiCutoff/LUSC_parseMulti.R index 82470ae1..f78ba41e 100644 --- a/misc/PanCancer/multiCutoff/LUSC_parseMulti.R +++ b/misc/PanCancer/multiCutoff/LUSC_parseMulti.R @@ -1,32 +1,56 @@ -#' plot LUSC results with multiple CV cutoffs +#' plot GBM results with multiple CV cutoffs rm(list=ls()) +require(netDx) +require(reshape2) -#dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/multiCutoff_180119" -#dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/prune_180125" -#dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/pruneCheckIntegr_180126" -dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/pruneRBF_180130" -#dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/prunePCA_180126" +dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/prunedMI_180212" -rngDir <- paste(sprintf("%s/rng",dataDir), 1:10,sep="") +settypes <- c("prot","clinicalAprot") #"mir","rna","prot","cnv", + #"clinicalArna","clinicalAmir","clinicalAprot","clinicalAcnv","all") +outmat <- matrix(NA,nrow=length(settypes),ncol=9) +meas <- paste(rep(7:9,each=3),c("auroc","aupr","accuracy"),sep="_") +rownames(outmat)<- settypes +colnames(outmat) <- meas +ctr <- 1 +outD <- sprintf("LUSC_%s",basename(dataDir)) +if (!file.exists(outD)) dir.create(outD) -require(netDx) +for (settype in settypes) { +### if (settype %in% "clinicalArna") +### dataDir <- dataDir_both +### else +### dataDir <- dataDir_each + rngDir <- paste(sprintf("%s/rng",dataDir), 1:10,sep="") + +colctr <- 1 for (cutoff in 7:9) { - c7 <- sprintf("%s/clinical/cutoff%i/predictionResults.txt",rngDir,cutoff) + c7 <- sprintf("%s/%s/cutoff%i/predictionResults.txt", + rngDir,settype,cutoff) torm <- c() for (idx in 1:length(c7)) { dat <- read.delim(c7[idx],sep="\t",h=T,as.is=T) x1 <- sum(dat$STATUS=="SURVIVEYES") x2 <- sum(dat$STATUS=="SURVIVENO") - # cat(sprintf("%i: %i YES, %i NO\n", idx,x1,x2)) if (x1<1 & x2<1) torm <- c(torm, idx) } cat(sprintf("%i: removing %i\n", cutoff,length(torm))) if (length(torm)>0) c7 <- c7[-torm] - postscript(sprintf("LUSC_cutoff%i.eps",cutoff)); + postscript(sprintf("%s/%s_cutoff%i.eps",outD,settype,cutoff)); x <- plotPerf(c7,c("SURVIVEYES","SURVIVENO")) - y <- unlist(lapply(x,function(i) i$auroc)) - cat(sprintf("%i, mean auroc= %1.2f\n", cutoff, mean(y))) dev.off() + + y1 <- unlist(lapply(x,function(i) i$auroc)) + y2 <- unlist(lapply(x,function(i) i$aupr)) + y3 <- unlist(lapply(x,function(i) i$accuracy)) + outmat[ctr,colctr+(0:2)] <- c(mean(y1),mean(y2),mean(y3)) + + colctr <- colctr+3 +} +ctr <- ctr+1 } +print(round(outmat,digits=2)) + +write.table(round(outmat,digits=2),file=sprintf("%s/perf.txt",outD),sep="\t", + col=T,row=T,quote=F) + -### diff --git a/misc/PanCancer/multiCutoff/OV_parseMulti.R b/misc/PanCancer/multiCutoff/OV_parseMulti.R new file mode 100644 index 00000000..0deadc1f --- /dev/null +++ b/misc/PanCancer/multiCutoff/OV_parseMulti.R @@ -0,0 +1,57 @@ +#' plot GBM results with multiple CV cutoffs +rm(list=ls()) +require(netDx) +require(reshape2) + +dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output/pruned_180206" + +settypes <- c("clinical","mir","rna","prot","cnv","dnam", + "clinicalArna","clinicalAmir","clinicalAprot","clinicalAdnam", + "clinicalAcnv","all") +outmat <- matrix(NA,nrow=length(settypes),ncol=3) +meas <- paste(rep(9,each=3),c("auroc","aupr","accuracy"),sep="_") +rownames(outmat)<- settypes +colnames(outmat) <- meas +ctr <- 1 +outD <- sprintf("OV_%s",basename(dataDir)) +if (!file.exists(outD)) dir.create(outD) + +for (settype in settypes) { +### if (settype %in% "clinicalArna") +### dataDir <- dataDir_both +### else +### dataDir <- dataDir_each + rngDir <- paste(sprintf("%s/rng",dataDir), 1:100,sep="") + +colctr <- 1 +for (cutoff in 9) { + c7 <- sprintf("%s/%s/predictionResults.txt", + rngDir,settype,cutoff) + torm <- c() + for (idx in 1:length(c7)) { + dat <- read.delim(c7[idx],sep="\t",h=T,as.is=T) + x1 <- sum(dat$STATUS=="SURVIVEYES") + x2 <- sum(dat$STATUS=="SURVIVENO") + if (x1<1 & x2<1) torm <- c(torm, idx) + } + cat(sprintf("%i: removing %i\n", cutoff,length(torm))) + if (length(torm)>0) c7 <- c7[-torm] + postscript(sprintf("%s/%s_cutoff%i.eps",outD,settype,cutoff)); + x <- plotPerf(c7,c("SURVIVEYES","SURVIVENO")) + dev.off() + + y1 <- unlist(lapply(x,function(i) i$auroc)) + y2 <- unlist(lapply(x,function(i) i$aupr)) + y3 <- unlist(lapply(x,function(i) i$accuracy)) + outmat[ctr,colctr+(0:2)] <- c(mean(y1),mean(y2),mean(y3)) + + colctr <- colctr+3 +} +ctr <- ctr+1 +} +print(round(outmat,digits=2)) + +write.table(round(outmat,digits=2),file=sprintf("%s/perf.txt",outD),sep="\t", + col=T,row=T,quote=F) + + From 9e85aef953cb56d00f471a0be438be73f2d39565 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Wed, 14 Feb 2018 11:56:10 -0500 Subject: [PATCH 019/124] pruned version --- .../PanCancer/pruneVersion/GBM_pruneTrained.R | 370 ++++++++++++++++++ misc/PanCancer/pruneVersion/KIRC_pruneTrain.R | 356 +++++++++++++++++ misc/PanCancer/pruneVersion/OV_pruneTrain.R | 339 ++++++++++++++++ 3 files changed, 1065 insertions(+) create mode 100644 misc/PanCancer/pruneVersion/GBM_pruneTrained.R create mode 100644 misc/PanCancer/pruneVersion/KIRC_pruneTrain.R create mode 100644 misc/PanCancer/pruneVersion/OV_pruneTrain.R diff --git a/misc/PanCancer/pruneVersion/GBM_pruneTrained.R b/misc/PanCancer/pruneVersion/GBM_pruneTrained.R new file mode 100644 index 00000000..e9d9eacf --- /dev/null +++ b/misc/PanCancer/pruneVersion/GBM_pruneTrained.R @@ -0,0 +1,370 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) +require(netDx) +require(netDx.examples) +source("../runLM.R") + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/prune_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + + + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +#### ----------------------------------------------------- +### BEGIN PRUNING CODE +# apply pruning to proteomic data +curwd <- getwd() +setwd("..") +source("LMprune.R") +source("runLM.R") +source("silh.R") +require(cluster) +setwd(curwd) + + +# first loop - over train/test splits +for (rngNum in 1:100) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + for (nm in setdiff(names(dats),"clinical")) { + print(nm) + if (nrow(dats[[nm]])>10000) topVar <- 50 else topVar <- 100 + pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) + prune <- LMprune(dats_train[[nm]],pheno$STATUS,topVar=topVar) + dev.off() + if (!is.na(prune)) { + if (prune$bestThresh < 1) { + res <- prune$res + res <- subset(res, adj.P.Val < prune$bestThresh) + tmp <- dats[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] + dats_train[[nm]] <- tmp + netSets_iter[[nm]] <- rownames(tmp) + cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) + cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) + } + } else { + cat(sprintf("%s: not pruning\n",nm)) + } + } + alldat_train <- do.call("rbind",dats_train) + netSets_iter[["clinical"]] <- netSets[["clinical"]] + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } +## Create the mega database with all patients and all nets. +## This will be used to predict test samples by subsetting just for feature +## selected nets in a given round +## Note that this is useful for all train/test splits because we can always +## change which samples are query and can always subset based on which nets +## are feature selected in a given round. +netDir <- sprintf("%s/test_networks",outDir) +nonclin <- setdiff(names(netSets),"clinical") +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) +netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) +netList <- c(netList,netList2) +cat(sprintf("Total of %i nets\n", length(netList))) +# now create database +testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/KIRC_pruneTrain.R b/misc/PanCancer/pruneVersion/KIRC_pruneTrain.R new file mode 100644 index 00000000..79e6c8be --- /dev/null +++ b/misc/PanCancer/pruneVersion/KIRC_pruneTrain.R @@ -0,0 +1,356 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) +require(netDx) +require(netDx.examples) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +cutoff <- 9 + +inDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/input" +outRoot <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/pruneTrain_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/KIRC_clinical_core.txt",inDir), + survival=sprintf("%s/KIRC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/KIRC_mRNA_core.txt",inDir), + prot=sprintf("%s/KIRC_RPPA_core.txt",inDir), + mir=sprintf("%s/KIRC_miRNA_core.txt",inDir), + dnam=sprintf("%s/KIRC_methylation_core.txt",inDir), + cnv=sprintf("%s/KIRC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +#======transform clinical data========= +pheno$grade <- as.vector(pheno$grade) +pheno$grade[pheno$grade=="G1"] <- "G2" +pheno$grade[pheno$grade=="GX"] <- "G2" +pheno$grade <- as.factor(pheno$grade) +pheno <- pheno[, -which(colnames(pheno)=="gender")] +#====================================== + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +clinical$grade <- as.numeric(factor(clinical$grade)) +clinical$stage <- as.numeric(factor(clinical$stage)) +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) + + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +#### ----------------------------------------------------- +### BEGIN PRUNING CODE +# apply pruning to proteomic data +curwd <- getwd() +setwd("..") +source("LMprune.R") +source("runLM.R") +source("silh.R") +require(cluster) +setwd(curwd) + +# first loop - over train/test splits +for (rngNum in 1:25) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") +## pruneTrain code ------ + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + netSets_iter <- list() + for (nm in setdiff(names(dats_train),"clinical")) { + print(nm) + if (nrow(dats_train[[nm]])>10000) topVar <- 50 else topVar <- 100 + pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) + prune <- LMprune(dats_train[[nm]],pheno$STATUS,topVar=topVar) + dev.off() + if (!is.na(prune)) { + if (prune$bestThresh < 1) { + res <- prune$res + res <- subset(res, adj.P.Val < prune$bestThresh) + tmp <- dats_train[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] + dats_train[[nm]] <- tmp + netSets_iter[[nm]] <- rownames(tmp) + cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) + cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) + } + } else { + cat(sprintf("%s: not pruning\n",nm)) + } + } + alldat_train <- do.call("rbind",dats_train) + netSets_iter[["clinical"]] <- netSets[["clinical"]] +## pruneTrain code end + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2, + verbose=FALSE,numCores=numCores,writeProfiles=FALSE, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + ## pruneTrain: make test database + ## This will be used to predict test samples by subsetting just for feature + ## selected nets in a given round + ## Note that this is useful for all train/test splits because we can always + ## change which samples are query and can always subset based on which nets + ## are feature selected in a given round. + netDir <- sprintf("%s/test_networks",outDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(netDir, pheno_all$ID, + outDir,numCores=numCores) + + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",pDir2,g) + GM_writeQueryFile(qSamps,incNets=pTally,nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } + + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + outFile <- sprintf("%s/predictionResults.txt",pDir) + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/OV_pruneTrain.R b/misc/PanCancer/pruneVersion/OV_pruneTrain.R new file mode 100644 index 00000000..c86babba --- /dev/null +++ b/misc/PanCancer/pruneVersion/OV_pruneTrain.R @@ -0,0 +1,339 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) +require(netDx) +require(netDx.examples) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +cutoff <- 9 + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/input" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/pruneTrain_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions + +# normalized difference +# x is vector of values, one per patient (e.g. ages) +normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/OV_clinical_core.txt",inDir), + survival=sprintf("%s/OV_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/OV_mRNA_core.txt",inDir), + prot=sprintf("%s/OV_RPPA_core.txt",inDir), + mir=sprintf("%s/OV_miRNA_core.txt",inDir), + dnam=sprintf("%s/OV_methylation_core.txt",inDir), + cnv=sprintf("%s/OV_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clin <- pheno +rownames(clin) <- clin[,1]; +clin <- t(clin[,2,drop=FALSE]) +dats$clinical <- clin; rm(clin) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID), drop = FALSE]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx, drop = FALSE] + x +}) + + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) + + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +#### ----------------------------------------------------- +### BEGIN PRUNING CODE +# apply pruning to proteomic data +curwd <- getwd() +setwd("..") +source("LMprune.R") +source("runLM.R") +source("silh.R") +require(cluster) +setwd(curwd) + +# first loop - over train/test splits +for (rngNum in 1:25) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + +# pruneTrain: ---- + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + netSets_iter <- list() + for (nm in setdiff(names(dats_train),"clinical")) { + print(nm) + if (nrow(dats_train[[nm]])>10000) topVar <- 50 else topVar <- 100 + pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) + prune <- LMprune(dats_train[[nm]],pheno$STATUS,topVar=topVar) + dev.off() + if (!is.na(prune)) { + if (prune$bestThresh < 1) { + res <- prune$res + res <- subset(res, adj.P.Val < prune$bestThresh) + tmp <- dats_train[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] + netSets_iter[[nm]] <- tmp + netSets_iter[[nm]] <- rownames(tmp) + cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) + cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) + } + } else { + cat(sprintf("%s: not pruning\n",nm)) + } + } + alldat_train <- do.call("rbind",dats_train) +netSets_iter[["clinical"]] <- netSets[["clinical"]] +## end pruning code +## ---- + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) + + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff, + verbose=FALSE,numCores=numCores,writeProfiles=FALSE, + sparsify=TRUE,append=TRUE) + + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + # -------- + # pruneTrain: make test database + test_netDir <- sprintf("%s/test_networks",outDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + ### netSets_iter has univariate filtering for curr round + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],test_netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + test_netDir,simMetric="custom",customFunc=normDiff, + verbose=FALSE,numCores=numCores, writeProfiles=FALSE, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(test_netDir, pheno_all$ID, + outDir,numCores=numCores) + + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + # pTally <- sub(".profile","",pTally) + # pTally <- sub("_cont","",pTally) + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",pDir2,g) + GM_writeQueryFile(qSamps,incNets=pTally + ,nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } + + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + outFile <- sprintf("%s/predictionResults.txt",pDir) + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) From 4e8c460460c9a62eb572b48ad544e5df376ecd6e Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Fri, 23 Feb 2018 10:39:55 -0500 Subject: [PATCH 020/124] asthma --- misc/Asthma_PBMC/Asthma_plotResults.R | 68 ++++++++++++++++++++++++++ misc/Asthma_PBMC/netDx.R | 69 +++++++++++++++++++++++++++ 2 files changed, 137 insertions(+) create mode 100644 misc/Asthma_PBMC/Asthma_plotResults.R create mode 100644 misc/Asthma_PBMC/netDx.R diff --git a/misc/Asthma_PBMC/Asthma_plotResults.R b/misc/Asthma_PBMC/Asthma_plotResults.R new file mode 100644 index 00000000..cadf7e40 --- /dev/null +++ b/misc/Asthma_PBMC/Asthma_plotResults.R @@ -0,0 +1,68 @@ +#' plot BRCA results +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(GEOquery) +require(org.Hs.eg.db) + +rootDir <- "/home/shraddhapai/BaderLab/2018_AsthmaPBMC" +inDir <- sprintf("%s/input",rootDir) +datDir <- sprintf("%s/output/basic_180220/pred",rootDir) +outDir <- sprintf("%s/output/basic_180220/plot",rootDir) +pathFile <-sprintf("%s/anno/Human_AllPathways_November_01_2017_symbol.gmt", + rootDir) + +dat <- getGEO(filename=sprintf("%s/GSE40732_series_matrix.txt.gz",inDir), + GSEMatrix=TRUE) +xpr <- exprs(dat) +pheno <- pData(dat) +# map GB ID to symbol +x <- mapIds(org.Hs.eg.db, keys=rownames(xpr), + column="SYMBOL",keytype="ACCNUM", + multiVals="first") +common <- intersect(names(x),rownames(xpr)) +xpr <- xpr[which(rownames(xpr) %in% common),] +x <- x[which(names(x) %in% common)] + +midx <- match(rownames(xpr),names(x)) +gnames <- x[midx] +agg <- aggregate(xpr, by=list(gene_name=gnames),FUN=mean) +xpr <- agg[,-1] +rownames(xpr) <- agg[,1] + +pheno <- pheno[,c("geo_accession","characteristics_ch1")] +st <- rep(NA, nrow(pheno)) +st[which(pheno[,2] %in% "asthma: FALSE")] <- "control" +st[which(pheno[,2] %in% "asthma: TRUE")] <- "asthma" +pheno[,2] <- st +colnames(pheno) <- c("ID","STATUS") +pheno[,1] <- as.character(pheno[,1]) + +pathwayList <- readPathways(pathFile) +head(pathwayList) + +if (!file.exists(outDir)) dir.create(outDir) +predClasses <- unique(pheno$STATUS) +postscript(sprintf("%s/perf.eps",outDir)) +#predDir <- sprintf("%s/rng%i/predictionResults.txt", +# datDir,1:11) +predPerf <- plotPerf(datDir, predClasses=predClasses) +dev.off() +auroc <- unlist(lapply(predPerf, function(x) x$auroc)) +aupr <- unlist(lapply(predPerf, function(x) x$aupr)) +acc <- unlist(lapply(predPerf, function(x) x$accuracy)) + +###featList <- list( +### control=sprintf("%s/rng%i/control/GM_results/control_pathway_CV_score.txt", +### datDir,1:11), +### asthma=sprintf("%s/rng%i/asthma/GM_results/asthma_pathway_CV_score.txt", +### datDir,1:11)) +featScores <- getFeatureScores(datDir,predClasses=predClasses) +featSelNet <- lapply(featScores, function(x) { + callFeatSel(x, fsCutoff=10, fsPctPass=0.9) +}) + +netInfoFile <- sprintf("%s/inputNets.txt",datDir) +netInfo <- read.delim(netInfoFile,sep="\t",h=FALSE,as.is=TRUE) +EMap_input <- writeEMapInput_many(featScores,pathwayList, + netInfo,outDir=outDir) diff --git a/misc/Asthma_PBMC/netDx.R b/misc/Asthma_PBMC/netDx.R new file mode 100644 index 00000000..03ace33f --- /dev/null +++ b/misc/Asthma_PBMC/netDx.R @@ -0,0 +1,69 @@ +# Ependymoma +rm(list=ls()) + +require(GEOquery) +require(netDx) +require(netDx.examples) +require(org.Hs.eg.db) + +rootDir <- "/home/shraddhapai/BaderLab/2018_AsthmaPBMC" +inDir <- sprintf("%s/input",rootDir) +outDir <- sprintf("%s/output",rootDir) +pathFile <-sprintf("%s/anno/Human_AllPathways_February_01_2018_symbol.gmt", + rootDir) + +dat <- getGEO(filename=sprintf("%s/GSE40732_series_matrix.txt.gz",inDir), + GSEMatrix=TRUE) +xpr <- exprs(dat) +pheno <- pData(dat) +# map GB ID to symbol +x <- mapIds(org.Hs.eg.db, keys=rownames(xpr), column="SYMBOL",keytype="ACCNUM", + multiVals="first") +common <- intersect(names(x),rownames(xpr)) +xpr <- xpr[which(rownames(xpr) %in% common),] +x <- x[which(names(x) %in% common)] + +midx <- match(rownames(xpr),names(x)) +gnames <- x[midx] +agg <- aggregate(xpr, by=list(gene_name=gnames),FUN=mean) +xpr <- agg[,-1] +rownames(xpr) <- agg[,1] + +pheno <- pheno[,c("geo_accession","characteristics_ch1")] +st <- rep(NA, nrow(pheno)) +st[which(pheno[,2] %in% "asthma: FALSE")] <- "control" +st[which(pheno[,2] %in% "asthma: TRUE")] <- "asthma" +pheno[,2] <- st +colnames(pheno) <- c("ID","STATUS") +pheno[,1] <- as.character(pheno[,1]) + +pathwayList <- readPathways(pathFile) +head(pathwayList) + +makeNets <- function(dataList, groupList, netDir,...) { + netList <- c() + # make RNA nets: group by pathway + if (!is.null(groupList[["rna"]])) { + netList <- makePSN_NamedMatrix(dataList$rna, + rownames(dataList$rna), + groupList[["rna"]],netDir,verbose=FALSE, + writeProfiles=TRUE,...) + netList <- unlist(netList) + cat(sprintf("Made %i RNA pathway nets\n", length(netList))) + } + cat(sprintf("Total of %i nets\n", length(netList))) + return(netList) +} + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/basic_%s",outDir,dt) +if (!file.exists(megaDir)) dir.create(megaDir) + +gps <- list(rna=pathwayList) +dats <- list(rna=xpr) + +runPredictor_nestedCV(pheno, + dataList=dats,groupList=gps, + makeNetFunc=makeNets, ### custom network creation function + outDir=sprintf("%s/pred",megaDir), + numCores=2L,nFoldCV=10L, CVcutoff=9L,numSplits=100L,CVmemory=13L) From ce1c14e1617c400ddc81759e8ff0cf4f38b2cdfc Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Fri, 23 Feb 2018 10:40:10 -0500 Subject: [PATCH 021/124] latest brca, pthway file update --- misc/BRCA/BRCA_example.R | 16 +++++++-------- misc/BRCA/BRCA_oneRNAmat.R | 40 ++++++++++++++++++++++++++++++++++++ misc/BRCA/BRCA_plotResults.R | 9 ++++---- 3 files changed, 53 insertions(+), 12 deletions(-) create mode 100644 misc/BRCA/BRCA_oneRNAmat.R diff --git a/misc/BRCA/BRCA_example.R b/misc/BRCA/BRCA_example.R index 40ee2645..f3445a34 100644 --- a/misc/BRCA/BRCA_example.R +++ b/misc/BRCA/BRCA_example.R @@ -8,11 +8,6 @@ subtypes<- c("LumA") pheno$STATUS[which(!pheno$STATUS %in% subtypes)] <- "other" subtypes <- c(subtypes,"other") # add residual -pathFile <- sprintf("%s/extdata/Human_160124_AllPathways.gmt", - path.package("netDx.examples")) -pathwayList <- readPathways(pathFile) -head(pathwayList) - BRCA_makeNets <- function(dataList, groupList, netDir,...) { netList <- c() @@ -29,9 +24,14 @@ BRCA_makeNets <- function(dataList, groupList, netDir,...) { return(netList) } -rootDir <- "/home/shraddhapai/BaderLab/2017_BRCA/output/" +rootDir <- "/home/shraddhapai/BaderLab/2017_BRCA" dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/BRCA_%s",rootDir,dt) +megaDir <- sprintf("%s/output/BRCA_%s",rootDir,dt) + +pathFile <- sprintf("%s/anno/Human_AllPathways_February_01_2018_symbol.gmt", + rootDir) +pathwayList <- readPathways(pathFile) +head(pathwayList) gps <- list(rna=pathwayList) dats <- list(rna=xpr) @@ -40,4 +40,4 @@ runPredictor_nestedCV(pheno, dataList=dats,groupList=gps, makeNetFunc=BRCA_makeNets, ### custom network creation function outDir=megaDir, - numCores=10L,nFoldCV=10L, CVcutoff=9L,numSplits=25L) + numCores=4L,nFoldCV=10L, CVcutoff=9L,numSplits=100L) diff --git a/misc/BRCA/BRCA_oneRNAmat.R b/misc/BRCA/BRCA_oneRNAmat.R new file mode 100644 index 00000000..1495586e --- /dev/null +++ b/misc/BRCA/BRCA_oneRNAmat.R @@ -0,0 +1,40 @@ +# BRCA example with nested cv + +require(netDx) +require(netDx.examples) +data(TCGA_BRCA) + +subtypes<- c("LumA") +pheno$STATUS[which(!pheno$STATUS %in% subtypes)] <- "other" +subtypes <- c(subtypes,"other") # add residual + + +BRCA_makeNets <- function(dataList, groupList, netDir,...) { + netList <- c() + # make RNA nets: group by pathway + if (!is.null(groupList[["rna"]])) { + netList <- makePSN_NamedMatrix(dataList$rna, + rownames(dataList$rna), + groupList[["rna"]],netDir,verbose=FALSE, + writeProfiles=TRUE,...) + netList <- unlist(netList) + cat(sprintf("Made %i RNA pathway nets\n", length(netList))) + } + cat(sprintf("Total of %i nets\n", length(netList))) + return(netList) +} + +rootDir <- "/home/shraddhapai/BaderLab/2017_BRCA" +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/output/BRCA_OneRNAnet_%s",rootDir,dt) + +geneSet <- list(allrna=rownames(xpr)) + +gps <- list(rna=geneSet) +dats <- list(rna=xpr) + +runPredictor_nestedCV(pheno, + dataList=dats,groupList=gps, + makeNetFunc=BRCA_makeNets, ### custom network creation function + outDir=megaDir, + numCores=4L,nFoldCV=10L, CVcutoff=9L,numSplits=100L) diff --git a/misc/BRCA/BRCA_plotResults.R b/misc/BRCA/BRCA_plotResults.R index a0470cc0..6b51edb6 100644 --- a/misc/BRCA/BRCA_plotResults.R +++ b/misc/BRCA/BRCA_plotResults.R @@ -4,11 +4,12 @@ require(netDx) require(netDx.examples) data(TCGA_BRCA) -rootDir <- "/Users/shraddhapai/Dropbox/netDx/BaderLab/2017_BRCA/output/BRCA_180117" +rootDir <- "/Users/shraddhapai/Dropbox/netDx/BaderLab/2017_BRCA/output/BRCA_180220" -pathFile <- sprintf("%s/extdata/Human_160124_AllPathways.gmt", - path.package("netDx.examples")) +pathFile <- sprintf("%s/anno/Human_AllPathways_November_01_2017_symbol.gmt", + rootDir) pathwayList <- readPathways(pathFile) +head(pathwayList) xpr_genes <- rownames(xpr) pathwayList <- lapply(pathwayList,function(x) x[which(x %in% xpr_genes)]) @@ -33,4 +34,4 @@ featSelNet <- lapply(featScores, function(x) { netInfoFile <- sprintf("%s/inputNets.txt",inDir) netInfo <- read.delim(netInfoFile,sep="\t",h=FALSE,as.is=TRUE) EMap_input <- writeEMapInput_many(featScores,pathwayList, - netInfo,outDir=outDir) + netInfo,outDir=outDir,pctPass=0.5) From 694cf73fd3ec97b8902c69c0d891ae947cd5ee75 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Fri, 23 Feb 2018 10:41:48 -0500 Subject: [PATCH 022/124] can change which split the run starts at. for multi-computer jobs or restarts --- netDx/R/runPredictor_nestedCV.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/netDx/R/runPredictor_nestedCV.R b/netDx/R/runPredictor_nestedCV.R index 08161339..1feed14c 100644 --- a/netDx/R/runPredictor_nestedCV.R +++ b/netDx/R/runPredictor_nestedCV.R @@ -40,17 +40,20 @@ #' @param keepAllData (logical) if TRUE keeps all intermediate files, even #' those not needed for assessing the predictor. Use very cautiously as for #' some designs, each split can result in using 1Gb of data. +#' @param startAt (integer) which of the splits to start at (e.g. if the +#' job aborted part-way through) #' @examples see examples/NestedCV_MultiData.Rmd for example use. #' @export runPredictor_nestedCV <- function(pheno,dataList,groupList,outDir,makeNetFunc, nFoldCV=10L,trainProp=0.8,numSplits=10L,numCores,CVmemory=4L,CVcutoff=9L, - keepAllData=FALSE) { + keepAllData=FALSE,startAt=1L) { ### tests# pheno$ID and $status must exist if (missing(dataList)) stop("dataList must be supplied.\n") if (missing(groupList)) stop("groupList must be supplied.\n") if (trainProp <= 0 | trainProp >= 1) stop("trainProp must be greater than 0 and less than 1") +if (startAt > numSplits) stop("startAt should be between 1 and numSplits") megaDir <- outDir if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) @@ -95,7 +98,7 @@ cat("\n\nCustom function to generate input nets:\n") print(makeNetFunc) cat(sprintf("-------------------------------\n\n")) -for (rngNum in 1:numSplits) { +for (rngNum in startAt:numSplits) { cat(sprintf("-------------------------------\n")) cat(sprintf("RNG seed = %i\n", rngNum)) cat(sprintf("-------------------------------\n")) From 5f54b1abaae8f9e48a21a0e347d6d34ef238fc25 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Fri, 23 Feb 2018 10:42:15 -0500 Subject: [PATCH 023/124] writes psn creation to log file --- netDx/R/makePSN_NamedMatrix.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/netDx/R/makePSN_NamedMatrix.R b/netDx/R/makePSN_NamedMatrix.R index ede58fdc..1991ebeb 100644 --- a/netDx/R/makePSN_NamedMatrix.R +++ b/netDx/R/makePSN_NamedMatrix.R @@ -76,8 +76,8 @@ makePSN_NamedMatrix <- function(xpr, nm, namedSets, outDir, if (!sparsify & useSparsify2) { stop("if useSparsify=TRUE then sparsify must also be set to TRUE\n")} - cl <- makeCluster(numCores) - registerDoParallel(cl,outfile=sprintf("%s/makePSN_log.txt",outDir)) + cl <- makeCluster(numCores,outfile=sprintf("%s/makePSN_log.txt",outDir)) + registerDoParallel(cl) # process pathways in parallel outFiles <- foreach (curSet=names(namedSets)) %dopar% { From fb5383c2953a55cb4553b12ad6017989d3eaeac4 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 27 Feb 2018 09:10:48 -0500 Subject: [PATCH 024/124] updated to latest brca --- misc/BRCA/BRCA_plotResults.R | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/misc/BRCA/BRCA_plotResults.R b/misc/BRCA/BRCA_plotResults.R index 6b51edb6..c36cfdc0 100644 --- a/misc/BRCA/BRCA_plotResults.R +++ b/misc/BRCA/BRCA_plotResults.R @@ -4,9 +4,9 @@ require(netDx) require(netDx.examples) data(TCGA_BRCA) -rootDir <- "/Users/shraddhapai/Dropbox/netDx/BaderLab/2017_BRCA/output/BRCA_180220" +rootDir <- "/home/shraddhapai/BaderLab/2017_BRCA" -pathFile <- sprintf("%s/anno/Human_AllPathways_November_01_2017_symbol.gmt", +pathFile <- sprintf("%s/anno/Human_AllPathways_February_01_2018_symbol.gmt", rootDir) pathwayList <- readPathways(pathFile) head(pathwayList) @@ -20,12 +20,21 @@ pheno$STATUS[which(pheno$STATUS!="LumA")] <- "other" # outDir=sprintf("%s/plot",rootDir), # fsCutoff=10,fsPctPass=0.7,pathwaySet=pathwayList) -inDir <- sprintf("%s/pred",rootDir) -outDir <- sprintf("%s/plot",rootDir) +inDir <- sprintf("%s/output/BRCA_part2_180223",rootDir) +outDir <- sprintf("%s/output/BRCA_part2_180223/plot",rootDir) +if (!file.exists(outDir)) dir.create(outDir) predClasses <- unique(pheno$STATUS) postscript(sprintf("%s/perf.eps",outDir)) + +#predFiles <- sprintf("%s/rng%i/predictionResults.txt", inDir,1:79) predPerf <- plotPerf(inDir, predClasses=predClasses) dev.off() + +featFiles <- list( + LumA=sprintf("%s/rng%i/LumA/GM_results/LumA_pathway_CV_score.txt", inDir,1:79), + other=sprintf("%s/rng%i/other/GM_results/other_pathway_CV_score.txt", inDir,1:79) +) +#featScores <- getFeatureScores(featFiles,predClasses=predClasses) featScores <- getFeatureScores(inDir,predClasses=predClasses) featSelNet <- lapply(featScores, function(x) { callFeatSel(x, fsCutoff=10, fsPctPass=0.7) @@ -34,4 +43,18 @@ featSelNet <- lapply(featScores, function(x) { netInfoFile <- sprintf("%s/inputNets.txt",inDir) netInfo <- read.delim(netInfoFile,sep="\t",h=FALSE,as.is=TRUE) EMap_input <- writeEMapInput_many(featScores,pathwayList, - netInfo,outDir=outDir,pctPass=0.5) + netInfo,outDir=outDir,pctPass=0.7) + +auroc <- unlist(lapply(predPerf,function(x) x$auroc)) +aupr <- unlist(lapply(predPerf,function(x) x$aupr)) +acc <- unlist(lapply(predPerf,function(x) x$accuracy)) +cat(sprintf("mean auroc = %1.2f ; aupr = %1.2f ; acc = %1.2f%%", + mean(auroc), mean(aupr), mean(acc))) + +cat("Performance\n") +#cat(sprintf("AUROC = %1.2f +/- %1.2f\n", mean(auroc),sd(auroc)/sqrt(length(auroc)))) +#cat(sprintf("AUPR = %1.2f +/- %1.2f\n", mean(aupr),sd(aupr)/sqrt(length(aupr)))) +#cat(sprintf("ACC = %1.2f +/- %1.2f\n", mean(acc),sd(acc)/sqrt(length(acc)))) +cat(sprintf("AUROC = %1.2f +/- %1.2f\n", mean(auroc),sd(auroc))) +cat(sprintf("AUPR = %1.2f +/- %1.2f\n", mean(aupr),sd(aupr))) +cat(sprintf("ACC = %1.2f +/- %1.2f\n", mean(acc),sd(acc))) From 889714646588c1b18e6fc41c416aa52352c79317 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 27 Feb 2018 09:11:09 -0500 Subject: [PATCH 025/124] compare one-net and pathways and pathway-level predictor after gmt change --- misc/BRCA/BRCA_example.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/misc/BRCA/BRCA_example.R b/misc/BRCA/BRCA_example.R index f3445a34..e28615a3 100644 --- a/misc/BRCA/BRCA_example.R +++ b/misc/BRCA/BRCA_example.R @@ -26,7 +26,7 @@ BRCA_makeNets <- function(dataList, groupList, netDir,...) { rootDir <- "/home/shraddhapai/BaderLab/2017_BRCA" dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/output/BRCA_%s",rootDir,dt) +megaDir <- sprintf("%s/output/BRCA_part2_%s",rootDir,dt) pathFile <- sprintf("%s/anno/Human_AllPathways_February_01_2018_symbol.gmt", rootDir) @@ -40,4 +40,5 @@ runPredictor_nestedCV(pheno, dataList=dats,groupList=gps, makeNetFunc=BRCA_makeNets, ### custom network creation function outDir=megaDir, - numCores=4L,nFoldCV=10L, CVcutoff=9L,numSplits=100L) + numCores=8L,nFoldCV=10L, CVcutoff=9L,numSplits=100L,CVmemory=13L, + startAt=3L) From 5ae43efa98566cd7d0e7b641081403cde58e81ec Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 27 Feb 2018 09:11:38 -0500 Subject: [PATCH 026/124] compare one-net and pathways and pathway-level predictor after gmt change --- misc/BRCA/BRCA_oneNet_plotResults.R | 36 +++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 misc/BRCA/BRCA_oneNet_plotResults.R diff --git a/misc/BRCA/BRCA_oneNet_plotResults.R b/misc/BRCA/BRCA_oneNet_plotResults.R new file mode 100644 index 00000000..60bfc40c --- /dev/null +++ b/misc/BRCA/BRCA_oneNet_plotResults.R @@ -0,0 +1,36 @@ +#' plot BRCA results + +require(netDx) +require(netDx.examples) +data(TCGA_BRCA) + +rootDir <- "/home/shraddhapai/BaderLab/2017_BRCA/output" + +pheno$STATUS[which(pheno$STATUS!="LumA")] <- "other" + +# get one rna net perf +inDir <- sprintf("%s/BRCA_OneRNAnet_180221/.",rootDir) +outDir <- sprintf("%s/BRCA_OneRNAnet_180221/plot",rootDir) +predClasses <- unique(pheno$STATUS) +postscript(sprintf("%s/perf.eps",outDir)) +predPerf_oneRNA <- plotPerf(inDir, predClasses=predClasses) +dev.off() + +inDir_path <- sprintf("%s/BRCA_part2_180223", rootDir) +outDir <- sprintf("%s/BRCA_part2_180223/plot",rootDir) +postscript(sprintf("%s/perf.eps",outDir)) +predPerf_path <- plotPerf(inDir_path, predClasses=predClasses) +dev.off() + +auc_onerna <- unlist(lapply(predPerf_oneRNA,function(x) x$auroc)) +auc_path <- unlist(lapply(predPerf_path,function(x) x$auroc)) +wmw <- wilcox.test(auc_onerna,auc_path) +pdf("BRCA_oneNetVsPath.pdf") +boxplot(list(OneNet=auc_onerna, Pathways=auc_path), + main=sprintf("BRCA, one vs path\n(WMW p < %1.2e)", + wmw$p.value)) +dev.off() +cat(sprintf("One RNA: AUC=%1.2f +/- %1.2f\n",mean(auc_onerna),sd(auc_onerna))) +cat(sprintf("Pathways: AUC=%1.2f +/- %1.2f\n",mean(auc_path),sd(auc_path))) + + From f28bceea9b0b53651e1e6ab058afcb605bbeacb0 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 19 Apr 2018 18:03:54 -0400 Subject: [PATCH 027/124] univariate filtering now instead outer split --- .../PanCancer/pruneVersion/GBM_pruneTrained.R | 11 +- misc/PanCancer/pruneVersion/KIRC_pruneTrain.R | 30 +- misc/PanCancer/pruneVersion/LUSC_pruneTrain.R | 384 ++++++++++++++++++ misc/PanCancer/pruneVersion/OV_pruneTrain.R | 7 +- .../pruneVersion/{ => outdated}/GBM_pruned.R | 0 .../pruneVersion/{ => outdated}/KIRC_pruned.R | 0 .../pruneVersion/{ => outdated}/LUSC_MI.R | 0 .../{ => outdated}/LUSC_oneNetPer_LMprune.R | 0 .../{ => outdated}/LUSC_sparse2.R | 0 .../pruneVersion/{ => outdated}/OV_pruned.R | 0 10 files changed, 409 insertions(+), 23 deletions(-) create mode 100644 misc/PanCancer/pruneVersion/LUSC_pruneTrain.R rename misc/PanCancer/pruneVersion/{ => outdated}/GBM_pruned.R (100%) rename misc/PanCancer/pruneVersion/{ => outdated}/KIRC_pruned.R (100%) rename misc/PanCancer/pruneVersion/{ => outdated}/LUSC_MI.R (100%) rename misc/PanCancer/pruneVersion/{ => outdated}/LUSC_oneNetPer_LMprune.R (100%) rename misc/PanCancer/pruneVersion/{ => outdated}/LUSC_sparse2.R (100%) rename misc/PanCancer/pruneVersion/{ => outdated}/OV_pruned.R (100%) diff --git a/misc/PanCancer/pruneVersion/GBM_pruneTrained.R b/misc/PanCancer/pruneVersion/GBM_pruneTrained.R index e9d9eacf..bef88fc6 100644 --- a/misc/PanCancer/pruneVersion/GBM_pruneTrained.R +++ b/misc/PanCancer/pruneVersion/GBM_pruneTrained.R @@ -18,7 +18,7 @@ inDir <- sprintf("%s/input",rootDir) outRoot <- sprintf("%s/output",rootDir) dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/prune_%s",outRoot,dt) +megaDir <- sprintf("%s/pruneTrain_%s",outRoot,dt) # ---------------------------------------------------------------- # helper functions @@ -125,7 +125,6 @@ for (k in 2:length(dats)) { } - # input nets for each category netSets <- lapply(dats, function(x) rownames(x)) @@ -170,7 +169,6 @@ source("silh.R") require(cluster) setwd(curwd) - # first loop - over train/test splits for (rngNum in 1:100) { rng_t0 <- Sys.time() @@ -181,7 +179,7 @@ for (rngNum in 1:100) { dir.create(outDir) pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, - setSeed=rngNum*5) + setSeed=rngNum*5) write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", col=T,row=F,quote=F) # -------------------------------------------- @@ -195,12 +193,13 @@ for (rngNum in 1:100) { if (nrow(dats[[nm]])>10000) topVar <- 50 else topVar <- 100 pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) prune <- LMprune(dats_train[[nm]],pheno$STATUS,topVar=topVar) + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) dev.off() if (!is.na(prune)) { - if (prune$bestThresh < 1) { + if (prune$bestThresh < 0.9) { res <- prune$res res <- subset(res, adj.P.Val < prune$bestThresh) - tmp <- dats[[nm]];orig_ct <- nrow(tmp) + tmp <- dats_train[[nm]];orig_ct <- nrow(tmp) tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] dats_train[[nm]] <- tmp netSets_iter[[nm]] <- rownames(tmp) diff --git a/misc/PanCancer/pruneVersion/KIRC_pruneTrain.R b/misc/PanCancer/pruneVersion/KIRC_pruneTrain.R index 79e6c8be..d9be1b31 100644 --- a/misc/PanCancer/pruneVersion/KIRC_pruneTrain.R +++ b/misc/PanCancer/pruneVersion/KIRC_pruneTrain.R @@ -157,9 +157,6 @@ dir.create(megaDir) logFile <- sprintf("%s/log.txt",megaDir) sink(logFile,split=TRUE) tryCatch({ - -#### ----------------------------------------------------- -### BEGIN PRUNING CODE # apply pruning to proteomic data curwd <- getwd() setwd("..") @@ -170,7 +167,7 @@ require(cluster) setwd(curwd) # first loop - over train/test splits -for (rngNum in 1:25) { +for (rngNum in 1:100) { rng_t0 <- Sys.time() cat(sprintf("-------------------------------\n")) cat(sprintf("RNG seed = %i\n", rngNum)) @@ -179,31 +176,35 @@ for (rngNum in 1:25) { dir.create(outDir) pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, - setSeed=rngNum*5) + setSeed=rngNum*5) write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", col=T,row=F,quote=F) # -------------------------------------------- # feature selection - train only pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") -## pruneTrain code ------ + + ## pruneTrain code ------ dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), drop=F]) netSets_iter <- list() for (nm in setdiff(names(dats_train),"clinical")) { - print(nm) + print(nm) if (nrow(dats_train[[nm]])>10000) topVar <- 50 else topVar <- 100 pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) prune <- LMprune(dats_train[[nm]],pheno$STATUS,topVar=topVar) dev.off() + + netSets_iter[[nm]] <- rownames(tmp) if (!is.na(prune)) { - if (prune$bestThresh < 1) { + if (prune$bestThresh < 0.9) { res <- prune$res res <- subset(res, adj.P.Val < prune$bestThresh) tmp <- dats_train[[nm]];orig_ct <- nrow(tmp) tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] dats_train[[nm]] <- tmp netSets_iter[[nm]] <- rownames(tmp) - cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) + cat(sprintf("%s: Pruning with cutoff %1.2f\n", + nm,prune$bestThresh)) cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) } } else { @@ -313,10 +314,11 @@ for (rngNum in 1:25) { # query of all training samples for this class qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & - pheno_all$TT_STATUS%in%"TRAIN")] + pheno_all$TT_STATUS%in%"TRAIN")] qFile <- sprintf("%s/%s_query",pDir2,g) - GM_writeQueryFile(qSamps,incNets=pTally,nrow(pheno_all),qFile) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), pheno_all,g) @@ -325,7 +327,8 @@ for (rngNum in 1:25) { predClass <- GM_OneVAll_getClass(predRes) out <- merge(x=pheno_all,y=predClass,by="ID") outFile <- sprintf("%s/predictionResults.txt",pDir) - write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + write.table(out,file=outFile,sep="\t",col=T,row=F, + quote=F) acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", @@ -333,7 +336,7 @@ for (rngNum in 1:25) { require(ROCR) ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, - out$STATUS=="SURVIVEYES") + out$STATUS=="SURVIVEYES") save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) } @@ -342,7 +345,6 @@ for (rngNum in 1:25) { outDir,outDir,outDir)) system(sprintf("rm -r %s/dataset %s/networks", outDir,outDir)) - } pheno_all$TT_STATUS <- NA rng_t1 <- Sys.time() diff --git a/misc/PanCancer/pruneVersion/LUSC_pruneTrain.R b/misc/PanCancer/pruneVersion/LUSC_pruneTrain.R new file mode 100644 index 00000000..a3649b6a --- /dev/null +++ b/misc/PanCancer/pruneVersion/LUSC_pruneTrain.R @@ -0,0 +1,384 @@ +#' PanCancer binarized survival: LUSC: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design + +rm(list=ls()) +require(netDx) +require(netDx.examples) +source("../runLM.R") + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/pruneTrain_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions + +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), + survival=sprintf("%s/LUSC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), + prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), + mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), + cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno +rownames(clinical) <- clinical[,1]; +# ======================= +# LUSC-specific variables +clinical$stage <- as.vector(clinical$stage) +clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" +clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" +clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" +clinical$stage <- as.factor(clinical$stage) +clinical <- clinical[, -which(colnames(clinical)=="gender")] +clinical <- t(clinical[,c("age","stage")]) +clinical[1,] <- as.integer(clinical[1,]) +clinical[2,] <- as.integer(as.factor(clinical[2,])) +class(clinical) <- "numeric" +# ======================= +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + if (nm == "rna") tmp <- log(tmp+1) + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ +# apply pruning to proteomic data +curwd <- getwd() +setwd("..") +source("LMprune.R") +source("runLM.R") +source("silh.R") +require(cluster) +setwd(curwd) + + +# first loop - over train/test splits +for (rngNum in 1:100) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + + netSets_iter <- list() + cat("Pruning\n") + for (nm in setdiff(names(dats_train),"clinical")) { + print(nm) + if (nrow(dats_train[[nm]])>10000 | nm == "prot") + topVar <- 50 else topVar <- 100 + #topVar <- 50 + pdf(sprintf("%s/%s_prune.pdf",outDir,nm)) + prune <- LMprune(dats_train[[nm]], + pheno$STATUS,topVar=topVar) + dev.off() + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + if (!is.na(prune)) { + if (prune$bestThresh < 0.9) { + res <- prune$res + if (nm == "prot") prune$bestThresh <- 0.6 + res <- subset(res, adj.P.Val < prune$bestThresh) + tmp <- dats_train[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] + dats_train[[nm]] <- tmp + netSets_iter[[nm]] <- rownames(tmp) + cat(sprintf("%s: Pruning with cutoff %1.2f\n", + nm,prune$bestThresh)) + cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) + } + } else { + cat(sprintf("%s: not pruning\n",nm)) + } + } + alldat_train <- do.call("rbind",dats_train) + netSets_iter[["clinical"]] <- netSets[["clinical"]] +#### ---------------------------------------------------------- + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE,simMetric="pearson") + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores, + simMetric="pearson") + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("CombList = %s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + ## Create the mega database with all patients and all nets. + ## This will be used to predict test samples by subsetting just for feature + ## selected nets in a given round + ## Note that this is useful for all train/test splits because we can always + ## change which samples are query and can always subset based on which nets + ## are feature selected in a given round. + netDir <- sprintf("%s/test_networks",megaDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE, + simMetric="pearson") + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2, + writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + megadbDir <- GM_createDB(netDir, pheno_all$ID, + megaDir,numCores=numCores, + simMetric="pearson") + + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + # only include the nets that were feature selected + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(megadbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + } else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } +} + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/OV_pruneTrain.R b/misc/PanCancer/pruneVersion/OV_pruneTrain.R index c86babba..dca4872a 100644 --- a/misc/PanCancer/pruneVersion/OV_pruneTrain.R +++ b/misc/PanCancer/pruneVersion/OV_pruneTrain.R @@ -148,7 +148,7 @@ require(cluster) setwd(curwd) # first loop - over train/test splits -for (rngNum in 1:25) { +for (rngNum in 1:100) { rng_t0 <- Sys.time() cat(sprintf("-------------------------------\n")) cat(sprintf("RNG seed = %i\n", rngNum)) @@ -174,13 +174,14 @@ for (rngNum in 1:25) { pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) prune <- LMprune(dats_train[[nm]],pheno$STATUS,topVar=topVar) dev.off() + netSets_iter[[nm]] <- rownames(tmp) if (!is.na(prune)) { - if (prune$bestThresh < 1) { + if (prune$bestThresh < 0.9) { res <- prune$res res <- subset(res, adj.P.Val < prune$bestThresh) tmp <- dats_train[[nm]];orig_ct <- nrow(tmp) tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] - netSets_iter[[nm]] <- tmp + dats_train[[nm]] <- tmp netSets_iter[[nm]] <- rownames(tmp) cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) diff --git a/misc/PanCancer/pruneVersion/GBM_pruned.R b/misc/PanCancer/pruneVersion/outdated/GBM_pruned.R similarity index 100% rename from misc/PanCancer/pruneVersion/GBM_pruned.R rename to misc/PanCancer/pruneVersion/outdated/GBM_pruned.R diff --git a/misc/PanCancer/pruneVersion/KIRC_pruned.R b/misc/PanCancer/pruneVersion/outdated/KIRC_pruned.R similarity index 100% rename from misc/PanCancer/pruneVersion/KIRC_pruned.R rename to misc/PanCancer/pruneVersion/outdated/KIRC_pruned.R diff --git a/misc/PanCancer/pruneVersion/LUSC_MI.R b/misc/PanCancer/pruneVersion/outdated/LUSC_MI.R similarity index 100% rename from misc/PanCancer/pruneVersion/LUSC_MI.R rename to misc/PanCancer/pruneVersion/outdated/LUSC_MI.R diff --git a/misc/PanCancer/pruneVersion/LUSC_oneNetPer_LMprune.R b/misc/PanCancer/pruneVersion/outdated/LUSC_oneNetPer_LMprune.R similarity index 100% rename from misc/PanCancer/pruneVersion/LUSC_oneNetPer_LMprune.R rename to misc/PanCancer/pruneVersion/outdated/LUSC_oneNetPer_LMprune.R diff --git a/misc/PanCancer/pruneVersion/LUSC_sparse2.R b/misc/PanCancer/pruneVersion/outdated/LUSC_sparse2.R similarity index 100% rename from misc/PanCancer/pruneVersion/LUSC_sparse2.R rename to misc/PanCancer/pruneVersion/outdated/LUSC_sparse2.R diff --git a/misc/PanCancer/pruneVersion/OV_pruned.R b/misc/PanCancer/pruneVersion/outdated/OV_pruned.R similarity index 100% rename from misc/PanCancer/pruneVersion/OV_pruned.R rename to misc/PanCancer/pruneVersion/outdated/OV_pruned.R From 0e8a299ba32c5f09379abe319144f9b0b1919c23 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Fri, 20 Apr 2018 07:17:27 -0400 Subject: [PATCH 028/124] moved many scripts to outdated --- misc/PanCancer/BRCA_missing_perf_multi_run.r | 182 ---------- misc/PanCancer/LMprune.R | 11 +- misc/PanCancer/LUSC_featSel_somMut_VM3copy.R | 342 ------------------ misc/PanCancer/multiCutoff/GBM_parseMulti.R | 20 +- misc/PanCancer/multiCutoff/KIRC_parseMulti.R | 17 +- misc/PanCancer/multiCutoff/LUSC_parseMulti.R | 19 +- misc/PanCancer/multiCutoff/OV_parseMulti.R | 21 +- .../{ => outdated}/GBM_featSel_perf.R | 0 misc/PanCancer/{ => outdated}/GBM_perf.R | 0 .../{ => outdated}/GBM_somMut_writeEnrMap.R | 0 misc/PanCancer/{ => outdated}/LUSC_perf.R | 0 misc/PanCancer/{ => outdated}/OV_perf.R | 0 .../basicPredictor/LUSC_classify.R | 0 .../basicPredictor/LUSC_classify_ownTrain.R | 0 .../basicPredictor/OV_classify.R | 0 .../basicPredictor/OV_classify_ownTrain.R | 0 .../{ => outdated}/check_trainTestOverlap.R | 0 misc/PanCancer/{ => outdated}/consNetScore.R | 0 .../featSel_oneNetPer/GBM_oneNetPer.R | 0 .../featSel_oneNetPer/GBM_oneNetPer_PCA.R | 0 .../GBM_oneNetPer_PCAmultiNet.R | 0 .../featSel_oneNetPer/GBM_oneNetPer_PCAuniv.R | 0 .../GBM_oneNetPer_clinRNAalone.R | 0 .../GBM_oneNetPer_multiCutoff.R | 0 .../featSel_oneNetPer/GBM_oneNetPer_naOmit.R | 0 .../GBM_oneNetPer_normDiff.R | 0 .../featSel_oneNetPer/GBM_oneNetPer_prune.R | 0 .../KIRC_Prot_oneNetPer_PSN.R | 0 .../KIRC_RNA_oneNetPer_PSN.R | 0 .../KIRC_clin_oneNetPer_PSN.R | 0 .../featSel_oneNetPer/KIRC_collectRes.R | 0 .../featSel_oneNetPer/KIRC_oneNetPer.R | 0 .../KIRC_oneNetPer_normDiff.R | 0 .../KIRC_oneNetPer_withPathways.R | 0 .../featSel_oneNetPer/LUSC_oneNetPer.R | 0 .../LUSC_oneNetPer_LMprunePCA.R | 2 +- .../LUSC_oneNetPer_LMpruneStageSep.R | 0 .../LUSC_oneNetPer_LMprune_RBF.R | 0 .../featSel_oneNetPer/OV_clin_oneNetPer_PSN.R | 0 .../{ => outdated}/featSel_oneNetPer/README | 0 .../featSel_oneNetPer/getTarballed.sh | 0 .../getTarballed_clinNormDiff.sh | 0 .../featSel_oneNetPer/oneNetPer_collectRes.R | 0 .../oneNetPer_collectRes_normDiff.R | 0 .../featSel_oneNetPer/oneNetPer_plotPerf.R | 0 .../oneNetPer_tarballResults.sh | 0 .../featSel_oneNetPer/selectPC.R | 0 .../featSel_oneNetPer/showScoreRanking.R | 0 .../writeConsensusNets_batch.R | 0 .../writeConsensusNets_oneSet.R | 0 .../featSel_pathways/KIRC_SurvivalByVar.R | 0 .../featSel_pathways/KIRC_checkConsensus.R | 0 .../featSel_pathways/KIRC_checkRandom.R | 0 .../KIRC_checkRandom_normDiff.R | 0 .../KIRC_checkRandom_normDiff2.R | 0 .../featSel_pathways/KIRC_writeEMap.R | 0 .../KIRCpathway_locations.txt | 0 .../featSel_pathways/PRANK_compareRealShuf.R | 0 .../clinNets/KIRC_featSel_clinNets.R | 0 .../clinNets/collectRes_clinNet.R | 0 .../clinNets/getDijk_indivClinNets.R | 0 .../featSel_pathways/clinNets/getPSN.R | 0 .../featSel_pathways/clinNets/getPSN_test.R | 0 .../clinNets/getTarballed_clinNetsOnly.sh | 0 .../writeConsensusNets_clinNets_batch.R | 0 .../clinRNA_best/KIRC_writeEMap.R | 0 .../featSel_pathways/clinRNA_best/getPSN.R | 0 .../clinRNA_best/getTarballed_clinRNAbest.sh | 0 .../showScoreRanking_clinRNAbest.R | 0 .../featSel_pathways/collectRes.R | 0 .../featSel_pathways/collectRes_batch.R | 0 .../featSel_pathways/collectRes_consensus.R | 0 .../featSel_pathways/compareConsNets.R | 0 .../compareScores_realPseudo.R | 0 .../featSel_pathways/compareTestSimRanks.R | 0 .../{ => outdated}/featSel_pathways/getAUC.R | 0 .../{ => outdated}/featSel_pathways/getPSN.R | 0 .../featSel_pathways/getTarballed.sh | 0 .../getTarballed_randomRes.sh | 0 .../featSel_pathways/multiplot.R | 0 .../KIRC_featSel_pathways_oneClinNet.R | 0 .../KIRC_pathways_oneClinNet_normDiff.R | 0 .../oneClinNet/collectRes_oneClinNet.R | 0 .../oneClinNet/getTarballed_clinOneNet.R | 0 .../writeConsensusNets_oneClinNet_batch.R | 0 .../pathways/GM_OneVAll_getClass_altMethod.R | 0 .../pathways/KIRC_featSel_pathways.R | 0 .../pathways/KIRC_featSel_pathwaysConsOnly.R | 0 .../pathways/KIRC_featSel_pathwaysOnly.R | 0 .../pathways/KIRC_featSel_pathwaysOnly_80.R | 0 .../KIRC_featSel_pathwaysOnly_memtest.R | 0 .../pathways/KIRC_pathOnly_checkConsensus.R | 0 .../pathways/checkPathwayGone.R | 0 .../pathways/collectRes_consensus.R | 0 .../pathways/collectRes_pathOnly.R | 0 .../pathways/corrFeatWithSurvival.R | 0 .../pathways/featSel_pathways_noFSgenes.R | 0 .../pathways/featSel_pathways_scrambled.R | 0 .../featSel_pseudoPathways_noPathGenes.R | 0 .../pathways/featSel_realPseudo_both.R | 0 .../filterNetWt/getTarballed_path9095.sh | 0 .../pathways/getTar_allPlusPathways.sh | 0 .../pathways/getTar_featSelPseudo.sh | 0 .../pathways/getTar_pathNoFS.sh | 0 .../pathways/getTar_realPseudo.sh | 0 .../pathways/getTar_scrambled.sh | 0 .../pathways/getTar_scrambled2.sh | 0 .../pathways/getTarballed_AltClassMethod.sh | 0 .../pathways/getTarballed_AltClass_noFS.sh | 0 .../getTarballed_pathFull_AltClass.sh | 0 .../pathways/getTarballed_pathGMresults.R | 0 .../pathways/getTarballed_pathOnly.sh | 0 .../pathways/getTarballed_pathOnly80.sh | 0 .../getTarballed_pathOnly_getPRANK.sh | 0 .../pathways/getTarballed_pathOnly_strict.sh | 0 .../pathways/pathwaysFull_AltClassMethod.R | 0 .../pathways/pathwaysOnly_plotResults.R | 0 .../pathways/pathways_AltClassMethod.R | 0 .../pathways_AltClassMethod_designD_noFS.R | 0 .../pathways/pathways_changeFilterWt.R | 0 .../pathways/pathways_getPSN.R | 0 .../featSel_pathways/pathways/plotRandom.R | 0 .../randomDesigns/KIRC_pathOnly_checkRandom.R | 0 .../KIRC_pathOnly_checkRandom_notFS.R | 0 .../KIRC_pathOnly_checkRandom_shufGenes.R | 0 .../KIRC_pathOnly_checkRandom_smaller.R | 0 .../KIRC_pathOnly_checkRandom_smallest.R | 0 .../randomDesigns/KIRC_randomPath_designD.R | 0 .../KIRC_randomPath_designD_noFS.R | 0 .../KIRC_randomPath_designD_shufGenes.R | 0 .../randomDesigns/changePathwaySize.R | 0 .../pathways/randomDesigns/collectRes_psize.R | 0 .../pathways/randomDesigns/designD_getNets.R | 0 .../randomDesigns/designD_shufGetPathways.R | 0 .../getTar_netNames_pseudoPath_noFS.sh | 0 .../pathways/randomDesigns/getTar_psize.sh | 0 .../getTarballed_designD_noFS.sh | 0 .../getTarballed_designD_noFS_PRANK.sh | 0 .../getTarballed_designD_shufGenes_noFS.sh | 0 ...Tarballed_designD_shufGenes_noPathGenes.sh | 0 .../getTarballed_pathOnly_randomRes.sh | 0 .../getTarballed_pathOnly_shufGenes.sh | 0 .../getTarballed_pathOnly_smaller.sh | 0 .../getTarballed_pathOnly_strictNoFS.sh | 0 .../getTarballed_randomDesignD.R | 0 .../getTarballed_randomDesignD_shuf.R | 0 .../randomDesigns/plotRandom_shufGenes.R | 0 .../randomDesigns/plotRandom_smaller.R | 0 .../randomDesigns/plotRandom_strict.R | 0 .../randomDesigns/plotRandom_strictNoFS.R | 0 .../pseudoPath_noPathGenes_getNetScores.R | 0 .../randomPath_designD_rmFSpath.R | 0 .../randomPath_designD_rmFSpath_top90.R | 0 .../randomPath_designD_shufGenes_noFS.R | 0 .../pathways/randomDesigns/shufflePathways.R | 0 .../randomDesigns/strictNoFS_getNetNames.sh | 0 .../pathways/strictNoFS_makeEM.R | 0 .../featSel_pathways/plotPerf.R | 0 .../featSel_pathways/plotRandom.R | 0 .../featSel_pathways/rankPatients_test.R | 0 .../featSel_pathways/showScoreRanking.R | 0 .../simRank_compareRealShuf.R | 0 .../featSel_pathways/survivalPlots.R | 0 .../writeConsensusNets_batch.R | 0 .../writeConsensusNets_oneSet.R | 0 .../featSel_pathways/writeEmap.R | 0 .../GBM_featSel.R | 0 .../GBM_featSel_somMut.R | 0 .../KIRC_featSel_somMut.r | 0 .../LUSC_featSel.R | 0 .../LUSC_featSel_clinRNA.R | 0 .../LUSC_featSel_clinRPPA.R | 0 .../LUSC_featSel_somMut.R | 0 .../OV_featSel_somMut.R | 0 .../PanCancer_featSel_getAcc.R | 0 .../PanCancer_featSel_plotPerf.R | 0 .../featSel_predictor/GBM_featSel.R | 0 .../featSel_predictor/GBM_featSel_somMut.R | 0 .../featSel_predictor/KIRC_featSel_somMut.r | 0 .../featSel_predictor/LUSC_featSel.R | 0 .../featSel_predictor/LUSC_featSel_clinRNA.R | 0 .../featSel_predictor/LUSC_featSel_clinRPPA.R | 0 .../featSel_predictor/LUSC_featSel_somMut.R | 0 .../featSel_predictor/OV_featSel_somMut.R | 0 .../PanCancer_featSel_getAcc.R | 0 .../PanCancer_featSel_plotPerf.R | 0 misc/PanCancer/{ => outdated}/getFSNets.R | 0 .../{ => outdated}/getTrainTestPct.R | 0 .../{ => outdated}/sp_GBM_classifier_netdx.r | 0 misc/PanCancer/pruneVersion/LUSC_pruneTrain.R | 3 +- 190 files changed, 75 insertions(+), 542 deletions(-) delete mode 100644 misc/PanCancer/BRCA_missing_perf_multi_run.r delete mode 100644 misc/PanCancer/LUSC_featSel_somMut_VM3copy.R rename misc/PanCancer/{ => outdated}/GBM_featSel_perf.R (100%) rename misc/PanCancer/{ => outdated}/GBM_perf.R (100%) rename misc/PanCancer/{ => outdated}/GBM_somMut_writeEnrMap.R (100%) rename misc/PanCancer/{ => outdated}/LUSC_perf.R (100%) rename misc/PanCancer/{ => outdated}/OV_perf.R (100%) rename misc/PanCancer/{ => outdated}/basicPredictor/LUSC_classify.R (100%) rename misc/PanCancer/{ => outdated}/basicPredictor/LUSC_classify_ownTrain.R (100%) rename misc/PanCancer/{ => outdated}/basicPredictor/OV_classify.R (100%) rename misc/PanCancer/{ => outdated}/basicPredictor/OV_classify_ownTrain.R (100%) rename misc/PanCancer/{ => outdated}/check_trainTestOverlap.R (100%) rename misc/PanCancer/{ => outdated}/consNetScore.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/GBM_oneNetPer.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/GBM_oneNetPer_PCA.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/GBM_oneNetPer_PCAmultiNet.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/GBM_oneNetPer_PCAuniv.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/GBM_oneNetPer_clinRNAalone.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/GBM_oneNetPer_multiCutoff.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/GBM_oneNetPer_naOmit.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/GBM_oneNetPer_normDiff.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/GBM_oneNetPer_prune.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/KIRC_Prot_oneNetPer_PSN.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/KIRC_RNA_oneNetPer_PSN.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/KIRC_clin_oneNetPer_PSN.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/KIRC_collectRes.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/KIRC_oneNetPer.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/KIRC_oneNetPer_normDiff.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/KIRC_oneNetPer_withPathways.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/LUSC_oneNetPer.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/LUSC_oneNetPer_LMprunePCA.R (96%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/LUSC_oneNetPer_LMpruneStageSep.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/LUSC_oneNetPer_LMprune_RBF.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/OV_clin_oneNetPer_PSN.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/README (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/getTarballed.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/getTarballed_clinNormDiff.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/oneNetPer_collectRes.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/oneNetPer_collectRes_normDiff.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/oneNetPer_plotPerf.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/oneNetPer_tarballResults.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/selectPC.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/showScoreRanking.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/writeConsensusNets_batch.R (100%) rename misc/PanCancer/{ => outdated}/featSel_oneNetPer/writeConsensusNets_oneSet.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/KIRC_SurvivalByVar.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/KIRC_checkConsensus.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/KIRC_checkRandom.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/KIRC_checkRandom_normDiff.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/KIRC_checkRandom_normDiff2.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/KIRC_writeEMap.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/KIRCpathway_locations.txt (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/PRANK_compareRealShuf.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/clinNets/KIRC_featSel_clinNets.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/clinNets/collectRes_clinNet.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/clinNets/getDijk_indivClinNets.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/clinNets/getPSN.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/clinNets/getPSN_test.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/clinNets/getTarballed_clinNetsOnly.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/clinNets/writeConsensusNets_clinNets_batch.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/clinRNA_best/KIRC_writeEMap.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/clinRNA_best/getPSN.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/clinRNA_best/getTarballed_clinRNAbest.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/clinRNA_best/showScoreRanking_clinRNAbest.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/collectRes.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/collectRes_batch.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/collectRes_consensus.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/compareConsNets.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/compareScores_realPseudo.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/compareTestSimRanks.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/getAUC.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/getPSN.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/getTarballed.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/getTarballed_randomRes.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/multiplot.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/oneClinNet/KIRC_featSel_pathways_oneClinNet.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/oneClinNet/KIRC_pathways_oneClinNet_normDiff.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/oneClinNet/collectRes_oneClinNet.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/oneClinNet/getTarballed_clinOneNet.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/oneClinNet/writeConsensusNets_oneClinNet_batch.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/GM_OneVAll_getClass_altMethod.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/KIRC_featSel_pathways.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/KIRC_featSel_pathwaysConsOnly.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/KIRC_featSel_pathwaysOnly.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/KIRC_featSel_pathwaysOnly_80.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/KIRC_featSel_pathwaysOnly_memtest.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/KIRC_pathOnly_checkConsensus.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/checkPathwayGone.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/collectRes_consensus.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/collectRes_pathOnly.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/corrFeatWithSurvival.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/featSel_pathways_noFSgenes.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/featSel_pathways_scrambled.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/featSel_pseudoPathways_noPathGenes.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/featSel_realPseudo_both.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/filterNetWt/getTarballed_path9095.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/getTar_allPlusPathways.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/getTar_featSelPseudo.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/getTar_pathNoFS.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/getTar_realPseudo.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/getTar_scrambled.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/getTar_scrambled2.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/getTarballed_AltClassMethod.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/getTarballed_AltClass_noFS.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/getTarballed_pathFull_AltClass.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/getTarballed_pathGMresults.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/getTarballed_pathOnly.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/getTarballed_pathOnly80.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/getTarballed_pathOnly_getPRANK.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/getTarballed_pathOnly_strict.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/pathwaysFull_AltClassMethod.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/pathwaysOnly_plotResults.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/pathways_AltClassMethod.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/pathways_AltClassMethod_designD_noFS.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/pathways_changeFilterWt.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/pathways_getPSN.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/plotRandom.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/KIRC_pathOnly_checkRandom.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/KIRC_pathOnly_checkRandom_notFS.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/KIRC_pathOnly_checkRandom_shufGenes.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/KIRC_pathOnly_checkRandom_smaller.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/KIRC_pathOnly_checkRandom_smallest.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/KIRC_randomPath_designD.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/KIRC_randomPath_designD_noFS.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/KIRC_randomPath_designD_shufGenes.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/changePathwaySize.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/collectRes_psize.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/designD_getNets.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/designD_shufGetPathways.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/getTar_netNames_pseudoPath_noFS.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/getTar_psize.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/getTarballed_designD_noFS.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/getTarballed_designD_noFS_PRANK.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/getTarballed_designD_shufGenes_noFS.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/getTarballed_designD_shufGenes_noPathGenes.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/getTarballed_pathOnly_randomRes.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/getTarballed_pathOnly_shufGenes.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/getTarballed_pathOnly_smaller.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/getTarballed_pathOnly_strictNoFS.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/getTarballed_randomDesignD.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/getTarballed_randomDesignD_shuf.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/plotRandom_shufGenes.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/plotRandom_smaller.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/plotRandom_strict.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/plotRandom_strictNoFS.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/pseudoPath_noPathGenes_getNetScores.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/randomPath_designD_rmFSpath.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/randomPath_designD_rmFSpath_top90.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/randomPath_designD_shufGenes_noFS.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/shufflePathways.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/randomDesigns/strictNoFS_getNetNames.sh (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/pathways/strictNoFS_makeEM.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/plotPerf.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/plotRandom.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/rankPatients_test.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/showScoreRanking.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/simRank_compareRealShuf.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/survivalPlots.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/writeConsensusNets_batch.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/writeConsensusNets_oneSet.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways/writeEmap.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways_somaticMutations/GBM_featSel.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways_somaticMutations/GBM_featSel_somMut.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways_somaticMutations/KIRC_featSel_somMut.r (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways_somaticMutations/LUSC_featSel.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways_somaticMutations/LUSC_featSel_clinRNA.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways_somaticMutations/LUSC_featSel_clinRPPA.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways_somaticMutations/LUSC_featSel_somMut.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways_somaticMutations/OV_featSel_somMut.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways_somaticMutations/PanCancer_featSel_getAcc.R (100%) rename misc/PanCancer/{ => outdated}/featSel_pathways_somaticMutations/PanCancer_featSel_plotPerf.R (100%) rename misc/PanCancer/{ => outdated}/featSel_predictor/GBM_featSel.R (100%) rename misc/PanCancer/{ => outdated}/featSel_predictor/GBM_featSel_somMut.R (100%) rename misc/PanCancer/{ => outdated}/featSel_predictor/KIRC_featSel_somMut.r (100%) rename misc/PanCancer/{ => outdated}/featSel_predictor/LUSC_featSel.R (100%) rename misc/PanCancer/{ => outdated}/featSel_predictor/LUSC_featSel_clinRNA.R (100%) rename misc/PanCancer/{ => outdated}/featSel_predictor/LUSC_featSel_clinRPPA.R (100%) rename misc/PanCancer/{ => outdated}/featSel_predictor/LUSC_featSel_somMut.R (100%) rename misc/PanCancer/{ => outdated}/featSel_predictor/OV_featSel_somMut.R (100%) rename misc/PanCancer/{ => outdated}/featSel_predictor/PanCancer_featSel_getAcc.R (100%) rename misc/PanCancer/{ => outdated}/featSel_predictor/PanCancer_featSel_plotPerf.R (100%) rename misc/PanCancer/{ => outdated}/getFSNets.R (100%) rename misc/PanCancer/{ => outdated}/getTrainTestPct.R (100%) rename misc/PanCancer/{ => outdated}/sp_GBM_classifier_netdx.r (100%) diff --git a/misc/PanCancer/BRCA_missing_perf_multi_run.r b/misc/PanCancer/BRCA_missing_perf_multi_run.r deleted file mode 100644 index b1a8ab18..00000000 --- a/misc/PanCancer/BRCA_missing_perf_multi_run.r +++ /dev/null @@ -1,182 +0,0 @@ -# assess performance of BRCA classifier for different levels of missing -# data -rm(list=ls()) -require(netDx) -require(RColorBrewer) - -rngRange <- 1:13 # number of train/test split iterations run - -inDir <- "/mnt/data2/BaderLab/TCGA_BRCA/output/msng_170213" -inFull <- "/mnt/data2/BaderLab/TCGA_BRCA/output/xpr_170213" - -combSet <- paste("miss",c(0,10,50,70,85,90,95,99),sep="") -cols <- c(brewer.pal(n=length(combSet),name="Blues"), "darkblue") - -cat(sprintf("Got %i combs\n", length(combSet))) - -all_overall_acc <- list() -all_tot <- list() -all_f1 <- list() -all_acc <- list() -all_ppv <- list() -for (rngSeed in rngRange) { - #cat(sprintf("RNG %i\n",rngSeed)) - out <- list() - overall_acc <- numeric() - pctMiss <- numeric() - numInt <- numeric() - curRoc <- list() - - for (cur in combSet) { - - if (cur == "miss0") { - inf <- sprintf("%s/rng%i/predictionResults.txt", - inFull,rngSeed) - } else { - inf <- sprintf("%s/rng%i/%s/predictionResults.txt", - inDir,rngSeed,cur) - } - # cat(sprintf("\t%s ", cur)) - dat <- read.delim(inf,sep="\t",h=T,as.is=T) - dat <- dat[-which(dat$STATUS %in% "Normal"),] - out[[cur]] <- perfCalc_multiClass(dat$STATUS,dat$PRED_CLASS)*100 - overall_acc <- c(overall_acc, - sum(dat$STATUS==dat$PRED_CLASS)/nrow(dat)*100) - } - #cat("\n") - names(overall_acc) <- combSet - - tot <- unlist(lapply(out,function(x) sum(x[1,1:4])/100)) - f1 <- unlist(lapply(out, function(x) x[nrow(x),7])) - acc <- unlist(lapply(out, function(x) x[nrow(x),8])) - ppv <- unlist(lapply(out, function(x) x[nrow(x),5])) - - all_tot[[rngSeed]] <- tot - all_f1[[rngSeed]] <- f1 - all_acc[[rngSeed]] <- acc - all_ppv[[rngSeed]] <- ppv - all_overall_acc[[rngSeed]] <- overall_acc - -} - -overall_avg_f1 <- c() -overall_avg_tot <- c() -overall_avg_acc <- c() -overall_avg_ppv <- c() -overall_avg_overall_acc <- c() - - -overall_sd_f1 <- c() -overall_sd_tot <- c() -overall_sd_acc <- c() -overall_sd_ppv <- c() -overall_sd_overall_acc <- c() - -for (index in seq(1:length(combSet))) { - avg_f1 <- c() - avg_tot <- c() - avg_acc <- c() - avg_ppv <- c() - avg_overall_acc <- c() - for (rngSeed in rngRange) { - avg_f1 <- c(avg_f1,all_f1[[rngSeed]][[index]]) - avg_tot <- c(avg_tot,all_tot[[rngSeed]][[index]]) - avg_acc <- c(avg_acc,all_acc[[rngSeed]][[index]]) - avg_ppv <- c(avg_ppv,all_ppv[[rngSeed]][[index]]) - avg_overall_acc <- c(avg_overall_acc,all_overall_acc[[rngSeed]][[index]]) - } - current <- combSet[index] - - current_f1 <- paste(current, '.f1', sep = '') - current_tot <- current - current_acc <- paste(current, '.acc', sep = '') - current_ppv <- paste(current, '.ppv', sep = '') - current_overall_acc <- current - - overall_avg_f1[current_f1] <- mean(avg_f1) - overall_avg_tot[current_tot] <- mean(avg_tot) - overall_avg_acc[current_acc] <- mean(avg_acc) - overall_avg_ppv[current_ppv] <- mean(avg_ppv) - overall_avg_overall_acc[current_overall_acc] <- mean(avg_overall_acc) - - overall_sd_f1[current_f1] <- sd(avg_f1) - overall_sd_tot[current_tot] <- sd(avg_tot) - overall_sd_acc[current_acc] <- sd(avg_acc) - overall_sd_ppv[current_ppv] <- sd(avg_ppv) - overall_sd_overall_acc[current_overall_acc] <- sd(avg_overall_acc) -} - - -pdf("BRCA_missing.pdf",width=8,height=4) -tryCatch({ - -# mean pairwise F1 -f1_plot <- barplot(overall_avg_f1, - main=sprintf("mean F1 (N=%i)",length(rngRange)), - col=cols,ylab="F1",ylim=c(0,100),las =2, cex.names=0.75) -bottom_f1 <- overall_avg_f1 - overall_sd_f1 -top_f1 <- overall_avg_f1 + overall_sd_f1 -names(bottom_f1) <- NULL -names(top_f1) <- NULL -arrows(f1_plot, bottom_f1, f1_plot, - top_f1, lwd = 1.5, angle = 90, - code = 3, length = 0.05) - -# mean pairwise accuracy -acc_plot <- barplot(overall_avg_acc, - main=sprintf("mean accuracy (N=%i)",length(rngRange)), - col=cols,ylab="accuracy",ylim=c(0,100),las =2, cex.names=0.75) -abline(h=25,col='red',lwd=2,lty=2) -abline(h=overall_avg_acc[1],col='grey50',lwd=2) -bottom_acc <- overall_avg_acc - overall_sd_acc -top_acc <- overall_avg_acc + overall_sd_acc -names(bottom_acc) <- NULL -names(top_acc) <- NULL -arrows(acc_plot, bottom_acc, acc_plot, - top_acc, lwd = 1.5, angle = 90, - code = 3, length = 0.05) - -#ppv -ppv_plot <- barplot(overall_avg_ppv, - main=sprintf("mean PPV (N=%i)",length(rngRange)), - col=cols,ylab="PPV",ylim=c(0,100),las =2, cex.names=0.75) -abline(h=50,col='red',lwd=2,lty=2) -abline(h=overall_avg_ppv[1],col='grey50',lwd=2) -bottom_ppv<- overall_avg_ppv - overall_sd_ppv -top_ppv <- overall_avg_ppv + overall_sd_ppv -names(bottom_ppv) <- NULL -names(top_ppv) <- NULL -arrows(ppv_plot, bottom_ppv, ppv_plot, - top_ppv, lwd = 1.5, angle = 90, - code = 3, length = 0.05) - -# plot overall accuracy -overall_acc_plot <- barplot(overall_avg_overall_acc, - main=sprintf("Accuracy (N=%i)",length(rngRange)), - col=cols,ylab="accuracy",ylim=c(0,100),las =2, cex.names=0.75) -abline(h=25,col='red',lwd=2,lty=2) -abline(h=overall_avg_overall_acc[1],col='grey50',lwd=2) -bottom_oall<- overall_avg_overall_acc - overall_sd_overall_acc -top_oall <- overall_avg_overall_acc + overall_sd_overall_acc -names(bottom_oall) <- NULL -names(top_oall) <- NULL -arrows(overall_acc_plot, bottom_oall, overall_acc_plot, - top_oall, lwd = 1.5, angle = 90, - code = 3, length = 0.05) - - -barplot(overall_avg_tot,main="total classified", las = 2, cex.names=0.75) -# names(numInt) <- names(tot) -# barplot(numInt/1000,main="# interactions (x1000)") -# names(pctMiss) <- names(tot) -# barplot(pctMiss*100,main="% missing in profile",ylim=c(0,100)) - -df <- do.call("rbind",all_overall_acc) -boxplot(df,ylab="% accuracy",main=sprintf("%% Overall accuracy (N=%i)", - nrow(df)),col=cols,pars=list(boxwex=0.4),bty='n',ylim=c(0,100),las=1) -abline(h=25,col='red',lty=3,lwd=2) -},error=function(ex){ - print(ex) -},finally={ - dev.off() -}) diff --git a/misc/PanCancer/LMprune.R b/misc/PanCancer/LMprune.R index 8c869c1c..a271ce4d 100644 --- a/misc/PanCancer/LMprune.R +++ b/misc/PanCancer/LMprune.R @@ -20,6 +20,9 @@ if (min(res$adj.P.Val) > 0.9) { return(NA) } thresh_vec <- seq(min(res$adj.P.Val),0.9,0.05) +if (min(res$adj.P.Val) < .Machine$double.eps) { + thresh_vec <- c(0.001,0.005,seq(0.01,0.04,0.01),thresh_vec) +} sil_width <- matrix(NA,nrow=length(thresh_vec),ncol=3) colnames(sil_width) <- c("thresh","num_vars","avg_sil_width") @@ -31,14 +34,18 @@ ct <- nrow(m) # evaluate effect of different Q cutoffs for (thresh in thresh_vec) { if (verbose) cat(sprintf("cutoff %1.2f\n", thresh)) - if (sum(res$adj.P.Val < thresh) < 5) { + if (sum(res$adj.P.Val < thresh) < 10) { sil_width[ctr,2] <- 0 if (verbose) cat("\t < 5 values left - ignore\n") } else { res_cur <- subset(res, adj.P.Val < thresh) if (verbose) cat(sprintf("\t%i of %i measures left\n",nrow(res_cur), ct,thresh)) m_cur <- m[which(rownames(m) %in% rownames(res_cur)),] - x <- silh(groups, m_cur,plotMe=FALSE) + zv <- apply(m_cur,2,function(x) sd(x)^2) + zv <- zv < .Machine$double.eps + m_cur <- m_cur[,!zv] + cur_groups <- groups[!zv] + x <- silh(cur_groups, m_cur,plotMe=FALSE) y <- summary(x) if (verbose)cat(sprintf("\tsilh = %1.2f, %1.2f; avg = %1.2f\n", thresh, y$clus.avg.widths[1],y$clus.avg.widths[2], diff --git a/misc/PanCancer/LUSC_featSel_somMut_VM3copy.R b/misc/PanCancer/LUSC_featSel_somMut_VM3copy.R deleted file mode 100644 index 65f665a5..00000000 --- a/misc/PanCancer/LUSC_featSel_somMut_VM3copy.R +++ /dev/null @@ -1,342 +0,0 @@ -#' feature selection for LUSC from PanCancer survival dataset -#' 10-fold CV predictor design with clinical and mRNA data -rm(list=ls()) -require(netDx) -require(netDx.examples) -require(ROCR) - -numCores <- 8L -GMmemory <- 4L -trainProp <- 0.8 -cutoff <- 9 - -inDir <- "/home/spai/BaderLab/PanCancer_LUSC/input" -outRoot <-"/home/spai/BaderLab/PanCancer_LUSC/output" -#inDir <- "/Users/shraddhapai/Documents/Research/BaderLab/2017_TCGA_LUSC/input" -#outRoot <-"/Users/shraddhapai/Documents/Research/BaderLab/2017_TCGA_LUSC/output" - -dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/featSel_incMutRPPA_%s",outRoot,dt) - -# ---------------------------------------------------------------- -# helper functions - -# normalized difference -# x is vector of values, one per patient (e.g. ages) -normDiff <- function(x) { - #if (nrow(x)>=1) x <- x[1,] - nm <- colnames(x) - x <- as.numeric(x) - n <- length(x) - rngX <- max(x,na.rm=T)-min(x,na.rm=T) - - out <- matrix(NA,nrow=n,ncol=n); - # weight between i and j is - # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) - for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) - rownames(out) <- nm; colnames(out)<- nm - out -} - -# ----------------------------------------------------------- -# process input -inFiles <- list( - clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), - survival=sprintf("%s/LUSC_binary_survival.txt",inDir), - rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), - rppa=sprintf("%s/LUSC_RPPA_core.txt",inDir), - mut=sprintf("%s/from_firehose/LUSC_core_somatic_mutations.txt", - inDir) -) -pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) -colnames(pheno)[1] <- "ID" - -surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) -colnames(surv)[1:2] <- c("ID","STATUS_INT") -survStr <- rep(NA,nrow(surv)) -survStr[surv$STATUS_INT<1] <- "SURVIVENO" -survStr[surv$STATUS_INT>0] <- "SURVIVEYES" -surv$STATUS <- survStr -pheno <- merge(x=pheno,y=surv,by="ID") - -dats <- list() #input data in different slots -# clinical -cat("\t* Clinical\n") -clin <- pheno -# ###### -# This section copied from main.R of syn1895966 and adapted to the current -# code -clin$stage <- as.vector(clin$stage) -clin$stage[clin$stage=="Stage IA"| clin$stage=="Stage IB"] <- "I" -clin$stage[clin$stage=="Stage IIA"| clin$stage=="Stage IIB"| clin$stage=="Stage II"] <- "II" -clin$stage[clin$stage=="Stage IIIA"| clin$stage=="Stage IIIB"] <- "III" -clin$stage <- as.factor(clin$stage) -clin <- clin[, -which(colnames(clin)=="gender")] -# ###### -rownames(clin) <- clin[,1]; -clin <- t(clin[,c("age","stage")]) -clin[1,] <- as.integer(clin[1,]) -clin[2,] <- as.integer(as.factor(clin[2,])) -class(clin) <- "numeric" -dats$clinical <- clin; rm(clin) - -#### change this for current tumour type -clinList <- list(age="age",stage="stage") - -# Proteomics -cat("\t* RPPA\n") -rppa <- read.delim(inFiles$rppa,sep="\t",h=T,as.is=T) -rppa <- t(rppa) -colnames(rppa) <- rppa[1,]; rppa <- rppa[-1,]; -rppa <- rppa[-nrow(rppa),] -class(rppa) <- "numeric" -nm <- sub("RPPA_","",rownames(rppa)) -dpos <- regexpr("\\.",nm) -nm <- substr(nm,1,dpos-1) -rownames(rppa) <- nm - -dats$rppa <- rppa; rm(rppa) - -# RNA -cat("\t* RNA\n") -rna <- read.delim(inFiles$rna,sep="\t",h=T,as.is=T) -rna <- t(rna) -colnames(rna) <- rna[1,]; rna <- rna[-1,]; -rna <- rna[-nrow(rna),] -class(rna) <- "numeric" -rownames(rna) <- sub("mRNA_","",rownames(rna)) -dpos <- regexpr("\\.", rownames(rna)) -rownames(rna) <- substr(rownames(rna),1,dpos-1) -dats$rna <- rna; rm(rna) - -# include only data for patients in classifier -dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) -dats <- lapply(dats, function(x) { - midx <- match(pheno$ID,colnames(x)) - x <- x[,midx] - x -}) - -# somatic mutations -cat("\t* Somatic mutations\n") -mut <- read.delim(inFiles$mut,sep="\t",h=T,as.is=T) -# next steps: convert into GRanges with LOCUS_NAMES column. -# call makePSN_NamedRanges() to make pathway-level nets. -# run training. -pat_GR <- GRanges(paste("chr",mut$Chromosome,sep=""), - IRanges(mut$Start_position,mut$End_position)) -pat_GR$LOCUS_NAMES<-mut$Hugo_Symbol -pat_GR$ID <- mut$ID - -pheno_all <- pheno; -pat_GR_all <- pat_GR; - -rm(pheno,pat_GR,mut) - -# ---------------------------------------------------------- -# build classifier -numCores <- 8L -if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) -dir.create(megaDir) - -logFile <- sprintf("%s/log.txt",megaDir) -sink(logFile,split=TRUE) -tryCatch({ -for (rngNum in 24:100) { - cat(sprintf("-------------------------------\n")) - cat(sprintf("RNG seed = %i\n", rngNum)) - cat(sprintf("-------------------------------\n")) - outDir <- sprintf("%s/rng%i",megaDir,rngNum) - dir.create(outDir) - - pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, - setSeed=rngNum*5) - # -------------------------------------------- - # feature selection - train only - pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") - dats_train <- lapply(dats,function(x) { - x[,which(colnames(x) %in% pheno$ID)]}) - pat_GR_train <- pat_GR_all[which(pat_GR_all$ID %in% pheno$ID)] - - # create nets - netDir <- sprintf("%s/networks",outDir) - pathFile <- sprintf("%s/extdata/Human_160124_AllPathways.gmt", - path.package("netDx.examples")) - pathwayList <- readPathways(pathFile) - PROT_pathwayList <- pathwayList - names(PROT_pathwayList) <- paste("PROT", names(pathwayList),sep="_") - - data(genes) - gene_GR <- GRanges(genes$chrom,IRanges(genes$txStart,genes$txEnd), - name=genes$name2) - cat("* Limiting to pathway genes\n") - path_GRList <- mapNamedRangesToSets(gene_GR,pathwayList) - names(path_GRList) <- paste("MUT_",names(path_GRList),sep="") - - # RNA - group by pathway - netList <- makePSN_NamedMatrix(dats_train$rna, rownames(dats_train$rna), - pathwayList,netDir,verbose=FALSE, - numCores=numCores,writeProfiles=TRUE) - cat("Made RNA pathway nets\n") - - # PROTEIN - group by pathway - netList2 <- makePSN_NamedMatrix(dats_train$rppa, rownames(dats_train$rppa), - PROT_pathwayList,netDir,verbose=FALSE, - numCores=numCores,writeProfiles=TRUE,append=TRUE) - cat("Made protein pathway nets\n") - - # each clinical var is its own net - netList3 <- makePSN_NamedMatrix(dats_train$clinical, - rownames(dats_train$clinical), - clinList,netDir, simMetric="custom",customFunc=normDiff, - sparsify=TRUE,verbose=TRUE,numCores=numCores,append=TRUE) - cat("Made clinical nets\n") - - # add somatic mutations at pathway-level - netList4 <- makePSN_RangeSets(pat_GR_train, path_GRList, netDir, - numCores=numCores) - cat("Made somatic mutation pathway nets\n") - - netList <- unlist(c(netList,netList2,netList3,netList4)) - cat(sprintf("Total of %i nets\n", length(netList))) - - # now create database - dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) - - # run featsel once per subtype - subtypes <- unique(pheno$STATUS) - # run 10-fold cv per subtype - for (g in subtypes) { - pDir <- sprintf("%s/%s",outDir,g) - if (file.exists(pDir)) unlink(pDir,recursive=TRUE) - dir.create(pDir) - - cat(sprintf("\n******\nSubtype %s\n",g)) - pheno_subtype <- pheno - ## label patients not in the current class as a residual - pheno_subtype$STATUS[which(!pheno_subtype$STATUS %in% g)] <- "nonpred" - ## sanity check - print(table(pheno_subtype$STATUS,useNA="always")) - resDir <- sprintf("%s/GM_results",pDir) - ## query for feature selection comprises of training - ## samples from the class of interest - trainPred <- pheno_subtype$ID[which(pheno_subtype$STATUS %in% g)] - - # Cross validation - GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, - nrow(pheno_subtype),verbose=T, numCores=numCores, - GMmemory=GMmemory) - - # patient similarity ranks - prank <- dir(path=resDir,pattern="PRANK$") - # network ranks - nrank <- dir(path=resDir,pattern="NRANK$") - cat(sprintf("Got %i prank files\n",length(prank))) - - # Compute network score - pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) - head(pTally) - # write to file - tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) - write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) - } - - ## ----class-prediction, eval=TRUE------------------------- - # now create GM databases for each class - # should contain train + test patients - # and be limited to nets that pass feature selection - pheno <- pheno_all - predRes <- list() - for (g in subtypes) { - pDir <- sprintf("%s/%s",outDir,g) - # get feature selected net names - pTally <- read.delim( - sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir,g), - sep="\t",h=T,as.is=T) - pTally <- pTally[which(pTally[,2]>=cutoff),1] - pTally <- sub(".profile","",pTally) - pTally <- sub("_cont","",pTally) - cat(sprintf("%s: %i pathways\n",g,length(pTally))) - netDir <- sprintf("%s/networks",pDir) - - # prepare nets for new db - # RNA - idx <- which(names(pathwayList) %in% pTally) - if (any(idx)) { - cat(sprintf("RNA: included %i nets\n", length(idx))) - tmp <- makePSN_NamedMatrix(dats$rna, rownames(dats$rna), - pathwayList[idx],writeProfiles=TRUE, - netDir,verbose=F,numCores=numCores) - } - - # clinical - idx <- which(names(clinList) %in% pTally) - if (any(idx)) { - cat(sprintf("clinical: included %i nets\n", length(idx))) - netList2 <- makePSN_NamedMatrix(dats$clinical, rownames(dats$clinical), - clinList[idx], - netDir, simMetric="custom",customFunc=normDiff, - sparsify=TRUE,verbose=TRUE,numCores=numCores,append=TRUE) - } - - # add somatic mutations at pathway-level - idx <- which(names(path_GRList) %in% pTally) - if (any(idx)) { - cat(sprintf("mutations: included %i nets\n", length(idx))) - netList3 <- makePSN_RangeSets(pat_GR_all, - path_GRList[idx], - netDir,numCores=numCores) - } - - # proteomics group by pathway - idx <- which(names(PROT_pathwayList) %in% pTally) - if (any(idx)) { - cat(sprintf("proteomics: included %i nets\n", length(idx))) - netList4 <- makePSN_NamedMatrix(dats$rppa, - rownames(dats$rppa), - PROT_pathwayList[idx],netDir,verbose=FALSE, - numCores=numCores,writeProfiles=TRUE,append=TRUE) - } - - # create db - dbDir <- GM_createDB(netDir,pheno$ID,pDir,numCores=numCores) - # query of all training samples for this class - qSamps <- pheno$ID[which(pheno$STATUS %in% g & - pheno$TT_STATUS%in%"TRAIN")] - - qFile <- sprintf("%s/%s_query",pDir,g) - GM_writeQueryFile(qSamps,"all",nrow(pheno),qFile) - resFile <- runGeneMANIA(dbDir$dbDir,qFile,resDir=pDir) - predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile),pheno,g) - } - - predClass <- GM_OneVAll_getClass(predRes) - out <- merge(x=pheno_all,y=predClass,by="ID") - outFile <- sprintf("%s/predictionResults.txt",outDir) - write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) - - acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) - cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", - nrow(out), acc*100)) - - ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, - out$STATUS=="SURVIVEYES") - save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",outDir)) - - # cleanup - system(sprintf("rm -r %s/dataset %s/tmp %s/networks", - outDir,outDir,outDir)) - system(sprintf("rm -r %s/SURVIVENO/dataset %s/SURVIVENO/networks", - outDir,outDir)) - system(sprintf("rm -r %s/SURVIVEYES/dataset %s/SURVIVEYES/networks", - outDir,outDir)) - system(sprintf("rm -r %s/SURVIVEYES/tmp %s/SURVIVENO/tmp", - outDir,outDir)) - -}}, error=function(ex){ - print(ex) -}, finally={ - sink(NULL) -}) - - diff --git a/misc/PanCancer/multiCutoff/GBM_parseMulti.R b/misc/PanCancer/multiCutoff/GBM_parseMulti.R index 8c59fb70..78b32fb5 100644 --- a/misc/PanCancer/multiCutoff/GBM_parseMulti.R +++ b/misc/PanCancer/multiCutoff/GBM_parseMulti.R @@ -5,29 +5,30 @@ require(reshape2) #dataDir_each <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pruneClinRNA_alone_180125" -dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/prune_180204" +dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pruneTrain_180419" #dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/PCA1net_180126" #dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/PCAmultinet_180126" settypes <- c("clinical","mir","rna","cnv","dnam", "clinicalArna","clinicalAmir","clinicalAdnam","clinicalAcnv","all") -outmat <- matrix(NA,nrow=length(settypes),ncol=9) -meas <- paste(rep(7:9,each=3),c("auroc","aupr","accuracy"),sep="_") +outmat <- matrix(NA,nrow=length(settypes),ncol=3) +meas <- paste(rep(9,each=3),c("auroc","aupr","accuracy"),sep="_") rownames(outmat)<- settypes colnames(outmat) <- meas ctr <- 1 outD <- sprintf("GBM_%s",basename(dataDir)) if (!file.exists(outD)) dir.create(outD) +auc_set <- list() for (settype in settypes) { ### if (settype %in% "clinicalArna") ### dataDir <- dataDir_both ### else ### dataDir <- dataDir_each - rngDir <- paste(sprintf("%s/rng",dataDir), 1:100,sep="") + rngDir <- paste(sprintf("%s/rng",dataDir), 1:43,sep="") colctr <- 1 -for (cutoff in 7:9) { +for (cutoff in 9) { c7 <- sprintf("%s/%s/cutoff%i/predictionResults.txt", rngDir,settype,cutoff) torm <- c() @@ -47,6 +48,7 @@ for (cutoff in 7:9) { y2 <- unlist(lapply(x,function(i) i$aupr)) y3 <- unlist(lapply(x,function(i) i$accuracy)) outmat[ctr,colctr+(0:2)] <- c(mean(y1),mean(y2),mean(y3)) + auc_set[[settype]] <- y1 colctr <- colctr+3 } @@ -54,6 +56,14 @@ ctr <- ctr+1 } print(round(outmat,digits=2)) +auc_set <- auc_set[-which(names(auc_set) %in% c("mir","cnv","clinicalAmir","clinicalAcnv"))] + +pdf("gbm_auc.pdf",width=13,height=5); + boxplot(auc_set,cex.axis=0.6,pars=list(boxwex=0.3)); + abline(h=median(auc_set[["clinical"]])); + barplot(unlist(lapply(auc_set,mean)),las=1,cex.axis=1.3,font.axis=2, + ylim=c(0.5,1),main="GBM") +dev.off() write.table(round(outmat,digits=2),file=sprintf("%s/perf.txt",outD),sep="\t", col=T,row=T,quote=F) diff --git a/misc/PanCancer/multiCutoff/KIRC_parseMulti.R b/misc/PanCancer/multiCutoff/KIRC_parseMulti.R index e344348f..429f434b 100644 --- a/misc/PanCancer/multiCutoff/KIRC_parseMulti.R +++ b/misc/PanCancer/multiCutoff/KIRC_parseMulti.R @@ -3,7 +3,7 @@ rm(list=ls()) require(netDx) require(reshape2) -dataDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output/pruned_180204" +dataDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output/pruneTrain_180419" settypes <- c("clinical","mir","rna","prot","cnv","dnam", "clinicalArna","clinicalAmir","clinicalAprot","clinicalAdnam", @@ -16,12 +16,13 @@ ctr <- 1 outD <- sprintf("KIRC_%s",basename(dataDir)) if (!file.exists(outD)) dir.create(outD) +auc_set <- list() for (settype in settypes) { ### if (settype %in% "clinicalArna") ### dataDir <- dataDir_both ### else ### dataDir <- dataDir_each - rngDir <- paste(sprintf("%s/rng",dataDir), 1:100,sep="") + rngDir <- paste(sprintf("%s/rng",dataDir), 1:50,sep="") colctr <- 1 for (cutoff in 9) { @@ -44,6 +45,7 @@ for (cutoff in 9) { y2 <- unlist(lapply(x,function(i) i$aupr)) y3 <- unlist(lapply(x,function(i) i$accuracy)) outmat[ctr,colctr+(0:2)] <- c(mean(y1),mean(y2),mean(y3)) + auc_set[[settype]] <- y1 colctr <- colctr+3 } @@ -51,6 +53,17 @@ ctr <- ctr+1 } print(round(outmat,digits=2)) +meds <- unlist(lapply(auc_set,median)) +mu <- unlist(lapply(auc_set,mean)) +err <- unlist(lapply(auc_set,sd)) + +auc_set <- auc_set[order(meds)] + +pdf("kirc_auc.pdf",width=13,height=5); + boxplot(auc_set,cex.axis=0.6); + abline(h=median(auc_set[["clinical"]]));dev.off() + + write.table(round(outmat,digits=2),file=sprintf("%s/perf.txt",outD),sep="\t", col=T,row=T,quote=F) diff --git a/misc/PanCancer/multiCutoff/LUSC_parseMulti.R b/misc/PanCancer/multiCutoff/LUSC_parseMulti.R index f78ba41e..766c8f55 100644 --- a/misc/PanCancer/multiCutoff/LUSC_parseMulti.R +++ b/misc/PanCancer/multiCutoff/LUSC_parseMulti.R @@ -3,10 +3,10 @@ rm(list=ls()) require(netDx) require(reshape2) -dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/prunedMI_180212" +dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/prunedPearson_180212" -settypes <- c("prot","clinicalAprot") #"mir","rna","prot","cnv", - #"clinicalArna","clinicalAmir","clinicalAprot","clinicalAcnv","all") +settypes <- c("clinical","mir","rna","prot","cnv", + "clinicalArna","clinicalAmir","clinicalAprot","clinicalAcnv","all") outmat <- matrix(NA,nrow=length(settypes),ncol=9) meas <- paste(rep(7:9,each=3),c("auroc","aupr","accuracy"),sep="_") rownames(outmat)<- settypes @@ -15,15 +15,16 @@ ctr <- 1 outD <- sprintf("LUSC_%s",basename(dataDir)) if (!file.exists(outD)) dir.create(outD) +auc_set <- list() for (settype in settypes) { ### if (settype %in% "clinicalArna") ### dataDir <- dataDir_both ### else ### dataDir <- dataDir_each - rngDir <- paste(sprintf("%s/rng",dataDir), 1:10,sep="") + rngDir <- paste(sprintf("%s/rng",dataDir), 1:93,sep="") colctr <- 1 -for (cutoff in 7:9) { +for (cutoff in 9) { c7 <- sprintf("%s/%s/cutoff%i/predictionResults.txt", rngDir,settype,cutoff) torm <- c() @@ -43,6 +44,7 @@ for (cutoff in 7:9) { y2 <- unlist(lapply(x,function(i) i$aupr)) y3 <- unlist(lapply(x,function(i) i$accuracy)) outmat[ctr,colctr+(0:2)] <- c(mean(y1),mean(y2),mean(y3)) + auc_set[[settype]] <- y1 colctr <- colctr+3 } @@ -50,6 +52,13 @@ ctr <- ctr+1 } print(round(outmat,digits=2)) +pdf("lusc_auc.pdf",width=13,height=5); + boxplot(auc_set,cex.axis=0.6,pars=list(boxwex=0.3)); + abline(h=median(auc_set[["clinical"]])); + barplot(unlist(lapply(auc_set,mean)),las=1,cex.axis=1.3,font.axis=2, + ylim=c(0.5,1),main="LUSC") +dev.off() + write.table(round(outmat,digits=2),file=sprintf("%s/perf.txt",outD),sep="\t", col=T,row=T,quote=F) diff --git a/misc/PanCancer/multiCutoff/OV_parseMulti.R b/misc/PanCancer/multiCutoff/OV_parseMulti.R index 0deadc1f..eeb15de2 100644 --- a/misc/PanCancer/multiCutoff/OV_parseMulti.R +++ b/misc/PanCancer/multiCutoff/OV_parseMulti.R @@ -16,12 +16,13 @@ ctr <- 1 outD <- sprintf("OV_%s",basename(dataDir)) if (!file.exists(outD)) dir.create(outD) +auc_set <- list() for (settype in settypes) { ### if (settype %in% "clinicalArna") ### dataDir <- dataDir_both ### else ### dataDir <- dataDir_each - rngDir <- paste(sprintf("%s/rng",dataDir), 1:100,sep="") + rngDir <- paste(sprintf("%s/rng",dataDir), 1:14,sep="") colctr <- 1 for (cutoff in 9) { @@ -44,12 +45,30 @@ for (cutoff in 9) { y2 <- unlist(lapply(x,function(i) i$aupr)) y3 <- unlist(lapply(x,function(i) i$accuracy)) outmat[ctr,colctr+(0:2)] <- c(mean(y1),mean(y2),mean(y3)) + auc_set[[settype]] <- y1 colctr <- colctr+3 } ctr <- ctr+1 } + +#auc_set <- auc_set[which(names(auc_set)%in% c("clinical","rna","clinicalArna","all"))] + print(round(outmat,digits=2)) +pdf("ov_auc.pdf",width=16,height=5); + boxplot(auc_set,cex.axis=0.6,pars=list(boxwex=0.3)); + abline(h=median(auc_set[["clinical"]])); + +mu <- unlist(lapply(auc_set,mean)) +err <- unlist(lapply(auc_set,sd)) +xpos <- barplot(mu,las=1,cex=1.3,cex.names=0.8, + ylim=c(0,1),main="OV") +segments(x0=xpos,y0=mu-err,y1=mu+err) + +wmw <- wilcox.test(auc_set[["all"]],auc_set[["clinicalArna"]],alternative="greater") +cat(sprintf("All > clin+rna: p < %1.2e\n", wmw$p.value)) + +dev.off() write.table(round(outmat,digits=2),file=sprintf("%s/perf.txt",outD),sep="\t", col=T,row=T,quote=F) diff --git a/misc/PanCancer/GBM_featSel_perf.R b/misc/PanCancer/outdated/GBM_featSel_perf.R similarity index 100% rename from misc/PanCancer/GBM_featSel_perf.R rename to misc/PanCancer/outdated/GBM_featSel_perf.R diff --git a/misc/PanCancer/GBM_perf.R b/misc/PanCancer/outdated/GBM_perf.R similarity index 100% rename from misc/PanCancer/GBM_perf.R rename to misc/PanCancer/outdated/GBM_perf.R diff --git a/misc/PanCancer/GBM_somMut_writeEnrMap.R b/misc/PanCancer/outdated/GBM_somMut_writeEnrMap.R similarity index 100% rename from misc/PanCancer/GBM_somMut_writeEnrMap.R rename to misc/PanCancer/outdated/GBM_somMut_writeEnrMap.R diff --git a/misc/PanCancer/LUSC_perf.R b/misc/PanCancer/outdated/LUSC_perf.R similarity index 100% rename from misc/PanCancer/LUSC_perf.R rename to misc/PanCancer/outdated/LUSC_perf.R diff --git a/misc/PanCancer/OV_perf.R b/misc/PanCancer/outdated/OV_perf.R similarity index 100% rename from misc/PanCancer/OV_perf.R rename to misc/PanCancer/outdated/OV_perf.R diff --git a/misc/PanCancer/basicPredictor/LUSC_classify.R b/misc/PanCancer/outdated/basicPredictor/LUSC_classify.R similarity index 100% rename from misc/PanCancer/basicPredictor/LUSC_classify.R rename to misc/PanCancer/outdated/basicPredictor/LUSC_classify.R diff --git a/misc/PanCancer/basicPredictor/LUSC_classify_ownTrain.R b/misc/PanCancer/outdated/basicPredictor/LUSC_classify_ownTrain.R similarity index 100% rename from misc/PanCancer/basicPredictor/LUSC_classify_ownTrain.R rename to misc/PanCancer/outdated/basicPredictor/LUSC_classify_ownTrain.R diff --git a/misc/PanCancer/basicPredictor/OV_classify.R b/misc/PanCancer/outdated/basicPredictor/OV_classify.R similarity index 100% rename from misc/PanCancer/basicPredictor/OV_classify.R rename to misc/PanCancer/outdated/basicPredictor/OV_classify.R diff --git a/misc/PanCancer/basicPredictor/OV_classify_ownTrain.R b/misc/PanCancer/outdated/basicPredictor/OV_classify_ownTrain.R similarity index 100% rename from misc/PanCancer/basicPredictor/OV_classify_ownTrain.R rename to misc/PanCancer/outdated/basicPredictor/OV_classify_ownTrain.R diff --git a/misc/PanCancer/check_trainTestOverlap.R b/misc/PanCancer/outdated/check_trainTestOverlap.R similarity index 100% rename from misc/PanCancer/check_trainTestOverlap.R rename to misc/PanCancer/outdated/check_trainTestOverlap.R diff --git a/misc/PanCancer/consNetScore.R b/misc/PanCancer/outdated/consNetScore.R similarity index 100% rename from misc/PanCancer/consNetScore.R rename to misc/PanCancer/outdated/consNetScore.R diff --git a/misc/PanCancer/featSel_oneNetPer/GBM_oneNetPer.R b/misc/PanCancer/outdated/featSel_oneNetPer/GBM_oneNetPer.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/GBM_oneNetPer.R rename to misc/PanCancer/outdated/featSel_oneNetPer/GBM_oneNetPer.R diff --git a/misc/PanCancer/featSel_oneNetPer/GBM_oneNetPer_PCA.R b/misc/PanCancer/outdated/featSel_oneNetPer/GBM_oneNetPer_PCA.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/GBM_oneNetPer_PCA.R rename to misc/PanCancer/outdated/featSel_oneNetPer/GBM_oneNetPer_PCA.R diff --git a/misc/PanCancer/featSel_oneNetPer/GBM_oneNetPer_PCAmultiNet.R b/misc/PanCancer/outdated/featSel_oneNetPer/GBM_oneNetPer_PCAmultiNet.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/GBM_oneNetPer_PCAmultiNet.R rename to misc/PanCancer/outdated/featSel_oneNetPer/GBM_oneNetPer_PCAmultiNet.R diff --git a/misc/PanCancer/featSel_oneNetPer/GBM_oneNetPer_PCAuniv.R b/misc/PanCancer/outdated/featSel_oneNetPer/GBM_oneNetPer_PCAuniv.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/GBM_oneNetPer_PCAuniv.R rename to misc/PanCancer/outdated/featSel_oneNetPer/GBM_oneNetPer_PCAuniv.R diff --git a/misc/PanCancer/featSel_oneNetPer/GBM_oneNetPer_clinRNAalone.R b/misc/PanCancer/outdated/featSel_oneNetPer/GBM_oneNetPer_clinRNAalone.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/GBM_oneNetPer_clinRNAalone.R rename to misc/PanCancer/outdated/featSel_oneNetPer/GBM_oneNetPer_clinRNAalone.R diff --git a/misc/PanCancer/featSel_oneNetPer/GBM_oneNetPer_multiCutoff.R b/misc/PanCancer/outdated/featSel_oneNetPer/GBM_oneNetPer_multiCutoff.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/GBM_oneNetPer_multiCutoff.R rename to misc/PanCancer/outdated/featSel_oneNetPer/GBM_oneNetPer_multiCutoff.R diff --git a/misc/PanCancer/featSel_oneNetPer/GBM_oneNetPer_naOmit.R b/misc/PanCancer/outdated/featSel_oneNetPer/GBM_oneNetPer_naOmit.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/GBM_oneNetPer_naOmit.R rename to misc/PanCancer/outdated/featSel_oneNetPer/GBM_oneNetPer_naOmit.R diff --git a/misc/PanCancer/featSel_oneNetPer/GBM_oneNetPer_normDiff.R b/misc/PanCancer/outdated/featSel_oneNetPer/GBM_oneNetPer_normDiff.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/GBM_oneNetPer_normDiff.R rename to misc/PanCancer/outdated/featSel_oneNetPer/GBM_oneNetPer_normDiff.R diff --git a/misc/PanCancer/featSel_oneNetPer/GBM_oneNetPer_prune.R b/misc/PanCancer/outdated/featSel_oneNetPer/GBM_oneNetPer_prune.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/GBM_oneNetPer_prune.R rename to misc/PanCancer/outdated/featSel_oneNetPer/GBM_oneNetPer_prune.R diff --git a/misc/PanCancer/featSel_oneNetPer/KIRC_Prot_oneNetPer_PSN.R b/misc/PanCancer/outdated/featSel_oneNetPer/KIRC_Prot_oneNetPer_PSN.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/KIRC_Prot_oneNetPer_PSN.R rename to misc/PanCancer/outdated/featSel_oneNetPer/KIRC_Prot_oneNetPer_PSN.R diff --git a/misc/PanCancer/featSel_oneNetPer/KIRC_RNA_oneNetPer_PSN.R b/misc/PanCancer/outdated/featSel_oneNetPer/KIRC_RNA_oneNetPer_PSN.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/KIRC_RNA_oneNetPer_PSN.R rename to misc/PanCancer/outdated/featSel_oneNetPer/KIRC_RNA_oneNetPer_PSN.R diff --git a/misc/PanCancer/featSel_oneNetPer/KIRC_clin_oneNetPer_PSN.R b/misc/PanCancer/outdated/featSel_oneNetPer/KIRC_clin_oneNetPer_PSN.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/KIRC_clin_oneNetPer_PSN.R rename to misc/PanCancer/outdated/featSel_oneNetPer/KIRC_clin_oneNetPer_PSN.R diff --git a/misc/PanCancer/featSel_oneNetPer/KIRC_collectRes.R b/misc/PanCancer/outdated/featSel_oneNetPer/KIRC_collectRes.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/KIRC_collectRes.R rename to misc/PanCancer/outdated/featSel_oneNetPer/KIRC_collectRes.R diff --git a/misc/PanCancer/featSel_oneNetPer/KIRC_oneNetPer.R b/misc/PanCancer/outdated/featSel_oneNetPer/KIRC_oneNetPer.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/KIRC_oneNetPer.R rename to misc/PanCancer/outdated/featSel_oneNetPer/KIRC_oneNetPer.R diff --git a/misc/PanCancer/featSel_oneNetPer/KIRC_oneNetPer_normDiff.R b/misc/PanCancer/outdated/featSel_oneNetPer/KIRC_oneNetPer_normDiff.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/KIRC_oneNetPer_normDiff.R rename to misc/PanCancer/outdated/featSel_oneNetPer/KIRC_oneNetPer_normDiff.R diff --git a/misc/PanCancer/featSel_oneNetPer/KIRC_oneNetPer_withPathways.R b/misc/PanCancer/outdated/featSel_oneNetPer/KIRC_oneNetPer_withPathways.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/KIRC_oneNetPer_withPathways.R rename to misc/PanCancer/outdated/featSel_oneNetPer/KIRC_oneNetPer_withPathways.R diff --git a/misc/PanCancer/featSel_oneNetPer/LUSC_oneNetPer.R b/misc/PanCancer/outdated/featSel_oneNetPer/LUSC_oneNetPer.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/LUSC_oneNetPer.R rename to misc/PanCancer/outdated/featSel_oneNetPer/LUSC_oneNetPer.R diff --git a/misc/PanCancer/featSel_oneNetPer/LUSC_oneNetPer_LMprunePCA.R b/misc/PanCancer/outdated/featSel_oneNetPer/LUSC_oneNetPer_LMprunePCA.R similarity index 96% rename from misc/PanCancer/featSel_oneNetPer/LUSC_oneNetPer_LMprunePCA.R rename to misc/PanCancer/outdated/featSel_oneNetPer/LUSC_oneNetPer_LMprunePCA.R index 64ee4421..f9d5753a 100644 --- a/misc/PanCancer/featSel_oneNetPer/LUSC_oneNetPer_LMprunePCA.R +++ b/misc/PanCancer/outdated/featSel_oneNetPer/LUSC_oneNetPer_LMprunePCA.R @@ -21,7 +21,7 @@ inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/prunePCA_%s",outRoot,dt) +megaDir <- sprintf("%s/pruneTest_%s",outRoot,dt) # ---------------------------------------------------------------- # helper functions diff --git a/misc/PanCancer/featSel_oneNetPer/LUSC_oneNetPer_LMpruneStageSep.R b/misc/PanCancer/outdated/featSel_oneNetPer/LUSC_oneNetPer_LMpruneStageSep.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/LUSC_oneNetPer_LMpruneStageSep.R rename to misc/PanCancer/outdated/featSel_oneNetPer/LUSC_oneNetPer_LMpruneStageSep.R diff --git a/misc/PanCancer/featSel_oneNetPer/LUSC_oneNetPer_LMprune_RBF.R b/misc/PanCancer/outdated/featSel_oneNetPer/LUSC_oneNetPer_LMprune_RBF.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/LUSC_oneNetPer_LMprune_RBF.R rename to misc/PanCancer/outdated/featSel_oneNetPer/LUSC_oneNetPer_LMprune_RBF.R diff --git a/misc/PanCancer/featSel_oneNetPer/OV_clin_oneNetPer_PSN.R b/misc/PanCancer/outdated/featSel_oneNetPer/OV_clin_oneNetPer_PSN.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/OV_clin_oneNetPer_PSN.R rename to misc/PanCancer/outdated/featSel_oneNetPer/OV_clin_oneNetPer_PSN.R diff --git a/misc/PanCancer/featSel_oneNetPer/README b/misc/PanCancer/outdated/featSel_oneNetPer/README similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/README rename to misc/PanCancer/outdated/featSel_oneNetPer/README diff --git a/misc/PanCancer/featSel_oneNetPer/getTarballed.sh b/misc/PanCancer/outdated/featSel_oneNetPer/getTarballed.sh similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/getTarballed.sh rename to misc/PanCancer/outdated/featSel_oneNetPer/getTarballed.sh diff --git a/misc/PanCancer/featSel_oneNetPer/getTarballed_clinNormDiff.sh b/misc/PanCancer/outdated/featSel_oneNetPer/getTarballed_clinNormDiff.sh similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/getTarballed_clinNormDiff.sh rename to misc/PanCancer/outdated/featSel_oneNetPer/getTarballed_clinNormDiff.sh diff --git a/misc/PanCancer/featSel_oneNetPer/oneNetPer_collectRes.R b/misc/PanCancer/outdated/featSel_oneNetPer/oneNetPer_collectRes.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/oneNetPer_collectRes.R rename to misc/PanCancer/outdated/featSel_oneNetPer/oneNetPer_collectRes.R diff --git a/misc/PanCancer/featSel_oneNetPer/oneNetPer_collectRes_normDiff.R b/misc/PanCancer/outdated/featSel_oneNetPer/oneNetPer_collectRes_normDiff.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/oneNetPer_collectRes_normDiff.R rename to misc/PanCancer/outdated/featSel_oneNetPer/oneNetPer_collectRes_normDiff.R diff --git a/misc/PanCancer/featSel_oneNetPer/oneNetPer_plotPerf.R b/misc/PanCancer/outdated/featSel_oneNetPer/oneNetPer_plotPerf.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/oneNetPer_plotPerf.R rename to misc/PanCancer/outdated/featSel_oneNetPer/oneNetPer_plotPerf.R diff --git a/misc/PanCancer/featSel_oneNetPer/oneNetPer_tarballResults.sh b/misc/PanCancer/outdated/featSel_oneNetPer/oneNetPer_tarballResults.sh similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/oneNetPer_tarballResults.sh rename to misc/PanCancer/outdated/featSel_oneNetPer/oneNetPer_tarballResults.sh diff --git a/misc/PanCancer/featSel_oneNetPer/selectPC.R b/misc/PanCancer/outdated/featSel_oneNetPer/selectPC.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/selectPC.R rename to misc/PanCancer/outdated/featSel_oneNetPer/selectPC.R diff --git a/misc/PanCancer/featSel_oneNetPer/showScoreRanking.R b/misc/PanCancer/outdated/featSel_oneNetPer/showScoreRanking.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/showScoreRanking.R rename to misc/PanCancer/outdated/featSel_oneNetPer/showScoreRanking.R diff --git a/misc/PanCancer/featSel_oneNetPer/writeConsensusNets_batch.R b/misc/PanCancer/outdated/featSel_oneNetPer/writeConsensusNets_batch.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/writeConsensusNets_batch.R rename to misc/PanCancer/outdated/featSel_oneNetPer/writeConsensusNets_batch.R diff --git a/misc/PanCancer/featSel_oneNetPer/writeConsensusNets_oneSet.R b/misc/PanCancer/outdated/featSel_oneNetPer/writeConsensusNets_oneSet.R similarity index 100% rename from misc/PanCancer/featSel_oneNetPer/writeConsensusNets_oneSet.R rename to misc/PanCancer/outdated/featSel_oneNetPer/writeConsensusNets_oneSet.R diff --git a/misc/PanCancer/featSel_pathways/KIRC_SurvivalByVar.R b/misc/PanCancer/outdated/featSel_pathways/KIRC_SurvivalByVar.R similarity index 100% rename from misc/PanCancer/featSel_pathways/KIRC_SurvivalByVar.R rename to misc/PanCancer/outdated/featSel_pathways/KIRC_SurvivalByVar.R diff --git a/misc/PanCancer/featSel_pathways/KIRC_checkConsensus.R b/misc/PanCancer/outdated/featSel_pathways/KIRC_checkConsensus.R similarity index 100% rename from misc/PanCancer/featSel_pathways/KIRC_checkConsensus.R rename to misc/PanCancer/outdated/featSel_pathways/KIRC_checkConsensus.R diff --git a/misc/PanCancer/featSel_pathways/KIRC_checkRandom.R b/misc/PanCancer/outdated/featSel_pathways/KIRC_checkRandom.R similarity index 100% rename from misc/PanCancer/featSel_pathways/KIRC_checkRandom.R rename to misc/PanCancer/outdated/featSel_pathways/KIRC_checkRandom.R diff --git a/misc/PanCancer/featSel_pathways/KIRC_checkRandom_normDiff.R b/misc/PanCancer/outdated/featSel_pathways/KIRC_checkRandom_normDiff.R similarity index 100% rename from misc/PanCancer/featSel_pathways/KIRC_checkRandom_normDiff.R rename to misc/PanCancer/outdated/featSel_pathways/KIRC_checkRandom_normDiff.R diff --git a/misc/PanCancer/featSel_pathways/KIRC_checkRandom_normDiff2.R b/misc/PanCancer/outdated/featSel_pathways/KIRC_checkRandom_normDiff2.R similarity index 100% rename from misc/PanCancer/featSel_pathways/KIRC_checkRandom_normDiff2.R rename to misc/PanCancer/outdated/featSel_pathways/KIRC_checkRandom_normDiff2.R diff --git a/misc/PanCancer/featSel_pathways/KIRC_writeEMap.R b/misc/PanCancer/outdated/featSel_pathways/KIRC_writeEMap.R similarity index 100% rename from misc/PanCancer/featSel_pathways/KIRC_writeEMap.R rename to misc/PanCancer/outdated/featSel_pathways/KIRC_writeEMap.R diff --git a/misc/PanCancer/featSel_pathways/KIRCpathway_locations.txt b/misc/PanCancer/outdated/featSel_pathways/KIRCpathway_locations.txt similarity index 100% rename from misc/PanCancer/featSel_pathways/KIRCpathway_locations.txt rename to misc/PanCancer/outdated/featSel_pathways/KIRCpathway_locations.txt diff --git a/misc/PanCancer/featSel_pathways/PRANK_compareRealShuf.R b/misc/PanCancer/outdated/featSel_pathways/PRANK_compareRealShuf.R similarity index 100% rename from misc/PanCancer/featSel_pathways/PRANK_compareRealShuf.R rename to misc/PanCancer/outdated/featSel_pathways/PRANK_compareRealShuf.R diff --git a/misc/PanCancer/featSel_pathways/clinNets/KIRC_featSel_clinNets.R b/misc/PanCancer/outdated/featSel_pathways/clinNets/KIRC_featSel_clinNets.R similarity index 100% rename from misc/PanCancer/featSel_pathways/clinNets/KIRC_featSel_clinNets.R rename to misc/PanCancer/outdated/featSel_pathways/clinNets/KIRC_featSel_clinNets.R diff --git a/misc/PanCancer/featSel_pathways/clinNets/collectRes_clinNet.R b/misc/PanCancer/outdated/featSel_pathways/clinNets/collectRes_clinNet.R similarity index 100% rename from misc/PanCancer/featSel_pathways/clinNets/collectRes_clinNet.R rename to misc/PanCancer/outdated/featSel_pathways/clinNets/collectRes_clinNet.R diff --git a/misc/PanCancer/featSel_pathways/clinNets/getDijk_indivClinNets.R b/misc/PanCancer/outdated/featSel_pathways/clinNets/getDijk_indivClinNets.R similarity index 100% rename from misc/PanCancer/featSel_pathways/clinNets/getDijk_indivClinNets.R rename to misc/PanCancer/outdated/featSel_pathways/clinNets/getDijk_indivClinNets.R diff --git a/misc/PanCancer/featSel_pathways/clinNets/getPSN.R b/misc/PanCancer/outdated/featSel_pathways/clinNets/getPSN.R similarity index 100% rename from misc/PanCancer/featSel_pathways/clinNets/getPSN.R rename to misc/PanCancer/outdated/featSel_pathways/clinNets/getPSN.R diff --git a/misc/PanCancer/featSel_pathways/clinNets/getPSN_test.R b/misc/PanCancer/outdated/featSel_pathways/clinNets/getPSN_test.R similarity index 100% rename from misc/PanCancer/featSel_pathways/clinNets/getPSN_test.R rename to misc/PanCancer/outdated/featSel_pathways/clinNets/getPSN_test.R diff --git a/misc/PanCancer/featSel_pathways/clinNets/getTarballed_clinNetsOnly.sh b/misc/PanCancer/outdated/featSel_pathways/clinNets/getTarballed_clinNetsOnly.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/clinNets/getTarballed_clinNetsOnly.sh rename to misc/PanCancer/outdated/featSel_pathways/clinNets/getTarballed_clinNetsOnly.sh diff --git a/misc/PanCancer/featSel_pathways/clinNets/writeConsensusNets_clinNets_batch.R b/misc/PanCancer/outdated/featSel_pathways/clinNets/writeConsensusNets_clinNets_batch.R similarity index 100% rename from misc/PanCancer/featSel_pathways/clinNets/writeConsensusNets_clinNets_batch.R rename to misc/PanCancer/outdated/featSel_pathways/clinNets/writeConsensusNets_clinNets_batch.R diff --git a/misc/PanCancer/featSel_pathways/clinRNA_best/KIRC_writeEMap.R b/misc/PanCancer/outdated/featSel_pathways/clinRNA_best/KIRC_writeEMap.R similarity index 100% rename from misc/PanCancer/featSel_pathways/clinRNA_best/KIRC_writeEMap.R rename to misc/PanCancer/outdated/featSel_pathways/clinRNA_best/KIRC_writeEMap.R diff --git a/misc/PanCancer/featSel_pathways/clinRNA_best/getPSN.R b/misc/PanCancer/outdated/featSel_pathways/clinRNA_best/getPSN.R similarity index 100% rename from misc/PanCancer/featSel_pathways/clinRNA_best/getPSN.R rename to misc/PanCancer/outdated/featSel_pathways/clinRNA_best/getPSN.R diff --git a/misc/PanCancer/featSel_pathways/clinRNA_best/getTarballed_clinRNAbest.sh b/misc/PanCancer/outdated/featSel_pathways/clinRNA_best/getTarballed_clinRNAbest.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/clinRNA_best/getTarballed_clinRNAbest.sh rename to misc/PanCancer/outdated/featSel_pathways/clinRNA_best/getTarballed_clinRNAbest.sh diff --git a/misc/PanCancer/featSel_pathways/clinRNA_best/showScoreRanking_clinRNAbest.R b/misc/PanCancer/outdated/featSel_pathways/clinRNA_best/showScoreRanking_clinRNAbest.R similarity index 100% rename from misc/PanCancer/featSel_pathways/clinRNA_best/showScoreRanking_clinRNAbest.R rename to misc/PanCancer/outdated/featSel_pathways/clinRNA_best/showScoreRanking_clinRNAbest.R diff --git a/misc/PanCancer/featSel_pathways/collectRes.R b/misc/PanCancer/outdated/featSel_pathways/collectRes.R similarity index 100% rename from misc/PanCancer/featSel_pathways/collectRes.R rename to misc/PanCancer/outdated/featSel_pathways/collectRes.R diff --git a/misc/PanCancer/featSel_pathways/collectRes_batch.R b/misc/PanCancer/outdated/featSel_pathways/collectRes_batch.R similarity index 100% rename from misc/PanCancer/featSel_pathways/collectRes_batch.R rename to misc/PanCancer/outdated/featSel_pathways/collectRes_batch.R diff --git a/misc/PanCancer/featSel_pathways/collectRes_consensus.R b/misc/PanCancer/outdated/featSel_pathways/collectRes_consensus.R similarity index 100% rename from misc/PanCancer/featSel_pathways/collectRes_consensus.R rename to misc/PanCancer/outdated/featSel_pathways/collectRes_consensus.R diff --git a/misc/PanCancer/featSel_pathways/compareConsNets.R b/misc/PanCancer/outdated/featSel_pathways/compareConsNets.R similarity index 100% rename from misc/PanCancer/featSel_pathways/compareConsNets.R rename to misc/PanCancer/outdated/featSel_pathways/compareConsNets.R diff --git a/misc/PanCancer/featSel_pathways/compareScores_realPseudo.R b/misc/PanCancer/outdated/featSel_pathways/compareScores_realPseudo.R similarity index 100% rename from misc/PanCancer/featSel_pathways/compareScores_realPseudo.R rename to misc/PanCancer/outdated/featSel_pathways/compareScores_realPseudo.R diff --git a/misc/PanCancer/featSel_pathways/compareTestSimRanks.R b/misc/PanCancer/outdated/featSel_pathways/compareTestSimRanks.R similarity index 100% rename from misc/PanCancer/featSel_pathways/compareTestSimRanks.R rename to misc/PanCancer/outdated/featSel_pathways/compareTestSimRanks.R diff --git a/misc/PanCancer/featSel_pathways/getAUC.R b/misc/PanCancer/outdated/featSel_pathways/getAUC.R similarity index 100% rename from misc/PanCancer/featSel_pathways/getAUC.R rename to misc/PanCancer/outdated/featSel_pathways/getAUC.R diff --git a/misc/PanCancer/featSel_pathways/getPSN.R b/misc/PanCancer/outdated/featSel_pathways/getPSN.R similarity index 100% rename from misc/PanCancer/featSel_pathways/getPSN.R rename to misc/PanCancer/outdated/featSel_pathways/getPSN.R diff --git a/misc/PanCancer/featSel_pathways/getTarballed.sh b/misc/PanCancer/outdated/featSel_pathways/getTarballed.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/getTarballed.sh rename to misc/PanCancer/outdated/featSel_pathways/getTarballed.sh diff --git a/misc/PanCancer/featSel_pathways/getTarballed_randomRes.sh b/misc/PanCancer/outdated/featSel_pathways/getTarballed_randomRes.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/getTarballed_randomRes.sh rename to misc/PanCancer/outdated/featSel_pathways/getTarballed_randomRes.sh diff --git a/misc/PanCancer/featSel_pathways/multiplot.R b/misc/PanCancer/outdated/featSel_pathways/multiplot.R similarity index 100% rename from misc/PanCancer/featSel_pathways/multiplot.R rename to misc/PanCancer/outdated/featSel_pathways/multiplot.R diff --git a/misc/PanCancer/featSel_pathways/oneClinNet/KIRC_featSel_pathways_oneClinNet.R b/misc/PanCancer/outdated/featSel_pathways/oneClinNet/KIRC_featSel_pathways_oneClinNet.R similarity index 100% rename from misc/PanCancer/featSel_pathways/oneClinNet/KIRC_featSel_pathways_oneClinNet.R rename to misc/PanCancer/outdated/featSel_pathways/oneClinNet/KIRC_featSel_pathways_oneClinNet.R diff --git a/misc/PanCancer/featSel_pathways/oneClinNet/KIRC_pathways_oneClinNet_normDiff.R b/misc/PanCancer/outdated/featSel_pathways/oneClinNet/KIRC_pathways_oneClinNet_normDiff.R similarity index 100% rename from misc/PanCancer/featSel_pathways/oneClinNet/KIRC_pathways_oneClinNet_normDiff.R rename to misc/PanCancer/outdated/featSel_pathways/oneClinNet/KIRC_pathways_oneClinNet_normDiff.R diff --git a/misc/PanCancer/featSel_pathways/oneClinNet/collectRes_oneClinNet.R b/misc/PanCancer/outdated/featSel_pathways/oneClinNet/collectRes_oneClinNet.R similarity index 100% rename from misc/PanCancer/featSel_pathways/oneClinNet/collectRes_oneClinNet.R rename to misc/PanCancer/outdated/featSel_pathways/oneClinNet/collectRes_oneClinNet.R diff --git a/misc/PanCancer/featSel_pathways/oneClinNet/getTarballed_clinOneNet.R b/misc/PanCancer/outdated/featSel_pathways/oneClinNet/getTarballed_clinOneNet.R similarity index 100% rename from misc/PanCancer/featSel_pathways/oneClinNet/getTarballed_clinOneNet.R rename to misc/PanCancer/outdated/featSel_pathways/oneClinNet/getTarballed_clinOneNet.R diff --git a/misc/PanCancer/featSel_pathways/oneClinNet/writeConsensusNets_oneClinNet_batch.R b/misc/PanCancer/outdated/featSel_pathways/oneClinNet/writeConsensusNets_oneClinNet_batch.R similarity index 100% rename from misc/PanCancer/featSel_pathways/oneClinNet/writeConsensusNets_oneClinNet_batch.R rename to misc/PanCancer/outdated/featSel_pathways/oneClinNet/writeConsensusNets_oneClinNet_batch.R diff --git a/misc/PanCancer/featSel_pathways/pathways/GM_OneVAll_getClass_altMethod.R b/misc/PanCancer/outdated/featSel_pathways/pathways/GM_OneVAll_getClass_altMethod.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/GM_OneVAll_getClass_altMethod.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/GM_OneVAll_getClass_altMethod.R diff --git a/misc/PanCancer/featSel_pathways/pathways/KIRC_featSel_pathways.R b/misc/PanCancer/outdated/featSel_pathways/pathways/KIRC_featSel_pathways.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/KIRC_featSel_pathways.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/KIRC_featSel_pathways.R diff --git a/misc/PanCancer/featSel_pathways/pathways/KIRC_featSel_pathwaysConsOnly.R b/misc/PanCancer/outdated/featSel_pathways/pathways/KIRC_featSel_pathwaysConsOnly.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/KIRC_featSel_pathwaysConsOnly.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/KIRC_featSel_pathwaysConsOnly.R diff --git a/misc/PanCancer/featSel_pathways/pathways/KIRC_featSel_pathwaysOnly.R b/misc/PanCancer/outdated/featSel_pathways/pathways/KIRC_featSel_pathwaysOnly.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/KIRC_featSel_pathwaysOnly.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/KIRC_featSel_pathwaysOnly.R diff --git a/misc/PanCancer/featSel_pathways/pathways/KIRC_featSel_pathwaysOnly_80.R b/misc/PanCancer/outdated/featSel_pathways/pathways/KIRC_featSel_pathwaysOnly_80.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/KIRC_featSel_pathwaysOnly_80.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/KIRC_featSel_pathwaysOnly_80.R diff --git a/misc/PanCancer/featSel_pathways/pathways/KIRC_featSel_pathwaysOnly_memtest.R b/misc/PanCancer/outdated/featSel_pathways/pathways/KIRC_featSel_pathwaysOnly_memtest.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/KIRC_featSel_pathwaysOnly_memtest.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/KIRC_featSel_pathwaysOnly_memtest.R diff --git a/misc/PanCancer/featSel_pathways/pathways/KIRC_pathOnly_checkConsensus.R b/misc/PanCancer/outdated/featSel_pathways/pathways/KIRC_pathOnly_checkConsensus.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/KIRC_pathOnly_checkConsensus.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/KIRC_pathOnly_checkConsensus.R diff --git a/misc/PanCancer/featSel_pathways/pathways/checkPathwayGone.R b/misc/PanCancer/outdated/featSel_pathways/pathways/checkPathwayGone.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/checkPathwayGone.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/checkPathwayGone.R diff --git a/misc/PanCancer/featSel_pathways/pathways/collectRes_consensus.R b/misc/PanCancer/outdated/featSel_pathways/pathways/collectRes_consensus.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/collectRes_consensus.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/collectRes_consensus.R diff --git a/misc/PanCancer/featSel_pathways/pathways/collectRes_pathOnly.R b/misc/PanCancer/outdated/featSel_pathways/pathways/collectRes_pathOnly.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/collectRes_pathOnly.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/collectRes_pathOnly.R diff --git a/misc/PanCancer/featSel_pathways/pathways/corrFeatWithSurvival.R b/misc/PanCancer/outdated/featSel_pathways/pathways/corrFeatWithSurvival.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/corrFeatWithSurvival.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/corrFeatWithSurvival.R diff --git a/misc/PanCancer/featSel_pathways/pathways/featSel_pathways_noFSgenes.R b/misc/PanCancer/outdated/featSel_pathways/pathways/featSel_pathways_noFSgenes.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/featSel_pathways_noFSgenes.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/featSel_pathways_noFSgenes.R diff --git a/misc/PanCancer/featSel_pathways/pathways/featSel_pathways_scrambled.R b/misc/PanCancer/outdated/featSel_pathways/pathways/featSel_pathways_scrambled.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/featSel_pathways_scrambled.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/featSel_pathways_scrambled.R diff --git a/misc/PanCancer/featSel_pathways/pathways/featSel_pseudoPathways_noPathGenes.R b/misc/PanCancer/outdated/featSel_pathways/pathways/featSel_pseudoPathways_noPathGenes.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/featSel_pseudoPathways_noPathGenes.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/featSel_pseudoPathways_noPathGenes.R diff --git a/misc/PanCancer/featSel_pathways/pathways/featSel_realPseudo_both.R b/misc/PanCancer/outdated/featSel_pathways/pathways/featSel_realPseudo_both.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/featSel_realPseudo_both.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/featSel_realPseudo_both.R diff --git a/misc/PanCancer/featSel_pathways/pathways/filterNetWt/getTarballed_path9095.sh b/misc/PanCancer/outdated/featSel_pathways/pathways/filterNetWt/getTarballed_path9095.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/filterNetWt/getTarballed_path9095.sh rename to misc/PanCancer/outdated/featSel_pathways/pathways/filterNetWt/getTarballed_path9095.sh diff --git a/misc/PanCancer/featSel_pathways/pathways/getTar_allPlusPathways.sh b/misc/PanCancer/outdated/featSel_pathways/pathways/getTar_allPlusPathways.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/getTar_allPlusPathways.sh rename to misc/PanCancer/outdated/featSel_pathways/pathways/getTar_allPlusPathways.sh diff --git a/misc/PanCancer/featSel_pathways/pathways/getTar_featSelPseudo.sh b/misc/PanCancer/outdated/featSel_pathways/pathways/getTar_featSelPseudo.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/getTar_featSelPseudo.sh rename to misc/PanCancer/outdated/featSel_pathways/pathways/getTar_featSelPseudo.sh diff --git a/misc/PanCancer/featSel_pathways/pathways/getTar_pathNoFS.sh b/misc/PanCancer/outdated/featSel_pathways/pathways/getTar_pathNoFS.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/getTar_pathNoFS.sh rename to misc/PanCancer/outdated/featSel_pathways/pathways/getTar_pathNoFS.sh diff --git a/misc/PanCancer/featSel_pathways/pathways/getTar_realPseudo.sh b/misc/PanCancer/outdated/featSel_pathways/pathways/getTar_realPseudo.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/getTar_realPseudo.sh rename to misc/PanCancer/outdated/featSel_pathways/pathways/getTar_realPseudo.sh diff --git a/misc/PanCancer/featSel_pathways/pathways/getTar_scrambled.sh b/misc/PanCancer/outdated/featSel_pathways/pathways/getTar_scrambled.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/getTar_scrambled.sh rename to misc/PanCancer/outdated/featSel_pathways/pathways/getTar_scrambled.sh diff --git a/misc/PanCancer/featSel_pathways/pathways/getTar_scrambled2.sh b/misc/PanCancer/outdated/featSel_pathways/pathways/getTar_scrambled2.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/getTar_scrambled2.sh rename to misc/PanCancer/outdated/featSel_pathways/pathways/getTar_scrambled2.sh diff --git a/misc/PanCancer/featSel_pathways/pathways/getTarballed_AltClassMethod.sh b/misc/PanCancer/outdated/featSel_pathways/pathways/getTarballed_AltClassMethod.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/getTarballed_AltClassMethod.sh rename to misc/PanCancer/outdated/featSel_pathways/pathways/getTarballed_AltClassMethod.sh diff --git a/misc/PanCancer/featSel_pathways/pathways/getTarballed_AltClass_noFS.sh b/misc/PanCancer/outdated/featSel_pathways/pathways/getTarballed_AltClass_noFS.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/getTarballed_AltClass_noFS.sh rename to misc/PanCancer/outdated/featSel_pathways/pathways/getTarballed_AltClass_noFS.sh diff --git a/misc/PanCancer/featSel_pathways/pathways/getTarballed_pathFull_AltClass.sh b/misc/PanCancer/outdated/featSel_pathways/pathways/getTarballed_pathFull_AltClass.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/getTarballed_pathFull_AltClass.sh rename to misc/PanCancer/outdated/featSel_pathways/pathways/getTarballed_pathFull_AltClass.sh diff --git a/misc/PanCancer/featSel_pathways/pathways/getTarballed_pathGMresults.R b/misc/PanCancer/outdated/featSel_pathways/pathways/getTarballed_pathGMresults.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/getTarballed_pathGMresults.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/getTarballed_pathGMresults.R diff --git a/misc/PanCancer/featSel_pathways/pathways/getTarballed_pathOnly.sh b/misc/PanCancer/outdated/featSel_pathways/pathways/getTarballed_pathOnly.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/getTarballed_pathOnly.sh rename to misc/PanCancer/outdated/featSel_pathways/pathways/getTarballed_pathOnly.sh diff --git a/misc/PanCancer/featSel_pathways/pathways/getTarballed_pathOnly80.sh b/misc/PanCancer/outdated/featSel_pathways/pathways/getTarballed_pathOnly80.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/getTarballed_pathOnly80.sh rename to misc/PanCancer/outdated/featSel_pathways/pathways/getTarballed_pathOnly80.sh diff --git a/misc/PanCancer/featSel_pathways/pathways/getTarballed_pathOnly_getPRANK.sh b/misc/PanCancer/outdated/featSel_pathways/pathways/getTarballed_pathOnly_getPRANK.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/getTarballed_pathOnly_getPRANK.sh rename to misc/PanCancer/outdated/featSel_pathways/pathways/getTarballed_pathOnly_getPRANK.sh diff --git a/misc/PanCancer/featSel_pathways/pathways/getTarballed_pathOnly_strict.sh b/misc/PanCancer/outdated/featSel_pathways/pathways/getTarballed_pathOnly_strict.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/getTarballed_pathOnly_strict.sh rename to misc/PanCancer/outdated/featSel_pathways/pathways/getTarballed_pathOnly_strict.sh diff --git a/misc/PanCancer/featSel_pathways/pathways/pathwaysFull_AltClassMethod.R b/misc/PanCancer/outdated/featSel_pathways/pathways/pathwaysFull_AltClassMethod.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/pathwaysFull_AltClassMethod.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/pathwaysFull_AltClassMethod.R diff --git a/misc/PanCancer/featSel_pathways/pathways/pathwaysOnly_plotResults.R b/misc/PanCancer/outdated/featSel_pathways/pathways/pathwaysOnly_plotResults.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/pathwaysOnly_plotResults.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/pathwaysOnly_plotResults.R diff --git a/misc/PanCancer/featSel_pathways/pathways/pathways_AltClassMethod.R b/misc/PanCancer/outdated/featSel_pathways/pathways/pathways_AltClassMethod.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/pathways_AltClassMethod.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/pathways_AltClassMethod.R diff --git a/misc/PanCancer/featSel_pathways/pathways/pathways_AltClassMethod_designD_noFS.R b/misc/PanCancer/outdated/featSel_pathways/pathways/pathways_AltClassMethod_designD_noFS.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/pathways_AltClassMethod_designD_noFS.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/pathways_AltClassMethod_designD_noFS.R diff --git a/misc/PanCancer/featSel_pathways/pathways/pathways_changeFilterWt.R b/misc/PanCancer/outdated/featSel_pathways/pathways/pathways_changeFilterWt.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/pathways_changeFilterWt.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/pathways_changeFilterWt.R diff --git a/misc/PanCancer/featSel_pathways/pathways/pathways_getPSN.R b/misc/PanCancer/outdated/featSel_pathways/pathways/pathways_getPSN.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/pathways_getPSN.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/pathways_getPSN.R diff --git a/misc/PanCancer/featSel_pathways/pathways/plotRandom.R b/misc/PanCancer/outdated/featSel_pathways/pathways/plotRandom.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/plotRandom.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/plotRandom.R diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/KIRC_pathOnly_checkRandom.R b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/KIRC_pathOnly_checkRandom.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/KIRC_pathOnly_checkRandom.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/KIRC_pathOnly_checkRandom.R diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/KIRC_pathOnly_checkRandom_notFS.R b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/KIRC_pathOnly_checkRandom_notFS.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/KIRC_pathOnly_checkRandom_notFS.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/KIRC_pathOnly_checkRandom_notFS.R diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/KIRC_pathOnly_checkRandom_shufGenes.R b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/KIRC_pathOnly_checkRandom_shufGenes.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/KIRC_pathOnly_checkRandom_shufGenes.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/KIRC_pathOnly_checkRandom_shufGenes.R diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/KIRC_pathOnly_checkRandom_smaller.R b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/KIRC_pathOnly_checkRandom_smaller.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/KIRC_pathOnly_checkRandom_smaller.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/KIRC_pathOnly_checkRandom_smaller.R diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/KIRC_pathOnly_checkRandom_smallest.R b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/KIRC_pathOnly_checkRandom_smallest.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/KIRC_pathOnly_checkRandom_smallest.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/KIRC_pathOnly_checkRandom_smallest.R diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/KIRC_randomPath_designD.R b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/KIRC_randomPath_designD.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/KIRC_randomPath_designD.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/KIRC_randomPath_designD.R diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/KIRC_randomPath_designD_noFS.R b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/KIRC_randomPath_designD_noFS.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/KIRC_randomPath_designD_noFS.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/KIRC_randomPath_designD_noFS.R diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/KIRC_randomPath_designD_shufGenes.R b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/KIRC_randomPath_designD_shufGenes.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/KIRC_randomPath_designD_shufGenes.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/KIRC_randomPath_designD_shufGenes.R diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/changePathwaySize.R b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/changePathwaySize.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/changePathwaySize.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/changePathwaySize.R diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/collectRes_psize.R b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/collectRes_psize.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/collectRes_psize.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/collectRes_psize.R diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/designD_getNets.R b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/designD_getNets.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/designD_getNets.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/designD_getNets.R diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/designD_shufGetPathways.R b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/designD_shufGetPathways.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/designD_shufGetPathways.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/designD_shufGetPathways.R diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/getTar_netNames_pseudoPath_noFS.sh b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/getTar_netNames_pseudoPath_noFS.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/getTar_netNames_pseudoPath_noFS.sh rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/getTar_netNames_pseudoPath_noFS.sh diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/getTar_psize.sh b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/getTar_psize.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/getTar_psize.sh rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/getTar_psize.sh diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/getTarballed_designD_noFS.sh b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/getTarballed_designD_noFS.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/getTarballed_designD_noFS.sh rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/getTarballed_designD_noFS.sh diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/getTarballed_designD_noFS_PRANK.sh b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/getTarballed_designD_noFS_PRANK.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/getTarballed_designD_noFS_PRANK.sh rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/getTarballed_designD_noFS_PRANK.sh diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/getTarballed_designD_shufGenes_noFS.sh b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/getTarballed_designD_shufGenes_noFS.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/getTarballed_designD_shufGenes_noFS.sh rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/getTarballed_designD_shufGenes_noFS.sh diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/getTarballed_designD_shufGenes_noPathGenes.sh b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/getTarballed_designD_shufGenes_noPathGenes.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/getTarballed_designD_shufGenes_noPathGenes.sh rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/getTarballed_designD_shufGenes_noPathGenes.sh diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/getTarballed_pathOnly_randomRes.sh b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/getTarballed_pathOnly_randomRes.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/getTarballed_pathOnly_randomRes.sh rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/getTarballed_pathOnly_randomRes.sh diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/getTarballed_pathOnly_shufGenes.sh b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/getTarballed_pathOnly_shufGenes.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/getTarballed_pathOnly_shufGenes.sh rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/getTarballed_pathOnly_shufGenes.sh diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/getTarballed_pathOnly_smaller.sh b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/getTarballed_pathOnly_smaller.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/getTarballed_pathOnly_smaller.sh rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/getTarballed_pathOnly_smaller.sh diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/getTarballed_pathOnly_strictNoFS.sh b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/getTarballed_pathOnly_strictNoFS.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/getTarballed_pathOnly_strictNoFS.sh rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/getTarballed_pathOnly_strictNoFS.sh diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/getTarballed_randomDesignD.R b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/getTarballed_randomDesignD.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/getTarballed_randomDesignD.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/getTarballed_randomDesignD.R diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/getTarballed_randomDesignD_shuf.R b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/getTarballed_randomDesignD_shuf.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/getTarballed_randomDesignD_shuf.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/getTarballed_randomDesignD_shuf.R diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/plotRandom_shufGenes.R b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/plotRandom_shufGenes.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/plotRandom_shufGenes.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/plotRandom_shufGenes.R diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/plotRandom_smaller.R b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/plotRandom_smaller.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/plotRandom_smaller.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/plotRandom_smaller.R diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/plotRandom_strict.R b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/plotRandom_strict.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/plotRandom_strict.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/plotRandom_strict.R diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/plotRandom_strictNoFS.R b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/plotRandom_strictNoFS.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/plotRandom_strictNoFS.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/plotRandom_strictNoFS.R diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/pseudoPath_noPathGenes_getNetScores.R b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/pseudoPath_noPathGenes_getNetScores.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/pseudoPath_noPathGenes_getNetScores.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/pseudoPath_noPathGenes_getNetScores.R diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/randomPath_designD_rmFSpath.R b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/randomPath_designD_rmFSpath.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/randomPath_designD_rmFSpath.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/randomPath_designD_rmFSpath.R diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/randomPath_designD_rmFSpath_top90.R b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/randomPath_designD_rmFSpath_top90.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/randomPath_designD_rmFSpath_top90.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/randomPath_designD_rmFSpath_top90.R diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/randomPath_designD_shufGenes_noFS.R b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/randomPath_designD_shufGenes_noFS.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/randomPath_designD_shufGenes_noFS.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/randomPath_designD_shufGenes_noFS.R diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/shufflePathways.R b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/shufflePathways.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/shufflePathways.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/shufflePathways.R diff --git a/misc/PanCancer/featSel_pathways/pathways/randomDesigns/strictNoFS_getNetNames.sh b/misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/strictNoFS_getNetNames.sh similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/randomDesigns/strictNoFS_getNetNames.sh rename to misc/PanCancer/outdated/featSel_pathways/pathways/randomDesigns/strictNoFS_getNetNames.sh diff --git a/misc/PanCancer/featSel_pathways/pathways/strictNoFS_makeEM.R b/misc/PanCancer/outdated/featSel_pathways/pathways/strictNoFS_makeEM.R similarity index 100% rename from misc/PanCancer/featSel_pathways/pathways/strictNoFS_makeEM.R rename to misc/PanCancer/outdated/featSel_pathways/pathways/strictNoFS_makeEM.R diff --git a/misc/PanCancer/featSel_pathways/plotPerf.R b/misc/PanCancer/outdated/featSel_pathways/plotPerf.R similarity index 100% rename from misc/PanCancer/featSel_pathways/plotPerf.R rename to misc/PanCancer/outdated/featSel_pathways/plotPerf.R diff --git a/misc/PanCancer/featSel_pathways/plotRandom.R b/misc/PanCancer/outdated/featSel_pathways/plotRandom.R similarity index 100% rename from misc/PanCancer/featSel_pathways/plotRandom.R rename to misc/PanCancer/outdated/featSel_pathways/plotRandom.R diff --git a/misc/PanCancer/featSel_pathways/rankPatients_test.R b/misc/PanCancer/outdated/featSel_pathways/rankPatients_test.R similarity index 100% rename from misc/PanCancer/featSel_pathways/rankPatients_test.R rename to misc/PanCancer/outdated/featSel_pathways/rankPatients_test.R diff --git a/misc/PanCancer/featSel_pathways/showScoreRanking.R b/misc/PanCancer/outdated/featSel_pathways/showScoreRanking.R similarity index 100% rename from misc/PanCancer/featSel_pathways/showScoreRanking.R rename to misc/PanCancer/outdated/featSel_pathways/showScoreRanking.R diff --git a/misc/PanCancer/featSel_pathways/simRank_compareRealShuf.R b/misc/PanCancer/outdated/featSel_pathways/simRank_compareRealShuf.R similarity index 100% rename from misc/PanCancer/featSel_pathways/simRank_compareRealShuf.R rename to misc/PanCancer/outdated/featSel_pathways/simRank_compareRealShuf.R diff --git a/misc/PanCancer/featSel_pathways/survivalPlots.R b/misc/PanCancer/outdated/featSel_pathways/survivalPlots.R similarity index 100% rename from misc/PanCancer/featSel_pathways/survivalPlots.R rename to misc/PanCancer/outdated/featSel_pathways/survivalPlots.R diff --git a/misc/PanCancer/featSel_pathways/writeConsensusNets_batch.R b/misc/PanCancer/outdated/featSel_pathways/writeConsensusNets_batch.R similarity index 100% rename from misc/PanCancer/featSel_pathways/writeConsensusNets_batch.R rename to misc/PanCancer/outdated/featSel_pathways/writeConsensusNets_batch.R diff --git a/misc/PanCancer/featSel_pathways/writeConsensusNets_oneSet.R b/misc/PanCancer/outdated/featSel_pathways/writeConsensusNets_oneSet.R similarity index 100% rename from misc/PanCancer/featSel_pathways/writeConsensusNets_oneSet.R rename to misc/PanCancer/outdated/featSel_pathways/writeConsensusNets_oneSet.R diff --git a/misc/PanCancer/featSel_pathways/writeEmap.R b/misc/PanCancer/outdated/featSel_pathways/writeEmap.R similarity index 100% rename from misc/PanCancer/featSel_pathways/writeEmap.R rename to misc/PanCancer/outdated/featSel_pathways/writeEmap.R diff --git a/misc/PanCancer/featSel_pathways_somaticMutations/GBM_featSel.R b/misc/PanCancer/outdated/featSel_pathways_somaticMutations/GBM_featSel.R similarity index 100% rename from misc/PanCancer/featSel_pathways_somaticMutations/GBM_featSel.R rename to misc/PanCancer/outdated/featSel_pathways_somaticMutations/GBM_featSel.R diff --git a/misc/PanCancer/featSel_pathways_somaticMutations/GBM_featSel_somMut.R b/misc/PanCancer/outdated/featSel_pathways_somaticMutations/GBM_featSel_somMut.R similarity index 100% rename from misc/PanCancer/featSel_pathways_somaticMutations/GBM_featSel_somMut.R rename to misc/PanCancer/outdated/featSel_pathways_somaticMutations/GBM_featSel_somMut.R diff --git a/misc/PanCancer/featSel_pathways_somaticMutations/KIRC_featSel_somMut.r b/misc/PanCancer/outdated/featSel_pathways_somaticMutations/KIRC_featSel_somMut.r similarity index 100% rename from misc/PanCancer/featSel_pathways_somaticMutations/KIRC_featSel_somMut.r rename to misc/PanCancer/outdated/featSel_pathways_somaticMutations/KIRC_featSel_somMut.r diff --git a/misc/PanCancer/featSel_pathways_somaticMutations/LUSC_featSel.R b/misc/PanCancer/outdated/featSel_pathways_somaticMutations/LUSC_featSel.R similarity index 100% rename from misc/PanCancer/featSel_pathways_somaticMutations/LUSC_featSel.R rename to misc/PanCancer/outdated/featSel_pathways_somaticMutations/LUSC_featSel.R diff --git a/misc/PanCancer/featSel_pathways_somaticMutations/LUSC_featSel_clinRNA.R b/misc/PanCancer/outdated/featSel_pathways_somaticMutations/LUSC_featSel_clinRNA.R similarity index 100% rename from misc/PanCancer/featSel_pathways_somaticMutations/LUSC_featSel_clinRNA.R rename to misc/PanCancer/outdated/featSel_pathways_somaticMutations/LUSC_featSel_clinRNA.R diff --git a/misc/PanCancer/featSel_pathways_somaticMutations/LUSC_featSel_clinRPPA.R b/misc/PanCancer/outdated/featSel_pathways_somaticMutations/LUSC_featSel_clinRPPA.R similarity index 100% rename from misc/PanCancer/featSel_pathways_somaticMutations/LUSC_featSel_clinRPPA.R rename to misc/PanCancer/outdated/featSel_pathways_somaticMutations/LUSC_featSel_clinRPPA.R diff --git a/misc/PanCancer/featSel_pathways_somaticMutations/LUSC_featSel_somMut.R b/misc/PanCancer/outdated/featSel_pathways_somaticMutations/LUSC_featSel_somMut.R similarity index 100% rename from misc/PanCancer/featSel_pathways_somaticMutations/LUSC_featSel_somMut.R rename to misc/PanCancer/outdated/featSel_pathways_somaticMutations/LUSC_featSel_somMut.R diff --git a/misc/PanCancer/featSel_pathways_somaticMutations/OV_featSel_somMut.R b/misc/PanCancer/outdated/featSel_pathways_somaticMutations/OV_featSel_somMut.R similarity index 100% rename from misc/PanCancer/featSel_pathways_somaticMutations/OV_featSel_somMut.R rename to misc/PanCancer/outdated/featSel_pathways_somaticMutations/OV_featSel_somMut.R diff --git a/misc/PanCancer/featSel_pathways_somaticMutations/PanCancer_featSel_getAcc.R b/misc/PanCancer/outdated/featSel_pathways_somaticMutations/PanCancer_featSel_getAcc.R similarity index 100% rename from misc/PanCancer/featSel_pathways_somaticMutations/PanCancer_featSel_getAcc.R rename to misc/PanCancer/outdated/featSel_pathways_somaticMutations/PanCancer_featSel_getAcc.R diff --git a/misc/PanCancer/featSel_pathways_somaticMutations/PanCancer_featSel_plotPerf.R b/misc/PanCancer/outdated/featSel_pathways_somaticMutations/PanCancer_featSel_plotPerf.R similarity index 100% rename from misc/PanCancer/featSel_pathways_somaticMutations/PanCancer_featSel_plotPerf.R rename to misc/PanCancer/outdated/featSel_pathways_somaticMutations/PanCancer_featSel_plotPerf.R diff --git a/misc/PanCancer/featSel_predictor/GBM_featSel.R b/misc/PanCancer/outdated/featSel_predictor/GBM_featSel.R similarity index 100% rename from misc/PanCancer/featSel_predictor/GBM_featSel.R rename to misc/PanCancer/outdated/featSel_predictor/GBM_featSel.R diff --git a/misc/PanCancer/featSel_predictor/GBM_featSel_somMut.R b/misc/PanCancer/outdated/featSel_predictor/GBM_featSel_somMut.R similarity index 100% rename from misc/PanCancer/featSel_predictor/GBM_featSel_somMut.R rename to misc/PanCancer/outdated/featSel_predictor/GBM_featSel_somMut.R diff --git a/misc/PanCancer/featSel_predictor/KIRC_featSel_somMut.r b/misc/PanCancer/outdated/featSel_predictor/KIRC_featSel_somMut.r similarity index 100% rename from misc/PanCancer/featSel_predictor/KIRC_featSel_somMut.r rename to misc/PanCancer/outdated/featSel_predictor/KIRC_featSel_somMut.r diff --git a/misc/PanCancer/featSel_predictor/LUSC_featSel.R b/misc/PanCancer/outdated/featSel_predictor/LUSC_featSel.R similarity index 100% rename from misc/PanCancer/featSel_predictor/LUSC_featSel.R rename to misc/PanCancer/outdated/featSel_predictor/LUSC_featSel.R diff --git a/misc/PanCancer/featSel_predictor/LUSC_featSel_clinRNA.R b/misc/PanCancer/outdated/featSel_predictor/LUSC_featSel_clinRNA.R similarity index 100% rename from misc/PanCancer/featSel_predictor/LUSC_featSel_clinRNA.R rename to misc/PanCancer/outdated/featSel_predictor/LUSC_featSel_clinRNA.R diff --git a/misc/PanCancer/featSel_predictor/LUSC_featSel_clinRPPA.R b/misc/PanCancer/outdated/featSel_predictor/LUSC_featSel_clinRPPA.R similarity index 100% rename from misc/PanCancer/featSel_predictor/LUSC_featSel_clinRPPA.R rename to misc/PanCancer/outdated/featSel_predictor/LUSC_featSel_clinRPPA.R diff --git a/misc/PanCancer/featSel_predictor/LUSC_featSel_somMut.R b/misc/PanCancer/outdated/featSel_predictor/LUSC_featSel_somMut.R similarity index 100% rename from misc/PanCancer/featSel_predictor/LUSC_featSel_somMut.R rename to misc/PanCancer/outdated/featSel_predictor/LUSC_featSel_somMut.R diff --git a/misc/PanCancer/featSel_predictor/OV_featSel_somMut.R b/misc/PanCancer/outdated/featSel_predictor/OV_featSel_somMut.R similarity index 100% rename from misc/PanCancer/featSel_predictor/OV_featSel_somMut.R rename to misc/PanCancer/outdated/featSel_predictor/OV_featSel_somMut.R diff --git a/misc/PanCancer/featSel_predictor/PanCancer_featSel_getAcc.R b/misc/PanCancer/outdated/featSel_predictor/PanCancer_featSel_getAcc.R similarity index 100% rename from misc/PanCancer/featSel_predictor/PanCancer_featSel_getAcc.R rename to misc/PanCancer/outdated/featSel_predictor/PanCancer_featSel_getAcc.R diff --git a/misc/PanCancer/featSel_predictor/PanCancer_featSel_plotPerf.R b/misc/PanCancer/outdated/featSel_predictor/PanCancer_featSel_plotPerf.R similarity index 100% rename from misc/PanCancer/featSel_predictor/PanCancer_featSel_plotPerf.R rename to misc/PanCancer/outdated/featSel_predictor/PanCancer_featSel_plotPerf.R diff --git a/misc/PanCancer/getFSNets.R b/misc/PanCancer/outdated/getFSNets.R similarity index 100% rename from misc/PanCancer/getFSNets.R rename to misc/PanCancer/outdated/getFSNets.R diff --git a/misc/PanCancer/getTrainTestPct.R b/misc/PanCancer/outdated/getTrainTestPct.R similarity index 100% rename from misc/PanCancer/getTrainTestPct.R rename to misc/PanCancer/outdated/getTrainTestPct.R diff --git a/misc/PanCancer/sp_GBM_classifier_netdx.r b/misc/PanCancer/outdated/sp_GBM_classifier_netdx.r similarity index 100% rename from misc/PanCancer/sp_GBM_classifier_netdx.r rename to misc/PanCancer/outdated/sp_GBM_classifier_netdx.r diff --git a/misc/PanCancer/pruneVersion/LUSC_pruneTrain.R b/misc/PanCancer/pruneVersion/LUSC_pruneTrain.R index a3649b6a..6f833e8b 100644 --- a/misc/PanCancer/pruneVersion/LUSC_pruneTrain.R +++ b/misc/PanCancer/pruneVersion/LUSC_pruneTrain.R @@ -191,7 +191,7 @@ for (rngNum in 1:100) { cat("Pruning\n") for (nm in setdiff(names(dats_train),"clinical")) { print(nm) - if (nrow(dats_train[[nm]])>10000 | nm == "prot") + if (nrow(dats_train[[nm]])>10000) topVar <- 50 else topVar <- 100 #topVar <- 50 pdf(sprintf("%s/%s_prune.pdf",outDir,nm)) @@ -202,7 +202,6 @@ for (rngNum in 1:100) { if (!is.na(prune)) { if (prune$bestThresh < 0.9) { res <- prune$res - if (nm == "prot") prune$bestThresh <- 0.6 res <- subset(res, adj.P.Val < prune$bestThresh) tmp <- dats_train[[nm]];orig_ct <- nrow(tmp) tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] From 0755c37919cb505a6ca2e67cf0c1d24ce17a6ec7 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Fri, 20 Apr 2018 15:04:10 -0400 Subject: [PATCH 029/124] various input sources --- misc/Ependymoma/Epen_plotDNAm.R | 66 +++++++++++++ misc/Ependymoma/Epen_plotResults.R | 41 +++++--- misc/Ependymoma/epen_dnam_hclust.pdf | Bin 0 -> 9858 bytes misc/Ependymoma/netDx_DNAm_tSNE.R | 47 +++++++++ misc/Ependymoma/netDx_RuthData.R | 6 +- misc/Ependymoma/netDx_pruned.R | 81 ++++++++++++++++ misc/Ependymoma/netDx_prunedDNAm.R | 105 +++++++++++++++++++++ misc/Ependymoma/netDx_prunedOneNet.R | 79 ++++++++++++++++ misc/Ependymoma/netDx_prunedPath_lenient.R | 81 ++++++++++++++++ 9 files changed, 491 insertions(+), 15 deletions(-) create mode 100644 misc/Ependymoma/Epen_plotDNAm.R create mode 100644 misc/Ependymoma/epen_dnam_hclust.pdf create mode 100644 misc/Ependymoma/netDx_DNAm_tSNE.R create mode 100644 misc/Ependymoma/netDx_pruned.R create mode 100644 misc/Ependymoma/netDx_prunedDNAm.R create mode 100644 misc/Ependymoma/netDx_prunedOneNet.R create mode 100644 misc/Ependymoma/netDx_prunedPath_lenient.R diff --git a/misc/Ependymoma/Epen_plotDNAm.R b/misc/Ependymoma/Epen_plotDNAm.R new file mode 100644 index 00000000..7b30cd76 --- /dev/null +++ b/misc/Ependymoma/Epen_plotDNAm.R @@ -0,0 +1,66 @@ +#' plot BRCA results +rm(list=ls()) +require(netDx) +require(netDx.examples) +#rootDir <- "/Users/shraddhapai/Dropbox/netDx/BaderLab/2017_Ependymoma" +rootDir <- "/home/shraddhapai/BaderLab/2017_Ependymoma" + +phenoFile <- "/home/shraddhapai/BaderLab/2018_Epen_DNAm/input/GSE90496_pData.txt" +pheno <- read.delim(phenoFile,sep="\t",h=T,as.is=T) +ttype <- pheno$characteristics_ch1 + +inFile <- sprintf("%s/input/netDx_prepared/Ependymoma_cohortMerged_180125.Rdata",rootDir) +load(inFile) +# ---------------------- +# input processing +pheno <- read.delim(phenoFile,sep="\t",h=T,as.is=T) +ttype <- pheno$characteristics_ch1 +idx <- which(ttype %in% c("methylation class: EPN, PF A","methylation class: EPN, PF B")) +cat(sprintf("Got %i samples\n",length(idx))) +pheno <- pheno[idx,] # limit to EPN samples +cpos <- regexpr("sample", pheno$title) +bpos <- regexpr("\\[reference", pheno$title) +str <- as.integer(substr(pheno$title, cpos+7, bpos-2)) # get sample number +pheno$ID <- paste("SAMPLE", str,sep=".") +pheno <- pheno[,c("ID","characteristics_ch1")] +st <- rep("",nrow(pheno)) +st[grep("PF A", pheno[,2])] <- "PFA" +st[grep("PF B", pheno[,2])] <- "PFB" +pheno$STATUS <- st + +#out <- plotAllResults(pheno, sprintf("%s/pred",rootDir), +# outDir=sprintf("%s/plot",rootDir), +# fsCutoff=10,fsPctPass=0.7,pathwaySet=pathwayList) + +#setName <- "Epen_prunedOneNet_0.001_180409" +#setName <- "Epen_prunedPathway_0.01_180409" +setName <- "Epen_prunedOneNet_0.001_180410" +inDir <- sprintf("%s/output/%s/pred",rootDir,setName) +outDir <- sprintf("%s/output/%s/plot",rootDir,setName) + +if (!file.exists(outDir)) dir.create(outDir) +predClasses <- unique(pheno$STATUS) +postscript(sprintf("%s/perf.eps",outDir)) +predPerf <- plotPerf(inDir, predClasses=predClasses) +dev.off() +auroc <- unlist(lapply(predPerf, function(x) x$auroc*100)) +aupr <- unlist(lapply(predPerf, function(x) x$aupr*100)) +acc <- unlist(lapply(predPerf, function(x) x$accuracy)) + +cat("--------------\n") +cat(sprintf("Performance: %s\n",setName)) +cat(sprintf("AUROC = %1.2f +/- %1.2f\n",mean(auroc),sd(auroc))) +cat(sprintf("AUPR = %1.2f +/- %1.2f\n",mean(aupr),sd(aupr))) +cat(sprintf("Accuracy = %1.2f +/- %1.2f\n",mean(acc),sd(acc))) +cat("--------------\n") + + +###featScores <- getFeatureScores(inDir,predClasses=predClasses) +###featSelNet <- lapply(featScores, function(x) { +### callFeatSel(x, fsCutoff=10, fsPctPass=0.9) +###}) +### +###netInfoFile <- sprintf("%s/inputNets.txt",inDir) +###netInfo <- read.delim(netInfoFile,sep="\t",h=FALSE,as.is=TRUE) +###EMap_input <- writeEMapInput_many(featScores,pathwayList, +### netInfo,outDir=outDir) diff --git a/misc/Ependymoma/Epen_plotResults.R b/misc/Ependymoma/Epen_plotResults.R index bdaa12fd..bdae47a4 100644 --- a/misc/Ependymoma/Epen_plotResults.R +++ b/misc/Ependymoma/Epen_plotResults.R @@ -2,7 +2,8 @@ rm(list=ls()) require(netDx) require(netDx.examples) -rootDir <- "/Users/shraddhapai/Dropbox/netDx/BaderLab/2017_Ependymoma" +#rootDir <- "/Users/shraddhapai/Dropbox/netDx/BaderLab/2017_Ependymoma" +rootDir <- "/home/shraddhapai/BaderLab/2017_Ependymoma" inFile <- sprintf("%s/input/netDx_prepared/Ependymoma_cohortMerged_180125.Rdata",rootDir) load(inFile) @@ -20,19 +21,35 @@ pathwayList <- readPathways(pathFile) # outDir=sprintf("%s/plot",rootDir), # fsCutoff=10,fsPctPass=0.7,pathwaySet=pathwayList) -inDir <- sprintf("%s/output/Epen_180125/pred",rootDir) -outDir <- sprintf("%s/output/Epen_180125/plot",rootDir) +#setName <- "Epen_prunedOneNet_0.001_180409" +#setName <- "Epen_prunedPathway_0.01_180409" +setName <- "Epen_prunedOneNet_0.001_180410" +inDir <- sprintf("%s/output/%s/pred",rootDir,setName) +outDir <- sprintf("%s/output/%s/plot",rootDir,setName) + if (!file.exists(outDir)) dir.create(outDir) predClasses <- unique(pheno$STATUS) postscript(sprintf("%s/perf.eps",outDir)) predPerf <- plotPerf(inDir, predClasses=predClasses) dev.off() -featScores <- getFeatureScores(inDir,predClasses=predClasses) -featSelNet <- lapply(featScores, function(x) { - callFeatSel(x, fsCutoff=10, fsPctPass=0.9) -}) - -netInfoFile <- sprintf("%s/inputNets.txt",inDir) -netInfo <- read.delim(netInfoFile,sep="\t",h=FALSE,as.is=TRUE) -EMap_input <- writeEMapInput_many(featScores,pathwayList, - netInfo,outDir=outDir) +auroc <- unlist(lapply(predPerf, function(x) x$auroc)) +aupr <- unlist(lapply(predPerf, function(x) x$aupr)) +acc <- unlist(lapply(predPerf, function(x) x$accuracy)) + +cat("--------------\n") +cat(sprintf("Performance: %s\n",setName)) +cat(sprintf("AUROC = %1.2f +/- %1.2f\n",mean(auroc),sd(auroc))) +cat(sprintf("AUPR = %1.2f +/- %1.2f\n",mean(aupr),sd(aupr))) +cat(sprintf("Accuracy = %1.2f +/- %1.2f\n",mean(acc),sd(acc))) +cat("--------------\n") + + +###featScores <- getFeatureScores(inDir,predClasses=predClasses) +###featSelNet <- lapply(featScores, function(x) { +### callFeatSel(x, fsCutoff=10, fsPctPass=0.9) +###}) +### +###netInfoFile <- sprintf("%s/inputNets.txt",inDir) +###netInfo <- read.delim(netInfoFile,sep="\t",h=FALSE,as.is=TRUE) +###EMap_input <- writeEMapInput_many(featScores,pathwayList, +### netInfo,outDir=outDir) diff --git a/misc/Ependymoma/epen_dnam_hclust.pdf b/misc/Ependymoma/epen_dnam_hclust.pdf new file mode 100644 index 0000000000000000000000000000000000000000..dcdcd4a295308b77a22d3f7810cc4fc585404223 GIT binary patch literal 9858 zcmch7XH-*Lx2V!VKthoOQ3423gb;cZL?A?Z3%!RBAhcizy@}LA6Ojl;GCV6_3fNAB0^n%KtOotibya?VzEU=#RzpY0`_*)#>I1eY1GgL-SUKR>N;oV5U4qzxZKsZPo zmf(OR^X)|fWaiG`mtAbFH6Yty@FBEi*4*(NX)3+BTI=C$b`aHD&gz(F3=W{5QCdm6 z2lZAS6NnElUia8LZeGlrH>$0gd-k$3IArTqo>Ius#(Slb(2)K4(Eay$p&>uN)Onjv zY=s{m{p1YI8^j(Q4%E#YtvnAon#(#k6#2E$NzkRsvkiTph_h7^kCJ! zNyy`|om*yI&eIQB=?yV&IKt0ex6+z+_1|ObfwlM%=JrcdqrvIDo4tAZPnai&8^`lA zTE7mD1`_u7Hdc=}f0TxPLE}P>TRZoP*%w+i2`w;7zFpVJU-JhO^OwuNM-S{Mg&fT7 zTsb=4-&#!fOtU?hv^{t#<{{dWIMQk~MKy-ZszGHgW3+ zz0Z8ww?XLxtF?G2!>;rd6GOWq$-@S*_YAVJ3lsD5=jMlKPS+}osR@T3eryQ-NsG{k zfxoE#el1hK&%xf(BwCQJz8AybcXVVClt06--}|dNg0h#R>#iRWLE(#-nxDxl??fFc zp7uo?=DJ=2-O<+7SEibtqdd(CTkOt%6$78G-c1F^7WJn!?uAiNxcJMj{IMqdY2qn2 zGY1%s%8aj~bT4xb7*y@J{tlX-R^@s@;xMqTv7PYs67MUzIx4&%Te)L56|jCy`^Jui z@LC|%3ein#U1(xsX-FAZZ^`PX3q$>u)$AV(@uylp#GU&?@Vq|-)p>x4HI{*y)wsA# z@)Td|orgL%LOS^M5fruD-<3i_ubf{7_EQadimJ)trX6SD?^wei23|yIJz7xXiU7Cf zd9PeDuD;jU*Kfj79)C01KH+KoeHoT=#!S0GDU6$BNEaF@f-U;Ikr}}iaSEG`djN(c zb6KYMvW2r?>!tg*U!XG2^s944u(VP?dsuPxj_yqpx^l*fyU|fvT3(l!RBqqup-_*H zr)w1pw4>omRSr>*8NSq54LpW%8L7TeM<4dfM5*KArC*FPDd1Ts;-x>UIXZk5?bRZk zWmJhzHW}Pc%3%mr2^~Ti7P>E_I}m~XR+H&Pkk(6Vz|&v zChnG=-XT0fg@@Y5d)xNMfn(`ZRj)cgDH!XhCdSo&T4l-_08=6wW$#s({p_uM#(@+G zU;0vB@cFU})DW_Abx!G(P|S7r*oc_082I4b(vAkVFc}s&O(NnkZ+Z1s$q6uTDtt61 z?!7r>a?cH>rG_`}(wB>JwaL4f%?MW>1UOyh%|bc|J>=JP6ay&7n>qa`jcS{Ai)67Y z^6#roHFG>-=gSi{=Pi=FxTVYmzp3$6m@kzVW3bi*kp8jq;5TLQ4FxuvNUAI}bHWhvg71ghve8MP~vgAXP zZq6?;d_|H`T*_&_!#9c~Z-h%l*dkK;8Y(m44sBpzdGVKbN~3J6rgckR(@FG~av1;W zLEC89eF!|O0+>y>%wbN$X+6;C4ncZuK(=|?Q%SGYar!yWqFOdo?WmJ`piGZ>G0xka zF7Y62(T|juQ(WRz_GN%T#^ zlMQX~@fjDWJtM-*vrQif&U~-@dwa472R!_%GlEGjHU@Ua1@)r$x{wRAL1K&occ{m8&d+hdp7Um#fAMf7eTcwwfH0#(+y2GIo zUp*&5Rav%VdVMjCyT}~V2A1TEc!7##J39_o%=Ip#2oob?7ez5Ox5%WPeE7~_jQHw` zG7vzV?Jq)N7e;y>QuO%INv0nv{lM%9NM#f7+YGJ~JF@Z01;+->3BcWd{pGmPD3{Bt>HwKf+!0~|jEW24 zY5YZ!D6XuDpbR{)LDq|D@u&D}X?Xz@ZvXc)qxDi$cGMud$E@!jau{=nW@0Hs0U;Xm zysG~ipDPV`o-?4Ba>u||XqEU|+|K}RHA^S}IBF_CFYIwyMJ>05avON}ie7?3%RRuv zZZK_XgG-BsffyPO9~AP3kRwCBvQ*QPu8`7~7U|)BDO))>6CeVWjKC6p z<7(Dtf@b8VxWz%Qn2KR~2SI79^| za3*wcnh9E0D`HDDnc3~Lo7xI}mW-PzWcS<*VJBPBVE#pZ#-?>D#TX!9;JYqpj{{+$ z;3EqkGVs~s^)9kM-VgN#7LUZU*vHEdQiozIVmX@LM4qIF;HRkm(ImkBcsCwhMkz-W zn6@PS6{%3&XDiK?qrjBXWl<3Z9&qIZUPB0f!aW`eCKVA%Y56;Ppe8E~dH6#oz^536 zhvw1zY42erfN2d6U+@7k$1L7v>@@?Bo=T@jUGWh~cUuMlDJLI&VFq5>tLbgNW#CU$ z{;@tP1i3T_u-QKbI5lSBA_xLF%-(CRg0JP;;mJL(j+A9CRr0K_d_3_t#i6FEAeSQd zU+v@&QOZ0B?51=sDwP~~*UWRduE!!q0iCQ=6o#<`TOc$jfv~gYd+xXx^zg}g3E-r& z!IWvnbNGRvLXO8|f!H2n%y%wjR97ZC#}g>li0)4YKo8F}vlm1u-fRLvG>iVHKA z%aqGU5r4>h#!MIx7eTdv14s|v`@=$VSvM#_Si6_`PH(#8GX6^Epf`$%1VAyWiNtLt z2sOY`lnU8im2$ZR!l}4o!2HxHV`wC)BlWy7*(Me`(W(Taf|IDZ)WB7cExP4E=Jl3S zl6H7fkEd{ikem?&glb5oVnoJ=0Xe=Wfsd>uaGw38B(QJkdaq*C3_&RaY>{Ss5sej1X7dBy z7^-_eA&0@av~vKp1{D~eLI1WZCLkd9EfB=J?v_S1heeWO=@!~e81vPN zvzD8jEXc+K=o{aJ@Ua)-V^pM6`yEWzbp*-LUq2iLq_h&6o9w(D>dKw&bLF?P8(MDD z0HZTo7le}mb-RJd>Mpx}k;~VFSwW5bI4G5aS+DfpNFn=FN)R2{I@qV$?}KBV6S_sn zWlZ&y%l@c*ua5oGJ9H8+4kFTx+cg^|9e`9Ae-lWxIm#4zpmIiE^Rs-&#|!R4c7U4K zzLXnyNRrhU#$~4ZW>EXw=tV^5Y3AlJ*{V>!OIT`J65cX2SvsMi9arV{A#x#JR?Gcr=_e!&8^JpaJw@mW=o@To|dqMZKwl|VV=J9EVeshE1+ zqzjPT#b4nv(&#Y2cCbu4G4@Dsz%_;w*Zes;_}-ul2;BKJTj0g+PNLPja4i3lSOQNUL!fdE!El8$Ql z%)}LQiho+J0$eV|2>+`j8q{V20$h9zlA9LJZlE_81(E#yBFnlA2vSe;?6ov{7jDVP5jQ&u^*0BhPjG3ewp&`)k@y3O$Zw20?mg)pPae@)C{uwSDV_o1ec$M9Agd?8 z_-=W#&_SK7g#uG~a_mecwlX{hwBVLr8uhVE<)j4tn&DUBWc5jyw*!n{&=Dl(;<7L? zfq3HiE{jAKETgn$8nrYaL~}h>2uL&_oDLiTrabg>NjZ;n2CB#hG(n7_?dz7mz-ioH zD|g$k959;hm6`1MjUNW(m4$%fM$_e#H0zsHMmrQu zh6#g##aAt^77J9Yq`Le$Ejp%NZ(8xwth5HH0lc&hDCROCX=*(l6de=FH?3F>tTHlP z(Z!qV0kaRYnB3kET{hjcLVveUo96 z>V)W6Hny~8I&e94iRp<=`Nu~t>1n>ljl}`9<)#&Kn1JE{rDC*&N5ZD&)^+ulB^8fYpwH2# zqj*r@5U~9&CjN7@GBALIfE{-+*`TOfC~yY^19pn||1@X==tSQ_ad$v&ft~&ypxAGL z4cahBP&7!5`+q-(4B|b89nl@8ah^?wm4qDcA8ub(3OQD89^2jP9_R^eIsVqk6SA{+ z^k{y5Ncn5kjf25-?rf@S>;9WOq1(%$yDx1^^aH&s?>_c5>OU0RvlRI8&B^HD#n(#4 z+gq6ii1pI}AvdXw?H{ltt1kt;oSk@gNDS5!P`_4dln6<+o~a2n?nM+6y9HjM*!CbC z8p1gVvg^y$I^7F(t6U#EN?N{qK@T&PRI8jUvh|Nymm66_E7k z-Gk&O-W54tb!}hfI7Wqieg6V=ZLiyBeeaC7SO$WyPWxp>5-;?&D?)~+R`I+Dx70{{ z5!C=nwb&_hDNwj%#`6Bhb6Ll}cb{nQ6)grm$Diqgav=ko^VD@a(7iu)1UO{v#CA`1u`3Z<vacQ-cn-+ z!;Z^twMJaKXvx6lov8;kB~haxi=q*$$&-h+A2%I_uMgDCHBm`XQw%k_WGO)9mW9 znA#dsh$xxXhNYq7D#r{TWi84PlR);axetgPReRXjstS5E=+<=z;(Wh$r8n^oQFd0w zHzK1}H_>9-S*$-ZaI{?M~h?P=giem&5UDI?>y7!Pgjx9glGOQ`0I z8JBYJC$V4Bqw~4%&L=bH0`9J6yzB5?_rkAaReZL&F??v)fUodMD@L` zK?|8TcC6)Q^r^yh>U8>83+|U_$tVbnXp9sWNJvda=w^yj-$ffwYm6Y|WSk=JmAH0k z`;T;wGz5!8LIV67y+=cU=06q;88M}AP6K|clKHN;-yN-t{f?V|QC|2B82vzF zkl!8WzXj-Y_sd*#sW`hc;)QN~nU$Mm$#!)SA?M4&6}Z^Y5I*=$W1vBPP8+5;KkrhX zo_og9oe-!2r&hkzG-Hqw*y-9l(n*j^)R-(kUS6`*&$E6UQgN`i8f4FMZ+>s~(eB|y z-tp1S@otf$8KUTX??N0Cp6k9Eyu0gu$bsz|=-PXrvHrV9{kz3Z?x6qG;fDThvH#Us zhiTy*yr7m)a!1_|Y7KPW3En^l9S()*;2iK66@nksQUZ7$LyJ-&e z`mtyn32F%gzBT|9aDF6f1{exxz(W-jU?@o_oVWPP+LpH9yi#G>QD( zK|z7MGC|8uXO%+o);{+u*lBlP6C=z%7=CjEOX zu_=SD&W04~Oik*^;v#T*7R^^lr}$)%MxIE63D(Nts`i-_3{cXkqGxpE(?J)^)PIk1X z8P`rxx}Eyu`TV)cxm^@R21U@jc()9;UCyRPX6W95$cR$>>r=;hDGla)@KgE^;^)_Q zC#@iy-SUd2*#dQX-49+!>-0-#_VbzFkuYv4}eVxAi zB>}VVK$50k+<=?fq{p#91$u#y|wE&`}BZ+2TGA4Cu_qwZD~MmP_EDg}=*ym~($7 zo^l*GG-vmBhdkrE_D&ulFX1lc-Yhg4YJHe>Z^tLiCL(kNg5=q4-8K!$U^?}*1Xnw` zAMK9kIAy(jTL0y*=sr8xsn}=D-ps+bE;a_)7`Dw`aT0wrJ#D3RI?q)Ywy?UN#~i9^ z8#G|p{9$zt*}BN+++Nf7ro0vd*prb9O&>;a||p+4cS|w z4b@Jowd`Vxq0Uf=y0M#=xqG(tsA#ew?T@Kl&|sPwMKLeejgX6{ghW74`alQK!PVe1 z9XBcW`r?K%9^@Ph+&Kz1t=B&4I(I2F5qlFUS;a(v9;Alx(l4 zUc3U;pE~=Al0sQ1hU%6b{L>xwq|-2O(3FewcCNm`y( zJgnD_YWlRH1at0f;WdW0H&`MfyI433qe)QJF79WMW=ZG2Qc|me^tn^$1XNYjDL8H{ z=Np!BS41rpI4)Q&2$8rNPcf<1a-)83rzcOB7{U3vdreZF zNBmCRt5i?QAeNN~z1H+kATtJ~uu{-%7SPp)aT1_&Z%``9ewXM$ypNJtizx6nrjjle z@ifG$Czp0vveT%k!S%IM&h+qbM9w};D=;aLEvzu0)z2xrK59HF>KJ| zI7Pa06_OJtjxx>98oaRZd;zp6zt2GvUVx2+V@j&c z9$y{jUXEpICqA`zgTG)1WC>&qq`sCI9L_&W@_Zq0Yo(wYBid&C$-?upr>Q4fJ>*Z; zk1VV4XWKW{dVk_}Q9)-z=#&!kc!k*F>7PWXtB7F43hoV}d zA7!SYfK<@q7HAeQP2-w-A-@o$f(yt5qP5Wq?t2Dec z)HIbebxc2_uC%hZT1Wkgs!E7Yu#cAodqWf@O{AD5`sF-inq5_7Z^@ZE4%-p?*s(p5 zI#M6p111?|pUY{%AHmN{M)zHKOa1L-U$ofZIo)$AaWrw~wOL2*XSBNE5-oH!rs>~3Zc+s&vWwmZK)9`5CXCTO!$5-W=Kcdp->bK4D z$=BxJgWm0Na+YXiYQ^;y`gd&XZg9;fG%Pg`t(L8eJs9hdbxj_IzSh2pf#C;~XM@k? z-n`!#QCjX)F|Rz}WHWBVSH)a4;pVihxh+g+dq429EpRY;mTeCG!)wNLKyI>b%(rLY zBO!7|IJ71-=~(vo{Ar8R)3?9fF2B+FDm5%vxhS7nc})52oxVG2uN8Sc)$O{rlU{Bx z&7Ya~jl45>d+Gz#o!BphU$SOPIrpzjH+U+%zFr{zd7tl# zntz=5m2tD|=G|S^c7bexo2)~B!2b~YV--=5P1C6sSu3?68U6h2^VsJ_z1itMtYifkSY>dklX1^pwo6YAjNWcTKC_X`{E~*gzac1h zbw@pym@7fBYnWWu>Ak!jywPG4X7Sp3$THvZ=?gnj@qEv|@nA*qqX5(8Dsc-nt97n8LyPRyjTiN@VDWP}r_4t*N3LPFT zKLJjyN-a^%8NxS@=H1jj(^T8wse3iJr*8|~229?Xd>&tZKV5f0$W(j)#`FH-vnDTE z?9J}P^lRzg|J28r8q~!-)q1970kN92#_8AAE(LpQZmB1j>I7Cg{wSH6cif@k zBPONBrWQLMRd%a=0WJIeG=)*I!4HBF2Nz{ga@(PO`;846h2=Gits$R77QJMCJ# z)7BOiu`OABM2w{l3;al2zIdI>ucofExb#b4@9GZEh@y#?)tuiX{hL1?vgPT1%ger= z>b|lk(cICEuxr*UkX*T-+1P{B5!0rs+?VG(e++jWCVyTwm>_&qujg&* zez~_`vR^r0=jAalZSKVxT=zA1xGDA^bLV`B^sj2`C8*-{CB>_2cCW*)VW+DvE8&he zL;hUg-NUp;^dmR;-{0&M489Cgvf1@J)EK{?QQEE~dTe#@_Uoi?i{GK71jjX*Bdg8B zk*X})vX*zp!z;K|(!0%HrjRN}oAf^oLcKnTExRATeBjF_)v#K%eYg7*M!wvfx#{~np*J(NAX@P7jsU`V{9BM!LD zLcXqI36++Cx#K;&y`TtL*e!1Y3Fn4$Br(9HrC|;P-~x`97aj_8B4T`i8W)50CYQQc zJQ3^d?&yZ|gThF7HwOUZj=>TM9#EJ)5l2P@W@4~d;DQVT%mELO^1^#TVd5SHk^{~W z3L~B{i3JV;Zf=-=fjfJ9IAMt1?rs=w5)?*oB6#3j0cx0&mA%N7yY;`^>^Uj>|1QM; zy4rJcD1f28?MWv$eaJxIw4sY}2Ur3Q3MX>;3r8L2=7S^Qu^0x}-*SPHCHk*7fldw+ z{|f>@(N7lqzY!2qA-FmGKdAT-agGcUP`D(6#6Lf%tc;A54Ac?&4@?p`*pPovkAGlr zS>WLHKVh=PToi-C(D(oxAOk2&!^4pP91%|rbcO^12}-`eb)pq7 V5{5`RkrqNmT7p49Kur(D@L$d`ZlwSK literal 0 HcmV?d00001 diff --git a/misc/Ependymoma/netDx_DNAm_tSNE.R b/misc/Ependymoma/netDx_DNAm_tSNE.R new file mode 100644 index 00000000..831da606 --- /dev/null +++ b/misc/Ependymoma/netDx_DNAm_tSNE.R @@ -0,0 +1,47 @@ +# plot tSNE of DNAm of ependymoma +require(Rtsne) + +# ----------------------------------------------- +# helper plotting function +plot_cluster=function(data, var_cluster, palette) +{ + ggplot(data, aes_string(x="V1", y="V2", color=var_cluster)) + + geom_point(size=0.25) + + guides(colour=guide_legend(override.aes=list(size=6))) + + xlab("") + ylab("") + + ggtitle("") + + theme_light(base_size=20) + + theme(axis.text.x=element_blank(), + axis.text.y=element_blank(), + legend.direction = "horizontal", + legend.position = "bottom", + legend.box = "horizontal") + + scale_colour_brewer(palette = palette) +} +# ----------------------------------------------- + +mData <- "/home/shraddhapai/BaderLab/2018_Epen_DNAm/input/GSE90496_EPN_PFAB_beta.txt.gz" + +orig_dat <- read.delim(mData,sep="\t",h=T,as.is=T) + +# tSNE +cat("computing tSNE\n") +set.seed(9) +t0 <- Sys.time() +dtsne <- Rtsne(as.matrix(orig_dat),check_duplicates=FALSE,pca=TRUE, + perplexity=30,theta=0.5,dims=2) +t1 <- Sys.time() +cat(sprintf("tSNE computation took %i seconds\n", t1-t0)) +dat <- as.data.frame(dtsne$Y) + +# kmeans +cat("kmeans clustering\n") +clust_km <- kmeans(scale(dat),2) +cl_kmeans <- factor(clust_km) + +#cat("hclustering\n") +#fit_km_hclust <- hclust(dist(scale(dat)) +#cl_hclust <- factor(cutree(fit_km_hclust,k=2)) + +pdf("tsne_DNAm.pdf") +plot_cluster(dat,"kmeans","Accent") diff --git a/misc/Ependymoma/netDx_RuthData.R b/misc/Ependymoma/netDx_RuthData.R index 9ece1dff..55aa0d8d 100644 --- a/misc/Ependymoma/netDx_RuthData.R +++ b/misc/Ependymoma/netDx_RuthData.R @@ -7,7 +7,7 @@ require(netDx.examples) rootDir <- "/home/shraddhapai/BaderLab/2017_Ependymoma" inDir <- sprintf("%s/input/netDx_prepared",rootDir) outDir <- sprintf("%s/output",rootDir) -pathFile <-sprintf("%s/anno/Human_AllPathways_November_01_2017_symbol.gmt", +pathFile <-sprintf("%s/anno/Human_AllPathways_February_01_2018_symbol.gmt", rootDir) load(sprintf("%s/Ependymoma_cohortMerged_180125.Rdata",inDir)) @@ -35,7 +35,7 @@ makeNets <- function(dataList, groupList, netDir,...) { } dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/Epen_%s",outDir,dt) +megaDir <- sprintf("%s/Epen_2_%s",outDir,dt) if (!file.exists(megaDir)) dir.create(megaDir) gps <- list(rna=pathwayList) @@ -47,4 +47,4 @@ runPredictor_nestedCV(pheno, dataList=dats,groupList=gps, makeNetFunc=makeNets, ### custom network creation function outDir=sprintf("%s/pred",megaDir), - numCores=4L,nFoldCV=10L, CVcutoff=9L,numSplits=10L) + numCores=8L,nFoldCV=10L, CVcutoff=9L,numSplits=10L,startAt=8L) diff --git a/misc/Ependymoma/netDx_pruned.R b/misc/Ependymoma/netDx_pruned.R new file mode 100644 index 00000000..49e1930c --- /dev/null +++ b/misc/Ependymoma/netDx_pruned.R @@ -0,0 +1,81 @@ +# Ependymoma +rm(list=ls()) + +require(netDx) +require(netDx.examples) + +rootDir <- "/home/shraddhapai/BaderLab/2017_Ependymoma" +inDir <- sprintf("%s/input/netDx_prepared",rootDir) +outDir <- sprintf("%s/output",rootDir) +pathFile <-sprintf("%s/anno/Human_AllPathways_February_01_2018_symbol.gmt", + rootDir) +load(sprintf("%s/Ependymoma_cohortMerged_180125.Rdata",inDir)) + +# exclude ST +idx <- which(pheno$STATUS=="ST") +pheno <- pheno[-idx,] +xpr <- xpr[,-idx] + +pathwayList <- readPathways(pathFile) +head(pathwayList) + +makeNets <- function(dataList, groupList, netDir,...) { + netList <- c() + # make RNA nets: group by pathway + if (!is.null(groupList[["rna"]])) { + netList <- makePSN_NamedMatrix(dataList$rna, + rownames(dataList$rna), + groupList[["rna"]],netDir,verbose=FALSE, + writeProfiles=TRUE,...) + netList <- unlist(netList) + cat(sprintf("Made %i RNA pathway nets\n", length(netList))) + } + cat(sprintf("Total of %i nets\n", length(netList))) + return(netList) +} + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/Epen_prunedPathway_0.001_%s",outDir,dt) +if (!file.exists(megaDir)) dir.create(megaDir) + +gps <- list(rna=pathwayList) +dats <- list(rna=xpr) +pheno$STATUS <- droplevels(pheno$STATUS) + +#### ----------------------------------------------------- +### BEGIN PRUNING CODE +# apply pruning to proteomic data +curwd <- getwd() +setwd("../PanCancer") +source("LMprune.R") +source("runLM.R") +source("silh.R") +require(cluster) +setwd(curwd) +for (nm in setdiff(names(dats),"clinical")) { +print(nm) + #if (nrow(dats[[nm]])>10000) topVar <- 50 else topVar <- 100 + pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) + prune <- LMprune(dats[[nm]],pheno$STATUS,topVar=100) + dev.off() + if (!is.na(prune)) { + if (prune$bestThresh < 1) { + res <- prune$res + res <- subset(res, adj.P.Val < 0.001) + tmp <- dats[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] + dats[[nm]] <- tmp + cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) + cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) + } + } else { + cat(sprintf("%s: not pruning\n",nm)) + } +} +#### ---------------------------------------------------------- + +runPredictor_nestedCV(pheno, + dataList=dats,groupList=gps, + makeNetFunc=makeNets, ### custom network creation function + outDir=sprintf("%s/pred",megaDir), + numCores=8L,nFoldCV=10L, CVcutoff=9L,numSplits=10L,startAt=1L) diff --git a/misc/Ependymoma/netDx_prunedDNAm.R b/misc/Ependymoma/netDx_prunedDNAm.R new file mode 100644 index 00000000..aca56921 --- /dev/null +++ b/misc/Ependymoma/netDx_prunedDNAm.R @@ -0,0 +1,105 @@ +# Ependymoma - DNA methylation after Nature brain tumour paper +rm(list=ls()) + +require(netDx) +require(netDx.examples) + +rootDir <- "/home/shraddhapai/BaderLab/2017_Ependymoma" +outDir <- sprintf("%s/output",rootDir) + +phenoFile <- "/home/shraddhapai/BaderLab/2018_Epen_DNAm/input/GSE90496_pData.txt" +dnaFile <- "/home/shraddhapai/BaderLab/2018_Epen_DNAm/input/GSE90496_EPN_PFAB_beta.txt.gz" + +# ---------------------- +# input processing +pheno <- read.delim(phenoFile,sep="\t",h=T,as.is=T) +ttype <- pheno$characteristics_ch1 +idx <- which(ttype %in% c("methylation class: EPN, PF A","methylation class: EPN, PF B")) +cat(sprintf("Got %i samples\n",length(idx))) +pheno <- pheno[idx,] # limit to EPN samples +cpos <- regexpr("sample", pheno$title) +bpos <- regexpr("\\[reference", pheno$title) +str <- as.integer(substr(pheno$title, cpos+7, bpos-2)) # get sample number +pheno$ID <- paste("SAMPLE", str,sep=".") +pheno <- pheno[,c("ID","characteristics_ch1")] +st <- rep("",nrow(pheno)) +st[grep("PF A", pheno[,2])] <- "PFA" +st[grep("PF B", pheno[,2])] <- "PFB" +pheno$STATUS <- st + +# ---------------------- +makeNets <- function(dataList, groupList, netDir,...) { + netList <- c() + # make RNA nets: group by pathway + if (!is.null(groupList[["dnam"]])) { + netList <- makePSN_NamedMatrix(dataList$dnam, + rownames(dataList$dnam), + groupList[["dnam"]],netDir,verbose=FALSE, + writeProfiles=TRUE,...) + netList <- unlist(netList) + cat(sprintf("Made %i RNA pathway nets\n", length(netList))) + } + cat(sprintf("Total of %i nets\n", length(netList))) + return(netList) +} + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/Epen_prunedOneNet_0.001_%s",outDir,dt) +if (!file.exists(megaDir)) dir.create(megaDir) + +xpr <- read.delim(dnaFile,sep="\t",h=T,as.is=T) +rownames(xpr) <- paste("probe",1:nrow(xpr),sep="") +# match pheno and methylation values +midx <- match(colnames(xpr),pheno$ID) +if (all.equal(colnames(xpr),pheno$ID[midx])!=TRUE){ + cat("don't match\n") +browser() +} +pheno <- pheno[midx,] + +gps <- list(dnam=list(dnam=rownames(xpr))) +dats <- list(dnam=xpr) + +#### ----------------------------------------------------- +### BEGIN PRUNING CODE +# apply pruning to proteomic data +curwd <- getwd() +setwd("../PanCancer") +source("LMprune.R") +source("runLM.R") +source("silh.R") +require(cluster) +setwd(curwd) +for (nm in setdiff(names(dats),"clinical")) { +print(nm) + if (nrow(dats[[nm]])>10000) topVar <- 50 else topVar <- 100 + pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) + prune <- LMprune(dats[[nm]],pheno$STATUS,topVar=topVar) + dev.off() + if (!is.na(prune)) { + if (prune$bestThresh < 1) { + res <- prune$res + res <- subset(res, adj.P.Val < prune$bestThresh) + + require(dataExplore) + cat("add hclust\n"); browser() + + tmp <- dats[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] + dats[[nm]] <- tmp + gps[[nm]] <- list(dnam=rownames(tmp)) + cat(sprintf("%s: Pruning with cutoff %1.2f\n", + nm,prune$bestThresh)) + cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) + } + } else { + cat(sprintf("%s: not pruning\n",nm)) + } +} +#### ---------------------------------------------------------- + +runPredictor_nestedCV(pheno, + dataList=dats,groupList=gps, + makeNetFunc=makeNets, ### custom network creation function + outDir=sprintf("%s/pred",megaDir), + numCores=8L,nFoldCV=10L, CVcutoff=9L,numSplits=10L,startAt=1L) diff --git a/misc/Ependymoma/netDx_prunedOneNet.R b/misc/Ependymoma/netDx_prunedOneNet.R new file mode 100644 index 00000000..9c35681c --- /dev/null +++ b/misc/Ependymoma/netDx_prunedOneNet.R @@ -0,0 +1,79 @@ +# Ependymoma +rm(list=ls()) + +require(netDx) +require(netDx.examples) + +rootDir <- "/home/shraddhapai/BaderLab/2017_Ependymoma" +inDir <- sprintf("%s/input/netDx_prepared",rootDir) +outDir <- sprintf("%s/output",rootDir) +pathFile <-sprintf("%s/anno/Human_AllPathways_February_01_2018_symbol.gmt", + rootDir) +load(sprintf("%s/Ependymoma_cohortMerged_180125.Rdata",inDir)) + +# exclude ST +idx <- which(pheno$STATUS=="ST") +pheno <- pheno[-idx,] +xpr <- xpr[,-idx] + +makeNets <- function(dataList, groupList, netDir,...) { + netList <- c() + # make RNA nets: group by pathway + if (!is.null(groupList[["rna"]])) { + netList <- makePSN_NamedMatrix(dataList$rna, + rownames(dataList$rna), + groupList[["rna"]],netDir,verbose=FALSE, + writeProfiles=TRUE,...) + netList <- unlist(netList) + cat(sprintf("Made %i RNA pathway nets\n", length(netList))) + } + cat(sprintf("Total of %i nets\n", length(netList))) + return(netList) +} + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/Epen_prunedOneNet_0.001_%s",outDir,dt) +if (!file.exists(megaDir)) dir.create(megaDir) + +gps <- list(rna=list(rna=rownames(xpr))) +dats <- list(rna=xpr) +pheno$STATUS <- droplevels(pheno$STATUS) + +#### ----------------------------------------------------- +### BEGIN PRUNING CODE +# apply pruning to proteomic data +curwd <- getwd() +setwd("../PanCancer") +source("LMprune.R") +source("runLM.R") +source("silh.R") +require(cluster) +setwd(curwd) +for (nm in setdiff(names(dats),"clinical")) { +print(nm) + #if (nrow(dats[[nm]])>10000) topVar <- 50 else topVar <- 100 + pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) + prune <- LMprune(dats[[nm]],pheno$STATUS,topVar=100) + dev.off() + if (!is.na(prune)) { + if (prune$bestThresh < 1) { + res <- prune$res + res <- subset(res, adj.P.Val < 0.001) + tmp <- dats[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] + dats[[nm]] <- tmp + gps[[nm]] <- list(rna=rownames(tmp)) + cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) + cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) + } + } else { + cat(sprintf("%s: not pruning\n",nm)) + } +} +#### ---------------------------------------------------------- + +runPredictor_nestedCV(pheno, + dataList=dats,groupList=gps, + makeNetFunc=makeNets, ### custom network creation function + outDir=sprintf("%s/pred",megaDir), + numCores=8L,nFoldCV=10L, CVcutoff=9L,numSplits=10L,startAt=1L) diff --git a/misc/Ependymoma/netDx_prunedPath_lenient.R b/misc/Ependymoma/netDx_prunedPath_lenient.R new file mode 100644 index 00000000..7d7f2e18 --- /dev/null +++ b/misc/Ependymoma/netDx_prunedPath_lenient.R @@ -0,0 +1,81 @@ +# Ependymoma +rm(list=ls()) + +require(netDx) +require(netDx.examples) + +rootDir <- "/home/shraddhapai/BaderLab/2017_Ependymoma" +inDir <- sprintf("%s/input/netDx_prepared",rootDir) +outDir <- sprintf("%s/output",rootDir) +pathFile <-sprintf("%s/anno/Human_AllPathways_February_01_2018_symbol.gmt", + rootDir) +load(sprintf("%s/Ependymoma_cohortMerged_180125.Rdata",inDir)) + +# exclude ST +idx <- which(pheno$STATUS=="ST") +pheno <- pheno[-idx,] +xpr <- xpr[,-idx] + +pathwayList <- readPathways(pathFile) +head(pathwayList) + +makeNets <- function(dataList, groupList, netDir,...) { + netList <- c() + # make RNA nets: group by pathway + if (!is.null(groupList[["rna"]])) { + netList <- makePSN_NamedMatrix(dataList$rna, + rownames(dataList$rna), + groupList[["rna"]],netDir,verbose=FALSE, + writeProfiles=TRUE,...) + netList <- unlist(netList) + cat(sprintf("Made %i RNA pathway nets\n", length(netList))) + } + cat(sprintf("Total of %i nets\n", length(netList))) + return(netList) +} + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/Epen_prunedPathway_0.01_%s",outDir,dt) +if (!file.exists(megaDir)) dir.create(megaDir) + +gps <- list(rna=pathwayList) +dats <- list(rna=xpr) +pheno$STATUS <- droplevels(pheno$STATUS) + +#### ----------------------------------------------------- +### BEGIN PRUNING CODE +# apply pruning to proteomic data +curwd <- getwd() +setwd("../PanCancer") +source("LMprune.R") +source("runLM.R") +source("silh.R") +require(cluster) +setwd(curwd) +for (nm in setdiff(names(dats),"clinical")) { +print(nm) + #if (nrow(dats[[nm]])>10000) topVar <- 50 else topVar <- 100 + pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) + prune <- LMprune(dats[[nm]],pheno$STATUS,topVar=100) + dev.off() + if (!is.na(prune)) { + if (prune$bestThresh < 1) { + res <- prune$res + res <- subset(res, adj.P.Val < 0.01) + tmp <- dats[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] + dats[[nm]] <- tmp + cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) + cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) + } + } else { + cat(sprintf("%s: not pruning\n",nm)) + } +} +#### ---------------------------------------------------------- + +runPredictor_nestedCV(pheno, + dataList=dats,groupList=gps, + makeNetFunc=makeNets, ### custom network creation function + outDir=sprintf("%s/pred",megaDir), + numCores=8L,nFoldCV=10L, CVcutoff=9L,numSplits=10L,startAt=1L) From 89e6d3fc2805ab92fe14c668c091230d66a7ece2 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Fri, 20 Apr 2018 15:47:40 -0400 Subject: [PATCH 030/124] moved buggy code to own folder --- misc/Ependymoma/buggy/netDx_pruned.R | 81 ++++++++++++++ misc/Ependymoma/buggy/netDx_prunedDNAm.R | 105 ++++++++++++++++++ misc/Ependymoma/buggy/netDx_prunedOneNet.R | 79 +++++++++++++ .../buggy/netDx_prunedPath_lenient.R | 81 ++++++++++++++ 4 files changed, 346 insertions(+) create mode 100644 misc/Ependymoma/buggy/netDx_pruned.R create mode 100644 misc/Ependymoma/buggy/netDx_prunedDNAm.R create mode 100644 misc/Ependymoma/buggy/netDx_prunedOneNet.R create mode 100644 misc/Ependymoma/buggy/netDx_prunedPath_lenient.R diff --git a/misc/Ependymoma/buggy/netDx_pruned.R b/misc/Ependymoma/buggy/netDx_pruned.R new file mode 100644 index 00000000..49e1930c --- /dev/null +++ b/misc/Ependymoma/buggy/netDx_pruned.R @@ -0,0 +1,81 @@ +# Ependymoma +rm(list=ls()) + +require(netDx) +require(netDx.examples) + +rootDir <- "/home/shraddhapai/BaderLab/2017_Ependymoma" +inDir <- sprintf("%s/input/netDx_prepared",rootDir) +outDir <- sprintf("%s/output",rootDir) +pathFile <-sprintf("%s/anno/Human_AllPathways_February_01_2018_symbol.gmt", + rootDir) +load(sprintf("%s/Ependymoma_cohortMerged_180125.Rdata",inDir)) + +# exclude ST +idx <- which(pheno$STATUS=="ST") +pheno <- pheno[-idx,] +xpr <- xpr[,-idx] + +pathwayList <- readPathways(pathFile) +head(pathwayList) + +makeNets <- function(dataList, groupList, netDir,...) { + netList <- c() + # make RNA nets: group by pathway + if (!is.null(groupList[["rna"]])) { + netList <- makePSN_NamedMatrix(dataList$rna, + rownames(dataList$rna), + groupList[["rna"]],netDir,verbose=FALSE, + writeProfiles=TRUE,...) + netList <- unlist(netList) + cat(sprintf("Made %i RNA pathway nets\n", length(netList))) + } + cat(sprintf("Total of %i nets\n", length(netList))) + return(netList) +} + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/Epen_prunedPathway_0.001_%s",outDir,dt) +if (!file.exists(megaDir)) dir.create(megaDir) + +gps <- list(rna=pathwayList) +dats <- list(rna=xpr) +pheno$STATUS <- droplevels(pheno$STATUS) + +#### ----------------------------------------------------- +### BEGIN PRUNING CODE +# apply pruning to proteomic data +curwd <- getwd() +setwd("../PanCancer") +source("LMprune.R") +source("runLM.R") +source("silh.R") +require(cluster) +setwd(curwd) +for (nm in setdiff(names(dats),"clinical")) { +print(nm) + #if (nrow(dats[[nm]])>10000) topVar <- 50 else topVar <- 100 + pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) + prune <- LMprune(dats[[nm]],pheno$STATUS,topVar=100) + dev.off() + if (!is.na(prune)) { + if (prune$bestThresh < 1) { + res <- prune$res + res <- subset(res, adj.P.Val < 0.001) + tmp <- dats[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] + dats[[nm]] <- tmp + cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) + cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) + } + } else { + cat(sprintf("%s: not pruning\n",nm)) + } +} +#### ---------------------------------------------------------- + +runPredictor_nestedCV(pheno, + dataList=dats,groupList=gps, + makeNetFunc=makeNets, ### custom network creation function + outDir=sprintf("%s/pred",megaDir), + numCores=8L,nFoldCV=10L, CVcutoff=9L,numSplits=10L,startAt=1L) diff --git a/misc/Ependymoma/buggy/netDx_prunedDNAm.R b/misc/Ependymoma/buggy/netDx_prunedDNAm.R new file mode 100644 index 00000000..aca56921 --- /dev/null +++ b/misc/Ependymoma/buggy/netDx_prunedDNAm.R @@ -0,0 +1,105 @@ +# Ependymoma - DNA methylation after Nature brain tumour paper +rm(list=ls()) + +require(netDx) +require(netDx.examples) + +rootDir <- "/home/shraddhapai/BaderLab/2017_Ependymoma" +outDir <- sprintf("%s/output",rootDir) + +phenoFile <- "/home/shraddhapai/BaderLab/2018_Epen_DNAm/input/GSE90496_pData.txt" +dnaFile <- "/home/shraddhapai/BaderLab/2018_Epen_DNAm/input/GSE90496_EPN_PFAB_beta.txt.gz" + +# ---------------------- +# input processing +pheno <- read.delim(phenoFile,sep="\t",h=T,as.is=T) +ttype <- pheno$characteristics_ch1 +idx <- which(ttype %in% c("methylation class: EPN, PF A","methylation class: EPN, PF B")) +cat(sprintf("Got %i samples\n",length(idx))) +pheno <- pheno[idx,] # limit to EPN samples +cpos <- regexpr("sample", pheno$title) +bpos <- regexpr("\\[reference", pheno$title) +str <- as.integer(substr(pheno$title, cpos+7, bpos-2)) # get sample number +pheno$ID <- paste("SAMPLE", str,sep=".") +pheno <- pheno[,c("ID","characteristics_ch1")] +st <- rep("",nrow(pheno)) +st[grep("PF A", pheno[,2])] <- "PFA" +st[grep("PF B", pheno[,2])] <- "PFB" +pheno$STATUS <- st + +# ---------------------- +makeNets <- function(dataList, groupList, netDir,...) { + netList <- c() + # make RNA nets: group by pathway + if (!is.null(groupList[["dnam"]])) { + netList <- makePSN_NamedMatrix(dataList$dnam, + rownames(dataList$dnam), + groupList[["dnam"]],netDir,verbose=FALSE, + writeProfiles=TRUE,...) + netList <- unlist(netList) + cat(sprintf("Made %i RNA pathway nets\n", length(netList))) + } + cat(sprintf("Total of %i nets\n", length(netList))) + return(netList) +} + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/Epen_prunedOneNet_0.001_%s",outDir,dt) +if (!file.exists(megaDir)) dir.create(megaDir) + +xpr <- read.delim(dnaFile,sep="\t",h=T,as.is=T) +rownames(xpr) <- paste("probe",1:nrow(xpr),sep="") +# match pheno and methylation values +midx <- match(colnames(xpr),pheno$ID) +if (all.equal(colnames(xpr),pheno$ID[midx])!=TRUE){ + cat("don't match\n") +browser() +} +pheno <- pheno[midx,] + +gps <- list(dnam=list(dnam=rownames(xpr))) +dats <- list(dnam=xpr) + +#### ----------------------------------------------------- +### BEGIN PRUNING CODE +# apply pruning to proteomic data +curwd <- getwd() +setwd("../PanCancer") +source("LMprune.R") +source("runLM.R") +source("silh.R") +require(cluster) +setwd(curwd) +for (nm in setdiff(names(dats),"clinical")) { +print(nm) + if (nrow(dats[[nm]])>10000) topVar <- 50 else topVar <- 100 + pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) + prune <- LMprune(dats[[nm]],pheno$STATUS,topVar=topVar) + dev.off() + if (!is.na(prune)) { + if (prune$bestThresh < 1) { + res <- prune$res + res <- subset(res, adj.P.Val < prune$bestThresh) + + require(dataExplore) + cat("add hclust\n"); browser() + + tmp <- dats[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] + dats[[nm]] <- tmp + gps[[nm]] <- list(dnam=rownames(tmp)) + cat(sprintf("%s: Pruning with cutoff %1.2f\n", + nm,prune$bestThresh)) + cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) + } + } else { + cat(sprintf("%s: not pruning\n",nm)) + } +} +#### ---------------------------------------------------------- + +runPredictor_nestedCV(pheno, + dataList=dats,groupList=gps, + makeNetFunc=makeNets, ### custom network creation function + outDir=sprintf("%s/pred",megaDir), + numCores=8L,nFoldCV=10L, CVcutoff=9L,numSplits=10L,startAt=1L) diff --git a/misc/Ependymoma/buggy/netDx_prunedOneNet.R b/misc/Ependymoma/buggy/netDx_prunedOneNet.R new file mode 100644 index 00000000..9c35681c --- /dev/null +++ b/misc/Ependymoma/buggy/netDx_prunedOneNet.R @@ -0,0 +1,79 @@ +# Ependymoma +rm(list=ls()) + +require(netDx) +require(netDx.examples) + +rootDir <- "/home/shraddhapai/BaderLab/2017_Ependymoma" +inDir <- sprintf("%s/input/netDx_prepared",rootDir) +outDir <- sprintf("%s/output",rootDir) +pathFile <-sprintf("%s/anno/Human_AllPathways_February_01_2018_symbol.gmt", + rootDir) +load(sprintf("%s/Ependymoma_cohortMerged_180125.Rdata",inDir)) + +# exclude ST +idx <- which(pheno$STATUS=="ST") +pheno <- pheno[-idx,] +xpr <- xpr[,-idx] + +makeNets <- function(dataList, groupList, netDir,...) { + netList <- c() + # make RNA nets: group by pathway + if (!is.null(groupList[["rna"]])) { + netList <- makePSN_NamedMatrix(dataList$rna, + rownames(dataList$rna), + groupList[["rna"]],netDir,verbose=FALSE, + writeProfiles=TRUE,...) + netList <- unlist(netList) + cat(sprintf("Made %i RNA pathway nets\n", length(netList))) + } + cat(sprintf("Total of %i nets\n", length(netList))) + return(netList) +} + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/Epen_prunedOneNet_0.001_%s",outDir,dt) +if (!file.exists(megaDir)) dir.create(megaDir) + +gps <- list(rna=list(rna=rownames(xpr))) +dats <- list(rna=xpr) +pheno$STATUS <- droplevels(pheno$STATUS) + +#### ----------------------------------------------------- +### BEGIN PRUNING CODE +# apply pruning to proteomic data +curwd <- getwd() +setwd("../PanCancer") +source("LMprune.R") +source("runLM.R") +source("silh.R") +require(cluster) +setwd(curwd) +for (nm in setdiff(names(dats),"clinical")) { +print(nm) + #if (nrow(dats[[nm]])>10000) topVar <- 50 else topVar <- 100 + pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) + prune <- LMprune(dats[[nm]],pheno$STATUS,topVar=100) + dev.off() + if (!is.na(prune)) { + if (prune$bestThresh < 1) { + res <- prune$res + res <- subset(res, adj.P.Val < 0.001) + tmp <- dats[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] + dats[[nm]] <- tmp + gps[[nm]] <- list(rna=rownames(tmp)) + cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) + cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) + } + } else { + cat(sprintf("%s: not pruning\n",nm)) + } +} +#### ---------------------------------------------------------- + +runPredictor_nestedCV(pheno, + dataList=dats,groupList=gps, + makeNetFunc=makeNets, ### custom network creation function + outDir=sprintf("%s/pred",megaDir), + numCores=8L,nFoldCV=10L, CVcutoff=9L,numSplits=10L,startAt=1L) diff --git a/misc/Ependymoma/buggy/netDx_prunedPath_lenient.R b/misc/Ependymoma/buggy/netDx_prunedPath_lenient.R new file mode 100644 index 00000000..7d7f2e18 --- /dev/null +++ b/misc/Ependymoma/buggy/netDx_prunedPath_lenient.R @@ -0,0 +1,81 @@ +# Ependymoma +rm(list=ls()) + +require(netDx) +require(netDx.examples) + +rootDir <- "/home/shraddhapai/BaderLab/2017_Ependymoma" +inDir <- sprintf("%s/input/netDx_prepared",rootDir) +outDir <- sprintf("%s/output",rootDir) +pathFile <-sprintf("%s/anno/Human_AllPathways_February_01_2018_symbol.gmt", + rootDir) +load(sprintf("%s/Ependymoma_cohortMerged_180125.Rdata",inDir)) + +# exclude ST +idx <- which(pheno$STATUS=="ST") +pheno <- pheno[-idx,] +xpr <- xpr[,-idx] + +pathwayList <- readPathways(pathFile) +head(pathwayList) + +makeNets <- function(dataList, groupList, netDir,...) { + netList <- c() + # make RNA nets: group by pathway + if (!is.null(groupList[["rna"]])) { + netList <- makePSN_NamedMatrix(dataList$rna, + rownames(dataList$rna), + groupList[["rna"]],netDir,verbose=FALSE, + writeProfiles=TRUE,...) + netList <- unlist(netList) + cat(sprintf("Made %i RNA pathway nets\n", length(netList))) + } + cat(sprintf("Total of %i nets\n", length(netList))) + return(netList) +} + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/Epen_prunedPathway_0.01_%s",outDir,dt) +if (!file.exists(megaDir)) dir.create(megaDir) + +gps <- list(rna=pathwayList) +dats <- list(rna=xpr) +pheno$STATUS <- droplevels(pheno$STATUS) + +#### ----------------------------------------------------- +### BEGIN PRUNING CODE +# apply pruning to proteomic data +curwd <- getwd() +setwd("../PanCancer") +source("LMprune.R") +source("runLM.R") +source("silh.R") +require(cluster) +setwd(curwd) +for (nm in setdiff(names(dats),"clinical")) { +print(nm) + #if (nrow(dats[[nm]])>10000) topVar <- 50 else topVar <- 100 + pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) + prune <- LMprune(dats[[nm]],pheno$STATUS,topVar=100) + dev.off() + if (!is.na(prune)) { + if (prune$bestThresh < 1) { + res <- prune$res + res <- subset(res, adj.P.Val < 0.01) + tmp <- dats[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] + dats[[nm]] <- tmp + cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) + cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) + } + } else { + cat(sprintf("%s: not pruning\n",nm)) + } +} +#### ---------------------------------------------------------- + +runPredictor_nestedCV(pheno, + dataList=dats,groupList=gps, + makeNetFunc=makeNets, ### custom network creation function + outDir=sprintf("%s/pred",megaDir), + numCores=8L,nFoldCV=10L, CVcutoff=9L,numSplits=10L,startAt=1L) From 3d68a0dd7d4740f3e125db141e4160c404120b12 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Fri, 20 Apr 2018 15:47:45 -0400 Subject: [PATCH 031/124] moved buggy code to own folder --- misc/Ependymoma/netDx_pruned.R | 81 ---------------- misc/Ependymoma/netDx_prunedDNAm.R | 105 --------------------- misc/Ependymoma/netDx_prunedOneNet.R | 79 ---------------- misc/Ependymoma/netDx_prunedPath_lenient.R | 81 ---------------- 4 files changed, 346 deletions(-) delete mode 100644 misc/Ependymoma/netDx_pruned.R delete mode 100644 misc/Ependymoma/netDx_prunedDNAm.R delete mode 100644 misc/Ependymoma/netDx_prunedOneNet.R delete mode 100644 misc/Ependymoma/netDx_prunedPath_lenient.R diff --git a/misc/Ependymoma/netDx_pruned.R b/misc/Ependymoma/netDx_pruned.R deleted file mode 100644 index 49e1930c..00000000 --- a/misc/Ependymoma/netDx_pruned.R +++ /dev/null @@ -1,81 +0,0 @@ -# Ependymoma -rm(list=ls()) - -require(netDx) -require(netDx.examples) - -rootDir <- "/home/shraddhapai/BaderLab/2017_Ependymoma" -inDir <- sprintf("%s/input/netDx_prepared",rootDir) -outDir <- sprintf("%s/output",rootDir) -pathFile <-sprintf("%s/anno/Human_AllPathways_February_01_2018_symbol.gmt", - rootDir) -load(sprintf("%s/Ependymoma_cohortMerged_180125.Rdata",inDir)) - -# exclude ST -idx <- which(pheno$STATUS=="ST") -pheno <- pheno[-idx,] -xpr <- xpr[,-idx] - -pathwayList <- readPathways(pathFile) -head(pathwayList) - -makeNets <- function(dataList, groupList, netDir,...) { - netList <- c() - # make RNA nets: group by pathway - if (!is.null(groupList[["rna"]])) { - netList <- makePSN_NamedMatrix(dataList$rna, - rownames(dataList$rna), - groupList[["rna"]],netDir,verbose=FALSE, - writeProfiles=TRUE,...) - netList <- unlist(netList) - cat(sprintf("Made %i RNA pathway nets\n", length(netList))) - } - cat(sprintf("Total of %i nets\n", length(netList))) - return(netList) -} - -dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/Epen_prunedPathway_0.001_%s",outDir,dt) -if (!file.exists(megaDir)) dir.create(megaDir) - -gps <- list(rna=pathwayList) -dats <- list(rna=xpr) -pheno$STATUS <- droplevels(pheno$STATUS) - -#### ----------------------------------------------------- -### BEGIN PRUNING CODE -# apply pruning to proteomic data -curwd <- getwd() -setwd("../PanCancer") -source("LMprune.R") -source("runLM.R") -source("silh.R") -require(cluster) -setwd(curwd) -for (nm in setdiff(names(dats),"clinical")) { -print(nm) - #if (nrow(dats[[nm]])>10000) topVar <- 50 else topVar <- 100 - pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) - prune <- LMprune(dats[[nm]],pheno$STATUS,topVar=100) - dev.off() - if (!is.na(prune)) { - if (prune$bestThresh < 1) { - res <- prune$res - res <- subset(res, adj.P.Val < 0.001) - tmp <- dats[[nm]];orig_ct <- nrow(tmp) - tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] - dats[[nm]] <- tmp - cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) - cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) - } - } else { - cat(sprintf("%s: not pruning\n",nm)) - } -} -#### ---------------------------------------------------------- - -runPredictor_nestedCV(pheno, - dataList=dats,groupList=gps, - makeNetFunc=makeNets, ### custom network creation function - outDir=sprintf("%s/pred",megaDir), - numCores=8L,nFoldCV=10L, CVcutoff=9L,numSplits=10L,startAt=1L) diff --git a/misc/Ependymoma/netDx_prunedDNAm.R b/misc/Ependymoma/netDx_prunedDNAm.R deleted file mode 100644 index aca56921..00000000 --- a/misc/Ependymoma/netDx_prunedDNAm.R +++ /dev/null @@ -1,105 +0,0 @@ -# Ependymoma - DNA methylation after Nature brain tumour paper -rm(list=ls()) - -require(netDx) -require(netDx.examples) - -rootDir <- "/home/shraddhapai/BaderLab/2017_Ependymoma" -outDir <- sprintf("%s/output",rootDir) - -phenoFile <- "/home/shraddhapai/BaderLab/2018_Epen_DNAm/input/GSE90496_pData.txt" -dnaFile <- "/home/shraddhapai/BaderLab/2018_Epen_DNAm/input/GSE90496_EPN_PFAB_beta.txt.gz" - -# ---------------------- -# input processing -pheno <- read.delim(phenoFile,sep="\t",h=T,as.is=T) -ttype <- pheno$characteristics_ch1 -idx <- which(ttype %in% c("methylation class: EPN, PF A","methylation class: EPN, PF B")) -cat(sprintf("Got %i samples\n",length(idx))) -pheno <- pheno[idx,] # limit to EPN samples -cpos <- regexpr("sample", pheno$title) -bpos <- regexpr("\\[reference", pheno$title) -str <- as.integer(substr(pheno$title, cpos+7, bpos-2)) # get sample number -pheno$ID <- paste("SAMPLE", str,sep=".") -pheno <- pheno[,c("ID","characteristics_ch1")] -st <- rep("",nrow(pheno)) -st[grep("PF A", pheno[,2])] <- "PFA" -st[grep("PF B", pheno[,2])] <- "PFB" -pheno$STATUS <- st - -# ---------------------- -makeNets <- function(dataList, groupList, netDir,...) { - netList <- c() - # make RNA nets: group by pathway - if (!is.null(groupList[["dnam"]])) { - netList <- makePSN_NamedMatrix(dataList$dnam, - rownames(dataList$dnam), - groupList[["dnam"]],netDir,verbose=FALSE, - writeProfiles=TRUE,...) - netList <- unlist(netList) - cat(sprintf("Made %i RNA pathway nets\n", length(netList))) - } - cat(sprintf("Total of %i nets\n", length(netList))) - return(netList) -} - -dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/Epen_prunedOneNet_0.001_%s",outDir,dt) -if (!file.exists(megaDir)) dir.create(megaDir) - -xpr <- read.delim(dnaFile,sep="\t",h=T,as.is=T) -rownames(xpr) <- paste("probe",1:nrow(xpr),sep="") -# match pheno and methylation values -midx <- match(colnames(xpr),pheno$ID) -if (all.equal(colnames(xpr),pheno$ID[midx])!=TRUE){ - cat("don't match\n") -browser() -} -pheno <- pheno[midx,] - -gps <- list(dnam=list(dnam=rownames(xpr))) -dats <- list(dnam=xpr) - -#### ----------------------------------------------------- -### BEGIN PRUNING CODE -# apply pruning to proteomic data -curwd <- getwd() -setwd("../PanCancer") -source("LMprune.R") -source("runLM.R") -source("silh.R") -require(cluster) -setwd(curwd) -for (nm in setdiff(names(dats),"clinical")) { -print(nm) - if (nrow(dats[[nm]])>10000) topVar <- 50 else topVar <- 100 - pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) - prune <- LMprune(dats[[nm]],pheno$STATUS,topVar=topVar) - dev.off() - if (!is.na(prune)) { - if (prune$bestThresh < 1) { - res <- prune$res - res <- subset(res, adj.P.Val < prune$bestThresh) - - require(dataExplore) - cat("add hclust\n"); browser() - - tmp <- dats[[nm]];orig_ct <- nrow(tmp) - tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] - dats[[nm]] <- tmp - gps[[nm]] <- list(dnam=rownames(tmp)) - cat(sprintf("%s: Pruning with cutoff %1.2f\n", - nm,prune$bestThresh)) - cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) - } - } else { - cat(sprintf("%s: not pruning\n",nm)) - } -} -#### ---------------------------------------------------------- - -runPredictor_nestedCV(pheno, - dataList=dats,groupList=gps, - makeNetFunc=makeNets, ### custom network creation function - outDir=sprintf("%s/pred",megaDir), - numCores=8L,nFoldCV=10L, CVcutoff=9L,numSplits=10L,startAt=1L) diff --git a/misc/Ependymoma/netDx_prunedOneNet.R b/misc/Ependymoma/netDx_prunedOneNet.R deleted file mode 100644 index 9c35681c..00000000 --- a/misc/Ependymoma/netDx_prunedOneNet.R +++ /dev/null @@ -1,79 +0,0 @@ -# Ependymoma -rm(list=ls()) - -require(netDx) -require(netDx.examples) - -rootDir <- "/home/shraddhapai/BaderLab/2017_Ependymoma" -inDir <- sprintf("%s/input/netDx_prepared",rootDir) -outDir <- sprintf("%s/output",rootDir) -pathFile <-sprintf("%s/anno/Human_AllPathways_February_01_2018_symbol.gmt", - rootDir) -load(sprintf("%s/Ependymoma_cohortMerged_180125.Rdata",inDir)) - -# exclude ST -idx <- which(pheno$STATUS=="ST") -pheno <- pheno[-idx,] -xpr <- xpr[,-idx] - -makeNets <- function(dataList, groupList, netDir,...) { - netList <- c() - # make RNA nets: group by pathway - if (!is.null(groupList[["rna"]])) { - netList <- makePSN_NamedMatrix(dataList$rna, - rownames(dataList$rna), - groupList[["rna"]],netDir,verbose=FALSE, - writeProfiles=TRUE,...) - netList <- unlist(netList) - cat(sprintf("Made %i RNA pathway nets\n", length(netList))) - } - cat(sprintf("Total of %i nets\n", length(netList))) - return(netList) -} - -dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/Epen_prunedOneNet_0.001_%s",outDir,dt) -if (!file.exists(megaDir)) dir.create(megaDir) - -gps <- list(rna=list(rna=rownames(xpr))) -dats <- list(rna=xpr) -pheno$STATUS <- droplevels(pheno$STATUS) - -#### ----------------------------------------------------- -### BEGIN PRUNING CODE -# apply pruning to proteomic data -curwd <- getwd() -setwd("../PanCancer") -source("LMprune.R") -source("runLM.R") -source("silh.R") -require(cluster) -setwd(curwd) -for (nm in setdiff(names(dats),"clinical")) { -print(nm) - #if (nrow(dats[[nm]])>10000) topVar <- 50 else topVar <- 100 - pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) - prune <- LMprune(dats[[nm]],pheno$STATUS,topVar=100) - dev.off() - if (!is.na(prune)) { - if (prune$bestThresh < 1) { - res <- prune$res - res <- subset(res, adj.P.Val < 0.001) - tmp <- dats[[nm]];orig_ct <- nrow(tmp) - tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] - dats[[nm]] <- tmp - gps[[nm]] <- list(rna=rownames(tmp)) - cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) - cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) - } - } else { - cat(sprintf("%s: not pruning\n",nm)) - } -} -#### ---------------------------------------------------------- - -runPredictor_nestedCV(pheno, - dataList=dats,groupList=gps, - makeNetFunc=makeNets, ### custom network creation function - outDir=sprintf("%s/pred",megaDir), - numCores=8L,nFoldCV=10L, CVcutoff=9L,numSplits=10L,startAt=1L) diff --git a/misc/Ependymoma/netDx_prunedPath_lenient.R b/misc/Ependymoma/netDx_prunedPath_lenient.R deleted file mode 100644 index 7d7f2e18..00000000 --- a/misc/Ependymoma/netDx_prunedPath_lenient.R +++ /dev/null @@ -1,81 +0,0 @@ -# Ependymoma -rm(list=ls()) - -require(netDx) -require(netDx.examples) - -rootDir <- "/home/shraddhapai/BaderLab/2017_Ependymoma" -inDir <- sprintf("%s/input/netDx_prepared",rootDir) -outDir <- sprintf("%s/output",rootDir) -pathFile <-sprintf("%s/anno/Human_AllPathways_February_01_2018_symbol.gmt", - rootDir) -load(sprintf("%s/Ependymoma_cohortMerged_180125.Rdata",inDir)) - -# exclude ST -idx <- which(pheno$STATUS=="ST") -pheno <- pheno[-idx,] -xpr <- xpr[,-idx] - -pathwayList <- readPathways(pathFile) -head(pathwayList) - -makeNets <- function(dataList, groupList, netDir,...) { - netList <- c() - # make RNA nets: group by pathway - if (!is.null(groupList[["rna"]])) { - netList <- makePSN_NamedMatrix(dataList$rna, - rownames(dataList$rna), - groupList[["rna"]],netDir,verbose=FALSE, - writeProfiles=TRUE,...) - netList <- unlist(netList) - cat(sprintf("Made %i RNA pathway nets\n", length(netList))) - } - cat(sprintf("Total of %i nets\n", length(netList))) - return(netList) -} - -dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/Epen_prunedPathway_0.01_%s",outDir,dt) -if (!file.exists(megaDir)) dir.create(megaDir) - -gps <- list(rna=pathwayList) -dats <- list(rna=xpr) -pheno$STATUS <- droplevels(pheno$STATUS) - -#### ----------------------------------------------------- -### BEGIN PRUNING CODE -# apply pruning to proteomic data -curwd <- getwd() -setwd("../PanCancer") -source("LMprune.R") -source("runLM.R") -source("silh.R") -require(cluster) -setwd(curwd) -for (nm in setdiff(names(dats),"clinical")) { -print(nm) - #if (nrow(dats[[nm]])>10000) topVar <- 50 else topVar <- 100 - pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) - prune <- LMprune(dats[[nm]],pheno$STATUS,topVar=100) - dev.off() - if (!is.na(prune)) { - if (prune$bestThresh < 1) { - res <- prune$res - res <- subset(res, adj.P.Val < 0.01) - tmp <- dats[[nm]];orig_ct <- nrow(tmp) - tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] - dats[[nm]] <- tmp - cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) - cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) - } - } else { - cat(sprintf("%s: not pruning\n",nm)) - } -} -#### ---------------------------------------------------------- - -runPredictor_nestedCV(pheno, - dataList=dats,groupList=gps, - makeNetFunc=makeNets, ### custom network creation function - outDir=sprintf("%s/pred",megaDir), - numCores=8L,nFoldCV=10L, CVcutoff=9L,numSplits=10L,startAt=1L) From ab7656dbe95b7fbab43e2a4c32f0460cfde7e598 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 24 Apr 2018 08:36:29 -0400 Subject: [PATCH 032/124] variations on pre-filtering for netDx input --- misc/PanCancer/pruneVersion/lasso/GBM_ridge.R | 362 +++++++++++++++++ .../PanCancer/pruneVersion/lasso/KIRC_lasso.R | 351 ++++++++++++++++ .../PanCancer/pruneVersion/lasso/KIRC_ridge.R | 351 ++++++++++++++++ .../pruneVersion/lasso/KIRC_ridgeExp.R | 369 +++++++++++++++++ .../PanCancer/pruneVersion/lasso/LUSC_ridge.R | 375 ++++++++++++++++++ misc/PanCancer/pruneVersion/lasso/OV_ridge.R | 330 +++++++++++++++ 6 files changed, 2138 insertions(+) create mode 100644 misc/PanCancer/pruneVersion/lasso/GBM_ridge.R create mode 100644 misc/PanCancer/pruneVersion/lasso/KIRC_lasso.R create mode 100644 misc/PanCancer/pruneVersion/lasso/KIRC_ridge.R create mode 100644 misc/PanCancer/pruneVersion/lasso/KIRC_ridgeExp.R create mode 100644 misc/PanCancer/pruneVersion/lasso/LUSC_ridge.R create mode 100644 misc/PanCancer/pruneVersion/lasso/OV_ridge.R diff --git a/misc/PanCancer/pruneVersion/lasso/GBM_ridge.R b/misc/PanCancer/pruneVersion/lasso/GBM_ridge.R new file mode 100644 index 00000000..6278a991 --- /dev/null +++ b/misc/PanCancer/pruneVersion/lasso/GBM_ridge.R @@ -0,0 +1,362 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/pruneTrain_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + + +# first loop - over train/test splits +for (rngNum in 1:100) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in setdiff(names(dats_train),"clinical")) { + print(nm) + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + # run lasso with cv + fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=0) + # pick lambda that minimizes MSE + wt <- coef(fit,s="lambda.min")[,1] + vars <- names(wt)[which(wt>0)] + if (length(vars) < 6) {# don't compute Pearson,just use all + cat(sprintf("rngNum %i: %s: <6 (%i):just use all\n", + rngNum,nm,length(vars))) + } else { + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),] + dats_train[[nm]] <- tmp + netSets_iter[[nm]] <- rownames(tmp) + } + } + # END lasso UF + # ---------------------- + + alldat_train <- do.call("rbind",dats_train) + netSets_iter[["clinical"]] <- netSets[["clinical"]] + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } +## Create the mega database with all patients and all nets. +## This will be used to predict test samples by subsetting just for feature +## selected nets in a given round +## Note that this is useful for all train/test splits because we can always +## change which samples are query and can always subset based on which nets +## are feature selected in a given round. +netDir <- sprintf("%s/test_networks",outDir) +nonclin <- setdiff(names(netSets),"clinical") +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) +netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) +netList <- c(netList,netList2) +cat(sprintf("Total of %i nets\n", length(netList))) +# now create database +testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/lasso/KIRC_lasso.R b/misc/PanCancer/pruneVersion/lasso/KIRC_lasso.R new file mode 100644 index 00000000..23ba489f --- /dev/null +++ b/misc/PanCancer/pruneVersion/lasso/KIRC_lasso.R @@ -0,0 +1,351 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(glmnet) # lasso for univariate filtering + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +cutoff <- 9 + +inDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/input" +outRoot <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/lasso_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/KIRC_clinical_core.txt",inDir), + survival=sprintf("%s/KIRC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/KIRC_mRNA_core.txt",inDir), + prot=sprintf("%s/KIRC_RPPA_core.txt",inDir), + mir=sprintf("%s/KIRC_miRNA_core.txt",inDir), + dnam=sprintf("%s/KIRC_methylation_core.txt",inDir), + cnv=sprintf("%s/KIRC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +#======transform clinical data========= +pheno$grade <- as.vector(pheno$grade) +pheno$grade[pheno$grade=="G1"] <- "G2" +pheno$grade[pheno$grade=="GX"] <- "G2" +pheno$grade <- as.factor(pheno$grade) +pheno <- pheno[, -which(colnames(pheno)=="gender")] +#====================================== + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +clinical$grade <- as.numeric(factor(clinical$grade)) +clinical$stage <- as.numeric(factor(clinical$stage)) +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) + + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +# first loop - over train/test splits +for (rngNum in 1:100) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + + ## pruneTrain code ------ + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in setdiff(names(dats_train),"clinical")) { + print(nm) + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + # run lasso with cv + fit.lasso <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=1) + # pick lambda that minimizes MSE + wt <- coef(fit.lasso,s="lambda.min")[,1] + vars <- names(wt)[which(wt>0)] + if (length(vars) < 6) {# don't compute Pearson,just use all + cat(sprintf("rngNum %i: %s: <6 (%i):just use all\n", + rngNum,nm,length(vars))) + } else { + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),] + dats_train[[nm]] <- tmp + netSets_iter[[nm]] <- rownames(tmp) + } + } + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + netSets_iter[["clinical"]] <- netSets[["clinical"]] +## pruneTrain code end + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2, + verbose=FALSE,numCores=numCores,writeProfiles=FALSE, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + ## pruneTrain: make test database + ## This will be used to predict test samples by subsetting just for feature + ## selected nets in a given round + ## Note that this is useful for all train/test splits because we can always + ## change which samples are query and can always subset based on which nets + ## are feature selected in a given round. + netDir <- sprintf("%s/test_networks",outDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(netDir, pheno_all$ID, + outDir,numCores=numCores) + + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",pDir2,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } + + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + outFile <- sprintf("%s/predictionResults.txt",pDir) + write.table(out,file=outFile,sep="\t",col=T,row=F, + quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/lasso/KIRC_ridge.R b/misc/PanCancer/pruneVersion/lasso/KIRC_ridge.R new file mode 100644 index 00000000..c8bf2625 --- /dev/null +++ b/misc/PanCancer/pruneVersion/lasso/KIRC_ridge.R @@ -0,0 +1,351 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(glmnet) # lasso for univariate filtering + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +cutoff <- 9 + +inDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/input" +outRoot <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/ridge_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/KIRC_clinical_core.txt",inDir), + survival=sprintf("%s/KIRC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/KIRC_mRNA_core.txt",inDir), + prot=sprintf("%s/KIRC_RPPA_core.txt",inDir), + mir=sprintf("%s/KIRC_miRNA_core.txt",inDir), + dnam=sprintf("%s/KIRC_methylation_core.txt",inDir), + cnv=sprintf("%s/KIRC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +#======transform clinical data========= +pheno$grade <- as.vector(pheno$grade) +pheno$grade[pheno$grade=="G1"] <- "G2" +pheno$grade[pheno$grade=="GX"] <- "G2" +pheno$grade <- as.factor(pheno$grade) +pheno <- pheno[, -which(colnames(pheno)=="gender")] +#====================================== + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +clinical$grade <- as.numeric(factor(clinical$grade)) +clinical$stage <- as.numeric(factor(clinical$stage)) +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) + + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +# first loop - over train/test splits +for (rngNum in 1:100) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + + ## pruneTrain code ------ + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in setdiff(names(dats_train),"clinical")) { + print(nm) + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + # run lasso with cv + fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=0) + # pick lambda that minimizes MSE + wt <- coef(fit,s="lambda.min")[,1] + vars <- names(wt)[which(wt>0)] + if (length(vars) < 6) {# don't compute Pearson,just use all + cat(sprintf("rngNum %i: %s: <6 (%i):just use all\n", + rngNum,nm,length(vars))) + } else { + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),] + dats_train[[nm]] <- tmp + netSets_iter[[nm]] <- rownames(tmp) + } + } + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + netSets_iter[["clinical"]] <- netSets[["clinical"]] +## pruneTrain code end + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2, + verbose=FALSE,numCores=numCores,writeProfiles=FALSE, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + ## pruneTrain: make test database + ## This will be used to predict test samples by subsetting just for feature + ## selected nets in a given round + ## Note that this is useful for all train/test splits because we can always + ## change which samples are query and can always subset based on which nets + ## are feature selected in a given round. + netDir <- sprintf("%s/test_networks",outDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(netDir, pheno_all$ID, + outDir,numCores=numCores) + + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",pDir2,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } + + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + outFile <- sprintf("%s/predictionResults.txt",pDir) + write.table(out,file=outFile,sep="\t",col=T,row=F, + quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/lasso/KIRC_ridgeExp.R b/misc/PanCancer/pruneVersion/lasso/KIRC_ridgeExp.R new file mode 100644 index 00000000..793cb665 --- /dev/null +++ b/misc/PanCancer/pruneVersion/lasso/KIRC_ridgeExp.R @@ -0,0 +1,369 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(glmnet) # lasso for univariate filtering + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +cutoff <- 9 + +inDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/input" +outRoot <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/ridgeExp_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions + + +#' distance used in SNF +#' K (integer) num neighbours, should be between 10-30 +#' sigma (numeric) scaling hyperparameter, should be between 0.3 and 0.8 +sim.EucExp <- function(dat,K=20,sigma=0.3) { +sampNames <- colnames(dat) +dat <- t(dat) +# Euclidean distance, code from SNFtool package +dist2 <- function(X,C) { + ndata = nrow(X) + ncentres = nrow(C) + + sumsqX = rowSums(X^2) + sumsqC = rowSums(C^2) + + XC = 2 * (X %*% t(C)) + + res = matrix(rep(sumsqX,times=ncentres),ndata,ncentres) + t(matrix(rep(sumsqC,times=ndata),ncentres,ndata)) - XC + res[res < 0] = 0 + return(res) +} + Diff <- dist2(as.matrix(dat),as.matrix(dat)) + ### code from SNFtool::affinityMatrix. + ### applies exponential scaling + N = nrow(Diff) + + Diff = (Diff + t(Diff)) / 2 + diag(Diff) = 0; + sortedColumns = as.matrix(t(apply(Diff,2,sort))) + finiteMean <- function(x) { mean(x[is.finite(x)]) } + means = apply(sortedColumns[,1:K+1],1,finiteMean)+.Machine$double.eps; + + avg <- function(x,y) ((x+y)/2) + Sig = outer(means,means,avg)/3*2 + Diff/3 + .Machine$double.eps; + Sig[Sig <= .Machine$double.eps] = .Machine$double.eps + densities = dnorm(Diff,0,sigma*Sig,log = FALSE) + + W = (densities + t(densities)) / 2 + colnames(W) <- sampNames + rownames(W) <- sampNames + return(W) +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/KIRC_clinical_core.txt",inDir), + survival=sprintf("%s/KIRC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/KIRC_mRNA_core.txt",inDir), + prot=sprintf("%s/KIRC_RPPA_core.txt",inDir), + mir=sprintf("%s/KIRC_miRNA_core.txt",inDir), + dnam=sprintf("%s/KIRC_methylation_core.txt",inDir), + cnv=sprintf("%s/KIRC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +#======transform clinical data========= +pheno$grade <- as.vector(pheno$grade) +pheno$grade[pheno$grade=="G1"] <- "G2" +pheno$grade[pheno$grade=="GX"] <- "G2" +pheno$grade <- as.factor(pheno$grade) +pheno <- pheno[, -which(colnames(pheno)=="gender")] +#====================================== + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +clinical$grade <- as.numeric(factor(clinical$grade)) +clinical$stage <- as.numeric(factor(clinical$stage)) +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) + + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +# first loop - over train/test splits +for (rngNum in 1:100) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + + ## pruneTrain code ------ + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in setdiff(names(dats_train),"clinical")) { + print(nm) + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + # run lasso with cv + fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=0) + # pick lambda that minimizes MSE + wt <- coef(fit,s="lambda.min")[,1] + vars <- names(wt)[which(wt>0)] + if (length(vars) < 2) {# don't compute Pearson,just use all + cat(sprintf("rngNum %i: %s: <6 (%i):just use all\n", + rngNum,nm,length(vars))) + } else { + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),] + dats_train[[nm]] <- tmp + netSets_iter[[nm]] <- rownames(tmp) + } + } + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + netSets_iter[["clinical"]] <- netSets[["clinical"]] +## pruneTrain code end + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") +browser() + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + simMetric="custom",customFunc=sim.EucExp,cutoff=.Machine$double.eps, + writeProfiles=FALSE,sparsify=TRUE) + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=sim.EucExp, + verbose=FALSE,numCores=numCores,writeProfiles=FALSE, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + ## pruneTrain: make test database + ## This will be used to predict test samples by subsetting just for feature + ## selected nets in a given round + ## Note that this is useful for all train/test splits because we can always + ## change which samples are query and can always subset based on which nets + ## are feature selected in a given round. + netDir <- sprintf("%s/test_networks",outDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores, + simMetric="custom",customFunc=sim.EucExp, + writeProfiles=FALSE,sparsify=TRUE) + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=sim.EucExp,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(netDir, pheno_all$ID, + outDir,numCores=numCores) + + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",pDir2,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } + + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + outFile <- sprintf("%s/predictionResults.txt",pDir) + write.table(out,file=outFile,sep="\t",col=T,row=F, + quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/lasso/LUSC_ridge.R b/misc/PanCancer/pruneVersion/lasso/LUSC_ridge.R new file mode 100644 index 00000000..5df71b12 --- /dev/null +++ b/misc/PanCancer/pruneVersion/lasso/LUSC_ridge.R @@ -0,0 +1,375 @@ +#' PanCancer binarized survival: LUSC: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design + +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/ridge_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions + +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), + survival=sprintf("%s/LUSC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), + prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), + mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), + cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno +rownames(clinical) <- clinical[,1]; +# ======================= +# LUSC-specific variables +clinical$stage <- as.vector(clinical$stage) +clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" +clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" +clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" +clinical$stage <- as.factor(clinical$stage) +clinical <- clinical[, -which(colnames(clinical)=="gender")] +clinical <- t(clinical[,c("age","stage")]) +clinical[1,] <- as.integer(clinical[1,]) +clinical[2,] <- as.integer(as.factor(clinical[2,])) +class(clinical) <- "numeric" +# ======================= +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + if (nm == "rna") tmp <- log(tmp+1) + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + + +# first loop - over train/test splits +for (rngNum in 1:100) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + ## pruneTrain code ------ + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in setdiff(names(dats_train),"clinical")) { + print(nm) + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + # run lasso with cv + fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=0) + # pick lambda that minimizes MSE + wt <- coef(fit,s="lambda.min")[,1] + vars <- names(wt)[which(wt>0)] + if (length(vars) < 6) {# don't compute Pearson,just use all + cat(sprintf("rngNum %i: %s: <6 (%i):just use all\n", + rngNum,nm,length(vars))) + } else { + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),] + dats_train[[nm]] <- tmp + netSets_iter[[nm]] <- rownames(tmp) + } + } + # END lasso UF + # ---------------------- + + alldat_train <- do.call("rbind",dats_train) + netSets_iter[["clinical"]] <- netSets[["clinical"]] +#### ---------------------------------------------------------- + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE,simMetric="pearson") + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores, + simMetric="pearson") + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("CombList = %s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + ## Create the mega database with all patients and all nets. + ## This will be used to predict test samples by subsetting just for feature + ## selected nets in a given round + ## Note that this is useful for all train/test splits because we can always + ## change which samples are query and can always subset based on which nets + ## are feature selected in a given round. + netDir <- sprintf("%s/test_networks",megaDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE, + simMetric="pearson") + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2, + writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + megadbDir <- GM_createDB(netDir, pheno_all$ID, + megaDir,numCores=numCores, + simMetric="pearson") + + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + # only include the nets that were feature selected + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(megadbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + } else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } +} + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/lasso/OV_ridge.R b/misc/PanCancer/pruneVersion/lasso/OV_ridge.R new file mode 100644 index 00000000..081076a3 --- /dev/null +++ b/misc/PanCancer/pruneVersion/lasso/OV_ridge.R @@ -0,0 +1,330 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +cutoff <- 9 + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/input" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/pruneTrain_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions + +# normalized difference +# x is vector of values, one per patient (e.g. ages) +normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/OV_clinical_core.txt",inDir), + survival=sprintf("%s/OV_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/OV_mRNA_core.txt",inDir), + prot=sprintf("%s/OV_RPPA_core.txt",inDir), + mir=sprintf("%s/OV_miRNA_core.txt",inDir), + dnam=sprintf("%s/OV_methylation_core.txt",inDir), + cnv=sprintf("%s/OV_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clin <- pheno +rownames(clin) <- clin[,1]; +clin <- t(clin[,2,drop=FALSE]) +dats$clinical <- clin; rm(clin) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID), drop = FALSE]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx, drop = FALSE] + x +}) + + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) + + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + + +# first loop - over train/test splits +for (rngNum in 1:100) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in setdiff(names(dats_train),"clinical")) { + print(nm) + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + # run lasso with cv + fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=0) + # pick lambda that minimizes MSE + wt <- coef(fit,s="lambda.min")[,1] + vars <- names(wt)[which(wt>0)] + if (length(vars) < 6) {# don't compute Pearson,just use all + cat(sprintf("rngNum %i: %s: <6 (%i):just use all\n", + rngNum,nm,length(vars))) + } else { + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),] + dats_train[[nm]] <- tmp + netSets_iter[[nm]] <- rownames(tmp) + } + } + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + netSets_iter[["clinical"]] <- netSets[["clinical"]] + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) + + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff, + verbose=FALSE,numCores=numCores,writeProfiles=FALSE, + sparsify=TRUE,append=TRUE) + + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + # -------- + # pruneTrain: make test database + test_netDir <- sprintf("%s/test_networks",outDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + ### netSets_iter has univariate filtering for curr round + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],test_netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + test_netDir,simMetric="custom",customFunc=normDiff, + verbose=FALSE,numCores=numCores, writeProfiles=FALSE, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(test_netDir, pheno_all$ID, + outDir,numCores=numCores) + + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + # pTally <- sub(".profile","",pTally) + # pTally <- sub("_cont","",pTally) + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",pDir2,g) + GM_writeQueryFile(qSamps,incNets=pTally + ,nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } + + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + outFile <- sprintf("%s/predictionResults.txt",pDir) + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) From 6ea5e48a963553595b82e4f3063b6fda888b3c52 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 24 Apr 2018 08:36:57 -0400 Subject: [PATCH 033/124] code to test effect of sparsification --- .../pruneVersion/lasso/sparsify2_test.R | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 misc/PanCancer/pruneVersion/lasso/sparsify2_test.R diff --git a/misc/PanCancer/pruneVersion/lasso/sparsify2_test.R b/misc/PanCancer/pruneVersion/lasso/sparsify2_test.R new file mode 100644 index 00000000..7bcdd174 --- /dev/null +++ b/misc/PanCancer/pruneVersion/lasso/sparsify2_test.R @@ -0,0 +1,40 @@ +#' cleaner sparsification routine +#' +#' @details Sparsifies similarity matrix to keep strongest edges. +#' Sets diagonal and edges < cutoff to NA. Keeps strongest maxInt edges +#' per node. Ties are ignored. Keeps a max of EDGE_MAX edges in the network. +#' @param W (matrix) similarity matrix +#' @param outFile (char) path to file to write sparsified network +#' @param cutoff (numeric) edges with weight smaller than this are set to NA +#' @param maxInt (numeric) max num edges per node. +#' @param EDGE_MAX (numeric) max num edges in network +#' @return writes SIF content to text file (node1,node2,edge weight) +#' @import reshape2 +#' @export +sparsify2_test <- function(W, outFile="tmp.txt",cutoff=0.3,maxInt=50,EDGE_MAX=1000) { + if (maxInt > ncol(W)) maxInt <- ncol(W) + + + diag(W) <- 0; + W[W < cutoff] <- NA + x <- apply(W,1,sort,decreasing=TRUE) + if (x) +browser() + for (k in 1:length(x)) { + cur <- x[[k]] + tryCatch({ + tokeep <- names(cur)[1:min(length(cur),maxInt)] + },error=function(ex) { browser() + }) + W[k,which(!colnames(W)%in% tokeep)] <- NA + } + tmp <- na.omit(melt(W)) + tmp <- tmp[order(tmp[,3],decreasing=TRUE),] + #maxEdge <- 0.02*ncol(W); + + maxEdge <- nrow(tmp) + if (maxEdge>EDGE_MAX) maxEdge <- EDGE_MAX + + tmp <- tmp[1:maxEdge,] + write.table(tmp,file=outFile,sep="\t",col=F,row=F,quote=F) +} From 58400b8b3c355db05857becc1e2ab9b1fd3dfdd8 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 24 Apr 2018 08:37:54 -0400 Subject: [PATCH 034/124] moved --- misc/PanCancer/pruneVersion/KIRC_pruneTrain.R | 358 ---------------- misc/PanCancer/pruneVersion/LUSC_pruneTrain.R | 383 ------------------ misc/PanCancer/pruneVersion/OV_pruneTrain.R | 340 ---------------- 3 files changed, 1081 deletions(-) delete mode 100644 misc/PanCancer/pruneVersion/KIRC_pruneTrain.R delete mode 100644 misc/PanCancer/pruneVersion/LUSC_pruneTrain.R delete mode 100644 misc/PanCancer/pruneVersion/OV_pruneTrain.R diff --git a/misc/PanCancer/pruneVersion/KIRC_pruneTrain.R b/misc/PanCancer/pruneVersion/KIRC_pruneTrain.R deleted file mode 100644 index d9be1b31..00000000 --- a/misc/PanCancer/pruneVersion/KIRC_pruneTrain.R +++ /dev/null @@ -1,358 +0,0 @@ -#' PanCancer binarized survival: KIRC: Feature selection with one net per -# datatype -#' 10-fold CV predictor design -rm(list=ls()) -require(netDx) -require(netDx.examples) - -numCores <- 8L -GMmemory <- 4L -trainProp <- 0.8 -cutoff <- 9 - -inDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/input" -outRoot <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" - -dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/pruneTrain_%s",outRoot,dt) - -# ---------------------------------------------------------------- -# helper functions -# takes average of normdiff of each row in x -normDiff2 <- function(x) { - # normalized difference - # x is vector of values, one per patient (e.g. ages) - normDiff <- function(x) { - #if (nrow(x)>=1) x <- x[1,] - nm <- colnames(x) - x <- as.numeric(x) - n <- length(x) - rngX <- max(x,na.rm=T)-min(x,na.rm=T) - - out <- matrix(NA,nrow=n,ncol=n); - # weight between i and j is - # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) - for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) - rownames(out) <- nm; colnames(out)<- nm - out - } - - sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) - for (k in 1:nrow(x)) { - tmp <- normDiff(x[k,,drop=FALSE]) - sim <- sim + tmp - rownames(sim) <- rownames(tmp) - colnames(sim) <- colnames(tmp) - } - sim <- sim/nrow(x) - sim -} - -# ----------------------------------------------------------- -# process input -inFiles <- list( - clinical=sprintf("%s/KIRC_clinical_core.txt",inDir), - survival=sprintf("%s/KIRC_binary_survival.txt",inDir) - ) -datFiles <- list( - rna=sprintf("%s/KIRC_mRNA_core.txt",inDir), - prot=sprintf("%s/KIRC_RPPA_core.txt",inDir), - mir=sprintf("%s/KIRC_miRNA_core.txt",inDir), - dnam=sprintf("%s/KIRC_methylation_core.txt",inDir), - cnv=sprintf("%s/KIRC_CNV_core.txt",inDir) -) - -pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) -colnames(pheno)[1] <- "ID" - -#======transform clinical data========= -pheno$grade <- as.vector(pheno$grade) -pheno$grade[pheno$grade=="G1"] <- "G2" -pheno$grade[pheno$grade=="GX"] <- "G2" -pheno$grade <- as.factor(pheno$grade) -pheno <- pheno[, -which(colnames(pheno)=="gender")] -#====================================== - -surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) -colnames(surv)[1:2] <- c("ID","STATUS_INT") -survStr <- rep(NA,nrow(surv)) -survStr[surv$STATUS_INT<1] <- "SURVIVENO" -survStr[surv$STATUS_INT>0] <- "SURVIVEYES" -surv$STATUS <- survStr -pheno <- merge(x=pheno,y=surv,by="ID") -pheno$X <- NULL -# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) -pheno_nosurv <- pheno[1:4] - -cat("Collecting patient data:\n") -dats <- list() #input data in different slots -cat("\t* Clinical\n") -clinical <- pheno_nosurv -rownames(clinical) <- clinical[,1]; -clinical$grade <- as.numeric(factor(clinical$grade)) -clinical$stage <- as.numeric(factor(clinical$stage)) -clinical$ID <- NULL -clinical <- t(clinical) -dats$clinical <- clinical; rm(clinical) - -# create master input net -for (nm in names(datFiles)) { - cat(sprintf("\t* %s\n",nm)) - tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) - if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] - rownames(tmp) <- tmp[,1] - tmp <- t(tmp[,-1]) - class(tmp) <- "numeric" - dats[[nm]] <- tmp -} - -cat("\t Ordering column names\n") -# include only data for patients in classifier -dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) -dats <- lapply(dats, function(x) { - midx <- match(pheno$ID,colnames(x)) - x <- x[,midx] - x -}) - -# confirm patient order the same for all input nets -pname <- colnames(dats[[1]]) -for (k in 2:length(dats)) { - if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { - cat(sprintf("Patient order doesn't match for %s\n", - names(dats)[k])) - browser() - } -} - -# input nets for each category -netSets <- lapply(dats, function(x) rownames(x)) - -# compile data -alldat <- do.call("rbind",dats) -pheno_all <- pheno - -combList <- list( - clinical="clinical_cont", - mir="mir.profile", - rna="rna.profile", - prot="prot.profile", - cnv="cnv.profile", - dnam="dnam.profile", - clinicalArna=c("clinical_cont","rna.profile"), - clinicalAmir=c("clinical_cont","mir.profile"), - clinicalAprot=c("clinical_cont","prot.profile"), - clinicalAdnam=c("clinical_cont","dnam.profile"), - clinicalAcnv=c("clinical_cont","cnv.profile"), - all="all") - -rm(pheno,pheno_nosurv) - - -# ---------------------------------------------------------- -# build classifier -if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) -dir.create(megaDir) - -logFile <- sprintf("%s/log.txt",megaDir) -sink(logFile,split=TRUE) -tryCatch({ -# apply pruning to proteomic data -curwd <- getwd() -setwd("..") -source("LMprune.R") -source("runLM.R") -source("silh.R") -require(cluster) -setwd(curwd) - -# first loop - over train/test splits -for (rngNum in 1:100) { - rng_t0 <- Sys.time() - cat(sprintf("-------------------------------\n")) - cat(sprintf("RNG seed = %i\n", rngNum)) - cat(sprintf("-------------------------------\n")) - outDir <- sprintf("%s/rng%i",megaDir,rngNum) - dir.create(outDir) - - pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, - setSeed=rngNum*5) - write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", - col=T,row=F,quote=F) - # -------------------------------------------- - # feature selection - train only - pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") - - ## pruneTrain code ------ - dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), - drop=F]) - netSets_iter <- list() - for (nm in setdiff(names(dats_train),"clinical")) { - print(nm) - if (nrow(dats_train[[nm]])>10000) topVar <- 50 else topVar <- 100 - pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) - prune <- LMprune(dats_train[[nm]],pheno$STATUS,topVar=topVar) - dev.off() - - netSets_iter[[nm]] <- rownames(tmp) - if (!is.na(prune)) { - if (prune$bestThresh < 0.9) { - res <- prune$res - res <- subset(res, adj.P.Val < prune$bestThresh) - tmp <- dats_train[[nm]];orig_ct <- nrow(tmp) - tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] - dats_train[[nm]] <- tmp - netSets_iter[[nm]] <- rownames(tmp) - cat(sprintf("%s: Pruning with cutoff %1.2f\n", - nm,prune$bestThresh)) - cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) - } - } else { - cat(sprintf("%s: not pruning\n",nm)) - } - } - alldat_train <- do.call("rbind",dats_train) - netSets_iter[["clinical"]] <- netSets[["clinical"]] -## pruneTrain code end - - netDir <- sprintf("%s/networks",outDir) - nonclin <- setdiff(names(netSets),"clinical") - netList <- makePSN_NamedMatrix(alldat_train, - rownames(alldat_train),netSets_iter[nonclin], - netDir,verbose=FALSE,numCores=numCores, - writeProfiles=TRUE) - netList2 <- makePSN_NamedMatrix(alldat_train, - rownames(alldat_train),netSets_iter["clinical"], - netDir,simMetric="custom",customFunc=normDiff2, - verbose=FALSE,numCores=numCores,writeProfiles=FALSE, - sparsify=TRUE,append=TRUE) - netList <- c(netList,netList2) - cat(sprintf("Total of %i nets\n", length(netList))) - # now create database - dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) - - # second loop - over combinations of input data - for (cur in names(combList)) { - t0 <- Sys.time() - cat(sprintf("%s\n",cur)) - pDir <- sprintf("%s/%s",outDir, cur) - dir.create(pDir) - - # run featsel once per subtype - subtypes <- unique(pheno$STATUS) - # run 10-fold cv per subtype - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) - dir.create(pDir2) - - cat(sprintf("\n******\nSubtype %s\n",g)) - pheno_subtype <- pheno - ## label patients not in the current class as residual - nong <- which(!pheno_subtype$STATUS %in% g) - pheno_subtype$STATUS[nong] <- "nonpred" - ## sanity check - print(table(pheno_subtype$STATUS,useNA="always")) - resDir <- sprintf("%s/GM_results",pDir2) - ## query for feature selection comprises of training - ## samples from the class of interest - trainPred <- pheno_subtype$ID[ - which(pheno_subtype$STATUS %in% g)] - - # Cross validation - GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, - nrow(pheno_subtype),incNets=combList[[cur]], - verbose=T, numCores=numCores, - GMmemory=GMmemory) - - # patient similarity ranks - prank <- dir(path=resDir,pattern="PRANK$") - # network ranks - nrank <- dir(path=resDir,pattern="NRANK$") - cat(sprintf("Got %i prank files\n",length(prank))) - - # Compute network score - pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) - head(pTally) - # write to file - tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) - write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) - } - ## pruneTrain: make test database - ## This will be used to predict test samples by subsetting just for feature - ## selected nets in a given round - ## Note that this is useful for all train/test splits because we can always - ## change which samples are query and can always subset based on which nets - ## are feature selected in a given round. - netDir <- sprintf("%s/test_networks",outDir) - nonclin <- setdiff(names(netSets_iter),"clinical") - netList <- makePSN_NamedMatrix(alldat, - rownames(alldat),netSets_iter[nonclin],netDir, - verbose=FALSE,numCores=numCores,writeProfiles=TRUE) - netList2 <- makePSN_NamedMatrix(alldat, - rownames(alldat),netSets_iter["clinical"], - netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, - verbose=FALSE,numCores=numCores, - sparsify=TRUE,append=TRUE) - netList <- c(netList,netList2) - cat(sprintf("Total of %i nets\n", length(netList))) - # now create database - testdbDir <- GM_createDB(netDir, pheno_all$ID, - outDir,numCores=numCores) - - predRes <- list() - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - # get feature selected net names - pTally <- read.delim( - sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), - sep="\t",h=T,as.is=T) - - # feature selected nets pass cutoff threshold - pTally <- pTally[which(pTally[,2]>=cutoff),1] - cat(sprintf("%s: %i pathways\n",g,length(pTally))) - - # query of all training samples for this class - qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & - pheno_all$TT_STATUS%in%"TRAIN")] - - qFile <- sprintf("%s/%s_query",pDir2,g) - GM_writeQueryFile(qSamps,incNets=pTally, - nrow(pheno_all),qFile) - resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) - predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), - pheno_all,g) - } - - predClass <- GM_OneVAll_getClass(predRes) - out <- merge(x=pheno_all,y=predClass,by="ID") - outFile <- sprintf("%s/predictionResults.txt",pDir) - write.table(out,file=outFile,sep="\t",col=T,row=F, - quote=F) - - acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) - cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", - nrow(out), acc*100)) - - require(ROCR) - ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, - out$STATUS=="SURVIVEYES") - save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) - } - - #cleanup to save disk space - system(sprintf("rm -r %s/dataset %s/tmp %s/networks", - outDir,outDir,outDir)) - system(sprintf("rm -r %s/dataset %s/networks", - outDir,outDir)) -} - pheno_all$TT_STATUS <- NA - rng_t1 <- Sys.time() - cat(sprintf("Time for one train/test split:")) - print(rng_t1-rng_t0) - -}, error=function(ex){ - print(ex) -}, finally={ - sink(NULL) -}) diff --git a/misc/PanCancer/pruneVersion/LUSC_pruneTrain.R b/misc/PanCancer/pruneVersion/LUSC_pruneTrain.R deleted file mode 100644 index 6f833e8b..00000000 --- a/misc/PanCancer/pruneVersion/LUSC_pruneTrain.R +++ /dev/null @@ -1,383 +0,0 @@ -#' PanCancer binarized survival: LUSC: Feature selection with one net per -#' datatype -#' 10-fold CV predictor design - -rm(list=ls()) -require(netDx) -require(netDx.examples) -source("../runLM.R") - -numCores <- 8L -GMmemory <- 4L -trainProp <- 0.8 - -inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" -outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" - -dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/pruneTrain_%s",outRoot,dt) - -# ---------------------------------------------------------------- -# helper functions - -# takes average of normdiff of each row in x -normDiff2 <- function(x) { - # normalized difference - # x is vector of values, one per patient (e.g. ages) - normDiff <- function(x) { - #if (nrow(x)>=1) x <- x[1,] - nm <- colnames(x) - x <- as.numeric(x) - n <- length(x) - rngX <- max(x,na.rm=T)-min(x,na.rm=T) - - out <- matrix(NA,nrow=n,ncol=n); - # weight between i and j is - # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) - for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) - rownames(out) <- nm; colnames(out)<- nm - out - } - - sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) - for (k in 1:nrow(x)) { - tmp <- normDiff(x[k,,drop=FALSE]) - sim <- sim + tmp - rownames(sim) <- rownames(tmp) - colnames(sim) <- colnames(tmp) - } - sim <- sim/nrow(x) - sim -} - -# ----------------------------------------------------------- -# process input -inFiles <- list( - clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), - survival=sprintf("%s/LUSC_binary_survival.txt",inDir) - ) -datFiles <- list( - rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), - prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), - mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), - cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) -) - -pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) -colnames(pheno)[1] <- "ID" - -surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) -colnames(surv)[1:2] <- c("ID","STATUS_INT") -survStr <- rep(NA,nrow(surv)) -survStr[surv$STATUS_INT<1] <- "SURVIVENO" -survStr[surv$STATUS_INT>0] <- "SURVIVEYES" -surv$STATUS <- survStr -pheno <- merge(x=pheno,y=surv,by="ID") -pheno$X <- NULL - -cat("Collecting patient data:\n") -dats <- list() #input data in different slots -cat("\t* Clinical\n") -clinical <- pheno -rownames(clinical) <- clinical[,1]; -# ======================= -# LUSC-specific variables -clinical$stage <- as.vector(clinical$stage) -clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" -clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" -clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" -clinical$stage <- as.factor(clinical$stage) -clinical <- clinical[, -which(colnames(clinical)=="gender")] -clinical <- t(clinical[,c("age","stage")]) -clinical[1,] <- as.integer(clinical[1,]) -clinical[2,] <- as.integer(as.factor(clinical[2,])) -class(clinical) <- "numeric" -# ======================= -dats$clinical <- clinical; rm(clinical) - -# create master input net -for (nm in names(datFiles)) { - cat(sprintf("\t* %s\n",nm)) - tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) - if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] - rownames(tmp) <- tmp[,1] - tmp <- t(tmp[,-1]) - class(tmp) <- "numeric" - if (nm == "rna") tmp <- log(tmp+1) - dats[[nm]] <- tmp -} - -cat("\t Ordering column names\n") -# include only data for patients in classifier -dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) -dats <- lapply(dats, function(x) { - midx <- match(pheno$ID,colnames(x)) - x <- x[,midx] - x -}) - -# confirm patient order the same for all input nets -pname <- colnames(dats[[1]]) -for (k in 2:length(dats)) { - if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { - cat(sprintf("Patient order doesn't match for %s\n", - names(dats)[k])) - browser() - } -} - -# input nets for each category -netSets <- lapply(dats, function(x) rownames(x)) - -# compile data -alldat <- do.call("rbind",dats) -pheno_all <- pheno - -combList <- list( - clinicalArna=c("clinical_cont","rna.profile"), - clinicalAprot=c("clinical_cont","prot.profile"), - clinical="clinical_cont", - mir="mir.profile", - rna="rna.profile", - prot="prot.profile", - cnv="cnv.profile", - clinicalAmir=c("clinical_cont","mir.profile"), - clinicalAcnv=c("clinical_cont","cnv.profile"), - all="all" -) - -cat(sprintf("Clinical variables are: { %s }\n", - paste(rownames(dats$clinical),sep=",",collapse=","))) -rm(pheno) - -# ---------------------------------------------------------- -# build classifier -if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) -dir.create(megaDir) - -logFile <- sprintf("%s/log.txt",megaDir) -sink(logFile,split=TRUE) -tryCatch({ -# apply pruning to proteomic data -curwd <- getwd() -setwd("..") -source("LMprune.R") -source("runLM.R") -source("silh.R") -require(cluster) -setwd(curwd) - - -# first loop - over train/test splits -for (rngNum in 1:100) { - rng_t0 <- Sys.time() - cat(sprintf("-------------------------------\n")) - cat(sprintf("RNG seed = %i\n", rngNum)) - cat(sprintf("-------------------------------\n")) - outDir <- sprintf("%s/rng%i",megaDir,rngNum) - dir.create(outDir) - - pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, - setSeed=rngNum*5) - write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", - col=T,row=F,quote=F) - # -------------------------------------------- - # feature selection - train only - pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") - dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), - drop=F]) - - netSets_iter <- list() - cat("Pruning\n") - for (nm in setdiff(names(dats_train),"clinical")) { - print(nm) - if (nrow(dats_train[[nm]])>10000) - topVar <- 50 else topVar <- 100 - #topVar <- 50 - pdf(sprintf("%s/%s_prune.pdf",outDir,nm)) - prune <- LMprune(dats_train[[nm]], - pheno$STATUS,topVar=topVar) - dev.off() - netSets_iter[[nm]] <- rownames(dats_train[[nm]]) - if (!is.na(prune)) { - if (prune$bestThresh < 0.9) { - res <- prune$res - res <- subset(res, adj.P.Val < prune$bestThresh) - tmp <- dats_train[[nm]];orig_ct <- nrow(tmp) - tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] - dats_train[[nm]] <- tmp - netSets_iter[[nm]] <- rownames(tmp) - cat(sprintf("%s: Pruning with cutoff %1.2f\n", - nm,prune$bestThresh)) - cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) - } - } else { - cat(sprintf("%s: not pruning\n",nm)) - } - } - alldat_train <- do.call("rbind",dats_train) - netSets_iter[["clinical"]] <- netSets[["clinical"]] -#### ---------------------------------------------------------- - - netDir <- sprintf("%s/networks",outDir) - nonclin <- setdiff(names(netSets),"clinical") - netList <- makePSN_NamedMatrix(alldat_train, - rownames(alldat_train),netSets_iter[nonclin], - netDir,verbose=FALSE,numCores=numCores, - writeProfiles=TRUE,simMetric="pearson") - netList2 <- makePSN_NamedMatrix(alldat_train, - rownames(alldat_train),netSets_iter["clinical"], - netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, - verbose=FALSE,numCores=numCores, - sparsify=TRUE,append=TRUE) - netList <- c(netList,netList2) - cat(sprintf("Total of %i nets\n", length(netList))) - - # now create database - dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores, - simMetric="pearson") - - # second loop - over combinations of input data - for (cur in names(combList)) { - t0 <- Sys.time() - cat(sprintf("CombList = %s\n",cur)) - pDir <- sprintf("%s/%s",outDir, cur) - dir.create(pDir) - - # run featsel once per subtype - subtypes <- unique(pheno$STATUS) - # run 10-fold cv per subtype - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) - dir.create(pDir2) - - cat(sprintf("\n******\nSubtype %s\n",g)) - pheno_subtype <- pheno - ## label patients not in the current class as residual - nong <- which(!pheno_subtype$STATUS %in% g) - pheno_subtype$STATUS[nong] <- "nonpred" - ## sanity check - print(table(pheno_subtype$STATUS,useNA="always")) - resDir <- sprintf("%s/GM_results",pDir2) - ## query for feature selection comprises of training - ## samples from the class of interest - trainPred <- pheno_subtype$ID[ - which(pheno_subtype$STATUS %in% g)] - - # Cross validation - GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, - nrow(pheno_subtype),incNets=combList[[cur]], - verbose=T, numCores=numCores, - GMmemory=GMmemory) - - # patient similarity ranks - prank <- dir(path=resDir,pattern="PRANK$") - # network ranks - nrank <- dir(path=resDir,pattern="NRANK$") - cat(sprintf("Got %i prank files\n",length(prank))) - - # Compute network score - pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) - head(pTally) - # write to file - tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) - write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) - } - - ## Create the mega database with all patients and all nets. - ## This will be used to predict test samples by subsetting just for feature - ## selected nets in a given round - ## Note that this is useful for all train/test splits because we can always - ## change which samples are query and can always subset based on which nets - ## are feature selected in a given round. - netDir <- sprintf("%s/test_networks",megaDir) - nonclin <- setdiff(names(netSets_iter),"clinical") - netList <- makePSN_NamedMatrix(alldat, - rownames(alldat),netSets_iter[nonclin],netDir, - verbose=FALSE,numCores=numCores,writeProfiles=TRUE, - simMetric="pearson") - netList2 <- makePSN_NamedMatrix(alldat, - rownames(alldat),netSets_iter["clinical"], - netDir,simMetric="custom",customFunc=normDiff2, - writeProfiles=FALSE, - verbose=FALSE,numCores=numCores, - sparsify=TRUE,append=TRUE) - netList <- c(netList,netList2) - cat(sprintf("Total of %i nets\n", length(netList))) - - # now create database - megadbDir <- GM_createDB(netDir, pheno_all$ID, - megaDir,numCores=numCores, - simMetric="pearson") - - for (cutoff in 7:9) { - predRes <- list() - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - # get feature selected net names - pTally <- read.delim( - sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), - sep="\t",h=T,as.is=T) - - # feature selected nets pass cutoff threshold - pTally <- pTally[which(pTally[,2]>=cutoff),1] - cat(sprintf("%s: %i pathways\n",g,length(pTally))) - - if (length(pTally)>=1) { - curD <- sprintf("%s/cutoff%i",pDir2,cutoff) - dir.create(curD) - # query of all training samples for this class - qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & - pheno_all$TT_STATUS%in%"TRAIN")] - - qFile <- sprintf("%s/%s_query",curD,g) - # only include the nets that were feature selected - GM_writeQueryFile(qSamps,incNets=pTally, - nrow(pheno_all),qFile) - resFile <- runGeneMANIA(megadbDir$dbDir,qFile,resDir=curD) - predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), - pheno_all,g) - } else { - predRes[[g]] <- NA - } - } - oD <- sprintf("%s/cutoff%i",pDir,cutoff) - dir.create(oD) - outFile <- sprintf("%s/predictionResults.txt",oD) - if (any(is.na(predRes))) { - cat("One or more groups had zero feature selected nets\n") - cat("# no feature-selected nets.\n",file=outFile) - } else { - predClass <- GM_OneVAll_getClass(predRes) - out <- merge(x=pheno_all,y=predClass,by="ID") - write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) - - acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) - cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", - nrow(out), acc*100)) - - require(ROCR) - ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, - out$STATUS=="SURVIVEYES") - save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) - } - } -} - #cleanup to save disk space - system(sprintf("rm -r %s/dataset %s/tmp %s/networks", - outDir,outDir,outDir)) - system(sprintf("rm -r %s/dataset %s/networks", - outDir,outDir)) - -} - pheno_all$TT_STATUS <- NA - rng_t1 <- Sys.time() - cat(sprintf("Time for one train/test split:")) - print(rng_t1-rng_t0) - -}, error=function(ex){ - print(ex) -}, finally={ - sink(NULL) -}) diff --git a/misc/PanCancer/pruneVersion/OV_pruneTrain.R b/misc/PanCancer/pruneVersion/OV_pruneTrain.R deleted file mode 100644 index dca4872a..00000000 --- a/misc/PanCancer/pruneVersion/OV_pruneTrain.R +++ /dev/null @@ -1,340 +0,0 @@ -#' PanCancer binarized survival: KIRC: Feature selection with one net per -# datatype -#' 10-fold CV predictor design -rm(list=ls()) -require(netDx) -require(netDx.examples) - -numCores <- 8L -GMmemory <- 4L -trainProp <- 0.8 -cutoff <- 9 - -inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/input" -outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output" - -dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/pruneTrain_%s",outRoot,dt) - -# ---------------------------------------------------------------- -# helper functions - -# normalized difference -# x is vector of values, one per patient (e.g. ages) -normDiff <- function(x) { - #if (nrow(x)>=1) x <- x[1,] - nm <- colnames(x) - x <- as.numeric(x) - n <- length(x) - rngX <- max(x,na.rm=T)-min(x,na.rm=T) - - out <- matrix(NA,nrow=n,ncol=n); - # weight between i and j is - # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) - for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) - rownames(out) <- nm; colnames(out)<- nm - out -} - -# ----------------------------------------------------------- -# process input -inFiles <- list( - clinical=sprintf("%s/OV_clinical_core.txt",inDir), - survival=sprintf("%s/OV_binary_survival.txt",inDir) - ) -datFiles <- list( - rna=sprintf("%s/OV_mRNA_core.txt",inDir), - prot=sprintf("%s/OV_RPPA_core.txt",inDir), - mir=sprintf("%s/OV_miRNA_core.txt",inDir), - dnam=sprintf("%s/OV_methylation_core.txt",inDir), - cnv=sprintf("%s/OV_CNV_core.txt",inDir) -) - -pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) -colnames(pheno)[1] <- "ID" - -surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) -colnames(surv)[1:2] <- c("ID","STATUS_INT") -survStr <- rep(NA,nrow(surv)) -survStr[surv$STATUS_INT<1] <- "SURVIVENO" -survStr[surv$STATUS_INT>0] <- "SURVIVEYES" -surv$STATUS <- survStr -pheno <- merge(x=pheno,y=surv,by="ID") -pheno$X <- NULL -# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) -pheno_nosurv <- pheno[1:4] - -cat("Collecting patient data:\n") -dats <- list() #input data in different slots -cat("\t* Clinical\n") -clin <- pheno -rownames(clin) <- clin[,1]; -clin <- t(clin[,2,drop=FALSE]) -dats$clinical <- clin; rm(clin) - -# create master input net -for (nm in names(datFiles)) { - cat(sprintf("\t* %s\n",nm)) - tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) - if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] - rownames(tmp) <- tmp[,1] - tmp <- t(tmp[,-1]) - class(tmp) <- "numeric" - dats[[nm]] <- tmp -} - -cat("\t Ordering column names\n") -# include only data for patients in classifier -dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID), drop = FALSE]}) -dats <- lapply(dats, function(x) { - midx <- match(pheno$ID,colnames(x)) - x <- x[,midx, drop = FALSE] - x -}) - - -# confirm patient order the same for all input nets -pname <- colnames(dats[[1]]) -for (k in 2:length(dats)) { - if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { - cat(sprintf("Patient order doesn't match for %s\n", - names(dats)[k])) - browser() - } -} - -# input nets for each category -netSets <- lapply(dats, function(x) rownames(x)) - -# compile data -alldat <- do.call("rbind",dats) -pheno_all <- pheno - -combList <- list( - clinical="clinical_cont", - mir="mir.profile", - rna="rna.profile", - prot="prot.profile", - cnv="cnv.profile", - dnam="dnam.profile", - clinicalArna=c("clinical_cont","rna.profile"), - clinicalAmir=c("clinical_cont","mir.profile"), - clinicalAprot=c("clinical_cont","prot.profile"), - clinicalAdnam=c("clinical_cont","dnam.profile"), - clinicalAcnv=c("clinical_cont","cnv.profile"), - all="all") - -rm(pheno,pheno_nosurv) - - -# ---------------------------------------------------------- -# build classifier -if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) -dir.create(megaDir) - -logFile <- sprintf("%s/log.txt",megaDir) -sink(logFile,split=TRUE) -tryCatch({ - -#### ----------------------------------------------------- -### BEGIN PRUNING CODE -# apply pruning to proteomic data -curwd <- getwd() -setwd("..") -source("LMprune.R") -source("runLM.R") -source("silh.R") -require(cluster) -setwd(curwd) - -# first loop - over train/test splits -for (rngNum in 1:100) { - rng_t0 <- Sys.time() - cat(sprintf("-------------------------------\n")) - cat(sprintf("RNG seed = %i\n", rngNum)) - cat(sprintf("-------------------------------\n")) - outDir <- sprintf("%s/rng%i",megaDir,rngNum) - dir.create(outDir) - - pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, - setSeed=rngNum*5) - write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", - col=T,row=F,quote=F) - # -------------------------------------------- - # feature selection - train only - pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") - -# pruneTrain: ---- - dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), - drop=F]) - netSets_iter <- list() - for (nm in setdiff(names(dats_train),"clinical")) { - print(nm) - if (nrow(dats_train[[nm]])>10000) topVar <- 50 else topVar <- 100 - pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) - prune <- LMprune(dats_train[[nm]],pheno$STATUS,topVar=topVar) - dev.off() - netSets_iter[[nm]] <- rownames(tmp) - if (!is.na(prune)) { - if (prune$bestThresh < 0.9) { - res <- prune$res - res <- subset(res, adj.P.Val < prune$bestThresh) - tmp <- dats_train[[nm]];orig_ct <- nrow(tmp) - tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] - dats_train[[nm]] <- tmp - netSets_iter[[nm]] <- rownames(tmp) - cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) - cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) - } - } else { - cat(sprintf("%s: not pruning\n",nm)) - } - } - alldat_train <- do.call("rbind",dats_train) -netSets_iter[["clinical"]] <- netSets[["clinical"]] -## end pruning code -## ---- - netDir <- sprintf("%s/networks",outDir) - nonclin <- setdiff(names(netSets),"clinical") - - netList <- makePSN_NamedMatrix(alldat_train, - rownames(alldat_train),netSets_iter[nonclin],netDir, - verbose=FALSE,numCores=numCores,writeProfiles=TRUE) - - netList2 <- makePSN_NamedMatrix(alldat_train, - rownames(alldat_train),netSets_iter["clinical"], - netDir,simMetric="custom",customFunc=normDiff, - verbose=FALSE,numCores=numCores,writeProfiles=FALSE, - sparsify=TRUE,append=TRUE) - - netList <- c(netList,netList2) - cat(sprintf("Total of %i nets\n", length(netList))) - - # now create database - dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) - - # second loop - over combinations of input data - for (cur in names(combList)) { - t0 <- Sys.time() - cat(sprintf("%s\n",cur)) - pDir <- sprintf("%s/%s",outDir, cur) - dir.create(pDir) - - # run featsel once per subtype - subtypes <- unique(pheno$STATUS) - # run 10-fold cv per subtype - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) - dir.create(pDir2) - - cat(sprintf("\n******\nSubtype %s\n",g)) - pheno_subtype <- pheno - ## label patients not in the current class as residual - nong <- which(!pheno_subtype$STATUS %in% g) - pheno_subtype$STATUS[nong] <- "nonpred" - ## sanity check - print(table(pheno_subtype$STATUS,useNA="always")) - resDir <- sprintf("%s/GM_results",pDir2) - ## query for feature selection comprises of training - ## samples from the class of interest - trainPred <- pheno_subtype$ID[ - which(pheno_subtype$STATUS %in% g)] - - # Cross validation - GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, - nrow(pheno_subtype),incNets=combList[[cur]], - verbose=T, numCores=numCores, - GMmemory=GMmemory) - - # patient similarity ranks - prank <- dir(path=resDir,pattern="PRANK$") - # network ranks - nrank <- dir(path=resDir,pattern="NRANK$") - cat(sprintf("Got %i prank files\n",length(prank))) - - # Compute network score - pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) - head(pTally) - # write to file - tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) - write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) - } - - # -------- - # pruneTrain: make test database - test_netDir <- sprintf("%s/test_networks",outDir) - nonclin <- setdiff(names(netSets_iter),"clinical") - ### netSets_iter has univariate filtering for curr round - netList <- makePSN_NamedMatrix(alldat, - rownames(alldat),netSets_iter[nonclin],test_netDir, - verbose=FALSE,numCores=numCores,writeProfiles=TRUE) - netList2 <- makePSN_NamedMatrix(alldat, - rownames(alldat),netSets_iter["clinical"], - test_netDir,simMetric="custom",customFunc=normDiff, - verbose=FALSE,numCores=numCores, writeProfiles=FALSE, - sparsify=TRUE,append=TRUE) - netList <- c(netList,netList2) - cat(sprintf("Total of %i nets\n", length(netList))) - # now create database - testdbDir <- GM_createDB(test_netDir, pheno_all$ID, - outDir,numCores=numCores) - - predRes <- list() - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - # get feature selected net names - pTally <- read.delim( - sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), - sep="\t",h=T,as.is=T) - - # feature selected nets pass cutoff threshold - pTally <- pTally[which(pTally[,2]>=cutoff),1] - # pTally <- sub(".profile","",pTally) - # pTally <- sub("_cont","",pTally) - cat(sprintf("%s: %i pathways\n",g,length(pTally))) - - # query of all training samples for this class - qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & - pheno_all$TT_STATUS%in%"TRAIN")] - - qFile <- sprintf("%s/%s_query",pDir2,g) - GM_writeQueryFile(qSamps,incNets=pTally - ,nrow(pheno_all),qFile) - resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) - predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), - pheno_all,g) - } - - predClass <- GM_OneVAll_getClass(predRes) - out <- merge(x=pheno_all,y=predClass,by="ID") - outFile <- sprintf("%s/predictionResults.txt",pDir) - write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) - - acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) - cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", - nrow(out), acc*100)) - - require(ROCR) - ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, - out$STATUS=="SURVIVEYES") - save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) - } - - #cleanup to save disk space - system(sprintf("rm -r %s/dataset %s/tmp %s/networks", - outDir,outDir,outDir)) - system(sprintf("rm -r %s/dataset %s/networks", - outDir,outDir)) - -} - pheno_all$TT_STATUS <- NA - rng_t1 <- Sys.time() - cat(sprintf("Time for one train/test split:")) - print(rng_t1-rng_t0) - -}, error=function(ex){ - print(ex) -}, finally={ - sink(NULL) -}) From 9fdcd54daa211654aeab1cf16237acb7cfa66da7 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 24 Apr 2018 08:38:21 -0400 Subject: [PATCH 035/124] moved to dedicated folder --- .../pruneTrain/GBM_pruneTrained.R | 369 +++++++++++++++++ .../pruneVersion/pruneTrain/KIRC_pruneTrain.R | 358 ++++++++++++++++ .../pruneVersion/pruneTrain/LUSC_pruneTrain.R | 383 ++++++++++++++++++ .../pruneVersion/pruneTrain/OV_pruneTrain.R | 340 ++++++++++++++++ 4 files changed, 1450 insertions(+) create mode 100644 misc/PanCancer/pruneVersion/pruneTrain/GBM_pruneTrained.R create mode 100644 misc/PanCancer/pruneVersion/pruneTrain/KIRC_pruneTrain.R create mode 100644 misc/PanCancer/pruneVersion/pruneTrain/LUSC_pruneTrain.R create mode 100644 misc/PanCancer/pruneVersion/pruneTrain/OV_pruneTrain.R diff --git a/misc/PanCancer/pruneVersion/pruneTrain/GBM_pruneTrained.R b/misc/PanCancer/pruneVersion/pruneTrain/GBM_pruneTrained.R new file mode 100644 index 00000000..bef88fc6 --- /dev/null +++ b/misc/PanCancer/pruneVersion/pruneTrain/GBM_pruneTrained.R @@ -0,0 +1,369 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) +require(netDx) +require(netDx.examples) +source("../runLM.R") + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/pruneTrain_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +#### ----------------------------------------------------- +### BEGIN PRUNING CODE +# apply pruning to proteomic data +curwd <- getwd() +setwd("..") +source("LMprune.R") +source("runLM.R") +source("silh.R") +require(cluster) +setwd(curwd) + +# first loop - over train/test splits +for (rngNum in 1:100) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + for (nm in setdiff(names(dats),"clinical")) { + print(nm) + if (nrow(dats[[nm]])>10000) topVar <- 50 else topVar <- 100 + pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) + prune <- LMprune(dats_train[[nm]],pheno$STATUS,topVar=topVar) + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + dev.off() + if (!is.na(prune)) { + if (prune$bestThresh < 0.9) { + res <- prune$res + res <- subset(res, adj.P.Val < prune$bestThresh) + tmp <- dats_train[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] + dats_train[[nm]] <- tmp + netSets_iter[[nm]] <- rownames(tmp) + cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) + cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) + } + } else { + cat(sprintf("%s: not pruning\n",nm)) + } + } + alldat_train <- do.call("rbind",dats_train) + netSets_iter[["clinical"]] <- netSets[["clinical"]] + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } +## Create the mega database with all patients and all nets. +## This will be used to predict test samples by subsetting just for feature +## selected nets in a given round +## Note that this is useful for all train/test splits because we can always +## change which samples are query and can always subset based on which nets +## are feature selected in a given round. +netDir <- sprintf("%s/test_networks",outDir) +nonclin <- setdiff(names(netSets),"clinical") +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) +netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) +netList <- c(netList,netList2) +cat(sprintf("Total of %i nets\n", length(netList))) +# now create database +testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/pruneTrain/KIRC_pruneTrain.R b/misc/PanCancer/pruneVersion/pruneTrain/KIRC_pruneTrain.R new file mode 100644 index 00000000..d9be1b31 --- /dev/null +++ b/misc/PanCancer/pruneVersion/pruneTrain/KIRC_pruneTrain.R @@ -0,0 +1,358 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) +require(netDx) +require(netDx.examples) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +cutoff <- 9 + +inDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/input" +outRoot <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/pruneTrain_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/KIRC_clinical_core.txt",inDir), + survival=sprintf("%s/KIRC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/KIRC_mRNA_core.txt",inDir), + prot=sprintf("%s/KIRC_RPPA_core.txt",inDir), + mir=sprintf("%s/KIRC_miRNA_core.txt",inDir), + dnam=sprintf("%s/KIRC_methylation_core.txt",inDir), + cnv=sprintf("%s/KIRC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +#======transform clinical data========= +pheno$grade <- as.vector(pheno$grade) +pheno$grade[pheno$grade=="G1"] <- "G2" +pheno$grade[pheno$grade=="GX"] <- "G2" +pheno$grade <- as.factor(pheno$grade) +pheno <- pheno[, -which(colnames(pheno)=="gender")] +#====================================== + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +clinical$grade <- as.numeric(factor(clinical$grade)) +clinical$stage <- as.numeric(factor(clinical$stage)) +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) + + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ +# apply pruning to proteomic data +curwd <- getwd() +setwd("..") +source("LMprune.R") +source("runLM.R") +source("silh.R") +require(cluster) +setwd(curwd) + +# first loop - over train/test splits +for (rngNum in 1:100) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + + ## pruneTrain code ------ + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + netSets_iter <- list() + for (nm in setdiff(names(dats_train),"clinical")) { + print(nm) + if (nrow(dats_train[[nm]])>10000) topVar <- 50 else topVar <- 100 + pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) + prune <- LMprune(dats_train[[nm]],pheno$STATUS,topVar=topVar) + dev.off() + + netSets_iter[[nm]] <- rownames(tmp) + if (!is.na(prune)) { + if (prune$bestThresh < 0.9) { + res <- prune$res + res <- subset(res, adj.P.Val < prune$bestThresh) + tmp <- dats_train[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] + dats_train[[nm]] <- tmp + netSets_iter[[nm]] <- rownames(tmp) + cat(sprintf("%s: Pruning with cutoff %1.2f\n", + nm,prune$bestThresh)) + cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) + } + } else { + cat(sprintf("%s: not pruning\n",nm)) + } + } + alldat_train <- do.call("rbind",dats_train) + netSets_iter[["clinical"]] <- netSets[["clinical"]] +## pruneTrain code end + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2, + verbose=FALSE,numCores=numCores,writeProfiles=FALSE, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + ## pruneTrain: make test database + ## This will be used to predict test samples by subsetting just for feature + ## selected nets in a given round + ## Note that this is useful for all train/test splits because we can always + ## change which samples are query and can always subset based on which nets + ## are feature selected in a given round. + netDir <- sprintf("%s/test_networks",outDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(netDir, pheno_all$ID, + outDir,numCores=numCores) + + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",pDir2,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } + + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + outFile <- sprintf("%s/predictionResults.txt",pDir) + write.table(out,file=outFile,sep="\t",col=T,row=F, + quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/pruneTrain/LUSC_pruneTrain.R b/misc/PanCancer/pruneVersion/pruneTrain/LUSC_pruneTrain.R new file mode 100644 index 00000000..6f833e8b --- /dev/null +++ b/misc/PanCancer/pruneVersion/pruneTrain/LUSC_pruneTrain.R @@ -0,0 +1,383 @@ +#' PanCancer binarized survival: LUSC: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design + +rm(list=ls()) +require(netDx) +require(netDx.examples) +source("../runLM.R") + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/pruneTrain_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions + +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), + survival=sprintf("%s/LUSC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), + prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), + mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), + cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno +rownames(clinical) <- clinical[,1]; +# ======================= +# LUSC-specific variables +clinical$stage <- as.vector(clinical$stage) +clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" +clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" +clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" +clinical$stage <- as.factor(clinical$stage) +clinical <- clinical[, -which(colnames(clinical)=="gender")] +clinical <- t(clinical[,c("age","stage")]) +clinical[1,] <- as.integer(clinical[1,]) +clinical[2,] <- as.integer(as.factor(clinical[2,])) +class(clinical) <- "numeric" +# ======================= +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + if (nm == "rna") tmp <- log(tmp+1) + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ +# apply pruning to proteomic data +curwd <- getwd() +setwd("..") +source("LMprune.R") +source("runLM.R") +source("silh.R") +require(cluster) +setwd(curwd) + + +# first loop - over train/test splits +for (rngNum in 1:100) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + + netSets_iter <- list() + cat("Pruning\n") + for (nm in setdiff(names(dats_train),"clinical")) { + print(nm) + if (nrow(dats_train[[nm]])>10000) + topVar <- 50 else topVar <- 100 + #topVar <- 50 + pdf(sprintf("%s/%s_prune.pdf",outDir,nm)) + prune <- LMprune(dats_train[[nm]], + pheno$STATUS,topVar=topVar) + dev.off() + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + if (!is.na(prune)) { + if (prune$bestThresh < 0.9) { + res <- prune$res + res <- subset(res, adj.P.Val < prune$bestThresh) + tmp <- dats_train[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] + dats_train[[nm]] <- tmp + netSets_iter[[nm]] <- rownames(tmp) + cat(sprintf("%s: Pruning with cutoff %1.2f\n", + nm,prune$bestThresh)) + cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) + } + } else { + cat(sprintf("%s: not pruning\n",nm)) + } + } + alldat_train <- do.call("rbind",dats_train) + netSets_iter[["clinical"]] <- netSets[["clinical"]] +#### ---------------------------------------------------------- + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE,simMetric="pearson") + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores, + simMetric="pearson") + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("CombList = %s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + ## Create the mega database with all patients and all nets. + ## This will be used to predict test samples by subsetting just for feature + ## selected nets in a given round + ## Note that this is useful for all train/test splits because we can always + ## change which samples are query and can always subset based on which nets + ## are feature selected in a given round. + netDir <- sprintf("%s/test_networks",megaDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE, + simMetric="pearson") + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2, + writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + megadbDir <- GM_createDB(netDir, pheno_all$ID, + megaDir,numCores=numCores, + simMetric="pearson") + + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + # only include the nets that were feature selected + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(megadbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + } else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } +} + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/pruneTrain/OV_pruneTrain.R b/misc/PanCancer/pruneVersion/pruneTrain/OV_pruneTrain.R new file mode 100644 index 00000000..30640661 --- /dev/null +++ b/misc/PanCancer/pruneVersion/pruneTrain/OV_pruneTrain.R @@ -0,0 +1,340 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) +require(netDx) +require(netDx.examples) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +cutoff <- 9 + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/input" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/prune_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions + +# normalized difference +# x is vector of values, one per patient (e.g. ages) +normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/OV_clinical_core.txt",inDir), + survival=sprintf("%s/OV_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/OV_mRNA_core.txt",inDir), + prot=sprintf("%s/OV_RPPA_core.txt",inDir), + mir=sprintf("%s/OV_miRNA_core.txt",inDir), + dnam=sprintf("%s/OV_methylation_core.txt",inDir), + cnv=sprintf("%s/OV_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clin <- pheno +rownames(clin) <- clin[,1]; +clin <- t(clin[,2,drop=FALSE]) +dats$clinical <- clin; rm(clin) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID), drop = FALSE]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx, drop = FALSE] + x +}) + + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) + + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +#### ----------------------------------------------------- +### BEGIN PRUNING CODE +# apply pruning to proteomic data +curwd <- getwd() +setwd("../..") +source("LMprune.R") +source("runLM.R") +source("silh.R") +require(cluster) +setwd(curwd) + +# first loop - over train/test splits +for (rngNum in 1:15) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + +# pruneTrain: ---- + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + netSets_iter <- list() + for (nm in setdiff(names(dats_train),"clinical")) { + print(nm) + if (nrow(dats_train[[nm]])>10000) topVar <- 50 else topVar <- 100 + pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) + prune <- LMprune(dats_train[[nm]],pheno$STATUS,topVar=topVar) + dev.off() + netSets_iter[[nm]] <- rownames(tmp) + if (!is.na(prune)) { + if (prune$bestThresh < 0.9) { + res <- prune$res + res <- subset(res, adj.P.Val < prune$bestThresh) + tmp <- dats_train[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] + dats_train[[nm]] <- tmp + netSets_iter[[nm]] <- rownames(tmp) + cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) + cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) + } + } else { + cat(sprintf("%s: not pruning\n",nm)) + } + } + alldat_train <- do.call("rbind",dats_train) +netSets_iter[["clinical"]] <- netSets[["clinical"]] +## end pruning code +## ---- + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) + + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff, + verbose=FALSE,numCores=numCores,writeProfiles=FALSE, + sparsify=TRUE,append=TRUE) + + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + # -------- + # pruneTrain: make test database + test_netDir <- sprintf("%s/test_networks",outDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + ### netSets_iter has univariate filtering for curr round + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],test_netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + test_netDir,simMetric="custom",customFunc=normDiff, + verbose=FALSE,numCores=numCores, writeProfiles=FALSE, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(test_netDir, pheno_all$ID, + outDir,numCores=numCores) + + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + # pTally <- sub(".profile","",pTally) + # pTally <- sub("_cont","",pTally) + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",pDir2,g) + GM_writeQueryFile(qSamps,incNets=pTally + ,nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } + + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + outFile <- sprintf("%s/predictionResults.txt",pDir) + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) From bf26b24e8f3d94d647e348e6b7faa7be514d7290 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 24 Apr 2018 08:39:09 -0400 Subject: [PATCH 036/124] version with no univar filter --- misc/PanCancer/noPrune/GBM_noPrune.R | 335 +++++++++++++++++++++++++ misc/PanCancer/noPrune/KIRC_noPrune.R | 323 ++++++++++++++++++++++++ misc/PanCancer/noPrune/LUSC_noPrune.R | 346 ++++++++++++++++++++++++++ misc/PanCancer/noPrune/OV_noPrune.R | 303 ++++++++++++++++++++++ 4 files changed, 1307 insertions(+) create mode 100644 misc/PanCancer/noPrune/GBM_noPrune.R create mode 100644 misc/PanCancer/noPrune/KIRC_noPrune.R create mode 100644 misc/PanCancer/noPrune/LUSC_noPrune.R create mode 100644 misc/PanCancer/noPrune/OV_noPrune.R diff --git a/misc/PanCancer/noPrune/GBM_noPrune.R b/misc/PanCancer/noPrune/GBM_noPrune.R new file mode 100644 index 00000000..e9007efe --- /dev/null +++ b/misc/PanCancer/noPrune/GBM_noPrune.R @@ -0,0 +1,335 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) +require(netDx) +require(netDx.examples) +source("../runLM.R") + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/noPrune_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +# first loop - over train/test splits +for (rngNum in 1:15) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + alldat_train <- do.call("rbind",dats_train) + netSets_iter <- netSets + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } +## Create the mega database with all patients and all nets. +## This will be used to predict test samples by subsetting just for feature +## selected nets in a given round +## Note that this is useful for all train/test splits because we can always +## change which samples are query and can always subset based on which nets +## are feature selected in a given round. +netDir <- sprintf("%s/test_networks",outDir) +nonclin <- setdiff(names(netSets),"clinical") +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) +netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) +netList <- c(netList,netList2) +cat(sprintf("Total of %i nets\n", length(netList))) +# now create database +testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/noPrune/KIRC_noPrune.R b/misc/PanCancer/noPrune/KIRC_noPrune.R new file mode 100644 index 00000000..e0392ca5 --- /dev/null +++ b/misc/PanCancer/noPrune/KIRC_noPrune.R @@ -0,0 +1,323 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) +require(netDx) +require(netDx.examples) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +cutoff <- 9 + +inDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/input" +outRoot <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/noPrune_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/KIRC_clinical_core.txt",inDir), + survival=sprintf("%s/KIRC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/KIRC_mRNA_core.txt",inDir), + prot=sprintf("%s/KIRC_RPPA_core.txt",inDir), + mir=sprintf("%s/KIRC_miRNA_core.txt",inDir), + dnam=sprintf("%s/KIRC_methylation_core.txt",inDir), + cnv=sprintf("%s/KIRC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +#======transform clinical data========= +pheno$grade <- as.vector(pheno$grade) +pheno$grade[pheno$grade=="G1"] <- "G2" +pheno$grade[pheno$grade=="GX"] <- "G2" +pheno$grade <- as.factor(pheno$grade) +pheno <- pheno[, -which(colnames(pheno)=="gender")] +#====================================== + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +clinical$grade <- as.numeric(factor(clinical$grade)) +clinical$stage <- as.numeric(factor(clinical$stage)) +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) + + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +# first loop - over train/test splits +for (rngNum in 1:15) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + alldat_train <- do.call("rbind",dats_train) + netSets_iter <- netSets + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2, + verbose=FALSE,numCores=numCores,writeProfiles=FALSE, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + ## pruneTrain: make test database + ## This will be used to predict test samples by subsetting just for feature + ## selected nets in a given round + ## Note that this is useful for all train/test splits because we can always + ## change which samples are query and can always subset based on which nets + ## are feature selected in a given round. + netDir <- sprintf("%s/test_networks",outDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(netDir, pheno_all$ID, + outDir,numCores=numCores) + + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",pDir2,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } + + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + outFile <- sprintf("%s/predictionResults.txt",pDir) + write.table(out,file=outFile,sep="\t",col=T,row=F, + quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/noPrune/LUSC_noPrune.R b/misc/PanCancer/noPrune/LUSC_noPrune.R new file mode 100644 index 00000000..aa1fd85d --- /dev/null +++ b/misc/PanCancer/noPrune/LUSC_noPrune.R @@ -0,0 +1,346 @@ +#' PanCancer binarized survival: LUSC: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design + +rm(list=ls()) +require(netDx) +require(netDx.examples) +source("../runLM.R") + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/noPrune_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions + +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), + survival=sprintf("%s/LUSC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), + prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), + mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), + cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno +rownames(clinical) <- clinical[,1]; +# ======================= +# LUSC-specific variables +clinical$stage <- as.vector(clinical$stage) +clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" +clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" +clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" +clinical$stage <- as.factor(clinical$stage) +clinical <- clinical[, -which(colnames(clinical)=="gender")] +clinical <- t(clinical[,c("age","stage")]) +clinical[1,] <- as.integer(clinical[1,]) +clinical[2,] <- as.integer(as.factor(clinical[2,])) +class(clinical) <- "numeric" +# ======================= +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + if (nm == "rna") tmp <- log(tmp+1) + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + + +# first loop - over train/test splits +for (rngNum in 1:100) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + + alldat_train <- do.call("rbind",dats_train) + netSets_iter <- netSets + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE,simMetric="pearson") + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores, + simMetric="pearson") + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("CombList = %s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + ## Create the mega database with all patients and all nets. + ## This will be used to predict test samples by subsetting just for feature + ## selected nets in a given round + ## Note that this is useful for all train/test splits because we can always + ## change which samples are query and can always subset based on which nets + ## are feature selected in a given round. + netDir <- sprintf("%s/test_networks",megaDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE, + simMetric="pearson") + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2, + writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + megadbDir <- GM_createDB(netDir, pheno_all$ID, + megaDir,numCores=numCores, + simMetric="pearson") + + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + # only include the nets that were feature selected + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(megadbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + } else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } +} + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/noPrune/OV_noPrune.R b/misc/PanCancer/noPrune/OV_noPrune.R new file mode 100644 index 00000000..d363f176 --- /dev/null +++ b/misc/PanCancer/noPrune/OV_noPrune.R @@ -0,0 +1,303 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) +require(netDx) +require(netDx.examples) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +cutoff <- 9 + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/input" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/noPrune_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions + +# normalized difference +# x is vector of values, one per patient (e.g. ages) +normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/OV_clinical_core.txt",inDir), + survival=sprintf("%s/OV_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/OV_mRNA_core.txt",inDir), + prot=sprintf("%s/OV_RPPA_core.txt",inDir), + mir=sprintf("%s/OV_miRNA_core.txt",inDir), + dnam=sprintf("%s/OV_methylation_core.txt",inDir), + cnv=sprintf("%s/OV_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clin <- pheno +rownames(clin) <- clin[,1]; +clin <- t(clin[,2,drop=FALSE]) +dats$clinical <- clin; rm(clin) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID), drop = FALSE]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx, drop = FALSE] + x +}) + + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) + + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +# first loop - over train/test splits +for (rngNum in 1:15) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + alldat_train <- do.call("rbind",dats_train) + netSets_iter <- netSets + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) + + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff, + verbose=FALSE,numCores=numCores,writeProfiles=FALSE, + sparsify=TRUE,append=TRUE) + + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + # -------- + # pruneTrain: make test database + test_netDir <- sprintf("%s/test_networks",outDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + ### netSets_iter has univariate filtering for curr round + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],test_netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + test_netDir,simMetric="custom",customFunc=normDiff, + verbose=FALSE,numCores=numCores, writeProfiles=FALSE, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(test_netDir, pheno_all$ID, + outDir,numCores=numCores) + + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + # pTally <- sub(".profile","",pTally) + # pTally <- sub("_cont","",pTally) + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",pDir2,g) + GM_writeQueryFile(qSamps,incNets=pTally + ,nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } + + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + outFile <- sprintf("%s/predictionResults.txt",pDir) + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) From dec3d314cde5eded2b72c4fdf2fc3b25b2261962 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 24 Apr 2018 08:44:46 -0400 Subject: [PATCH 037/124] added log-transform switch --- misc/PanCancer/multiCutoff/GBM_parseMulti.R | 5 +- misc/PanCancer/multiCutoff/KIRC_parseMulti.R | 7 +- misc/PanCancer/multiCutoff/LUSC_parseMulti.R | 6 +- misc/PanCancer/multiCutoff/OV_parseMulti.R | 11 +- .../PanCancer/pruneVersion/GBM_pruneTrained.R | 369 ------------------ .../pruneVersion/corrFeatWithOutcome.R | 149 +++++++ misc/PanCancer/simFuns.R | 9 +- 7 files changed, 170 insertions(+), 386 deletions(-) delete mode 100644 misc/PanCancer/pruneVersion/GBM_pruneTrained.R create mode 100644 misc/PanCancer/pruneVersion/corrFeatWithOutcome.R diff --git a/misc/PanCancer/multiCutoff/GBM_parseMulti.R b/misc/PanCancer/multiCutoff/GBM_parseMulti.R index 78b32fb5..822fdc62 100644 --- a/misc/PanCancer/multiCutoff/GBM_parseMulti.R +++ b/misc/PanCancer/multiCutoff/GBM_parseMulti.R @@ -5,7 +5,8 @@ require(reshape2) #dataDir_each <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pruneClinRNA_alone_180125" -dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pruneTrain_180419" +#dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pruneTrain_180420" +dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pruneTrain_180420" #dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/PCA1net_180126" #dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/PCAmultinet_180126" @@ -25,7 +26,7 @@ for (settype in settypes) { ### dataDir <- dataDir_both ### else ### dataDir <- dataDir_each - rngDir <- paste(sprintf("%s/rng",dataDir), 1:43,sep="") + rngDir <- paste(sprintf("%s/rng",dataDir), 1:100,sep="") colctr <- 1 for (cutoff in 9) { diff --git a/misc/PanCancer/multiCutoff/KIRC_parseMulti.R b/misc/PanCancer/multiCutoff/KIRC_parseMulti.R index 429f434b..07a53369 100644 --- a/misc/PanCancer/multiCutoff/KIRC_parseMulti.R +++ b/misc/PanCancer/multiCutoff/KIRC_parseMulti.R @@ -3,7 +3,9 @@ rm(list=ls()) require(netDx) require(reshape2) -dataDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output/pruneTrain_180419" +#dataDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output/outdated/nestCV_170911" +dataDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output/lasso_180420" +#dataDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output/pruneTrain_180419" settypes <- c("clinical","mir","rna","prot","cnv","dnam", "clinicalArna","clinicalAmir","clinicalAprot","clinicalAdnam", @@ -22,7 +24,7 @@ for (settype in settypes) { ### dataDir <- dataDir_both ### else ### dataDir <- dataDir_each - rngDir <- paste(sprintf("%s/rng",dataDir), 1:50,sep="") + rngDir <- paste(sprintf("%s/rng",dataDir), 1:73,sep="") colctr <- 1 for (cutoff in 9) { @@ -51,6 +53,7 @@ for (cutoff in 9) { } ctr <- ctr+1 } +cat(sprintf("Base dir: %s\n", dirname(dataDir))) print(round(outmat,digits=2)) meds <- unlist(lapply(auc_set,median)) diff --git a/misc/PanCancer/multiCutoff/LUSC_parseMulti.R b/misc/PanCancer/multiCutoff/LUSC_parseMulti.R index 766c8f55..77e6b06b 100644 --- a/misc/PanCancer/multiCutoff/LUSC_parseMulti.R +++ b/misc/PanCancer/multiCutoff/LUSC_parseMulti.R @@ -3,7 +3,7 @@ rm(list=ls()) require(netDx) require(reshape2) -dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/prunedPearson_180212" +dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/ridge_180420" settypes <- c("clinical","mir","rna","prot","cnv", "clinicalArna","clinicalAmir","clinicalAprot","clinicalAcnv","all") @@ -21,10 +21,10 @@ for (settype in settypes) { ### dataDir <- dataDir_both ### else ### dataDir <- dataDir_each - rngDir <- paste(sprintf("%s/rng",dataDir), 1:93,sep="") + rngDir <- paste(sprintf("%s/rng",dataDir), 1:100,sep="") colctr <- 1 -for (cutoff in 9) { +for (cutoff in 7:9) { c7 <- sprintf("%s/%s/cutoff%i/predictionResults.txt", rngDir,settype,cutoff) torm <- c() diff --git a/misc/PanCancer/multiCutoff/OV_parseMulti.R b/misc/PanCancer/multiCutoff/OV_parseMulti.R index eeb15de2..e93322bc 100644 --- a/misc/PanCancer/multiCutoff/OV_parseMulti.R +++ b/misc/PanCancer/multiCutoff/OV_parseMulti.R @@ -3,8 +3,9 @@ rm(list=ls()) require(netDx) require(reshape2) -dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output/pruned_180206" +dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output/pruneTrain_180420" +maxRng <- 100 settypes <- c("clinical","mir","rna","prot","cnv","dnam", "clinicalArna","clinicalAmir","clinicalAprot","clinicalAdnam", "clinicalAcnv","all") @@ -22,7 +23,7 @@ for (settype in settypes) { ### dataDir <- dataDir_both ### else ### dataDir <- dataDir_each - rngDir <- paste(sprintf("%s/rng",dataDir), 1:14,sep="") +rngDir <- paste(sprintf("%s/rng",dataDir), 1:maxRng,sep="") colctr <- 1 for (cutoff in 9) { @@ -53,8 +54,12 @@ ctr <- ctr+1 } #auc_set <- auc_set[which(names(auc_set)%in% c("clinical","rna","clinicalArna","all"))] - +cat("-----------------\n") +cat(sprintf("OV: Base dir: %s\n", basename(dataDir))) +cat(sprintf("%i splits\n", length(auc_set[[1]]))) +cat("-----------------\n") print(round(outmat,digits=2)) +cat("-----------------\n") pdf("ov_auc.pdf",width=16,height=5); boxplot(auc_set,cex.axis=0.6,pars=list(boxwex=0.3)); abline(h=median(auc_set[["clinical"]])); diff --git a/misc/PanCancer/pruneVersion/GBM_pruneTrained.R b/misc/PanCancer/pruneVersion/GBM_pruneTrained.R deleted file mode 100644 index bef88fc6..00000000 --- a/misc/PanCancer/pruneVersion/GBM_pruneTrained.R +++ /dev/null @@ -1,369 +0,0 @@ -#' PanCancer binarized survival: GBM: Feature selection with one net per -#' datatype -#' 10-fold CV predictor design -#' multi cutoff evaluation -#' also pruning RNA before running - -rm(list=ls()) -require(netDx) -require(netDx.examples) -source("../runLM.R") - -numCores <- 8L -GMmemory <- 4L -trainProp <- 0.8 - -rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" -inDir <- sprintf("%s/input",rootDir) -outRoot <- sprintf("%s/output",rootDir) - -dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/pruneTrain_%s",outRoot,dt) - -# ---------------------------------------------------------------- -# helper functions -# takes average of normdiff of each row in x -normDiff2 <- function(x) { - # normalized difference - # x is vector of values, one per patient (e.g. ages) - normDiff <- function(x) { - #if (nrow(x)>=1) x <- x[1,] - nm <- colnames(x) - x <- as.numeric(x) - n <- length(x) - rngX <- max(x,na.rm=T)-min(x,na.rm=T) - - out <- matrix(NA,nrow=n,ncol=n); - # weight between i and j is - # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) - for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) - rownames(out) <- nm; colnames(out)<- nm - out - } - - sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) - for (k in 1:nrow(x)) { - tmp <- normDiff(x[k,,drop=FALSE]) - sim <- sim + tmp - rownames(sim) <- rownames(tmp) - colnames(sim) <- colnames(tmp) - } - sim <- sim/nrow(x) - sim -} - -# ----------------------------------------------------------- -# process input -inFiles <- list( - clinical=sprintf("%s/GBM_clinical_core.txt",inDir), - survival=sprintf("%s/GBM_binary_survival.txt",inDir) - ) -datFiles <- list( - rna=sprintf("%s/GBM_mRNA_core.txt",inDir), - mir=sprintf("%s/GBM_miRNA_core.txt",inDir), - dnam=sprintf("%s/GBM_methylation_core.txt",inDir), - cnv=sprintf("%s/GBM_CNV_core.txt",inDir) -) - -pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) -colnames(pheno)[1] <- "ID" -# ------------------ - -surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) -colnames(surv)[1:2] <- c("ID","STATUS_INT") -survStr <- rep(NA,nrow(surv)) -survStr[surv$STATUS_INT<1] <- "SURVIVENO" -survStr[surv$STATUS_INT>0] <- "SURVIVEYES" -surv$STATUS <- survStr -pheno <- merge(x=pheno,y=surv,by="ID") -pheno$X <- NULL -pheno_nosurv <- pheno[1:4] - -cat("Collecting patient data:\n") -dats <- list() #input data in different slots -cat("\t* Clinical\n") -clinical <- pheno_nosurv -rownames(clinical) <- clinical[,1]; -# ======================= -# GBM-specific variables -clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA -clinical$performance_score <- strtoi(clinical$performance_score) -clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) -# ======================= -clinical$ID <- NULL -clinical <- t(clinical) -dats$clinical <- clinical; rm(clinical) - -# create master input net -for (nm in names(datFiles)) { - cat(sprintf("\t* %s\n",nm)) - tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) - if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] - rownames(tmp) <- tmp[,1] - tmp <- t(tmp[,-1]) - class(tmp) <- "numeric" - dats[[nm]] <- tmp -} - -cat("\t Ordering column names\n") -# include only data for patients in classifier -dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) -dats <- lapply(dats, function(x) { - midx <- match(pheno$ID,colnames(x)) - x <- x[,midx] - x -}) - -# confirm patient order the same for all input nets -pname <- colnames(dats[[1]]) -for (k in 2:length(dats)) { - if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { - cat(sprintf("Patient order doesn't match for %s\n", - names(dats)[k])) - browser() - } -} - - -# input nets for each category -netSets <- lapply(dats, function(x) rownames(x)) - -# compile data -alldat <- do.call("rbind",dats) -pheno_all <- pheno - -combList <- list( - clinicalAcnv=c("clinical_cont","cnv.profile"), - clinical="clinical_cont", - mir="mir.profile", - rna="rna.profile", - cnv="cnv.profile", - dnam="dnam.profile", - clinicalArna=c("clinical_cont","rna.profile"), - clinicalAmir=c("clinical_cont","mir.profile"), - clinicalAdnam=c("clinical_cont","dnam.profile"), - all="all" -) - -cat(sprintf("Clinical variables are: { %s }\n", - paste(rownames(dats$clinical),sep=",",collapse=","))) -rm(pheno,pheno_nosurv) - -# ---------------------------------------------------------- -# build classifier -if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) -dir.create(megaDir) - -logFile <- sprintf("%s/log.txt",megaDir) -sink(logFile,split=TRUE) -tryCatch({ - -#### ----------------------------------------------------- -### BEGIN PRUNING CODE -# apply pruning to proteomic data -curwd <- getwd() -setwd("..") -source("LMprune.R") -source("runLM.R") -source("silh.R") -require(cluster) -setwd(curwd) - -# first loop - over train/test splits -for (rngNum in 1:100) { - rng_t0 <- Sys.time() - cat(sprintf("-------------------------------\n")) - cat(sprintf("RNG seed = %i\n", rngNum)) - cat(sprintf("-------------------------------\n")) - outDir <- sprintf("%s/rng%i",megaDir,rngNum) - dir.create(outDir) - - pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, - setSeed=rngNum*5) - write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", - col=T,row=F,quote=F) - # -------------------------------------------- - # feature selection - train only - pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") - dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), - drop=FALSE]) - netSets_iter <- list() - for (nm in setdiff(names(dats),"clinical")) { - print(nm) - if (nrow(dats[[nm]])>10000) topVar <- 50 else topVar <- 100 - pdf(sprintf("%s/%s_prune.pdf",megaDir,nm)) - prune <- LMprune(dats_train[[nm]],pheno$STATUS,topVar=topVar) - netSets_iter[[nm]] <- rownames(dats_train[[nm]]) - dev.off() - if (!is.na(prune)) { - if (prune$bestThresh < 0.9) { - res <- prune$res - res <- subset(res, adj.P.Val < prune$bestThresh) - tmp <- dats_train[[nm]];orig_ct <- nrow(tmp) - tmp <- tmp[which(rownames(tmp)%in% rownames(res)),] - dats_train[[nm]] <- tmp - netSets_iter[[nm]] <- rownames(tmp) - cat(sprintf("%s: Pruning with cutoff %1.2f\n", nm,prune$bestThresh)) - cat(sprintf("\t%i of %i left\n", nrow(tmp),orig_ct)) - } - } else { - cat(sprintf("%s: not pruning\n",nm)) - } - } - alldat_train <- do.call("rbind",dats_train) - netSets_iter[["clinical"]] <- netSets[["clinical"]] - - netDir <- sprintf("%s/networks",outDir) - nonclin <- setdiff(names(netSets),"clinical") - netList <- makePSN_NamedMatrix(alldat_train, - rownames(alldat_train),netSets_iter[nonclin], - netDir,verbose=FALSE,numCores=numCores, - writeProfiles=TRUE) - netList2 <- makePSN_NamedMatrix(alldat_train, - rownames(alldat_train),netSets_iter["clinical"], - netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, - verbose=FALSE,numCores=numCores, - sparsify=TRUE,append=TRUE) - netList <- c(netList,netList2) - cat(sprintf("Total of %i nets\n", length(netList))) - - # now create database - dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) - - # second loop - over combinations of input data - for (cur in names(combList)) { - t0 <- Sys.time() - cat(sprintf("%s\n",cur)) - pDir <- sprintf("%s/%s",outDir, cur) - dir.create(pDir) - - # run featsel once per subtype - subtypes <- unique(pheno$STATUS) - # run 10-fold cv per subtype - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) - dir.create(pDir2) - - cat(sprintf("\n******\nSubtype %s\n",g)) - pheno_subtype <- pheno - ## label patients not in the current class as residual - nong <- which(!pheno_subtype$STATUS %in% g) - pheno_subtype$STATUS[nong] <- "nonpred" - ## sanity check - print(table(pheno_subtype$STATUS,useNA="always")) - resDir <- sprintf("%s/GM_results",pDir2) - ## query for feature selection comprises of training - ## samples from the class of interest - trainPred <- pheno_subtype$ID[ - which(pheno_subtype$STATUS %in% g)] - - # Cross validation - GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, - nrow(pheno_subtype),incNets=combList[[cur]], - verbose=T, numCores=numCores, - GMmemory=GMmemory) - - # patient similarity ranks - prank <- dir(path=resDir,pattern="PRANK$") - # network ranks - nrank <- dir(path=resDir,pattern="NRANK$") - cat(sprintf("Got %i prank files\n",length(prank))) - - # Compute network score - pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) - head(pTally) - # write to file - tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) - write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) - } -## Create the mega database with all patients and all nets. -## This will be used to predict test samples by subsetting just for feature -## selected nets in a given round -## Note that this is useful for all train/test splits because we can always -## change which samples are query and can always subset based on which nets -## are feature selected in a given round. -netDir <- sprintf("%s/test_networks",outDir) -nonclin <- setdiff(names(netSets),"clinical") -netList <- makePSN_NamedMatrix(alldat, - rownames(alldat),netSets_iter[nonclin],netDir, - verbose=FALSE,numCores=numCores,writeProfiles=TRUE) -netList2 <- makePSN_NamedMatrix(alldat, - rownames(alldat),netSets_iter["clinical"], - netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, - verbose=FALSE,numCores=numCores, - sparsify=TRUE,append=TRUE) -netList <- c(netList,netList2) -cat(sprintf("Total of %i nets\n", length(netList))) -# now create database -testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) - for (cutoff in 7:9) { - predRes <- list() - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - # get feature selected net names - pTally <- read.delim( - sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), - sep="\t",h=T,as.is=T) - - # feature selected nets pass cutoff threshold - pTally <- pTally[which(pTally[,2]>=cutoff),1] - cat(sprintf("%s: %i pathways\n",g,length(pTally))) - if (length(pTally)>=1) { - curD <- sprintf("%s/cutoff%i",pDir2,cutoff) - dir.create(curD) - # query of all training samples for this class - qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & - pheno_all$TT_STATUS%in%"TRAIN")] - - qFile <- sprintf("%s/%s_query",curD,g) - GM_writeQueryFile(qSamps,incNets=pTally, - nrow(pheno_all),qFile) - resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) - predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), - pheno_all,g) - } else { - predRes[[g]] <- NA - } - } - - oD <- sprintf("%s/cutoff%i",pDir,cutoff) - dir.create(oD) - outFile <- sprintf("%s/predictionResults.txt",oD) - if (any(is.na(predRes))) { - cat("One or more groups had zero feature selected nets\n") - cat("# no feature-selected nets.\n",file=outFile) - }else { - predClass <- GM_OneVAll_getClass(predRes) - out <- merge(x=pheno_all,y=predClass,by="ID") - write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) - - acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) - cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", - nrow(out), acc*100)) - - require(ROCR) - ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, - out$STATUS=="SURVIVEYES") - save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) - } - } - } - - #cleanup to save disk space - system(sprintf("rm -r %s/dataset %s/tmp %s/networks", - outDir,outDir,outDir)) - system(sprintf("rm -r %s/dataset %s/networks", - outDir,outDir)) - -} - pheno_all$TT_STATUS <- NA - rng_t1 <- Sys.time() - cat(sprintf("Time for one train/test split:")) - print(rng_t1-rng_t0) - -}, error=function(ex){ - print(ex) -}, finally={ - sink(NULL) -}) diff --git a/misc/PanCancer/pruneVersion/corrFeatWithOutcome.R b/misc/PanCancer/pruneVersion/corrFeatWithOutcome.R new file mode 100644 index 00000000..3752c115 --- /dev/null +++ b/misc/PanCancer/pruneVersion/corrFeatWithOutcome.R @@ -0,0 +1,149 @@ +#' correlate feature with outcome +#' +#' @details Shows patient-level data for features of interest. For complex +#' features such as pathways, shows the patient-level PC projections, and +#' correlates these PC projects with outcome. It is a compact representation +#' of patient-level pathway activity. +#' @param pheno (data.frame) must have ID and STATUS +#' @param datList (list) keys are datatypes, and values are data.frames or +#' matrix with patients in columns and values in rows +#' @param groupList (list) unit groupings. keys are datatypes, values are lists +#' of unit groups. e.g. for pathway grouping in rna, groupList[["rna"]] would +#' be a list with pathway names as keys and genes as values. +#' @param inputNets (data.frame) contents of inputNets.txt. Two-column table +#' with column 1 having datatype and column 2 having net name. +#' @param selFeatures (char) Features to correlate. Typically these would +#' be high-scoring features from the predictor. Names should match column 2 +#' of inputNets +#' @param numPCs (integer) how many principal components to show data for +#' @param filePfx (char) prefix for output pdfs. +#' @import plotrix +#' @return No value. Side effect of creating plots corr +#' @export +corrFeatWithOutcome <- function(pheno, datList, groupList,inputNets, + selFeatures,numPCs=3,filePfx="corrFeat") { + + if (!class(pheno$STATUS)=="factor") pheno$STATUS <- factor(pheno$STATUS) + + resMat <- matrix(0, nrow=length(selFeatures), ncol=numPCs*2) + isDone <- rep(FALSE, length(selFeatures)) + rownames(resMat) <- selFeatures + #rownames(resMat) <- gsub("_"," ",sub(".profile$|_cont$","",selFeatures)) + #rownames(resMat) <- toTitleCase(rownames(resMat)) + plotList <- list();plotCtr <- 1 + + for (idx in 1:length(selFeatures)) { + curF <- selFeatures[idx] + print(curF) + netType <- inputNets[which(inputNets[,2]==curF),1] + myDat <- datList[[netType]] + mySet <- groupList[[netType]][[curF]] + dat <- myDat[mySet,] + dat <- na.omit(dat) + + dat <- dat[,colSums(is.na(dat))==0] + if (ncol(dat)>=3) { + pr <- prcomp(na.omit(t(dat))) + pr <- pr$x[,1:numPCs] + } else { + cat(sprintf("\t\t%s: Has < 3 values!!\n",curF)) + pr <- dat + } + + maxDim <- min(ncol(pr),numPCs) + numPCs <- maxDim + tmp <- data.frame(pr[,1:maxDim],STATUS=pheno$STATUS) + colnames(tmp)[1:maxDim] <- paste("PC",1:maxDim,sep="") + + combs <- combn(maxDim,2) + + for (k in 1:numPCs) { + y <- cor.test(pr[,k],as.integer(pheno$STATUS),method="spearman") + resMat[idx,k] <- y$estimate + resMat[idx,3+k] <- -log10(y$p.value) # y$p.value) + } + + for (k in 1:ncol(combs)) { + i <- combs[1,k]; j <- combs[2,k] + cat(sprintf("[%i %i]\n",i,j)) + + # draw decision boundary in automated manner + tmp2 <- tmp[,c("STATUS",sprintf("PC%i",i),sprintf("PC%i",j))] + colnames(tmp2) <- c("y","x1","x2") + mdl <- glm(y ~ ., data=tmp2,family=binomial) + slope <- coef(mdl)[2]/(-coef(mdl)[3]) + intercept <- coef(mdl)[1]/(-coef(mdl)[3]) + + showLeg_Flag <- (k == ncol(combs)) + if (ncol(combs) > 4) { + pt <- 0.8; lwd <- 1 ;cex <-5 + } else { + pt <- 2; lwd <- 2; cex <- 12 + } + + p <- ggplot(tmp,aes_string(x=sprintf("PC%i",i), + y=sprintf("PC%i",j))) + p <- p + geom_point(aes(colour=factor(STATUS)),alpha=0.6, + size=pt,show.legend=showLeg_Flag) + p <- p + geom_abline(intercept=intercept,slope=slope, + colour="gray50",lwd=lwd) + p <- p + ggtitle(sprintf("%s\ncor=%1.2f (p<%1.2e)", + rownames(resMat)[idx],resMat[idx,i],10^-resMat[idx,3+i])) + p <- p + theme(# legend.position="none", + axis.ticks=element_blank(), + axis.text=element_blank(), + plot.title=element_text(size=cex)) + + plotList[[plotCtr]] <- p + plotCtr <- plotCtr+1 + } +} + +# now plot the PC projections with colour-coded status + +nr <- 3; nc <- choose(numPCs,2) +pdf(sprintf("%s_PCview.pdf", filePfx),height=11,width=11) +tryCatch({ + for (sidx in seq(1,length(plotList),nr*nc)) { + eidx <- sidx+((nr*nc)-1); + cat(sprintf("%i-%i\n",sidx,eidx)) + if (eidx>length(plotList)) eidx <- length(plotList) + multiplot(plotlist=plotList[sidx:eidx], + layout=matrix(1:(nr*nc),ncol=nc,byrow=TRUE)) +} +},error=function(ex) {print(ex)},finally={dev.off()}) + +pdf(sprintf("%s_PCtable.pdf",filePfx),height=11,width=11) +tryCatch({ +#colnames(resMat)[1:3] <- paste("PC ",1:3,sep="") +par(mar = c(0.5, 35, 6.5, 0.5)) + plotrix::color2D.matplot(resMat[,1:3],show.values=TRUE,axes=F, + xlab="",ylab="",vcex=2,vcol='black', + cs1=c(1,1,0),cs2=c(0,1,0),cs3=c(0,1,1)) + axis(3,at=seq_len(3)-0.5,labels=colnames(resMat)[4:6], + tick=F,cex.axis=1,line=-1) + axis(2,at=seq_len(nrow(resMat))-0.5, + labels=sub(".profile","", + sub("_cont","",rev(rownames(resMat)))),tick=F, + las=1,cex.axis=1) +},error=function(ex){print(ex)},finally={dev.off()}) + +} + +toTitleCase <- function(str) { + str <- tolower(str) + sp <- gregexpr(" ",str) + str2 <- sapply(1:length(str), function(i) { + z <- str[i] + z <- paste(toupper(substr(z,1,1)),substr(z,2,nchar(z)),sep="") + if (!sp[[i]][1]==-1) { + for (idx in sp[[i]]) { + z <- gsub(paste("^(.{",idx,"}).",sep=""), + paste("\\1",toupper(substr(z,idx+1,idx+1)),sep=""),z); + } + } + z + }) + str2 <- unlist(str2) + str2 +} diff --git a/misc/PanCancer/simFuns.R b/misc/PanCancer/simFuns.R index e3599466..b12c4a47 100644 --- a/misc/PanCancer/simFuns.R +++ b/misc/PanCancer/simFuns.R @@ -46,12 +46,6 @@ sim.cos <- function(m) { return(out) } -#' similarity from distance-based measures -sim.dist <- function(m,d="euclidean") { - m <- na.omit(t(m)) - out <- 1/(1+as.matrix(dist(m,method=d))) - out -} # normalized difference # x is vector of values, one per patient (e.g. ages) @@ -105,7 +99,8 @@ plotrix::color2D.matplot(x,xrange=c(0,1), #' @param s1 (matrix) similarity matrix. If 3-column table provided, assumes #' it's a SIF #' @param c1,c2 (char) vector of patients in each of the two groups -plotSim <- function(s1,name="simfun",c1,c2) { +plotSim <- function(s1,name="simfun",c1,c2,logT=FALSE) { + if (logT) s1 <- log10(s1+.Machine$double.eps) if (ncol(s1) == 3) { cat("assuming SIF provided\n") colnames(s1) <- c("Var1","Var2","value") From 66ba4b2b2977152e078a3503e2feb81a850cc990 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 24 Apr 2018 08:45:12 -0400 Subject: [PATCH 038/124] changed num cores --- misc/Asthma_PBMC/netDx.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/misc/Asthma_PBMC/netDx.R b/misc/Asthma_PBMC/netDx.R index 03ace33f..9e65d441 100644 --- a/misc/Asthma_PBMC/netDx.R +++ b/misc/Asthma_PBMC/netDx.R @@ -66,4 +66,4 @@ runPredictor_nestedCV(pheno, dataList=dats,groupList=gps, makeNetFunc=makeNets, ### custom network creation function outDir=sprintf("%s/pred",megaDir), - numCores=2L,nFoldCV=10L, CVcutoff=9L,numSplits=100L,CVmemory=13L) + numCores=4L,nFoldCV=10L, CVcutoff=9L,numSplits=100L,CVmemory=13L) From d46937b0b0542ce1c42c33861ee02e4ba7ed616c Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 24 Apr 2018 08:46:47 -0400 Subject: [PATCH 039/124] netDx with increasing missingness --- misc/BRCA/BRCA_missing_perf_multi_run.r | 182 ++++++++++++++++++++++++ 1 file changed, 182 insertions(+) create mode 100644 misc/BRCA/BRCA_missing_perf_multi_run.r diff --git a/misc/BRCA/BRCA_missing_perf_multi_run.r b/misc/BRCA/BRCA_missing_perf_multi_run.r new file mode 100644 index 00000000..b1a8ab18 --- /dev/null +++ b/misc/BRCA/BRCA_missing_perf_multi_run.r @@ -0,0 +1,182 @@ +# assess performance of BRCA classifier for different levels of missing +# data +rm(list=ls()) +require(netDx) +require(RColorBrewer) + +rngRange <- 1:13 # number of train/test split iterations run + +inDir <- "/mnt/data2/BaderLab/TCGA_BRCA/output/msng_170213" +inFull <- "/mnt/data2/BaderLab/TCGA_BRCA/output/xpr_170213" + +combSet <- paste("miss",c(0,10,50,70,85,90,95,99),sep="") +cols <- c(brewer.pal(n=length(combSet),name="Blues"), "darkblue") + +cat(sprintf("Got %i combs\n", length(combSet))) + +all_overall_acc <- list() +all_tot <- list() +all_f1 <- list() +all_acc <- list() +all_ppv <- list() +for (rngSeed in rngRange) { + #cat(sprintf("RNG %i\n",rngSeed)) + out <- list() + overall_acc <- numeric() + pctMiss <- numeric() + numInt <- numeric() + curRoc <- list() + + for (cur in combSet) { + + if (cur == "miss0") { + inf <- sprintf("%s/rng%i/predictionResults.txt", + inFull,rngSeed) + } else { + inf <- sprintf("%s/rng%i/%s/predictionResults.txt", + inDir,rngSeed,cur) + } + # cat(sprintf("\t%s ", cur)) + dat <- read.delim(inf,sep="\t",h=T,as.is=T) + dat <- dat[-which(dat$STATUS %in% "Normal"),] + out[[cur]] <- perfCalc_multiClass(dat$STATUS,dat$PRED_CLASS)*100 + overall_acc <- c(overall_acc, + sum(dat$STATUS==dat$PRED_CLASS)/nrow(dat)*100) + } + #cat("\n") + names(overall_acc) <- combSet + + tot <- unlist(lapply(out,function(x) sum(x[1,1:4])/100)) + f1 <- unlist(lapply(out, function(x) x[nrow(x),7])) + acc <- unlist(lapply(out, function(x) x[nrow(x),8])) + ppv <- unlist(lapply(out, function(x) x[nrow(x),5])) + + all_tot[[rngSeed]] <- tot + all_f1[[rngSeed]] <- f1 + all_acc[[rngSeed]] <- acc + all_ppv[[rngSeed]] <- ppv + all_overall_acc[[rngSeed]] <- overall_acc + +} + +overall_avg_f1 <- c() +overall_avg_tot <- c() +overall_avg_acc <- c() +overall_avg_ppv <- c() +overall_avg_overall_acc <- c() + + +overall_sd_f1 <- c() +overall_sd_tot <- c() +overall_sd_acc <- c() +overall_sd_ppv <- c() +overall_sd_overall_acc <- c() + +for (index in seq(1:length(combSet))) { + avg_f1 <- c() + avg_tot <- c() + avg_acc <- c() + avg_ppv <- c() + avg_overall_acc <- c() + for (rngSeed in rngRange) { + avg_f1 <- c(avg_f1,all_f1[[rngSeed]][[index]]) + avg_tot <- c(avg_tot,all_tot[[rngSeed]][[index]]) + avg_acc <- c(avg_acc,all_acc[[rngSeed]][[index]]) + avg_ppv <- c(avg_ppv,all_ppv[[rngSeed]][[index]]) + avg_overall_acc <- c(avg_overall_acc,all_overall_acc[[rngSeed]][[index]]) + } + current <- combSet[index] + + current_f1 <- paste(current, '.f1', sep = '') + current_tot <- current + current_acc <- paste(current, '.acc', sep = '') + current_ppv <- paste(current, '.ppv', sep = '') + current_overall_acc <- current + + overall_avg_f1[current_f1] <- mean(avg_f1) + overall_avg_tot[current_tot] <- mean(avg_tot) + overall_avg_acc[current_acc] <- mean(avg_acc) + overall_avg_ppv[current_ppv] <- mean(avg_ppv) + overall_avg_overall_acc[current_overall_acc] <- mean(avg_overall_acc) + + overall_sd_f1[current_f1] <- sd(avg_f1) + overall_sd_tot[current_tot] <- sd(avg_tot) + overall_sd_acc[current_acc] <- sd(avg_acc) + overall_sd_ppv[current_ppv] <- sd(avg_ppv) + overall_sd_overall_acc[current_overall_acc] <- sd(avg_overall_acc) +} + + +pdf("BRCA_missing.pdf",width=8,height=4) +tryCatch({ + +# mean pairwise F1 +f1_plot <- barplot(overall_avg_f1, + main=sprintf("mean F1 (N=%i)",length(rngRange)), + col=cols,ylab="F1",ylim=c(0,100),las =2, cex.names=0.75) +bottom_f1 <- overall_avg_f1 - overall_sd_f1 +top_f1 <- overall_avg_f1 + overall_sd_f1 +names(bottom_f1) <- NULL +names(top_f1) <- NULL +arrows(f1_plot, bottom_f1, f1_plot, + top_f1, lwd = 1.5, angle = 90, + code = 3, length = 0.05) + +# mean pairwise accuracy +acc_plot <- barplot(overall_avg_acc, + main=sprintf("mean accuracy (N=%i)",length(rngRange)), + col=cols,ylab="accuracy",ylim=c(0,100),las =2, cex.names=0.75) +abline(h=25,col='red',lwd=2,lty=2) +abline(h=overall_avg_acc[1],col='grey50',lwd=2) +bottom_acc <- overall_avg_acc - overall_sd_acc +top_acc <- overall_avg_acc + overall_sd_acc +names(bottom_acc) <- NULL +names(top_acc) <- NULL +arrows(acc_plot, bottom_acc, acc_plot, + top_acc, lwd = 1.5, angle = 90, + code = 3, length = 0.05) + +#ppv +ppv_plot <- barplot(overall_avg_ppv, + main=sprintf("mean PPV (N=%i)",length(rngRange)), + col=cols,ylab="PPV",ylim=c(0,100),las =2, cex.names=0.75) +abline(h=50,col='red',lwd=2,lty=2) +abline(h=overall_avg_ppv[1],col='grey50',lwd=2) +bottom_ppv<- overall_avg_ppv - overall_sd_ppv +top_ppv <- overall_avg_ppv + overall_sd_ppv +names(bottom_ppv) <- NULL +names(top_ppv) <- NULL +arrows(ppv_plot, bottom_ppv, ppv_plot, + top_ppv, lwd = 1.5, angle = 90, + code = 3, length = 0.05) + +# plot overall accuracy +overall_acc_plot <- barplot(overall_avg_overall_acc, + main=sprintf("Accuracy (N=%i)",length(rngRange)), + col=cols,ylab="accuracy",ylim=c(0,100),las =2, cex.names=0.75) +abline(h=25,col='red',lwd=2,lty=2) +abline(h=overall_avg_overall_acc[1],col='grey50',lwd=2) +bottom_oall<- overall_avg_overall_acc - overall_sd_overall_acc +top_oall <- overall_avg_overall_acc + overall_sd_overall_acc +names(bottom_oall) <- NULL +names(top_oall) <- NULL +arrows(overall_acc_plot, bottom_oall, overall_acc_plot, + top_oall, lwd = 1.5, angle = 90, + code = 3, length = 0.05) + + +barplot(overall_avg_tot,main="total classified", las = 2, cex.names=0.75) +# names(numInt) <- names(tot) +# barplot(numInt/1000,main="# interactions (x1000)") +# names(pctMiss) <- names(tot) +# barplot(pctMiss*100,main="% missing in profile",ylim=c(0,100)) + +df <- do.call("rbind",all_overall_acc) +boxplot(df,ylab="% accuracy",main=sprintf("%% Overall accuracy (N=%i)", + nrow(df)),col=cols,pars=list(boxwex=0.4),bty='n',ylim=c(0,100),las=1) +abline(h=25,col='red',lty=3,lwd=2) +},error=function(ex){ + print(ex) +},finally={ + dev.off() +}) From 84a5d6434259c985e69cc2315ba3114e0d36c920 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 24 Apr 2018 09:51:52 -0400 Subject: [PATCH 040/124] not needed --- misc/PanCancer/multiCutoff/LUSC_parseMulti2.R | 60 ------------------- 1 file changed, 60 deletions(-) delete mode 100644 misc/PanCancer/multiCutoff/LUSC_parseMulti2.R diff --git a/misc/PanCancer/multiCutoff/LUSC_parseMulti2.R b/misc/PanCancer/multiCutoff/LUSC_parseMulti2.R deleted file mode 100644 index 6d8598aa..00000000 --- a/misc/PanCancer/multiCutoff/LUSC_parseMulti2.R +++ /dev/null @@ -1,60 +0,0 @@ -#' plot LUSC results with multiple CV cutoffs -rm(list=ls()) -require(netDx) -require(reshape2) - -#dataDir_each <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pruneClinRNA_alone_180125" - -#dataDir_both <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/prune_180124" -#dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/PCA1net_180126" -#dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/pruneClinFix_180126" -dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/pruneCheckIntegr_180126" - -outmat <- matrix(NA,nrow=3,ncol=9) -settypes <- c("clinical","prot","clinicalAprot") -meas <- paste(rep(7:9,each=3),c("auroc","aupr","accuracy"),sep="_") -rownames(outmat)<- settypes -colnames(outmat) <- meas -ctr <- 1 -outD <- sprintf("LUSC_%s",basename(dataDir)) -if (!file.exists(outD)) dir.create(outD) - -for (settype in settypes) { -### if (settype %in% "clinicalArna") -### dataDir <- dataDir_both -### else -### dataDir <- dataDir_each - rngDir <- paste(sprintf("%s/rng",dataDir), 1:10,sep="") - -colctr <- 1 -for (cutoff in 7:9) { - c7 <- sprintf("%s/%s/cutoff%i/predictionResults.txt", - rngDir,settype,cutoff) - torm <- c() - for (idx in 1:length(c7)) { - dat <- read.delim(c7[idx],sep="\t",h=T,as.is=T) - x1 <- sum(dat$STATUS=="SURVIVEYES") - x2 <- sum(dat$STATUS=="SURVIVENO") - if (x1<1 & x2<1) torm <- c(torm, idx) - } - cat(sprintf("%i: removing %i\n", cutoff,length(torm))) - if (length(torm)>0) c7 <- c7[-torm] - postscript(sprintf("%s/%s_cutoff%i.eps",outD,settype,cutoff)); - x <- plotPerf(c7,c("SURVIVEYES","SURVIVENO")) - dev.off() - - y1 <- unlist(lapply(x,function(i) i$auroc)) - y2 <- unlist(lapply(x,function(i) i$aupr)) - y3 <- unlist(lapply(x,function(i) i$accuracy)) - outmat[ctr,colctr+(0:2)] <- c(mean(y1),mean(y2),mean(y3)) - - colctr <- colctr+3 -} -ctr <- ctr+1 -} -print(round(outmat,digits=2)) - -write.table(outmat,file=sprintf("%s/perf.txt",outD),sep="\t", - col=T,row=T,quote=F) - - From 941d99743459fd9fb71413562553c7f521042df2 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 26 Apr 2018 17:07:37 -0400 Subject: [PATCH 041/124] uses lasso + gene-level nets. not_latest didn't include -ve weighted nets --- .../lasso/GBM_lasso_genes_incClin.R | 434 +++++++++++++++++ misc/PanCancer/pruneVersion/lasso/GBM_ridge.R | 8 +- .../PanCancer/pruneVersion/lasso/KIRC_ridge.R | 6 +- .../pruneVersion/lasso/LUSC_lasso_genes.R | 442 ++++++++++++++++++ .../lasso/not_latest/GBM_lasso_genes.R | 427 +++++++++++++++++ .../not_latest/GBM_lasso_genes_sparsify1.R | 429 +++++++++++++++++ 6 files changed, 1739 insertions(+), 7 deletions(-) create mode 100644 misc/PanCancer/pruneVersion/lasso/GBM_lasso_genes_incClin.R create mode 100644 misc/PanCancer/pruneVersion/lasso/LUSC_lasso_genes.R create mode 100644 misc/PanCancer/pruneVersion/lasso/not_latest/GBM_lasso_genes.R create mode 100644 misc/PanCancer/pruneVersion/lasso/not_latest/GBM_lasso_genes_sparsify1.R diff --git a/misc/PanCancer/pruneVersion/lasso/GBM_lasso_genes_incClin.R b/misc/PanCancer/pruneVersion/lasso/GBM_lasso_genes_incClin.R new file mode 100644 index 00000000..9d9452bd --- /dev/null +++ b/misc/PanCancer/pruneVersion/lasso/GBM_lasso_genes_incClin.R @@ -0,0 +1,434 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/lassoGenes_incClin_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# x is vector of values, one per patient (e.g. ages) +normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out +} + +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ +# first loop - over train/test splits +mega_combList <- combList # changes each round +for (rngNum in 1:20) { + combList <- mega_combList # clean slate + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in names(dats_train)) { + print(nm) + # run lasso with cv + fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=1) + # pick lambda that minimizes MSE + wt <- abs(coef(fit,s="lambda.min")[,1]) + vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { + netSets_iter[[k]] <- k + } + combList[[nm]] <- paste(sprintf("%s_cont", rownames(tmp))) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + } + } + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + +# ------------------------------- +# make train db + netDir <- sprintf("%s/networks",outDir) + nonclin <- names(netSets_iter) + + netLen <- unlist(lapply(netSets_iter,length)) + multiNet <- intersect(nonclin, names(netSets_iter[netLen>1])) + multiNet <- setdiff(multiNet,"clinical") + singNet <- intersect(nonclin, names(netSets_iter[netLen==1])) + +netList3 <- c() +netList2 <- c() +netList <- c() + + if (length(singNet)>0) { + cat(sprintf("%i: %i single nets { %s }\n", rngNum, length(singNet), + paste(singNet,collapse=","))) + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[singNet],netDir, + simMetric="custom",customFunc=normDiff,writeProfiles=FALSE,sparsify=TRUE, + useSparsify2=FALSE, + verbose=FALSE,numCores=numCores) + } + if (length(multiNet)>0) { + cat(sprintf("%i: %i multi nets { %s }\n", rngNum, length(multiNet), + paste(multiNet,collapse=","))) + netList3 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[multiNet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) + } + if ("clinical" %in% names(netSets_iter)) { + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + } + netList <- c(netList,netList2,netList3) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + +# ------------------------------------- +# make test db + +netDir <- sprintf("%s/test_networks",outDir) +nonclin <- names(netSets_iter) #setdiff(names(netSets_iter),"clinical") + +netLen <- unlist(lapply(netSets_iter,length)) +multiNet <- intersect(nonclin, names(netSets_iter[netLen>1])) +multiNet <- setdiff(multiNet,"clinical") +singNet <- intersect(nonclin, names(netSets_iter[netLen==1])) + +netList3 <- c() +netList2 <- c() +netList <- c() + +if (length(singNet)>0) { + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[singNet],netDir, + simMetric="custom",customFunc=normDiff,writeProfiles=FALSE,sparsify=TRUE, + useSparsify2=FALSE, + verbose=FALSE,numCores=numCores) +} +if (length(multiNet)>0){ + cat(sprintf("%i: %i multi nets { %s }\n", rngNum, length(multiNet), + paste(multiNet,collapse=","))) + netList3 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[multiNet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) +} +if ("clinical" %in% names(netSets_iter)) { + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) +} +netList <- c(netList,netList2,netList3) +cat(sprintf("Total of %i nets\n", length(netList))) +# now create database +testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) + +# ------------------------------------- + + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/lasso/GBM_ridge.R b/misc/PanCancer/pruneVersion/lasso/GBM_ridge.R index 6278a991..32a7d242 100644 --- a/misc/PanCancer/pruneVersion/lasso/GBM_ridge.R +++ b/misc/PanCancer/pruneVersion/lasso/GBM_ridge.R @@ -18,7 +18,7 @@ inDir <- sprintf("%s/input",rootDir) outRoot <- sprintf("%s/output",rootDir) dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/pruneTrain_%s",outRoot,dt) +megaDir <- sprintf("%s/ridge_AbsFix_%s",outRoot,dt) # ---------------------------------------------------------------- # helper functions @@ -160,7 +160,7 @@ tryCatch({ # first loop - over train/test splits -for (rngNum in 1:100) { +for (rngNum in 1:20) { rng_t0 <- Sys.time() cat(sprintf("-------------------------------\n")) cat(sprintf("RNG seed = %i\n", rngNum)) @@ -187,8 +187,8 @@ for (rngNum in 1:100) { fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), y=factor(pheno$STATUS), family="binomial", alpha=0) # pick lambda that minimizes MSE - wt <- coef(fit,s="lambda.min")[,1] - vars <- names(wt)[which(wt>0)] + wt <- abs(coef(fit,s="lambda.min")[,1]) + vars <- names(wt)[which(wt>.Machine$double.eps)] if (length(vars) < 6) {# don't compute Pearson,just use all cat(sprintf("rngNum %i: %s: <6 (%i):just use all\n", rngNum,nm,length(vars))) diff --git a/misc/PanCancer/pruneVersion/lasso/KIRC_ridge.R b/misc/PanCancer/pruneVersion/lasso/KIRC_ridge.R index c8bf2625..d0913733 100644 --- a/misc/PanCancer/pruneVersion/lasso/KIRC_ridge.R +++ b/misc/PanCancer/pruneVersion/lasso/KIRC_ridge.R @@ -15,7 +15,7 @@ inDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/input" outRoot <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/ridge_%s",outRoot,dt) +megaDir <- sprintf("%s/ridgeAbsFix_%s",outRoot,dt) # ---------------------------------------------------------------- # helper functions @@ -189,8 +189,8 @@ for (rngNum in 1:100) { fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), y=factor(pheno$STATUS), family="binomial", alpha=0) # pick lambda that minimizes MSE - wt <- coef(fit,s="lambda.min")[,1] - vars <- names(wt)[which(wt>0)] + wt <- abs(coef(fit,s="lambda.min")[,1]) + vars <- names(wt)[which(wt>.Machine$double.eps)] if (length(vars) < 6) {# don't compute Pearson,just use all cat(sprintf("rngNum %i: %s: <6 (%i):just use all\n", rngNum,nm,length(vars))) diff --git a/misc/PanCancer/pruneVersion/lasso/LUSC_lasso_genes.R b/misc/PanCancer/pruneVersion/lasso/LUSC_lasso_genes.R new file mode 100644 index 00000000..dea140eb --- /dev/null +++ b/misc/PanCancer/pruneVersion/lasso/LUSC_lasso_genes.R @@ -0,0 +1,442 @@ +#' PanCancer binarized survival: LUSC: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design + +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/lassoGenes_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# x is vector of values, one per patient (e.g. ages) +normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out +} + +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), + survival=sprintf("%s/LUSC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), + prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), + mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), + cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno +rownames(clinical) <- clinical[,1]; +# ======================= +# LUSC-specific variables +clinical$stage <- as.vector(clinical$stage) +clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" +clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" +clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" +clinical$stage <- as.factor(clinical$stage) +clinical <- clinical[, -which(colnames(clinical)=="gender")] +clinical <- t(clinical[,c("age","stage")]) +clinical[1,] <- as.integer(clinical[1,]) +clinical[2,] <- as.integer(as.factor(clinical[2,])) +class(clinical) <- "numeric" +# ======================= +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + if (nm == "rna") tmp <- log(tmp+1) + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +mega_combList <- combList # this will change in each round + +# first loop - over train/test splits +for (rngNum in 3:20) { + combList <- mega_combList # clean slate + + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + ## pruneTrain code ------ + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in setdiff(names(dats_train),"clinical")) { + print(nm) + # run lasso with cv + fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=1) + # pick lambda that minimizes MSE + wt <- abs(coef(fit,s="lambda.min")[,1]) + vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { + netSets_iter[[k]] <- k + } + combList[[nm]] <- paste(sprintf("%s_cont", rownames(tmp))) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + } + } + + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + combList[["clinicalAprot"]] <- c(combList[["clinical"]],combList[["prot"]]) + + # END lasso UF + # ---------------------- + cat("add combList changes\n") + + alldat_train <- do.call("rbind",dats_train) + netSets_iter[["clinical"]] <- netSets[["clinical"]] + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + +# ------------------------------- +# make train db + netLen <- unlist(lapply(netSets_iter,length)) + multiNet <- intersect(nonclin, names(netSets_iter[netLen>1])) + singNet <- intersect(nonclin, names(netSets_iter[netLen==1])) +netList3 <- c() +netList2 <- c() +netList <- c() + + if (length(singNet)>0) { + cat(sprintf("%i: %i single nets { %s }\n", rngNum, length(singNet), + paste(singNet,collapse=","))) + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[singNet],netDir, + simMetric="custom",customFunc=normDiff,writeProfiles=FALSE,sparsify=TRUE, + useSparsify2=TRUE, + verbose=FALSE,numCores=numCores) + } + + if (length(multiNet)>0) { + cat(sprintf("%i: %i multi nets { %s }\n", rngNum, length(multiNet), + paste(multiNet,collapse=","))) + netList3 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[multiNet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) + } + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2,netList3) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores, + simMetric="pearson") +# ------------------------------- + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("CombList = %s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + netDir <- sprintf("%s/test_networks",megaDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + + netLen <- unlist(lapply(netSets_iter,length)) + multiNet <- intersect(nonclin, names(netSets_iter[netLen>1])) + singNet <- intersect(nonclin, names(netSets_iter[netLen==1])) + +# ------------------------------- +# make test db +netList3 <- c() +netList2 <- c() +netList <- c() + + if (length(singNet)>0) { + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[singNet],netDir, + simMetric="custom",customFunc=normDiff,writeProfiles=FALSE,sparsify=TRUE, + useSparsify2=TRUE, + verbose=FALSE,numCores=numCores) + } + if (length(multiNet)>0){ + cat(sprintf("%i: %i multi nets { %s }\n", rngNum, length(multiNet), + paste(multiNet,collapse=","))) + netList3 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[multiNet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) + } + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2,netList3) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + megadbDir <- GM_createDB(netDir, pheno_all$ID, + megaDir,numCores=numCores, + simMetric="pearson") +# ------------------------------- + + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + # only include the nets that were feature selected + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(megadbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + } else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } +} + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/lasso/not_latest/GBM_lasso_genes.R b/misc/PanCancer/pruneVersion/lasso/not_latest/GBM_lasso_genes.R new file mode 100644 index 00000000..622c40fc --- /dev/null +++ b/misc/PanCancer/pruneVersion/lasso/not_latest/GBM_lasso_genes.R @@ -0,0 +1,427 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/lassoGenes_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# x is vector of values, one per patient (e.g. ages) +normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out +} + +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + + +mega_combList <- combList # this will change in each round + +# first loop - over train/test splits +for (rngNum in 1:20) { + combList <- mega_combList # clean slate + + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in setdiff(names(dats_train),"clinical")) { + print(nm) + # run lasso with cv + fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=1) + # pick lambda that minimizes MSE + wt <- coef(fit,s="lambda.min")[,1] + vars <- setdiff(names(wt)[which(wt>0)],"(Intercept)") + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { + netSets_iter[[k]] <- k + } + combList[[nm]] <- paste(sprintf("%s_cont", rownames(tmp))) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + } + } + + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) + # END lasso UF + # ---------------------- + + alldat_train <- do.call("rbind",dats_train) + netSets_iter[["clinical"]] <- netSets[["clinical"]] + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + + netLen <- unlist(lapply(netSets_iter,length)) + multiNet <- intersect(nonclin, names(netSets_iter[netLen>1])) + singNet <- intersect(nonclin, names(netSets_iter[netLen==1])) + +netList3 <- c() +netList2 <- c() +netList <- c() + + if (length(singNet)>0) { + cat(sprintf("%i: %i single nets { %s }\n", rngNum, length(singNet), + paste(singNet,collapse=","))) + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[singNet],netDir, + simMetric="custom",customFunc=normDiff,writeProfiles=FALSE,sparsify=TRUE, + useSparsify2=TRUE, + verbose=FALSE,numCores=numCores) + } + if (length(multiNet)>0) { + cat(sprintf("%i: %i multi nets { %s }\n", rngNum, length(multiNet), + paste(multiNet,collapse=","))) + netList3 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[multiNet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) + } + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2,netList3) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + +# ----------------------------------------- +# make test db +netDir <- sprintf("%s/test_networks",outDir) +nonclin <- setdiff(names(netSets_iter),"clinical") +netLen <- unlist(lapply(netSets_iter,length)) +multiNet <- intersect(nonclin, names(netSets_iter[netLen>1])) +singNet <- intersect(nonclin, names(netSets_iter[netLen==1])) + +netList3 <- c() +netList2 <- c() +netList <- c() + +if (length(singNet)>0) { + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[singNet],netDir, + simMetric="custom",customFunc=normDiff,writeProfiles=FALSE,sparsify=TRUE, + useSparsify2=TRUE, + verbose=FALSE,numCores=numCores) +} +if (length(multiNet)>0){ + cat(sprintf("%i: %i multi nets { %s }\n", rngNum, length(multiNet), + paste(multiNet,collapse=","))) + netList3 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[multiNet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) +} +netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) +netList <- c(netList,netList2,netList3) +cat(sprintf("Total of %i nets\n", length(netList))) +# now create database +testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) +# ----------------------------------------- + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/lasso/not_latest/GBM_lasso_genes_sparsify1.R b/misc/PanCancer/pruneVersion/lasso/not_latest/GBM_lasso_genes_sparsify1.R new file mode 100644 index 00000000..95ca0518 --- /dev/null +++ b/misc/PanCancer/pruneVersion/lasso/not_latest/GBM_lasso_genes_sparsify1.R @@ -0,0 +1,429 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/lassoGenes_sparse1_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# x is vector of values, one per patient (e.g. ages) +normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out +} + +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ +# first loop - over train/test splits +mega_combList <- combList # changes each round +for (rngNum in 1:20) { + combList <- mega_combList # clean slate + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in setdiff(names(dats_train),"clinical")) { + print(nm) + # run lasso with cv + fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=1) + # pick lambda that minimizes MSE + wt <- coef(fit,s="lambda.min")[,1] + vars <- setdiff(names(wt)[which(wt>0)],"(Intercept)") + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { + netSets_iter[[k]] <- k + } + combList[[nm]] <- paste(sprintf("%s_cont", rownames(tmp))) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + } + } + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + netSets_iter[["clinical"]] <- netSets[["clinical"]] + +# ------------------------------- +# make train db + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + + netLen <- unlist(lapply(netSets_iter,length)) + multiNet <- intersect(nonclin, names(netSets_iter[netLen>1])) + singNet <- intersect(nonclin, names(netSets_iter[netLen==1])) + +netList3 <- c() +netList2 <- c() +netList <- c() + + if (length(singNet)>0) { + cat(sprintf("%i: %i single nets { %s }\n", rngNum, length(singNet), + paste(singNet,collapse=","))) + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[singNet],netDir, + simMetric="custom",customFunc=normDiff,writeProfiles=FALSE,sparsify=TRUE, + useSparsify2=FALSE, + verbose=FALSE,numCores=numCores) + } + if (length(multiNet)>0) { + cat(sprintf("%i: %i multi nets { %s }\n", rngNum, length(multiNet), + paste(multiNet,collapse=","))) + netList3 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[multiNet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) + } + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2,netList3) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + +# ------------------------------------- +# make test db + +netDir <- sprintf("%s/test_networks",outDir) +nonclin <- setdiff(names(netSets_iter),"clinical") + +netLen <- unlist(lapply(netSets_iter,length)) +multiNet <- intersect(nonclin, names(netSets_iter[netLen>1])) +singNet <- intersect(nonclin, names(netSets_iter[netLen==1])) + +netList3 <- c() +netList2 <- c() +netList <- c() + +if (length(singNet)>0) { + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[singNet],netDir, + simMetric="custom",customFunc=normDiff,writeProfiles=FALSE,sparsify=TRUE, + useSparsify2=FALSE, + verbose=FALSE,numCores=numCores) +} +if (length(multiNet)>0){ + cat(sprintf("%i: %i multi nets { %s }\n", rngNum, length(multiNet), + paste(multiNet,collapse=","))) + netList3 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[multiNet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) +} +netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) +netList <- c(netList,netList2,netList3) +cat(sprintf("Total of %i nets\n", length(netList))) +# now create database +testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) + +# ------------------------------------- + + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) From 3f0c12ec150366c8e0ee460649e593f2970c0d98 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 26 Apr 2018 17:08:13 -0400 Subject: [PATCH 042/124] plot variability as fn num trials --- misc/PanCancer/multiCutoff/GBM_parseMulti.R | 28 +++++++++- misc/PanCancer/multiCutoff/KIRC_parseMulti.R | 55 +++++++++++++++----- misc/PanCancer/multiCutoff/LUSC_parseMulti.R | 49 ++++++++++++----- misc/PanCancer/multiCutoff/OV_parseMulti.R | 27 +++++++++- 4 files changed, 128 insertions(+), 31 deletions(-) diff --git a/misc/PanCancer/multiCutoff/GBM_parseMulti.R b/misc/PanCancer/multiCutoff/GBM_parseMulti.R index 822fdc62..9af5e510 100644 --- a/misc/PanCancer/multiCutoff/GBM_parseMulti.R +++ b/misc/PanCancer/multiCutoff/GBM_parseMulti.R @@ -5,8 +5,8 @@ require(reshape2) #dataDir_each <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pruneClinRNA_alone_180125" -#dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pruneTrain_180420" -dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pruneTrain_180420" +dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/ridge_180420" +#dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pruneTrain_180419" #dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/PCA1net_180126" #dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/PCAmultinet_180126" @@ -21,6 +21,7 @@ outD <- sprintf("GBM_%s",basename(dataDir)) if (!file.exists(outD)) dir.create(outD) auc_set <- list() +var_set <- list() for (settype in settypes) { ### if (settype %in% "clinicalArna") ### dataDir <- dataDir_both @@ -52,6 +53,13 @@ for (cutoff in 9) { auc_set[[settype]] <- y1 colctr <- colctr+3 + tmp <- c() + cur <- auc_set[[settype]] + for (k in 3:length(cur)) { + tmp <- c(tmp, sd(cur[1:k])) + } + var_set[[settype]] <- data.frame(type=settype,numsplits=4:length(cur), + pctChangeVar=diff(tmp^2)/(tmp[-1]^2)) } ctr <- ctr+1 } @@ -68,4 +76,20 @@ dev.off() write.table(round(outmat,digits=2),file=sprintf("%s/perf.txt",outD),sep="\t", col=T,row=T,quote=F) +for (settype in settypes) { + tmp <- var_set[[settype]][,3] + tmp2 <- c() + for (m in 1:(length(tmp)-2)) tmp2 <- c(tmp2,mean(tmp[m:(m+2)])) + cat(sprintf("%s: < 1%% change: %i\n", settype,min(which(abs(tmp2) < 0.01)))) +} +# plot SEM as function of num rounds +var_set <- do.call("rbind",var_set) +setName <- sprintf("GBM_%s", basename(dataDir)) +require(ggplot2) +p <- ggplot(var_set,aes(x=numsplits,y=pctChangeVar)) +p <- p+ geom_smooth(aes(colour=type),method="loess",span=0.1,se=FALSE,lwd=0.5, + alpha=0.5) +p <- p + ggtitle(setName) + ylim(c(-0.25,0.25)) +p <- p + geom_vline(xintercept=c(10,15,25),lty=3) +pdf(sprintf("%s.pdf",setName),width=8,height=3); print(p);dev.off() diff --git a/misc/PanCancer/multiCutoff/KIRC_parseMulti.R b/misc/PanCancer/multiCutoff/KIRC_parseMulti.R index 07a53369..f1ff3050 100644 --- a/misc/PanCancer/multiCutoff/KIRC_parseMulti.R +++ b/misc/PanCancer/multiCutoff/KIRC_parseMulti.R @@ -4,7 +4,8 @@ require(netDx) require(reshape2) #dataDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output/outdated/nestCV_170911" -dataDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output/lasso_180420" +#dataDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output/pruneTrain_180419" +dataDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output/ridge_180420" #dataDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output/pruneTrain_180419" settypes <- c("clinical","mir","rna","prot","cnv","dnam", @@ -19,12 +20,13 @@ outD <- sprintf("KIRC_%s",basename(dataDir)) if (!file.exists(outD)) dir.create(outD) auc_set <- list() +var_set <- list() for (settype in settypes) { ### if (settype %in% "clinicalArna") ### dataDir <- dataDir_both ### else ### dataDir <- dataDir_each - rngDir <- paste(sprintf("%s/rng",dataDir), 1:73,sep="") + rngDir <- paste(sprintf("%s/rng",dataDir), 1:100,sep="") colctr <- 1 for (cutoff in 9) { @@ -50,24 +52,49 @@ for (cutoff in 9) { auc_set[[settype]] <- y1 colctr <- colctr+3 + tmp <- c() + cur <- auc_set[[settype]] + for (k in 3:length(cur)) { + tmp <- c(tmp, sd(cur[1:k])) + } + var_set[[settype]] <- data.frame(type=settype,numsplits=4:length(cur), + pctChangeVar=diff(tmp^2)/(tmp[-1]^2)) } -ctr <- ctr+1 + ctr <- ctr+1 } cat(sprintf("Base dir: %s\n", dirname(dataDir))) print(round(outmat,digits=2)) -meds <- unlist(lapply(auc_set,median)) -mu <- unlist(lapply(auc_set,mean)) -err <- unlist(lapply(auc_set,sd)) - -auc_set <- auc_set[order(meds)] - -pdf("kirc_auc.pdf",width=13,height=5); - boxplot(auc_set,cex.axis=0.6); - abline(h=median(auc_set[["clinical"]]));dev.off() +###meds <- unlist(lapply(auc_set,median)) +###mu <- unlist(lapply(auc_set,mean)) +###err <- unlist(lapply(auc_set,sd)) +### +###auc_set <- auc_set[order(meds)] +### +###pdf("kirc_auc.pdf",width=13,height=5); +### boxplot(auc_set,cex.axis=0.6); +### abline(h=median(auc_set[["clinical"]]));dev.off() +### +### +###write.table(round(outmat,digits=2),file=sprintf("%s/perf.txt",outD),sep="\t", +### col=T,row=T,quote=F) +# plot SEM as function of num rounds +for (settype in settypes) { + tmp <- var_set[[settype]][,3] + tmp2 <- c() + for (m in 1:(length(tmp)-2)) tmp2 <- c(tmp2,mean(tmp[m:(m+2)])) + cat(sprintf("%s: < 1%% change: %i\n", settype,min(which(abs(tmp2) < 0.01)))) +} +var_set <- do.call("rbind",var_set) +setName <- sprintf("KIRC_%s", basename(dataDir)) -write.table(round(outmat,digits=2),file=sprintf("%s/perf.txt",outD),sep="\t", - col=T,row=T,quote=F) +require(ggplot2) +p <- ggplot(var_set,aes(x=numsplits,y=pctChangeVar)) +p <- p+ geom_smooth(aes(colour=type),method="loess",span=0.1,se=FALSE,lwd=0.5, + alpha=0.5) +p <- p + ggtitle(setName) + ylim(c(-0.25,0.25))#+ ylim(0.004*c(-1,1)) +p <- p + geom_vline(xintercept=c(10,15,25),lty=3) +pdf(sprintf("%s.pdf",setName),width=8,height=3); print(p);dev.off() diff --git a/misc/PanCancer/multiCutoff/LUSC_parseMulti.R b/misc/PanCancer/multiCutoff/LUSC_parseMulti.R index 77e6b06b..cad1315d 100644 --- a/misc/PanCancer/multiCutoff/LUSC_parseMulti.R +++ b/misc/PanCancer/multiCutoff/LUSC_parseMulti.R @@ -16,11 +16,8 @@ outD <- sprintf("LUSC_%s",basename(dataDir)) if (!file.exists(outD)) dir.create(outD) auc_set <- list() +var_set <- list() for (settype in settypes) { -### if (settype %in% "clinicalArna") -### dataDir <- dataDir_both -### else -### dataDir <- dataDir_each rngDir <- paste(sprintf("%s/rng",dataDir), 1:100,sep="") colctr <- 1 @@ -47,19 +44,43 @@ for (cutoff in 7:9) { auc_set[[settype]] <- y1 colctr <- colctr+3 + tmp <- c() + cur <- auc_set[[settype]] + for (k in 3:length(cur)) { + tmp <- c(tmp, sd(cur[1:k])) + } + var_set[[settype]] <- data.frame(type=settype,numsplits=4:length(cur), + pctChangeVar=diff(tmp^2)/(tmp[-1]^2)) } ctr <- ctr+1 } print(round(outmat,digits=2)) -pdf("lusc_auc.pdf",width=13,height=5); - boxplot(auc_set,cex.axis=0.6,pars=list(boxwex=0.3)); - abline(h=median(auc_set[["clinical"]])); - barplot(unlist(lapply(auc_set,mean)),las=1,cex.axis=1.3,font.axis=2, - ylim=c(0.5,1),main="LUSC") -dev.off() - -write.table(round(outmat,digits=2),file=sprintf("%s/perf.txt",outD),sep="\t", - col=T,row=T,quote=F) - +# plot SEM as function of num rounds +setName <- sprintf("LUSC_%s", basename(dataDir)) +for (settype in settypes) { + tmp <- var_set[[settype]][,3] + tmp2 <- c() + for (m in 1:(length(tmp)-2)) tmp2 <- c(tmp2,mean(tmp[m:(m+2)])) + cat(sprintf("%s: < 1%% change: %i\n", settype,min(which(abs(tmp2) < 0.01)))) +} +var_set <- do.call("rbind",var_set) +require(ggplot2) +p <- ggplot(var_set,aes(x=numsplits,y=pctChangeVar)) +p <- p+ geom_smooth(aes(colour=type),method="loess",span=0.1,se=FALSE,lwd=0.5, + alpha=0.5) +p <- p + ggtitle(setName) + ylim(c(-0.25,0.25)) +p <- p + geom_vline(xintercept=c(10,15,25),lty=3) +pdf(sprintf("%s.pdf",setName),width=8,height=3); print(p);dev.off() +###pdf("lusc_auc.pdf",width=13,height=5); +### boxplot(auc_set,cex.axis=0.6,pars=list(boxwex=0.3)); +### abline(h=median(auc_set[["clinical"]])); +### barplot(unlist(lapply(auc_set,mean)),las=1,cex.axis=1.3,font.axis=2, +### ylim=c(0.5,1),main="LUSC") +###dev.off() +### +###write.table(round(outmat,digits=2),file=sprintf("%s/perf.txt",outD),sep="\t", +### col=T,row=T,quote=F) +### +### diff --git a/misc/PanCancer/multiCutoff/OV_parseMulti.R b/misc/PanCancer/multiCutoff/OV_parseMulti.R index e93322bc..dfbc114a 100644 --- a/misc/PanCancer/multiCutoff/OV_parseMulti.R +++ b/misc/PanCancer/multiCutoff/OV_parseMulti.R @@ -3,7 +3,8 @@ rm(list=ls()) require(netDx) require(reshape2) -dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output/pruneTrain_180420" +#dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output/prune_180423" +dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output/ridge_180420" maxRng <- 100 settypes <- c("clinical","mir","rna","prot","cnv","dnam", @@ -17,6 +18,7 @@ ctr <- 1 outD <- sprintf("OV_%s",basename(dataDir)) if (!file.exists(outD)) dir.create(outD) +var_set <- list() auc_set <- list() for (settype in settypes) { ### if (settype %in% "clinicalArna") @@ -49,6 +51,13 @@ for (cutoff in 9) { auc_set[[settype]] <- y1 colctr <- colctr+3 + tmp <- c() + cur <- auc_set[[settype]] + for (k in 3:length(cur)) { + tmp <- c(tmp, sd(cur[1:k])) + } + var_set[[settype]] <- data.frame(type=settype,numsplits=4:length(cur), + pctChangeVar=diff(tmp^2)/(tmp[-1]^2)) } ctr <- ctr+1 } @@ -78,4 +87,20 @@ dev.off() write.table(round(outmat,digits=2),file=sprintf("%s/perf.txt",outD),sep="\t", col=T,row=T,quote=F) +# plot SEM as function of num rounds +setName <- sprintf("OV_%s", basename(dataDir)) +for (settype in settypes) { + tmp <- var_set[[settype]][,3] + tmp2 <- c() + for (m in 1:(length(tmp)-2)) tmp2 <- c(tmp2,mean(tmp[m:(m+2)])) + cat(sprintf("%s: < 1%% change: %i\n", settype,min(which(abs(tmp2) < 0.01)))) +} +var_set <- do.call("rbind",var_set) +require(ggplot2) +p <- ggplot(var_set,aes(x=numsplits,y=pctChangeVar)) +p <- p+ geom_smooth(aes(colour=type),method="loess",span=0.1,se=FALSE,lwd=0.5, + alpha=0.5) +p <- p + ggtitle(setName) + ylim(c(-0.25,0.25)) +p <- p + geom_vline(xintercept=c(10,15,25),lty=3) +pdf(sprintf("%s.pdf",setName),width=8,height=3); print(p);dev.off() From 50129fc61e974bed7f84a2d337a97c698d231980 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 26 Apr 2018 17:08:59 -0400 Subject: [PATCH 043/124] plots perf of gbm variations --- misc/PanCancer/multiCutoff/GBM_getRes.R | 62 +++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 misc/PanCancer/multiCutoff/GBM_getRes.R diff --git a/misc/PanCancer/multiCutoff/GBM_getRes.R b/misc/PanCancer/multiCutoff/GBM_getRes.R new file mode 100644 index 00000000..c34ac58f --- /dev/null +++ b/misc/PanCancer/multiCutoff/GBM_getRes.R @@ -0,0 +1,62 @@ +#' plot GBM results for kernel variations + +rm(list=ls()) +require(netDx) +require(reshape2) + +mainD <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output" +dirSet <- list( + base="noPrune_180423", +# rbf3="rbf0.3_noSex_180424", +# rbf5="rbf0.5_noSex_180424", +# rbfother="rbf_noSex_180424", +# tanh="tanh_noSex_180424" + lasso_sp2="lassoGenes_180426", + lasso_sp1="lassoGenes_sparse1_180426" +) + +mega_auc <- list() +for (curdir in names(dirSet)) { +cat(sprintf("***** %s *****\n", curdir)) +dataDir <- sprintf("%s/%s",mainD,dirSet[[curdir]]) +settypes <- c("clinical","mir","rna","cnv","dnam", + "clinicalArna","clinicalAmir","clinicalAdnam","clinicalAcnv","all") +ctr <- 1 + +auc_set <- list() +for (settype in settypes) { + #print(dataDir) +cutoff <-8 + + if (any(grep("lasso",curdir))) { + rngDir <- paste("rng",1:8,sep="") + } else { + rngDir <- dir(path=dataDir,pattern="rng") + } + + cat(sprintf("Got %i rng files\n",length(rngDir))) + rngDir <- sprintf("%s/%s",dataDir,rngDir) + c7 <- sprintf("%s/%s/cutoff%i/predictionResults.txt", + rngDir,settype,cutoff) + torm <- c() + for (idx in 1:length(c7)) { + dat <- read.delim(c7[idx],sep="\t",h=T,as.is=T) + x1 <- sum(dat$STATUS=="SURVIVEYES") + x2 <- sum(dat$STATUS=="SURVIVENO") + if (x1<1 & x2<1) torm <- c(torm, idx) + } + cat(sprintf("%i: removing %i\n", cutoff,length(torm))) + if (length(torm)>0) c7 <- c7[-torm] + postscript("tmp.eps") + x <- plotPerf(c7,c("SURVIVEYES","SURVIVENO")) + dev.off() + + y1 <- unlist(lapply(x,function(i) i$auroc)) + auc_set[[settype]] <- y1 +ctr <- ctr+1 +} +mega_auc[[curdir]] <- unlist(lapply(auc_set,mean)) + +} + +pdf("test.pdf"); boxplot(mega_auc); dev.off() From 73d0ebd48f904503512cbcd82c083af42178eca6 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 26 Apr 2018 17:09:18 -0400 Subject: [PATCH 044/124] uses pamr for prefiltering --- misc/PanCancer/pruneVersion/pamr/GBM_pamr.R | 351 ++++++++++++++++++++ 1 file changed, 351 insertions(+) create mode 100644 misc/PanCancer/pruneVersion/pamr/GBM_pamr.R diff --git a/misc/PanCancer/pruneVersion/pamr/GBM_pamr.R b/misc/PanCancer/pruneVersion/pamr/GBM_pamr.R new file mode 100644 index 00000000..5a8157ca --- /dev/null +++ b/misc/PanCancer/pruneVersion/pamr/GBM_pamr.R @@ -0,0 +1,351 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' shrunked centroid based feat sel + +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(pamr) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/pamr_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +# first loop - over train/test splits +for (rngNum in 1:20) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + for (nm in names(dats_train)) { + # shrunken centroid for initial feature selection + tmp <- na.omit(dats_train[[nm]]) + data <- list(x=tmp,y=factor(pheno$STATUS),genenames=rownames(tmp), + geneid=rownames(tmp)) + set.seed(123); # reproducible + data.fit <- pamr.train(data) + data.cv <- pamr.cv(data.fit, data) + thresh <- data.cv$threshold[which.min(data.cv$threshold)] + keepgenes <- pamr.listgenes(data.fit,data,thresh,data.cv) + + tmp <- dats_train[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% keepgenes[,1]),] + dats_train[[nm]] <- tmp + netSets_iter[[nm]] <- rownames(tmp) + cat(sprintf("\t%i:%s:%i of %i left\n",rngNum,nm,nrow(tmp),orig_ct)) + } + + alldat_train <- do.call("rbind",dats_train) + netSets_iter[["clinical"]] <- netSets[["clinical"]] + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } +## Create the mega database with all patients and all nets. +## This will be used to predict test samples by subsetting just for feature +## selected nets in a given round +## Note that this is useful for all train/test splits because we can always +## change which samples are query and can always subset based on which nets +## are feature selected in a given round. +netDir <- sprintf("%s/test_networks",outDir) +nonclin <- setdiff(names(netSets),"clinical") +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) +netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) +netList <- c(netList,netList2) +cat(sprintf("Total of %i nets\n", length(netList))) +# now create database +testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) From 73aea951de4321db30cf83e1768dc4f491446d6a Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 26 Apr 2018 17:09:39 -0400 Subject: [PATCH 045/124] minor --- misc/PanCancer/pruneVersion/pruneTrain/LUSC_pruneTrain.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/misc/PanCancer/pruneVersion/pruneTrain/LUSC_pruneTrain.R b/misc/PanCancer/pruneVersion/pruneTrain/LUSC_pruneTrain.R index 6f833e8b..ef517bf3 100644 --- a/misc/PanCancer/pruneVersion/pruneTrain/LUSC_pruneTrain.R +++ b/misc/PanCancer/pruneVersion/pruneTrain/LUSC_pruneTrain.R @@ -5,7 +5,6 @@ rm(list=ls()) require(netDx) require(netDx.examples) -source("../runLM.R") numCores <- 8L GMmemory <- 4L @@ -160,7 +159,7 @@ sink(logFile,split=TRUE) tryCatch({ # apply pruning to proteomic data curwd <- getwd() -setwd("..") +setwd("../..") source("LMprune.R") source("runLM.R") source("silh.R") @@ -169,7 +168,7 @@ setwd(curwd) # first loop - over train/test splits -for (rngNum in 1:100) { +for (rngNum in 14:15){ rng_t0 <- Sys.time() cat(sprintf("-------------------------------\n")) cat(sprintf("RNG seed = %i\n", rngNum)) From 98a6ed58230b8a720f6a3aa590bffe7831432c69 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 26 Apr 2018 22:16:46 -0400 Subject: [PATCH 046/124] pamr for all tumours --- misc/PanCancer/pruneVersion/pamr/KIRC_pamr.R | 349 +++++++++++++++++ misc/PanCancer/pruneVersion/pamr/LUSC_pamr.R | 374 +++++++++++++++++++ misc/PanCancer/pruneVersion/pamr/OV_pamr.R | 328 ++++++++++++++++ 3 files changed, 1051 insertions(+) create mode 100644 misc/PanCancer/pruneVersion/pamr/KIRC_pamr.R create mode 100644 misc/PanCancer/pruneVersion/pamr/LUSC_pamr.R create mode 100644 misc/PanCancer/pruneVersion/pamr/OV_pamr.R diff --git a/misc/PanCancer/pruneVersion/pamr/KIRC_pamr.R b/misc/PanCancer/pruneVersion/pamr/KIRC_pamr.R new file mode 100644 index 00000000..d3efe613 --- /dev/null +++ b/misc/PanCancer/pruneVersion/pamr/KIRC_pamr.R @@ -0,0 +1,349 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(pamr) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +cutoff <- 9 + +inDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/input" +outRoot <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/lasso_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/KIRC_clinical_core.txt",inDir), + survival=sprintf("%s/KIRC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/KIRC_mRNA_core.txt",inDir), + prot=sprintf("%s/KIRC_RPPA_core.txt",inDir), + mir=sprintf("%s/KIRC_miRNA_core.txt",inDir), + dnam=sprintf("%s/KIRC_methylation_core.txt",inDir), + cnv=sprintf("%s/KIRC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +#======transform clinical data========= +pheno$grade <- as.vector(pheno$grade) +pheno$grade[pheno$grade=="G1"] <- "G2" +pheno$grade[pheno$grade=="GX"] <- "G2" +pheno$grade <- as.factor(pheno$grade) +pheno <- pheno[, -which(colnames(pheno)=="gender")] +#====================================== + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +clinical$grade <- as.numeric(factor(clinical$grade)) +clinical$stage <- as.numeric(factor(clinical$stage)) +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) + + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +# first loop - over train/test splits +for (rngNum in 1:20) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + + ## pruneTrain code ------ + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in setdiff(names(dats_train),"clinical")) { + print(nm) + # shrunken centroid for initial feature selection + tmp <- na.omit(dats_train[[nm]]) + data <- list(x=tmp,y=factor(pheno$STATUS),genenames=rownames(tmp), + geneid=rownames(tmp)) + set.seed(123); # reproducible + data.fit <- pamr.train(data) + data.cv <- pamr.cv(data.fit, data) + thresh <- data.cv$threshold[which.min(data.cv$threshold)] + keepgenes <- pamr.listgenes(data.fit,data,thresh,data.cv) + + tmp <- dats_train[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% keepgenes[,1]),] + dats_train[[nm]] <- tmp + netSets_iter[[nm]] <- rownames(tmp) + cat(sprintf("\t%i:%s:%i of %i left\n",rngNum,nm,nrow(tmp),orig_ct)) + } + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + netSets_iter[["clinical"]] <- netSets[["clinical"]] +## pruneTrain code end + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2, + verbose=FALSE,numCores=numCores,writeProfiles=FALSE, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + ## pruneTrain: make test database + ## This will be used to predict test samples by subsetting just for feature + ## selected nets in a given round + ## Note that this is useful for all train/test splits because we can always + ## change which samples are query and can always subset based on which nets + ## are feature selected in a given round. + netDir <- sprintf("%s/test_networks",outDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(netDir, pheno_all$ID, + outDir,numCores=numCores) + + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",pDir2,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } + + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + outFile <- sprintf("%s/predictionResults.txt",pDir) + write.table(out,file=outFile,sep="\t",col=T,row=F, + quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/pamr/LUSC_pamr.R b/misc/PanCancer/pruneVersion/pamr/LUSC_pamr.R new file mode 100644 index 00000000..d7c6a1f2 --- /dev/null +++ b/misc/PanCancer/pruneVersion/pamr/LUSC_pamr.R @@ -0,0 +1,374 @@ +#' PanCancer binarized survival: LUSC: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design + +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(pamr) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/pamr_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions + +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), + survival=sprintf("%s/LUSC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), + prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), + mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), + cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno +rownames(clinical) <- clinical[,1]; +# ======================= +# LUSC-specific variables +clinical$stage <- as.vector(clinical$stage) +clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" +clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" +clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" +clinical$stage <- as.factor(clinical$stage) +clinical <- clinical[, -which(colnames(clinical)=="gender")] +clinical <- t(clinical[,c("age","stage")]) +clinical[1,] <- as.integer(clinical[1,]) +clinical[2,] <- as.integer(as.factor(clinical[2,])) +class(clinical) <- "numeric" +# ======================= +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + if (nm == "rna") tmp <- log(tmp+1) + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + + +# first loop - over train/test splits +for (rngNum in 1:100) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + ## pruneTrain code ------ + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in names(dats_train)) { + print(nm) + # shrunken centroid for initial feature selection + tmp <- na.omit(dats_train[[nm]]) + data <- list(x=tmp,y=factor(pheno$STATUS),genenames=rownames(tmp), + geneid=rownames(tmp)) + set.seed(123); # reproducible + data.fit <- pamr.train(data) + data.cv <- pamr.cv(data.fit, data) + thresh <- data.cv$threshold[which.min(data.cv$threshold)] + keepgenes <- pamr.listgenes(data.fit,data,thresh,data.cv) + + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(keepgenes))) + tmp <- dats_train[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% keepgenes[,1]),] + dats_train[[nm]] <- tmp + netSets_iter[[nm]] <- rownames(tmp) + cat(sprintf("\t%i:%s:%i of %i left\n",rngNum,nm,nrow(tmp),orig_ct)) + } + # END lasso UF + # ---------------------- + + alldat_train <- do.call("rbind",dats_train) + netSets_iter[["clinical"]] <- netSets[["clinical"]] +#### ---------------------------------------------------------- + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE,simMetric="pearson") + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores, + simMetric="pearson") + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("CombList = %s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + ## Create the mega database with all patients and all nets. + ## This will be used to predict test samples by subsetting just for feature + ## selected nets in a given round + ## Note that this is useful for all train/test splits because we can always + ## change which samples are query and can always subset based on which nets + ## are feature selected in a given round. + netDir <- sprintf("%s/test_networks",megaDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE, + simMetric="pearson") + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2, + writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + megadbDir <- GM_createDB(netDir, pheno_all$ID, + megaDir,numCores=numCores, + simMetric="pearson") + + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + # only include the nets that were feature selected + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(megadbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + } else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } +} + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/pamr/OV_pamr.R b/misc/PanCancer/pruneVersion/pamr/OV_pamr.R new file mode 100644 index 00000000..3806048d --- /dev/null +++ b/misc/PanCancer/pruneVersion/pamr/OV_pamr.R @@ -0,0 +1,328 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(pamr) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +cutoff <- 9 + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/input" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/pamr_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions + +# normalized difference +# x is vector of values, one per patient (e.g. ages) +normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/OV_clinical_core.txt",inDir), + survival=sprintf("%s/OV_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/OV_mRNA_core.txt",inDir), + prot=sprintf("%s/OV_RPPA_core.txt",inDir), + mir=sprintf("%s/OV_miRNA_core.txt",inDir), + dnam=sprintf("%s/OV_methylation_core.txt",inDir), + cnv=sprintf("%s/OV_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clin <- pheno +rownames(clin) <- clin[,1]; +clin <- t(clin[,2,drop=FALSE]) +dats$clinical <- clin; rm(clin) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID), drop = FALSE]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx, drop = FALSE] + x +}) + + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) + + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + + +# first loop - over train/test splits +for (rngNum in 1:100) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in setdiff(names(dats_train),"clinical")) { + print(nm) + # shrunken centroid for initial feature selection + tmp <- na.omit(dats_train[[nm]]) + data <- list(x=tmp,y=factor(pheno$STATUS),genenames=rownames(tmp), + geneid=rownames(tmp)) + set.seed(123); # reproducible + data.fit <- pamr.train(data) + data.cv <- pamr.cv(data.fit, data) + thresh <- data.cv$threshold[which.min(data.cv$threshold)] + keepgenes <- pamr.listgenes(data.fit,data,thresh,data.cv) + + tmp <- dats_train[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% keepgenes[,1]),] + dats_train[[nm]] <- tmp + netSets_iter[[nm]] <- rownames(tmp) + cat(sprintf("\t%i:%s:%i of %i left\n",rngNum,nm,nrow(tmp),orig_ct)) + } + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + netSets_iter[["clinical"]] <- netSets[["clinical"]] + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) + + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff, + verbose=FALSE,numCores=numCores,writeProfiles=FALSE, + sparsify=TRUE,append=TRUE) + + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + # -------- + # pruneTrain: make test database + test_netDir <- sprintf("%s/test_networks",outDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + ### netSets_iter has univariate filtering for curr round + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],test_netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + test_netDir,simMetric="custom",customFunc=normDiff, + verbose=FALSE,numCores=numCores, writeProfiles=FALSE, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(test_netDir, pheno_all$ID, + outDir,numCores=numCores) + + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + # pTally <- sub(".profile","",pTally) + # pTally <- sub("_cont","",pTally) + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",pDir2,g) + GM_writeQueryFile(qSamps,incNets=pTally + ,nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } + + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + outFile <- sprintf("%s/predictionResults.txt",pDir) + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) From e2d86277fc8bd1f9f7a22fc658c5806d5828bdfb Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 26 Apr 2018 22:17:10 -0400 Subject: [PATCH 047/124] old --- .../pamr/outdated/GBM_pamr_genes_incClin.R | 437 ++++++++++++++++++ 1 file changed, 437 insertions(+) create mode 100644 misc/PanCancer/pruneVersion/pamr/outdated/GBM_pamr_genes_incClin.R diff --git a/misc/PanCancer/pruneVersion/pamr/outdated/GBM_pamr_genes_incClin.R b/misc/PanCancer/pruneVersion/pamr/outdated/GBM_pamr_genes_incClin.R new file mode 100644 index 00000000..24e44e5c --- /dev/null +++ b/misc/PanCancer/pruneVersion/pamr/outdated/GBM_pamr_genes_incClin.R @@ -0,0 +1,437 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(pamr) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/pamrGenes_incClin_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# x is vector of values, one per patient (e.g. ages) +normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out +} + +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ +# first loop - over train/test splits +mega_combList <- combList # changes each round +for (rngNum in 1:20) { + combList <- mega_combList # clean slate + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in names(dats_train)) { + print(nm) + tmp <- na.omit(dats_train[[nm]]) + data <- list(x=tmp,y=factor(pheno$STATUS),genenames=rownames(tmp), + geneid=rownames(tmp)) + set.seed(123); # reproducible + data.fit <- pamr.train(data) + data.cv <- pamr.cv(data.fit, data) + thresh <- data.cv$threshold[which.min(data.cv$threshold)] + vars <- pamr.listgenes(data.fit,data,thresh,data.cv) + + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { + netSets_iter[[k]] <- k + } + combList[[nm]] <- paste(sprintf("%s_cont", rownames(tmp))) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + } + } + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + +# ------------------------------- +# make train db + netDir <- sprintf("%s/networks",outDir) + nonclin <- names(netSets_iter) + + netLen <- unlist(lapply(netSets_iter,length)) + multiNet <- intersect(nonclin, names(netSets_iter[netLen>1])) + multiNet <- setdiff(multiNet,"clinical") + singNet <- intersect(nonclin, names(netSets_iter[netLen==1])) + +netList3 <- c() +netList2 <- c() +netList <- c() + + if (length(singNet)>0) { + cat(sprintf("%i: %i single nets { %s }\n", rngNum, length(singNet), + paste(singNet,collapse=","))) + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[singNet],netDir, + simMetric="custom",customFunc=normDiff,writeProfiles=FALSE,sparsify=TRUE, + useSparsify2=FALSE, + verbose=FALSE,numCores=numCores) + } + if (length(multiNet)>0) { + cat(sprintf("%i: %i multi nets { %s }\n", rngNum, length(multiNet), + paste(multiNet,collapse=","))) + netList3 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[multiNet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) + } + if ("clinical" %in% names(netSets_iter)) { + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + } + netList <- c(netList,netList2,netList3) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + +# ------------------------------------- +# make test db + +netDir <- sprintf("%s/test_networks",outDir) +nonclin <- names(netSets_iter) #setdiff(names(netSets_iter),"clinical") + +netLen <- unlist(lapply(netSets_iter,length)) +multiNet <- intersect(nonclin, names(netSets_iter[netLen>1])) +multiNet <- setdiff(multiNet,"clinical") +singNet <- intersect(nonclin, names(netSets_iter[netLen==1])) + +netList3 <- c() +netList2 <- c() +netList <- c() + +if (length(singNet)>0) { + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[singNet],netDir, + simMetric="custom",customFunc=normDiff,writeProfiles=FALSE,sparsify=TRUE, + useSparsify2=FALSE, + verbose=FALSE,numCores=numCores) +} +if (length(multiNet)>0){ + cat(sprintf("%i: %i multi nets { %s }\n", rngNum, length(multiNet), + paste(multiNet,collapse=","))) + netList3 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[multiNet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) +} +if ("clinical" %in% names(netSets_iter)) { + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) +} +netList <- c(netList,netList2,netList3) +cat(sprintf("Total of %i nets\n", length(netList))) +# now create database +testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) + +# ------------------------------------- + + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) From 15d82dd4549d290f6ca2029c39e2abcb3af2261e Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 26 Apr 2018 22:26:28 -0400 Subject: [PATCH 048/124] dir name fix --- misc/PanCancer/pruneVersion/pamr/KIRC_pamr.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/misc/PanCancer/pruneVersion/pamr/KIRC_pamr.R b/misc/PanCancer/pruneVersion/pamr/KIRC_pamr.R index d3efe613..a7434df1 100644 --- a/misc/PanCancer/pruneVersion/pamr/KIRC_pamr.R +++ b/misc/PanCancer/pruneVersion/pamr/KIRC_pamr.R @@ -15,7 +15,7 @@ inDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/input" outRoot <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/lasso_%s",outRoot,dt) +megaDir <- sprintf("%s/pamr_%s",outRoot,dt) # ---------------------------------------------------------------- # helper functions From 6d4fd9be829c6ae6cdc4f0d175496938bc565f96 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 26 Apr 2018 22:26:44 -0400 Subject: [PATCH 049/124] new test after abs(wt) bug fix --- .../PanCancer/pruneVersion/lasso/KIRC_lasso.R | 4 +- .../PanCancer/pruneVersion/lasso/LUSC_lasso.R | 375 ++++++++++++++++++ misc/PanCancer/pruneVersion/lasso/OV_lasso.R | 330 +++++++++++++++ 3 files changed, 707 insertions(+), 2 deletions(-) create mode 100644 misc/PanCancer/pruneVersion/lasso/LUSC_lasso.R create mode 100644 misc/PanCancer/pruneVersion/lasso/OV_lasso.R diff --git a/misc/PanCancer/pruneVersion/lasso/KIRC_lasso.R b/misc/PanCancer/pruneVersion/lasso/KIRC_lasso.R index 23ba489f..0292e266 100644 --- a/misc/PanCancer/pruneVersion/lasso/KIRC_lasso.R +++ b/misc/PanCancer/pruneVersion/lasso/KIRC_lasso.R @@ -189,8 +189,8 @@ for (rngNum in 1:100) { fit.lasso <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), y=factor(pheno$STATUS), family="binomial", alpha=1) # pick lambda that minimizes MSE - wt <- coef(fit.lasso,s="lambda.min")[,1] - vars <- names(wt)[which(wt>0)] + wt <- abs(coef(fit.lasso,s="lambda.min")[,1]) + vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") if (length(vars) < 6) {# don't compute Pearson,just use all cat(sprintf("rngNum %i: %s: <6 (%i):just use all\n", rngNum,nm,length(vars))) diff --git a/misc/PanCancer/pruneVersion/lasso/LUSC_lasso.R b/misc/PanCancer/pruneVersion/lasso/LUSC_lasso.R new file mode 100644 index 00000000..da7d3a09 --- /dev/null +++ b/misc/PanCancer/pruneVersion/lasso/LUSC_lasso.R @@ -0,0 +1,375 @@ +#' PanCancer binarized survival: LUSC: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design + +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/lasso_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions + +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), + survival=sprintf("%s/LUSC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), + prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), + mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), + cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno +rownames(clinical) <- clinical[,1]; +# ======================= +# LUSC-specific variables +clinical$stage <- as.vector(clinical$stage) +clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" +clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" +clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" +clinical$stage <- as.factor(clinical$stage) +clinical <- clinical[, -which(colnames(clinical)=="gender")] +clinical <- t(clinical[,c("age","stage")]) +clinical[1,] <- as.integer(clinical[1,]) +clinical[2,] <- as.integer(as.factor(clinical[2,])) +class(clinical) <- "numeric" +# ======================= +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + if (nm == "rna") tmp <- log(tmp+1) + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + + +# first loop - over train/test splits +for (rngNum in 1:100) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + ## pruneTrain code ------ + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in setdiff(names(dats_train),"clinical")) { + print(nm) + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + # run lasso with cv + fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=1) + # pick lambda that minimizes MSE + wt <- abs(coef(fit,s="lambda.min")[,1]) + vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") + if (length(vars) < 6) {# don't compute Pearson,just use all + cat(sprintf("rngNum %i: %s: <6 (%i):just use all\n", + rngNum,nm,length(vars))) + } else { + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),] + dats_train[[nm]] <- tmp + netSets_iter[[nm]] <- rownames(tmp) + } + } + # END lasso UF + # ---------------------- + + alldat_train <- do.call("rbind",dats_train) + netSets_iter[["clinical"]] <- netSets[["clinical"]] +#### ---------------------------------------------------------- + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE,simMetric="pearson") + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores, + simMetric="pearson") + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("CombList = %s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + ## Create the mega database with all patients and all nets. + ## This will be used to predict test samples by subsetting just for feature + ## selected nets in a given round + ## Note that this is useful for all train/test splits because we can always + ## change which samples are query and can always subset based on which nets + ## are feature selected in a given round. + netDir <- sprintf("%s/test_networks",megaDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE, + simMetric="pearson") + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2, + writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + megadbDir <- GM_createDB(netDir, pheno_all$ID, + megaDir,numCores=numCores, + simMetric="pearson") + + for (cutoff in 9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + # only include the nets that were feature selected + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(megadbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + } else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } +} + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/lasso/OV_lasso.R b/misc/PanCancer/pruneVersion/lasso/OV_lasso.R new file mode 100644 index 00000000..5c2b3ff2 --- /dev/null +++ b/misc/PanCancer/pruneVersion/lasso/OV_lasso.R @@ -0,0 +1,330 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +cutoff <- 9 + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/input" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/lasso_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions + +# normalized difference +# x is vector of values, one per patient (e.g. ages) +normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/OV_clinical_core.txt",inDir), + survival=sprintf("%s/OV_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/OV_mRNA_core.txt",inDir), + prot=sprintf("%s/OV_RPPA_core.txt",inDir), + mir=sprintf("%s/OV_miRNA_core.txt",inDir), + dnam=sprintf("%s/OV_methylation_core.txt",inDir), + cnv=sprintf("%s/OV_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clin <- pheno +rownames(clin) <- clin[,1]; +clin <- t(clin[,2,drop=FALSE]) +dats$clinical <- clin; rm(clin) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID), drop = FALSE]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx, drop = FALSE] + x +}) + + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) + + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + + +# first loop - over train/test splits +for (rngNum in 1:100) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in setdiff(names(dats_train),"clinical")) { + print(nm) + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + # run lasso with cv + fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=1) + # pick lambda that minimizes MSE + wt <- abs(coef(fit,s="lambda.min")[,1]) + vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") + if (length(vars) < 6) {# don't compute Pearson,just use all + cat(sprintf("rngNum %i: %s: <6 (%i):just use all\n", + rngNum,nm,length(vars))) + } else { + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),] + dats_train[[nm]] <- tmp + netSets_iter[[nm]] <- rownames(tmp) + } + } + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + netSets_iter[["clinical"]] <- netSets[["clinical"]] + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) + + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff, + verbose=FALSE,numCores=numCores,writeProfiles=FALSE, + sparsify=TRUE,append=TRUE) + + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + # -------- + # pruneTrain: make test database + test_netDir <- sprintf("%s/test_networks",outDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + ### netSets_iter has univariate filtering for curr round + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],test_netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + test_netDir,simMetric="custom",customFunc=normDiff, + verbose=FALSE,numCores=numCores, writeProfiles=FALSE, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(test_netDir, pheno_all$ID, + outDir,numCores=numCores) + + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + # pTally <- sub(".profile","",pTally) + # pTally <- sub("_cont","",pTally) + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",pDir2,g) + GM_writeQueryFile(qSamps,incNets=pTally + ,nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } + + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + outFile <- sprintf("%s/predictionResults.txt",pDir) + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) From e824779e4be0d1d5a8cb0129b0c99e02a1ac28b1 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 26 Apr 2018 22:28:08 -0400 Subject: [PATCH 050/124] mini update, dir name --- misc/PanCancer/multiCutoff/GBM_getRes.R | 11 ++++++----- misc/PanCancer/multiCutoff/GBM_parseMulti.R | 3 --- misc/PanCancer/multiCutoff/LUSC_parseMulti.R | 4 ++-- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/misc/PanCancer/multiCutoff/GBM_getRes.R b/misc/PanCancer/multiCutoff/GBM_getRes.R index c34ac58f..289f3924 100644 --- a/misc/PanCancer/multiCutoff/GBM_getRes.R +++ b/misc/PanCancer/multiCutoff/GBM_getRes.R @@ -11,8 +11,9 @@ dirSet <- list( # rbf5="rbf0.5_noSex_180424", # rbfother="rbf_noSex_180424", # tanh="tanh_noSex_180424" - lasso_sp2="lassoGenes_180426", - lasso_sp1="lassoGenes_sparse1_180426" + ridge_fix="ridge_AbsFix_180426", + lassoGenes_sp1="lassoGenes_incClin_180426", + pamr="pamr_180425" ) mega_auc <- list() @@ -26,10 +27,10 @@ ctr <- 1 auc_set <- list() for (settype in settypes) { #print(dataDir) -cutoff <-8 +cutoff <-9 - if (any(grep("lasso",curdir))) { - rngDir <- paste("rng",1:8,sep="") + if (any(c(grep("lasso",curdir),grep("ridge",curdir)))) { + rngDir <- paste("rng",1:18,sep="") } else { rngDir <- dir(path=dataDir,pattern="rng") } diff --git a/misc/PanCancer/multiCutoff/GBM_parseMulti.R b/misc/PanCancer/multiCutoff/GBM_parseMulti.R index 9af5e510..bb3dbcc9 100644 --- a/misc/PanCancer/multiCutoff/GBM_parseMulti.R +++ b/misc/PanCancer/multiCutoff/GBM_parseMulti.R @@ -6,9 +6,6 @@ require(reshape2) #dataDir_each <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pruneClinRNA_alone_180125" dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/ridge_180420" -#dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pruneTrain_180419" -#dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/PCA1net_180126" -#dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/PCAmultinet_180126" settypes <- c("clinical","mir","rna","cnv","dnam", "clinicalArna","clinicalAmir","clinicalAdnam","clinicalAcnv","all") diff --git a/misc/PanCancer/multiCutoff/LUSC_parseMulti.R b/misc/PanCancer/multiCutoff/LUSC_parseMulti.R index cad1315d..b791dcdf 100644 --- a/misc/PanCancer/multiCutoff/LUSC_parseMulti.R +++ b/misc/PanCancer/multiCutoff/LUSC_parseMulti.R @@ -3,7 +3,7 @@ rm(list=ls()) require(netDx) require(reshape2) -dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/ridge_180420" +dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/lassoGenes_180426" settypes <- c("clinical","mir","rna","prot","cnv", "clinicalArna","clinicalAmir","clinicalAprot","clinicalAcnv","all") @@ -18,7 +18,7 @@ if (!file.exists(outD)) dir.create(outD) auc_set <- list() var_set <- list() for (settype in settypes) { - rngDir <- paste(sprintf("%s/rng",dataDir), 1:100,sep="") + rngDir <- paste(sprintf("%s/rng",dataDir), 3:20,sep="") colctr <- 1 for (cutoff in 7:9) { From b461ace0c09eafe74d76a993685c81b44a1d4b56 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 26 Apr 2018 22:29:10 -0400 Subject: [PATCH 051/124] tanh kernel --- .../noPrune/GBM_noPrune_noSex_Tanh.R | 334 ++++++++++++++++++ 1 file changed, 334 insertions(+) create mode 100644 misc/PanCancer/noPrune/GBM_noPrune_noSex_Tanh.R diff --git a/misc/PanCancer/noPrune/GBM_noPrune_noSex_Tanh.R b/misc/PanCancer/noPrune/GBM_noPrune_noSex_Tanh.R new file mode 100644 index 00000000..e9a3dfcf --- /dev/null +++ b/misc/PanCancer/noPrune/GBM_noPrune_noSex_Tanh.R @@ -0,0 +1,334 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) +require(netDx) +require(netDx.examples) +source("../runLM.R") + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/tanh_noSex_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +#' radial basis function +#' @param m (matrix) data, columns are patients +#' @param nm (char) kernel to use, prefix to kernlab::*dot() functions. +#' e.g. rbf,tanh,laplace +sim.kern <- function(m,nm="tanh",sigma=0.05) { + if (nm=="rbf") { + func <- kernlab::rbfdot(sigma) + cat(sprintf("Sigma = %1.2f\n", sigma)) + } else if (nm == "tanh") { + cat("using tanh\n") + func <- kernlab::tanhdot() + } + idx <- combinat::combn(1:ncol(m),2) + out <- matrix(NA,nrow=ncol(m),ncol=ncol(m)) + for (comb in 1:ncol(idx)) { + i <- idx[1,comb]; j <- idx[2,comb] + x <- func(m[,i],m[,j]) + out[i,j] <- x; out[j,i] <- x + } + diag(out) <- 1 + colnames(out)<- colnames(m); + rownames(out) <- colnames(m) + out +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) + +sink(logFile,split=TRUE) +tryCatch({ +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) + +dats$clinical <- dats$clinical[c("age","performance_score"),] + + +# first loop - over train/test splits +for (rngNum in 1:50) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + alldat_train <- do.call("rbind",dats_train) + netSets_iter <- netSets + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=sim.kern, + writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } +## Create the mega database with all patients and all nets. +## This will be used to predict test samples by subsetting just for feature +## selected nets in a given round +## Note that this is useful for all train/test splits because we can always +## change which samples are query and can always subset based on which nets +## are feature selected in a given round. +netDir <- sprintf("%s/test_networks",outDir) +nonclin <- setdiff(names(netSets),"clinical") +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) +netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=sim.kern,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) +netList <- c(netList,netList2) +cat(sprintf("Total of %i nets\n", length(netList))) +# now create database +testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) From 6891591432808c956a4b284517165c5e16903f41 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 26 Apr 2018 22:31:15 -0400 Subject: [PATCH 052/124] never worked --- .../noPrune/outdated/GBM_noPrune_justAge.R | 324 +++++++++++++++++ .../noPrune/outdated/GBM_noPrune_noSex.R | 336 ++++++++++++++++++ 2 files changed, 660 insertions(+) create mode 100644 misc/PanCancer/noPrune/outdated/GBM_noPrune_justAge.R create mode 100644 misc/PanCancer/noPrune/outdated/GBM_noPrune_noSex.R diff --git a/misc/PanCancer/noPrune/outdated/GBM_noPrune_justAge.R b/misc/PanCancer/noPrune/outdated/GBM_noPrune_justAge.R new file mode 100644 index 00000000..2a0e627a --- /dev/null +++ b/misc/PanCancer/noPrune/outdated/GBM_noPrune_justAge.R @@ -0,0 +1,324 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) +require(netDx) +require(netDx.examples) +source("../runLM.R") + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/noPrune_justAge_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions + +normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) + +cat("Excluding sex as input param\n") +dats$clinical <- dats$clinical[c("age"),,drop=FALSE] + +# first loop - over train/test splits +for (rngNum in 1:50) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + alldat_train <- do.call("rbind",dats_train) + netSets_iter <- netSets + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } +## Create the mega database with all patients and all nets. +## This will be used to predict test samples by subsetting just for feature +## selected nets in a given round +## Note that this is useful for all train/test splits because we can always +## change which samples are query and can always subset based on which nets +## are feature selected in a given round. +netDir <- sprintf("%s/test_networks",outDir) +nonclin <- setdiff(names(netSets),"clinical") +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) +netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) +netList <- c(netList,netList2) +cat(sprintf("Total of %i nets\n", length(netList))) +# now create database +testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/noPrune/outdated/GBM_noPrune_noSex.R b/misc/PanCancer/noPrune/outdated/GBM_noPrune_noSex.R new file mode 100644 index 00000000..efa7a466 --- /dev/null +++ b/misc/PanCancer/noPrune/outdated/GBM_noPrune_noSex.R @@ -0,0 +1,336 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) +require(netDx) +require(netDx.examples) +source("../runLM.R") + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/noPrune_noSex_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) + +cat("Excluding sex as input param\n") +dats$clinical <- dats$clinical[c("age","performance_score"),] + +# first loop - over train/test splits +for (rngNum in 1:50) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + alldat_train <- do.call("rbind",dats_train) + netSets_iter <- netSets + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } +## Create the mega database with all patients and all nets. +## This will be used to predict test samples by subsetting just for feature +## selected nets in a given round +## Note that this is useful for all train/test splits because we can always +## change which samples are query and can always subset based on which nets +## are feature selected in a given round. +netDir <- sprintf("%s/test_networks",outDir) +nonclin <- setdiff(names(netSets),"clinical") +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) +netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) +netList <- c(netList,netList2) +cat(sprintf("Total of %i nets\n", length(netList))) +# now create database +testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) + for (cutoff in 9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) From 35810de3ba59d98fe7ae1ddaad5344205e84a9d8 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 26 Apr 2018 22:31:33 -0400 Subject: [PATCH 053/124] minor --- misc/PanCancer/simFuns.R | 4 ---- 1 file changed, 4 deletions(-) diff --git a/misc/PanCancer/simFuns.R b/misc/PanCancer/simFuns.R index b12c4a47..46c045a3 100644 --- a/misc/PanCancer/simFuns.R +++ b/misc/PanCancer/simFuns.R @@ -127,7 +127,3 @@ plotSim <- function(s1,name="simfun",c1,c2,logT=FALSE) { boxplot(out,main=name) } - - - - From 66caaf60db79c9e2cd7f8b5aa908c78f179879b9 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 26 Apr 2018 22:32:56 -0400 Subject: [PATCH 054/124] rbf --- .../PanCancer/noPrune/GBM_noPrune_RBFforAll.R | 362 ++++++++++++++++++ .../PanCancer/noPrune/GBM_noPrune_noSex_RBF.R | 334 ++++++++++++++++ .../noPrune/GBM_noPrune_noSex_RBF_0.3.R | 335 ++++++++++++++++ .../noPrune/GBM_noPrune_noSex_RBF_0.5.R | 335 ++++++++++++++++ 4 files changed, 1366 insertions(+) create mode 100644 misc/PanCancer/noPrune/GBM_noPrune_RBFforAll.R create mode 100644 misc/PanCancer/noPrune/GBM_noPrune_noSex_RBF.R create mode 100644 misc/PanCancer/noPrune/GBM_noPrune_noSex_RBF_0.3.R create mode 100644 misc/PanCancer/noPrune/GBM_noPrune_noSex_RBF_0.5.R diff --git a/misc/PanCancer/noPrune/GBM_noPrune_RBFforAll.R b/misc/PanCancer/noPrune/GBM_noPrune_RBFforAll.R new file mode 100644 index 00000000..ce312840 --- /dev/null +++ b/misc/PanCancer/noPrune/GBM_noPrune_RBFforAll.R @@ -0,0 +1,362 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) +require(netDx) +require(netDx.examples) +source("../runLM.R") + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/np_RBFall_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +#' radial basis function +#' @param m (matrix) data, columns are patients +#' @param nm (char) kernel to use, prefix to kernlab::*dot() functions. +#' e.g. rbf,tanh,laplace +sim.kern <- function(m,nm="rbf",sigma=0.05) { + if (nm=="rbf") { + func <- kernlab::rbfdot(sigma) + cat(sprintf("Sigma = %1.2f\n", sigma)) + } else if (nm == "tanh") { + cat("using tanh\n") + func <- kernlab::tanhdot() + } + idx <- combinat::combn(1:ncol(m),2) + out <- matrix(NA,nrow=ncol(m),ncol=ncol(m)) + for (comb in 1:ncol(idx)) { + i <- idx[1,comb]; j <- idx[2,comb] + x <- func(m[,i],m[,j]) + out[i,j] <- x; out[j,i] <- x + } + diag(out) <- 1 + colnames(out)<- colnames(m); + rownames(out) <- colnames(m) + out +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) + +sink(logFile,split=TRUE) +tryCatch({ +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) + +# first loop - over train/test splits +for (rngNum in 1:50) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + alldat_train <- do.call("rbind",dats_train) + netSets_iter <- netSets + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") +browser() + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,simMetric="custom",customFunc=sim.kern,sparsify=TRUE, + verbose=FALSE,numCores=numCores,writeProfiles=FALSE) + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,sparsify=TRUE, + writeProfiles=FALSE, + verbose=FALSE,numCores=numCores,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } +## Create the mega database with all patients and all nets. +## This will be used to predict test samples by subsetting just for feature +## selected nets in a given round +## Note that this is useful for all train/test splits because we can always +## change which samples are query and can always subset based on which nets +## are feature selected in a given round. +netDir <- sprintf("%s/test_networks",outDir) +nonclin <- setdiff(names(netSets),"clinical") +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + simMetric="custom",customFunc=sim.kern,sparsify=TRUE, + verbose=FALSE,numCores=numCores,writeProfiles=FALSE) +netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) +netList <- c(netList,netList2) +cat(sprintf("Total of %i nets\n", length(netList))) +# now create database +testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/noPrune/GBM_noPrune_noSex_RBF.R b/misc/PanCancer/noPrune/GBM_noPrune_noSex_RBF.R new file mode 100644 index 00000000..0cde6853 --- /dev/null +++ b/misc/PanCancer/noPrune/GBM_noPrune_noSex_RBF.R @@ -0,0 +1,334 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) +require(netDx) +require(netDx.examples) +source("../runLM.R") + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/test_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +#' radial basis function +#' @param m (matrix) data, columns are patients +#' @param nm (char) kernel to use, prefix to kernlab::*dot() functions. +#' e.g. rbf,tanh,laplace +sim.kern <- function(m,nm="rbf",sigma=0.05) { + if (nm=="rbf") { + func <- kernlab::rbfdot(sigma) + cat(sprintf("Sigma = %1.2f\n", sigma)) + } else if (nm == "tanh") { + cat("using tanh\n") + func <- kernlab::tanhdot() + } + idx <- combinat::combn(1:ncol(m),2) + out <- matrix(NA,nrow=ncol(m),ncol=ncol(m)) + for (comb in 1:ncol(idx)) { + i <- idx[1,comb]; j <- idx[2,comb] + x <- func(m[,i],m[,j]) + out[i,j] <- x; out[j,i] <- x + } + diag(out) <- 1 + colnames(out)<- colnames(m); + rownames(out) <- colnames(m) + out +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) + +sink(logFile,split=TRUE) +tryCatch({ +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) + +dats$clinical <- dats$clinical[c("age","performance_score"),] + + +# first loop - over train/test splits +for (rngNum in 1:50) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + alldat_train <- do.call("rbind",dats_train) + netSets_iter <- netSets + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=sim.kern, + writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } +## Create the mega database with all patients and all nets. +## This will be used to predict test samples by subsetting just for feature +## selected nets in a given round +## Note that this is useful for all train/test splits because we can always +## change which samples are query and can always subset based on which nets +## are feature selected in a given round. +netDir <- sprintf("%s/test_networks",outDir) +nonclin <- setdiff(names(netSets),"clinical") +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) +netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=sim.kern,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) +netList <- c(netList,netList2) +cat(sprintf("Total of %i nets\n", length(netList))) +# now create database +testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/noPrune/GBM_noPrune_noSex_RBF_0.3.R b/misc/PanCancer/noPrune/GBM_noPrune_noSex_RBF_0.3.R new file mode 100644 index 00000000..d20f48cf --- /dev/null +++ b/misc/PanCancer/noPrune/GBM_noPrune_noSex_RBF_0.3.R @@ -0,0 +1,335 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) +require(netDx) +require(netDx.examples) +source("../runLM.R") + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/rbf0.3_noSex_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +#' radial basis function +#' @param m (matrix) data, columns are patients +#' @param nm (char) kernel to use, prefix to kernlab::*dot() functions. +#' e.g. rbf,tanh,laplace +sim.kern <- function(m,nm="rbf",sigma=0.3) { + if (nm=="rbf") { + func <- kernlab::rbfdot(sigma) + cat(sprintf("Sigma = %1.2f\n", sigma)) + } else if (nm == "tanh") { + cat("using tanh\n") + func <- kernlab::tanhdot() + } + idx <- combinat::combn(1:ncol(m),2) + out <- matrix(NA,nrow=ncol(m),ncol=ncol(m)) + for (comb in 1:ncol(idx)) { + i <- idx[1,comb]; j <- idx[2,comb] + x <- func(m[,i],m[,j]) + out[i,j] <- x; out[j,i] <- x + } + diag(out) <- 1 + colnames(out)<- colnames(m); + rownames(out) <- colnames(m) + out +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) + +sink(logFile,split=TRUE) +tryCatch({ +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) + +dats$clinical <- dats$clinical[c("age","performance_score"),] +cat("Sex excluded\n") + + +# first loop - over train/test splits +for (rngNum in 1:50) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + alldat_train <- do.call("rbind",dats_train) + netSets_iter <- netSets + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=sim.kern, + writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } +## Create the mega database with all patients and all nets. +## This will be used to predict test samples by subsetting just for feature +## selected nets in a given round +## Note that this is useful for all train/test splits because we can always +## change which samples are query and can always subset based on which nets +## are feature selected in a given round. +netDir <- sprintf("%s/test_networks",outDir) +nonclin <- setdiff(names(netSets),"clinical") +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) +netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=sim.kern,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) +netList <- c(netList,netList2) +cat(sprintf("Total of %i nets\n", length(netList))) +# now create database +testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/noPrune/GBM_noPrune_noSex_RBF_0.5.R b/misc/PanCancer/noPrune/GBM_noPrune_noSex_RBF_0.5.R new file mode 100644 index 00000000..90698f19 --- /dev/null +++ b/misc/PanCancer/noPrune/GBM_noPrune_noSex_RBF_0.5.R @@ -0,0 +1,335 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) +require(netDx) +require(netDx.examples) +source("../runLM.R") + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/rbf0.5_noSex_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +#' radial basis function +#' @param m (matrix) data, columns are patients +#' @param nm (char) kernel to use, prefix to kernlab::*dot() functions. +#' e.g. rbf,tanh,laplace +sim.kern <- function(m,nm="rbf",sigma=0.5) { + if (nm=="rbf") { + func <- kernlab::rbfdot(sigma) + cat(sprintf("Sigma = %1.2f\n", sigma)) + } else if (nm == "tanh") { + cat("using tanh\n") + func <- kernlab::tanhdot() + } + idx <- combinat::combn(1:ncol(m),2) + out <- matrix(NA,nrow=ncol(m),ncol=ncol(m)) + for (comb in 1:ncol(idx)) { + i <- idx[1,comb]; j <- idx[2,comb] + x <- func(m[,i],m[,j]) + out[i,j] <- x; out[j,i] <- x + } + diag(out) <- 1 + colnames(out)<- colnames(m); + rownames(out) <- colnames(m) + out +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) + +sink(logFile,split=TRUE) +tryCatch({ +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) + +dats$clinical <- dats$clinical[c("age","performance_score"),] +cat("Sex excluded\n") + + +# first loop - over train/test splits +for (rngNum in 1:50) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + alldat_train <- do.call("rbind",dats_train) + netSets_iter <- netSets + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=sim.kern, + writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } +## Create the mega database with all patients and all nets. +## This will be used to predict test samples by subsetting just for feature +## selected nets in a given round +## Note that this is useful for all train/test splits because we can always +## change which samples are query and can always subset based on which nets +## are feature selected in a given round. +netDir <- sprintf("%s/test_networks",outDir) +nonclin <- setdiff(names(netSets),"clinical") +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) +netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=sim.kern,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) +netList <- c(netList,netList2) +cat(sprintf("Total of %i nets\n", length(netList))) +# now create database +testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) From 9f71af054ae083957e757f514bfeb389f8876497 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Sat, 28 Apr 2018 07:03:06 -0400 Subject: [PATCH 055/124] bug fix in pamr - gene-level nets --- misc/PanCancer/pruneVersion/pamr/GBM_pamr.R | 10 +- .../pamr/GBM_pamr_genes_incClin.R | 445 +++++++++++++++++ misc/PanCancer/pruneVersion/pamr/LUSC_pamr.R | 6 +- .../pruneVersion/pamr/LUSC_pamr_genes.R | 454 ++++++++++++++++++ .../pruneVersion/pamr/LUSC_pamr_genes_sp1.R | 454 ++++++++++++++++++ 5 files changed, 1365 insertions(+), 4 deletions(-) create mode 100644 misc/PanCancer/pruneVersion/pamr/GBM_pamr_genes_incClin.R create mode 100644 misc/PanCancer/pruneVersion/pamr/LUSC_pamr_genes.R create mode 100644 misc/PanCancer/pruneVersion/pamr/LUSC_pamr_genes_sp1.R diff --git a/misc/PanCancer/pruneVersion/pamr/GBM_pamr.R b/misc/PanCancer/pruneVersion/pamr/GBM_pamr.R index 5a8157ca..4c490d8b 100644 --- a/misc/PanCancer/pruneVersion/pamr/GBM_pamr.R +++ b/misc/PanCancer/pruneVersion/pamr/GBM_pamr.R @@ -175,15 +175,21 @@ for (rngNum in 1:20) { drop=FALSE]) netSets_iter <- list() for (nm in names(dats_train)) { + print(nm) # shrunken centroid for initial feature selection tmp <- na.omit(dats_train[[nm]]) data <- list(x=tmp,y=factor(pheno$STATUS),genenames=rownames(tmp), geneid=rownames(tmp)) + set.seed(123); # reproducible data.fit <- pamr.train(data) data.cv <- pamr.cv(data.fit, data) - thresh <- data.cv$threshold[which.min(data.cv$threshold)] + idx <- which.min(data.cv$error) + thresh <- data.cv$threshold[idx] keepgenes <- pamr.listgenes(data.fit,data,thresh,data.cv) +if (nm %in% "mir") browser() + cat(sprintf("%i:%s:PAMR thresh=%1.2f (idx=%i); %i left\n", + rngNum, nm,thresh,idx,length(keepgenes[,1]))) tmp <- dats_train[[nm]];orig_ct <- nrow(tmp) tmp <- tmp[which(rownames(tmp)%in% keepgenes[,1]),] @@ -279,7 +285,7 @@ netList <- c(netList,netList2) cat(sprintf("Total of %i nets\n", length(netList))) # now create database testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) - for (cutoff in 7:9) { + for (cutoff in 9) { predRes <- list() for (g in subtypes) { pDir2 <- sprintf("%s/%s",pDir,g) diff --git a/misc/PanCancer/pruneVersion/pamr/GBM_pamr_genes_incClin.R b/misc/PanCancer/pruneVersion/pamr/GBM_pamr_genes_incClin.R new file mode 100644 index 00000000..2a502b4a --- /dev/null +++ b/misc/PanCancer/pruneVersion/pamr/GBM_pamr_genes_incClin.R @@ -0,0 +1,445 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(pamr) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/pamrGenes_incClin_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# x is vector of values, one per patient (e.g. ages) +normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out +} + +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ +# first loop - over train/test splits +mega_combList <- combList # changes each round +for (rngNum in 1:20) { + combList <- mega_combList # clean slate + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in names(dats_train)) { + print(nm) + tmp <- na.omit(dats_train[[nm]]) + data <- list(x=tmp,y=factor(pheno$STATUS),genenames=rownames(tmp), + geneid=rownames(tmp)) + set.seed(123); # reproducible + data.fit <- pamr.train(data) + data.cv <- pamr.cv(data.fit, data) + idx <- which.min(data.cv$error) + thresh <- data.cv$threshold[idx] + vars <- c() + tryCatch({ + vars <- pamr.listgenes(data.fit,data,thresh,data.cv) + cat(sprintf("%i:%s:PAMR thresh=%1.2f (idx=%i); %i left\n", + rngNum, nm,thresh,idx,length(vars[,1]))) + }, error=function(ex) { + cat("caught error\n"); + }) + + if (length(vars)>0) { + varrank <- as.numeric(vars[,4]) #rank + vars <- vars[which(varrank<100),1] # keep top 100 genes + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { + netSets_iter[[k]] <- k + } + combList[[nm]] <- paste(sprintf("%s_cont", rownames(tmp))) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + } + } + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + +# ------------------------------- +# make train db + netDir <- sprintf("%s/networks",outDir) + nonclin <- names(netSets_iter) + + netLen <- unlist(lapply(netSets_iter,length)) + multiNet <- intersect(nonclin, names(netSets_iter[netLen>1])) + multiNet <- setdiff(multiNet,"clinical") + singNet <- intersect(nonclin, names(netSets_iter[netLen==1])) + +netList3 <- c() +netList2 <- c() +netList <- c() + + if (length(singNet)>0) { + cat(sprintf("%i: %i single nets { %s }\n", rngNum, length(singNet), + paste(singNet,collapse=","))) + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[singNet],netDir, + simMetric="custom",customFunc=normDiff,writeProfiles=FALSE,sparsify=TRUE, + useSparsify2=FALSE, + verbose=FALSE,numCores=numCores) + } + if (length(multiNet)>0) { + cat(sprintf("%i: %i multi nets { %s }\n", rngNum, length(multiNet), + paste(multiNet,collapse=","))) + netList3 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[multiNet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) + } + if ("clinical" %in% names(netSets_iter)) { + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + } + netList <- c(netList,netList2,netList3) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + +# ------------------------------------- +# make test db + +netDir <- sprintf("%s/test_networks",outDir) +nonclin <- names(netSets_iter) #setdiff(names(netSets_iter),"clinical") + +netLen <- unlist(lapply(netSets_iter,length)) +multiNet <- intersect(nonclin, names(netSets_iter[netLen>1])) +multiNet <- setdiff(multiNet,"clinical") +singNet <- intersect(nonclin, names(netSets_iter[netLen==1])) + +netList3 <- c() +netList2 <- c() +netList <- c() + +if (length(singNet)>0) { + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[singNet],netDir, + simMetric="custom",customFunc=normDiff,writeProfiles=FALSE,sparsify=TRUE, + useSparsify2=FALSE, + verbose=FALSE,numCores=numCores) +} +if (length(multiNet)>0){ + cat(sprintf("%i: %i multi nets { %s }\n", rngNum, length(multiNet), + paste(multiNet,collapse=","))) + netList3 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[multiNet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) +} +if ("clinical" %in% names(netSets_iter)) { + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) +} +netList <- c(netList,netList2,netList3) +cat(sprintf("Total of %i nets\n", length(netList))) +# now create database +testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) + +# ------------------------------------- + + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/pamr/LUSC_pamr.R b/misc/PanCancer/pruneVersion/pamr/LUSC_pamr.R index d7c6a1f2..0eff8877 100644 --- a/misc/PanCancer/pruneVersion/pamr/LUSC_pamr.R +++ b/misc/PanCancer/pruneVersion/pamr/LUSC_pamr.R @@ -193,10 +193,12 @@ for (rngNum in 1:100) { set.seed(123); # reproducible data.fit <- pamr.train(data) data.cv <- pamr.cv(data.fit, data) - thresh <- data.cv$threshold[which.min(data.cv$threshold)] + idx <- which.min(data.cv$error) + thresh <- data.cv$threshold[idx] keepgenes <- pamr.listgenes(data.fit,data,thresh,data.cv) + cat(sprintf("%i:%s:PAMR thresh=%1.2f (idx=%i); %i left\n", + rngNum, nm,thresh,idx,length(keepgenes[,1]))) - cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(keepgenes))) tmp <- dats_train[[nm]];orig_ct <- nrow(tmp) tmp <- tmp[which(rownames(tmp)%in% keepgenes[,1]),] dats_train[[nm]] <- tmp diff --git a/misc/PanCancer/pruneVersion/pamr/LUSC_pamr_genes.R b/misc/PanCancer/pruneVersion/pamr/LUSC_pamr_genes.R new file mode 100644 index 00000000..a7ffeeaa --- /dev/null +++ b/misc/PanCancer/pruneVersion/pamr/LUSC_pamr_genes.R @@ -0,0 +1,454 @@ +#' PanCancer binarized survival: LUSC: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design + +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(pamr) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/lassoGenes_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# x is vector of values, one per patient (e.g. ages) +normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out +} + +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), + survival=sprintf("%s/LUSC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), + prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), + mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), + cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno +rownames(clinical) <- clinical[,1]; +# ======================= +# LUSC-specific variables +clinical$stage <- as.vector(clinical$stage) +clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" +clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" +clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" +clinical$stage <- as.factor(clinical$stage) +clinical <- clinical[, -which(colnames(clinical)=="gender")] +clinical <- t(clinical[,c("age","stage")]) +clinical[1,] <- as.integer(clinical[1,]) +clinical[2,] <- as.integer(as.factor(clinical[2,])) +class(clinical) <- "numeric" +# ======================= +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + if (nm == "rna") tmp <- log(tmp+1) + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +mega_combList <- combList # this will change in each round + +# first loop - over train/test splits +for (rngNum in 1:20) { + combList <- mega_combList # clean slate + + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + ## pruneTrain code ------ + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in setdiff(names(dats_train),"clinical")) { + print(nm) + tmp <- na.omit(dats_train[[nm]]) + data <- list(x=tmp,y=factor(pheno$STATUS),genenames=rownames(tmp), + geneid=rownames(tmp)) + set.seed(123); # reproducible + data.fit <- pamr.train(data) + data.cv <- pamr.cv(data.fit, data) + idx <- which.min(data.cv$error) + thresh <- data.cv$threshold[idx] + vars <- c() + tryCatch({ + vars <- pamr.listgenes(data.fit,data,thresh,data.cv) + cat(sprintf("%i:%s:PAMR thresh=%1.2f (idx=%i); %i left\n", + rngNum, nm,thresh,idx,length(vars[,1]))) + },error=function(ex){ + cat("caught error\n"); + }) + + if (length(vars)>0) { + varrank <- as.numeric(vars[,4]) #rank + vars <- vars[which(varrank<100),1] # keep top 100 genes + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { + netSets_iter[[k]] <- k + } + combList[[nm]] <- paste(sprintf("%s_cont", rownames(tmp))) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + } + } + + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + combList[["clinicalAprot"]] <- c(combList[["clinical"]],combList[["prot"]]) + + # END lasso UF + # ---------------------- + cat("add combList changes\n") + + alldat_train <- do.call("rbind",dats_train) + netSets_iter[["clinical"]] <- netSets[["clinical"]] + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + +# ------------------------------- +# make train db + netLen <- unlist(lapply(netSets_iter,length)) + multiNet <- intersect(nonclin, names(netSets_iter[netLen>1])) + singNet <- intersect(nonclin, names(netSets_iter[netLen==1])) +netList3 <- c() +netList2 <- c() +netList <- c() + + if (length(singNet)>0) { + cat(sprintf("%i: %i single nets { %s }\n", rngNum, length(singNet), + paste(singNet,collapse=","))) + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[singNet],netDir, + simMetric="custom",customFunc=normDiff,writeProfiles=FALSE,sparsify=TRUE, + useSparsify2=TRUE, + verbose=FALSE,numCores=numCores) + } + + if (length(multiNet)>0) { + cat(sprintf("%i: %i multi nets { %s }\n", rngNum, length(multiNet), + paste(multiNet,collapse=","))) + netList3 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[multiNet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) + } + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE,useSparsify2=TRUE) + netList <- c(netList,netList2,netList3) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores, + simMetric="pearson") +# ------------------------------- + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("CombList = %s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + netDir <- sprintf("%s/test_networks",megaDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + + netLen <- unlist(lapply(netSets_iter,length)) + multiNet <- intersect(nonclin, names(netSets_iter[netLen>1])) + singNet <- intersect(nonclin, names(netSets_iter[netLen==1])) + +# ------------------------------- +# make test db +netList3 <- c() +netList2 <- c() +netList <- c() + + if (length(singNet)>0) { + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[singNet],netDir, + simMetric="custom",customFunc=normDiff,writeProfiles=FALSE,sparsify=TRUE, + useSparsify2=TRUE, + verbose=FALSE,numCores=numCores) + } + if (length(multiNet)>0){ + cat(sprintf("%i: %i multi nets { %s }\n", rngNum, length(multiNet), + paste(multiNet,collapse=","))) + netList3 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[multiNet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) + } + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE,useSparsify2=TRUE) + netList <- c(netList,netList2,netList3) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + megadbDir <- GM_createDB(netDir, pheno_all$ID, + megaDir,numCores=numCores, + simMetric="pearson") +# ------------------------------- + + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + # only include the nets that were feature selected + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(megadbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + } else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } +} + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/pamr/LUSC_pamr_genes_sp1.R b/misc/PanCancer/pruneVersion/pamr/LUSC_pamr_genes_sp1.R new file mode 100644 index 00000000..9189d6c0 --- /dev/null +++ b/misc/PanCancer/pruneVersion/pamr/LUSC_pamr_genes_sp1.R @@ -0,0 +1,454 @@ +#' PanCancer binarized survival: LUSC: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design + +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(pamr) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/pamrGenes_sp1_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# x is vector of values, one per patient (e.g. ages) +normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out +} + +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), + survival=sprintf("%s/LUSC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), + prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), + mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), + cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno +rownames(clinical) <- clinical[,1]; +# ======================= +# LUSC-specific variables +clinical$stage <- as.vector(clinical$stage) +clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" +clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" +clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" +clinical$stage <- as.factor(clinical$stage) +clinical <- clinical[, -which(colnames(clinical)=="gender")] +clinical <- t(clinical[,c("age","stage")]) +clinical[1,] <- as.integer(clinical[1,]) +clinical[2,] <- as.integer(as.factor(clinical[2,])) +class(clinical) <- "numeric" +# ======================= +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + if (nm == "rna") tmp <- log(tmp+1) + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +mega_combList <- combList # this will change in each round + +# first loop - over train/test splits +for (rngNum in 1:20) { + combList <- mega_combList # clean slate + + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + ## pruneTrain code ------ + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in setdiff(names(dats_train),"clinical")) { + print(nm) + tmp <- na.omit(dats_train[[nm]]) + data <- list(x=tmp,y=factor(pheno$STATUS),genenames=rownames(tmp), + geneid=rownames(tmp)) + set.seed(123); # reproducible + data.fit <- pamr.train(data) + data.cv <- pamr.cv(data.fit, data) + idx <- which.min(data.cv$error) + thresh <- data.cv$threshold[idx] + vars <- c() + tryCatch({ + vars <- pamr.listgenes(data.fit,data,thresh,data.cv) + cat(sprintf("%i:%s:PAMR thresh=%1.2f (idx=%i); %i left\n", + rngNum, nm,thresh,idx,length(vars[,1]))) + },error=function(ex){ + cat("caught error\n"); + }) + + if (length(vars)>0) { + varrank <- as.numeric(vars[,4]) #rank + vars <- vars[which(varrank<100),1] # keep top 100 genes + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { + netSets_iter[[k]] <- k + } + combList[[nm]] <- paste(sprintf("%s_cont", rownames(tmp))) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + } + } + + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + combList[["clinicalAprot"]] <- c(combList[["clinical"]],combList[["prot"]]) + + # END lasso UF + # ---------------------- + cat("add combList changes\n") + + alldat_train <- do.call("rbind",dats_train) + netSets_iter[["clinical"]] <- netSets[["clinical"]] + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + +# ------------------------------- +# make train db + netLen <- unlist(lapply(netSets_iter,length)) + multiNet <- intersect(nonclin, names(netSets_iter[netLen>1])) + singNet <- intersect(nonclin, names(netSets_iter[netLen==1])) +netList3 <- c() +netList2 <- c() +netList <- c() + + if (length(singNet)>0) { + cat(sprintf("%i: %i single nets { %s }\n", rngNum, length(singNet), + paste(singNet,collapse=","))) + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[singNet],netDir, + simMetric="custom",customFunc=normDiff,writeProfiles=FALSE,sparsify=TRUE, + useSparsify2=FALSE, + verbose=FALSE,numCores=numCores) + } + + if (length(multiNet)>0) { + cat(sprintf("%i: %i multi nets { %s }\n", rngNum, length(multiNet), + paste(multiNet,collapse=","))) + netList3 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[multiNet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) + } + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE,useSparsify2=FALSE) + netList <- c(netList,netList2,netList3) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores, + simMetric="pearson") +# ------------------------------- + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("CombList = %s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + netDir <- sprintf("%s/test_networks",megaDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + + netLen <- unlist(lapply(netSets_iter,length)) + multiNet <- intersect(nonclin, names(netSets_iter[netLen>1])) + singNet <- intersect(nonclin, names(netSets_iter[netLen==1])) + +# ------------------------------- +# make test db +netList3 <- c() +netList2 <- c() +netList <- c() + + if (length(singNet)>0) { + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[singNet],netDir, + simMetric="custom",customFunc=normDiff,writeProfiles=FALSE,sparsify=TRUE, + useSparsify2=FALSE, + verbose=FALSE,numCores=numCores) + } + if (length(multiNet)>0){ + cat(sprintf("%i: %i multi nets { %s }\n", rngNum, length(multiNet), + paste(multiNet,collapse=","))) + netList3 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[multiNet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) + } + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE,useSparsify2=FALSE) + netList <- c(netList,netList2,netList3) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + megadbDir <- GM_createDB(netDir, pheno_all$ID, + megaDir,numCores=numCores, + simMetric="pearson") +# ------------------------------- + + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + # only include the nets that were feature selected + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(megadbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + } else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } +} + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) From 21a28e4773189cf2517395300dc5053d43517d42 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Sat, 28 Apr 2018 07:10:25 -0400 Subject: [PATCH 056/124] added pamr_genes --- misc/PanCancer/multiCutoff/GBM_getRes.R | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/misc/PanCancer/multiCutoff/GBM_getRes.R b/misc/PanCancer/multiCutoff/GBM_getRes.R index 289f3924..60e2bb9c 100644 --- a/misc/PanCancer/multiCutoff/GBM_getRes.R +++ b/misc/PanCancer/multiCutoff/GBM_getRes.R @@ -7,13 +7,13 @@ require(reshape2) mainD <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output" dirSet <- list( base="noPrune_180423", -# rbf3="rbf0.3_noSex_180424", -# rbf5="rbf0.5_noSex_180424", -# rbfother="rbf_noSex_180424", -# tanh="tanh_noSex_180424" + rbf3="rbf0.3_noSex_180424", + rbf5="rbf0.5_noSex_180424", + rbfother="rbf_noSex_180424", + tanh="tanh_noSex_180424", ridge_fix="ridge_AbsFix_180426", lassoGenes_sp1="lassoGenes_incClin_180426", - pamr="pamr_180425" + pamrGenes="pamrGenes_incClin_180427" ) mega_auc <- list() @@ -60,4 +60,9 @@ mega_auc[[curdir]] <- unlist(lapply(auc_set,mean)) } -pdf("test.pdf"); boxplot(mega_auc); dev.off() +dt <- format(Sys.Date(),"%y%m%d") +require(gplots) +pdf(sprintf("GBM_%s.pdf",dt),width=18,height=6); +boxplot2(mega_auc,las=1,cex.axis=1.7,cex.main=2,main="GBM"); +abline(h=0.5) +dev.off() From a40758a40bf560f59c04bf0694ae9bd3690a2c05 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Mon, 30 Apr 2018 21:36:57 -0400 Subject: [PATCH 057/124] compile results from various experiments for GBM,LUSC,OV --- misc/PanCancer/multiCutoff/GBM_parseMulti.R | 4 +- misc/PanCancer/multiCutoff/KIRC_getRes.R | 58 +++++++++++++++++++++ misc/PanCancer/multiCutoff/LUSC_getRes.R | 57 ++++++++++++++++++++ misc/PanCancer/multiCutoff/OV_getRes.R | 57 ++++++++++++++++++++ 4 files changed, 174 insertions(+), 2 deletions(-) create mode 100644 misc/PanCancer/multiCutoff/KIRC_getRes.R create mode 100644 misc/PanCancer/multiCutoff/LUSC_getRes.R create mode 100644 misc/PanCancer/multiCutoff/OV_getRes.R diff --git a/misc/PanCancer/multiCutoff/GBM_parseMulti.R b/misc/PanCancer/multiCutoff/GBM_parseMulti.R index bb3dbcc9..cda95c90 100644 --- a/misc/PanCancer/multiCutoff/GBM_parseMulti.R +++ b/misc/PanCancer/multiCutoff/GBM_parseMulti.R @@ -5,7 +5,7 @@ require(reshape2) #dataDir_each <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pruneClinRNA_alone_180125" -dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/ridge_180420" +dataDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/lassoGenes_incClin_180426" settypes <- c("clinical","mir","rna","cnv","dnam", "clinicalArna","clinicalAmir","clinicalAdnam","clinicalAcnv","all") @@ -24,7 +24,7 @@ for (settype in settypes) { ### dataDir <- dataDir_both ### else ### dataDir <- dataDir_each - rngDir <- paste(sprintf("%s/rng",dataDir), 1:100,sep="") + rngDir <- paste(sprintf("%s/rng",dataDir), 1:20,sep="") colctr <- 1 for (cutoff in 9) { diff --git a/misc/PanCancer/multiCutoff/KIRC_getRes.R b/misc/PanCancer/multiCutoff/KIRC_getRes.R new file mode 100644 index 00000000..03072d5f --- /dev/null +++ b/misc/PanCancer/multiCutoff/KIRC_getRes.R @@ -0,0 +1,58 @@ +#' plot GBM results with multiple CV cutoffs +rm(list=ls()) +require(netDx) +require(reshape2) + +#dataDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output/pruneTrain_180419" +mainD <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" + +dirSet <- list( + base="noPrune_180423", + lasso="lasso_180426", + pamr="pamr_180426" +# ridge="ridgeAbsFix_180426" +) + +settypes <- c("clinical","mir","rna","prot","cnv","dnam", + "clinicalArna","clinicalAmir","clinicalAprot","clinicalAdnam", + "clinicalAcnv","all") +mega_auc <- list() + +for (curdir in names(dirSet)) { +dataDir <- sprintf("%s/%s",mainD,dirSet[[curdir]]) + rngMax <- 20 + if (any(grep("base",curdir))) { + rngMax <- 15 + } + +auc_set <- list() +for (settype in settypes) { + rngDir <- paste(sprintf("%s/rng",dataDir), 1:rngMax,sep="") + +colctr <- 1 + cutoff <- 9 + c7 <- sprintf("%s/%s/predictionResults.txt", + rngDir,settype,cutoff) + torm <- c() + for (idx in 1:length(c7)) { + dat <- read.delim(c7[idx],sep="\t",h=T,as.is=T) + x1 <- sum(dat$STATUS=="SURVIVEYES") + x2 <- sum(dat$STATUS=="SURVIVENO") + if (x1<1 & x2<1) torm <- c(torm, idx) + } + cat(sprintf("%i: removing %i\n", cutoff,length(torm))) + if (length(torm)>0) c7 <- c7[-torm] + postscript(sprintf("tmp.eps")) + x <- plotPerf(c7,c("SURVIVEYES","SURVIVENO")) + dev.off() + + y1 <- unlist(lapply(x,function(i) i$auroc)) + auc_set[[settype]] <- y1 + } + mega_auc[[curdir]] <- unlist(lapply(auc_set,mean)) +} +pdf(sprintf("KIRC_%s",format(Sys.Date(),"%y%m%d"))); +boxplot(mega_auc,main="KIRC",cex.axis=1.7,cex.main=2,las=1); dev.off() + + + diff --git a/misc/PanCancer/multiCutoff/LUSC_getRes.R b/misc/PanCancer/multiCutoff/LUSC_getRes.R new file mode 100644 index 00000000..8927492a --- /dev/null +++ b/misc/PanCancer/multiCutoff/LUSC_getRes.R @@ -0,0 +1,57 @@ +#' plot GBM results with multiple CV cutoffs +rm(list=ls()) +require(netDx) +require(reshape2) + +mainD <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output" +dirSet <- list( + base="noPrune_180423", + lasso="lasso_180426", + lassoGenes="lassoGenes_180426", + pamrGenes_sp2="pamrGenes_180427", + pamrGenes_sp1="pamrGenes_sp1_180427" +) +settypes <- c("clinical","mir","rna","prot","cnv", + "clinicalArna","clinicalAmir","clinicalAprot","clinicalAcnv","all") + +mega_auc <- list() +for (curdir in names(dirSet)) { +dataDir <- sprintf("%s/%s",mainD,dirSet[[curdir]]) + if (curdir %in% "base") rngMax <- 15 + else rngMax <- 20 + +auc_set <- list() +for (settype in settypes) { + if (curdir %in% "lassoGenes") { + rngDir <- paste(sprintf("%s/rng",dataDir), 3:rngMax,sep="") + } else { + rngDir <- paste(sprintf("%s/rng",dataDir), 1:rngMax,sep="") + } + + +for (cutoff in 9) { + c7 <- sprintf("%s/%s/cutoff%i/predictionResults.txt", + rngDir,settype,cutoff) + torm <- c() + for (idx in 1:length(c7)) { + dat <- read.delim(c7[idx],sep="\t",h=T,as.is=T) + x1 <- sum(dat$STATUS=="SURVIVEYES") + x2 <- sum(dat$STATUS=="SURVIVENO") + if (x1<1 & x2<1) torm <- c(torm, idx) + } + cat(sprintf("%i: removing %i\n", cutoff,length(torm))) + if (length(torm)>0) c7 <- c7[-torm] + postscript(sprintf("tmp.eps")) + x <- plotPerf(c7,c("SURVIVEYES","SURVIVENO")) + dev.off() + y1 <- unlist(lapply(x,function(i) i$auroc)) + auc_set[[settype]] <- y1 + + tmp <- c() + cur <- auc_set[[settype]] +} +} +mega_auc[[curdir]] <- unlist(lapply(auc_set,mean)) +} +dt <- format(Sys.Date(),"%y%m%d") +pdf(sprintf("LUSC_%s.pdf",dt),width=13,height=6); boxplot(mega_auc,main="LUSC",cex.axis=1.7,cex.main=2,las=1); dev.off() diff --git a/misc/PanCancer/multiCutoff/OV_getRes.R b/misc/PanCancer/multiCutoff/OV_getRes.R new file mode 100644 index 00000000..6fb85e40 --- /dev/null +++ b/misc/PanCancer/multiCutoff/OV_getRes.R @@ -0,0 +1,57 @@ +#' plot GBM results with multiple CV cutoffs +rm(list=ls()) +require(netDx) +require(reshape2) + +mainD <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output" + +maxRng <- 100 +settypes <- c("clinical","mir","rna","prot","cnv","dnam", + "clinicalArna","clinicalAmir","clinicalAprot","clinicalAdnam", + "clinicalAcnv","all") +dirSet <- list( + base="noPrune_180423", + prune="pruneTrain_180419", + lasso="lasso_180426" +) + +mega_auc <- list() +for (curdir in names(dirSet)) { + if (curdir %in% c("lasso","pamr")) rngMax <- 20 + else if (curdir %in% "prune") rngMax <- 14 + else rngMax <- 15 + + cat(sprintf("***** %s *****\n", curdir)) + dataDir <- sprintf("%s/%s",mainD,dirSet[[curdir]]) + settypes <- c("clinical","mir","rna","cnv","dnam", + "clinicalArna","clinicalAmir","clinicalAdnam","clinicalAcnv","all") + ctr <- 1 + auc_set <- list() + for (settype in settypes) { + rngDir <- sprintf("%s/rng%i", dataDir,1:rngMax) + cat(sprintf("Got %i rng files\n",length(rngDir))) + + cutoff <- 9 + c7 <- sprintf("%s/%s/predictionResults.txt", + rngDir,settype,cutoff) + torm <- c() + for (idx in 1:length(c7)) { + dat <- read.delim(c7[idx],sep="\t",h=T,as.is=T) + x1 <- sum(dat$STATUS=="SURVIVEYES") + x2 <- sum(dat$STATUS=="SURVIVENO") + if (x1<1 & x2<1) torm <- c(torm, idx) + } + cat(sprintf("%i: removing %i\n", cutoff,length(torm))) + if (length(torm)>0) c7 <- c7[-torm] + postscript("tmp.eps") + x <- plotPerf(c7,c("SURVIVEYES","SURVIVENO")) + dev.off() + + y1 <- unlist(lapply(x,function(i) i$auroc)) + auc_set[[settype]] <- y1 + } + mega_auc[[curdir]] <- unlist(lapply(auc_set,mean)) +} + +dt <- format(Sys.Date(),"%y%m%d") +pdf(sprintf("OV_%s.pdf",dt)); boxplot(mega_auc,las=1,cex.axis=1.8,main="OV",cex.main=2); dev.off() From ad510dacb22f26b69c0822a67896d039de546bc0 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Mon, 30 Apr 2018 21:38:27 -0400 Subject: [PATCH 058/124] row-wise sort with apply was returning a single long vector for some matrix designs. Using for loop to force list structure --- netDx/DESCRIPTION | 2 +- netDx/R/makePSN_NamedMatrix.R | 13 ++++++++++++- netDx/R/sparsify2.R | 6 +++--- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/netDx/DESCRIPTION b/netDx/DESCRIPTION index 3c443b01..0e302543 100644 --- a/netDx/DESCRIPTION +++ b/netDx/DESCRIPTION @@ -1,6 +1,6 @@ Package: netDx Title: Learns Patient Binary Classification based on Similarity Networks -Version: 1.0.21 +Version: 1.0.22 Authors@R: c(person("Shraddha", "Pai", email = "shraddha.pai@utoronto.ca", role = c("aut", "cre")), person("Ahmad","Shah", role="aut"), person("Shirley","Hui",role="aut"), diff --git a/netDx/R/makePSN_NamedMatrix.R b/netDx/R/makePSN_NamedMatrix.R index 1991ebeb..11e0967e 100644 --- a/netDx/R/makePSN_NamedMatrix.R +++ b/netDx/R/makePSN_NamedMatrix.R @@ -99,6 +99,10 @@ makePSN_NamedMatrix <- function(xpr, nm, namedSets, outDir, outFile <- sprintf("%s/%s_cont.txt", outDir, curSet) sim <- getSimilarity(xpr[idx,,drop=FALSE], type=simMetric,...) + if (is.null(sim)) { + cat(sprintf("%s: sim is null\n",curSet)) + browser() + } if (!useSparsify2) {# prepare for internal sparsifier idx <- which(upper.tri(sim,diag=F)) ij <- matrix_getIJ(dim(sim),idx) @@ -122,8 +126,11 @@ makePSN_NamedMatrix <- function(xpr, nm, namedSets, outDir, if (sparsify) { if (useSparsify2) { - cat("using sparsify2\n") + tryCatch({ sparsify2(pat_pairs,outFile) + },error=function(ex) { + cat("sparse caught error\n"); browser() + }) } else { cat("using original sparsifier method\n") sparsifyNet(pat_pairs,outFile,numPatients=nrow(sim), @@ -132,11 +139,15 @@ makePSN_NamedMatrix <- function(xpr, nm, namedSets, outDir, } else { write.table(pat_pairs, file=outFile,sep="\t", col=FALSE,row=FALSE,quote=FALSE) + print(basename(outFile)) + cat("done\n") } } +#cat("got here\n") oFile <- basename(outFile) } oFile +#cat("out of loop\n") } stopCluster(cl) outFiles diff --git a/netDx/R/sparsify2.R b/netDx/R/sparsify2.R index 454ee733..72da46ca 100644 --- a/netDx/R/sparsify2.R +++ b/netDx/R/sparsify2.R @@ -16,7 +16,9 @@ sparsify2 <- function(W, outFile="tmp.txt",cutoff=0.3,maxInt=50,EDGE_MAX=1000) diag(W) <- 0; W[W < cutoff] <- NA - x <- apply(W,1,sort,decreasing=TRUE,na.last=NA); + x <- list() + for (i in 1:nrow(W)) { x[[i]] <- sort(W[i,],decreasing=TRUE,na.last=NA)} + names(x) <- rownames(W) for (k in 1:length(x)) { cur <- x[[k]] tokeep <- names(cur)[1:min(length(cur),maxInt)] @@ -24,11 +26,9 @@ sparsify2 <- function(W, outFile="tmp.txt",cutoff=0.3,maxInt=50,EDGE_MAX=1000) } tmp <- na.omit(melt(W)) tmp <- tmp[order(tmp[,3],decreasing=TRUE),] - #maxEdge <- 0.02*ncol(W); maxEdge <- nrow(tmp) if (maxEdge>EDGE_MAX) maxEdge <- EDGE_MAX - tmp <- tmp[1:maxEdge,] write.table(tmp,file=outFile,sep="\t",col=F,row=F,quote=F) From 4ad749210f4d118408a59feacc3b17db4f97af25 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Mon, 30 Apr 2018 21:38:48 -0400 Subject: [PATCH 059/124] ridge + gene-wise nets --- .../lasso/GBM_ridge_genes_incClin.R | 434 ++++++++++++++++++ 1 file changed, 434 insertions(+) create mode 100644 misc/PanCancer/pruneVersion/lasso/GBM_ridge_genes_incClin.R diff --git a/misc/PanCancer/pruneVersion/lasso/GBM_ridge_genes_incClin.R b/misc/PanCancer/pruneVersion/lasso/GBM_ridge_genes_incClin.R new file mode 100644 index 00000000..c2da509e --- /dev/null +++ b/misc/PanCancer/pruneVersion/lasso/GBM_ridge_genes_incClin.R @@ -0,0 +1,434 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/ridgeGenes_incClin_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# x is vector of values, one per patient (e.g. ages) +normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out +} + +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ +# first loop - over train/test splits +mega_combList <- combList # changes each round +for (rngNum in 1:20) { + combList <- mega_combList # clean slate + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in names(dats_train)) { + print(nm) + # run lasso with cv + fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=0) + # pick lambda that minimizes MSE + wt <- abs(coef(fit,s="lambda.min")[,1]) + vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { + netSets_iter[[k]] <- k + } + combList[[nm]] <- paste(sprintf("%s_cont", rownames(tmp))) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + } + } + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + +# ------------------------------- +# make train db + netDir <- sprintf("%s/networks",outDir) + nonclin <- names(netSets_iter) + + netLen <- unlist(lapply(netSets_iter,length)) + multiNet <- intersect(nonclin, names(netSets_iter[netLen>1])) + multiNet <- setdiff(multiNet,"clinical") + singNet <- intersect(nonclin, names(netSets_iter[netLen==1])) + +netList3 <- c() +netList2 <- c() +netList <- c() + + if (length(singNet)>0) { + cat(sprintf("%i: %i single nets { %s }\n", rngNum, length(singNet), + paste(singNet,collapse=","))) + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[singNet],netDir, + simMetric="custom",customFunc=normDiff,writeProfiles=FALSE,sparsify=TRUE, + useSparsify2=FALSE, + verbose=FALSE,numCores=numCores) + } + if (length(multiNet)>0) { + cat(sprintf("%i: %i multi nets { %s }\n", rngNum, length(multiNet), + paste(multiNet,collapse=","))) + netList3 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[multiNet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) + } + if ("clinical" %in% names(netSets_iter)) { + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + } + netList <- c(netList,netList2,netList3) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + +# ------------------------------------- +# make test db + +netDir <- sprintf("%s/test_networks",outDir) +nonclin <- names(netSets_iter) #setdiff(names(netSets_iter),"clinical") + +netLen <- unlist(lapply(netSets_iter,length)) +multiNet <- intersect(nonclin, names(netSets_iter[netLen>1])) +multiNet <- setdiff(multiNet,"clinical") +singNet <- intersect(nonclin, names(netSets_iter[netLen==1])) + +netList3 <- c() +netList2 <- c() +netList <- c() + +if (length(singNet)>0) { + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[singNet],netDir, + simMetric="custom",customFunc=normDiff,writeProfiles=FALSE,sparsify=TRUE, + useSparsify2=FALSE, + verbose=FALSE,numCores=numCores) +} +if (length(multiNet)>0){ + cat(sprintf("%i: %i multi nets { %s }\n", rngNum, length(multiNet), + paste(multiNet,collapse=","))) + netList3 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[multiNet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) +} +if ("clinical" %in% names(netSets_iter)) { + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) +} +netList <- c(netList,netList2,netList3) +cat(sprintf("Total of %i nets\n", length(netList))) +# now create database +testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) + +# ------------------------------------- + + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) From 138f84a385c615e7525fee7caf47870f9a5af90d Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 3 May 2018 06:28:06 -0400 Subject: [PATCH 060/124] lasso+gene-level+rbf/eucscale --- .../diff_kernels/GBM_univar_eucscale.R | 386 ++++++++++++++++ .../diff_kernels/GBM_univar_rbf.R | 437 ++++++++++++++++++ .../diff_kernels/GBM_univar_rbf_sp1.R | 437 ++++++++++++++++++ 3 files changed, 1260 insertions(+) create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_eucscale.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_rbf.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_rbf_sp1.R diff --git a/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_eucscale.R b/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_eucscale.R new file mode 100644 index 00000000..ffd91467 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_eucscale.R @@ -0,0 +1,386 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +args <- commandArgs(TRUE) + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/eucscale_%s",outRoot,dt) +cat(megaDir, file="test.txt",append=TRUE) + +# ---------------------------------------------------------------- +# helper functions + +# SNF similarity method. Euclidean distance followed by exponential +# scaling where sigma is tuned based on local data structure. +sim.eucscale <- function (dat, K = 20, alpha = 0.5) { +ztrans <- function(m) { + m <- as.matrix(m) + m2 <- apply(m,1,function(x) { (x-mean(x,na.rm=T))/sd(x,na.rm=T)}) + m2 +} +normalize <- function(X) { + print(dim(X)) + row.sum.mdiag <- rowSums(X,na.rm=T) - diag(X) + row.sum.mdiag[row.sum.mdiag == 0] <- 1 + X <- X/(2 * (row.sum.mdiag)) + diag(X) <- 0.5 + X <- (X+t(X))/2 + return(X) +} + nnodata <- which(abs(colSums(dat,na.rm=T)) < .Machine$double.eps) + #if (length(nodata)>0) dat[nodata] <- median(dat) # impute median + z1 <- ztrans(dat) + euc <- as.matrix(dist(z1,method="euclidean"))^(1/2) + N <- nrow(euc) + euc <- (euc + t(euc))/2 + sortedColumns <- as.matrix(t(apply(euc, 2, sort,na.last=TRUE))) + print(dim(sortedColumns)) + finiteMean <- function(x) { + return(mean(x[is.finite(x)],na.rm=T)) + } + means <- apply(sortedColumns[, 1:K + 1], 1, finiteMean) + + .Machine$double.eps + avg <- function(x, y) { + return((x + y)/2) + } + Sig <- outer(means, means, avg)/3 * 2 + euc/3 + .Machine$double.eps + Sig[Sig <= .Machine$double.eps] <- .Machine$double.eps + densities <- dnorm(euc, 0, alpha * Sig, log = FALSE) + + W <- (densities + t(densities))/2 + W <- normalize(W) + idx <- which(rowSums(is.na(euc))==ncol(W)-1) + if (any(idx)) { + W <- W[-idx,] + idx <- which(colSums(is.na(euc))==ncol(W)-1) + W <- W[,-idx] + } + return(W) +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ +# first loop - over train/test splits +mega_combList <- combList # changes each round +for (rngNum in 1:20) { + combList <- mega_combList # clean slate + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir), + sep="\t",col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in names(dats_train)) { + print(nm) + # run lasso with cv + fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=1) + # pick lambda that minimizes MSE + wt <- abs(coef(fit,s="lambda.min")[,1]) + vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { + netSets_iter[[k]] <- k + } + combList[[nm]] <- paste(sprintf("%s_cont", rownames(tmp))) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + combList[[nm]] <- sprintf("%s_cont",nm) + } + } + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + +# ------------------------------- +# make train db + netDir <- sprintf("%s/networks",outDir) + nonclin <- names(netSets_iter) + +cat(sprintf("Making test nets for rng%i\n", rngNum)) +netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter,netDir, + simMetric="custom",customFunc=sim.eucscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + verbose=FALSE,numCores=numCores) + + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("Input datatype\n%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + +# ------------------------------------- +# make test db + + +netDir <- sprintf("%s/test_networks",outDir) +netList <- c() + +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter,netDir, + simMetric="custom",customFunc=sim.eucscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + verbose=TRUE,numCores=numCores) +cat(sprintf("Total of %i nets\n", length(netList))) +# now create database +testdbDir <- GM_createDB(netDir, pheno_all$ID, outDir,numCores=numCores) +# ------------------------------------- + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_rbf.R b/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_rbf.R new file mode 100644 index 00000000..ba27bdda --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_rbf.R @@ -0,0 +1,437 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +args <- commandArgs(TRUE) +kernType <- "rbf" #args[1] +hyperParam <- 1#as.numeric(args[2]) + +cat(sprintf("arg1=%s; arg2=%1.2f\n",args[1],hyperParam),file="test.txt") +cat("boo", file="test.txt",append=TRUE) + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/lassoUni_%s_%s_%s",outRoot,kernType, + paste(as.character(hyperParam),collapse="_"),dt) +cat(megaDir, file="test.txt",append=TRUE) + +# ---------------------------------------------------------------- +# helper functions +#' radial basis function +#' @param m (matrix) data, columns are patients +#' @param nm (char) kernel to use, prefix to kernlab::*dot() functions. +#' e.g. rbf,tanh,laplace +sim.kern <- function(m,nm="rbf",sigmaVar=0.05) { + + # z-transform + m <- (m-rowMeans(m,na.rm=TRUE))/apply(m,1,sd,na.rm=T) + + if (nm=="rbf") { + func <- kernlab::rbfdot(sigmaVar) + cat(sprintf("Sigma = %1.2f\n", sigmaVar)) + } else if (nm == "tanh") { + cat("using tanh\n") + func <- kernlab::tanhdot() + } + m <- as.matrix(na.omit(m)) + idx <- combinat::combn(1:ncol(m),2) + out <- matrix(NA,nrow=ncol(m),ncol=ncol(m)) + for (comb in 1:ncol(idx)) { + i <- idx[1,comb]; j <- idx[2,comb] + x <- func(m[,i],m[,j]) + out[i,j] <- x; out[j,i] <- x + } + # self-similarity for samity + for (k in 1:ncol(m)) out[k,k] <- func(m[,k],m[,k]) + colnames(out)<- colnames(m); + rownames(out) <- colnames(m) + out[which(out < .Machine$double.eps)] <- .Machine$double.eps + return(out) +} +# x is vector of values, one per patient (e.g. ages) +normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out +} + +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ +cat(sprintf("RBF sigmaVar=%1.2f\n", hyperParam)) +# first loop - over train/test splits +mega_combList <- combList # changes each round +for (rngNum in 1:20) { + combList <- mega_combList # clean slate + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in names(dats_train)) { + print(nm) + # run lasso with cv + fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=1) + # pick lambda that minimizes MSE + wt <- abs(coef(fit,s="lambda.min")[,1]) + vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { + netSets_iter[[k]] <- k + } + combList[[nm]] <- paste(sprintf("%s_cont", rownames(tmp))) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + combList[[nm]] <- sprintf("%s_cont",nm) + } + } + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + +# ------------------------------- +# make train db + netDir <- sprintf("%s/networks",outDir) + nonclin <- names(netSets_iter) + + netLen <- unlist(lapply(netSets_iter,length)) + multiNet <- intersect(nonclin, names(netSets_iter[netLen>1])) + multiNet <- setdiff(multiNet,"clinical") + singNet <- intersect(nonclin, names(netSets_iter[netLen==1])) + +netList3 <- c() +netList2 <- c() +netList <- c() + +netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter,netDir, + simMetric="custom",customFunc=sim.kern,sigmaVar=hyperParam, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + verbose=FALSE,numCores=numCores) + + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + +# ------------------------------------- +# make test db + +netDir <- sprintf("%s/test_networks",outDir) +nonclin <- names(netSets_iter) #setdiff(names(netSets_iter),"clinical") + +netLen <- unlist(lapply(netSets_iter,length)) +multiNet <- intersect(nonclin, names(netSets_iter[netLen>1])) +multiNet <- setdiff(multiNet,"clinical") +singNet <- intersect(nonclin, names(netSets_iter[netLen==1])) + +netList3 <- c() +netList2 <- c() +netList <- c() + +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter,netDir, + simMetric="custom",customFunc=sim.kern,sigmaVar=hyperParam, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + verbose=TRUE,numCores=numCores) +cat(sprintf("Total of %i nets\n", length(netList))) +# now create database +testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) +# ------------------------------------- + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_rbf_sp1.R b/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_rbf_sp1.R new file mode 100644 index 00000000..d875d4c6 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_rbf_sp1.R @@ -0,0 +1,437 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +args <- commandArgs(TRUE) +kernType <- "rbf" #args[1] +hyperParam <- 0.1#as.numeric(args[2]) + +cat(sprintf("arg1=%s; arg2=%1.2f\n",args[1],hyperParam),file="test.txt") +cat("boo", file="test.txt",append=TRUE) + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/lassoUnisp1_%s_%s_%s",outRoot,kernType, + paste(as.character(hyperParam),collapse="_"),dt) +cat(megaDir, file="test.txt",append=TRUE) + +# ---------------------------------------------------------------- +# helper functions +#' radial basis function +#' @param m (matrix) data, columns are patients +#' @param nm (char) kernel to use, prefix to kernlab::*dot() functions. +#' e.g. rbf,tanh,laplace +sim.kern <- function(m,nm="rbf",sigmaVar=0.05) { + + # z-transform + m <- (m-rowMeans(m,na.rm=TRUE))/apply(m,1,sd,na.rm=T) + + if (nm=="rbf") { + func <- kernlab::rbfdot(sigmaVar) + cat(sprintf("Sigma = %1.2f\n", sigmaVar)) + } else if (nm == "tanh") { + cat("using tanh\n") + func <- kernlab::tanhdot() + } + m <- as.matrix(na.omit(m)) + idx <- combinat::combn(1:ncol(m),2) + out <- matrix(NA,nrow=ncol(m),ncol=ncol(m)) + for (comb in 1:ncol(idx)) { + i <- idx[1,comb]; j <- idx[2,comb] + x <- func(m[,i],m[,j]) + out[i,j] <- x; out[j,i] <- x + } + # self-similarity for samity + for (k in 1:ncol(m)) out[k,k] <- func(m[,k],m[,k]) + colnames(out)<- colnames(m); + rownames(out) <- colnames(m) + out[which(out < .Machine$double.eps)] <- .Machine$double.eps + return(out) +} +# x is vector of values, one per patient (e.g. ages) +normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out +} + +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ +cat(sprintf("RBF sigmaVar=%1.2f\n", hyperParam)) +# first loop - over train/test splits +mega_combList <- combList # changes each round +for (rngNum in 1:20) { + combList <- mega_combList # clean slate + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in names(dats_train)) { + print(nm) + # run lasso with cv + fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=1) + # pick lambda that minimizes MSE + wt <- abs(coef(fit,s="lambda.min")[,1]) + vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { + netSets_iter[[k]] <- k + } + combList[[nm]] <- paste(sprintf("%s_cont", rownames(tmp))) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + combList[[nm]] <- sprintf("%s_cont",nm) + } + } + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + +# ------------------------------- +# make train db + netDir <- sprintf("%s/networks",outDir) + nonclin <- names(netSets_iter) + + netLen <- unlist(lapply(netSets_iter,length)) + multiNet <- intersect(nonclin, names(netSets_iter[netLen>1])) + multiNet <- setdiff(multiNet,"clinical") + singNet <- intersect(nonclin, names(netSets_iter[netLen==1])) + +netList3 <- c() +netList2 <- c() +netList <- c() + +netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter,netDir, + simMetric="custom",customFunc=sim.kern,sigmaVar=hyperParam, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=FALSE,cutoff=.Machine$double.eps, + verbose=FALSE,numCores=numCores) + + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + +# ------------------------------------- +# make test db + +netDir <- sprintf("%s/test_networks",outDir) +nonclin <- names(netSets_iter) #setdiff(names(netSets_iter),"clinical") + +netLen <- unlist(lapply(netSets_iter,length)) +multiNet <- intersect(nonclin, names(netSets_iter[netLen>1])) +multiNet <- setdiff(multiNet,"clinical") +singNet <- intersect(nonclin, names(netSets_iter[netLen==1])) + +netList3 <- c() +netList2 <- c() +netList <- c() + +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter,netDir, + simMetric="custom",customFunc=sim.kern,sigmaVar=hyperParam, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=FALSE,cutoff=.Machine$double.eps, + verbose=TRUE,numCores=numCores) +cat(sprintf("Total of %i nets\n", length(netList))) +# now create database +testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) +# ------------------------------------- + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) From 43d7df7cd3408a625c65494782aba6a443132a67 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 3 May 2018 06:28:49 -0400 Subject: [PATCH 061/124] runs multiple kernels in virtual screens --- misc/PanCancer/pruneVersion/diff_kernels/runJobs.sh | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100755 misc/PanCancer/pruneVersion/diff_kernels/runJobs.sh diff --git a/misc/PanCancer/pruneVersion/diff_kernels/runJobs.sh b/misc/PanCancer/pruneVersion/diff_kernels/runJobs.sh new file mode 100755 index 00000000..de92b5a9 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/runJobs.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +sig_val=(0.05 0.1 0.3 1); +for cursig in ${sig_val[@]}; do + echo $cursig + screen -d -m bash -c "Rscript GBM_univar_rbf.R rbf $cursig" + sleep 5 +done + From 47c3472da64b1935559b41db2427ec39fb582d58 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 3 May 2018 06:30:20 -0400 Subject: [PATCH 062/124] specify EDGE_MAX for sparsify2; also get back sparsified mat for debugging --- netDx/R/makePSN_NamedMatrix.R | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/netDx/R/makePSN_NamedMatrix.R b/netDx/R/makePSN_NamedMatrix.R index 11e0967e..83941026 100644 --- a/netDx/R/makePSN_NamedMatrix.R +++ b/netDx/R/makePSN_NamedMatrix.R @@ -44,6 +44,7 @@ #' with default parameters. Only used when writeProfiles=FALSE #' @param useSparsify2 (logical). Currently for testing only. A cleaner #' sparsification routine. +#' @param sparsify_edgeMax (numeric). #' @param append (logical) if TRUE does not overwrite netDir. #' @param ... passed to \code{getSimilarity()} #' @return (char) Basename of files to which networks are written. @@ -56,9 +57,10 @@ #' @export makePSN_NamedMatrix <- function(xpr, nm, namedSets, outDir, - simMetric="pearson", cutoff=0.3,verbose=TRUE, + simMetric="pearson",verbose=TRUE, numCores=1L,writeProfiles=TRUE, - sparsify=FALSE,useSparsify2=FALSE,append=FALSE,...){ + sparsify=FALSE,useSparsify2=FALSE,cutoff=0.3,sparsify_edgeMax=1000, + append=FALSE,...){ if (!append) { if (file.exists(outDir)) unlink(outDir,recursive=TRUE) dir.create(outDir) @@ -97,6 +99,7 @@ makePSN_NamedMatrix <- function(xpr, nm, namedSets, outDir, col=F,row=T,quote=F) } else { outFile <- sprintf("%s/%s_cont.txt", outDir, curSet) + cat(sprintf("computing sim for %s\n",curSet)) sim <- getSimilarity(xpr[idx,,drop=FALSE], type=simMetric,...) if (is.null(sim)) { @@ -127,7 +130,9 @@ makePSN_NamedMatrix <- function(xpr, nm, namedSets, outDir, if (sparsify) { if (useSparsify2) { tryCatch({ - sparsify2(pat_pairs,outFile) + spmat <- sparsify2(pat_pairs,cutoff=cutoff, + EDGE_MAX=sparsify_edgeMax, + outFile) },error=function(ex) { cat("sparse caught error\n"); browser() }) From f9fb5ac0dd013cbb7cec2e40838e16c4e3b39625 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 3 May 2018 06:39:22 -0400 Subject: [PATCH 063/124] added includeAllNodes flag to force test patients to be in net --- netDx/R/sparsify2.R | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/netDx/R/sparsify2.R b/netDx/R/sparsify2.R index 72da46ca..40f887ed 100644 --- a/netDx/R/sparsify2.R +++ b/netDx/R/sparsify2.R @@ -8,40 +8,63 @@ #' @param cutoff (numeric) edges with weight smaller than this are set to NA #' @param maxInt (numeric) max num edges per node. #' @param EDGE_MAX (numeric) max num edges in network +#' @param includeAllNodes (logical) if TRUE, ensures at least one edge is present for each patient. This feature is required when sparsification excludes test patients that are required to be classified. If the sparsification rules exclude all edges for a patient and this flag is set, then the strongest edge for each missing patient is added to the net. Note that this condition results in the total number of edges potentially exceeding EDGE_MAX #' @return writes SIF content to text file (node1,node2,edge weight) #' @import reshape2 #' @export -sparsify2 <- function(W, outFile="tmp.txt",cutoff=0.3,maxInt=50,EDGE_MAX=1000) { +sparsify2 <- function(W, outFile="tmp.txt",cutoff=0.3,maxInt=50,EDGE_MAX=1000, + includeAllNodes=TRUE) { + if (maxInt > ncol(W)) maxInt <- ncol(W) diag(W) <- 0; W[W < cutoff] <- NA x <- list() - for (i in 1:nrow(W)) { x[[i]] <- sort(W[i,],decreasing=TRUE,na.last=NA)} + for (i in 1:nrow(W)) { x[[i]] <- sort(W[i,],decreasing=TRUE,na.last=TRUE)} names(x) <- rownames(W) for (k in 1:length(x)) { cur <- x[[k]] tokeep <- names(cur)[1:min(length(cur),maxInt)] W[k,which(!colnames(W)%in% tokeep)] <- NA } - tmp <- na.omit(melt(W)) - tmp <- tmp[order(tmp[,3],decreasing=TRUE),] + mmat <- na.omit(melt(W)) + mmat <- mmat[order(mmat[,3],decreasing=TRUE),] - maxEdge <- nrow(tmp) + maxEdge <- nrow(mmat) if (maxEdge>EDGE_MAX) maxEdge <- EDGE_MAX - tmp <- tmp[1:maxEdge,] - write.table(tmp,file=outFile,sep="\t",col=F,row=F,quote=F) + mmat <- mmat[1:maxEdge,] + + # we should guarantee an edge from all patients- in this case + # the edge_max would be violated unless we come up with a better rule + if (includeAllNodes) { + mmat[,1] <- as.character(mmat[,1]) + mmat[,2] <- as.character(mmat[,2]) + univ <- c(mmat[,1],mmat[,2]) + missing <- setdiff(rownames(W), univ) + cat(sprintf("missing = { %s }\n",paste(missing, collapse=","))) + if (length(missing)>0) { + cat(sprintf("Sparsify2: found %i missing patients; adding strongest edge\n", + length(missing))) + for (k in missing) { # add the strongest edge for the patient + tmp <- x[[k]] + mmat <- rbind(mmat, c(k, names(tmp)[1],tmp[1])) + } + } + } + head(mmat) + write.table(mmat,file=outFile,sep="\t",col=F,row=F,quote=F) + return(mmat) ### the code below converts the SIF format back to a matrix,potentially ### for debugging. -### W2 <- dcast(tmp,Var2~Var1,value.var="value") +### W2 <- dcast(mmat,Var2~Var1,value.var="value") ### rownames(W2) <- W2[,1]; W2 <- W2[,-1] ### W2 <- W2[,colnames(W)] ### W2 <- W2[colnames(W),] ### n <- ncol(W); -### sp <- nrow(tmp)/(n*(n-1))/2 +### sp <- nrow(mmat)/(n*(n-1))/2 ### cat(sprintf("%i -> %i edges (%i%% sparsity)\n", -### sum(!is.na(W)), nrow(tmp), round(sp*100))) +### sum(!is.na(W)), nrow(mmat), round(sp*100))) ### return(W2); } From 3776ec9b42fceda577792d8028c5ba28641e59cf Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 3 May 2018 06:39:37 -0400 Subject: [PATCH 064/124] debug comments --- netDx/inst/python/process_networks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/netDx/inst/python/process_networks.py b/netDx/inst/python/process_networks.py index e6b9c606..ae90c236 100755 --- a/netDx/inst/python/process_networks.py +++ b/netDx/inst/python/process_networks.py @@ -114,7 +114,8 @@ def write_network(organism_id, file_name, network_id): output_file = open(join(output_path, output_file_name), 'w') for line in open(file_name, 'rU'): data = line.strip().split('\t') - #print data[0]; +# print data[0]; +# print file_name; #print symbols; for i in id_indexes: From 7c4f04b31aaaa3f4593069b43b3fa46bc6473c4b Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 3 May 2018 14:19:59 -0400 Subject: [PATCH 065/124] change sparsification level by param --- .../diff_kernels/GBM_univar_eucscale.R | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_eucscale.R b/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_eucscale.R index ffd91467..fc070d13 100644 --- a/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_eucscale.R +++ b/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_eucscale.R @@ -12,6 +12,7 @@ require(glmnet) numCores <- 8L GMmemory <- 4L trainProp <- 0.8 +maxEdge <- 6000 ### max edge after sparsification args <- commandArgs(TRUE) @@ -20,7 +21,7 @@ inDir <- sprintf("%s/input",rootDir) outRoot <- sprintf("%s/output",rootDir) dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/eucscale_%s",outRoot,dt) +megaDir <- sprintf("%s/eucscale_sp2max%i_%s",outRoot,maxEdge,dt) cat(megaDir, file="test.txt",append=TRUE) # ---------------------------------------------------------------- @@ -65,6 +66,7 @@ normalize <- function(X) { W <- (densities + t(densities))/2 W <- normalize(W) + # remove patients with no datapoints (full column/row of NAs) idx <- which(rowSums(is.na(euc))==ncol(W)-1) if (any(idx)) { W <- W[-idx,] @@ -210,14 +212,13 @@ for (rngNum in 1:20) { wt <- abs(coef(fit,s="lambda.min")[,1]) vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + if (length(vars)>0) { - tmp <- dats_train[[nm]] - tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] - dats_train[[nm]] <- tmp - for (k in rownames(tmp)) { - netSets_iter[[k]] <- k - } - combList[[nm]] <- paste(sprintf("%s_cont", rownames(tmp))) + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { netSets_iter[[k]] <- k } + combList[[nm]] <- paste(sprintf("%s_cont", rownames(tmp))) } else { # leave dats_train as is, make a single net netSets_iter[[nm]] <- rownames(dats_train[[nm]]) @@ -235,7 +236,6 @@ for (rngNum in 1:20) { # ------------------------------- # make train db netDir <- sprintf("%s/networks",outDir) - nonclin <- names(netSets_iter) cat(sprintf("Making test nets for rng%i\n", rngNum)) netList <- makePSN_NamedMatrix(alldat_train, @@ -243,6 +243,7 @@ netList <- makePSN_NamedMatrix(alldat_train, simMetric="custom",customFunc=sim.eucscale, writeProfiles=FALSE, sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, verbose=FALSE,numCores=numCores) cat(sprintf("Total of %i nets\n", length(netList))) @@ -310,6 +311,7 @@ netList <- makePSN_NamedMatrix(alldat, simMetric="custom",customFunc=sim.eucscale, writeProfiles=FALSE, sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, verbose=TRUE,numCores=numCores) cat(sprintf("Total of %i nets\n", length(netList))) # now create database From d01b0c1feaf3307096e5be390529449b78ecbe4c Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 3 May 2018 14:20:19 -0400 Subject: [PATCH 066/124] lasso+eucscale, but group-level nets --- .../GBM_univar_eucscale_grouped.R | 387 ++++++++++++++++++ 1 file changed, 387 insertions(+) create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_eucscale_grouped.R diff --git a/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_eucscale_grouped.R b/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_eucscale_grouped.R new file mode 100644 index 00000000..4d5e7406 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_eucscale_grouped.R @@ -0,0 +1,387 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running +# Lasso+EucScale but not gene-level nets. + +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +maxEdge <- 6000 ### max edge after sparsification + +args <- commandArgs(TRUE) + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/eucscale_sp2max%i_grouped_%s",outRoot,maxEdge,dt) +cat(megaDir, file="test.txt",append=TRUE) + +# ---------------------------------------------------------------- +# helper functions + +# SNF similarity method. Euclidean distance followed by exponential +# scaling where sigma is tuned based on local data structure. +sim.eucscale <- function (dat, K = 20, alpha = 0.5) { +ztrans <- function(m) { + m <- as.matrix(m) + m2 <- apply(m,1,function(x) { (x-mean(x,na.rm=T))/sd(x,na.rm=T)}) + m2 +} +normalize <- function(X) { + print(dim(X)) + row.sum.mdiag <- rowSums(X,na.rm=T) - diag(X) + row.sum.mdiag[row.sum.mdiag == 0] <- 1 + X <- X/(2 * (row.sum.mdiag)) + diag(X) <- 0.5 + X <- (X+t(X))/2 + return(X) +} + nnodata <- which(abs(colSums(dat,na.rm=T)) < .Machine$double.eps) + #if (length(nodata)>0) dat[nodata] <- median(dat) # impute median + z1 <- ztrans(dat) + euc <- as.matrix(dist(z1,method="euclidean"))^(1/2) + N <- nrow(euc) + euc <- (euc + t(euc))/2 + sortedColumns <- as.matrix(t(apply(euc, 2, sort,na.last=TRUE))) + print(dim(sortedColumns)) + finiteMean <- function(x) { + return(mean(x[is.finite(x)],na.rm=T)) + } + means <- apply(sortedColumns[, 1:K + 1], 1, finiteMean) + + .Machine$double.eps + avg <- function(x, y) { + return((x + y)/2) + } + Sig <- outer(means, means, avg)/3 * 2 + euc/3 + .Machine$double.eps + Sig[Sig <= .Machine$double.eps] <- .Machine$double.eps + densities <- dnorm(euc, 0, alpha * Sig, log = FALSE) + + W <- (densities + t(densities))/2 + W <- normalize(W) + # remove patients with no datapoints (full column/row of NAs) + idx <- which(rowSums(is.na(euc))==ncol(W)-1) + if (any(idx)) { + W <- W[-idx,] + idx <- which(colSums(is.na(euc))==ncol(W)-1) + W <- W[,-idx] + } + return(W) +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ +# first loop - over train/test splits +mega_combList <- combList # changes each round +for (rngNum in 1:20) { + combList <- mega_combList # clean slate + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir), + sep="\t",col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in names(dats_train)) { + print(nm) + # run lasso with cv + fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=1) + # pick lambda that minimizes MSE + wt <- abs(coef(fit,s="lambda.min")[,1]) + vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + } # else do nothing, use full dataset + + # one net for datatype + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + combList[[nm]] <- sprintf("%s_cont", nm) + } + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + +# ------------------------------- +# make train db + netDir <- sprintf("%s/networks",outDir) + +cat(sprintf("Making train nets for rng%i\n", rngNum)) +netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter,netDir, + simMetric="custom",customFunc=sim.eucscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=FALSE,numCores=numCores) + + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("Input datatype\n%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + +# ------------------------------------- +# make test db + + +netDir <- sprintf("%s/test_networks",outDir) +netList <- c() + +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter,netDir, + simMetric="custom",customFunc=sim.eucscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=TRUE,numCores=numCores) +cat(sprintf("Total of %i nets\n", length(netList))) +# now create database +testdbDir <- GM_createDB(netDir, pheno_all$ID, outDir,numCores=numCores) +# ------------------------------------- + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) From 3140cd28d40b7535a56f771d100af95f957d76b7 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 3 May 2018 14:20:38 -0400 Subject: [PATCH 067/124] lasso+genelevel+eucscale for all tumours --- .../diff_kernels/KIRC_univar_eucscale.R | 372 +++++++++++++++++ .../diff_kernels/LUSC_univar_eucscale.R | 393 ++++++++++++++++++ .../diff_kernels/OV_univar_eucscale.R | 363 ++++++++++++++++ 3 files changed, 1128 insertions(+) create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/KIRC_univar_eucscale.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/LUSC_univar_eucscale.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/OV_univar_eucscale.R diff --git a/misc/PanCancer/pruneVersion/diff_kernels/KIRC_univar_eucscale.R b/misc/PanCancer/pruneVersion/diff_kernels/KIRC_univar_eucscale.R new file mode 100644 index 00000000..730ce907 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/KIRC_univar_eucscale.R @@ -0,0 +1,372 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(glmnet) # lasso for univariate filtering + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +cutoff <- 9 +maxEdge <- 6000 ### max edge after sparsification + +inDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/input" +outRoot <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/eucscale_sp2%i_%s",outRoot,maxEdge,dt) + +# ---------------------------------------------------------------- +# helper functions +# SNF similarity method. Euclidean distance followed by exponential +# scaling where sigma is tuned based on local data structure. +sim.eucscale <- function (dat, K = 20, alpha = 0.5) { +ztrans <- function(m) { + m <- as.matrix(m) + m2 <- apply(m,1,function(x) { (x-mean(x,na.rm=T))/sd(x,na.rm=T)}) + m2 +} +normalize <- function(X) { + print(dim(X)) + row.sum.mdiag <- rowSums(X,na.rm=T) - diag(X) + row.sum.mdiag[row.sum.mdiag == 0] <- 1 + X <- X/(2 * (row.sum.mdiag)) + diag(X) <- 0.5 + X <- (X+t(X))/2 + return(X) +} + nnodata <- which(abs(colSums(dat,na.rm=T)) < .Machine$double.eps) + #if (length(nodata)>0) dat[nodata] <- median(dat) # impute median + z1 <- ztrans(dat) + euc <- as.matrix(dist(z1,method="euclidean"))^(1/2) + N <- nrow(euc) + euc <- (euc + t(euc))/2 + sortedColumns <- as.matrix(t(apply(euc, 2, sort,na.last=TRUE))) + print(dim(sortedColumns)) + finiteMean <- function(x) { + return(mean(x[is.finite(x)],na.rm=T)) + } + means <- apply(sortedColumns[, 1:K + 1], 1, finiteMean) + + .Machine$double.eps + avg <- function(x, y) { + return((x + y)/2) + } + Sig <- outer(means, means, avg)/3 * 2 + euc/3 + .Machine$double.eps + Sig[Sig <= .Machine$double.eps] <- .Machine$double.eps + densities <- dnorm(euc, 0, alpha * Sig, log = FALSE) + + W <- (densities + t(densities))/2 + W <- normalize(W) + # remove patients with no datapoints (full column/row of NAs) + idx <- which(rowSums(is.na(euc))==ncol(W)-1) + if (any(idx)) { + W <- W[-idx,] + idx <- which(colSums(is.na(euc))==ncol(W)-1) + W <- W[,-idx] + } + return(W) +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/KIRC_clinical_core.txt",inDir), + survival=sprintf("%s/KIRC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/KIRC_mRNA_core.txt",inDir), + prot=sprintf("%s/KIRC_RPPA_core.txt",inDir), + mir=sprintf("%s/KIRC_miRNA_core.txt",inDir), + dnam=sprintf("%s/KIRC_methylation_core.txt",inDir), + cnv=sprintf("%s/KIRC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +#======transform clinical data========= +pheno$grade <- as.vector(pheno$grade) +pheno$grade[pheno$grade=="G1"] <- "G2" +pheno$grade[pheno$grade=="GX"] <- "G2" +pheno$grade <- as.factor(pheno$grade) +pheno <- pheno[, -which(colnames(pheno)=="gender")] +#====================================== + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +clinical$grade <- as.numeric(factor(clinical$grade)) +clinical$stage <- as.numeric(factor(clinical$stage)) +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) + + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +# first loop - over train/test splits +mega_combList <- combList # changes each round +for (rngNum in 1:20) { + combList <- mega_combList # clean slate + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + + ## pruneTrain code ------ + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in names(dats_train)) { + print(nm) + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + # run lasso with cv + fit.lasso <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=1) + # pick lambda that minimizes MSE + wt <- abs(coef(fit.lasso,s="lambda.min")[,1]) + vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { netSets_iter[[k]] <- k } + combList[[nm]] <- paste(sprintf("%s_cont", rownames(tmp))) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + combList[[nm]] <- sprintf("%s_cont",nm) + } + } + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) + combList[["clinicalAprot"]] <- c(combList[["clinical"]],combList[["prot"]]) + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + + netDir <- sprintf("%s/networks",outDir) + + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter,netDir, + simMetric="custom",customFunc=sim.eucscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=FALSE,numCores=numCores) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + ## pruneTrain: make test database + ## This will be used to predict test samples by subsetting just for feature + ## selected nets in a given round + ## Note that this is useful for all train/test splits because we can always + ## change which samples are query and can always subset based on which nets + ## are feature selected in a given round. + netDir <- sprintf("%s/test_networks",outDir) + + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter,netDir, + simMetric="custom",customFunc=sim.eucscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=TRUE,numCores=numCores) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(netDir, pheno_all$ID, + outDir,numCores=numCores) + + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",pDir2,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } + + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + outFile <- sprintf("%s/predictionResults.txt",pDir) + write.table(out,file=outFile,sep="\t",col=T,row=F, + quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/LUSC_univar_eucscale.R b/misc/PanCancer/pruneVersion/diff_kernels/LUSC_univar_eucscale.R new file mode 100644 index 00000000..1e4319ac --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/LUSC_univar_eucscale.R @@ -0,0 +1,393 @@ +#' PanCancer binarized survival: LUSC: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design + +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +maxEdge <- 6000 + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/eucscale_sp2%i_%s",outRoot,maxEdge,dt) + +# ---------------------------------------------------------------- + +# SNF similarity method. Euclidean distance followed by exponential +# scaling where sigma is tuned based on local data structure. +sim.eucscale <- function (dat, K = 20, alpha = 0.5) { +ztrans <- function(m) { + m <- as.matrix(m) + m2 <- apply(m,1,function(x) { (x-mean(x,na.rm=T))/sd(x,na.rm=T)}) + m2 +} +normalize <- function(X) { + print(dim(X)) + row.sum.mdiag <- rowSums(X,na.rm=T) - diag(X) + row.sum.mdiag[row.sum.mdiag == 0] <- 1 + X <- X/(2 * (row.sum.mdiag)) + diag(X) <- 0.5 + X <- (X+t(X))/2 + return(X) +} + z1 <- ztrans(dat) + euc <- as.matrix(dist(z1,method="euclidean"))^(1/2) + N <- nrow(euc) + euc <- (euc + t(euc))/2 + sortedColumns <- as.matrix(t(apply(euc, 2, sort,na.last=TRUE))) + finiteMean <- function(x) { + return(mean(x[is.finite(x)],na.rm=T)) + } + means <- apply(sortedColumns[, 1:K + 1], 1, finiteMean) + + .Machine$double.eps + avg <- function(x, y) { + return((x + y)/2) + } + Sig <- outer(means, means, avg)/3 * 2 + euc/3 + .Machine$double.eps + Sig[Sig <= .Machine$double.eps] <- .Machine$double.eps + densities <- dnorm(euc, 0, alpha * Sig, log = FALSE) + + W <- (densities + t(densities))/2 + W <- normalize(W) + # remove patients with no datapoints (full column/row of NAs) + idx <- which(rowSums(is.na(euc))==ncol(W)-1) + if (any(idx)) { + W <- W[-idx,] + idx <- which(colSums(is.na(euc))==ncol(W)-1) + W <- W[,-idx] + } + return(W) +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), + survival=sprintf("%s/LUSC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), + prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), + mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), + cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno +rownames(clinical) <- clinical[,1]; +# ======================= +# LUSC-specific variables +clinical$stage <- as.vector(clinical$stage) +clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" +clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" +clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" +clinical$stage <- as.factor(clinical$stage) +clinical <- clinical[, -which(colnames(clinical)=="gender")] +clinical <- t(clinical[,c("age","stage")]) +clinical[1,] <- as.integer(clinical[1,]) +clinical[2,] <- as.integer(as.factor(clinical[2,])) +class(clinical) <- "numeric" +# ======================= +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + if (nm == "rna") tmp <- log(tmp+1) + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +mega_combList <- combList # this will change in each round + +# first loop - over train/test splits +for (rngNum in 1:20) { + combList <- mega_combList # clean slate + + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + ## pruneTrain code ------ + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in names(dats_train)){ + print(nm) + # run lasso with cv + fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=1) + # pick lambda that minimizes MSE + wt <- abs(coef(fit,s="lambda.min")[,1]) + vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { netSets_iter[[k]] <- k } + combList[[nm]] <- paste(sprintf("%s_cont", rownames(tmp))) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + combList[[nm]] <- sprintf("%s_cont",nm) + } + } + + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + combList[["clinicalAprot"]] <- c(combList[["clinical"]],combList[["prot"]]) + + # END lasso UF + # ---------------------- + + alldat_train <- do.call("rbind",dats_train) + + netDir <- sprintf("%s/networks",outDir) + +# ------------------------------- +# make train db +cat(sprintf("Making test nets for rng%i\n", rngNum)) +netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter,netDir, + simMetric="custom",customFunc=sim.eucscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=FALSE,numCores=numCores) + + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) +# ------------------------------- + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("CombList = %s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + netDir <- sprintf("%s/test_networks",megaDir) + +# ------------------------------- +# make test db + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter,netDir, + simMetric="custom",customFunc=sim.eucscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=TRUE,numCores=numCores) + + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + megadbDir <- GM_createDB(netDir, pheno_all$ID, + megaDir,numCores=numCores) +# ------------------------------- + + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + # only include the nets that were feature selected + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(megadbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + } else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } +} + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/OV_univar_eucscale.R b/misc/PanCancer/pruneVersion/diff_kernels/OV_univar_eucscale.R new file mode 100644 index 00000000..512301ca --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/OV_univar_eucscale.R @@ -0,0 +1,363 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +cutoff <- 8 +maxEdge <- 6000 ### max edge after sparsification + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/input" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/eucscale_sp2%i_%s",outRoot,maxEdge,dt) + +# ---------------------------------------------------------------- +# helper functions + +# normalized difference +# SNF similarity method. Euclidean distance followed by exponential +# scaling where sigma is tuned based on local data structure. +sim.eucscale <- function (dat, K = 20, alpha = 0.5) { +ztrans <- function(m) { + m <- as.matrix(m) + m2 <- apply(m,1,function(x) { (x-mean(x,na.rm=T))/sd(x,na.rm=T)}) + m2 +} +normalize <- function(X) { + print(dim(X)) + row.sum.mdiag <- rowSums(X,na.rm=T) - diag(X) + row.sum.mdiag[row.sum.mdiag == 0] <- 1 + X <- X/(2 * (row.sum.mdiag)) + diag(X) <- 0.5 + X <- (X+t(X))/2 + return(X) +} + z1 <- ztrans(dat) + euc <- as.matrix(dist(z1,method="euclidean"))^(1/2) + N <- nrow(euc) + euc <- (euc + t(euc))/2 + sortedColumns <- as.matrix(t(apply(euc, 2, sort,na.last=TRUE))) + print(dim(sortedColumns)) + finiteMean <- function(x) { + return(mean(x[is.finite(x)],na.rm=T)) + } + means <- apply(sortedColumns[, 1:K + 1], 1, finiteMean) + + .Machine$double.eps + avg <- function(x, y) { + return((x + y)/2) + } + Sig <- outer(means, means, avg)/3 * 2 + euc/3 + .Machine$double.eps + Sig[Sig <= .Machine$double.eps] <- .Machine$double.eps + densities <- dnorm(euc, 0, alpha * Sig, log = FALSE) + + W <- (densities + t(densities))/2 + W <- normalize(W) + # remove patients with no datapoints (full column/row of NAs) + idx <- which(rowSums(is.na(euc))==ncol(W)-1) + if (any(idx)) { + W <- W[-idx,] + idx <- which(colSums(is.na(euc))==ncol(W)-1) + W <- W[,-idx] + } + return(W) +} +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/OV_clinical_core.txt",inDir), + survival=sprintf("%s/OV_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/OV_mRNA_core.txt",inDir), + prot=sprintf("%s/OV_RPPA_core.txt",inDir), + mir=sprintf("%s/OV_miRNA_core.txt",inDir), + dnam=sprintf("%s/OV_methylation_core.txt",inDir), + cnv=sprintf("%s/OV_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clin <- pheno +rownames(clin) <- clin[,1]; +clin <- t(clin[,2,drop=FALSE]) +dats$clinical <- clin; rm(clin) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID), drop = FALSE]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx, drop = FALSE] + x +}) + + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) + + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + + +# first loop - over train/test splits +mega_combList <- combList +for (rngNum in 1:20) { + combList <- mega_combList + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in names(dats_train)) { + print(nm) + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + # run lasso with cv + if (nrow(dats_train[[nm]])<2) # clinical only has one var, take it. + vars <- rownames(dats_train[[nm]]) + else { + fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=1) + # pick lambda that minimizes MSE + wt <- abs(coef(fit,s="lambda.min")[,1]) + vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") + } + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { netSets_iter[[k]] <- k } + combList[[nm]] <- paste(sprintf("%s_cont", rownames(tmp))) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + combList[[nm]] <- sprintf("%s_cont",nm) + } + } + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) + combList[["clinicalAprot"]] <- c(combList[["clinical"]],combList[["prot"]]) + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + netDir <- sprintf("%s/networks",outDir) + + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter,netDir, + simMetric="custom",customFunc=sim.eucscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=FALSE,numCores=numCores) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + # -------- + # pruneTrain: make test database + netDir <- sprintf("%s/test_networks",outDir) + + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter,netDir, + simMetric="custom",customFunc=sim.eucscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=TRUE,numCores=numCores) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(netDir, pheno_all$ID, + outDir,numCores=numCores) + + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + # pTally <- sub(".profile","",pTally) + # pTally <- sub("_cont","",pTally) + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",pDir2,g) + GM_writeQueryFile(qSamps,incNets=pTally + ,nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } + + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + outFile <- sprintf("%s/predictionResults.txt",pDir) + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) From 12f851ac2f58150f33fef4302f7f76fad6cfdf13 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 3 May 2018 14:20:51 -0400 Subject: [PATCH 068/124] updated datasets --- misc/PanCancer/multiCutoff/GBM_getRes.R | 32 +++++++++++++++++++----- misc/PanCancer/multiCutoff/KIRC_getRes.R | 9 ++++--- misc/PanCancer/multiCutoff/LUSC_getRes.R | 5 +++- 3 files changed, 36 insertions(+), 10 deletions(-) diff --git a/misc/PanCancer/multiCutoff/GBM_getRes.R b/misc/PanCancer/multiCutoff/GBM_getRes.R index 60e2bb9c..8ae34df7 100644 --- a/misc/PanCancer/multiCutoff/GBM_getRes.R +++ b/misc/PanCancer/multiCutoff/GBM_getRes.R @@ -7,16 +7,21 @@ require(reshape2) mainD <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output" dirSet <- list( base="noPrune_180423", - rbf3="rbf0.3_noSex_180424", - rbf5="rbf0.5_noSex_180424", - rbfother="rbf_noSex_180424", - tanh="tanh_noSex_180424", ridge_fix="ridge_AbsFix_180426", lassoGenes_sp1="lassoGenes_incClin_180426", - pamrGenes="pamrGenes_incClin_180427" + pamrGenes="pamrGenes_incClin_180427", + #rbf0.05="lassoUni_rbf_0.05", + rbf0.1="lassoUni_rbf_0.1_180502", + rbf0.25="lassoUni_rbf_0.25_180502", + euc_1K="eucscale_sp2max1000_180503", + euc_6K="eucscale_sp2max6000_180503", + euc_6K_group="eucscale_sp2max6000_grouped_180503" + #rbf5="lassoUni_rbf_5", + #rbf10="lassoUni_rbf_10" ) mega_auc <- list() +numSplits <- list() for (curdir in names(dirSet)) { cat(sprintf("***** %s *****\n", curdir)) dataDir <- sprintf("%s/%s",mainD,dirSet[[curdir]]) @@ -31,9 +36,20 @@ cutoff <-9 if (any(c(grep("lasso",curdir),grep("ridge",curdir)))) { rngDir <- paste("rng",1:18,sep="") + } else if (any(c(grep("rbf0.1",curdir)))){ + rngDir <- paste("rng",1:8,sep="") + } else if (any(c(grep("rbf0.25",curdir)))){ + rngDir <- paste("rng",1:8,sep="") + } else if (any(c(grep("euc_1K",curdir)))){ + rngDir <- paste("rng",1:12,sep="") + } else if (curdir =="euc_6K"){ + rngDir <- paste("rng",1:20,sep="") + } else if (curdir =="euc_6K_group"){ + rngDir <- paste("rng",1:14,sep="") } else { rngDir <- dir(path=dataDir,pattern="rng") } + numSplits[[curdir]] <- length(rngDir) cat(sprintf("Got %i rng files\n",length(rngDir))) rngDir <- sprintf("%s/%s",dataDir,rngDir) @@ -48,6 +64,7 @@ cutoff <-9 } cat(sprintf("%i: removing %i\n", cutoff,length(torm))) if (length(torm)>0) c7 <- c7[-torm] + postscript("tmp.eps") x <- plotPerf(c7,c("SURVIVEYES","SURVIVENO")) dev.off() @@ -63,6 +80,9 @@ mega_auc[[curdir]] <- unlist(lapply(auc_set,mean)) dt <- format(Sys.Date(),"%y%m%d") require(gplots) pdf(sprintf("GBM_%s.pdf",dt),width=18,height=6); -boxplot2(mega_auc,las=1,cex.axis=1.7,cex.main=2,main="GBM"); +boxplot( mega_auc,las=1,cex.axis=1.7,cex.main=2,main="GBM", + at=1:length(mega_auc)); +tmp <- unlist(numSplits) +text(1:length(mega_auc),0.5,sprintf("N=%i",tmp)) abline(h=0.5) dev.off() diff --git a/misc/PanCancer/multiCutoff/KIRC_getRes.R b/misc/PanCancer/multiCutoff/KIRC_getRes.R index 03072d5f..b52427b9 100644 --- a/misc/PanCancer/multiCutoff/KIRC_getRes.R +++ b/misc/PanCancer/multiCutoff/KIRC_getRes.R @@ -9,7 +9,8 @@ mainD <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" dirSet <- list( base="noPrune_180423", lasso="lasso_180426", - pamr="pamr_180426" + pamr="pamr_180426", + euc6K="eucscale_sp26000_180503" # ridge="ridgeAbsFix_180426" ) @@ -23,7 +24,9 @@ dataDir <- sprintf("%s/%s",mainD,dirSet[[curdir]]) rngMax <- 20 if (any(grep("base",curdir))) { rngMax <- 15 - } + } else if (any(grep("euc",curdir))) { + rngMax <- 5 + } auc_set <- list() for (settype in settypes) { @@ -51,7 +54,7 @@ colctr <- 1 } mega_auc[[curdir]] <- unlist(lapply(auc_set,mean)) } -pdf(sprintf("KIRC_%s",format(Sys.Date(),"%y%m%d"))); +pdf(sprintf("KIRC_%s.pdf",format(Sys.Date(),"%y%m%d"))); boxplot(mega_auc,main="KIRC",cex.axis=1.7,cex.main=2,las=1); dev.off() diff --git a/misc/PanCancer/multiCutoff/LUSC_getRes.R b/misc/PanCancer/multiCutoff/LUSC_getRes.R index 8927492a..4f828328 100644 --- a/misc/PanCancer/multiCutoff/LUSC_getRes.R +++ b/misc/PanCancer/multiCutoff/LUSC_getRes.R @@ -9,7 +9,8 @@ dirSet <- list( lasso="lasso_180426", lassoGenes="lassoGenes_180426", pamrGenes_sp2="pamrGenes_180427", - pamrGenes_sp1="pamrGenes_sp1_180427" + pamrGenes_sp1="pamrGenes_sp1_180427", + euc6K="eucscale_sp26000_180503" ) settypes <- c("clinical","mir","rna","prot","cnv", "clinicalArna","clinicalAmir","clinicalAprot","clinicalAcnv","all") @@ -24,6 +25,8 @@ auc_set <- list() for (settype in settypes) { if (curdir %in% "lassoGenes") { rngDir <- paste(sprintf("%s/rng",dataDir), 3:rngMax,sep="") + } else if (any(grep("euc",curdir))) { + rngDir <- paste(sprintf("%s/rng",dataDir), 1:9,sep="") } else { rngDir <- paste(sprintf("%s/rng",dataDir), 1:rngMax,sep="") } From 6bec461ea822930500ae75f3d556d6a1fc87e0c2 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 3 May 2018 14:21:29 -0400 Subject: [PATCH 069/124] show top indiv features --- misc/PanCancer/multiCutoff/getGenes.R | 85 +++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 misc/PanCancer/multiCutoff/getGenes.R diff --git a/misc/PanCancer/multiCutoff/getGenes.R b/misc/PanCancer/multiCutoff/getGenes.R new file mode 100644 index 00000000..4a50a58c --- /dev/null +++ b/misc/PanCancer/multiCutoff/getGenes.R @@ -0,0 +1,85 @@ +#' see gene scores. +rm(list=ls()) + +dt <- format(Sys.Date(),"%y%m%d") +#outDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/eucscale_sp2max6000_180503" +#dNames <- sprintf("%s/rng%i", outDir,1:20) +#pdf(sprintf("GBM_topFeatures_%s_%s.pdf",setName,dt),width=6,height=13) +# --- +# lusc +###outDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/eucscale_sp26000_180503" +###dNames <- sprintf("%s/rng%i", outDir,1:11) +###setName <- "all" +###pdfFile <-sprintf("LUSC_topFeatures_%s_%s.pdf",setName,dt) +# --- +# kirc +outDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output/eucscale_sp26000_180503" +dNames <- sprintf("%s/rng%i", outDir,1:5) +setName <- "all" +pdfFile <-sprintf("KIRC_topFeatures_%s_%s.pdf",setName,dt) + +fSet <- list( + SURVIVEYES=sprintf("%s/%s/SURVIVEYES/GM_results/SURVIVEYES_pathway_CV_score.txt", + dNames,setName), + SURVIVENO=sprintf("%s/%s/SURVIVENO/GM_results/SURVIVENO_pathway_CV_score.txt", + dNames,setName) +) + +require(netDx) + +out <- list() +for (gp in names(fSet)) { + fList <- fSet[[gp]] + netColl <- list() + for (scoreFile in fList) { + tmp <- read.delim(scoreFile,sep="\t",h=T,as.is=T) + colnames(tmp)[1] <- "PATHWAY_NAME" + tmp[,1] <- sub("_cont","",tmp[,1]) + netColl[[scoreFile]] <- tmp + + } + rpos <- regexpr("rng",fList) + sname <- substr(fList,rpos,nchar(fList)) + spos <- regexpr("\\/",sname) + s2name <- substr(sname,1,spos-1) + names(netColl) <- s2name + # filter for nets meeting cutoff criteria + cat("* Computing consensus\n") + cons <- getNetConsensus(netColl); x1 <- nrow(cons) + na_sum <- rowSums(is.na(cons)) + cons <- cons[order(na_sum),] + out[[gp]] <- cons +} + +thresh <- 9 +pctPassT <- lapply(out, function(x) { + nm <- x[,1]; x <- x[,-1] + y <- rowSums(x>=thresh,na.rm=T) + y <- (y/ncol(x))*100 + names(y) <- nm + y <- y[order(-y)] + y <- y[1:20] + y +}) + + + +pdf(sprintf("GBM_topFeatures_%s_%s.pdf",setName,dt),width=6,height=13) +pdf(pdfFile,width=8,height=13) +tryCatch({ + mar <- par("mar") + par(mar=c(5,1,3,1)) + for (k in names(pctPassT)) { + cur <- rev(pctPassT[[k]]) + y <- barplot(cur, horiz=TRUE,col="lightblue",border="white", + main=sprintf("%s:%% splits with score>%i\n(N=%i splits)", + k,thresh,length(dNames)),xlim=c(0,150)) + text(cur,y,names(cur),font=3,cex=1,pos=4) +} +},error=function(ex){print(ex) +},finally={ + dev.off() +}) + + + From 7906f9e5358c5ccbb2fab56211a257bc48991e29 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 3 May 2018 14:22:19 -0400 Subject: [PATCH 070/124] prints out params for confirmation --- netDx/R/sparsify2.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/netDx/R/sparsify2.R b/netDx/R/sparsify2.R index 40f887ed..f4fb6cc5 100644 --- a/netDx/R/sparsify2.R +++ b/netDx/R/sparsify2.R @@ -13,7 +13,9 @@ #' @import reshape2 #' @export sparsify2 <- function(W, outFile="tmp.txt",cutoff=0.3,maxInt=50,EDGE_MAX=1000, - includeAllNodes=TRUE) { + includeAllNodes=TRUE,verbose=TRUE) { + + if (verbose) cat(sprintf("sparsify2:maxInt=%i;EDGE_MAX=%i;cutoff=%1.2e;includeAllNodes=%s",maxInt,EDGE_MAX,cutoff,includeAllNodes)) if (maxInt > ncol(W)) maxInt <- ncol(W) From 153177bc5e1cc8caf625f2de3e3d72198cf151b6 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Mon, 7 May 2018 11:15:37 -0400 Subject: [PATCH 071/124] clean pipeline for eucdist+expscale --- .../diff_kernels/GBM_eucscale_pipeline.R | 118 ++++++++ .../diff_kernels/GBM_univar_rbf.R | 1 - .../diff_kernels/KIRC_eucscale_pipeline.R | 121 ++++++++ .../diff_kernels/LUSC_eucscale_pipeline.R | 119 ++++++++ .../diff_kernels/OV_eucscale_pipeline.R | 113 ++++++++ .../diff_kernels/PanCancer_eucscale.R | 274 ++++++++++++++++++ .../{ => outdated}/GBM_univar_eucscale.R | 0 7 files changed, 745 insertions(+), 1 deletion(-) create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/GBM_eucscale_pipeline.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/KIRC_eucscale_pipeline.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/LUSC_eucscale_pipeline.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/OV_eucscale_pipeline.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/PanCancer_eucscale.R rename misc/PanCancer/pruneVersion/diff_kernels/{ => outdated}/GBM_univar_eucscale.R (100%) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/GBM_eucscale_pipeline.R b/misc/PanCancer/pruneVersion/diff_kernels/GBM_eucscale_pipeline.R new file mode 100644 index 00000000..07b96bfb --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/GBM_eucscale_pipeline.R @@ -0,0 +1,118 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/eucclean_%s",outRoot,dt) +cat(megaDir, file="test.txt",append=TRUE) + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} +rm(pname) + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv_cont"), + clinical="clinical_cont", + mir="mir_cont", + rna="rna_cont", + cnv="cnv_cont", + dnam="dnam_cont", + clinicalArna=c("clinical_cont","rna_cont"), + clinicalAmir=c("clinical_cont","mir_cont"), + clinicalAdnam=c("clinical_cont","dnam_cont"), + all="all" +) + +pheno_all <- pheno + +# cleanup +rm(pheno,pheno_nosurv) +rm(rootDir,survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_eucscale.R") +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9) + + diff --git a/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_rbf.R b/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_rbf.R index ba27bdda..739bac73 100644 --- a/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_rbf.R +++ b/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_rbf.R @@ -36,7 +36,6 @@ cat(megaDir, file="test.txt",append=TRUE) #' @param nm (char) kernel to use, prefix to kernlab::*dot() functions. #' e.g. rbf,tanh,laplace sim.kern <- function(m,nm="rbf",sigmaVar=0.05) { - # z-transform m <- (m-rowMeans(m,na.rm=TRUE))/apply(m,1,sd,na.rm=T) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/KIRC_eucscale_pipeline.R b/misc/PanCancer/pruneVersion/diff_kernels/KIRC_eucscale_pipeline.R new file mode 100644 index 00000000..4ddfa538 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/KIRC_eucscale_pipeline.R @@ -0,0 +1,121 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) + +inDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/input" +outRoot <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/eucclean_%s",outRoot,dt) + + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/KIRC_clinical_core.txt",inDir), + survival=sprintf("%s/KIRC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/KIRC_mRNA_core.txt",inDir), + prot=sprintf("%s/KIRC_RPPA_core.txt",inDir), + mir=sprintf("%s/KIRC_miRNA_core.txt",inDir), + dnam=sprintf("%s/KIRC_methylation_core.txt",inDir), + cnv=sprintf("%s/KIRC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +#======transform clinical data========= +pheno$grade <- as.vector(pheno$grade) +pheno$grade[pheno$grade=="G1"] <- "G2" +pheno$grade[pheno$grade=="GX"] <- "G2" +pheno$grade <- as.factor(pheno$grade) +pheno <- pheno[, -which(colnames(pheno)=="gender")] +#====================================== + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +clinical$grade <- as.numeric(factor(clinical$grade)) +clinical$stage <- as.numeric(factor(clinical$stage)) +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) +rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_eucscale.R") +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9) + + diff --git a/misc/PanCancer/pruneVersion/diff_kernels/LUSC_eucscale_pipeline.R b/misc/PanCancer/pruneVersion/diff_kernels/LUSC_eucscale_pipeline.R new file mode 100644 index 00000000..63e20923 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/LUSC_eucscale_pipeline.R @@ -0,0 +1,119 @@ +#' PanCancer binarized survival: LUSC: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design + +rm(list=ls()) + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/eucclean_%s",outRoot,dt) + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), + survival=sprintf("%s/LUSC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), + prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), + mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), + cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno +rownames(clinical) <- clinical[,1]; +# ======================= +# LUSC-specific variables +clinical$stage <- as.vector(clinical$stage) +clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" +clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" +clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" +clinical$stage <- as.factor(clinical$stage) +clinical <- clinical[, -which(colnames(clinical)=="gender")] +clinical <- t(clinical[,c("age","stage")]) +clinical[1,] <- as.integer(clinical[1,]) +clinical[2,] <- as.integer(as.factor(clinical[2,])) +class(clinical) <- "numeric" +# ======================= +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + if (nm == "rna") tmp <- log(tmp+1) + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno) +rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_eucscale.R") +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/OV_eucscale_pipeline.R b/misc/PanCancer/pruneVersion/diff_kernels/OV_eucscale_pipeline.R new file mode 100644 index 00000000..9a3e1c1b --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/OV_eucscale_pipeline.R @@ -0,0 +1,113 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) + + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/input" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/eucscale_%s",outRoot,dt) + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/OV_clinical_core.txt",inDir), + survival=sprintf("%s/OV_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/OV_mRNA_core.txt",inDir), + prot=sprintf("%s/OV_RPPA_core.txt",inDir), + mir=sprintf("%s/OV_miRNA_core.txt",inDir), + dnam=sprintf("%s/OV_methylation_core.txt",inDir), + cnv=sprintf("%s/OV_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clin <- pheno +rownames(clin) <- clin[,1]; +clin <- t(clin[,2,drop=FALSE]) +dats$clinical <- clin; rm(clin) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID), drop = FALSE]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx, drop = FALSE] + x +}) + + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) +rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_eucscale.R") +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9) + + + + diff --git a/misc/PanCancer/pruneVersion/diff_kernels/PanCancer_eucscale.R b/misc/PanCancer/pruneVersion/diff_kernels/PanCancer_eucscale.R new file mode 100644 index 00000000..150a2461 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/PanCancer_eucscale.R @@ -0,0 +1,274 @@ +#' PanCancer predictor: univariate filtering by lasso + gene-level nets +#' similarity by Euclidean distance + local scaling + +# ---------------------------------------------------------------- +# helper functions + +# SNF similarity method. Euclidean distance followed by exponential +# scaling where sigma is tuned based on local data structure. +sim.eucscale <- function (dat, K = 20, alpha = 0.5) { +ztrans <- function(m) { + m <- as.matrix(m) + m2 <- apply(m,1,function(x) { (x-mean(x,na.rm=T))/sd(x,na.rm=T)}) + m2 +} +normalize <- function(X) { + print(dim(X)) + row.sum.mdiag <- rowSums(X,na.rm=T) - diag(X) + row.sum.mdiag[row.sum.mdiag == 0] <- 1 + X <- X/(2 * (row.sum.mdiag)) + diag(X) <- 0.5 + X <- (X+t(X))/2 + return(X) +} + nnodata <- which(abs(colSums(dat,na.rm=T)) < .Machine$double.eps) + #if (length(nodata)>0) dat[nodata] <- median(dat) # impute median + z1 <- ztrans(dat) + euc <- as.matrix(dist(z1,method="euclidean"))^(1/2) + N <- nrow(euc) + euc <- (euc + t(euc))/2 + sortedColumns <- as.matrix(t(apply(euc, 2, sort,na.last=TRUE))) + print(dim(sortedColumns)) + finiteMean <- function(x) { + return(mean(x[is.finite(x)],na.rm=T)) + } + means <- apply(sortedColumns[, 1:K + 1], 1, finiteMean) + + .Machine$double.eps + avg <- function(x, y) { + return((x + y)/2) + } + Sig <- outer(means, means, avg)/3 * 2 + euc/3 + .Machine$double.eps + Sig[Sig <= .Machine$double.eps] <- .Machine$double.eps + densities <- dnorm(euc, 0, alpha * Sig, log = FALSE) + + W <- (densities + t(densities))/2 + W <- normalize(W) + # remove patients with no datapoints (full column/row of NAs) + idx <- which(rowSums(is.na(euc))==ncol(W)-1) + if (any(idx)) { + W <- W[-idx,] + idx <- which(colSums(is.na(euc))==ncol(W)-1) + W <- W[,-idx] + } + return(W) +} + +# ---------------------------------------------------------------- +runPredictor <- function(mega_combList,rngVals,netSets,dats,pheno_all,megaDir, + cutoffSet) { +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +maxEdge <- 6000 ### max edge after sparsification + +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +alldat <- do.call("rbind",dats) + +for (rngNum in rngVals) { + combList <- mega_combList # clean slate + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir), + sep="\t",col=T,row=F,quote=F) + + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + + # lasso + for (nm in names(dats_train)) { + print(nm) + if (nrow(dats_train[[nm]])<2) # clinical only has one var, take it. + vars <- rownames(dats_train[[nm]]) + else { + fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=1) # lasso + wt <- abs(coef(fit,s="lambda.min")[,1]) + vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") + } + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { netSets_iter[[k]] <- k } + combList[[nm]] <- sprintf("%s_cont", rownames(tmp)) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + combList[[nm]] <- sprintf("%s_cont",nm) + } + } + + if ("clinicalArna" %in% names(combList)) + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + if ("clinicalAmir" %in% names(combList)) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + if ("clinicalAcnv" %in% names(combList)) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + if ("clinicalAdnam" %in% names(combList)) + combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) + if ("clinicalAprot" %in% names(combList)) + combList[["clinicalAprot"]] <- c(combList[["clinical"]],combList[["prot"]]) + + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + netDir <- sprintf("%s/networks",outDir) + + cat(sprintf("Making test nets for rng%i\n", rngNum)) + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter,netDir, + simMetric="custom",customFunc=sim.eucscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=FALSE,numCores=numCores) + cat(sprintf("Total of %i nets\n", length(netList))) + + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("Input datatype\n%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + print(table(pheno_subtype$STATUS,useNA="always")) # sanitycheck + resDir <- sprintf("%s/GM_results",pDir2) + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + # make test db + + netDir <- sprintf("%s/test_networks",outDir) + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter,netDir, + simMetric="custom",customFunc=sim.eucscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=TRUE,numCores=numCores) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(netDir, pheno_all$ID, outDir,numCores=numCores) + + # classify patients + for (cutoff in cutoffSet) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + } + } + + } # input data combinations + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) + +} diff --git a/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_eucscale.R b/misc/PanCancer/pruneVersion/diff_kernels/outdated/GBM_univar_eucscale.R similarity index 100% rename from misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_eucscale.R rename to misc/PanCancer/pruneVersion/diff_kernels/outdated/GBM_univar_eucscale.R From 90c2df29bd16d8b2bd60d6bff3a5ad6aa537f862 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Mon, 7 May 2018 16:58:42 -0400 Subject: [PATCH 072/124] Pearson+expscale --- .../diff_kernels/pearscale/GBM_pearscale.R | 118 ++++++++ .../diff_kernels/pearscale/KIRC_pearscale.R | 120 ++++++++ .../diff_kernels/pearscale/LUSC_pearscale.R | 119 ++++++++ .../pearscale/PanCancer_pearscale.R | 272 ++++++++++++++++++ .../diff_kernels/pearscale/test.txt | 1 + 5 files changed, 630 insertions(+) create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/pearscale/GBM_pearscale.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/pearscale/KIRC_pearscale.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/pearscale/LUSC_pearscale.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_pearscale.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/pearscale/test.txt diff --git a/misc/PanCancer/pruneVersion/diff_kernels/pearscale/GBM_pearscale.R b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/GBM_pearscale.R new file mode 100644 index 00000000..765ba9b6 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/GBM_pearscale.R @@ -0,0 +1,118 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/pearscale_%s",outRoot,dt) +cat(megaDir, file="test.txt",append=TRUE) + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} +rm(pname) + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv_cont"), + clinical="clinical_cont", + mir="mir_cont", + rna="rna_cont", + cnv="cnv_cont", + dnam="dnam_cont", + clinicalArna=c("clinical_cont","rna_cont"), + clinicalAmir=c("clinical_cont","mir_cont"), + clinicalAdnam=c("clinical_cont","dnam_cont"), + all="all" +) + +pheno_all <- pheno + +# cleanup +rm(pheno,pheno_nosurv) +rm(rootDir,survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_pearscale.R") +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9) + + diff --git a/misc/PanCancer/pruneVersion/diff_kernels/pearscale/KIRC_pearscale.R b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/KIRC_pearscale.R new file mode 100644 index 00000000..165fb0c5 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/KIRC_pearscale.R @@ -0,0 +1,120 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) + +inDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/input" +outRoot <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/pearscale_%s",outRoot,dt) + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/KIRC_clinical_core.txt",inDir), + survival=sprintf("%s/KIRC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/KIRC_mRNA_core.txt",inDir), + prot=sprintf("%s/KIRC_RPPA_core.txt",inDir), + mir=sprintf("%s/KIRC_miRNA_core.txt",inDir), + dnam=sprintf("%s/KIRC_methylation_core.txt",inDir), + cnv=sprintf("%s/KIRC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +#======transform clinical data========= +pheno$grade <- as.vector(pheno$grade) +pheno$grade[pheno$grade=="G1"] <- "G2" +pheno$grade[pheno$grade=="GX"] <- "G2" +pheno$grade <- as.factor(pheno$grade) +pheno <- pheno[, -which(colnames(pheno)=="gender")] +#====================================== + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +clinical$grade <- as.numeric(factor(clinical$grade)) +clinical$stage <- as.numeric(factor(clinical$stage)) +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) +rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_pearscale.R") +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9) + + diff --git a/misc/PanCancer/pruneVersion/diff_kernels/pearscale/LUSC_pearscale.R b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/LUSC_pearscale.R new file mode 100644 index 00000000..ca44b567 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/LUSC_pearscale.R @@ -0,0 +1,119 @@ +#' PanCancer binarized survival: LUSC: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design + +rm(list=ls()) + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/pearscale_%s",outRoot,dt) + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), + survival=sprintf("%s/LUSC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), + prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), + mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), + cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno +rownames(clinical) <- clinical[,1]; +# ======================= +# LUSC-specific variables +clinical$stage <- as.vector(clinical$stage) +clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" +clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" +clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" +clinical$stage <- as.factor(clinical$stage) +clinical <- clinical[, -which(colnames(clinical)=="gender")] +clinical <- t(clinical[,c("age","stage")]) +clinical[1,] <- as.integer(clinical[1,]) +clinical[2,] <- as.integer(as.factor(clinical[2,])) +class(clinical) <- "numeric" +# ======================= +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + if (nm == "rna") tmp <- log(tmp+1) + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno) +rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_pearscale.R") +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_pearscale.R b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_pearscale.R new file mode 100644 index 00000000..9e6d8a5f --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_pearscale.R @@ -0,0 +1,272 @@ +#' PanCancer predictor: univariate filtering by lasso + gene-level nets +#' similarity by Euclidean distance + local scaling + +# ---------------------------------------------------------------- +# helper functions +sim.pearscale <- function (dat, K = 20, alpha = 0.5) { +ztrans <- function(m) { + m <- as.matrix(m) + m2 <- apply(m,1,function(x) { (x-mean(x,na.rm=T))/sd(x,na.rm=T)}) + m2 +} +normalize <- function(X) { + print(dim(X)) + row.sum.mdiag <- rowSums(X,na.rm=T) - diag(X) + row.sum.mdiag[row.sum.mdiag == 0] <- 1 + X <- X/(2 * (row.sum.mdiag)) + diag(X) <- 0.5 + X <- (X+t(X))/2 + return(X) +} + + if (nrow(dat)<6) { + z1 <- ztrans(dat) + euc <- as.matrix(dist(z1,method="euclidean"))^(1/2) + } else { + euc <- as.matrix(1-cor(dat,method="pearson",use="pairwise.complete.obs")) + } + N <- nrow(euc) + euc <- (euc + t(euc))/2 + sortedColumns <- as.matrix(t(apply(euc, 2, sort,na.last=TRUE))) + print(dim(sortedColumns)) + finiteMean <- function(x) { + return(mean(x[is.finite(x)],na.rm=T)) + } + means <- apply(sortedColumns[, 1:K + 1], 1, finiteMean) + + .Machine$double.eps + avg <- function(x, y) { + return((x + y)/2) + } + Sig <- outer(means, means, avg)/3 * 2 + euc/3 + .Machine$double.eps + Sig[Sig <= .Machine$double.eps] <- .Machine$double.eps + densities <- dnorm(euc, 0, alpha * Sig, log = FALSE) + + W <- (densities + t(densities))/2 + W <- normalize(W) + # remove patients with no datapoints (full column/row of NAs) + idx <- which(rowSums(is.na(euc))==ncol(W)-1) + if (any(idx)) { + W <- W[-idx,] + idx <- which(colSums(is.na(euc))==ncol(W)-1) + W <- W[,-idx] + } + return(W) +} + +# ---------------------------------------------------------------- +runPredictor <- function(mega_combList,rngVals,netSets,dats,pheno_all,megaDir, + cutoffSet) { +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +maxEdge <- 6000 ### max edge after sparsification + +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +alldat <- do.call("rbind",dats) + +for (rngNum in rngVals) { + combList <- mega_combList # clean slate + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir), + sep="\t",col=T,row=F,quote=F) + + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + + # lasso + for (nm in names(dats_train)) { + print(nm) + if (nrow(dats_train[[nm]])<2) # clinical only has one var, take it. + vars <- rownames(dats_train[[nm]]) + else { + fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=1) # lasso + wt <- abs(coef(fit,s="lambda.min")[,1]) + vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") + } + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { netSets_iter[[k]] <- k } + combList[[nm]] <- sprintf("%s_cont", rownames(tmp)) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + combList[[nm]] <- sprintf("%s_cont",nm) + } + } + + if ("clinicalArna" %in% names(combList)) + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + if ("clinicalAmir" %in% names(combList)) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + if ("clinicalAcnv" %in% names(combList)) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + if ("clinicalAdnam" %in% names(combList)) + combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) + if ("clinicalAprot" %in% names(combList)) + combList[["clinicalAprot"]] <- c(combList[["clinical"]],combList[["prot"]]) + + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + netDir <- sprintf("%s/networks",outDir) + + cat(sprintf("Making test nets for rng%i\n", rngNum)) + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter,netDir, + simMetric="custom",customFunc=sim.pearscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=FALSE,numCores=numCores) + cat(sprintf("Total of %i nets\n", length(netList))) + + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("Input datatype\n%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + print(table(pheno_subtype$STATUS,useNA="always")) # sanitycheck + resDir <- sprintf("%s/GM_results",pDir2) + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + # make test db + + netDir <- sprintf("%s/test_networks",outDir) + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter,netDir, + simMetric="custom",customFunc=sim.pearscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=TRUE,numCores=numCores) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(netDir, pheno_all$ID, outDir,numCores=numCores) + + # classify patients + for (cutoff in cutoffSet) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + } + } + + } # input data combinations + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) + +} diff --git a/misc/PanCancer/pruneVersion/diff_kernels/pearscale/test.txt b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/test.txt new file mode 100644 index 00000000..1ac129a9 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/test.txt @@ -0,0 +1 @@ +/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pearscale_180507/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pearscale_180507/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pearscale_180507/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pearscale_180507 \ No newline at end of file From 623dc3784a9ea65b06259f5c56b615fc2a0febba Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Mon, 7 May 2018 16:58:59 -0400 Subject: [PATCH 073/124] replaced by rbf/ --- .../pruneVersion/diff_kernels/{ => outdated}/GBM_univar_rbf.R | 0 .../pruneVersion/diff_kernels/{ => outdated}/GBM_univar_rbf_sp1.R | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename misc/PanCancer/pruneVersion/diff_kernels/{ => outdated}/GBM_univar_rbf.R (100%) rename misc/PanCancer/pruneVersion/diff_kernels/{ => outdated}/GBM_univar_rbf_sp1.R (100%) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_rbf.R b/misc/PanCancer/pruneVersion/diff_kernels/outdated/GBM_univar_rbf.R similarity index 100% rename from misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_rbf.R rename to misc/PanCancer/pruneVersion/diff_kernels/outdated/GBM_univar_rbf.R diff --git a/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_rbf_sp1.R b/misc/PanCancer/pruneVersion/diff_kernels/outdated/GBM_univar_rbf_sp1.R similarity index 100% rename from misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_rbf_sp1.R rename to misc/PanCancer/pruneVersion/diff_kernels/outdated/GBM_univar_rbf_sp1.R From 6a152b9c693b79401794128966cbd0e7964d7ca4 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Mon, 7 May 2018 16:59:07 -0400 Subject: [PATCH 074/124] clean rbf pipeline --- .../diff_kernels/rbf/GBM_rbf_pipeline.R | 121 +++++++++ .../diff_kernels/rbf/LUSC_rbf_pipeline.R | 121 +++++++++ .../diff_kernels/rbf/OV_rbf_pipeline.R | 113 ++++++++ .../diff_kernels/rbf/PanCancer_rbf.R | 253 ++++++++++++++++++ 4 files changed, 608 insertions(+) create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/rbf/GBM_rbf_pipeline.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/rbf/LUSC_rbf_pipeline.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/rbf/OV_rbf_pipeline.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/rbf/PanCancer_rbf.R diff --git a/misc/PanCancer/pruneVersion/diff_kernels/rbf/GBM_rbf_pipeline.R b/misc/PanCancer/pruneVersion/diff_kernels/rbf/GBM_rbf_pipeline.R new file mode 100644 index 00000000..9b9914df --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/rbf/GBM_rbf_pipeline.R @@ -0,0 +1,121 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} +rm(pname) + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv_cont"), + clinical="clinical_cont", + mir="mir_cont", + rna="rna_cont", + cnv="cnv_cont", + dnam="dnam_cont", + clinicalArna=c("clinical_cont","rna_cont"), + clinicalAmir=c("clinical_cont","mir_cont"), + clinicalAdnam=c("clinical_cont","dnam_cont"), + all="all" +) + +pheno_all <- pheno + +# cleanup +rm(pheno,pheno_nosurv) +rm(rootDir,survStr,surv,tmp,nm,inDir,k,inFiles,datFiles) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_rbf.R") +hp <- 0.2 # sigma +dt <- format(Sys.Date(),"%y%m%d") + +megaDir <- sprintf("%s/rbfclean_%1.2f_%s",outRoot,hp,dt) +cat(megaDir, file="test.txt",append=TRUE) +cat("about to run\n") +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9,sigmaVar=hp) + + diff --git a/misc/PanCancer/pruneVersion/diff_kernels/rbf/LUSC_rbf_pipeline.R b/misc/PanCancer/pruneVersion/diff_kernels/rbf/LUSC_rbf_pipeline.R new file mode 100644 index 00000000..3b1cc578 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/rbf/LUSC_rbf_pipeline.R @@ -0,0 +1,121 @@ +#' PanCancer binarized survival: LUSC: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design + +rm(list=ls()) + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" + +hp <- 0.2 +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/rbfclean_%1.2f_%s",outRoot,hp,dt) + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), + survival=sprintf("%s/LUSC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), + prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), + mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), + cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno +rownames(clinical) <- clinical[,1]; +# ======================= +# LUSC-specific variables +clinical$stage <- as.vector(clinical$stage) +clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" +clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" +clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" +clinical$stage <- as.factor(clinical$stage) +clinical <- clinical[, -which(colnames(clinical)=="gender")] +clinical <- t(clinical[,c("age","stage")]) +clinical[1,] <- as.integer(clinical[1,]) +clinical[2,] <- as.integer(as.factor(clinical[2,])) +class(clinical) <- "numeric" +# ======================= +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + if (nm == "rna") tmp <- log(tmp+1) + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno) +rm(survStr,surv,tmp,nm,inDir,dt,k,inFiles,datFiles,pname,outRoot) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_rbf.R") + +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9,sigmaVar=hp) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/rbf/OV_rbf_pipeline.R b/misc/PanCancer/pruneVersion/diff_kernels/rbf/OV_rbf_pipeline.R new file mode 100644 index 00000000..cb237761 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/rbf/OV_rbf_pipeline.R @@ -0,0 +1,113 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/input" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output" + +hp <- 0.2 # sigma +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/rbfclean_%s_%s",outRoot,hp,dt) + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/OV_clinical_core.txt",inDir), + survival=sprintf("%s/OV_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/OV_mRNA_core.txt",inDir), + prot=sprintf("%s/OV_RPPA_core.txt",inDir), + mir=sprintf("%s/OV_miRNA_core.txt",inDir), + dnam=sprintf("%s/OV_methylation_core.txt",inDir), + cnv=sprintf("%s/OV_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clin <- pheno +rownames(clin) <- clin[,1]; +clin <- t(clin[,2,drop=FALSE]) +dats$clinical <- clin; rm(clin) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID), drop = FALSE]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx, drop = FALSE] + x +}) + + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) +rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_rbf.R") +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9,sigmaVar=hp) + + + + diff --git a/misc/PanCancer/pruneVersion/diff_kernels/rbf/PanCancer_rbf.R b/misc/PanCancer/pruneVersion/diff_kernels/rbf/PanCancer_rbf.R new file mode 100644 index 00000000..e7e1f06a --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/rbf/PanCancer_rbf.R @@ -0,0 +1,253 @@ +#' PanCancer predictor: univariate filtering by lasso + gene-level nets +#' similarity by Euclidean distance + local scaling + +# ---------------------------------------------------------------- +# helper functions +sim.kern <- function(m,nm="rbf",sigmaVar=0.05) { + + # z-transform + m <- (m-rowMeans(m,na.rm=TRUE))/apply(m,1,sd,na.rm=T) + + if (nm=="rbf") { + func <- kernlab::rbfdot(sigmaVar) + cat(sprintf("Sigma = %1.2f\n", sigmaVar)) + } else if (nm == "tanh") { + cat("using tanh\n") + func <- kernlab::tanhdot() + } + m <- as.matrix(na.omit(m)) + idx <- combinat::combn(1:ncol(m),2) + out <- matrix(NA,nrow=ncol(m),ncol=ncol(m)) + for (comb in 1:ncol(idx)) { + i <- idx[1,comb]; j <- idx[2,comb] + x <- func(m[,i],m[,j]) + out[i,j] <- x; out[j,i] <- x + } + # self-similarity for samity + for (k in 1:ncol(m)) out[k,k] <- func(m[,k],m[,k]) + colnames(out)<- colnames(m); + rownames(out) <- colnames(m) + out[which(out < .Machine$double.eps)] <- .Machine$double.eps + return(out) +} + + +# ---------------------------------------------------------------- +runPredictor <- function(mega_combList,rngVals,netSets,dats,pheno_all,megaDir, + cutoffSet,sigmaVar) { +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +maxEdge <- 6000 ### max edge after sparsification + +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +alldat <- do.call("rbind",dats) + +for (rngNum in rngVals) { + combList <- mega_combList # clean slate + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir), + sep="\t",col=T,row=F,quote=F) + + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + + # lasso + for (nm in names(dats_train)) { + print(nm) + if (nrow(dats_train[[nm]])<2) # clinical only has one var, take it. + vars <- rownames(dats_train[[nm]]) + else { + fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=1) # lasso + wt <- abs(coef(fit,s="lambda.min")[,1]) + vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") + } + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { netSets_iter[[k]] <- k } + combList[[nm]] <- sprintf("%s_cont", rownames(tmp)) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + combList[[nm]] <- sprintf("%s_cont",nm) + } + } + + if ("clinicalArna" %in% names(combList)) + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + if ("clinicalAmir" %in% names(combList)) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + if ("clinicalAcnv" %in% names(combList)) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + if ("clinicalAdnam" %in% names(combList)) + combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) + if ("clinicalAprot" %in% names(combList)) + combList[["clinicalAprot"]] <- c(combList[["clinical"]],combList[["prot"]]) + + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + netDir <- sprintf("%s/networks",outDir) + + cat(sprintf("Making test nets for rng%i\n", rngNum)) + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter,netDir, + simMetric="custom",customFunc=sim.kern,sigmaVar=sigmaVar, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=FALSE,numCores=numCores) + cat(sprintf("Total of %i nets\n", length(netList))) + + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("Input datatype\n%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + print(table(pheno_subtype$STATUS,useNA="always")) # sanitycheck + resDir <- sprintf("%s/GM_results",pDir2) + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + # make test db + + netDir <- sprintf("%s/test_networks",outDir) + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter,netDir, + simMetric="custom",customFunc=sim.kern,sigmaVar=sigmaVar, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=TRUE,numCores=numCores) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(netDir, pheno_all$ID, outDir,numCores=numCores) + + # classify patients + for (cutoff in cutoffSet) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + } + } + + } # input data combinations + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) + +} From c3fe18c24a30af63f8c0b60d1ddafd11c6adb0eb Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Mon, 7 May 2018 16:59:15 -0400 Subject: [PATCH 075/124] moved to own dir --- .../eucscale/GBM_eucscale_pipeline.R | 118 ++++++ .../eucscale/GBM_univar_eucscale_grouped.R | 387 ++++++++++++++++++ .../eucscale/KIRC_eucscale_pipeline.R | 121 ++++++ .../eucscale/LUSC_eucscale_pipeline.R | 119 ++++++ .../eucscale/OV_eucscale_pipeline.R | 113 +++++ .../eucscale/PanCancer_eucscale.R | 274 +++++++++++++ 6 files changed, 1132 insertions(+) create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/eucscale/GBM_eucscale_pipeline.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/eucscale/GBM_univar_eucscale_grouped.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/eucscale/KIRC_eucscale_pipeline.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/eucscale/LUSC_eucscale_pipeline.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/eucscale/OV_eucscale_pipeline.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/eucscale/PanCancer_eucscale.R diff --git a/misc/PanCancer/pruneVersion/diff_kernels/eucscale/GBM_eucscale_pipeline.R b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/GBM_eucscale_pipeline.R new file mode 100644 index 00000000..07b96bfb --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/GBM_eucscale_pipeline.R @@ -0,0 +1,118 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/eucclean_%s",outRoot,dt) +cat(megaDir, file="test.txt",append=TRUE) + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} +rm(pname) + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv_cont"), + clinical="clinical_cont", + mir="mir_cont", + rna="rna_cont", + cnv="cnv_cont", + dnam="dnam_cont", + clinicalArna=c("clinical_cont","rna_cont"), + clinicalAmir=c("clinical_cont","mir_cont"), + clinicalAdnam=c("clinical_cont","dnam_cont"), + all="all" +) + +pheno_all <- pheno + +# cleanup +rm(pheno,pheno_nosurv) +rm(rootDir,survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_eucscale.R") +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9) + + diff --git a/misc/PanCancer/pruneVersion/diff_kernels/eucscale/GBM_univar_eucscale_grouped.R b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/GBM_univar_eucscale_grouped.R new file mode 100644 index 00000000..4d5e7406 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/GBM_univar_eucscale_grouped.R @@ -0,0 +1,387 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running +# Lasso+EucScale but not gene-level nets. + +rm(list=ls()) +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +maxEdge <- 6000 ### max edge after sparsification + +args <- commandArgs(TRUE) + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/eucscale_sp2max%i_grouped_%s",outRoot,maxEdge,dt) +cat(megaDir, file="test.txt",append=TRUE) + +# ---------------------------------------------------------------- +# helper functions + +# SNF similarity method. Euclidean distance followed by exponential +# scaling where sigma is tuned based on local data structure. +sim.eucscale <- function (dat, K = 20, alpha = 0.5) { +ztrans <- function(m) { + m <- as.matrix(m) + m2 <- apply(m,1,function(x) { (x-mean(x,na.rm=T))/sd(x,na.rm=T)}) + m2 +} +normalize <- function(X) { + print(dim(X)) + row.sum.mdiag <- rowSums(X,na.rm=T) - diag(X) + row.sum.mdiag[row.sum.mdiag == 0] <- 1 + X <- X/(2 * (row.sum.mdiag)) + diag(X) <- 0.5 + X <- (X+t(X))/2 + return(X) +} + nnodata <- which(abs(colSums(dat,na.rm=T)) < .Machine$double.eps) + #if (length(nodata)>0) dat[nodata] <- median(dat) # impute median + z1 <- ztrans(dat) + euc <- as.matrix(dist(z1,method="euclidean"))^(1/2) + N <- nrow(euc) + euc <- (euc + t(euc))/2 + sortedColumns <- as.matrix(t(apply(euc, 2, sort,na.last=TRUE))) + print(dim(sortedColumns)) + finiteMean <- function(x) { + return(mean(x[is.finite(x)],na.rm=T)) + } + means <- apply(sortedColumns[, 1:K + 1], 1, finiteMean) + + .Machine$double.eps + avg <- function(x, y) { + return((x + y)/2) + } + Sig <- outer(means, means, avg)/3 * 2 + euc/3 + .Machine$double.eps + Sig[Sig <= .Machine$double.eps] <- .Machine$double.eps + densities <- dnorm(euc, 0, alpha * Sig, log = FALSE) + + W <- (densities + t(densities))/2 + W <- normalize(W) + # remove patients with no datapoints (full column/row of NAs) + idx <- which(rowSums(is.na(euc))==ncol(W)-1) + if (any(idx)) { + W <- W[-idx,] + idx <- which(colSums(is.na(euc))==ncol(W)-1) + W <- W[,-idx] + } + return(W) +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ +# first loop - over train/test splits +mega_combList <- combList # changes each round +for (rngNum in 1:20) { + combList <- mega_combList # clean slate + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir), + sep="\t",col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + #----------- + # Begin Lasso UF + for (nm in names(dats_train)) { + print(nm) + # run lasso with cv + fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=1) + # pick lambda that minimizes MSE + wt <- abs(coef(fit,s="lambda.min")[,1]) + vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + } # else do nothing, use full dataset + + # one net for datatype + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + combList[[nm]] <- sprintf("%s_cont", nm) + } + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + +# ------------------------------- +# make train db + netDir <- sprintf("%s/networks",outDir) + +cat(sprintf("Making train nets for rng%i\n", rngNum)) +netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter,netDir, + simMetric="custom",customFunc=sim.eucscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=FALSE,numCores=numCores) + + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("Input datatype\n%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + +# ------------------------------------- +# make test db + + +netDir <- sprintf("%s/test_networks",outDir) +netList <- c() + +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter,netDir, + simMetric="custom",customFunc=sim.eucscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=TRUE,numCores=numCores) +cat(sprintf("Total of %i nets\n", length(netList))) +# now create database +testdbDir <- GM_createDB(netDir, pheno_all$ID, outDir,numCores=numCores) +# ------------------------------------- + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/eucscale/KIRC_eucscale_pipeline.R b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/KIRC_eucscale_pipeline.R new file mode 100644 index 00000000..4ddfa538 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/KIRC_eucscale_pipeline.R @@ -0,0 +1,121 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) + +inDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/input" +outRoot <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/eucclean_%s",outRoot,dt) + + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/KIRC_clinical_core.txt",inDir), + survival=sprintf("%s/KIRC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/KIRC_mRNA_core.txt",inDir), + prot=sprintf("%s/KIRC_RPPA_core.txt",inDir), + mir=sprintf("%s/KIRC_miRNA_core.txt",inDir), + dnam=sprintf("%s/KIRC_methylation_core.txt",inDir), + cnv=sprintf("%s/KIRC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +#======transform clinical data========= +pheno$grade <- as.vector(pheno$grade) +pheno$grade[pheno$grade=="G1"] <- "G2" +pheno$grade[pheno$grade=="GX"] <- "G2" +pheno$grade <- as.factor(pheno$grade) +pheno <- pheno[, -which(colnames(pheno)=="gender")] +#====================================== + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +clinical$grade <- as.numeric(factor(clinical$grade)) +clinical$stage <- as.numeric(factor(clinical$stage)) +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) +rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_eucscale.R") +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9) + + diff --git a/misc/PanCancer/pruneVersion/diff_kernels/eucscale/LUSC_eucscale_pipeline.R b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/LUSC_eucscale_pipeline.R new file mode 100644 index 00000000..63e20923 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/LUSC_eucscale_pipeline.R @@ -0,0 +1,119 @@ +#' PanCancer binarized survival: LUSC: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design + +rm(list=ls()) + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/eucclean_%s",outRoot,dt) + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), + survival=sprintf("%s/LUSC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), + prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), + mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), + cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno +rownames(clinical) <- clinical[,1]; +# ======================= +# LUSC-specific variables +clinical$stage <- as.vector(clinical$stage) +clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" +clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" +clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" +clinical$stage <- as.factor(clinical$stage) +clinical <- clinical[, -which(colnames(clinical)=="gender")] +clinical <- t(clinical[,c("age","stage")]) +clinical[1,] <- as.integer(clinical[1,]) +clinical[2,] <- as.integer(as.factor(clinical[2,])) +class(clinical) <- "numeric" +# ======================= +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + if (nm == "rna") tmp <- log(tmp+1) + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno) +rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_eucscale.R") +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/eucscale/OV_eucscale_pipeline.R b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/OV_eucscale_pipeline.R new file mode 100644 index 00000000..9a3e1c1b --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/OV_eucscale_pipeline.R @@ -0,0 +1,113 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) + + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/input" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/eucscale_%s",outRoot,dt) + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/OV_clinical_core.txt",inDir), + survival=sprintf("%s/OV_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/OV_mRNA_core.txt",inDir), + prot=sprintf("%s/OV_RPPA_core.txt",inDir), + mir=sprintf("%s/OV_miRNA_core.txt",inDir), + dnam=sprintf("%s/OV_methylation_core.txt",inDir), + cnv=sprintf("%s/OV_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clin <- pheno +rownames(clin) <- clin[,1]; +clin <- t(clin[,2,drop=FALSE]) +dats$clinical <- clin; rm(clin) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID), drop = FALSE]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx, drop = FALSE] + x +}) + + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) +rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_eucscale.R") +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9) + + + + diff --git a/misc/PanCancer/pruneVersion/diff_kernels/eucscale/PanCancer_eucscale.R b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/PanCancer_eucscale.R new file mode 100644 index 00000000..150a2461 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/PanCancer_eucscale.R @@ -0,0 +1,274 @@ +#' PanCancer predictor: univariate filtering by lasso + gene-level nets +#' similarity by Euclidean distance + local scaling + +# ---------------------------------------------------------------- +# helper functions + +# SNF similarity method. Euclidean distance followed by exponential +# scaling where sigma is tuned based on local data structure. +sim.eucscale <- function (dat, K = 20, alpha = 0.5) { +ztrans <- function(m) { + m <- as.matrix(m) + m2 <- apply(m,1,function(x) { (x-mean(x,na.rm=T))/sd(x,na.rm=T)}) + m2 +} +normalize <- function(X) { + print(dim(X)) + row.sum.mdiag <- rowSums(X,na.rm=T) - diag(X) + row.sum.mdiag[row.sum.mdiag == 0] <- 1 + X <- X/(2 * (row.sum.mdiag)) + diag(X) <- 0.5 + X <- (X+t(X))/2 + return(X) +} + nnodata <- which(abs(colSums(dat,na.rm=T)) < .Machine$double.eps) + #if (length(nodata)>0) dat[nodata] <- median(dat) # impute median + z1 <- ztrans(dat) + euc <- as.matrix(dist(z1,method="euclidean"))^(1/2) + N <- nrow(euc) + euc <- (euc + t(euc))/2 + sortedColumns <- as.matrix(t(apply(euc, 2, sort,na.last=TRUE))) + print(dim(sortedColumns)) + finiteMean <- function(x) { + return(mean(x[is.finite(x)],na.rm=T)) + } + means <- apply(sortedColumns[, 1:K + 1], 1, finiteMean) + + .Machine$double.eps + avg <- function(x, y) { + return((x + y)/2) + } + Sig <- outer(means, means, avg)/3 * 2 + euc/3 + .Machine$double.eps + Sig[Sig <= .Machine$double.eps] <- .Machine$double.eps + densities <- dnorm(euc, 0, alpha * Sig, log = FALSE) + + W <- (densities + t(densities))/2 + W <- normalize(W) + # remove patients with no datapoints (full column/row of NAs) + idx <- which(rowSums(is.na(euc))==ncol(W)-1) + if (any(idx)) { + W <- W[-idx,] + idx <- which(colSums(is.na(euc))==ncol(W)-1) + W <- W[,-idx] + } + return(W) +} + +# ---------------------------------------------------------------- +runPredictor <- function(mega_combList,rngVals,netSets,dats,pheno_all,megaDir, + cutoffSet) { +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +maxEdge <- 6000 ### max edge after sparsification + +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +alldat <- do.call("rbind",dats) + +for (rngNum in rngVals) { + combList <- mega_combList # clean slate + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir), + sep="\t",col=T,row=F,quote=F) + + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + + # lasso + for (nm in names(dats_train)) { + print(nm) + if (nrow(dats_train[[nm]])<2) # clinical only has one var, take it. + vars <- rownames(dats_train[[nm]]) + else { + fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=1) # lasso + wt <- abs(coef(fit,s="lambda.min")[,1]) + vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") + } + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { netSets_iter[[k]] <- k } + combList[[nm]] <- sprintf("%s_cont", rownames(tmp)) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + combList[[nm]] <- sprintf("%s_cont",nm) + } + } + + if ("clinicalArna" %in% names(combList)) + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + if ("clinicalAmir" %in% names(combList)) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + if ("clinicalAcnv" %in% names(combList)) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + if ("clinicalAdnam" %in% names(combList)) + combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) + if ("clinicalAprot" %in% names(combList)) + combList[["clinicalAprot"]] <- c(combList[["clinical"]],combList[["prot"]]) + + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + netDir <- sprintf("%s/networks",outDir) + + cat(sprintf("Making test nets for rng%i\n", rngNum)) + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter,netDir, + simMetric="custom",customFunc=sim.eucscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=FALSE,numCores=numCores) + cat(sprintf("Total of %i nets\n", length(netList))) + + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("Input datatype\n%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + print(table(pheno_subtype$STATUS,useNA="always")) # sanitycheck + resDir <- sprintf("%s/GM_results",pDir2) + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + # make test db + + netDir <- sprintf("%s/test_networks",outDir) + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter,netDir, + simMetric="custom",customFunc=sim.eucscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=TRUE,numCores=numCores) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(netDir, pheno_all$ID, outDir,numCores=numCores) + + # classify patients + for (cutoff in cutoffSet) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + } + } + + } # input data combinations + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) + +} From ec1103678b945e6fb4db82486157d548746e66e1 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Mon, 7 May 2018 16:59:24 -0400 Subject: [PATCH 076/124] relocated --- .../diff_kernels/GBM_eucscale_pipeline.R | 118 ------ .../GBM_univar_eucscale_grouped.R | 387 ----------------- .../diff_kernels/KIRC_eucscale_pipeline.R | 121 ------ .../diff_kernels/KIRC_univar_eucscale.R | 372 ----------------- .../diff_kernels/LUSC_eucscale_pipeline.R | 119 ------ .../diff_kernels/LUSC_univar_eucscale.R | 393 ------------------ .../diff_kernels/OV_eucscale_pipeline.R | 113 ----- .../diff_kernels/OV_univar_eucscale.R | 363 ---------------- .../diff_kernels/PanCancer_eucscale.R | 274 ------------ 9 files changed, 2260 deletions(-) delete mode 100644 misc/PanCancer/pruneVersion/diff_kernels/GBM_eucscale_pipeline.R delete mode 100644 misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_eucscale_grouped.R delete mode 100644 misc/PanCancer/pruneVersion/diff_kernels/KIRC_eucscale_pipeline.R delete mode 100644 misc/PanCancer/pruneVersion/diff_kernels/KIRC_univar_eucscale.R delete mode 100644 misc/PanCancer/pruneVersion/diff_kernels/LUSC_eucscale_pipeline.R delete mode 100644 misc/PanCancer/pruneVersion/diff_kernels/LUSC_univar_eucscale.R delete mode 100644 misc/PanCancer/pruneVersion/diff_kernels/OV_eucscale_pipeline.R delete mode 100644 misc/PanCancer/pruneVersion/diff_kernels/OV_univar_eucscale.R delete mode 100644 misc/PanCancer/pruneVersion/diff_kernels/PanCancer_eucscale.R diff --git a/misc/PanCancer/pruneVersion/diff_kernels/GBM_eucscale_pipeline.R b/misc/PanCancer/pruneVersion/diff_kernels/GBM_eucscale_pipeline.R deleted file mode 100644 index 07b96bfb..00000000 --- a/misc/PanCancer/pruneVersion/diff_kernels/GBM_eucscale_pipeline.R +++ /dev/null @@ -1,118 +0,0 @@ -#' PanCancer binarized survival: GBM: Feature selection with one net per -#' datatype -#' 10-fold CV predictor design -#' multi cutoff evaluation -#' also pruning RNA before running - -rm(list=ls()) - -rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" -inDir <- sprintf("%s/input",rootDir) -outRoot <- sprintf("%s/output",rootDir) - -dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/eucclean_%s",outRoot,dt) -cat(megaDir, file="test.txt",append=TRUE) - -# ----------------------------------------------------------- -# process input -inFiles <- list( - clinical=sprintf("%s/GBM_clinical_core.txt",inDir), - survival=sprintf("%s/GBM_binary_survival.txt",inDir) - ) -datFiles <- list( - rna=sprintf("%s/GBM_mRNA_core.txt",inDir), - mir=sprintf("%s/GBM_miRNA_core.txt",inDir), - dnam=sprintf("%s/GBM_methylation_core.txt",inDir), - cnv=sprintf("%s/GBM_CNV_core.txt",inDir) -) - -pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) -colnames(pheno)[1] <- "ID" - -surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) -colnames(surv)[1:2] <- c("ID","STATUS_INT") -survStr <- rep(NA,nrow(surv)) -survStr[surv$STATUS_INT<1] <- "SURVIVENO" -survStr[surv$STATUS_INT>0] <- "SURVIVEYES" -surv$STATUS <- survStr -pheno <- merge(x=pheno,y=surv,by="ID") -pheno$X <- NULL -pheno_nosurv <- pheno[1:4] - -cat("Collecting patient data:\n") -dats <- list() #input data in different slots -cat("\t* Clinical\n") -clinical <- pheno_nosurv -rownames(clinical) <- clinical[,1]; -# ======================= -# GBM-specific variables -clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA -clinical$performance_score <- strtoi(clinical$performance_score) -clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) -# ======================= -clinical$ID <- NULL -clinical <- t(clinical) -dats$clinical <- clinical; rm(clinical) - -# create master input net -for (nm in names(datFiles)) { - cat(sprintf("\t* %s\n",nm)) - tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) - if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] - rownames(tmp) <- tmp[,1] - tmp <- t(tmp[,-1]) - class(tmp) <- "numeric" - dats[[nm]] <- tmp -} - -cat("\t Ordering column names\n") -# include only data for patients in classifier -dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) -dats <- lapply(dats, function(x) { - midx <- match(pheno$ID,colnames(x)) - x <- x[,midx] - x -}) - -# confirm patient order the same for all input nets -pname <- colnames(dats[[1]]) -for (k in 2:length(dats)) { - if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { - cat(sprintf("Patient order doesn't match for %s\n", - names(dats)[k])) - browser() - } -} -rm(pname) - -# input nets for each category -netSets <- lapply(dats, function(x) rownames(x)) - -combList <- list( - clinicalAcnv=c("clinical_cont","cnv_cont"), - clinical="clinical_cont", - mir="mir_cont", - rna="rna_cont", - cnv="cnv_cont", - dnam="dnam_cont", - clinicalArna=c("clinical_cont","rna_cont"), - clinicalAmir=c("clinical_cont","mir_cont"), - clinicalAdnam=c("clinical_cont","dnam_cont"), - all="all" -) - -pheno_all <- pheno - -# cleanup -rm(pheno,pheno_nosurv) -rm(rootDir,survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles) - -# ----------------------------------------------------------- -# run predictor -source("PanCancer_eucscale.R") -runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, - dats=dats,pheno_all=pheno_all,megaDir=megaDir, - cutoffSet=9) - - diff --git a/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_eucscale_grouped.R b/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_eucscale_grouped.R deleted file mode 100644 index 4d5e7406..00000000 --- a/misc/PanCancer/pruneVersion/diff_kernels/GBM_univar_eucscale_grouped.R +++ /dev/null @@ -1,387 +0,0 @@ -#' PanCancer binarized survival: GBM: Feature selection with one net per -#' datatype -#' 10-fold CV predictor design -#' multi cutoff evaluation -#' also pruning RNA before running -# Lasso+EucScale but not gene-level nets. - -rm(list=ls()) -require(netDx) -require(netDx.examples) -require(glmnet) - -numCores <- 8L -GMmemory <- 4L -trainProp <- 0.8 -maxEdge <- 6000 ### max edge after sparsification - -args <- commandArgs(TRUE) - -rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" -inDir <- sprintf("%s/input",rootDir) -outRoot <- sprintf("%s/output",rootDir) - -dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/eucscale_sp2max%i_grouped_%s",outRoot,maxEdge,dt) -cat(megaDir, file="test.txt",append=TRUE) - -# ---------------------------------------------------------------- -# helper functions - -# SNF similarity method. Euclidean distance followed by exponential -# scaling where sigma is tuned based on local data structure. -sim.eucscale <- function (dat, K = 20, alpha = 0.5) { -ztrans <- function(m) { - m <- as.matrix(m) - m2 <- apply(m,1,function(x) { (x-mean(x,na.rm=T))/sd(x,na.rm=T)}) - m2 -} -normalize <- function(X) { - print(dim(X)) - row.sum.mdiag <- rowSums(X,na.rm=T) - diag(X) - row.sum.mdiag[row.sum.mdiag == 0] <- 1 - X <- X/(2 * (row.sum.mdiag)) - diag(X) <- 0.5 - X <- (X+t(X))/2 - return(X) -} - nnodata <- which(abs(colSums(dat,na.rm=T)) < .Machine$double.eps) - #if (length(nodata)>0) dat[nodata] <- median(dat) # impute median - z1 <- ztrans(dat) - euc <- as.matrix(dist(z1,method="euclidean"))^(1/2) - N <- nrow(euc) - euc <- (euc + t(euc))/2 - sortedColumns <- as.matrix(t(apply(euc, 2, sort,na.last=TRUE))) - print(dim(sortedColumns)) - finiteMean <- function(x) { - return(mean(x[is.finite(x)],na.rm=T)) - } - means <- apply(sortedColumns[, 1:K + 1], 1, finiteMean) + - .Machine$double.eps - avg <- function(x, y) { - return((x + y)/2) - } - Sig <- outer(means, means, avg)/3 * 2 + euc/3 + .Machine$double.eps - Sig[Sig <= .Machine$double.eps] <- .Machine$double.eps - densities <- dnorm(euc, 0, alpha * Sig, log = FALSE) - - W <- (densities + t(densities))/2 - W <- normalize(W) - # remove patients with no datapoints (full column/row of NAs) - idx <- which(rowSums(is.na(euc))==ncol(W)-1) - if (any(idx)) { - W <- W[-idx,] - idx <- which(colSums(is.na(euc))==ncol(W)-1) - W <- W[,-idx] - } - return(W) -} - -# ----------------------------------------------------------- -# process input -inFiles <- list( - clinical=sprintf("%s/GBM_clinical_core.txt",inDir), - survival=sprintf("%s/GBM_binary_survival.txt",inDir) - ) -datFiles <- list( - rna=sprintf("%s/GBM_mRNA_core.txt",inDir), - mir=sprintf("%s/GBM_miRNA_core.txt",inDir), - dnam=sprintf("%s/GBM_methylation_core.txt",inDir), - cnv=sprintf("%s/GBM_CNV_core.txt",inDir) -) - -pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) -colnames(pheno)[1] <- "ID" -# ------------------ - -surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) -colnames(surv)[1:2] <- c("ID","STATUS_INT") -survStr <- rep(NA,nrow(surv)) -survStr[surv$STATUS_INT<1] <- "SURVIVENO" -survStr[surv$STATUS_INT>0] <- "SURVIVEYES" -surv$STATUS <- survStr -pheno <- merge(x=pheno,y=surv,by="ID") -pheno$X <- NULL -pheno_nosurv <- pheno[1:4] - -cat("Collecting patient data:\n") -dats <- list() #input data in different slots -cat("\t* Clinical\n") -clinical <- pheno_nosurv -rownames(clinical) <- clinical[,1]; -# ======================= -# GBM-specific variables -clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA -clinical$performance_score <- strtoi(clinical$performance_score) -clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) -# ======================= -clinical$ID <- NULL -clinical <- t(clinical) -dats$clinical <- clinical; rm(clinical) - -# create master input net -for (nm in names(datFiles)) { - cat(sprintf("\t* %s\n",nm)) - tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) - if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] - rownames(tmp) <- tmp[,1] - tmp <- t(tmp[,-1]) - class(tmp) <- "numeric" - dats[[nm]] <- tmp -} - -cat("\t Ordering column names\n") -# include only data for patients in classifier -dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) -dats <- lapply(dats, function(x) { - midx <- match(pheno$ID,colnames(x)) - x <- x[,midx] - x -}) - -# confirm patient order the same for all input nets -pname <- colnames(dats[[1]]) -for (k in 2:length(dats)) { - if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { - cat(sprintf("Patient order doesn't match for %s\n", - names(dats)[k])) - browser() - } -} - -# input nets for each category -netSets <- lapply(dats, function(x) rownames(x)) - -# compile data -alldat <- do.call("rbind",dats) -pheno_all <- pheno - -combList <- list( - clinicalAcnv=c("clinical_cont","cnv.profile"), - clinical="clinical_cont", - mir="mir.profile", - rna="rna.profile", - cnv="cnv.profile", - dnam="dnam.profile", - clinicalArna=c("clinical_cont","rna.profile"), - clinicalAmir=c("clinical_cont","mir.profile"), - clinicalAdnam=c("clinical_cont","dnam.profile"), - all="all" -) - -cat(sprintf("Clinical variables are: { %s }\n", - paste(rownames(dats$clinical),sep=",",collapse=","))) -rm(pheno,pheno_nosurv) - -# ---------------------------------------------------------- -# build classifier -if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) -dir.create(megaDir) - -logFile <- sprintf("%s/log.txt",megaDir) -sink(logFile,split=TRUE) -tryCatch({ -# first loop - over train/test splits -mega_combList <- combList # changes each round -for (rngNum in 1:20) { - combList <- mega_combList # clean slate - rng_t0 <- Sys.time() - cat(sprintf("-------------------------------\n")) - cat(sprintf("RNG seed = %i\n", rngNum)) - cat(sprintf("-------------------------------\n")) - outDir <- sprintf("%s/rng%i",megaDir,rngNum) - dir.create(outDir) - - pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, - setSeed=rngNum*5) - write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir), - sep="\t",col=T,row=F,quote=F) - # -------------------------------------------- - # feature selection - train only - pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") - dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), - drop=FALSE]) - netSets_iter <- list() - #----------- - # Begin Lasso UF - for (nm in names(dats_train)) { - print(nm) - # run lasso with cv - fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), - y=factor(pheno$STATUS), family="binomial", alpha=1) - # pick lambda that minimizes MSE - wt <- abs(coef(fit,s="lambda.min")[,1]) - vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") - cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) - - if (length(vars)>0) { - tmp <- dats_train[[nm]] - tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] - dats_train[[nm]] <- tmp - } # else do nothing, use full dataset - - # one net for datatype - netSets_iter[[nm]] <- rownames(dats_train[[nm]]) - combList[[nm]] <- sprintf("%s_cont", nm) - } - combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) - combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) - combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) - combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) - # END lasso UF - # ---------------------- - alldat_train <- do.call("rbind",dats_train) - -# ------------------------------- -# make train db - netDir <- sprintf("%s/networks",outDir) - -cat(sprintf("Making train nets for rng%i\n", rngNum)) -netList <- makePSN_NamedMatrix(alldat_train, - rownames(alldat_train),netSets_iter,netDir, - simMetric="custom",customFunc=sim.eucscale, - writeProfiles=FALSE, - sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, - sparsify_edgeMax=maxEdge, - verbose=FALSE,numCores=numCores) - - cat(sprintf("Total of %i nets\n", length(netList))) - - # now create database - dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) - - # second loop - over combinations of input data - for (cur in names(combList)) { - t0 <- Sys.time() - cat(sprintf("Input datatype\n%s\n",cur)) - pDir <- sprintf("%s/%s",outDir, cur) - dir.create(pDir) - - # run featsel once per subtype - subtypes <- unique(pheno$STATUS) - # run 10-fold cv per subtype - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) - dir.create(pDir2) - - cat(sprintf("\n******\nSubtype %s\n",g)) - pheno_subtype <- pheno - ## label patients not in the current class as residual - nong <- which(!pheno_subtype$STATUS %in% g) - pheno_subtype$STATUS[nong] <- "nonpred" - ## sanity check - print(table(pheno_subtype$STATUS,useNA="always")) - resDir <- sprintf("%s/GM_results",pDir2) - ## query for feature selection comprises of training - ## samples from the class of interest - trainPred <- pheno_subtype$ID[ - which(pheno_subtype$STATUS %in% g)] - - # Cross validation - GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, - nrow(pheno_subtype),incNets=combList[[cur]], - verbose=T, numCores=numCores, - GMmemory=GMmemory) - - # patient similarity ranks - prank <- dir(path=resDir,pattern="PRANK$") - # network ranks - nrank <- dir(path=resDir,pattern="NRANK$") - cat(sprintf("Got %i prank files\n",length(prank))) - - # Compute network score - pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) - head(pTally) - # write to file - tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) - write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) - } - -# ------------------------------------- -# make test db - - -netDir <- sprintf("%s/test_networks",outDir) -netList <- c() - -netList <- makePSN_NamedMatrix(alldat, - rownames(alldat),netSets_iter,netDir, - simMetric="custom",customFunc=sim.eucscale, - writeProfiles=FALSE, - sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, - sparsify_edgeMax=maxEdge, - verbose=TRUE,numCores=numCores) -cat(sprintf("Total of %i nets\n", length(netList))) -# now create database -testdbDir <- GM_createDB(netDir, pheno_all$ID, outDir,numCores=numCores) -# ------------------------------------- - for (cutoff in 7:9) { - predRes <- list() - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - # get feature selected net names - pTally <- read.delim( - sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), - sep="\t",h=T,as.is=T) - - # feature selected nets pass cutoff threshold - pTally <- pTally[which(pTally[,2]>=cutoff),1] - cat(sprintf("%s: %i pathways\n",g,length(pTally))) - if (length(pTally)>=1) { - curD <- sprintf("%s/cutoff%i",pDir2,cutoff) - dir.create(curD) - # query of all training samples for this class - qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & - pheno_all$TT_STATUS%in%"TRAIN")] - - qFile <- sprintf("%s/%s_query",curD,g) - GM_writeQueryFile(qSamps,incNets=pTally, - nrow(pheno_all),qFile) - resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) - predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), - pheno_all,g) - } else { - predRes[[g]] <- NA - } - } - - oD <- sprintf("%s/cutoff%i",pDir,cutoff) - dir.create(oD) - outFile <- sprintf("%s/predictionResults.txt",oD) - if (any(is.na(predRes))) { - cat("One or more groups had zero feature selected nets\n") - cat("# no feature-selected nets.\n",file=outFile) - }else { - predClass <- GM_OneVAll_getClass(predRes) - out <- merge(x=pheno_all,y=predClass,by="ID") - write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) - - acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) - cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", - nrow(out), acc*100)) - - require(ROCR) - ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, - out$STATUS=="SURVIVEYES") - save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) - } - } - } - - #cleanup to save disk space - system(sprintf("rm -r %s/dataset %s/tmp %s/networks", - outDir,outDir,outDir)) - system(sprintf("rm -r %s/dataset %s/networks", - outDir,outDir)) -} - pheno_all$TT_STATUS <- NA - rng_t1 <- Sys.time() - cat(sprintf("Time for one train/test split:")) - print(rng_t1-rng_t0) - -}, error=function(ex){ - print(ex) -}, finally={ - sink(NULL) -}) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/KIRC_eucscale_pipeline.R b/misc/PanCancer/pruneVersion/diff_kernels/KIRC_eucscale_pipeline.R deleted file mode 100644 index 4ddfa538..00000000 --- a/misc/PanCancer/pruneVersion/diff_kernels/KIRC_eucscale_pipeline.R +++ /dev/null @@ -1,121 +0,0 @@ -#' PanCancer binarized survival: KIRC: Feature selection with one net per -# datatype -#' 10-fold CV predictor design -rm(list=ls()) - -inDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/input" -outRoot <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" - -dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/eucclean_%s",outRoot,dt) - - -# ----------------------------------------------------------- -# process input -inFiles <- list( - clinical=sprintf("%s/KIRC_clinical_core.txt",inDir), - survival=sprintf("%s/KIRC_binary_survival.txt",inDir) - ) -datFiles <- list( - rna=sprintf("%s/KIRC_mRNA_core.txt",inDir), - prot=sprintf("%s/KIRC_RPPA_core.txt",inDir), - mir=sprintf("%s/KIRC_miRNA_core.txt",inDir), - dnam=sprintf("%s/KIRC_methylation_core.txt",inDir), - cnv=sprintf("%s/KIRC_CNV_core.txt",inDir) -) - -pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) -colnames(pheno)[1] <- "ID" - -#======transform clinical data========= -pheno$grade <- as.vector(pheno$grade) -pheno$grade[pheno$grade=="G1"] <- "G2" -pheno$grade[pheno$grade=="GX"] <- "G2" -pheno$grade <- as.factor(pheno$grade) -pheno <- pheno[, -which(colnames(pheno)=="gender")] -#====================================== - -surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) -colnames(surv)[1:2] <- c("ID","STATUS_INT") -survStr <- rep(NA,nrow(surv)) -survStr[surv$STATUS_INT<1] <- "SURVIVENO" -survStr[surv$STATUS_INT>0] <- "SURVIVEYES" -surv$STATUS <- survStr -pheno <- merge(x=pheno,y=surv,by="ID") -pheno$X <- NULL -# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) -pheno_nosurv <- pheno[1:4] - -cat("Collecting patient data:\n") -dats <- list() #input data in different slots -cat("\t* Clinical\n") -clinical <- pheno_nosurv -rownames(clinical) <- clinical[,1]; -clinical$grade <- as.numeric(factor(clinical$grade)) -clinical$stage <- as.numeric(factor(clinical$stage)) -clinical$ID <- NULL -clinical <- t(clinical) -dats$clinical <- clinical; rm(clinical) - -# create master input net -for (nm in names(datFiles)) { - cat(sprintf("\t* %s\n",nm)) - tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) - if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] - rownames(tmp) <- tmp[,1] - tmp <- t(tmp[,-1]) - class(tmp) <- "numeric" - dats[[nm]] <- tmp -} - -cat("\t Ordering column names\n") -# include only data for patients in classifier -dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) -dats <- lapply(dats, function(x) { - midx <- match(pheno$ID,colnames(x)) - x <- x[,midx] - x -}) - -# confirm patient order the same for all input nets -pname <- colnames(dats[[1]]) -for (k in 2:length(dats)) { - if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { - cat(sprintf("Patient order doesn't match for %s\n", - names(dats)[k])) - browser() - } -} - -# input nets for each category -netSets <- lapply(dats, function(x) rownames(x)) - -# compile data -alldat <- do.call("rbind",dats) -pheno_all <- pheno - -combList <- list( - clinical="clinical_cont", - mir="mir.profile", - rna="rna.profile", - prot="prot.profile", - cnv="cnv.profile", - dnam="dnam.profile", - clinicalArna=c("clinical_cont","rna.profile"), - clinicalAmir=c("clinical_cont","mir.profile"), - clinicalAprot=c("clinical_cont","prot.profile"), - clinicalAdnam=c("clinical_cont","dnam.profile"), - clinicalAcnv=c("clinical_cont","cnv.profile"), - all="all") - -rm(pheno,pheno_nosurv) -rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) - -# ----------------------------------------------------------- -# run predictor -source("PanCancer_eucscale.R") -runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, - dats=dats,pheno_all=pheno_all,megaDir=megaDir, - cutoffSet=9) - - diff --git a/misc/PanCancer/pruneVersion/diff_kernels/KIRC_univar_eucscale.R b/misc/PanCancer/pruneVersion/diff_kernels/KIRC_univar_eucscale.R deleted file mode 100644 index 730ce907..00000000 --- a/misc/PanCancer/pruneVersion/diff_kernels/KIRC_univar_eucscale.R +++ /dev/null @@ -1,372 +0,0 @@ -#' PanCancer binarized survival: KIRC: Feature selection with one net per -# datatype -#' 10-fold CV predictor design -rm(list=ls()) -require(netDx) -require(netDx.examples) -require(glmnet) # lasso for univariate filtering - -numCores <- 8L -GMmemory <- 4L -trainProp <- 0.8 -cutoff <- 9 -maxEdge <- 6000 ### max edge after sparsification - -inDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/input" -outRoot <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" - -dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/eucscale_sp2%i_%s",outRoot,maxEdge,dt) - -# ---------------------------------------------------------------- -# helper functions -# SNF similarity method. Euclidean distance followed by exponential -# scaling where sigma is tuned based on local data structure. -sim.eucscale <- function (dat, K = 20, alpha = 0.5) { -ztrans <- function(m) { - m <- as.matrix(m) - m2 <- apply(m,1,function(x) { (x-mean(x,na.rm=T))/sd(x,na.rm=T)}) - m2 -} -normalize <- function(X) { - print(dim(X)) - row.sum.mdiag <- rowSums(X,na.rm=T) - diag(X) - row.sum.mdiag[row.sum.mdiag == 0] <- 1 - X <- X/(2 * (row.sum.mdiag)) - diag(X) <- 0.5 - X <- (X+t(X))/2 - return(X) -} - nnodata <- which(abs(colSums(dat,na.rm=T)) < .Machine$double.eps) - #if (length(nodata)>0) dat[nodata] <- median(dat) # impute median - z1 <- ztrans(dat) - euc <- as.matrix(dist(z1,method="euclidean"))^(1/2) - N <- nrow(euc) - euc <- (euc + t(euc))/2 - sortedColumns <- as.matrix(t(apply(euc, 2, sort,na.last=TRUE))) - print(dim(sortedColumns)) - finiteMean <- function(x) { - return(mean(x[is.finite(x)],na.rm=T)) - } - means <- apply(sortedColumns[, 1:K + 1], 1, finiteMean) + - .Machine$double.eps - avg <- function(x, y) { - return((x + y)/2) - } - Sig <- outer(means, means, avg)/3 * 2 + euc/3 + .Machine$double.eps - Sig[Sig <= .Machine$double.eps] <- .Machine$double.eps - densities <- dnorm(euc, 0, alpha * Sig, log = FALSE) - - W <- (densities + t(densities))/2 - W <- normalize(W) - # remove patients with no datapoints (full column/row of NAs) - idx <- which(rowSums(is.na(euc))==ncol(W)-1) - if (any(idx)) { - W <- W[-idx,] - idx <- which(colSums(is.na(euc))==ncol(W)-1) - W <- W[,-idx] - } - return(W) -} - -# ----------------------------------------------------------- -# process input -inFiles <- list( - clinical=sprintf("%s/KIRC_clinical_core.txt",inDir), - survival=sprintf("%s/KIRC_binary_survival.txt",inDir) - ) -datFiles <- list( - rna=sprintf("%s/KIRC_mRNA_core.txt",inDir), - prot=sprintf("%s/KIRC_RPPA_core.txt",inDir), - mir=sprintf("%s/KIRC_miRNA_core.txt",inDir), - dnam=sprintf("%s/KIRC_methylation_core.txt",inDir), - cnv=sprintf("%s/KIRC_CNV_core.txt",inDir) -) - -pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) -colnames(pheno)[1] <- "ID" - -#======transform clinical data========= -pheno$grade <- as.vector(pheno$grade) -pheno$grade[pheno$grade=="G1"] <- "G2" -pheno$grade[pheno$grade=="GX"] <- "G2" -pheno$grade <- as.factor(pheno$grade) -pheno <- pheno[, -which(colnames(pheno)=="gender")] -#====================================== - -surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) -colnames(surv)[1:2] <- c("ID","STATUS_INT") -survStr <- rep(NA,nrow(surv)) -survStr[surv$STATUS_INT<1] <- "SURVIVENO" -survStr[surv$STATUS_INT>0] <- "SURVIVEYES" -surv$STATUS <- survStr -pheno <- merge(x=pheno,y=surv,by="ID") -pheno$X <- NULL -# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) -pheno_nosurv <- pheno[1:4] - -cat("Collecting patient data:\n") -dats <- list() #input data in different slots -cat("\t* Clinical\n") -clinical <- pheno_nosurv -rownames(clinical) <- clinical[,1]; -clinical$grade <- as.numeric(factor(clinical$grade)) -clinical$stage <- as.numeric(factor(clinical$stage)) -clinical$ID <- NULL -clinical <- t(clinical) -dats$clinical <- clinical; rm(clinical) - -# create master input net -for (nm in names(datFiles)) { - cat(sprintf("\t* %s\n",nm)) - tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) - if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] - rownames(tmp) <- tmp[,1] - tmp <- t(tmp[,-1]) - class(tmp) <- "numeric" - dats[[nm]] <- tmp -} - -cat("\t Ordering column names\n") -# include only data for patients in classifier -dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) -dats <- lapply(dats, function(x) { - midx <- match(pheno$ID,colnames(x)) - x <- x[,midx] - x -}) - -# confirm patient order the same for all input nets -pname <- colnames(dats[[1]]) -for (k in 2:length(dats)) { - if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { - cat(sprintf("Patient order doesn't match for %s\n", - names(dats)[k])) - browser() - } -} - -# input nets for each category -netSets <- lapply(dats, function(x) rownames(x)) - -# compile data -alldat <- do.call("rbind",dats) -pheno_all <- pheno - -combList <- list( - clinical="clinical_cont", - mir="mir.profile", - rna="rna.profile", - prot="prot.profile", - cnv="cnv.profile", - dnam="dnam.profile", - clinicalArna=c("clinical_cont","rna.profile"), - clinicalAmir=c("clinical_cont","mir.profile"), - clinicalAprot=c("clinical_cont","prot.profile"), - clinicalAdnam=c("clinical_cont","dnam.profile"), - clinicalAcnv=c("clinical_cont","cnv.profile"), - all="all") - -rm(pheno,pheno_nosurv) - - -# ---------------------------------------------------------- -# build classifier -if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) -dir.create(megaDir) - -logFile <- sprintf("%s/log.txt",megaDir) -sink(logFile,split=TRUE) -tryCatch({ - -# first loop - over train/test splits -mega_combList <- combList # changes each round -for (rngNum in 1:20) { - combList <- mega_combList # clean slate - rng_t0 <- Sys.time() - cat(sprintf("-------------------------------\n")) - cat(sprintf("RNG seed = %i\n", rngNum)) - cat(sprintf("-------------------------------\n")) - outDir <- sprintf("%s/rng%i",megaDir,rngNum) - dir.create(outDir) - - pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, - setSeed=rngNum*5) - write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", - col=T,row=F,quote=F) - # -------------------------------------------- - # feature selection - train only - pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") - - ## pruneTrain code ------ - dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), - drop=F]) - netSets_iter <- list() - #----------- - # Begin Lasso UF - for (nm in names(dats_train)) { - print(nm) - netSets_iter[[nm]] <- rownames(dats_train[[nm]]) - # run lasso with cv - fit.lasso <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), - y=factor(pheno$STATUS), family="binomial", alpha=1) - # pick lambda that minimizes MSE - wt <- abs(coef(fit.lasso,s="lambda.min")[,1]) - vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") - if (length(vars)>0) { - tmp <- dats_train[[nm]] - tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] - dats_train[[nm]] <- tmp - for (k in rownames(tmp)) { netSets_iter[[k]] <- k } - combList[[nm]] <- paste(sprintf("%s_cont", rownames(tmp))) - } else { - # leave dats_train as is, make a single net - netSets_iter[[nm]] <- rownames(dats_train[[nm]]) - combList[[nm]] <- sprintf("%s_cont",nm) - } - } - combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) - combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) - combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) - combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) - combList[["clinicalAprot"]] <- c(combList[["clinical"]],combList[["prot"]]) - # END lasso UF - # ---------------------- - alldat_train <- do.call("rbind",dats_train) - - netDir <- sprintf("%s/networks",outDir) - - netList <- makePSN_NamedMatrix(alldat_train, - rownames(alldat_train),netSets_iter,netDir, - simMetric="custom",customFunc=sim.eucscale, - writeProfiles=FALSE, - sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, - sparsify_edgeMax=maxEdge, - verbose=FALSE,numCores=numCores) - cat(sprintf("Total of %i nets\n", length(netList))) - # now create database - dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) - - # second loop - over combinations of input data - for (cur in names(combList)) { - t0 <- Sys.time() - cat(sprintf("%s\n",cur)) - pDir <- sprintf("%s/%s",outDir, cur) - dir.create(pDir) - - # run featsel once per subtype - subtypes <- unique(pheno$STATUS) - # run 10-fold cv per subtype - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) - dir.create(pDir2) - - cat(sprintf("\n******\nSubtype %s\n",g)) - pheno_subtype <- pheno - ## label patients not in the current class as residual - nong <- which(!pheno_subtype$STATUS %in% g) - pheno_subtype$STATUS[nong] <- "nonpred" - ## sanity check - print(table(pheno_subtype$STATUS,useNA="always")) - resDir <- sprintf("%s/GM_results",pDir2) - ## query for feature selection comprises of training - ## samples from the class of interest - trainPred <- pheno_subtype$ID[ - which(pheno_subtype$STATUS %in% g)] - - # Cross validation - GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, - nrow(pheno_subtype),incNets=combList[[cur]], - verbose=T, numCores=numCores, - GMmemory=GMmemory) - - # patient similarity ranks - prank <- dir(path=resDir,pattern="PRANK$") - # network ranks - nrank <- dir(path=resDir,pattern="NRANK$") - cat(sprintf("Got %i prank files\n",length(prank))) - - # Compute network score - pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) - head(pTally) - # write to file - tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) - write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) - } - ## pruneTrain: make test database - ## This will be used to predict test samples by subsetting just for feature - ## selected nets in a given round - ## Note that this is useful for all train/test splits because we can always - ## change which samples are query and can always subset based on which nets - ## are feature selected in a given round. - netDir <- sprintf("%s/test_networks",outDir) - - netList <- makePSN_NamedMatrix(alldat, - rownames(alldat),netSets_iter,netDir, - simMetric="custom",customFunc=sim.eucscale, - writeProfiles=FALSE, - sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, - sparsify_edgeMax=maxEdge, - verbose=TRUE,numCores=numCores) - cat(sprintf("Total of %i nets\n", length(netList))) - # now create database - testdbDir <- GM_createDB(netDir, pheno_all$ID, - outDir,numCores=numCores) - - predRes <- list() - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - # get feature selected net names - pTally <- read.delim( - sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), - sep="\t",h=T,as.is=T) - - # feature selected nets pass cutoff threshold - pTally <- pTally[which(pTally[,2]>=cutoff),1] - cat(sprintf("%s: %i pathways\n",g,length(pTally))) - - # query of all training samples for this class - qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & - pheno_all$TT_STATUS%in%"TRAIN")] - - qFile <- sprintf("%s/%s_query",pDir2,g) - GM_writeQueryFile(qSamps,incNets=pTally, - nrow(pheno_all),qFile) - resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) - predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), - pheno_all,g) - } - - predClass <- GM_OneVAll_getClass(predRes) - out <- merge(x=pheno_all,y=predClass,by="ID") - outFile <- sprintf("%s/predictionResults.txt",pDir) - write.table(out,file=outFile,sep="\t",col=T,row=F, - quote=F) - - acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) - cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", - nrow(out), acc*100)) - - require(ROCR) - ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, - out$STATUS=="SURVIVEYES") - save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) - } - - #cleanup to save disk space - system(sprintf("rm -r %s/dataset %s/tmp %s/networks", - outDir,outDir,outDir)) - system(sprintf("rm -r %s/dataset %s/networks", - outDir,outDir)) -} - pheno_all$TT_STATUS <- NA - rng_t1 <- Sys.time() - cat(sprintf("Time for one train/test split:")) - print(rng_t1-rng_t0) - -}, error=function(ex){ - print(ex) -}, finally={ - sink(NULL) -}) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/LUSC_eucscale_pipeline.R b/misc/PanCancer/pruneVersion/diff_kernels/LUSC_eucscale_pipeline.R deleted file mode 100644 index 63e20923..00000000 --- a/misc/PanCancer/pruneVersion/diff_kernels/LUSC_eucscale_pipeline.R +++ /dev/null @@ -1,119 +0,0 @@ -#' PanCancer binarized survival: LUSC: Feature selection with one net per -#' datatype -#' 10-fold CV predictor design - -rm(list=ls()) - -inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" -outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" - -dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/eucclean_%s",outRoot,dt) - -# ----------------------------------------------------------- -# process input -inFiles <- list( - clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), - survival=sprintf("%s/LUSC_binary_survival.txt",inDir) - ) -datFiles <- list( - rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), - prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), - mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), - cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) -) - -pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) -colnames(pheno)[1] <- "ID" - -surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) -colnames(surv)[1:2] <- c("ID","STATUS_INT") -survStr <- rep(NA,nrow(surv)) -survStr[surv$STATUS_INT<1] <- "SURVIVENO" -survStr[surv$STATUS_INT>0] <- "SURVIVEYES" -surv$STATUS <- survStr -pheno <- merge(x=pheno,y=surv,by="ID") -pheno$X <- NULL - -cat("Collecting patient data:\n") -dats <- list() #input data in different slots -cat("\t* Clinical\n") -clinical <- pheno -rownames(clinical) <- clinical[,1]; -# ======================= -# LUSC-specific variables -clinical$stage <- as.vector(clinical$stage) -clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" -clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" -clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" -clinical$stage <- as.factor(clinical$stage) -clinical <- clinical[, -which(colnames(clinical)=="gender")] -clinical <- t(clinical[,c("age","stage")]) -clinical[1,] <- as.integer(clinical[1,]) -clinical[2,] <- as.integer(as.factor(clinical[2,])) -class(clinical) <- "numeric" -# ======================= -dats$clinical <- clinical; rm(clinical) - -# create master input net -for (nm in names(datFiles)) { - cat(sprintf("\t* %s\n",nm)) - tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) - if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] - rownames(tmp) <- tmp[,1] - tmp <- t(tmp[,-1]) - class(tmp) <- "numeric" - if (nm == "rna") tmp <- log(tmp+1) - dats[[nm]] <- tmp -} - -cat("\t Ordering column names\n") -# include only data for patients in classifier -dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) -dats <- lapply(dats, function(x) { - midx <- match(pheno$ID,colnames(x)) - x <- x[,midx] - x -}) - -# confirm patient order the same for all input nets -pname <- colnames(dats[[1]]) -for (k in 2:length(dats)) { - if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { - cat(sprintf("Patient order doesn't match for %s\n", - names(dats)[k])) - browser() - } -} - -# input nets for each category -netSets <- lapply(dats, function(x) rownames(x)) - -# compile data -alldat <- do.call("rbind",dats) -pheno_all <- pheno - -combList <- list( - clinicalArna=c("clinical_cont","rna.profile"), - clinicalAprot=c("clinical_cont","prot.profile"), - clinical="clinical_cont", - mir="mir.profile", - rna="rna.profile", - prot="prot.profile", - cnv="cnv.profile", - clinicalAmir=c("clinical_cont","mir.profile"), - clinicalAcnv=c("clinical_cont","cnv.profile"), - all="all" -) - -cat(sprintf("Clinical variables are: { %s }\n", - paste(rownames(dats$clinical),sep=",",collapse=","))) -rm(pheno) -rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) - -# ----------------------------------------------------------- -# run predictor -source("PanCancer_eucscale.R") -runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, - dats=dats,pheno_all=pheno_all,megaDir=megaDir, - cutoffSet=9) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/LUSC_univar_eucscale.R b/misc/PanCancer/pruneVersion/diff_kernels/LUSC_univar_eucscale.R deleted file mode 100644 index 1e4319ac..00000000 --- a/misc/PanCancer/pruneVersion/diff_kernels/LUSC_univar_eucscale.R +++ /dev/null @@ -1,393 +0,0 @@ -#' PanCancer binarized survival: LUSC: Feature selection with one net per -#' datatype -#' 10-fold CV predictor design - -rm(list=ls()) -require(netDx) -require(netDx.examples) -require(glmnet) - -numCores <- 8L -GMmemory <- 4L -trainProp <- 0.8 -maxEdge <- 6000 - -inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" -outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" - -dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/eucscale_sp2%i_%s",outRoot,maxEdge,dt) - -# ---------------------------------------------------------------- - -# SNF similarity method. Euclidean distance followed by exponential -# scaling where sigma is tuned based on local data structure. -sim.eucscale <- function (dat, K = 20, alpha = 0.5) { -ztrans <- function(m) { - m <- as.matrix(m) - m2 <- apply(m,1,function(x) { (x-mean(x,na.rm=T))/sd(x,na.rm=T)}) - m2 -} -normalize <- function(X) { - print(dim(X)) - row.sum.mdiag <- rowSums(X,na.rm=T) - diag(X) - row.sum.mdiag[row.sum.mdiag == 0] <- 1 - X <- X/(2 * (row.sum.mdiag)) - diag(X) <- 0.5 - X <- (X+t(X))/2 - return(X) -} - z1 <- ztrans(dat) - euc <- as.matrix(dist(z1,method="euclidean"))^(1/2) - N <- nrow(euc) - euc <- (euc + t(euc))/2 - sortedColumns <- as.matrix(t(apply(euc, 2, sort,na.last=TRUE))) - finiteMean <- function(x) { - return(mean(x[is.finite(x)],na.rm=T)) - } - means <- apply(sortedColumns[, 1:K + 1], 1, finiteMean) + - .Machine$double.eps - avg <- function(x, y) { - return((x + y)/2) - } - Sig <- outer(means, means, avg)/3 * 2 + euc/3 + .Machine$double.eps - Sig[Sig <= .Machine$double.eps] <- .Machine$double.eps - densities <- dnorm(euc, 0, alpha * Sig, log = FALSE) - - W <- (densities + t(densities))/2 - W <- normalize(W) - # remove patients with no datapoints (full column/row of NAs) - idx <- which(rowSums(is.na(euc))==ncol(W)-1) - if (any(idx)) { - W <- W[-idx,] - idx <- which(colSums(is.na(euc))==ncol(W)-1) - W <- W[,-idx] - } - return(W) -} - -# ----------------------------------------------------------- -# process input -inFiles <- list( - clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), - survival=sprintf("%s/LUSC_binary_survival.txt",inDir) - ) -datFiles <- list( - rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), - prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), - mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), - cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) -) - -pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) -colnames(pheno)[1] <- "ID" - -surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) -colnames(surv)[1:2] <- c("ID","STATUS_INT") -survStr <- rep(NA,nrow(surv)) -survStr[surv$STATUS_INT<1] <- "SURVIVENO" -survStr[surv$STATUS_INT>0] <- "SURVIVEYES" -surv$STATUS <- survStr -pheno <- merge(x=pheno,y=surv,by="ID") -pheno$X <- NULL - -cat("Collecting patient data:\n") -dats <- list() #input data in different slots -cat("\t* Clinical\n") -clinical <- pheno -rownames(clinical) <- clinical[,1]; -# ======================= -# LUSC-specific variables -clinical$stage <- as.vector(clinical$stage) -clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" -clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" -clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" -clinical$stage <- as.factor(clinical$stage) -clinical <- clinical[, -which(colnames(clinical)=="gender")] -clinical <- t(clinical[,c("age","stage")]) -clinical[1,] <- as.integer(clinical[1,]) -clinical[2,] <- as.integer(as.factor(clinical[2,])) -class(clinical) <- "numeric" -# ======================= -dats$clinical <- clinical; rm(clinical) - -# create master input net -for (nm in names(datFiles)) { - cat(sprintf("\t* %s\n",nm)) - tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) - if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] - rownames(tmp) <- tmp[,1] - tmp <- t(tmp[,-1]) - class(tmp) <- "numeric" - if (nm == "rna") tmp <- log(tmp+1) - dats[[nm]] <- tmp -} - -cat("\t Ordering column names\n") -# include only data for patients in classifier -dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) -dats <- lapply(dats, function(x) { - midx <- match(pheno$ID,colnames(x)) - x <- x[,midx] - x -}) - -# confirm patient order the same for all input nets -pname <- colnames(dats[[1]]) -for (k in 2:length(dats)) { - if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { - cat(sprintf("Patient order doesn't match for %s\n", - names(dats)[k])) - browser() - } -} - -# input nets for each category -netSets <- lapply(dats, function(x) rownames(x)) - -# compile data -alldat <- do.call("rbind",dats) -pheno_all <- pheno - -combList <- list( - clinicalArna=c("clinical_cont","rna.profile"), - clinicalAprot=c("clinical_cont","prot.profile"), - clinical="clinical_cont", - mir="mir.profile", - rna="rna.profile", - prot="prot.profile", - cnv="cnv.profile", - clinicalAmir=c("clinical_cont","mir.profile"), - clinicalAcnv=c("clinical_cont","cnv.profile"), - all="all" -) - -cat(sprintf("Clinical variables are: { %s }\n", - paste(rownames(dats$clinical),sep=",",collapse=","))) -rm(pheno) - -# ---------------------------------------------------------- -# build classifier -if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) -dir.create(megaDir) - -logFile <- sprintf("%s/log.txt",megaDir) -sink(logFile,split=TRUE) -tryCatch({ - -mega_combList <- combList # this will change in each round - -# first loop - over train/test splits -for (rngNum in 1:20) { - combList <- mega_combList # clean slate - - rng_t0 <- Sys.time() - cat(sprintf("-------------------------------\n")) - cat(sprintf("RNG seed = %i\n", rngNum)) - cat(sprintf("-------------------------------\n")) - outDir <- sprintf("%s/rng%i",megaDir,rngNum) - dir.create(outDir) - - pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, - setSeed=rngNum*5) - write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", - col=T,row=F,quote=F) - # -------------------------------------------- - # feature selection - train only - pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") - dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), - drop=F]) - ## pruneTrain code ------ - dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), - drop=F]) - netSets_iter <- list() - #----------- - # Begin Lasso UF - for (nm in names(dats_train)){ - print(nm) - # run lasso with cv - fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), - y=factor(pheno$STATUS), family="binomial", alpha=1) - # pick lambda that minimizes MSE - wt <- abs(coef(fit,s="lambda.min")[,1]) - vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") - cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) - - if (length(vars)>0) { - tmp <- dats_train[[nm]] - tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] - dats_train[[nm]] <- tmp - for (k in rownames(tmp)) { netSets_iter[[k]] <- k } - combList[[nm]] <- paste(sprintf("%s_cont", rownames(tmp))) - } else { - # leave dats_train as is, make a single net - netSets_iter[[nm]] <- rownames(dats_train[[nm]]) - combList[[nm]] <- sprintf("%s_cont",nm) - } - } - - combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) - combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) - combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) - combList[["clinicalAprot"]] <- c(combList[["clinical"]],combList[["prot"]]) - - # END lasso UF - # ---------------------- - - alldat_train <- do.call("rbind",dats_train) - - netDir <- sprintf("%s/networks",outDir) - -# ------------------------------- -# make train db -cat(sprintf("Making test nets for rng%i\n", rngNum)) -netList <- makePSN_NamedMatrix(alldat_train, - rownames(alldat_train),netSets_iter,netDir, - simMetric="custom",customFunc=sim.eucscale, - writeProfiles=FALSE, - sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, - sparsify_edgeMax=maxEdge, - verbose=FALSE,numCores=numCores) - - cat(sprintf("Total of %i nets\n", length(netList))) - # now create database - dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) -# ------------------------------- - - # second loop - over combinations of input data - for (cur in names(combList)) { - t0 <- Sys.time() - cat(sprintf("CombList = %s\n",cur)) - pDir <- sprintf("%s/%s",outDir, cur) - dir.create(pDir) - - # run featsel once per subtype - subtypes <- unique(pheno$STATUS) - # run 10-fold cv per subtype - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) - dir.create(pDir2) - - cat(sprintf("\n******\nSubtype %s\n",g)) - pheno_subtype <- pheno - ## label patients not in the current class as residual - nong <- which(!pheno_subtype$STATUS %in% g) - pheno_subtype$STATUS[nong] <- "nonpred" - ## sanity check - print(table(pheno_subtype$STATUS,useNA="always")) - resDir <- sprintf("%s/GM_results",pDir2) - ## query for feature selection comprises of training - ## samples from the class of interest - trainPred <- pheno_subtype$ID[ - which(pheno_subtype$STATUS %in% g)] - - # Cross validation - GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, - nrow(pheno_subtype),incNets=combList[[cur]], - verbose=T, numCores=numCores, - GMmemory=GMmemory) - - # patient similarity ranks - prank <- dir(path=resDir,pattern="PRANK$") - # network ranks - nrank <- dir(path=resDir,pattern="NRANK$") - cat(sprintf("Got %i prank files\n",length(prank))) - - # Compute network score - pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) - head(pTally) - # write to file - tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) - write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) - } - - netDir <- sprintf("%s/test_networks",megaDir) - -# ------------------------------- -# make test db - netList <- makePSN_NamedMatrix(alldat, - rownames(alldat),netSets_iter,netDir, - simMetric="custom",customFunc=sim.eucscale, - writeProfiles=FALSE, - sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, - sparsify_edgeMax=maxEdge, - verbose=TRUE,numCores=numCores) - - cat(sprintf("Total of %i nets\n", length(netList))) - - # now create database - megadbDir <- GM_createDB(netDir, pheno_all$ID, - megaDir,numCores=numCores) -# ------------------------------- - - for (cutoff in 7:9) { - predRes <- list() - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - # get feature selected net names - pTally <- read.delim( - sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), - sep="\t",h=T,as.is=T) - - # feature selected nets pass cutoff threshold - pTally <- pTally[which(pTally[,2]>=cutoff),1] - cat(sprintf("%s: %i pathways\n",g,length(pTally))) - - if (length(pTally)>=1) { - curD <- sprintf("%s/cutoff%i",pDir2,cutoff) - dir.create(curD) - # query of all training samples for this class - qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & - pheno_all$TT_STATUS%in%"TRAIN")] - - qFile <- sprintf("%s/%s_query",curD,g) - # only include the nets that were feature selected - GM_writeQueryFile(qSamps,incNets=pTally, - nrow(pheno_all),qFile) - resFile <- runGeneMANIA(megadbDir$dbDir,qFile,resDir=curD) - predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), - pheno_all,g) - } else { - predRes[[g]] <- NA - } - } - oD <- sprintf("%s/cutoff%i",pDir,cutoff) - dir.create(oD) - outFile <- sprintf("%s/predictionResults.txt",oD) - if (any(is.na(predRes))) { - cat("One or more groups had zero feature selected nets\n") - cat("# no feature-selected nets.\n",file=outFile) - } else { - predClass <- GM_OneVAll_getClass(predRes) - out <- merge(x=pheno_all,y=predClass,by="ID") - write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) - - acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) - cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", - nrow(out), acc*100)) - - require(ROCR) - ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, - out$STATUS=="SURVIVEYES") - save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) - } - } -} - #cleanup to save disk space - system(sprintf("rm -r %s/dataset %s/tmp %s/networks", - outDir,outDir,outDir)) - system(sprintf("rm -r %s/dataset %s/networks", - outDir,outDir)) - -} - pheno_all$TT_STATUS <- NA - rng_t1 <- Sys.time() - cat(sprintf("Time for one train/test split:")) - print(rng_t1-rng_t0) - -}, error=function(ex){ - print(ex) -}, finally={ - sink(NULL) -}) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/OV_eucscale_pipeline.R b/misc/PanCancer/pruneVersion/diff_kernels/OV_eucscale_pipeline.R deleted file mode 100644 index 9a3e1c1b..00000000 --- a/misc/PanCancer/pruneVersion/diff_kernels/OV_eucscale_pipeline.R +++ /dev/null @@ -1,113 +0,0 @@ -#' PanCancer binarized survival: KIRC: Feature selection with one net per -# datatype -#' 10-fold CV predictor design -rm(list=ls()) - - -inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/input" -outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output" - -dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/eucscale_%s",outRoot,dt) - -# ----------------------------------------------------------- -# process input -inFiles <- list( - clinical=sprintf("%s/OV_clinical_core.txt",inDir), - survival=sprintf("%s/OV_binary_survival.txt",inDir) - ) -datFiles <- list( - rna=sprintf("%s/OV_mRNA_core.txt",inDir), - prot=sprintf("%s/OV_RPPA_core.txt",inDir), - mir=sprintf("%s/OV_miRNA_core.txt",inDir), - dnam=sprintf("%s/OV_methylation_core.txt",inDir), - cnv=sprintf("%s/OV_CNV_core.txt",inDir) -) - -pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) -colnames(pheno)[1] <- "ID" - -surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) -colnames(surv)[1:2] <- c("ID","STATUS_INT") -survStr <- rep(NA,nrow(surv)) -survStr[surv$STATUS_INT<1] <- "SURVIVENO" -survStr[surv$STATUS_INT>0] <- "SURVIVEYES" -surv$STATUS <- survStr -pheno <- merge(x=pheno,y=surv,by="ID") -pheno$X <- NULL -# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) -pheno_nosurv <- pheno[1:4] - -cat("Collecting patient data:\n") -dats <- list() #input data in different slots -cat("\t* Clinical\n") -clin <- pheno -rownames(clin) <- clin[,1]; -clin <- t(clin[,2,drop=FALSE]) -dats$clinical <- clin; rm(clin) - -# create master input net -for (nm in names(datFiles)) { - cat(sprintf("\t* %s\n",nm)) - tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) - if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] - rownames(tmp) <- tmp[,1] - tmp <- t(tmp[,-1]) - class(tmp) <- "numeric" - dats[[nm]] <- tmp -} - -cat("\t Ordering column names\n") -# include only data for patients in classifier -dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID), drop = FALSE]}) -dats <- lapply(dats, function(x) { - midx <- match(pheno$ID,colnames(x)) - x <- x[,midx, drop = FALSE] - x -}) - - -# confirm patient order the same for all input nets -pname <- colnames(dats[[1]]) -for (k in 2:length(dats)) { - if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { - cat(sprintf("Patient order doesn't match for %s\n", - names(dats)[k])) - browser() - } -} - -# input nets for each category -netSets <- lapply(dats, function(x) rownames(x)) - -# compile data -alldat <- do.call("rbind",dats) -pheno_all <- pheno - -combList <- list( - clinical="clinical_cont", - mir="mir.profile", - rna="rna.profile", - prot="prot.profile", - cnv="cnv.profile", - dnam="dnam.profile", - clinicalArna=c("clinical_cont","rna.profile"), - clinicalAmir=c("clinical_cont","mir.profile"), - clinicalAprot=c("clinical_cont","prot.profile"), - clinicalAdnam=c("clinical_cont","dnam.profile"), - clinicalAcnv=c("clinical_cont","cnv.profile"), - all="all") - -rm(pheno,pheno_nosurv) -rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) - -# ----------------------------------------------------------- -# run predictor -source("PanCancer_eucscale.R") -runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, - dats=dats,pheno_all=pheno_all,megaDir=megaDir, - cutoffSet=9) - - - - diff --git a/misc/PanCancer/pruneVersion/diff_kernels/OV_univar_eucscale.R b/misc/PanCancer/pruneVersion/diff_kernels/OV_univar_eucscale.R deleted file mode 100644 index 512301ca..00000000 --- a/misc/PanCancer/pruneVersion/diff_kernels/OV_univar_eucscale.R +++ /dev/null @@ -1,363 +0,0 @@ -#' PanCancer binarized survival: KIRC: Feature selection with one net per -# datatype -#' 10-fold CV predictor design -rm(list=ls()) -require(netDx) -require(netDx.examples) -require(glmnet) - -numCores <- 8L -GMmemory <- 4L -trainProp <- 0.8 -cutoff <- 8 -maxEdge <- 6000 ### max edge after sparsification - -inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/input" -outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output" - -dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/eucscale_sp2%i_%s",outRoot,maxEdge,dt) - -# ---------------------------------------------------------------- -# helper functions - -# normalized difference -# SNF similarity method. Euclidean distance followed by exponential -# scaling where sigma is tuned based on local data structure. -sim.eucscale <- function (dat, K = 20, alpha = 0.5) { -ztrans <- function(m) { - m <- as.matrix(m) - m2 <- apply(m,1,function(x) { (x-mean(x,na.rm=T))/sd(x,na.rm=T)}) - m2 -} -normalize <- function(X) { - print(dim(X)) - row.sum.mdiag <- rowSums(X,na.rm=T) - diag(X) - row.sum.mdiag[row.sum.mdiag == 0] <- 1 - X <- X/(2 * (row.sum.mdiag)) - diag(X) <- 0.5 - X <- (X+t(X))/2 - return(X) -} - z1 <- ztrans(dat) - euc <- as.matrix(dist(z1,method="euclidean"))^(1/2) - N <- nrow(euc) - euc <- (euc + t(euc))/2 - sortedColumns <- as.matrix(t(apply(euc, 2, sort,na.last=TRUE))) - print(dim(sortedColumns)) - finiteMean <- function(x) { - return(mean(x[is.finite(x)],na.rm=T)) - } - means <- apply(sortedColumns[, 1:K + 1], 1, finiteMean) + - .Machine$double.eps - avg <- function(x, y) { - return((x + y)/2) - } - Sig <- outer(means, means, avg)/3 * 2 + euc/3 + .Machine$double.eps - Sig[Sig <= .Machine$double.eps] <- .Machine$double.eps - densities <- dnorm(euc, 0, alpha * Sig, log = FALSE) - - W <- (densities + t(densities))/2 - W <- normalize(W) - # remove patients with no datapoints (full column/row of NAs) - idx <- which(rowSums(is.na(euc))==ncol(W)-1) - if (any(idx)) { - W <- W[-idx,] - idx <- which(colSums(is.na(euc))==ncol(W)-1) - W <- W[,-idx] - } - return(W) -} -# ----------------------------------------------------------- -# process input -inFiles <- list( - clinical=sprintf("%s/OV_clinical_core.txt",inDir), - survival=sprintf("%s/OV_binary_survival.txt",inDir) - ) -datFiles <- list( - rna=sprintf("%s/OV_mRNA_core.txt",inDir), - prot=sprintf("%s/OV_RPPA_core.txt",inDir), - mir=sprintf("%s/OV_miRNA_core.txt",inDir), - dnam=sprintf("%s/OV_methylation_core.txt",inDir), - cnv=sprintf("%s/OV_CNV_core.txt",inDir) -) - -pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) -colnames(pheno)[1] <- "ID" - -surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) -colnames(surv)[1:2] <- c("ID","STATUS_INT") -survStr <- rep(NA,nrow(surv)) -survStr[surv$STATUS_INT<1] <- "SURVIVENO" -survStr[surv$STATUS_INT>0] <- "SURVIVEYES" -surv$STATUS <- survStr -pheno <- merge(x=pheno,y=surv,by="ID") -pheno$X <- NULL -# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) -pheno_nosurv <- pheno[1:4] - -cat("Collecting patient data:\n") -dats <- list() #input data in different slots -cat("\t* Clinical\n") -clin <- pheno -rownames(clin) <- clin[,1]; -clin <- t(clin[,2,drop=FALSE]) -dats$clinical <- clin; rm(clin) - -# create master input net -for (nm in names(datFiles)) { - cat(sprintf("\t* %s\n",nm)) - tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) - if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] - rownames(tmp) <- tmp[,1] - tmp <- t(tmp[,-1]) - class(tmp) <- "numeric" - dats[[nm]] <- tmp -} - -cat("\t Ordering column names\n") -# include only data for patients in classifier -dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID), drop = FALSE]}) -dats <- lapply(dats, function(x) { - midx <- match(pheno$ID,colnames(x)) - x <- x[,midx, drop = FALSE] - x -}) - - -# confirm patient order the same for all input nets -pname <- colnames(dats[[1]]) -for (k in 2:length(dats)) { - if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { - cat(sprintf("Patient order doesn't match for %s\n", - names(dats)[k])) - browser() - } -} - -# input nets for each category -netSets <- lapply(dats, function(x) rownames(x)) - -# compile data -alldat <- do.call("rbind",dats) -pheno_all <- pheno - -combList <- list( - clinical="clinical_cont", - mir="mir.profile", - rna="rna.profile", - prot="prot.profile", - cnv="cnv.profile", - dnam="dnam.profile", - clinicalArna=c("clinical_cont","rna.profile"), - clinicalAmir=c("clinical_cont","mir.profile"), - clinicalAprot=c("clinical_cont","prot.profile"), - clinicalAdnam=c("clinical_cont","dnam.profile"), - clinicalAcnv=c("clinical_cont","cnv.profile"), - all="all") - -rm(pheno,pheno_nosurv) - - -# ---------------------------------------------------------- -# build classifier -if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) -dir.create(megaDir) - -logFile <- sprintf("%s/log.txt",megaDir) -sink(logFile,split=TRUE) -tryCatch({ - - -# first loop - over train/test splits -mega_combList <- combList -for (rngNum in 1:20) { - combList <- mega_combList - rng_t0 <- Sys.time() - cat(sprintf("-------------------------------\n")) - cat(sprintf("RNG seed = %i\n", rngNum)) - cat(sprintf("-------------------------------\n")) - outDir <- sprintf("%s/rng%i",megaDir,rngNum) - dir.create(outDir) - - pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, - setSeed=rngNum*5) - write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", - col=T,row=F,quote=F) - # -------------------------------------------- - # feature selection - train only - pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") - dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), - drop=F]) - netSets_iter <- list() - #----------- - # Begin Lasso UF - for (nm in names(dats_train)) { - print(nm) - netSets_iter[[nm]] <- rownames(dats_train[[nm]]) - # run lasso with cv - if (nrow(dats_train[[nm]])<2) # clinical only has one var, take it. - vars <- rownames(dats_train[[nm]]) - else { - fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), - y=factor(pheno$STATUS), family="binomial", alpha=1) - # pick lambda that minimizes MSE - wt <- abs(coef(fit,s="lambda.min")[,1]) - vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") - } - if (length(vars)>0) { - tmp <- dats_train[[nm]] - tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] - dats_train[[nm]] <- tmp - for (k in rownames(tmp)) { netSets_iter[[k]] <- k } - combList[[nm]] <- paste(sprintf("%s_cont", rownames(tmp))) - } else { - # leave dats_train as is, make a single net - netSets_iter[[nm]] <- rownames(dats_train[[nm]]) - combList[[nm]] <- sprintf("%s_cont",nm) - } - } - combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) - combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) - combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) - combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) - combList[["clinicalAprot"]] <- c(combList[["clinical"]],combList[["prot"]]) - # END lasso UF - # ---------------------- - alldat_train <- do.call("rbind",dats_train) - netDir <- sprintf("%s/networks",outDir) - - netList <- makePSN_NamedMatrix(alldat_train, - rownames(alldat_train),netSets_iter,netDir, - simMetric="custom",customFunc=sim.eucscale, - writeProfiles=FALSE, - sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, - sparsify_edgeMax=maxEdge, - verbose=FALSE,numCores=numCores) - cat(sprintf("Total of %i nets\n", length(netList))) - - # now create database - dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) - - # second loop - over combinations of input data - for (cur in names(combList)) { - t0 <- Sys.time() - cat(sprintf("%s\n",cur)) - pDir <- sprintf("%s/%s",outDir, cur) - dir.create(pDir) - - # run featsel once per subtype - subtypes <- unique(pheno$STATUS) - # run 10-fold cv per subtype - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) - dir.create(pDir2) - - cat(sprintf("\n******\nSubtype %s\n",g)) - pheno_subtype <- pheno - ## label patients not in the current class as residual - nong <- which(!pheno_subtype$STATUS %in% g) - pheno_subtype$STATUS[nong] <- "nonpred" - ## sanity check - print(table(pheno_subtype$STATUS,useNA="always")) - resDir <- sprintf("%s/GM_results",pDir2) - ## query for feature selection comprises of training - ## samples from the class of interest - trainPred <- pheno_subtype$ID[ - which(pheno_subtype$STATUS %in% g)] - - # Cross validation - GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, - nrow(pheno_subtype),incNets=combList[[cur]], - verbose=T, numCores=numCores, - GMmemory=GMmemory) - - # patient similarity ranks - prank <- dir(path=resDir,pattern="PRANK$") - # network ranks - nrank <- dir(path=resDir,pattern="NRANK$") - cat(sprintf("Got %i prank files\n",length(prank))) - - # Compute network score - pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) - head(pTally) - # write to file - tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) - write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) - } - - # -------- - # pruneTrain: make test database - netDir <- sprintf("%s/test_networks",outDir) - - netList <- makePSN_NamedMatrix(alldat, - rownames(alldat),netSets_iter,netDir, - simMetric="custom",customFunc=sim.eucscale, - writeProfiles=FALSE, - sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, - sparsify_edgeMax=maxEdge, - verbose=TRUE,numCores=numCores) - cat(sprintf("Total of %i nets\n", length(netList))) - # now create database - testdbDir <- GM_createDB(netDir, pheno_all$ID, - outDir,numCores=numCores) - - predRes <- list() - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - # get feature selected net names - pTally <- read.delim( - sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), - sep="\t",h=T,as.is=T) - - # feature selected nets pass cutoff threshold - pTally <- pTally[which(pTally[,2]>=cutoff),1] - # pTally <- sub(".profile","",pTally) - # pTally <- sub("_cont","",pTally) - cat(sprintf("%s: %i pathways\n",g,length(pTally))) - - # query of all training samples for this class - qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & - pheno_all$TT_STATUS%in%"TRAIN")] - - qFile <- sprintf("%s/%s_query",pDir2,g) - GM_writeQueryFile(qSamps,incNets=pTally - ,nrow(pheno_all),qFile) - resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) - predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), - pheno_all,g) - } - - predClass <- GM_OneVAll_getClass(predRes) - out <- merge(x=pheno_all,y=predClass,by="ID") - outFile <- sprintf("%s/predictionResults.txt",pDir) - write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) - - acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) - cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", - nrow(out), acc*100)) - - require(ROCR) - ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, - out$STATUS=="SURVIVEYES") - save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) - } - - #cleanup to save disk space - system(sprintf("rm -r %s/dataset %s/tmp %s/networks", - outDir,outDir,outDir)) - system(sprintf("rm -r %s/dataset %s/networks", - outDir,outDir)) - -} - pheno_all$TT_STATUS <- NA - rng_t1 <- Sys.time() - cat(sprintf("Time for one train/test split:")) - print(rng_t1-rng_t0) - -}, error=function(ex){ - print(ex) -}, finally={ - sink(NULL) -}) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/PanCancer_eucscale.R b/misc/PanCancer/pruneVersion/diff_kernels/PanCancer_eucscale.R deleted file mode 100644 index 150a2461..00000000 --- a/misc/PanCancer/pruneVersion/diff_kernels/PanCancer_eucscale.R +++ /dev/null @@ -1,274 +0,0 @@ -#' PanCancer predictor: univariate filtering by lasso + gene-level nets -#' similarity by Euclidean distance + local scaling - -# ---------------------------------------------------------------- -# helper functions - -# SNF similarity method. Euclidean distance followed by exponential -# scaling where sigma is tuned based on local data structure. -sim.eucscale <- function (dat, K = 20, alpha = 0.5) { -ztrans <- function(m) { - m <- as.matrix(m) - m2 <- apply(m,1,function(x) { (x-mean(x,na.rm=T))/sd(x,na.rm=T)}) - m2 -} -normalize <- function(X) { - print(dim(X)) - row.sum.mdiag <- rowSums(X,na.rm=T) - diag(X) - row.sum.mdiag[row.sum.mdiag == 0] <- 1 - X <- X/(2 * (row.sum.mdiag)) - diag(X) <- 0.5 - X <- (X+t(X))/2 - return(X) -} - nnodata <- which(abs(colSums(dat,na.rm=T)) < .Machine$double.eps) - #if (length(nodata)>0) dat[nodata] <- median(dat) # impute median - z1 <- ztrans(dat) - euc <- as.matrix(dist(z1,method="euclidean"))^(1/2) - N <- nrow(euc) - euc <- (euc + t(euc))/2 - sortedColumns <- as.matrix(t(apply(euc, 2, sort,na.last=TRUE))) - print(dim(sortedColumns)) - finiteMean <- function(x) { - return(mean(x[is.finite(x)],na.rm=T)) - } - means <- apply(sortedColumns[, 1:K + 1], 1, finiteMean) + - .Machine$double.eps - avg <- function(x, y) { - return((x + y)/2) - } - Sig <- outer(means, means, avg)/3 * 2 + euc/3 + .Machine$double.eps - Sig[Sig <= .Machine$double.eps] <- .Machine$double.eps - densities <- dnorm(euc, 0, alpha * Sig, log = FALSE) - - W <- (densities + t(densities))/2 - W <- normalize(W) - # remove patients with no datapoints (full column/row of NAs) - idx <- which(rowSums(is.na(euc))==ncol(W)-1) - if (any(idx)) { - W <- W[-idx,] - idx <- which(colSums(is.na(euc))==ncol(W)-1) - W <- W[,-idx] - } - return(W) -} - -# ---------------------------------------------------------------- -runPredictor <- function(mega_combList,rngVals,netSets,dats,pheno_all,megaDir, - cutoffSet) { -require(netDx) -require(netDx.examples) -require(glmnet) - -numCores <- 8L -GMmemory <- 4L -trainProp <- 0.8 -maxEdge <- 6000 ### max edge after sparsification - -if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) -dir.create(megaDir) - -logFile <- sprintf("%s/log.txt",megaDir) -sink(logFile,split=TRUE) -tryCatch({ - -alldat <- do.call("rbind",dats) - -for (rngNum in rngVals) { - combList <- mega_combList # clean slate - rng_t0 <- Sys.time() - cat(sprintf("-------------------------------\n")) - cat(sprintf("RNG seed = %i\n", rngNum)) - cat(sprintf("-------------------------------\n")) - - outDir <- sprintf("%s/rng%i",megaDir,rngNum) - dir.create(outDir) - pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, - setSeed=rngNum*5) - write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir), - sep="\t",col=T,row=F,quote=F) - - # feature selection - train only - pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") - dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), - drop=FALSE]) - netSets_iter <- list() - - # lasso - for (nm in names(dats_train)) { - print(nm) - if (nrow(dats_train[[nm]])<2) # clinical only has one var, take it. - vars <- rownames(dats_train[[nm]]) - else { - fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), - y=factor(pheno$STATUS), family="binomial", alpha=1) # lasso - wt <- abs(coef(fit,s="lambda.min")[,1]) - vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") - } - cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) - - if (length(vars)>0) { - tmp <- dats_train[[nm]] - tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] - dats_train[[nm]] <- tmp - for (k in rownames(tmp)) { netSets_iter[[k]] <- k } - combList[[nm]] <- sprintf("%s_cont", rownames(tmp)) - } else { - # leave dats_train as is, make a single net - netSets_iter[[nm]] <- rownames(dats_train[[nm]]) - combList[[nm]] <- sprintf("%s_cont",nm) - } - } - - if ("clinicalArna" %in% names(combList)) - combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) - if ("clinicalAmir" %in% names(combList)) - combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) - if ("clinicalAcnv" %in% names(combList)) - combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) - if ("clinicalAdnam" %in% names(combList)) - combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) - if ("clinicalAprot" %in% names(combList)) - combList[["clinicalAprot"]] <- c(combList[["clinical"]],combList[["prot"]]) - - # END lasso UF - # ---------------------- - alldat_train <- do.call("rbind",dats_train) - netDir <- sprintf("%s/networks",outDir) - - cat(sprintf("Making test nets for rng%i\n", rngNum)) - netList <- makePSN_NamedMatrix(alldat_train, - rownames(alldat_train),netSets_iter,netDir, - simMetric="custom",customFunc=sim.eucscale, - writeProfiles=FALSE, - sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, - sparsify_edgeMax=maxEdge, - verbose=FALSE,numCores=numCores) - cat(sprintf("Total of %i nets\n", length(netList))) - - dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) - - # second loop - over combinations of input data - for (cur in names(combList)) { - t0 <- Sys.time() - cat(sprintf("Input datatype\n%s\n",cur)) - pDir <- sprintf("%s/%s",outDir, cur) - dir.create(pDir) - - # run featsel once per subtype - subtypes <- unique(pheno$STATUS) - # run 10-fold cv per subtype - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) - dir.create(pDir2) - - cat(sprintf("\n******\nSubtype %s\n",g)) - pheno_subtype <- pheno - ## label patients not in the current class as residual - nong <- which(!pheno_subtype$STATUS %in% g) - pheno_subtype$STATUS[nong] <- "nonpred" - print(table(pheno_subtype$STATUS,useNA="always")) # sanitycheck - resDir <- sprintf("%s/GM_results",pDir2) - trainPred <- pheno_subtype$ID[ - which(pheno_subtype$STATUS %in% g)] - - # Cross validation - GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, - nrow(pheno_subtype),incNets=combList[[cur]], - verbose=T, numCores=numCores, - GMmemory=GMmemory) - - # patient similarity ranks - prank <- dir(path=resDir,pattern="PRANK$") - # network ranks - nrank <- dir(path=resDir,pattern="NRANK$") - cat(sprintf("Got %i prank files\n",length(prank))) - - # Compute network score - pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) - head(pTally) - # write to file - tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) - write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) - } - - # make test db - - netDir <- sprintf("%s/test_networks",outDir) - netList <- makePSN_NamedMatrix(alldat, - rownames(alldat),netSets_iter,netDir, - simMetric="custom",customFunc=sim.eucscale, - writeProfiles=FALSE, - sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, - sparsify_edgeMax=maxEdge, - verbose=TRUE,numCores=numCores) - cat(sprintf("Total of %i nets\n", length(netList))) - # now create database - testdbDir <- GM_createDB(netDir, pheno_all$ID, outDir,numCores=numCores) - - # classify patients - for (cutoff in cutoffSet) { - predRes <- list() - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - - pTally <- read.delim( - sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), - sep="\t",h=T,as.is=T) - pTally <- pTally[which(pTally[,2]>=cutoff),1] - cat(sprintf("%s: %i pathways\n",g,length(pTally))) - if (length(pTally)>=1) { - curD <- sprintf("%s/cutoff%i",pDir2,cutoff) - dir.create(curD) - qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & - pheno_all$TT_STATUS%in%"TRAIN")] - - qFile <- sprintf("%s/%s_query",curD,g) - GM_writeQueryFile(qSamps,incNets=pTally, - nrow(pheno_all),qFile) - resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) - predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), - pheno_all,g) - } else { - predRes[[g]] <- NA - } - } - - oD <- sprintf("%s/cutoff%i",pDir,cutoff) - dir.create(oD) - outFile <- sprintf("%s/predictionResults.txt",oD) - if (any(is.na(predRes))) { - cat("One or more groups had zero feature selected nets\n") - cat("# no feature-selected nets.\n",file=outFile) - }else { - predClass <- GM_OneVAll_getClass(predRes) - out <- merge(x=pheno_all,y=predClass,by="ID") - write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) - - acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) - cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", - nrow(out), acc*100)) - } - } - - } # input data combinations - - #cleanup to save disk space - system(sprintf("rm -r %s/dataset %s/tmp %s/networks", - outDir,outDir,outDir)) - system(sprintf("rm -r %s/dataset %s/networks", - outDir,outDir)) -} - pheno_all$TT_STATUS <- NA - rng_t1 <- Sys.time() - cat(sprintf("Time for one train/test split:")) - print(rng_t1-rng_t0) - -}, error=function(ex){ - print(ex) -}, finally={ - sink(NULL) -}) - -} From 8d581d03d7eb4876ec4063f5a3cb41d46f0ac923 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Mon, 7 May 2018 16:59:57 -0400 Subject: [PATCH 077/124] updated dirs --- misc/PanCancer/multiCutoff/GBM_getRes.R | 11 +++++++---- misc/PanCancer/multiCutoff/KIRC_getRes.R | 17 ++++++++++++----- misc/PanCancer/multiCutoff/LUSC_getRes.R | 12 +++++++++--- misc/PanCancer/multiCutoff/OV_getRes.R | 14 ++++++++++++-- 4 files changed, 40 insertions(+), 14 deletions(-) diff --git a/misc/PanCancer/multiCutoff/GBM_getRes.R b/misc/PanCancer/multiCutoff/GBM_getRes.R index 8ae34df7..473710d9 100644 --- a/misc/PanCancer/multiCutoff/GBM_getRes.R +++ b/misc/PanCancer/multiCutoff/GBM_getRes.R @@ -1,9 +1,9 @@ #' plot GBM results for kernel variations -rm(list=ls()) require(netDx) require(reshape2) +GBM_getRes <- function() { mainD <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output" dirSet <- list( base="noPrune_180423", @@ -13,8 +13,8 @@ dirSet <- list( #rbf0.05="lassoUni_rbf_0.05", rbf0.1="lassoUni_rbf_0.1_180502", rbf0.25="lassoUni_rbf_0.25_180502", - euc_1K="eucscale_sp2max1000_180503", - euc_6K="eucscale_sp2max6000_180503", + #euc_1K="eucscale_sp2max1000_180503", + euc_6K="eucclean_180503", euc_6K_group="eucscale_sp2max6000_grouped_180503" #rbf5="lassoUni_rbf_5", #rbf10="lassoUni_rbf_10" @@ -45,7 +45,7 @@ cutoff <-9 } else if (curdir =="euc_6K"){ rngDir <- paste("rng",1:20,sep="") } else if (curdir =="euc_6K_group"){ - rngDir <- paste("rng",1:14,sep="") + rngDir <- paste("rng",1:20,sep="") } else { rngDir <- dir(path=dataDir,pattern="rng") } @@ -86,3 +86,6 @@ tmp <- unlist(numSplits) text(1:length(mega_auc),0.5,sprintf("N=%i",tmp)) abline(h=0.5) dev.off() + +return(mega_auc) +} diff --git a/misc/PanCancer/multiCutoff/KIRC_getRes.R b/misc/PanCancer/multiCutoff/KIRC_getRes.R index b52427b9..a74c5334 100644 --- a/misc/PanCancer/multiCutoff/KIRC_getRes.R +++ b/misc/PanCancer/multiCutoff/KIRC_getRes.R @@ -1,16 +1,17 @@ #' plot GBM results with multiple CV cutoffs -rm(list=ls()) require(netDx) require(reshape2) #dataDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output/pruneTrain_180419" + +KIRC_getRes <- function() { mainD <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" dirSet <- list( base="noPrune_180423", lasso="lasso_180426", pamr="pamr_180426", - euc6K="eucscale_sp26000_180503" + euc6K="eucclean_180503" # ridge="ridgeAbsFix_180426" ) @@ -24,8 +25,6 @@ dataDir <- sprintf("%s/%s",mainD,dirSet[[curdir]]) rngMax <- 20 if (any(grep("base",curdir))) { rngMax <- 15 - } else if (any(grep("euc",curdir))) { - rngMax <- 5 } auc_set <- list() @@ -34,8 +33,14 @@ for (settype in settypes) { colctr <- 1 cutoff <- 9 - c7 <- sprintf("%s/%s/predictionResults.txt", + if (curdir=="euc6K") { + c7 <- sprintf("%s/%s/cutoff9/predictionResults.txt", + rngDir,settype,cutoff) + } else { + c7 <- sprintf("%s/%s/predictionResults.txt", rngDir,settype,cutoff) + } + torm <- c() for (idx in 1:length(c7)) { dat <- read.delim(c7[idx],sep="\t",h=T,as.is=T) @@ -57,5 +62,7 @@ colctr <- 1 pdf(sprintf("KIRC_%s.pdf",format(Sys.Date(),"%y%m%d"))); boxplot(mega_auc,main="KIRC",cex.axis=1.7,cex.main=2,las=1); dev.off() +return(mega_auc) +} diff --git a/misc/PanCancer/multiCutoff/LUSC_getRes.R b/misc/PanCancer/multiCutoff/LUSC_getRes.R index 4f828328..1ef41972 100644 --- a/misc/PanCancer/multiCutoff/LUSC_getRes.R +++ b/misc/PanCancer/multiCutoff/LUSC_getRes.R @@ -1,8 +1,8 @@ #' plot GBM results with multiple CV cutoffs -rm(list=ls()) require(netDx) require(reshape2) +LUSC_getRes <- function() { mainD <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output" dirSet <- list( base="noPrune_180423", @@ -10,7 +10,7 @@ dirSet <- list( lassoGenes="lassoGenes_180426", pamrGenes_sp2="pamrGenes_180427", pamrGenes_sp1="pamrGenes_sp1_180427", - euc6K="eucscale_sp26000_180503" + euc6K="eucclean_180504" ) settypes <- c("clinical","mir","rna","prot","cnv", "clinicalArna","clinicalAmir","clinicalAprot","clinicalAcnv","all") @@ -57,4 +57,10 @@ for (cutoff in 9) { mega_auc[[curdir]] <- unlist(lapply(auc_set,mean)) } dt <- format(Sys.Date(),"%y%m%d") -pdf(sprintf("LUSC_%s.pdf",dt),width=13,height=6); boxplot(mega_auc,main="LUSC",cex.axis=1.7,cex.main=2,las=1); dev.off() +pdf(sprintf("LUSC_%s.pdf",dt),width=13,height=6); +boxplot(mega_auc,main="LUSC",cex.axis=1.7,cex.main=2,las=1); +abline(h=0.5) +dev.off() + +return(mega_auc) +} diff --git a/misc/PanCancer/multiCutoff/OV_getRes.R b/misc/PanCancer/multiCutoff/OV_getRes.R index 6fb85e40..797c5218 100644 --- a/misc/PanCancer/multiCutoff/OV_getRes.R +++ b/misc/PanCancer/multiCutoff/OV_getRes.R @@ -1,8 +1,8 @@ #' plot GBM results with multiple CV cutoffs -rm(list=ls()) require(netDx) require(reshape2) +OV_getRes <- function() { mainD <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output" maxRng <- 100 @@ -12,7 +12,8 @@ settypes <- c("clinical","mir","rna","prot","cnv","dnam", dirSet <- list( base="noPrune_180423", prune="pruneTrain_180419", - lasso="lasso_180426" + lasso="lasso_180426", + euc6K="eucscale_180504" ) mega_auc <- list() @@ -32,8 +33,13 @@ for (curdir in names(dirSet)) { cat(sprintf("Got %i rng files\n",length(rngDir))) cutoff <- 9 + if (curdir=="euc6K") { + c7 <- sprintf("%s/%s/cutoff9/predictionResults.txt", + rngDir,settype,cutoff) + } else { c7 <- sprintf("%s/%s/predictionResults.txt", rngDir,settype,cutoff) + } torm <- c() for (idx in 1:length(c7)) { dat <- read.delim(c7[idx],sep="\t",h=T,as.is=T) @@ -55,3 +61,7 @@ for (curdir in names(dirSet)) { dt <- format(Sys.Date(),"%y%m%d") pdf(sprintf("OV_%s.pdf",dt)); boxplot(mega_auc,las=1,cex.axis=1.8,main="OV",cex.main=2); dev.off() + + return(mega_auc) + +} From 13dfab92a90b2d185907eccdd6c5ef4a914caff6 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Mon, 7 May 2018 17:00:19 -0400 Subject: [PATCH 078/124] tester fn --- .../pruneVersion/diff_kernels/rbf/KIRC_rbf.R | 121 ++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/rbf/KIRC_rbf.R diff --git a/misc/PanCancer/pruneVersion/diff_kernels/rbf/KIRC_rbf.R b/misc/PanCancer/pruneVersion/diff_kernels/rbf/KIRC_rbf.R new file mode 100644 index 00000000..d5058f1b --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/rbf/KIRC_rbf.R @@ -0,0 +1,121 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) + +inDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/input" +outRoot <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" + +hp <- 0.2 +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/rbfclean_%1.2f_%s",outRoot,hp,dt) + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/KIRC_clinical_core.txt",inDir), + survival=sprintf("%s/KIRC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/KIRC_mRNA_core.txt",inDir), + prot=sprintf("%s/KIRC_RPPA_core.txt",inDir), + mir=sprintf("%s/KIRC_miRNA_core.txt",inDir), + dnam=sprintf("%s/KIRC_methylation_core.txt",inDir), + cnv=sprintf("%s/KIRC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +#======transform clinical data========= +pheno$grade <- as.vector(pheno$grade) +pheno$grade[pheno$grade=="G1"] <- "G2" +pheno$grade[pheno$grade=="GX"] <- "G2" +pheno$grade <- as.factor(pheno$grade) +pheno <- pheno[, -which(colnames(pheno)=="gender")] +#====================================== + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +clinical$grade <- as.numeric(factor(clinical$grade)) +clinical$stage <- as.numeric(factor(clinical$stage)) +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) +rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_rbf.R") +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9,sigmaVar=hp) + + From cd994c04597b1cfb0a0d41c58223dcd181c71469 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 8 May 2018 22:21:32 -0400 Subject: [PATCH 079/124] updated --- misc/PanCancer/multiCutoff/GBM_getRes.R | 11 ++++++----- misc/PanCancer/multiCutoff/LUSC_getRes.R | 10 ++++++++-- misc/PanCancer/multiCutoff/OV_getRes.R | 10 ++++++---- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/misc/PanCancer/multiCutoff/GBM_getRes.R b/misc/PanCancer/multiCutoff/GBM_getRes.R index 473710d9..513c6fac 100644 --- a/misc/PanCancer/multiCutoff/GBM_getRes.R +++ b/misc/PanCancer/multiCutoff/GBM_getRes.R @@ -11,11 +11,12 @@ dirSet <- list( lassoGenes_sp1="lassoGenes_incClin_180426", pamrGenes="pamrGenes_incClin_180427", #rbf0.05="lassoUni_rbf_0.05", - rbf0.1="lassoUni_rbf_0.1_180502", - rbf0.25="lassoUni_rbf_0.25_180502", #euc_1K="eucscale_sp2max1000_180503", euc_6K="eucclean_180503", - euc_6K_group="eucscale_sp2max6000_grouped_180503" + euc_6K_group="eucscale_sp2max6000_grouped_180503", + eucimpute="eucclean_impute_180507", + rbfclean="rbfclean_0.20_180507", + pearscale="pearscale_180507" #rbf5="lassoUni_rbf_5", #rbf10="lassoUni_rbf_10" ) @@ -79,9 +80,9 @@ mega_auc[[curdir]] <- unlist(lapply(auc_set,mean)) dt <- format(Sys.Date(),"%y%m%d") require(gplots) -pdf(sprintf("GBM_%s.pdf",dt),width=18,height=6); +pdf(sprintf("GBM_%s.pdf",dt),width=24,height=6); boxplot( mega_auc,las=1,cex.axis=1.7,cex.main=2,main="GBM", - at=1:length(mega_auc)); + at=1:length(mega_auc),cex.lab=0.8); tmp <- unlist(numSplits) text(1:length(mega_auc),0.5,sprintf("N=%i",tmp)) abline(h=0.5) diff --git a/misc/PanCancer/multiCutoff/LUSC_getRes.R b/misc/PanCancer/multiCutoff/LUSC_getRes.R index 1ef41972..82e8ebd2 100644 --- a/misc/PanCancer/multiCutoff/LUSC_getRes.R +++ b/misc/PanCancer/multiCutoff/LUSC_getRes.R @@ -10,7 +10,13 @@ dirSet <- list( lassoGenes="lassoGenes_180426", pamrGenes_sp2="pamrGenes_180427", pamrGenes_sp1="pamrGenes_sp1_180427", - euc6K="eucclean_180504" + rbfclean="rbfclean_0.20_180507", + euc6K="eucclean_180504", + eucimpute="eucscale_impute_180507", + pearscale="pearscale_180507", + ptop20c1="pearscale_top20_topClin1_180508", + ptop30c1="pearscale_top30_topClin1_180508", + ptop40c2="pearscale_top40_topClin2_180508" ) settypes <- c("clinical","mir","rna","prot","cnv", "clinicalArna","clinicalAmir","clinicalAprot","clinicalAcnv","all") @@ -57,7 +63,7 @@ for (cutoff in 9) { mega_auc[[curdir]] <- unlist(lapply(auc_set,mean)) } dt <- format(Sys.Date(),"%y%m%d") -pdf(sprintf("LUSC_%s.pdf",dt),width=13,height=6); +pdf(sprintf("LUSC_%s.pdf",dt),width=24,height=6); boxplot(mega_auc,main="LUSC",cex.axis=1.7,cex.main=2,las=1); abline(h=0.5) dev.off() diff --git a/misc/PanCancer/multiCutoff/OV_getRes.R b/misc/PanCancer/multiCutoff/OV_getRes.R index 797c5218..eb0c1437 100644 --- a/misc/PanCancer/multiCutoff/OV_getRes.R +++ b/misc/PanCancer/multiCutoff/OV_getRes.R @@ -13,12 +13,13 @@ dirSet <- list( base="noPrune_180423", prune="pruneTrain_180419", lasso="lasso_180426", - euc6K="eucscale_180504" + euc6K="eucscale_180504", + rbfclean="rbfclean_0.2_180507" ) mega_auc <- list() for (curdir in names(dirSet)) { - if (curdir %in% c("lasso","pamr")) rngMax <- 20 + if (curdir %in% c("lasso","pamr","euc6K","rbfclean")) rngMax <- 20 else if (curdir %in% "prune") rngMax <- 14 else rngMax <- 15 @@ -33,7 +34,7 @@ for (curdir in names(dirSet)) { cat(sprintf("Got %i rng files\n",length(rngDir))) cutoff <- 9 - if (curdir=="euc6K") { + if (curdir %in% c("euc6K","rbfclean")) { c7 <- sprintf("%s/%s/cutoff9/predictionResults.txt", rngDir,settype,cutoff) } else { @@ -60,7 +61,8 @@ for (curdir in names(dirSet)) { } dt <- format(Sys.Date(),"%y%m%d") -pdf(sprintf("OV_%s.pdf",dt)); boxplot(mega_auc,las=1,cex.axis=1.8,main="OV",cex.main=2); dev.off() +pdf(sprintf("OV_%s.pdf",dt),width=12,height=6); +boxplot(mega_auc,las=1,cex.axis=1.8,main="OV",cex.main=2); dev.off() return(mega_auc) From 0ff03316fcf3af16cb680e404a9f4882c7ac3e82 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 8 May 2018 22:21:56 -0400 Subject: [PATCH 080/124] euscale+impute --- .../{ => outdated}/GBM_noPrune_RBFforAll.R | 0 .../{ => outdated}/GBM_noPrune_noSex_RBF.R | 0 .../GBM_noPrune_noSex_RBF_0.3.R | 0 .../GBM_noPrune_noSex_RBF_0.5.R | 0 .../eucscale/GBM_eucscale_impute.R | 119 +++++++ .../eucscale/LUSC_eucscale_impute.R | 121 +++++++ .../eucscale/OV_eucscale_pipeline.R | 1 + .../eucscale/PanCancer_eucscale_impute.R | 303 +++++++++++++++++ .../pearscale/GBM_pearscale_impute.R | 118 +++++++ .../pearscale/PanCancer_pearscale_impute.R | 302 +++++++++++++++++ .../PanCancer_topX_pearscale_impute.R | 306 ++++++++++++++++++ .../diff_kernels/pearscale/test.txt | 2 +- 12 files changed, 1271 insertions(+), 1 deletion(-) rename misc/PanCancer/noPrune/{ => outdated}/GBM_noPrune_RBFforAll.R (100%) rename misc/PanCancer/noPrune/{ => outdated}/GBM_noPrune_noSex_RBF.R (100%) rename misc/PanCancer/noPrune/{ => outdated}/GBM_noPrune_noSex_RBF_0.3.R (100%) rename misc/PanCancer/noPrune/{ => outdated}/GBM_noPrune_noSex_RBF_0.5.R (100%) create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/eucscale/GBM_eucscale_impute.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/eucscale/LUSC_eucscale_impute.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/eucscale/PanCancer_eucscale_impute.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/pearscale/GBM_pearscale_impute.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_pearscale_impute.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_topX_pearscale_impute.R diff --git a/misc/PanCancer/noPrune/GBM_noPrune_RBFforAll.R b/misc/PanCancer/noPrune/outdated/GBM_noPrune_RBFforAll.R similarity index 100% rename from misc/PanCancer/noPrune/GBM_noPrune_RBFforAll.R rename to misc/PanCancer/noPrune/outdated/GBM_noPrune_RBFforAll.R diff --git a/misc/PanCancer/noPrune/GBM_noPrune_noSex_RBF.R b/misc/PanCancer/noPrune/outdated/GBM_noPrune_noSex_RBF.R similarity index 100% rename from misc/PanCancer/noPrune/GBM_noPrune_noSex_RBF.R rename to misc/PanCancer/noPrune/outdated/GBM_noPrune_noSex_RBF.R diff --git a/misc/PanCancer/noPrune/GBM_noPrune_noSex_RBF_0.3.R b/misc/PanCancer/noPrune/outdated/GBM_noPrune_noSex_RBF_0.3.R similarity index 100% rename from misc/PanCancer/noPrune/GBM_noPrune_noSex_RBF_0.3.R rename to misc/PanCancer/noPrune/outdated/GBM_noPrune_noSex_RBF_0.3.R diff --git a/misc/PanCancer/noPrune/GBM_noPrune_noSex_RBF_0.5.R b/misc/PanCancer/noPrune/outdated/GBM_noPrune_noSex_RBF_0.5.R similarity index 100% rename from misc/PanCancer/noPrune/GBM_noPrune_noSex_RBF_0.5.R rename to misc/PanCancer/noPrune/outdated/GBM_noPrune_noSex_RBF_0.5.R diff --git a/misc/PanCancer/pruneVersion/diff_kernels/eucscale/GBM_eucscale_impute.R b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/GBM_eucscale_impute.R new file mode 100644 index 00000000..44a495af --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/GBM_eucscale_impute.R @@ -0,0 +1,119 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/eucclean_impute_%s",outRoot,dt) +cat(megaDir, file="test.txt",append=TRUE) + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} +rm(pname) + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv_cont"), + clinical="clinical_cont", + mir="mir_cont", + rna="rna_cont", + cnv="cnv_cont", + dnam="dnam_cont", + clinicalArna=c("clinical_cont","rna_cont"), + clinicalAmir=c("clinical_cont","mir_cont"), + clinicalAdnam=c("clinical_cont","dnam_cont"), + all="all" +) + +pheno_all <- pheno + +# cleanup +rm(pheno,pheno_nosurv) +rm(rootDir,survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_eucscale_impute.R") +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9) + + diff --git a/misc/PanCancer/pruneVersion/diff_kernels/eucscale/LUSC_eucscale_impute.R b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/LUSC_eucscale_impute.R new file mode 100644 index 00000000..ee0a290e --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/LUSC_eucscale_impute.R @@ -0,0 +1,121 @@ +#' PanCancer binarized survival: LUSC: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design + +rm(list=ls()) + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/eucscale_impute_%s",outRoot,dt) + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), + survival=sprintf("%s/LUSC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), + prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), + mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), + cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno +rownames(clinical) <- clinical[,1]; +# ======================= +# LUSC-specific variables +clinical$stage <- as.vector(clinical$stage) +clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" +clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" +clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" +clinical$stage <- as.factor(clinical$stage) +clinical <- clinical[, -which(colnames(clinical)=="gender")] +clinical <- t(clinical[,c("age","stage")]) +clinical[1,] <- as.integer(clinical[1,]) +clinical[2,] <- as.integer(as.factor(clinical[2,])) +class(clinical) <- "numeric" +# ======================= +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + if (nm == "rna") tmp <- log(tmp+1) + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all" +) + +browser() + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno) +rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_eucscale_impute.R") +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/eucscale/OV_eucscale_pipeline.R b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/OV_eucscale_pipeline.R index 9a3e1c1b..2f989f79 100644 --- a/misc/PanCancer/pruneVersion/diff_kernels/eucscale/OV_eucscale_pipeline.R +++ b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/OV_eucscale_pipeline.R @@ -101,6 +101,7 @@ combList <- list( rm(pheno,pheno_nosurv) rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) +browser() # ----------------------------------------------------------- # run predictor source("PanCancer_eucscale.R") diff --git a/misc/PanCancer/pruneVersion/diff_kernels/eucscale/PanCancer_eucscale_impute.R b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/PanCancer_eucscale_impute.R new file mode 100644 index 00000000..225138f9 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/PanCancer_eucscale_impute.R @@ -0,0 +1,303 @@ +#' PanCancer predictor: univariate filtering by lasso + gene-level nets +#' similarity by Euclidean distance + local scaling + +# ---------------------------------------------------------------- +# helper functions + +# SNF similarity method. Euclidean distance followed by exponential +# scaling where sigma is tuned based on local data structure. +sim.eucscale <- function (dat, K = 20, alpha = 0.5) { +ztrans <- function(m) { + m <- as.matrix(m) + m2 <- apply(m,1,function(x) { (x-mean(x,na.rm=T))/sd(x,na.rm=T)}) + m2 +} +normalize <- function(X) { + print(dim(X)) + row.sum.mdiag <- rowSums(X,na.rm=T) - diag(X) + row.sum.mdiag[row.sum.mdiag == 0] <- 1 + X <- X/(2 * (row.sum.mdiag)) + diag(X) <- 0.5 + X <- (X+t(X))/2 + return(X) +} + nnodata <- which(abs(colSums(dat,na.rm=T)) < .Machine$double.eps) + z1 <- ztrans(dat) + euc <- as.matrix(dist(z1,method="euclidean"))^(1/2) + N <- nrow(euc) + euc <- (euc + t(euc))/2 + sortedColumns <- as.matrix(t(apply(euc, 2, sort,na.last=TRUE))) + print(dim(sortedColumns)) + finiteMean <- function(x) { + return(mean(x[is.finite(x)],na.rm=T)) + } + means <- apply(sortedColumns[, 1:K + 1], 1, finiteMean) + + .Machine$double.eps + avg <- function(x, y) { + return((x + y)/2) + } + Sig <- outer(means, means, avg)/3 * 2 + euc/3 + .Machine$double.eps + Sig[Sig <= .Machine$double.eps] <- .Machine$double.eps + densities <- dnorm(euc, 0, alpha * Sig, log = FALSE) + + W <- (densities + t(densities))/2 + W <- normalize(W) +### # remove patients with no datapoints (full column/row of NAs) +### idx <- which(rowSums(is.na(euc))==ncol(W)-1) +### if (any(idx)) { +### W <- W[-idx,] +### idx <- which(colSums(is.na(euc))==ncol(W)-1) +### W <- W[,-idx] +### } + return(W) +} + +# ---------------------------------------------------------------- +runPredictor <- function(mega_combList,rngVals,netSets,dats,pheno_all,megaDir, + cutoffSet) { +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +maxEdge <- 6000 ### max edge after sparsification + +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +alldat <- do.call("rbind",dats) + +for (rngNum in rngVals) { + combList <- mega_combList # clean slate + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir), + sep="\t",col=T,row=F,quote=F) + + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + + # impute + dats_train <- lapply(dats_train, function(x) { + missidx <- which(rowSums(is.na(x))>0) + for (i in missidx) { + na_idx <- which(is.na(x[i,])) + x[i,na_idx] <- median(x[i,],na.rm=TRUE) + } + x + }) + + # lasso + for (nm in names(dats_train)) { + print(nm) + if (nrow(dats_train[[nm]])<2) # clinical only has one var, take it. + vars <- rownames(dats_train[[nm]]) + else { + fit <- cv.glmnet(x=t(dats_train[[nm]]), + y=factor(pheno$STATUS), family="binomial", alpha=1) # lasso + wt <- abs(coef(fit,s="lambda.min")[,1]) + vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") + } + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { netSets_iter[[k]] <- k } + combList[[nm]] <- sprintf("%s_cont", rownames(tmp)) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + combList[[nm]] <- sprintf("%s_cont",nm) + } + } + + if ("clinicalArna" %in% names(combList)) + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + if ("clinicalAmir" %in% names(combList)) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + if ("clinicalAcnv" %in% names(combList)) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + if ("clinicalAdnam" %in% names(combList)) + combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) + if ("clinicalAprot" %in% names(combList)) + combList[["clinicalAprot"]] <- c(combList[["clinical"]],combList[["prot"]]) + + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + netDir <- sprintf("%s/networks",outDir) + + cat(sprintf("Making test nets for rng%i\n", rngNum)) + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter,netDir, + simMetric="custom",customFunc=sim.eucscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=FALSE,numCores=numCores) + cat(sprintf("Total of %i nets\n", length(netList))) + + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("Input datatype\n%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + print(table(pheno_subtype$STATUS,useNA="always")) # sanitycheck + resDir <- sprintf("%s/GM_results",pDir2) + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + # make test db + # impute + train_samp <- pheno_all$ID[which(pheno_all$TT_STATUS %in% "TRAIN")] + test_samp <- pheno_all$ID[which(pheno_all$TT_STATUS %in% "TEST")] + dats_tmp <- lapply(dats, function(x) { + missidx <- which(rowSums(is.na(x))>0) + train_idx <- which(colnames(x) %in% train_samp) + test_idx <- which(colnames(x) %in% test_samp) + for (i in missidx) { + # impute train and test separately + na_idx <- intersect(which(is.na(x[i,])),train_idx) + na_idx1 <- na_idx + x[i,na_idx] <- median(x[i,train_idx],na.rm=TRUE) + + na_idx <- intersect(which(is.na(x[i,])),test_idx) + na_idx2 <- na_idx + x[i,na_idx] <- median(x[i,test_idx],na.rm=TRUE) + } + x + }) + alldat_tmp <- do.call("rbind",dats_tmp) + + netDir <- sprintf("%s/test_networks",outDir) + netList <- makePSN_NamedMatrix(alldat_tmp, + rownames(alldat_tmp),netSets_iter,netDir, + simMetric="custom",customFunc=sim.eucscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=TRUE,numCores=numCores) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(netDir, pheno_all$ID, outDir,numCores=numCores) + + # classify patients + for (cutoff in cutoffSet) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + } + } + + } # input data combinations + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) + +} diff --git a/misc/PanCancer/pruneVersion/diff_kernels/pearscale/GBM_pearscale_impute.R b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/GBM_pearscale_impute.R new file mode 100644 index 00000000..30d60088 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/GBM_pearscale_impute.R @@ -0,0 +1,118 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/pearscale_impute_%s",outRoot,dt) +cat(megaDir, file="test.txt",append=TRUE) + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} +rm(pname) + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv_cont"), + clinical="clinical_cont", + mir="mir_cont", + rna="rna_cont", + cnv="cnv_cont", + dnam="dnam_cont", + clinicalArna=c("clinical_cont","rna_cont"), + clinicalAmir=c("clinical_cont","mir_cont"), + clinicalAdnam=c("clinical_cont","dnam_cont"), + all="all" +) + +pheno_all <- pheno + +# cleanup +rm(pheno,pheno_nosurv) +rm(rootDir,survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_pearscale_impute.R") +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9) + + diff --git a/misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_pearscale_impute.R b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_pearscale_impute.R new file mode 100644 index 00000000..14a0bc6c --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_pearscale_impute.R @@ -0,0 +1,302 @@ +#' PanCancer predictor: univariate filtering by lasso + gene-level nets +#' similarity by Euclidean distance + local scaling + +# ---------------------------------------------------------------- +# helper functions +sim.pearscale <- function (dat, K = 20, alpha = 0.5) { +ztrans <- function(m) { + m <- as.matrix(m) + m2 <- apply(m,1,function(x) { (x-mean(x,na.rm=T))/sd(x,na.rm=T)}) + m2 +} +normalize <- function(X) { + print(dim(X)) + row.sum.mdiag <- rowSums(X,na.rm=T) - diag(X) + row.sum.mdiag[row.sum.mdiag == 0] <- 1 + X <- X/(2 * (row.sum.mdiag)) + diag(X) <- 0.5 + X <- (X+t(X))/2 + return(X) +} + + if (nrow(dat)<6) { + z1 <- ztrans(dat) + euc <- as.matrix(dist(z1,method="euclidean"))^(1/2) + } else { + euc <- as.matrix(1-cor(dat,method="pearson",use="pairwise.complete.obs")) + } + N <- nrow(euc) + euc <- (euc + t(euc))/2 + sortedColumns <- as.matrix(t(apply(euc, 2, sort,na.last=TRUE))) + print(dim(sortedColumns)) + finiteMean <- function(x) { + return(mean(x[is.finite(x)],na.rm=T)) + } + means <- apply(sortedColumns[, 1:K + 1], 1, finiteMean) + + .Machine$double.eps + avg <- function(x, y) { + return((x + y)/2) + } + Sig <- outer(means, means, avg)/3 * 2 + euc/3 + .Machine$double.eps + Sig[Sig <= .Machine$double.eps] <- .Machine$double.eps + densities <- dnorm(euc, 0, alpha * Sig, log = FALSE) + + W <- (densities + t(densities))/2 + W <- normalize(W) + # remove patients with no datapoints (full column/row of NAs) + idx <- which(rowSums(is.na(euc))==ncol(W)-1) + if (any(idx)) { + W <- W[-idx,] + idx <- which(colSums(is.na(euc))==ncol(W)-1) + W <- W[,-idx] + } + return(W) +} + +# ---------------------------------------------------------------- +runPredictor <- function(mega_combList,rngVals,netSets,dats,pheno_all,megaDir, + cutoffSet) { +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +maxEdge <- 6000 ### max edge after sparsification + +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +alldat <- do.call("rbind",dats) + +for (rngNum in rngVals) { + combList <- mega_combList # clean slate + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir), + sep="\t",col=T,row=F,quote=F) + + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + + # impute + dats_train <- lapply(dats_train, function(x) { + missidx <- which(rowSums(is.na(x))>0) + for (i in missidx) { + na_idx <- which(is.na(x[i,])) + x[i,na_idx] <- median(x[i,],na.rm=TRUE) + } + x + }) + + # lasso + for (nm in names(dats_train)) { + print(nm) + if (nrow(dats_train[[nm]])<2) # clinical only has one var, take it. + vars <- rownames(dats_train[[nm]]) + else { + fit <- cv.glmnet(x=t(dats_train[[nm]]), + y=factor(pheno$STATUS), family="binomial", alpha=1) # lasso + wt <- abs(coef(fit,s="lambda.min")[,1]) + vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") + } + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { netSets_iter[[k]] <- k } + combList[[nm]] <- sprintf("%s_cont", rownames(tmp)) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + combList[[nm]] <- sprintf("%s_cont",nm) + } + } + + if ("clinicalArna" %in% names(combList)) + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + if ("clinicalAmir" %in% names(combList)) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + if ("clinicalAcnv" %in% names(combList)) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + if ("clinicalAdnam" %in% names(combList)) + combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) + if ("clinicalAprot" %in% names(combList)) + combList[["clinicalAprot"]] <- c(combList[["clinical"]],combList[["prot"]]) + + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + netDir <- sprintf("%s/networks",outDir) + + cat(sprintf("Making test nets for rng%i\n", rngNum)) + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter,netDir, + simMetric="custom",customFunc=sim.pearscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=FALSE,numCores=numCores) + cat(sprintf("Total of %i nets\n", length(netList))) + + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("Input datatype\n%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + print(table(pheno_subtype$STATUS,useNA="always")) # sanitycheck + resDir <- sprintf("%s/GM_results",pDir2) + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + # make test db + # impute + train_samp <- pheno_all$ID[which(pheno_all$TT_STATUS %in% "TRAIN")] + test_samp <- pheno_all$ID[which(pheno_all$TT_STATUS %in% "TEST")] + dats_tmp <- lapply(dats, function(x) { + missidx <- which(rowSums(is.na(x))>0) + train_idx <- which(colnames(x) %in% train_samp) + test_idx <- which(colnames(x) %in% test_samp) + for (i in missidx) { + # impute train and test separately + na_idx <- intersect(which(is.na(x[i,])),train_idx) + na_idx1 <- na_idx + x[i,na_idx] <- median(x[i,train_idx],na.rm=TRUE) + + na_idx <- intersect(which(is.na(x[i,])),test_idx) + na_idx2 <- na_idx + x[i,na_idx] <- median(x[i,test_idx],na.rm=TRUE) + } + x + }) + alldat_tmp <- do.call("rbind",dats_tmp) + + netDir <- sprintf("%s/test_networks",outDir) + netList <- makePSN_NamedMatrix(alldat_tmp, + rownames(alldat_tmp),netSets_iter,netDir, + simMetric="custom",customFunc=sim.pearscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=TRUE,numCores=numCores) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(netDir, pheno_all$ID, outDir,numCores=numCores) + + # classify patients + for (cutoff in cutoffSet) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + } + } + + } # input data combinations + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) + +} diff --git a/misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_topX_pearscale_impute.R b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_topX_pearscale_impute.R new file mode 100644 index 00000000..d134478b --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_topX_pearscale_impute.R @@ -0,0 +1,306 @@ +#' PanCancer predictor: univariate filtering by lasso + gene-level nets +#' similarity by Euclidean distance + local scaling + +# ---------------------------------------------------------------- +# helper functions +sim.pearscale <- function (dat, K = 20, alpha = 0.5) { +ztrans <- function(m) { + m <- as.matrix(m) + m2 <- apply(m,1,function(x) { (x-mean(x,na.rm=T))/sd(x,na.rm=T)}) + m2 +} +normalize <- function(X) { + print(dim(X)) + row.sum.mdiag <- rowSums(X,na.rm=T) - diag(X) + row.sum.mdiag[row.sum.mdiag == 0] <- 1 + X <- X/(2 * (row.sum.mdiag)) + diag(X) <- 0.5 + X <- (X+t(X))/2 + return(X) +} + + if (nrow(dat)<6) { + z1 <- ztrans(dat) + euc <- as.matrix(dist(z1,method="euclidean"))^(1/2) + } else { + euc <- as.matrix(1-cor(dat,method="pearson",use="pairwise.complete.obs")) + } + N <- nrow(euc) + euc <- (euc + t(euc))/2 + sortedColumns <- as.matrix(t(apply(euc, 2, sort,na.last=TRUE))) + print(dim(sortedColumns)) + finiteMean <- function(x) { + return(mean(x[is.finite(x)],na.rm=T)) + } + means <- apply(sortedColumns[, 1:K + 1], 1, finiteMean) + + .Machine$double.eps + avg <- function(x, y) { + return((x + y)/2) + } + Sig <- outer(means, means, avg)/3 * 2 + euc/3 + .Machine$double.eps + Sig[Sig <= .Machine$double.eps] <- .Machine$double.eps + densities <- dnorm(euc, 0, alpha * Sig, log = FALSE) + + W <- (densities + t(densities))/2 + W <- normalize(W) + # remove patients with no datapoints (full column/row of NAs) + idx <- which(rowSums(is.na(euc))==ncol(W)-1) + if (any(idx)) { + W <- W[-idx,] + idx <- which(colSums(is.na(euc))==ncol(W)-1) + W <- W[,-idx] + } + return(W) +} + +# ---------------------------------------------------------------- +runPredictor <- function(mega_combList,rngVals,netSets,dats,pheno_all,megaDir, + cutoffSet,topX=10,topClin) { +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +maxEdge <- 6000 ### max edge after sparsification + +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +alldat <- do.call("rbind",dats) + +for (rngNum in rngVals) { + combList <- mega_combList # clean slate + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir), + sep="\t",col=T,row=F,quote=F) + + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + + # impute + dats_train <- lapply(dats_train, function(x) { + missidx <- which(rowSums(is.na(x))>0) + for (i in missidx) { + na_idx <- which(is.na(x[i,])) + x[i,na_idx] <- median(x[i,],na.rm=TRUE) + } + x + }) + + # lasso + for (nm in names(dats_train)) { + print(nm) + if (nrow(dats_train[[nm]])<2) # clinical only has one var, take it. + vars <- rownames(dats_train[[nm]]) + else { + fit <- cv.glmnet(x=t(dats_train[[nm]]), + y=factor(pheno$STATUS), family="binomial", alpha=0) # ridge + wt <- abs(coef(fit,s="lambda.min")[,1]) + wt <- wt[order(wt,decreasing=TRUE)] + wt <- wt[-which(names(wt) %in% "(Intercept)")] + if (nm %in% "clinical") maxind <- min(topClin,length(wt)-1) + else maxind <- min(topX,length(wt)-1) + vars <- names(wt[1:maxind]) + } + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { netSets_iter[[k]] <- k } + combList[[nm]] <- sprintf("%s_cont", rownames(tmp)) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + combList[[nm]] <- sprintf("%s_cont",nm) + } + } + + if ("clinicalArna" %in% names(combList)) + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + if ("clinicalAmir" %in% names(combList)) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + if ("clinicalAcnv" %in% names(combList)) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + if ("clinicalAdnam" %in% names(combList)) + combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) + if ("clinicalAprot" %in% names(combList)) + combList[["clinicalAprot"]] <- c(combList[["clinical"]],combList[["prot"]]) + + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + netDir <- sprintf("%s/networks",outDir) + + cat(sprintf("Making test nets for rng%i\n", rngNum)) + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter,netDir, + simMetric="custom",customFunc=sim.pearscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=FALSE,numCores=numCores) + cat(sprintf("Total of %i nets\n", length(netList))) + + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("Input datatype\n%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + print(table(pheno_subtype$STATUS,useNA="always")) # sanitycheck + resDir <- sprintf("%s/GM_results",pDir2) + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + # make test db + # impute + train_samp <- pheno_all$ID[which(pheno_all$TT_STATUS %in% "TRAIN")] + test_samp <- pheno_all$ID[which(pheno_all$TT_STATUS %in% "TEST")] + dats_tmp <- lapply(dats, function(x) { + missidx <- which(rowSums(is.na(x))>0) + train_idx <- which(colnames(x) %in% train_samp) + test_idx <- which(colnames(x) %in% test_samp) + for (i in missidx) { + # impute train and test separately + na_idx <- intersect(which(is.na(x[i,])),train_idx) + na_idx1 <- na_idx + x[i,na_idx] <- median(x[i,train_idx],na.rm=TRUE) + + na_idx <- intersect(which(is.na(x[i,])),test_idx) + na_idx2 <- na_idx + x[i,na_idx] <- median(x[i,test_idx],na.rm=TRUE) + } + x + }) + alldat_tmp <- do.call("rbind",dats_tmp) + + netDir <- sprintf("%s/test_networks",outDir) + netList <- makePSN_NamedMatrix(alldat_tmp, + rownames(alldat_tmp),netSets_iter,netDir, + simMetric="custom",customFunc=sim.pearscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=TRUE,numCores=numCores) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(netDir, pheno_all$ID, outDir,numCores=numCores) + + # classify patients + for (cutoff in cutoffSet) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + } + } + + } # input data combinations + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) + +} diff --git a/misc/PanCancer/pruneVersion/diff_kernels/pearscale/test.txt b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/test.txt index 1ac129a9..bc1efa7c 100644 --- a/misc/PanCancer/pruneVersion/diff_kernels/pearscale/test.txt +++ b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/test.txt @@ -1 +1 @@ -/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pearscale_180507/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pearscale_180507/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pearscale_180507/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pearscale_180507 \ No newline at end of file +/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pearscale_180507/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pearscale_180507/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pearscale_180507/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pearscale_180507/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pearscale_impute_180508 \ No newline at end of file From c81a4d0fe3ae966c2143b73dbe0e08f927c96b98 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 8 May 2018 22:28:46 -0400 Subject: [PATCH 081/124] pearson with param selecting topX var --- .../pearscale/LUSC_top_pearscale.R | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/pearscale/LUSC_top_pearscale.R diff --git a/misc/PanCancer/pruneVersion/diff_kernels/pearscale/LUSC_top_pearscale.R b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/LUSC_top_pearscale.R new file mode 100644 index 00000000..2176b993 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/LUSC_top_pearscale.R @@ -0,0 +1,119 @@ +#' PanCancer binarized survival: LUSC: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design + +rm(list=ls()) + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), + survival=sprintf("%s/LUSC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), + prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), + mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), + cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno +rownames(clinical) <- clinical[,1]; +# ======================= +# LUSC-specific variables +clinical$stage <- as.vector(clinical$stage) +clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" +clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" +clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" +clinical$stage <- as.factor(clinical$stage) +clinical <- clinical[, -which(colnames(clinical)=="gender")] +clinical <- t(clinical[,c("age","stage")]) +clinical[1,] <- as.integer(clinical[1,]) +clinical[2,] <- as.integer(as.factor(clinical[2,])) +class(clinical) <- "numeric" +# ======================= +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + if (nm == "rna") tmp <- log(tmp+1) + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno) +rm(survStr,surv,tmp,nm,inDir,k,inFiles,datFiles,pname) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_topX_pearscale_impute.R") +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" +dt <- format(Sys.Date(),"%y%m%d") +topX <- 20; topClin<-1 +megaDir <- sprintf("%s/pearscale_top%i_topClin%i_%s",outRoot,topX,topClin,dt) +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9,topX=topX,topClin=topClin) From 52510809de906385deb5f3d490f9c159859b1019 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 10 May 2018 10:25:19 -0400 Subject: [PATCH 082/124] pamr clean, single version --- .../pruneVersion/pamr/KIRC_pamr_pipeline.R | 120 +++++++ .../pruneVersion/pamr/PanCancer_pamr.R | 310 ++++++++++++++++++ 2 files changed, 430 insertions(+) create mode 100644 misc/PanCancer/pruneVersion/pamr/KIRC_pamr_pipeline.R create mode 100644 misc/PanCancer/pruneVersion/pamr/PanCancer_pamr.R diff --git a/misc/PanCancer/pruneVersion/pamr/KIRC_pamr_pipeline.R b/misc/PanCancer/pruneVersion/pamr/KIRC_pamr_pipeline.R new file mode 100644 index 00000000..3d346c1e --- /dev/null +++ b/misc/PanCancer/pruneVersion/pamr/KIRC_pamr_pipeline.R @@ -0,0 +1,120 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) + +inDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/input" +outRoot <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/pamrclean_%s",outRoot,dt) + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/KIRC_clinical_core.txt",inDir), + survival=sprintf("%s/KIRC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/KIRC_mRNA_core.txt",inDir), + prot=sprintf("%s/KIRC_RPPA_core.txt",inDir), + mir=sprintf("%s/KIRC_miRNA_core.txt",inDir), + dnam=sprintf("%s/KIRC_methylation_core.txt",inDir), + cnv=sprintf("%s/KIRC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +#======transform clinical data========= +pheno$grade <- as.vector(pheno$grade) +pheno$grade[pheno$grade=="G1"] <- "G2" +pheno$grade[pheno$grade=="GX"] <- "G2" +pheno$grade <- as.factor(pheno$grade) +pheno <- pheno[, -which(colnames(pheno)=="gender")] +#====================================== + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +clinical$grade <- as.numeric(factor(clinical$grade)) +clinical$stage <- as.numeric(factor(clinical$stage)) +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) +rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_pamr.R") +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9) + + diff --git a/misc/PanCancer/pruneVersion/pamr/PanCancer_pamr.R b/misc/PanCancer/pruneVersion/pamr/PanCancer_pamr.R new file mode 100644 index 00000000..c7e8afad --- /dev/null +++ b/misc/PanCancer/pruneVersion/pamr/PanCancer_pamr.R @@ -0,0 +1,310 @@ +#' PanCancer predictor: univariate filtering by lasso + gene-level nets +#' similarity by Euclidean distance + local scaling + +# ---------------------------------------------------------------- +# helper functions +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ---------------------------------------------------------------- +runPredictor <- function(mega_combList,rngVals,netSets,dats,pheno_all,megaDir, + cutoffSet) { +require(netDx) +require(netDx.examples) +require(pamr) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +maxEdge <- 6000 ### max edge after sparsification + +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +alldat <- do.call("rbind",dats) + +for (rngNum in rngVals) { + combList <- mega_combList # clean slate + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir), + sep="\t",col=T,row=F,quote=F) + + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + + # impute + dats_train <- lapply(dats_train, function(x) { + missidx <- which(rowSums(is.na(x))>0) + for (i in missidx) { + na_idx <- which(is.na(x[i,])) + x[i,na_idx] <- median(x[i,],na.rm=TRUE) + } + x + }) + + # pamr + for (nm in setdiff(names(dats_train),"clinical")) { + print(nm) + if (nrow(dats_train[[nm]])<2) # clinical only has one var, take it. + vars <- rownames(dats_train[[nm]]) + else { + set.seed(123); # reproducible + tmp <- na.omit(dats_train[[nm]]) + data <- list(x=tmp,y=factor(pheno$STATUS), + genenames=rownames(tmp), + geneid=rownames(tmp)) + data.fit <- pamr.train(data) + data.cv <- pamr.cv(data.fit, data) + idx <- which.min(data.cv$error) + thresh <- data.cv$threshold[idx] + vars <- pamr.listgenes(data.fit,data,thresh,data.cv)[,1] + } + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + tmp <- dats_train[[nm]];orig_ct <- nrow(tmp) + tmp <- tmp[which(rownames(tmp)%in% vars),] + dats_train[[nm]] <- tmp + cat(sprintf("\t%i:pamr:%s:%i of %i left\n",rngNum,nm,nrow(tmp),orig_ct)) + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + } + + if ("clinicalArna" %in% names(combList)) + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + if ("clinicalAmir" %in% names(combList)) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + if ("clinicalAcnv" %in% names(combList)) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + if ("clinicalAdnam" %in% names(combList)) + combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) + if ("clinicalAprot" %in% names(combList)) + combList[["clinicalAprot"]] <- c(combList[["clinical"]],combList[["prot"]]) + + # END pamr UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + netDir <- sprintf("%s/networks",outDir) + + cat(sprintf("Making test nets for rng%i\n", rngNum)) + netLen <- unlist(lapply(netSets_iter,length)) + pearnet <- which(netLen>=6) + othernet <- setdiff(1:length(netSets_iter),pearnet) + netList <- c(); netList2 <- c() + if (any(othernet)) { + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[othernet],netDir, + simMetric="custom",customFunc=normDiff2, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=FALSE,numCores=numCores) + } + if (any(pearnet)) { + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[pearnet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) + } + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("Input datatype\n%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + print(table(pheno_subtype$STATUS,useNA="always")) # sanitycheck + resDir <- sprintf("%s/GM_results",pDir2) + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + # make test db + # impute + train_samp <- pheno_all$ID[which(pheno_all$TT_STATUS %in% "TRAIN")] + test_samp <- pheno_all$ID[which(pheno_all$TT_STATUS %in% "TEST")] + dats_tmp <- lapply(dats, function(x) { + missidx <- which(rowSums(is.na(x))>0) + train_idx <- which(colnames(x) %in% train_samp) + test_idx <- which(colnames(x) %in% test_samp) + for (i in missidx) { + # impute train and test separately + na_idx <- intersect(which(is.na(x[i,])),train_idx) + na_idx1 <- na_idx + x[i,na_idx] <- median(x[i,train_idx],na.rm=TRUE) + + na_idx <- intersect(which(is.na(x[i,])),test_idx) + na_idx2 <- na_idx + x[i,na_idx] <- median(x[i,test_idx],na.rm=TRUE) + } + x + }) + alldat_tmp <- do.call("rbind",dats_tmp) + netDir <- sprintf("%s/test_networks",outDir) + + netLen <- unlist(lapply(netSets_iter,length)) + pearnet <- which(netLen>=6) + othernet <- setdiff(1:length(netSets_iter),pearnet) + netList <- c(); netList2 <- c() + if (any(othernet)) { + netList <- makePSN_NamedMatrix(alldat_tmp, + rownames(alldat_tmp),netSets_iter[othernet],netDir, + simMetric="custom",customFunc=normDiff2, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=FALSE,numCores=numCores) + } + if (any(pearnet)) { + netList2 <- makePSN_NamedMatrix(alldat_tmp, + rownames(alldat_tmp),netSets_iter[pearnet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) + } + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + testdbDir <- GM_createDB(netDir, pheno_all$ID, outDir,numCores=numCores) + + # classify patients + for (cutoff in cutoffSet) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + } + } + + } # input data combinations + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) + +} From fbebed17b9811a8463554ed26121210f4402f389 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 10 May 2018 11:10:17 -0400 Subject: [PATCH 083/124] clean pamr, but will put away --- misc/PanCancer/pruneVersion/pamr/GBM_pamr.R | 1 - misc/PanCancer/pruneVersion/pamr/KIRC_pamr.R | 2 +- misc/PanCancer/pruneVersion/pamr/PanCancer_pamr.R | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/misc/PanCancer/pruneVersion/pamr/GBM_pamr.R b/misc/PanCancer/pruneVersion/pamr/GBM_pamr.R index 4c490d8b..12a48563 100644 --- a/misc/PanCancer/pruneVersion/pamr/GBM_pamr.R +++ b/misc/PanCancer/pruneVersion/pamr/GBM_pamr.R @@ -187,7 +187,6 @@ for (rngNum in 1:20) { idx <- which.min(data.cv$error) thresh <- data.cv$threshold[idx] keepgenes <- pamr.listgenes(data.fit,data,thresh,data.cv) -if (nm %in% "mir") browser() cat(sprintf("%i:%s:PAMR thresh=%1.2f (idx=%i); %i left\n", rngNum, nm,thresh,idx,length(keepgenes[,1]))) diff --git a/misc/PanCancer/pruneVersion/pamr/KIRC_pamr.R b/misc/PanCancer/pruneVersion/pamr/KIRC_pamr.R index a7434df1..9caa2e57 100644 --- a/misc/PanCancer/pruneVersion/pamr/KIRC_pamr.R +++ b/misc/PanCancer/pruneVersion/pamr/KIRC_pamr.R @@ -191,7 +191,7 @@ for (rngNum in 1:20) { set.seed(123); # reproducible data.fit <- pamr.train(data) data.cv <- pamr.cv(data.fit, data) - thresh <- data.cv$threshold[which.min(data.cv$threshold)] + thresh <- data.cv$threshold[which.min(data.cv$error)] keepgenes <- pamr.listgenes(data.fit,data,thresh,data.cv) tmp <- dats_train[[nm]];orig_ct <- nrow(tmp) diff --git a/misc/PanCancer/pruneVersion/pamr/PanCancer_pamr.R b/misc/PanCancer/pruneVersion/pamr/PanCancer_pamr.R index c7e8afad..8e0b12ea 100644 --- a/misc/PanCancer/pruneVersion/pamr/PanCancer_pamr.R +++ b/misc/PanCancer/pruneVersion/pamr/PanCancer_pamr.R @@ -108,6 +108,7 @@ for (rngNum in rngVals) { cat(sprintf("\t%i:pamr:%s:%i of %i left\n",rngNum,nm,nrow(tmp),orig_ct)) netSets_iter[[nm]] <- rownames(dats_train[[nm]]) } + netSets_iter[["clinical"]] <- rownames(dats_train[["clinical"]]) if ("clinicalArna" %in% names(combList)) combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) From 8e82c28535e62f9c946b127535fd465711fb6a8f Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 10 May 2018 11:10:47 -0400 Subject: [PATCH 084/124] not needed for ms, maybeuseful for later --- .../pruneVersion/pamr/{ => needs_to_be_rechecked}/GBM_pamr.R | 0 .../pamr/{ => needs_to_be_rechecked}/GBM_pamr_genes_incClin.R | 0 .../pruneVersion/pamr/{ => needs_to_be_rechecked}/KIRC_pamr.R | 0 .../pamr/{ => needs_to_be_rechecked}/KIRC_pamr_pipeline.R | 0 .../pruneVersion/pamr/{ => needs_to_be_rechecked}/LUSC_pamr.R | 0 .../pamr/{ => needs_to_be_rechecked}/LUSC_pamr_genes.R | 0 .../pamr/{ => needs_to_be_rechecked}/LUSC_pamr_genes_sp1.R | 0 .../pruneVersion/pamr/{ => needs_to_be_rechecked}/OV_pamr.R | 0 .../pamr/{ => needs_to_be_rechecked}/PanCancer_pamr.R | 0 9 files changed, 0 insertions(+), 0 deletions(-) rename misc/PanCancer/pruneVersion/pamr/{ => needs_to_be_rechecked}/GBM_pamr.R (100%) rename misc/PanCancer/pruneVersion/pamr/{ => needs_to_be_rechecked}/GBM_pamr_genes_incClin.R (100%) rename misc/PanCancer/pruneVersion/pamr/{ => needs_to_be_rechecked}/KIRC_pamr.R (100%) rename misc/PanCancer/pruneVersion/pamr/{ => needs_to_be_rechecked}/KIRC_pamr_pipeline.R (100%) rename misc/PanCancer/pruneVersion/pamr/{ => needs_to_be_rechecked}/LUSC_pamr.R (100%) rename misc/PanCancer/pruneVersion/pamr/{ => needs_to_be_rechecked}/LUSC_pamr_genes.R (100%) rename misc/PanCancer/pruneVersion/pamr/{ => needs_to_be_rechecked}/LUSC_pamr_genes_sp1.R (100%) rename misc/PanCancer/pruneVersion/pamr/{ => needs_to_be_rechecked}/OV_pamr.R (100%) rename misc/PanCancer/pruneVersion/pamr/{ => needs_to_be_rechecked}/PanCancer_pamr.R (100%) diff --git a/misc/PanCancer/pruneVersion/pamr/GBM_pamr.R b/misc/PanCancer/pruneVersion/pamr/needs_to_be_rechecked/GBM_pamr.R similarity index 100% rename from misc/PanCancer/pruneVersion/pamr/GBM_pamr.R rename to misc/PanCancer/pruneVersion/pamr/needs_to_be_rechecked/GBM_pamr.R diff --git a/misc/PanCancer/pruneVersion/pamr/GBM_pamr_genes_incClin.R b/misc/PanCancer/pruneVersion/pamr/needs_to_be_rechecked/GBM_pamr_genes_incClin.R similarity index 100% rename from misc/PanCancer/pruneVersion/pamr/GBM_pamr_genes_incClin.R rename to misc/PanCancer/pruneVersion/pamr/needs_to_be_rechecked/GBM_pamr_genes_incClin.R diff --git a/misc/PanCancer/pruneVersion/pamr/KIRC_pamr.R b/misc/PanCancer/pruneVersion/pamr/needs_to_be_rechecked/KIRC_pamr.R similarity index 100% rename from misc/PanCancer/pruneVersion/pamr/KIRC_pamr.R rename to misc/PanCancer/pruneVersion/pamr/needs_to_be_rechecked/KIRC_pamr.R diff --git a/misc/PanCancer/pruneVersion/pamr/KIRC_pamr_pipeline.R b/misc/PanCancer/pruneVersion/pamr/needs_to_be_rechecked/KIRC_pamr_pipeline.R similarity index 100% rename from misc/PanCancer/pruneVersion/pamr/KIRC_pamr_pipeline.R rename to misc/PanCancer/pruneVersion/pamr/needs_to_be_rechecked/KIRC_pamr_pipeline.R diff --git a/misc/PanCancer/pruneVersion/pamr/LUSC_pamr.R b/misc/PanCancer/pruneVersion/pamr/needs_to_be_rechecked/LUSC_pamr.R similarity index 100% rename from misc/PanCancer/pruneVersion/pamr/LUSC_pamr.R rename to misc/PanCancer/pruneVersion/pamr/needs_to_be_rechecked/LUSC_pamr.R diff --git a/misc/PanCancer/pruneVersion/pamr/LUSC_pamr_genes.R b/misc/PanCancer/pruneVersion/pamr/needs_to_be_rechecked/LUSC_pamr_genes.R similarity index 100% rename from misc/PanCancer/pruneVersion/pamr/LUSC_pamr_genes.R rename to misc/PanCancer/pruneVersion/pamr/needs_to_be_rechecked/LUSC_pamr_genes.R diff --git a/misc/PanCancer/pruneVersion/pamr/LUSC_pamr_genes_sp1.R b/misc/PanCancer/pruneVersion/pamr/needs_to_be_rechecked/LUSC_pamr_genes_sp1.R similarity index 100% rename from misc/PanCancer/pruneVersion/pamr/LUSC_pamr_genes_sp1.R rename to misc/PanCancer/pruneVersion/pamr/needs_to_be_rechecked/LUSC_pamr_genes_sp1.R diff --git a/misc/PanCancer/pruneVersion/pamr/OV_pamr.R b/misc/PanCancer/pruneVersion/pamr/needs_to_be_rechecked/OV_pamr.R similarity index 100% rename from misc/PanCancer/pruneVersion/pamr/OV_pamr.R rename to misc/PanCancer/pruneVersion/pamr/needs_to_be_rechecked/OV_pamr.R diff --git a/misc/PanCancer/pruneVersion/pamr/PanCancer_pamr.R b/misc/PanCancer/pruneVersion/pamr/needs_to_be_rechecked/PanCancer_pamr.R similarity index 100% rename from misc/PanCancer/pruneVersion/pamr/PanCancer_pamr.R rename to misc/PanCancer/pruneVersion/pamr/needs_to_be_rechecked/PanCancer_pamr.R From 8076ff8beca41e7aace998e31ee38a004ff2a67c Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Fri, 11 May 2018 07:10:47 -0400 Subject: [PATCH 085/124] cleaner versions of base --- misc/PanCancer/noPrune/GBM_noPrune_pipeline.R | 118 ++++++++ .../PanCancer/noPrune/KIRC_noPrune_pipeline.R | 121 ++++++++ .../PanCancer/noPrune/LUSC_noPrune_pipeline.R | 119 ++++++++ misc/PanCancer/noPrune/OV_noPrune_pipeline.R | 113 ++++++++ misc/PanCancer/noPrune/PanCancer_noPrune.R | 236 ++++++++++++++++ .../noPrune/PanCancer_noPrune_impute.R | 264 ++++++++++++++++++ 6 files changed, 971 insertions(+) create mode 100644 misc/PanCancer/noPrune/GBM_noPrune_pipeline.R create mode 100644 misc/PanCancer/noPrune/KIRC_noPrune_pipeline.R create mode 100644 misc/PanCancer/noPrune/LUSC_noPrune_pipeline.R create mode 100644 misc/PanCancer/noPrune/OV_noPrune_pipeline.R create mode 100644 misc/PanCancer/noPrune/PanCancer_noPrune.R create mode 100644 misc/PanCancer/noPrune/PanCancer_noPrune_impute.R diff --git a/misc/PanCancer/noPrune/GBM_noPrune_pipeline.R b/misc/PanCancer/noPrune/GBM_noPrune_pipeline.R new file mode 100644 index 00000000..9079ac26 --- /dev/null +++ b/misc/PanCancer/noPrune/GBM_noPrune_pipeline.R @@ -0,0 +1,118 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/noprune_impute_sp0.3_%s",outRoot,dt) +cat(megaDir, file="test.txt",append=TRUE) + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} +rm(pname) + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +pheno_all <- pheno + +# cleanup +rm(pheno,pheno_nosurv) +rm(rootDir,survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_noPrune_impute.R") +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9,maxEdge=6000,spCutoff=0.3) + + diff --git a/misc/PanCancer/noPrune/KIRC_noPrune_pipeline.R b/misc/PanCancer/noPrune/KIRC_noPrune_pipeline.R new file mode 100644 index 00000000..b1f2c25a --- /dev/null +++ b/misc/PanCancer/noPrune/KIRC_noPrune_pipeline.R @@ -0,0 +1,121 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) + +inDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/input" +outRoot <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/noprune_sp0.3_%s",outRoot,dt) + + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/KIRC_clinical_core.txt",inDir), + survival=sprintf("%s/KIRC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/KIRC_mRNA_core.txt",inDir), + prot=sprintf("%s/KIRC_RPPA_core.txt",inDir), + mir=sprintf("%s/KIRC_miRNA_core.txt",inDir), + dnam=sprintf("%s/KIRC_methylation_core.txt",inDir), + cnv=sprintf("%s/KIRC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +#======transform clinical data========= +pheno$grade <- as.vector(pheno$grade) +pheno$grade[pheno$grade=="G1"] <- "G2" +pheno$grade[pheno$grade=="GX"] <- "G2" +pheno$grade <- as.factor(pheno$grade) +pheno <- pheno[, -which(colnames(pheno)=="gender")] +#====================================== + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +clinical$grade <- as.numeric(factor(clinical$grade)) +clinical$stage <- as.numeric(factor(clinical$stage)) +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) +rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_noPrune.R") +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9,maxEdge=6000,spCutoff=0.3) + + diff --git a/misc/PanCancer/noPrune/LUSC_noPrune_pipeline.R b/misc/PanCancer/noPrune/LUSC_noPrune_pipeline.R new file mode 100644 index 00000000..8719c5d8 --- /dev/null +++ b/misc/PanCancer/noPrune/LUSC_noPrune_pipeline.R @@ -0,0 +1,119 @@ +#' PanCancer binarized survival: LUSC: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design + +rm(list=ls()) + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/noPrune_sp0.3_%s",outRoot,dt) + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), + survival=sprintf("%s/LUSC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), + prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), + mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), + cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno +rownames(clinical) <- clinical[,1]; +# ======================= +# LUSC-specific variables +clinical$stage <- as.vector(clinical$stage) +clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" +clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" +clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" +clinical$stage <- as.factor(clinical$stage) +clinical <- clinical[, -which(colnames(clinical)=="gender")] +clinical <- t(clinical[,c("age","stage")]) +clinical[1,] <- as.integer(clinical[1,]) +clinical[2,] <- as.integer(as.factor(clinical[2,])) +class(clinical) <- "numeric" +# ======================= +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + if (nm == "rna") tmp <- log(tmp+1) + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno) +rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_noPrune.R") +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=8,maxEdge=6000,spCutoff=0.3) diff --git a/misc/PanCancer/noPrune/OV_noPrune_pipeline.R b/misc/PanCancer/noPrune/OV_noPrune_pipeline.R new file mode 100644 index 00000000..ce581db3 --- /dev/null +++ b/misc/PanCancer/noPrune/OV_noPrune_pipeline.R @@ -0,0 +1,113 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) + + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/input" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/noprune_sp0.3_%s",outRoot,dt) + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/OV_clinical_core.txt",inDir), + survival=sprintf("%s/OV_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/OV_mRNA_core.txt",inDir), + prot=sprintf("%s/OV_RPPA_core.txt",inDir), + mir=sprintf("%s/OV_miRNA_core.txt",inDir), + dnam=sprintf("%s/OV_methylation_core.txt",inDir), + cnv=sprintf("%s/OV_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clin <- pheno +rownames(clin) <- clin[,1]; +clin <- t(clin[,2,drop=FALSE]) +dats$clinical <- clin; rm(clin) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID), drop = FALSE]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx, drop = FALSE] + x +}) + + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) +rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_noPrune.R") +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=8,maxEdge=6000,spCutoff=0.3) + + + + diff --git a/misc/PanCancer/noPrune/PanCancer_noPrune.R b/misc/PanCancer/noPrune/PanCancer_noPrune.R new file mode 100644 index 00000000..31ed79dd --- /dev/null +++ b/misc/PanCancer/noPrune/PanCancer_noPrune.R @@ -0,0 +1,236 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design + +# ---------------------------------------------------------------- +# helper functions +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ---------------------------------------------------------------- +runPredictor <- function(mega_combList,rngVals,netSets,dats,pheno_all,megaDir, + cutoffSet,maxEdge,spCutoff) { + +require(netDx) +require(netDx.examples) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +cutoff <- 9 +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +# first loop - over train/test splits +for (rngNum in rngVals) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + alldat_train <- do.call("rbind",dats_train) + netSets_iter <- netSets + + netDir <- sprintf("%s/networks",outDir) + cat(sprintf("Making test nets for rng%i\n", rngNum)) + netLen <- unlist(lapply(netSets_iter,length)) + pearnet <- which(netLen>=6) + othernet <- setdiff(1:length(netSets_iter),pearnet) + netList <- c(); netList2 <- c() + if (any(othernet)) { + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[othernet],netDir, + simMetric="custom",customFunc=normDiff2, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=spCutoff, + sparsify_edgeMax=maxEdge, + verbose=FALSE,numCores=numCores) + } + if (any(pearnet)) { + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[pearnet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) + } + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + netDir <- sprintf("%s/test_networks",outDir) + netLen <- unlist(lapply(netSets_iter,length)) + pearnet <- which(netLen>=6) + othernet <- setdiff(1:length(netSets_iter),pearnet) + netList <- c(); netList2 <- c() + alldat <- do.call("rbind",dats) + if (any(othernet)) { + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[othernet],netDir, + simMetric="custom",customFunc=normDiff2, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=spCutoff, + sparsify_edgeMax=maxEdge, + verbose=FALSE,numCores=numCores) + } + if (any(pearnet)) { + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[pearnet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) + } + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + testdbDir <- GM_createDB(netDir, pheno_all$ID, + outDir,numCores=numCores) + + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",pDir2,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } + + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + outFile <- sprintf("%s/predictionResults.txt",pDir) + write.table(out,file=outFile,sep="\t",col=T,row=F, + quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) +} diff --git a/misc/PanCancer/noPrune/PanCancer_noPrune_impute.R b/misc/PanCancer/noPrune/PanCancer_noPrune_impute.R new file mode 100644 index 00000000..e6570d38 --- /dev/null +++ b/misc/PanCancer/noPrune/PanCancer_noPrune_impute.R @@ -0,0 +1,264 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design + +# ---------------------------------------------------------------- +# helper functions +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ---------------------------------------------------------------- +runPredictor <- function(mega_combList,rngVals,netSets,dats,pheno_all,megaDir, + cutoffSet,maxEdge,spCutoff) { + +require(netDx) +require(netDx.examples) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +cutoff <- 9 +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +# first loop - over train/test splits +for (rngNum in rngVals) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + # impute + dats_train <- lapply(dats_train, function(x) { + missidx <- which(rowSums(is.na(x))>0) + for (i in missidx) { + na_idx <- which(is.na(x[i,])) + x[i,na_idx] <- median(x[i,],na.rm=TRUE) + } + x + }) + alldat_train <- do.call("rbind",dats_train) + netSets_iter <- netSets + + netDir <- sprintf("%s/networks",outDir) + cat(sprintf("Making test nets for rng%i\n", rngNum)) + netLen <- unlist(lapply(netSets_iter,length)) + pearnet <- which(netLen>=6) + othernet <- setdiff(1:length(netSets_iter),pearnet) + netList <- c(); netList2 <- c() + if (any(othernet)) { + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[othernet],netDir, + simMetric="custom",customFunc=normDiff2, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=spCutoff, + sparsify_edgeMax=maxEdge, + verbose=FALSE,numCores=numCores) + } + if (any(pearnet)) { + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[pearnet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) + } + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + netDir <- sprintf("%s/test_networks",outDir) + netLen <- unlist(lapply(netSets_iter,length)) + pearnet <- which(netLen>=6) + othernet <- setdiff(1:length(netSets_iter),pearnet) + netList <- c(); netList2 <- c() + # impute + train_samp <- pheno_all$ID[which(pheno_all$TT_STATUS %in% "TRAIN")] + test_samp <- pheno_all$ID[which(pheno_all$TT_STATUS %in% "TEST")] + dats_tmp <- lapply(dats, function(x) { + missidx <- which(rowSums(is.na(x))>0) + train_idx <- which(colnames(x) %in% train_samp) + test_idx <- which(colnames(x) %in% test_samp) + for (i in missidx) { + # impute train and test separately + na_idx <- intersect(which(is.na(x[i,])),train_idx) + na_idx1 <- na_idx + x[i,na_idx] <- median(x[i,train_idx],na.rm=TRUE) + + na_idx <- intersect(which(is.na(x[i,])),test_idx) + na_idx2 <- na_idx + x[i,na_idx] <- median(x[i,test_idx],na.rm=TRUE) + } + x + }) + alldat_tmp <- do.call("rbind",dats_tmp) + if (any(othernet)) { + netList <- makePSN_NamedMatrix(alldat_tmp, + rownames(alldat_tmp),netSets_iter[othernet],netDir, + simMetric="custom",customFunc=normDiff2, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=spCutoff, + sparsify_edgeMax=maxEdge, + verbose=FALSE,numCores=numCores) + } + if (any(pearnet)) { + netList2 <- makePSN_NamedMatrix(alldat_tmp, + rownames(alldat_tmp),netSets_iter[pearnet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) + } + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + testdbDir <- GM_createDB(netDir, pheno_all$ID, + outDir,numCores=numCores) + + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",pDir2,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } + + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + outFile <- sprintf("%s/predictionResults.txt",pDir) + write.table(out,file=outFile,sep="\t",col=T,row=F, + quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) +} From a9c5127f188343e36c2c5a3d814a01328d96c902 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Fri, 11 May 2018 07:11:52 -0400 Subject: [PATCH 086/124] pearscale lim to top --- .../pearscale/GBM_topX_pearimpute.R | 121 +++++++ .../pearscale/LUSC_top_pearscale.R | 8 +- .../PanCancer_topClin_pearscale_impute.R | 308 ++++++++++++++++++ .../PanCancer_topX_pearscale_impute.R | 15 +- 4 files changed, 442 insertions(+), 10 deletions(-) create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/pearscale/GBM_topX_pearimpute.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_topClin_pearscale_impute.R diff --git a/misc/PanCancer/pruneVersion/diff_kernels/pearscale/GBM_topX_pearimpute.R b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/GBM_topX_pearimpute.R new file mode 100644 index 00000000..9a56f011 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/GBM_topX_pearimpute.R @@ -0,0 +1,121 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} +rm(pname) + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv_cont"), + clinical="clinical_cont", + mir="mir_cont", + rna="rna_cont", + cnv="cnv_cont", + dnam="dnam_cont", + clinicalArna=c("clinical_cont","rna_cont"), + clinicalAmir=c("clinical_cont","mir_cont"), + clinicalAdnam=c("clinical_cont","dnam_cont"), + all="all" +) + +pheno_all <- pheno + +# cleanup +rm(pheno,pheno_nosurv) +rm(rootDir,survStr,surv,tmp,nm,inDir,k,inFiles,datFiles) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_topX_pearscale_impute.R") +topX <- 20 +topClin <- 3 + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/pearimp_topX%i_topClin%i_%s",outRoot,topX,topClin,dt) +cat(megaDir, file="test.txt",append=TRUE) +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9,topX=topX,topClin=topClin) + + diff --git a/misc/PanCancer/pruneVersion/diff_kernels/pearscale/LUSC_top_pearscale.R b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/LUSC_top_pearscale.R index 2176b993..fc1be1ae 100644 --- a/misc/PanCancer/pruneVersion/diff_kernels/pearscale/LUSC_top_pearscale.R +++ b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/LUSC_top_pearscale.R @@ -109,11 +109,11 @@ rm(survStr,surv,tmp,nm,inDir,k,inFiles,datFiles,pname) # ----------------------------------------------------------- # run predictor -source("PanCancer_topX_pearscale_impute.R") +source("PanCancer_topClin_pearscale_impute.R") outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" dt <- format(Sys.Date(),"%y%m%d") -topX <- 20; topClin<-1 -megaDir <- sprintf("%s/pearscale_top%i_topClin%i_%s",outRoot,topX,topClin,dt) +topClin<-1 +megaDir <- sprintf("%s/pearscale_lasso_topClin%i_%s",outRoot,topClin,dt) runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, dats=dats,pheno_all=pheno_all,megaDir=megaDir, - cutoffSet=9,topX=topX,topClin=topClin) + cutoffSet=9) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_topClin_pearscale_impute.R b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_topClin_pearscale_impute.R new file mode 100644 index 00000000..e50b93b8 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_topClin_pearscale_impute.R @@ -0,0 +1,308 @@ +#' PanCancer predictor: univariate filtering by lasso + gene-level nets +#' similarity by Euclidean distance + local scaling + +# ---------------------------------------------------------------- +# helper functions +sim.pearscale <- function (dat, K = 20, alpha = 0.5) { +ztrans <- function(m) { + m <- as.matrix(m) + m2 <- apply(m,1,function(x) { (x-mean(x,na.rm=T))/sd(x,na.rm=T)}) + m2 +} +normalize <- function(X) { + print(dim(X)) + row.sum.mdiag <- rowSums(X,na.rm=T) - diag(X) + row.sum.mdiag[row.sum.mdiag == 0] <- 1 + X <- X/(2 * (row.sum.mdiag)) + diag(X) <- 0.5 + X <- (X+t(X))/2 + return(X) +} + + if (nrow(dat)<6) { + z1 <- ztrans(dat) + euc <- as.matrix(dist(z1,method="euclidean"))^(1/2) + } else { + euc <- as.matrix(1-cor(dat,method="pearson",use="pairwise.complete.obs")) + } + N <- nrow(euc) + euc <- (euc + t(euc))/2 + sortedColumns <- as.matrix(t(apply(euc, 2, sort,na.last=TRUE))) + print(dim(sortedColumns)) + finiteMean <- function(x) { + return(mean(x[is.finite(x)],na.rm=T)) + } + means <- apply(sortedColumns[, 1:K + 1], 1, finiteMean) + + .Machine$double.eps + avg <- function(x, y) { + return((x + y)/2) + } + Sig <- outer(means, means, avg)/3 * 2 + euc/3 + .Machine$double.eps + Sig[Sig <= .Machine$double.eps] <- .Machine$double.eps + densities <- dnorm(euc, 0, alpha * Sig, log = FALSE) + + W <- (densities + t(densities))/2 + W <- normalize(W) + # remove patients with no datapoints (full column/row of NAs) + idx <- which(rowSums(is.na(euc))==ncol(W)-1) + if (any(idx)) { + W <- W[-idx,] + idx <- which(colSums(is.na(euc))==ncol(W)-1) + W <- W[,-idx] + } + return(W) +} + +# ---------------------------------------------------------------- +runPredictor <- function(mega_combList,rngVals,netSets,dats,pheno_all,megaDir, + cutoffSet) { +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +maxEdge <- 6000 ### max edge after sparsification + +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +alldat <- do.call("rbind",dats) + +for (rngNum in rngVals) { + combList <- mega_combList # clean slate + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir), + sep="\t",col=T,row=F,quote=F) + + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + + # impute + dats_train <- lapply(dats_train, function(x) { + missidx <- which(rowSums(is.na(x))>0) + for (i in missidx) { + na_idx <- which(is.na(x[i,])) + x[i,na_idx] <- median(x[i,],na.rm=TRUE) + } + x + }) + + # lasso + for (nm in names(dats_train)) { + print(nm) + if (nrow(dats_train[[nm]])<2) # clinical only has one var, take it. + vars <- rownames(dats_train[[nm]]) + else { + fit <- cv.glmnet(x=t(dats_train[[nm]]), + y=factor(pheno$STATUS), family="binomial", alpha=1) # lasso + wt <- abs(coef(fit,s="lambda.min")[,1]) + wt <- wt[-which(names(wt) %in% "(Intercept)")] + wt <- wt[order(wt,decreasing=TRUE)] + vars <- names(wt)[which(wt>.Machine$double.eps)] + if (nm %in% "clinical" & length(wt)>1) { + vars <- names(wt)[1] + cat(sprintf("Rng %i:keeping top clin: %s",rngNum,vars)) + } + } + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { netSets_iter[[k]] <- k } + combList[[nm]] <- sprintf("%s_cont", rownames(tmp)) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + combList[[nm]] <- sprintf("%s_cont",nm) + } + } + + if ("clinicalArna" %in% names(combList)) + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + if ("clinicalAmir" %in% names(combList)) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + if ("clinicalAcnv" %in% names(combList)) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + if ("clinicalAdnam" %in% names(combList)) + combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) + if ("clinicalAprot" %in% names(combList)) + combList[["clinicalAprot"]] <- c(combList[["clinical"]],combList[["prot"]]) + + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + netDir <- sprintf("%s/networks",outDir) + + cat(sprintf("Making test nets for rng%i\n", rngNum)) + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter,netDir, + simMetric="custom",customFunc=sim.pearscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=FALSE,numCores=numCores) + cat(sprintf("Total of %i nets\n", length(netList))) + + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("Input datatype\n%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + print(table(pheno_subtype$STATUS,useNA="always")) # sanitycheck + resDir <- sprintf("%s/GM_results",pDir2) + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + # make test db + # impute + train_samp <- pheno_all$ID[which(pheno_all$TT_STATUS %in% "TRAIN")] + test_samp <- pheno_all$ID[which(pheno_all$TT_STATUS %in% "TEST")] + dats_tmp <- lapply(dats, function(x) { + missidx <- which(rowSums(is.na(x))>0) + train_idx <- which(colnames(x) %in% train_samp) + test_idx <- which(colnames(x) %in% test_samp) + for (i in missidx) { + # impute train and test separately + na_idx <- intersect(which(is.na(x[i,])),train_idx) + na_idx1 <- na_idx + x[i,na_idx] <- median(x[i,train_idx],na.rm=TRUE) + + na_idx <- intersect(which(is.na(x[i,])),test_idx) + na_idx2 <- na_idx + x[i,na_idx] <- median(x[i,test_idx],na.rm=TRUE) + } + x + }) + alldat_tmp <- do.call("rbind",dats_tmp) + + netDir <- sprintf("%s/test_networks",outDir) + netList <- makePSN_NamedMatrix(alldat_tmp, + rownames(alldat_tmp),netSets_iter,netDir, + simMetric="custom",customFunc=sim.pearscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=TRUE,numCores=numCores) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(netDir, pheno_all$ID, outDir,numCores=numCores) + + # classify patients + for (cutoff in cutoffSet) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + } + } + + } # input data combinations + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) + +} diff --git a/misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_topX_pearscale_impute.R b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_topX_pearscale_impute.R index d134478b..04b2ce52 100644 --- a/misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_topX_pearscale_impute.R +++ b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_topX_pearscale_impute.R @@ -55,7 +55,7 @@ normalize <- function(X) { # ---------------------------------------------------------------- runPredictor <- function(mega_combList,rngVals,netSets,dats,pheno_all,megaDir, - cutoffSet,topX=10,topClin) { + cutoffSet,topX,topClin) { require(netDx) require(netDx.examples) require(glmnet) @@ -111,13 +111,16 @@ for (rngNum in rngVals) { vars <- rownames(dats_train[[nm]]) else { fit <- cv.glmnet(x=t(dats_train[[nm]]), - y=factor(pheno$STATUS), family="binomial", alpha=0) # ridge + y=factor(pheno$STATUS), family="binomial", alpha=0.5) # elastic net wt <- abs(coef(fit,s="lambda.min")[,1]) - wt <- wt[order(wt,decreasing=TRUE)] wt <- wt[-which(names(wt) %in% "(Intercept)")] - if (nm %in% "clinical") maxind <- min(topClin,length(wt)-1) - else maxind <- min(topX,length(wt)-1) - vars <- names(wt[1:maxind]) + wt <- wt[order(wt,decreasing=TRUE)] + + # limit to topX or topClin variables + if (nm %in% "clinical") { wt <- wt[1:min(topClin,length(wt))]} + else { wt <- wt[1:min(topX,length(wt))] } + + vars <- names(wt)[which(wt>.Machine$double.eps)] } cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) if (length(vars)>0) { From b93317577c7ae033f0e3986bda85c70f5ec4fcf2 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Fri, 11 May 2018 07:12:36 -0400 Subject: [PATCH 087/124] euscale lim to top only --- .../eucscale/GBM_topX_eucimpute.R | 121 +++++++ .../eucscale/PanCancer_topX_eucscale_impute.R | 310 ++++++++++++++++++ 2 files changed, 431 insertions(+) create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/eucscale/GBM_topX_eucimpute.R create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/eucscale/PanCancer_topX_eucscale_impute.R diff --git a/misc/PanCancer/pruneVersion/diff_kernels/eucscale/GBM_topX_eucimpute.R b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/GBM_topX_eucimpute.R new file mode 100644 index 00000000..969a6d06 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/GBM_topX_eucimpute.R @@ -0,0 +1,121 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} +rm(pname) + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv_cont"), + clinical="clinical_cont", + mir="mir_cont", + rna="rna_cont", + cnv="cnv_cont", + dnam="dnam_cont", + clinicalArna=c("clinical_cont","rna_cont"), + clinicalAmir=c("clinical_cont","mir_cont"), + clinicalAdnam=c("clinical_cont","dnam_cont"), + all="all" +) + +pheno_all <- pheno + +# cleanup +rm(pheno,pheno_nosurv) +rm(rootDir,survStr,surv,tmp,nm,inDir,k,inFiles,datFiles) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_topX_eucscale_impute.R") +topX <- 50 +topClin <- 3 + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/eucimp_topX%i_topClin%i_%s",outRoot,topX,topClin,dt) +cat(megaDir, file="test.txt",append=TRUE) +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9,topX=topX,topClin=topClin) + + diff --git a/misc/PanCancer/pruneVersion/diff_kernels/eucscale/PanCancer_topX_eucscale_impute.R b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/PanCancer_topX_eucscale_impute.R new file mode 100644 index 00000000..7fac8e65 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/PanCancer_topX_eucscale_impute.R @@ -0,0 +1,310 @@ +#' PanCancer predictor: univariate filtering by lasso + gene-level nets +#' similarity by Euclidean distance + local scaling + +# ---------------------------------------------------------------- +# helper functions + +# SNF similarity method. Euclidean distance followed by exponential +# scaling where sigma is tuned based on local data structure. +sim.eucscale <- function (dat, K = 20, alpha = 0.5) { +ztrans <- function(m) { + m <- as.matrix(m) + m2 <- apply(m,1,function(x) { (x-mean(x,na.rm=T))/sd(x,na.rm=T)}) + m2 +} +normalize <- function(X) { + print(dim(X)) + row.sum.mdiag <- rowSums(X,na.rm=T) - diag(X) + row.sum.mdiag[row.sum.mdiag == 0] <- 1 + X <- X/(2 * (row.sum.mdiag)) + diag(X) <- 0.5 + X <- (X+t(X))/2 + return(X) +} + nnodata <- which(abs(colSums(dat,na.rm=T)) < .Machine$double.eps) + z1 <- ztrans(dat) + euc <- as.matrix(dist(z1,method="euclidean"))^(1/2) + N <- nrow(euc) + euc <- (euc + t(euc))/2 + sortedColumns <- as.matrix(t(apply(euc, 2, sort,na.last=TRUE))) + print(dim(sortedColumns)) + finiteMean <- function(x) { + return(mean(x[is.finite(x)],na.rm=T)) + } + means <- apply(sortedColumns[, 1:K + 1], 1, finiteMean) + + .Machine$double.eps + avg <- function(x, y) { + return((x + y)/2) + } + Sig <- outer(means, means, avg)/3 * 2 + euc/3 + .Machine$double.eps + Sig[Sig <= .Machine$double.eps] <- .Machine$double.eps + densities <- dnorm(euc, 0, alpha * Sig, log = FALSE) + + W <- (densities + t(densities))/2 + W <- normalize(W) +### # remove patients with no datapoints (full column/row of NAs) +### idx <- which(rowSums(is.na(euc))==ncol(W)-1) +### if (any(idx)) { +### W <- W[-idx,] +### idx <- which(colSums(is.na(euc))==ncol(W)-1) +### W <- W[,-idx] +### } + return(W) +} + +# ---------------------------------------------------------------- +runPredictor <- function(mega_combList,rngVals,netSets,dats,pheno_all,megaDir, + cutoffSet,topX,topClin) { +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +maxEdge <- 6000 ### max edge after sparsification + +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +alldat <- do.call("rbind",dats) + +for (rngNum in rngVals) { + combList <- mega_combList # clean slate + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir), + sep="\t",col=T,row=F,quote=F) + + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + + # impute + dats_train <- lapply(dats_train, function(x) { + missidx <- which(rowSums(is.na(x))>0) + for (i in missidx) { + na_idx <- which(is.na(x[i,])) + x[i,na_idx] <- median(x[i,],na.rm=TRUE) + } + x + }) + + # lasso + for (nm in names(dats_train)) { + print(nm) + if (nrow(dats_train[[nm]])<2) # clinical only has one var, take it. + vars <- rownames(dats_train[[nm]]) + else { + fit <- cv.glmnet(x=t(dats_train[[nm]]), + y=factor(pheno$STATUS), family="binomial", alpha=0.5) # lasso + wt <- abs(coef(fit,s="lambda.min")[,1]) + wt <- wt[-which(names(wt) %in% "(Intercept)")] + wt <- wt[order(wt,decreasing=TRUE)] + + # limit to topX or topClin variables + if (nm %in% "clinical") { wt <- wt[1:min(topClin,length(wt))]} + else { wt <- wt[1:min(topX,length(wt))] } + + vars <- names(wt)[which(wt>.Machine$double.eps)] + } + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { netSets_iter[[k]] <- k } + combList[[nm]] <- sprintf("%s_cont", rownames(tmp)) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + combList[[nm]] <- sprintf("%s_cont",nm) + } + } + + if ("clinicalArna" %in% names(combList)) + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + if ("clinicalAmir" %in% names(combList)) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + if ("clinicalAcnv" %in% names(combList)) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + if ("clinicalAdnam" %in% names(combList)) + combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) + if ("clinicalAprot" %in% names(combList)) + combList[["clinicalAprot"]] <- c(combList[["clinical"]],combList[["prot"]]) + + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + netDir <- sprintf("%s/networks",outDir) + + cat(sprintf("Making test nets for rng%i\n", rngNum)) + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter,netDir, + simMetric="custom",customFunc=sim.eucscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=FALSE,numCores=numCores) + cat(sprintf("Total of %i nets\n", length(netList))) + + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("Input datatype\n%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + print(table(pheno_subtype$STATUS,useNA="always")) # sanitycheck + resDir <- sprintf("%s/GM_results",pDir2) + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + # make test db + # impute + train_samp <- pheno_all$ID[which(pheno_all$TT_STATUS %in% "TRAIN")] + test_samp <- pheno_all$ID[which(pheno_all$TT_STATUS %in% "TEST")] + dats_tmp <- lapply(dats, function(x) { + missidx <- which(rowSums(is.na(x))>0) + train_idx <- which(colnames(x) %in% train_samp) + test_idx <- which(colnames(x) %in% test_samp) + for (i in missidx) { + # impute train and test separately + na_idx <- intersect(which(is.na(x[i,])),train_idx) + na_idx1 <- na_idx + x[i,na_idx] <- median(x[i,train_idx],na.rm=TRUE) + + na_idx <- intersect(which(is.na(x[i,])),test_idx) + na_idx2 <- na_idx + x[i,na_idx] <- median(x[i,test_idx],na.rm=TRUE) + } + x + }) + alldat_tmp <- do.call("rbind",dats_tmp) + + netDir <- sprintf("%s/test_networks",outDir) + netList <- makePSN_NamedMatrix(alldat_tmp, + rownames(alldat_tmp),netSets_iter,netDir, + simMetric="custom",customFunc=sim.eucscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=TRUE,numCores=numCores) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(netDir, pheno_all$ID, outDir,numCores=numCores) + + # classify patients + for (cutoff in cutoffSet) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + } + } + + } # input data combinations + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) + +} From 51842a4128ca1d8402d4b5a8a2ad414d3ff926b1 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Fri, 11 May 2018 07:13:25 -0400 Subject: [PATCH 088/124] border case where only edge for patient is below cutoff. Set to cutoff and include --- netDx/R/sparsify2.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/netDx/R/sparsify2.R b/netDx/R/sparsify2.R index f4fb6cc5..3afb1f08 100644 --- a/netDx/R/sparsify2.R +++ b/netDx/R/sparsify2.R @@ -49,6 +49,10 @@ sparsify2 <- function(W, outFile="tmp.txt",cutoff=0.3,maxInt=50,EDGE_MAX=1000, length(missing))) for (k in missing) { # add the strongest edge for the patient tmp <- x[[k]] + if (is.na(tmp[1])) { + cat("\tMissing edge is below cutoff; setting to cutoff\n") + tmp[1] <- cutoff + } mmat <- rbind(mmat, c(k, names(tmp)[1],tmp[1])) } } From 13cfaec2dbe3607ac7c191d7b2f5794978474e15 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Fri, 11 May 2018 07:13:54 -0400 Subject: [PATCH 089/124] older --- .../PanCancer_topX_pearscale_impute.R | 311 ++++++++++++++++++ 1 file changed, 311 insertions(+) create mode 100644 misc/PanCancer/pruneVersion/diff_kernels/pearscale/outdated/PanCancer_topX_pearscale_impute.R diff --git a/misc/PanCancer/pruneVersion/diff_kernels/pearscale/outdated/PanCancer_topX_pearscale_impute.R b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/outdated/PanCancer_topX_pearscale_impute.R new file mode 100644 index 00000000..2647b637 --- /dev/null +++ b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/outdated/PanCancer_topX_pearscale_impute.R @@ -0,0 +1,311 @@ +#' PanCancer predictor: univariate filtering by lasso + gene-level nets +#' similarity by Euclidean distance + local scaling + +# ---------------------------------------------------------------- +# helper functions +sim.pearscale <- function (dat, K = 20, alpha = 0.5) { +ztrans <- function(m) { + m <- as.matrix(m) + m2 <- apply(m,1,function(x) { (x-mean(x,na.rm=T))/sd(x,na.rm=T)}) + m2 +} +normalize <- function(X) { + print(dim(X)) + row.sum.mdiag <- rowSums(X,na.rm=T) - diag(X) + row.sum.mdiag[row.sum.mdiag == 0] <- 1 + X <- X/(2 * (row.sum.mdiag)) + diag(X) <- 0.5 + X <- (X+t(X))/2 + return(X) +} + + if (nrow(dat)<6) { + z1 <- ztrans(dat) + euc <- as.matrix(dist(z1,method="euclidean"))^(1/2) + } else { + euc <- as.matrix(1-cor(dat,method="pearson",use="pairwise.complete.obs")) + } + N <- nrow(euc) + euc <- (euc + t(euc))/2 + sortedColumns <- as.matrix(t(apply(euc, 2, sort,na.last=TRUE))) + print(dim(sortedColumns)) + finiteMean <- function(x) { + return(mean(x[is.finite(x)],na.rm=T)) + } + means <- apply(sortedColumns[, 1:K + 1], 1, finiteMean) + + .Machine$double.eps + avg <- function(x, y) { + return((x + y)/2) + } + Sig <- outer(means, means, avg)/3 * 2 + euc/3 + .Machine$double.eps + Sig[Sig <= .Machine$double.eps] <- .Machine$double.eps + densities <- dnorm(euc, 0, alpha * Sig, log = FALSE) + + W <- (densities + t(densities))/2 + W <- normalize(W) + # remove patients with no datapoints (full column/row of NAs) + idx <- which(rowSums(is.na(euc))==ncol(W)-1) + if (any(idx)) { + W <- W[-idx,] + idx <- which(colSums(is.na(euc))==ncol(W)-1) + W <- W[,-idx] + } + return(W) +} + +# ---------------------------------------------------------------- +runPredictor <- function(mega_combList,rngVals,netSets,dats,pheno_all,megaDir, + cutoffSet,topClin) { +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +maxEdge <- 6000 ### max edge after sparsification + +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +alldat <- do.call("rbind",dats) + +for (rngNum in rngVals) { + combList <- mega_combList # clean slate + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir), + sep="\t",col=T,row=F,quote=F) + + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + + # impute + dats_train <- lapply(dats_train, function(x) { + missidx <- which(rowSums(is.na(x))>0) + for (i in missidx) { + na_idx <- which(is.na(x[i,])) + x[i,na_idx] <- median(x[i,],na.rm=TRUE) + } + x + }) + + # lasso + for (nm in names(dats_train)) { + print(nm) + if (nrow(dats_train[[nm]])<2) # clinical only has one var, take it. + vars <- rownames(dats_train[[nm]]) + else { + fit <- cv.glmnet(x=t(dats_train[[nm]]), + y=factor(pheno$STATUS), family="binomial", alpha=1) # lasso + wt <- abs(coef(fit,s="lambda.min")[,1]) + wt <- wt[order(wt,decreasing=TRUE)] + wt <- wt[-which(names(wt) %in% "(Intercept)")] + if (nm %in% "clinical") maxind <- min(topClin,length(wt)-1) + else { + wt <- wt[which(wt>.Machine$double.eps)] + if (length(wt)>=1) { + maxind <- length(wt) #min(topX,length(wt)-1) + } else maxind <- 0 + } + vars <- names(wt[1:maxind]) + } + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { netSets_iter[[k]] <- k } + combList[[nm]] <- sprintf("%s_cont", rownames(tmp)) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + combList[[nm]] <- sprintf("%s_cont",nm) + } + } + + if ("clinicalArna" %in% names(combList)) + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + if ("clinicalAmir" %in% names(combList)) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + if ("clinicalAcnv" %in% names(combList)) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + if ("clinicalAdnam" %in% names(combList)) + combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) + if ("clinicalAprot" %in% names(combList)) + combList[["clinicalAprot"]] <- c(combList[["clinical"]],combList[["prot"]]) + + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + netDir <- sprintf("%s/networks",outDir) + + cat(sprintf("Making test nets for rng%i\n", rngNum)) + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter,netDir, + simMetric="custom",customFunc=sim.pearscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=FALSE,numCores=numCores) + cat(sprintf("Total of %i nets\n", length(netList))) + + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("Input datatype\n%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + print(table(pheno_subtype$STATUS,useNA="always")) # sanitycheck + resDir <- sprintf("%s/GM_results",pDir2) + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + # make test db + # impute + train_samp <- pheno_all$ID[which(pheno_all$TT_STATUS %in% "TRAIN")] + test_samp <- pheno_all$ID[which(pheno_all$TT_STATUS %in% "TEST")] + dats_tmp <- lapply(dats, function(x) { + missidx <- which(rowSums(is.na(x))>0) + train_idx <- which(colnames(x) %in% train_samp) + test_idx <- which(colnames(x) %in% test_samp) + for (i in missidx) { + # impute train and test separately + na_idx <- intersect(which(is.na(x[i,])),train_idx) + na_idx1 <- na_idx + x[i,na_idx] <- median(x[i,train_idx],na.rm=TRUE) + + na_idx <- intersect(which(is.na(x[i,])),test_idx) + na_idx2 <- na_idx + x[i,na_idx] <- median(x[i,test_idx],na.rm=TRUE) + } + x + }) + alldat_tmp <- do.call("rbind",dats_tmp) + + netDir <- sprintf("%s/test_networks",outDir) + netList <- makePSN_NamedMatrix(alldat_tmp, + rownames(alldat_tmp),netSets_iter,netDir, + simMetric="custom",customFunc=sim.pearscale, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=TRUE,numCores=numCores) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(netDir, pheno_all$ID, outDir,numCores=numCores) + + # classify patients + for (cutoff in cutoffSet) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + } + } + + } # input data combinations + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) + +} From 51655480a55f5148d193f3b7bec97823315ce64a Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Fri, 11 May 2018 07:15:52 -0400 Subject: [PATCH 090/124] cleaned sets --- misc/PanCancer/multiCutoff/GBM_getRes.R | 56 ++++++++++++++---------- misc/PanCancer/multiCutoff/KIRC_getRes.R | 19 +++++--- misc/PanCancer/multiCutoff/LUSC_getRes.R | 18 ++++---- misc/PanCancer/multiCutoff/OV_getRes.R | 13 +++--- 4 files changed, 59 insertions(+), 47 deletions(-) diff --git a/misc/PanCancer/multiCutoff/GBM_getRes.R b/misc/PanCancer/multiCutoff/GBM_getRes.R index 513c6fac..ee2afa0e 100644 --- a/misc/PanCancer/multiCutoff/GBM_getRes.R +++ b/misc/PanCancer/multiCutoff/GBM_getRes.R @@ -7,16 +7,23 @@ GBM_getRes <- function() { mainD <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output" dirSet <- list( base="noPrune_180423", - ridge_fix="ridge_AbsFix_180426", - lassoGenes_sp1="lassoGenes_incClin_180426", - pamrGenes="pamrGenes_incClin_180427", - #rbf0.05="lassoUni_rbf_0.05", - #euc_1K="eucscale_sp2max1000_180503", - euc_6K="eucclean_180503", - euc_6K_group="eucscale_sp2max6000_grouped_180503", +# ridge_fix="ridge_AbsFix_180426", +# lassoGenes_sp1="lassoGenes_incClin_180426", +# pamrGenes="pamrGenes_incClin_180427", +# euc_6K="eucclean_180503", eucimpute="eucclean_impute_180507", - rbfclean="rbfclean_0.20_180507", - pearscale="pearscale_180507" + #rbfclean="rbfclean_0.20_180507", +# pearscale="pearscale_180507", + pearimpute="pearscale_impute_180508" +# pimp_40c2="pearimp_topX40_topClin2_180509", +# pimp_40c3="pearimp_topX40_topClin3_180509", +# pimp_100c3="pearimp_topX100_topClin3_180509", +# pimp_200c3="pearimp_topX200_topClin3_180509", +# pimp_20c3="pearimp_topX20_topClin3_180509", +# pimp_30c3="pearimp_topX30_topClin3_180509" +# eimp_100c3="eucimp_topX100_topClin3_180509", +# eimp_50c3="eucimp_topX50_topClin3_180509", +# eimp_20c3="eucimp_topX20_topClin3_180509" #rbf5="lassoUni_rbf_5", #rbf10="lassoUni_rbf_10" ) @@ -35,21 +42,22 @@ for (settype in settypes) { #print(dataDir) cutoff <-9 - if (any(c(grep("lasso",curdir),grep("ridge",curdir)))) { - rngDir <- paste("rng",1:18,sep="") - } else if (any(c(grep("rbf0.1",curdir)))){ - rngDir <- paste("rng",1:8,sep="") - } else if (any(c(grep("rbf0.25",curdir)))){ - rngDir <- paste("rng",1:8,sep="") - } else if (any(c(grep("euc_1K",curdir)))){ - rngDir <- paste("rng",1:12,sep="") - } else if (curdir =="euc_6K"){ - rngDir <- paste("rng",1:20,sep="") - } else if (curdir =="euc_6K_group"){ - rngDir <- paste("rng",1:20,sep="") - } else { - rngDir <- dir(path=dataDir,pattern="rng") - } +# if (any(c(grep("lasso",curdir),grep("ridge",curdir)))) { +# rngDir <- paste("rng",1:18,sep="") +# } else if (any(c(grep("rbf0.1",curdir)))){ +# rngDir <- paste("rng",1:8,sep="") +# } else if (any(c(grep("rbf0.25",curdir)))){ +# rngDir <- paste("rng",1:8,sep="") +# } else if (any(c(grep("euc_1K",curdir)))){ +# rngDir <- paste("rng",1:12,sep="") +# } else if (curdir =="euc_6K"){ +# rngDir <- paste("rng",1:20,sep="") +# } else if (any(grep("pimp",curdir))){ +# rngDir <- paste("rng",1:14,sep="") +# } else { + if (curdir=="base") rngMax<- 15 else rngMax <- 20 + rngDir <- paste("rng",1:rngMax,sep="") #dir(path=dataDir,pattern="rng") +# } numSplits[[curdir]] <- length(rngDir) cat(sprintf("Got %i rng files\n",length(rngDir))) diff --git a/misc/PanCancer/multiCutoff/KIRC_getRes.R b/misc/PanCancer/multiCutoff/KIRC_getRes.R index a74c5334..00a3e332 100644 --- a/misc/PanCancer/multiCutoff/KIRC_getRes.R +++ b/misc/PanCancer/multiCutoff/KIRC_getRes.R @@ -8,10 +8,12 @@ KIRC_getRes <- function() { mainD <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" dirSet <- list( - base="noPrune_180423", - lasso="lasso_180426", - pamr="pamr_180426", - euc6K="eucclean_180503" + base="noPrune_180423" +# lasso="lasso_180426", +# lassocl="lassoclean_180509", +# pamr="pamr_180426", +# pamrcl="pamrclean_180509", +# euc6K="eucclean_180503" # ridge="ridgeAbsFix_180426" ) @@ -25,7 +27,10 @@ dataDir <- sprintf("%s/%s",mainD,dirSet[[curdir]]) rngMax <- 20 if (any(grep("base",curdir))) { rngMax <- 15 - } +} +# } else if (any(grep("lasso",curdir))) { +# rngMax <- 16 +# } auc_set <- list() for (settype in settypes) { @@ -33,7 +38,7 @@ for (settype in settypes) { colctr <- 1 cutoff <- 9 - if (curdir=="euc6K") { + if (curdir %in% c("euc6K","lassocl","pamrcl")) { c7 <- sprintf("%s/%s/cutoff9/predictionResults.txt", rngDir,settype,cutoff) } else { @@ -59,7 +64,7 @@ colctr <- 1 } mega_auc[[curdir]] <- unlist(lapply(auc_set,mean)) } -pdf(sprintf("KIRC_%s.pdf",format(Sys.Date(),"%y%m%d"))); +pdf(sprintf("KIRC_%s.pdf",format(Sys.Date(),"%y%m%d")),width=11,height=6); boxplot(mega_auc,main="KIRC",cex.axis=1.7,cex.main=2,las=1); dev.off() return(mega_auc) diff --git a/misc/PanCancer/multiCutoff/LUSC_getRes.R b/misc/PanCancer/multiCutoff/LUSC_getRes.R index 82e8ebd2..dc568548 100644 --- a/misc/PanCancer/multiCutoff/LUSC_getRes.R +++ b/misc/PanCancer/multiCutoff/LUSC_getRes.R @@ -6,17 +6,15 @@ LUSC_getRes <- function() { mainD <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output" dirSet <- list( base="noPrune_180423", - lasso="lasso_180426", - lassoGenes="lassoGenes_180426", - pamrGenes_sp2="pamrGenes_180427", - pamrGenes_sp1="pamrGenes_sp1_180427", - rbfclean="rbfclean_0.20_180507", +# lasso="lasso_180426", +# lassoGenes="lassoGenes_180426", +# pamrGenes_sp2="pamrGenes_180427", +# pamrGenes_sp1="pamrGenes_sp1_180427", +# rbfclean="rbfclean_0.20_180507", euc6K="eucclean_180504", - eucimpute="eucscale_impute_180507", - pearscale="pearscale_180507", - ptop20c1="pearscale_top20_topClin1_180508", - ptop30c1="pearscale_top30_topClin1_180508", - ptop40c2="pearscale_top40_topClin2_180508" +# eucimpute="eucscale_impute_180507", +# pearscale="pearscale_180507", + plassoc1="pearscale_lasso_topClin1_180509" ) settypes <- c("clinical","mir","rna","prot","cnv", "clinicalArna","clinicalAmir","clinicalAprot","clinicalAcnv","all") diff --git a/misc/PanCancer/multiCutoff/OV_getRes.R b/misc/PanCancer/multiCutoff/OV_getRes.R index eb0c1437..46f44bd7 100644 --- a/misc/PanCancer/multiCutoff/OV_getRes.R +++ b/misc/PanCancer/multiCutoff/OV_getRes.R @@ -11,17 +11,18 @@ settypes <- c("clinical","mir","rna","prot","cnv","dnam", "clinicalAcnv","all") dirSet <- list( base="noPrune_180423", - prune="pruneTrain_180419", +# prune="pruneTrain_180419", lasso="lasso_180426", - euc6K="eucscale_180504", - rbfclean="rbfclean_0.2_180507" + euc6K="eucscale_180504" +# rbfclean="rbfclean_0.2_180507" ) mega_auc <- list() for (curdir in names(dirSet)) { - if (curdir %in% c("lasso","pamr","euc6K","rbfclean")) rngMax <- 20 - else if (curdir %in% "prune") rngMax <- 14 - else rngMax <- 15 +# if (curdir %in% c("lasso","pamr","euc6K","rbfclean")) rngMax <- 20 +# else if (curdir %in% "prune") rngMax <- 14 + + if (curdir=="base") rngMax <- 15 else rngMax <- 20 cat(sprintf("***** %s *****\n", curdir)) dataDir <- sprintf("%s/%s",mainD,dirSet[[curdir]]) From 4d0cc62628b74a9ceddc5d6f88ae2de2b2af3db6 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Fri, 11 May 2018 07:16:22 -0400 Subject: [PATCH 091/124] collects best-of for all tumours --- misc/PanCancer/multiCutoff/compileRes.R | 43 +++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 misc/PanCancer/multiCutoff/compileRes.R diff --git a/misc/PanCancer/multiCutoff/compileRes.R b/misc/PanCancer/multiCutoff/compileRes.R new file mode 100644 index 00000000..ed41bfdd --- /dev/null +++ b/misc/PanCancer/multiCutoff/compileRes.R @@ -0,0 +1,43 @@ +#' compile best result from all tumours +rm(list=ls()) + +.getBest <- function(d) { + d <- do.call("rbind",d) + x <- apply(d,2,which.max) + val <- apply(d,2,max) + nm <- rownames(d)[x] + z <- cbind(val,nm,names(x)) + z +} + +source("KIRC_getRes.R") +kirc <- KIRC_getRes() +kirc2 <- .getBest(kirc) +kirc2 <- cbind(kirc2,"KIRC") + +source("GBM_getRes.R") +gbm <- GBM_getRes() +gbm2 <- .getBest(gbm) +gbm2 <- cbind(gbm2,"GBM") + +source("OV_getRes.R") +ov <- OV_getRes() +ov2 <- .getBest(ov) +ov2 <- cbind(ov2,"OV") + +source("LUSC_getRes.R") +lusc <- LUSC_getRes() +lusc2 <- .getBest(lusc) +lusc2 <- cbind(lusc2,"LUSC") + +comb <- rbind(kirc2,gbm2,lusc2,ov2) +comb <- as.data.frame(comb) +rownames(comb)<- NULL +comb[,1] <- as.numeric(as.character(comb[,1])) +colnames(comb) <- c("val","method","datatype","tumour") +dt <- format(Sys.Date(),"%y%m%d") +write.table(comb,file=sprintf("netDx_bestModel_%s.txt",dt),sep="\t",col=T,row=F,quote=F) + +# convert into table form for comparison with Yuan et al. +x <- dcast(comb,tumour~datatype,value.var="val") +write.table(x,file=sprintf("netDx_perf_%s.txt",dt),sep="\t",col=T,row=F,quote=F) From 007a74b6fcdc24769b04f2855241afb56b630751 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Fri, 11 May 2018 07:17:17 -0400 Subject: [PATCH 092/124] initial write lasso only --- .../pruneVersion/lasso/KIRC_lasso_pipeline.R | 121 +++++++ .../pruneVersion/lasso/PanCancer_lasso.R | 314 ++++++++++++++++++ 2 files changed, 435 insertions(+) create mode 100644 misc/PanCancer/pruneVersion/lasso/KIRC_lasso_pipeline.R create mode 100644 misc/PanCancer/pruneVersion/lasso/PanCancer_lasso.R diff --git a/misc/PanCancer/pruneVersion/lasso/KIRC_lasso_pipeline.R b/misc/PanCancer/pruneVersion/lasso/KIRC_lasso_pipeline.R new file mode 100644 index 00000000..75efbf8b --- /dev/null +++ b/misc/PanCancer/pruneVersion/lasso/KIRC_lasso_pipeline.R @@ -0,0 +1,121 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) + +inDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/input" +outRoot <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/lassoclean_17on%s",outRoot,dt) + + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/KIRC_clinical_core.txt",inDir), + survival=sprintf("%s/KIRC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/KIRC_mRNA_core.txt",inDir), + prot=sprintf("%s/KIRC_RPPA_core.txt",inDir), + mir=sprintf("%s/KIRC_miRNA_core.txt",inDir), + dnam=sprintf("%s/KIRC_methylation_core.txt",inDir), + cnv=sprintf("%s/KIRC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +#======transform clinical data========= +pheno$grade <- as.vector(pheno$grade) +pheno$grade[pheno$grade=="G1"] <- "G2" +pheno$grade[pheno$grade=="GX"] <- "G2" +pheno$grade <- as.factor(pheno$grade) +pheno <- pheno[, -which(colnames(pheno)=="gender")] +#====================================== + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +clinical$grade <- as.numeric(factor(clinical$grade)) +clinical$stage <- as.numeric(factor(clinical$stage)) +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) +rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_lasso.R") +runPredictor(mega_combList=combList,rngVals=17:20,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9) + + diff --git a/misc/PanCancer/pruneVersion/lasso/PanCancer_lasso.R b/misc/PanCancer/pruneVersion/lasso/PanCancer_lasso.R new file mode 100644 index 00000000..6923f88e --- /dev/null +++ b/misc/PanCancer/pruneVersion/lasso/PanCancer_lasso.R @@ -0,0 +1,314 @@ +#' PanCancer predictor: univariate filtering by lasso + gene-level nets +#' similarity by Euclidean distance + local scaling + +# ---------------------------------------------------------------- +# helper functions +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ---------------------------------------------------------------- +runPredictor <- function(mega_combList,rngVals,netSets,dats,pheno_all,megaDir, + cutoffSet) { + +warning("Pipeline needs to be checked for accuracy\n") + +require(netDx) +require(netDx.examples) +require(glmnet) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +maxEdge <- 6000 ### max edge after sparsification + +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +alldat <- do.call("rbind",dats) + +for (rngNum in rngVals) { + combList <- mega_combList # clean slate + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir), + sep="\t",col=T,row=F,quote=F) + + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + netSets_iter <- list() + + # impute + dats_train <- lapply(dats_train, function(x) { + missidx <- which(rowSums(is.na(x))>0) + for (i in missidx) { + na_idx <- which(is.na(x[i,])) + x[i,na_idx] <- median(x[i,],na.rm=TRUE) + } + x + }) + + # lasso + for (nm in names(dats_train)) { + print(nm) + if (nrow(dats_train[[nm]])<2) # clinical only has one var, take it. + vars <- rownames(dats_train[[nm]]) + else { + fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=1) # lasso + wt <- abs(coef(fit,s="lambda.min")[,1]) + vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") + } + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + for (k in rownames(tmp)) { netSets_iter[[k]] <- k } + combList[[nm]] <- sprintf("%s_cont", rownames(tmp)) + } else { + # leave dats_train as is, make a single net + netSets_iter[[nm]] <- rownames(dats_train[[nm]]) + combList[[nm]] <- sprintf("%s.profile",nm) + } + } + + if ("clinicalArna" %in% names(combList)) + combList[["clinicalArna"]] <- c(combList[["clinical"]],combList[["rna"]]) + if ("clinicalAmir" %in% names(combList)) + combList[["clinicalAmir"]] <- c(combList[["clinical"]],combList[["mir"]]) + if ("clinicalAcnv" %in% names(combList)) + combList[["clinicalAcnv"]] <- c(combList[["clinical"]],combList[["cnv"]]) + if ("clinicalAdnam" %in% names(combList)) + combList[["clinicalAdnam"]] <- c(combList[["clinical"]],combList[["dnam"]]) + if ("clinicalAprot" %in% names(combList)) + combList[["clinicalAprot"]] <- c(combList[["clinical"]],combList[["prot"]]) + + # END lasso UF + # ---------------------- + alldat_train <- do.call("rbind",dats_train) + netDir <- sprintf("%s/networks",outDir) + + cat(sprintf("Making test nets for rng%i\n", rngNum)) + netLen <- unlist(lapply(netSets_iter,length)) + pearnet <- which(netLen>=6) + othernet <- setdiff(1:length(netSets_iter),pearnet) + netList <- c(); netList2 <- c() + if (any(othernet)) { + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[othernet],netDir, + simMetric="custom",customFunc=normDiff2, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=FALSE,numCores=numCores) + } + if (any(pearnet)) { + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[pearnet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) + } + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("Input datatype\n%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + print(table(pheno_subtype$STATUS,useNA="always")) # sanitycheck + resDir <- sprintf("%s/GM_results",pDir2) + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + # make test db + # impute + train_samp <- pheno_all$ID[which(pheno_all$TT_STATUS %in% "TRAIN")] + test_samp <- pheno_all$ID[which(pheno_all$TT_STATUS %in% "TEST")] + dats_tmp <- lapply(dats, function(x) { + missidx <- which(rowSums(is.na(x))>0) + train_idx <- which(colnames(x) %in% train_samp) + test_idx <- which(colnames(x) %in% test_samp) + for (i in missidx) { + # impute train and test separately + na_idx <- intersect(which(is.na(x[i,])),train_idx) + na_idx1 <- na_idx + x[i,na_idx] <- median(x[i,train_idx],na.rm=TRUE) + + na_idx <- intersect(which(is.na(x[i,])),test_idx) + na_idx2 <- na_idx + x[i,na_idx] <- median(x[i,test_idx],na.rm=TRUE) + } + x + }) + alldat_tmp <- do.call("rbind",dats_tmp) + netDir <- sprintf("%s/test_networks",outDir) + + netLen <- unlist(lapply(netSets_iter,length)) + pearnet <- which(netLen>=6) + othernet <- setdiff(1:length(netSets_iter),pearnet) + netList <- c(); netList2 <- c() + if (any(othernet)) { + netList <- makePSN_NamedMatrix(alldat_tmp, + rownames(alldat_tmp),netSets_iter[othernet],netDir, + simMetric="custom",customFunc=normDiff2, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, + sparsify_edgeMax=maxEdge, + verbose=FALSE,numCores=numCores) + } + if (any(pearnet)) { + netList2 <- makePSN_NamedMatrix(alldat_tmp, + rownames(alldat_tmp),netSets_iter[pearnet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) + } + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + testdbDir <- GM_createDB(netDir, pheno_all$ID, outDir,numCores=numCores) + + # classify patients + for (cutoff in cutoffSet) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + } + } + + } # input data combinations + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) + +} From 4d8a67725f6affef4df1c56d298e0034be813ba7 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Fri, 11 May 2018 07:18:44 -0400 Subject: [PATCH 093/124] moved tumour-specific scripts here --- .../noPrune/separate_scripts/GBM_noPrune.R | 335 +++++++++++++++++ .../separate_scripts/GBM_noPrune_noSex_Tanh.R | 334 +++++++++++++++++ .../noPrune/separate_scripts/KIRC_noPrune.R | 323 ++++++++++++++++ .../noPrune/separate_scripts/LUSC_noPrune.R | 346 ++++++++++++++++++ .../noPrune/separate_scripts/OV_noPrune.R | 303 +++++++++++++++ 5 files changed, 1641 insertions(+) create mode 100644 misc/PanCancer/noPrune/separate_scripts/GBM_noPrune.R create mode 100644 misc/PanCancer/noPrune/separate_scripts/GBM_noPrune_noSex_Tanh.R create mode 100644 misc/PanCancer/noPrune/separate_scripts/KIRC_noPrune.R create mode 100644 misc/PanCancer/noPrune/separate_scripts/LUSC_noPrune.R create mode 100644 misc/PanCancer/noPrune/separate_scripts/OV_noPrune.R diff --git a/misc/PanCancer/noPrune/separate_scripts/GBM_noPrune.R b/misc/PanCancer/noPrune/separate_scripts/GBM_noPrune.R new file mode 100644 index 00000000..e9007efe --- /dev/null +++ b/misc/PanCancer/noPrune/separate_scripts/GBM_noPrune.R @@ -0,0 +1,335 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) +require(netDx) +require(netDx.examples) +source("../runLM.R") + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/noPrune_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +# first loop - over train/test splits +for (rngNum in 1:15) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + alldat_train <- do.call("rbind",dats_train) + netSets_iter <- netSets + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } +## Create the mega database with all patients and all nets. +## This will be used to predict test samples by subsetting just for feature +## selected nets in a given round +## Note that this is useful for all train/test splits because we can always +## change which samples are query and can always subset based on which nets +## are feature selected in a given round. +netDir <- sprintf("%s/test_networks",outDir) +nonclin <- setdiff(names(netSets),"clinical") +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) +netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) +netList <- c(netList,netList2) +cat(sprintf("Total of %i nets\n", length(netList))) +# now create database +testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/noPrune/separate_scripts/GBM_noPrune_noSex_Tanh.R b/misc/PanCancer/noPrune/separate_scripts/GBM_noPrune_noSex_Tanh.R new file mode 100644 index 00000000..e9a3dfcf --- /dev/null +++ b/misc/PanCancer/noPrune/separate_scripts/GBM_noPrune_noSex_Tanh.R @@ -0,0 +1,334 @@ +#' PanCancer binarized survival: GBM: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design +#' multi cutoff evaluation +#' also pruning RNA before running + +rm(list=ls()) +require(netDx) +require(netDx.examples) +source("../runLM.R") + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" +inDir <- sprintf("%s/input",rootDir) +outRoot <- sprintf("%s/output",rootDir) + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/tanh_noSex_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +#' radial basis function +#' @param m (matrix) data, columns are patients +#' @param nm (char) kernel to use, prefix to kernlab::*dot() functions. +#' e.g. rbf,tanh,laplace +sim.kern <- function(m,nm="tanh",sigma=0.05) { + if (nm=="rbf") { + func <- kernlab::rbfdot(sigma) + cat(sprintf("Sigma = %1.2f\n", sigma)) + } else if (nm == "tanh") { + cat("using tanh\n") + func <- kernlab::tanhdot() + } + idx <- combinat::combn(1:ncol(m),2) + out <- matrix(NA,nrow=ncol(m),ncol=ncol(m)) + for (comb in 1:ncol(idx)) { + i <- idx[1,comb]; j <- idx[2,comb] + x <- func(m[,i],m[,j]) + out[i,j] <- x; out[j,i] <- x + } + diag(out) <- 1 + colnames(out)<- colnames(m); + rownames(out) <- colnames(m) + out +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/GBM_clinical_core.txt",inDir), + survival=sprintf("%s/GBM_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/GBM_mRNA_core.txt",inDir), + mir=sprintf("%s/GBM_miRNA_core.txt",inDir), + dnam=sprintf("%s/GBM_methylation_core.txt",inDir), + cnv=sprintf("%s/GBM_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" +# ------------------ + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +# ======================= +# GBM-specific variables +clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA +clinical$performance_score <- strtoi(clinical$performance_score) +clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +# ======================= +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalAcnv=c("clinical_cont","cnv.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + all="all" +) + +rm(pheno,pheno_nosurv) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) + +sink(logFile,split=TRUE) +tryCatch({ +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) + +dats$clinical <- dats$clinical[c("age","performance_score"),] + + +# first loop - over train/test splits +for (rngNum in 1:50) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=FALSE]) + alldat_train <- do.call("rbind",dats_train) + netSets_iter <- netSets + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=sim.kern, + writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } +## Create the mega database with all patients and all nets. +## This will be used to predict test samples by subsetting just for feature +## selected nets in a given round +## Note that this is useful for all train/test splits because we can always +## change which samples are query and can always subset based on which nets +## are feature selected in a given round. +netDir <- sprintf("%s/test_networks",outDir) +nonclin <- setdiff(names(netSets),"clinical") +netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) +netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=sim.kern,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) +netList <- c(netList,netList2) +cat(sprintf("Total of %i nets\n", length(netList))) +# now create database +testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/noPrune/separate_scripts/KIRC_noPrune.R b/misc/PanCancer/noPrune/separate_scripts/KIRC_noPrune.R new file mode 100644 index 00000000..e0392ca5 --- /dev/null +++ b/misc/PanCancer/noPrune/separate_scripts/KIRC_noPrune.R @@ -0,0 +1,323 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) +require(netDx) +require(netDx.examples) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +cutoff <- 9 + +inDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/input" +outRoot <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/noPrune_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/KIRC_clinical_core.txt",inDir), + survival=sprintf("%s/KIRC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/KIRC_mRNA_core.txt",inDir), + prot=sprintf("%s/KIRC_RPPA_core.txt",inDir), + mir=sprintf("%s/KIRC_miRNA_core.txt",inDir), + dnam=sprintf("%s/KIRC_methylation_core.txt",inDir), + cnv=sprintf("%s/KIRC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +#======transform clinical data========= +pheno$grade <- as.vector(pheno$grade) +pheno$grade[pheno$grade=="G1"] <- "G2" +pheno$grade[pheno$grade=="GX"] <- "G2" +pheno$grade <- as.factor(pheno$grade) +pheno <- pheno[, -which(colnames(pheno)=="gender")] +#====================================== + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno_nosurv +rownames(clinical) <- clinical[,1]; +clinical$grade <- as.numeric(factor(clinical$grade)) +clinical$stage <- as.numeric(factor(clinical$stage)) +clinical$ID <- NULL +clinical <- t(clinical) +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) + + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +# first loop - over train/test splits +for (rngNum in 1:15) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + alldat_train <- do.call("rbind",dats_train) + netSets_iter <- netSets + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2, + verbose=FALSE,numCores=numCores,writeProfiles=FALSE, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + ## pruneTrain: make test database + ## This will be used to predict test samples by subsetting just for feature + ## selected nets in a given round + ## Note that this is useful for all train/test splits because we can always + ## change which samples are query and can always subset based on which nets + ## are feature selected in a given round. + netDir <- sprintf("%s/test_networks",outDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(netDir, pheno_all$ID, + outDir,numCores=numCores) + + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",pDir2,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } + + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + outFile <- sprintf("%s/predictionResults.txt",pDir) + write.table(out,file=outFile,sep="\t",col=T,row=F, + quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/noPrune/separate_scripts/LUSC_noPrune.R b/misc/PanCancer/noPrune/separate_scripts/LUSC_noPrune.R new file mode 100644 index 00000000..aa1fd85d --- /dev/null +++ b/misc/PanCancer/noPrune/separate_scripts/LUSC_noPrune.R @@ -0,0 +1,346 @@ +#' PanCancer binarized survival: LUSC: Feature selection with one net per +#' datatype +#' 10-fold CV predictor design + +rm(list=ls()) +require(netDx) +require(netDx.examples) +source("../runLM.R") + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/noPrune_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions + +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), + survival=sprintf("%s/LUSC_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), + prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), + mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), + cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clinical <- pheno +rownames(clinical) <- clinical[,1]; +# ======================= +# LUSC-specific variables +clinical$stage <- as.vector(clinical$stage) +clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" +clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" +clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" +clinical$stage <- as.factor(clinical$stage) +clinical <- clinical[, -which(colnames(clinical)=="gender")] +clinical <- t(clinical[,c("age","stage")]) +clinical[1,] <- as.integer(clinical[1,]) +clinical[2,] <- as.integer(as.factor(clinical[2,])) +class(clinical) <- "numeric" +# ======================= +dats$clinical <- clinical; rm(clinical) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + if (nm == "rna") tmp <- log(tmp+1) + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx] + x +}) + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all" +) + +cat(sprintf("Clinical variables are: { %s }\n", + paste(rownames(dats$clinical),sep=",",collapse=","))) +rm(pheno) + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + + +# first loop - over train/test splits +for (rngNum in 1:100) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + + alldat_train <- do.call("rbind",dats_train) + netSets_iter <- netSets + + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin], + netDir,verbose=FALSE,numCores=numCores, + writeProfiles=TRUE,simMetric="pearson") + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores, + simMetric="pearson") + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("CombList = %s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + ## Create the mega database with all patients and all nets. + ## This will be used to predict test samples by subsetting just for feature + ## selected nets in a given round + ## Note that this is useful for all train/test splits because we can always + ## change which samples are query and can always subset based on which nets + ## are feature selected in a given round. + netDir <- sprintf("%s/test_networks",megaDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE, + simMetric="pearson") + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff2, + writeProfiles=FALSE, + verbose=FALSE,numCores=numCores, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + megadbDir <- GM_createDB(netDir, pheno_all$ID, + megaDir,numCores=numCores, + simMetric="pearson") + + for (cutoff in 7:9) { + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + if (length(pTally)>=1) { + curD <- sprintf("%s/cutoff%i",pDir2,cutoff) + dir.create(curD) + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",curD,g) + # only include the nets that were feature selected + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(megadbDir$dbDir,qFile,resDir=curD) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + oD <- sprintf("%s/cutoff%i",pDir,cutoff) + dir.create(oD) + outFile <- sprintf("%s/predictionResults.txt",oD) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + } else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) + } + } +} + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) diff --git a/misc/PanCancer/noPrune/separate_scripts/OV_noPrune.R b/misc/PanCancer/noPrune/separate_scripts/OV_noPrune.R new file mode 100644 index 00000000..d363f176 --- /dev/null +++ b/misc/PanCancer/noPrune/separate_scripts/OV_noPrune.R @@ -0,0 +1,303 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) +require(netDx) +require(netDx.examples) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +cutoff <- 9 + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/input" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/noPrune_%s",outRoot,dt) + +# ---------------------------------------------------------------- +# helper functions + +# normalized difference +# x is vector of values, one per patient (e.g. ages) +normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out +} + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/OV_clinical_core.txt",inDir), + survival=sprintf("%s/OV_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/OV_mRNA_core.txt",inDir), + prot=sprintf("%s/OV_RPPA_core.txt",inDir), + mir=sprintf("%s/OV_miRNA_core.txt",inDir), + dnam=sprintf("%s/OV_methylation_core.txt",inDir), + cnv=sprintf("%s/OV_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clin <- pheno +rownames(clin) <- clin[,1]; +clin <- t(clin[,2,drop=FALSE]) +dats$clinical <- clin; rm(clin) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID), drop = FALSE]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx, drop = FALSE] + x +}) + + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) + + +# ---------------------------------------------------------- +# build classifier +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +# first loop - over train/test splits +for (rngNum in 1:15) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + alldat_train <- do.call("rbind",dats_train) + netSets_iter <- netSets + netDir <- sprintf("%s/networks",outDir) + nonclin <- setdiff(names(netSets),"clinical") + + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[nonclin],netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) + + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter["clinical"], + netDir,simMetric="custom",customFunc=normDiff, + verbose=FALSE,numCores=numCores,writeProfiles=FALSE, + sparsify=TRUE,append=TRUE) + + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + # -------- + # pruneTrain: make test database + test_netDir <- sprintf("%s/test_networks",outDir) + nonclin <- setdiff(names(netSets_iter),"clinical") + ### netSets_iter has univariate filtering for curr round + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[nonclin],test_netDir, + verbose=FALSE,numCores=numCores,writeProfiles=TRUE) + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter["clinical"], + test_netDir,simMetric="custom",customFunc=normDiff, + verbose=FALSE,numCores=numCores, writeProfiles=FALSE, + sparsify=TRUE,append=TRUE) + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + # now create database + testdbDir <- GM_createDB(test_netDir, pheno_all$ID, + outDir,numCores=numCores) + + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + # pTally <- sub(".profile","",pTally) + # pTally <- sub("_cont","",pTally) + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",pDir2,g) + GM_writeQueryFile(qSamps,incNets=pTally + ,nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } + + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + outFile <- sprintf("%s/predictionResults.txt",pDir) + write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) + +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) From c5a7c2c2ef28603508771362cea21ffab17be9b8 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Fri, 11 May 2018 07:18:55 -0400 Subject: [PATCH 094/124] moved tumour-specific scripts to own folder --- misc/PanCancer/noPrune/GBM_noPrune.R | 335 ----------------- .../noPrune/GBM_noPrune_noSex_Tanh.R | 334 ----------------- misc/PanCancer/noPrune/KIRC_noPrune.R | 323 ---------------- misc/PanCancer/noPrune/LUSC_noPrune.R | 346 ------------------ misc/PanCancer/noPrune/OV_noPrune.R | 303 --------------- 5 files changed, 1641 deletions(-) delete mode 100644 misc/PanCancer/noPrune/GBM_noPrune.R delete mode 100644 misc/PanCancer/noPrune/GBM_noPrune_noSex_Tanh.R delete mode 100644 misc/PanCancer/noPrune/KIRC_noPrune.R delete mode 100644 misc/PanCancer/noPrune/LUSC_noPrune.R delete mode 100644 misc/PanCancer/noPrune/OV_noPrune.R diff --git a/misc/PanCancer/noPrune/GBM_noPrune.R b/misc/PanCancer/noPrune/GBM_noPrune.R deleted file mode 100644 index e9007efe..00000000 --- a/misc/PanCancer/noPrune/GBM_noPrune.R +++ /dev/null @@ -1,335 +0,0 @@ -#' PanCancer binarized survival: GBM: Feature selection with one net per -#' datatype -#' 10-fold CV predictor design -#' multi cutoff evaluation -#' also pruning RNA before running - -rm(list=ls()) -require(netDx) -require(netDx.examples) -source("../runLM.R") - -numCores <- 8L -GMmemory <- 4L -trainProp <- 0.8 - -rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" -inDir <- sprintf("%s/input",rootDir) -outRoot <- sprintf("%s/output",rootDir) - -dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/noPrune_%s",outRoot,dt) - -# ---------------------------------------------------------------- -# helper functions -# takes average of normdiff of each row in x -normDiff2 <- function(x) { - # normalized difference - # x is vector of values, one per patient (e.g. ages) - normDiff <- function(x) { - #if (nrow(x)>=1) x <- x[1,] - nm <- colnames(x) - x <- as.numeric(x) - n <- length(x) - rngX <- max(x,na.rm=T)-min(x,na.rm=T) - - out <- matrix(NA,nrow=n,ncol=n); - # weight between i and j is - # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) - for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) - rownames(out) <- nm; colnames(out)<- nm - out - } - - sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) - for (k in 1:nrow(x)) { - tmp <- normDiff(x[k,,drop=FALSE]) - sim <- sim + tmp - rownames(sim) <- rownames(tmp) - colnames(sim) <- colnames(tmp) - } - sim <- sim/nrow(x) - sim -} - -# ----------------------------------------------------------- -# process input -inFiles <- list( - clinical=sprintf("%s/GBM_clinical_core.txt",inDir), - survival=sprintf("%s/GBM_binary_survival.txt",inDir) - ) -datFiles <- list( - rna=sprintf("%s/GBM_mRNA_core.txt",inDir), - mir=sprintf("%s/GBM_miRNA_core.txt",inDir), - dnam=sprintf("%s/GBM_methylation_core.txt",inDir), - cnv=sprintf("%s/GBM_CNV_core.txt",inDir) -) - -pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) -colnames(pheno)[1] <- "ID" -# ------------------ - -surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) -colnames(surv)[1:2] <- c("ID","STATUS_INT") -survStr <- rep(NA,nrow(surv)) -survStr[surv$STATUS_INT<1] <- "SURVIVENO" -survStr[surv$STATUS_INT>0] <- "SURVIVEYES" -surv$STATUS <- survStr -pheno <- merge(x=pheno,y=surv,by="ID") -pheno$X <- NULL -pheno_nosurv <- pheno[1:4] - -cat("Collecting patient data:\n") -dats <- list() #input data in different slots -cat("\t* Clinical\n") -clinical <- pheno_nosurv -rownames(clinical) <- clinical[,1]; -# ======================= -# GBM-specific variables -clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA -clinical$performance_score <- strtoi(clinical$performance_score) -clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) -# ======================= -clinical$ID <- NULL -clinical <- t(clinical) -dats$clinical <- clinical; rm(clinical) - -# create master input net -for (nm in names(datFiles)) { - cat(sprintf("\t* %s\n",nm)) - tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) - if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] - rownames(tmp) <- tmp[,1] - tmp <- t(tmp[,-1]) - class(tmp) <- "numeric" - dats[[nm]] <- tmp -} - -cat("\t Ordering column names\n") -# include only data for patients in classifier -dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) -dats <- lapply(dats, function(x) { - midx <- match(pheno$ID,colnames(x)) - x <- x[,midx] - x -}) - -# confirm patient order the same for all input nets -pname <- colnames(dats[[1]]) -for (k in 2:length(dats)) { - if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { - cat(sprintf("Patient order doesn't match for %s\n", - names(dats)[k])) - browser() - } -} - - -# input nets for each category -netSets <- lapply(dats, function(x) rownames(x)) - -# compile data -alldat <- do.call("rbind",dats) -pheno_all <- pheno - -combList <- list( - clinicalAcnv=c("clinical_cont","cnv.profile"), - clinical="clinical_cont", - mir="mir.profile", - rna="rna.profile", - cnv="cnv.profile", - dnam="dnam.profile", - clinicalArna=c("clinical_cont","rna.profile"), - clinicalAmir=c("clinical_cont","mir.profile"), - clinicalAdnam=c("clinical_cont","dnam.profile"), - all="all" -) - -cat(sprintf("Clinical variables are: { %s }\n", - paste(rownames(dats$clinical),sep=",",collapse=","))) -rm(pheno,pheno_nosurv) - -# ---------------------------------------------------------- -# build classifier -if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) -dir.create(megaDir) - -logFile <- sprintf("%s/log.txt",megaDir) -sink(logFile,split=TRUE) -tryCatch({ - -# first loop - over train/test splits -for (rngNum in 1:15) { - rng_t0 <- Sys.time() - cat(sprintf("-------------------------------\n")) - cat(sprintf("RNG seed = %i\n", rngNum)) - cat(sprintf("-------------------------------\n")) - outDir <- sprintf("%s/rng%i",megaDir,rngNum) - dir.create(outDir) - - pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, - setSeed=rngNum*5) - write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", - col=T,row=F,quote=F) - # -------------------------------------------- - # feature selection - train only - pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") - dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), - drop=FALSE]) - alldat_train <- do.call("rbind",dats_train) - netSets_iter <- netSets - - netDir <- sprintf("%s/networks",outDir) - nonclin <- setdiff(names(netSets),"clinical") - netList <- makePSN_NamedMatrix(alldat_train, - rownames(alldat_train),netSets_iter[nonclin], - netDir,verbose=FALSE,numCores=numCores, - writeProfiles=TRUE) - netList2 <- makePSN_NamedMatrix(alldat_train, - rownames(alldat_train),netSets_iter["clinical"], - netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, - verbose=FALSE,numCores=numCores, - sparsify=TRUE,append=TRUE) - netList <- c(netList,netList2) - cat(sprintf("Total of %i nets\n", length(netList))) - - # now create database - dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) - - # second loop - over combinations of input data - for (cur in names(combList)) { - t0 <- Sys.time() - cat(sprintf("%s\n",cur)) - pDir <- sprintf("%s/%s",outDir, cur) - dir.create(pDir) - - # run featsel once per subtype - subtypes <- unique(pheno$STATUS) - # run 10-fold cv per subtype - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) - dir.create(pDir2) - - cat(sprintf("\n******\nSubtype %s\n",g)) - pheno_subtype <- pheno - ## label patients not in the current class as residual - nong <- which(!pheno_subtype$STATUS %in% g) - pheno_subtype$STATUS[nong] <- "nonpred" - ## sanity check - print(table(pheno_subtype$STATUS,useNA="always")) - resDir <- sprintf("%s/GM_results",pDir2) - ## query for feature selection comprises of training - ## samples from the class of interest - trainPred <- pheno_subtype$ID[ - which(pheno_subtype$STATUS %in% g)] - - # Cross validation - GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, - nrow(pheno_subtype),incNets=combList[[cur]], - verbose=T, numCores=numCores, - GMmemory=GMmemory) - - # patient similarity ranks - prank <- dir(path=resDir,pattern="PRANK$") - # network ranks - nrank <- dir(path=resDir,pattern="NRANK$") - cat(sprintf("Got %i prank files\n",length(prank))) - - # Compute network score - pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) - head(pTally) - # write to file - tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) - write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) - } -## Create the mega database with all patients and all nets. -## This will be used to predict test samples by subsetting just for feature -## selected nets in a given round -## Note that this is useful for all train/test splits because we can always -## change which samples are query and can always subset based on which nets -## are feature selected in a given round. -netDir <- sprintf("%s/test_networks",outDir) -nonclin <- setdiff(names(netSets),"clinical") -netList <- makePSN_NamedMatrix(alldat, - rownames(alldat),netSets_iter[nonclin],netDir, - verbose=FALSE,numCores=numCores,writeProfiles=TRUE) -netList2 <- makePSN_NamedMatrix(alldat, - rownames(alldat),netSets_iter["clinical"], - netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, - verbose=FALSE,numCores=numCores, - sparsify=TRUE,append=TRUE) -netList <- c(netList,netList2) -cat(sprintf("Total of %i nets\n", length(netList))) -# now create database -testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) - for (cutoff in 7:9) { - predRes <- list() - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - # get feature selected net names - pTally <- read.delim( - sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), - sep="\t",h=T,as.is=T) - - # feature selected nets pass cutoff threshold - pTally <- pTally[which(pTally[,2]>=cutoff),1] - cat(sprintf("%s: %i pathways\n",g,length(pTally))) - if (length(pTally)>=1) { - curD <- sprintf("%s/cutoff%i",pDir2,cutoff) - dir.create(curD) - # query of all training samples for this class - qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & - pheno_all$TT_STATUS%in%"TRAIN")] - - qFile <- sprintf("%s/%s_query",curD,g) - GM_writeQueryFile(qSamps,incNets=pTally, - nrow(pheno_all),qFile) - resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) - predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), - pheno_all,g) - } else { - predRes[[g]] <- NA - } - } - - oD <- sprintf("%s/cutoff%i",pDir,cutoff) - dir.create(oD) - outFile <- sprintf("%s/predictionResults.txt",oD) - if (any(is.na(predRes))) { - cat("One or more groups had zero feature selected nets\n") - cat("# no feature-selected nets.\n",file=outFile) - }else { - predClass <- GM_OneVAll_getClass(predRes) - out <- merge(x=pheno_all,y=predClass,by="ID") - write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) - - acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) - cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", - nrow(out), acc*100)) - - require(ROCR) - ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, - out$STATUS=="SURVIVEYES") - save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) - } - } - } - - #cleanup to save disk space - system(sprintf("rm -r %s/dataset %s/tmp %s/networks", - outDir,outDir,outDir)) - system(sprintf("rm -r %s/dataset %s/networks", - outDir,outDir)) - -} - pheno_all$TT_STATUS <- NA - rng_t1 <- Sys.time() - cat(sprintf("Time for one train/test split:")) - print(rng_t1-rng_t0) - -}, error=function(ex){ - print(ex) -}, finally={ - sink(NULL) -}) diff --git a/misc/PanCancer/noPrune/GBM_noPrune_noSex_Tanh.R b/misc/PanCancer/noPrune/GBM_noPrune_noSex_Tanh.R deleted file mode 100644 index e9a3dfcf..00000000 --- a/misc/PanCancer/noPrune/GBM_noPrune_noSex_Tanh.R +++ /dev/null @@ -1,334 +0,0 @@ -#' PanCancer binarized survival: GBM: Feature selection with one net per -#' datatype -#' 10-fold CV predictor design -#' multi cutoff evaluation -#' also pruning RNA before running - -rm(list=ls()) -require(netDx) -require(netDx.examples) -source("../runLM.R") - -numCores <- 8L -GMmemory <- 4L -trainProp <- 0.8 - -rootDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM" -inDir <- sprintf("%s/input",rootDir) -outRoot <- sprintf("%s/output",rootDir) - -dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/tanh_noSex_%s",outRoot,dt) - -# ---------------------------------------------------------------- -# helper functions -#' radial basis function -#' @param m (matrix) data, columns are patients -#' @param nm (char) kernel to use, prefix to kernlab::*dot() functions. -#' e.g. rbf,tanh,laplace -sim.kern <- function(m,nm="tanh",sigma=0.05) { - if (nm=="rbf") { - func <- kernlab::rbfdot(sigma) - cat(sprintf("Sigma = %1.2f\n", sigma)) - } else if (nm == "tanh") { - cat("using tanh\n") - func <- kernlab::tanhdot() - } - idx <- combinat::combn(1:ncol(m),2) - out <- matrix(NA,nrow=ncol(m),ncol=ncol(m)) - for (comb in 1:ncol(idx)) { - i <- idx[1,comb]; j <- idx[2,comb] - x <- func(m[,i],m[,j]) - out[i,j] <- x; out[j,i] <- x - } - diag(out) <- 1 - colnames(out)<- colnames(m); - rownames(out) <- colnames(m) - out -} - -# ----------------------------------------------------------- -# process input -inFiles <- list( - clinical=sprintf("%s/GBM_clinical_core.txt",inDir), - survival=sprintf("%s/GBM_binary_survival.txt",inDir) - ) -datFiles <- list( - rna=sprintf("%s/GBM_mRNA_core.txt",inDir), - mir=sprintf("%s/GBM_miRNA_core.txt",inDir), - dnam=sprintf("%s/GBM_methylation_core.txt",inDir), - cnv=sprintf("%s/GBM_CNV_core.txt",inDir) -) - -pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) -colnames(pheno)[1] <- "ID" -# ------------------ - -surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) -colnames(surv)[1:2] <- c("ID","STATUS_INT") -survStr <- rep(NA,nrow(surv)) -survStr[surv$STATUS_INT<1] <- "SURVIVENO" -survStr[surv$STATUS_INT>0] <- "SURVIVEYES" -surv$STATUS <- survStr -pheno <- merge(x=pheno,y=surv,by="ID") -pheno$X <- NULL -pheno_nosurv <- pheno[1:4] - -cat("Collecting patient data:\n") -dats <- list() #input data in different slots -cat("\t* Clinical\n") -clinical <- pheno_nosurv -rownames(clinical) <- clinical[,1]; -# ======================= -# GBM-specific variables -clinical$performance_score[which(clinical$performance_score == "[Not Available]")] <- NA -clinical$performance_score <- strtoi(clinical$performance_score) -clinical$gender <- ifelse(pheno$gender=="FEMALE",1, 0) -# ======================= -clinical$ID <- NULL -clinical <- t(clinical) -dats$clinical <- clinical; rm(clinical) - -# create master input net -for (nm in names(datFiles)) { - cat(sprintf("\t* %s\n",nm)) - tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) - if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] - rownames(tmp) <- tmp[,1] - tmp <- t(tmp[,-1]) - class(tmp) <- "numeric" - dats[[nm]] <- tmp -} - -cat("\t Ordering column names\n") -# include only data for patients in classifier -dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) -dats <- lapply(dats, function(x) { - midx <- match(pheno$ID,colnames(x)) - x <- x[,midx] - x -}) - -# confirm patient order the same for all input nets -pname <- colnames(dats[[1]]) -for (k in 2:length(dats)) { - if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { - cat(sprintf("Patient order doesn't match for %s\n", - names(dats)[k])) - browser() - } -} - -# input nets for each category -netSets <- lapply(dats, function(x) rownames(x)) - -# compile data -alldat <- do.call("rbind",dats) -pheno_all <- pheno - -combList <- list( - clinicalAcnv=c("clinical_cont","cnv.profile"), - clinical="clinical_cont", - mir="mir.profile", - rna="rna.profile", - cnv="cnv.profile", - dnam="dnam.profile", - clinicalArna=c("clinical_cont","rna.profile"), - clinicalAmir=c("clinical_cont","mir.profile"), - clinicalAdnam=c("clinical_cont","dnam.profile"), - all="all" -) - -rm(pheno,pheno_nosurv) - -# ---------------------------------------------------------- -# build classifier -if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) -dir.create(megaDir) - -logFile <- sprintf("%s/log.txt",megaDir) - -sink(logFile,split=TRUE) -tryCatch({ -cat(sprintf("Clinical variables are: { %s }\n", - paste(rownames(dats$clinical),sep=",",collapse=","))) - -dats$clinical <- dats$clinical[c("age","performance_score"),] - - -# first loop - over train/test splits -for (rngNum in 1:50) { - rng_t0 <- Sys.time() - cat(sprintf("-------------------------------\n")) - cat(sprintf("RNG seed = %i\n", rngNum)) - cat(sprintf("-------------------------------\n")) - outDir <- sprintf("%s/rng%i",megaDir,rngNum) - dir.create(outDir) - - pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, - setSeed=rngNum*5) - write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", - col=T,row=F,quote=F) - # -------------------------------------------- - # feature selection - train only - pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") - dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), - drop=FALSE]) - alldat_train <- do.call("rbind",dats_train) - netSets_iter <- netSets - - netDir <- sprintf("%s/networks",outDir) - nonclin <- setdiff(names(netSets),"clinical") - netList <- makePSN_NamedMatrix(alldat_train, - rownames(alldat_train),netSets_iter[nonclin], - netDir,verbose=FALSE,numCores=numCores, - writeProfiles=TRUE) - netList2 <- makePSN_NamedMatrix(alldat_train, - rownames(alldat_train),netSets_iter["clinical"], - netDir,simMetric="custom",customFunc=sim.kern, - writeProfiles=FALSE, - verbose=FALSE,numCores=numCores, - sparsify=TRUE,append=TRUE) - netList <- c(netList,netList2) - cat(sprintf("Total of %i nets\n", length(netList))) - - # now create database - dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) - - # second loop - over combinations of input data - for (cur in names(combList)) { - t0 <- Sys.time() - cat(sprintf("%s\n",cur)) - pDir <- sprintf("%s/%s",outDir, cur) - dir.create(pDir) - - # run featsel once per subtype - subtypes <- unique(pheno$STATUS) - # run 10-fold cv per subtype - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) - dir.create(pDir2) - - cat(sprintf("\n******\nSubtype %s\n",g)) - pheno_subtype <- pheno - ## label patients not in the current class as residual - nong <- which(!pheno_subtype$STATUS %in% g) - pheno_subtype$STATUS[nong] <- "nonpred" - ## sanity check - print(table(pheno_subtype$STATUS,useNA="always")) - resDir <- sprintf("%s/GM_results",pDir2) - ## query for feature selection comprises of training - ## samples from the class of interest - trainPred <- pheno_subtype$ID[ - which(pheno_subtype$STATUS %in% g)] - - # Cross validation - GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, - nrow(pheno_subtype),incNets=combList[[cur]], - verbose=T, numCores=numCores, - GMmemory=GMmemory) - - # patient similarity ranks - prank <- dir(path=resDir,pattern="PRANK$") - # network ranks - nrank <- dir(path=resDir,pattern="NRANK$") - cat(sprintf("Got %i prank files\n",length(prank))) - - # Compute network score - pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) - head(pTally) - # write to file - tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) - write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) - } -## Create the mega database with all patients and all nets. -## This will be used to predict test samples by subsetting just for feature -## selected nets in a given round -## Note that this is useful for all train/test splits because we can always -## change which samples are query and can always subset based on which nets -## are feature selected in a given round. -netDir <- sprintf("%s/test_networks",outDir) -nonclin <- setdiff(names(netSets),"clinical") -netList <- makePSN_NamedMatrix(alldat, - rownames(alldat),netSets_iter[nonclin],netDir, - verbose=FALSE,numCores=numCores,writeProfiles=TRUE) -netList2 <- makePSN_NamedMatrix(alldat, - rownames(alldat),netSets_iter["clinical"], - netDir,simMetric="custom",customFunc=sim.kern,writeProfiles=FALSE, - verbose=FALSE,numCores=numCores, - sparsify=TRUE,append=TRUE) -netList <- c(netList,netList2) -cat(sprintf("Total of %i nets\n", length(netList))) -# now create database -testdbDir <- GM_createDB(netDir, pheno_all$ID, megaDir,numCores=numCores) - for (cutoff in 7:9) { - predRes <- list() - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - # get feature selected net names - pTally <- read.delim( - sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), - sep="\t",h=T,as.is=T) - - # feature selected nets pass cutoff threshold - pTally <- pTally[which(pTally[,2]>=cutoff),1] - cat(sprintf("%s: %i pathways\n",g,length(pTally))) - if (length(pTally)>=1) { - curD <- sprintf("%s/cutoff%i",pDir2,cutoff) - dir.create(curD) - # query of all training samples for this class - qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & - pheno_all$TT_STATUS%in%"TRAIN")] - - qFile <- sprintf("%s/%s_query",curD,g) - GM_writeQueryFile(qSamps,incNets=pTally, - nrow(pheno_all),qFile) - resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=curD) - predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), - pheno_all,g) - } else { - predRes[[g]] <- NA - } - } - - oD <- sprintf("%s/cutoff%i",pDir,cutoff) - dir.create(oD) - outFile <- sprintf("%s/predictionResults.txt",oD) - if (any(is.na(predRes))) { - cat("One or more groups had zero feature selected nets\n") - cat("# no feature-selected nets.\n",file=outFile) - }else { - predClass <- GM_OneVAll_getClass(predRes) - out <- merge(x=pheno_all,y=predClass,by="ID") - write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) - - acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) - cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", - nrow(out), acc*100)) - - require(ROCR) - ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, - out$STATUS=="SURVIVEYES") - save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) - } - } - } - - #cleanup to save disk space - system(sprintf("rm -r %s/dataset %s/tmp %s/networks", - outDir,outDir,outDir)) - system(sprintf("rm -r %s/dataset %s/networks", - outDir,outDir)) - -} - pheno_all$TT_STATUS <- NA - rng_t1 <- Sys.time() - cat(sprintf("Time for one train/test split:")) - print(rng_t1-rng_t0) - -}, error=function(ex){ - print(ex) -}, finally={ - sink(NULL) -}) diff --git a/misc/PanCancer/noPrune/KIRC_noPrune.R b/misc/PanCancer/noPrune/KIRC_noPrune.R deleted file mode 100644 index e0392ca5..00000000 --- a/misc/PanCancer/noPrune/KIRC_noPrune.R +++ /dev/null @@ -1,323 +0,0 @@ -#' PanCancer binarized survival: KIRC: Feature selection with one net per -# datatype -#' 10-fold CV predictor design -rm(list=ls()) -require(netDx) -require(netDx.examples) - -numCores <- 8L -GMmemory <- 4L -trainProp <- 0.8 -cutoff <- 9 - -inDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/input" -outRoot <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" - -dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/noPrune_%s",outRoot,dt) - -# ---------------------------------------------------------------- -# helper functions -# takes average of normdiff of each row in x -normDiff2 <- function(x) { - # normalized difference - # x is vector of values, one per patient (e.g. ages) - normDiff <- function(x) { - #if (nrow(x)>=1) x <- x[1,] - nm <- colnames(x) - x <- as.numeric(x) - n <- length(x) - rngX <- max(x,na.rm=T)-min(x,na.rm=T) - - out <- matrix(NA,nrow=n,ncol=n); - # weight between i and j is - # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) - for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) - rownames(out) <- nm; colnames(out)<- nm - out - } - - sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) - for (k in 1:nrow(x)) { - tmp <- normDiff(x[k,,drop=FALSE]) - sim <- sim + tmp - rownames(sim) <- rownames(tmp) - colnames(sim) <- colnames(tmp) - } - sim <- sim/nrow(x) - sim -} - -# ----------------------------------------------------------- -# process input -inFiles <- list( - clinical=sprintf("%s/KIRC_clinical_core.txt",inDir), - survival=sprintf("%s/KIRC_binary_survival.txt",inDir) - ) -datFiles <- list( - rna=sprintf("%s/KIRC_mRNA_core.txt",inDir), - prot=sprintf("%s/KIRC_RPPA_core.txt",inDir), - mir=sprintf("%s/KIRC_miRNA_core.txt",inDir), - dnam=sprintf("%s/KIRC_methylation_core.txt",inDir), - cnv=sprintf("%s/KIRC_CNV_core.txt",inDir) -) - -pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) -colnames(pheno)[1] <- "ID" - -#======transform clinical data========= -pheno$grade <- as.vector(pheno$grade) -pheno$grade[pheno$grade=="G1"] <- "G2" -pheno$grade[pheno$grade=="GX"] <- "G2" -pheno$grade <- as.factor(pheno$grade) -pheno <- pheno[, -which(colnames(pheno)=="gender")] -#====================================== - -surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) -colnames(surv)[1:2] <- c("ID","STATUS_INT") -survStr <- rep(NA,nrow(surv)) -survStr[surv$STATUS_INT<1] <- "SURVIVENO" -survStr[surv$STATUS_INT>0] <- "SURVIVEYES" -surv$STATUS <- survStr -pheno <- merge(x=pheno,y=surv,by="ID") -pheno$X <- NULL -# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) -pheno_nosurv <- pheno[1:4] - -cat("Collecting patient data:\n") -dats <- list() #input data in different slots -cat("\t* Clinical\n") -clinical <- pheno_nosurv -rownames(clinical) <- clinical[,1]; -clinical$grade <- as.numeric(factor(clinical$grade)) -clinical$stage <- as.numeric(factor(clinical$stage)) -clinical$ID <- NULL -clinical <- t(clinical) -dats$clinical <- clinical; rm(clinical) - -# create master input net -for (nm in names(datFiles)) { - cat(sprintf("\t* %s\n",nm)) - tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) - if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] - rownames(tmp) <- tmp[,1] - tmp <- t(tmp[,-1]) - class(tmp) <- "numeric" - dats[[nm]] <- tmp -} - -cat("\t Ordering column names\n") -# include only data for patients in classifier -dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) -dats <- lapply(dats, function(x) { - midx <- match(pheno$ID,colnames(x)) - x <- x[,midx] - x -}) - -# confirm patient order the same for all input nets -pname <- colnames(dats[[1]]) -for (k in 2:length(dats)) { - if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { - cat(sprintf("Patient order doesn't match for %s\n", - names(dats)[k])) - browser() - } -} - -# input nets for each category -netSets <- lapply(dats, function(x) rownames(x)) - -# compile data -alldat <- do.call("rbind",dats) -pheno_all <- pheno - -combList <- list( - clinical="clinical_cont", - mir="mir.profile", - rna="rna.profile", - prot="prot.profile", - cnv="cnv.profile", - dnam="dnam.profile", - clinicalArna=c("clinical_cont","rna.profile"), - clinicalAmir=c("clinical_cont","mir.profile"), - clinicalAprot=c("clinical_cont","prot.profile"), - clinicalAdnam=c("clinical_cont","dnam.profile"), - clinicalAcnv=c("clinical_cont","cnv.profile"), - all="all") - -rm(pheno,pheno_nosurv) - - -# ---------------------------------------------------------- -# build classifier -if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) -dir.create(megaDir) - -logFile <- sprintf("%s/log.txt",megaDir) -sink(logFile,split=TRUE) -tryCatch({ - -# first loop - over train/test splits -for (rngNum in 1:15) { - rng_t0 <- Sys.time() - cat(sprintf("-------------------------------\n")) - cat(sprintf("RNG seed = %i\n", rngNum)) - cat(sprintf("-------------------------------\n")) - outDir <- sprintf("%s/rng%i",megaDir,rngNum) - dir.create(outDir) - - pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, - setSeed=rngNum*5) - write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", - col=T,row=F,quote=F) - # -------------------------------------------- - # feature selection - train only - pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") - - dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), - drop=F]) - alldat_train <- do.call("rbind",dats_train) - netSets_iter <- netSets - - netDir <- sprintf("%s/networks",outDir) - nonclin <- setdiff(names(netSets),"clinical") - netList <- makePSN_NamedMatrix(alldat_train, - rownames(alldat_train),netSets_iter[nonclin], - netDir,verbose=FALSE,numCores=numCores, - writeProfiles=TRUE) - netList2 <- makePSN_NamedMatrix(alldat_train, - rownames(alldat_train),netSets_iter["clinical"], - netDir,simMetric="custom",customFunc=normDiff2, - verbose=FALSE,numCores=numCores,writeProfiles=FALSE, - sparsify=TRUE,append=TRUE) - netList <- c(netList,netList2) - cat(sprintf("Total of %i nets\n", length(netList))) - # now create database - dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) - - # second loop - over combinations of input data - for (cur in names(combList)) { - t0 <- Sys.time() - cat(sprintf("%s\n",cur)) - pDir <- sprintf("%s/%s",outDir, cur) - dir.create(pDir) - - # run featsel once per subtype - subtypes <- unique(pheno$STATUS) - # run 10-fold cv per subtype - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) - dir.create(pDir2) - - cat(sprintf("\n******\nSubtype %s\n",g)) - pheno_subtype <- pheno - ## label patients not in the current class as residual - nong <- which(!pheno_subtype$STATUS %in% g) - pheno_subtype$STATUS[nong] <- "nonpred" - ## sanity check - print(table(pheno_subtype$STATUS,useNA="always")) - resDir <- sprintf("%s/GM_results",pDir2) - ## query for feature selection comprises of training - ## samples from the class of interest - trainPred <- pheno_subtype$ID[ - which(pheno_subtype$STATUS %in% g)] - - # Cross validation - GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, - nrow(pheno_subtype),incNets=combList[[cur]], - verbose=T, numCores=numCores, - GMmemory=GMmemory) - - # patient similarity ranks - prank <- dir(path=resDir,pattern="PRANK$") - # network ranks - nrank <- dir(path=resDir,pattern="NRANK$") - cat(sprintf("Got %i prank files\n",length(prank))) - - # Compute network score - pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) - head(pTally) - # write to file - tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) - write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) - } - ## pruneTrain: make test database - ## This will be used to predict test samples by subsetting just for feature - ## selected nets in a given round - ## Note that this is useful for all train/test splits because we can always - ## change which samples are query and can always subset based on which nets - ## are feature selected in a given round. - netDir <- sprintf("%s/test_networks",outDir) - nonclin <- setdiff(names(netSets_iter),"clinical") - netList <- makePSN_NamedMatrix(alldat, - rownames(alldat),netSets_iter[nonclin],netDir, - verbose=FALSE,numCores=numCores,writeProfiles=TRUE) - netList2 <- makePSN_NamedMatrix(alldat, - rownames(alldat),netSets_iter["clinical"], - netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, - verbose=FALSE,numCores=numCores, - sparsify=TRUE,append=TRUE) - netList <- c(netList,netList2) - cat(sprintf("Total of %i nets\n", length(netList))) - # now create database - testdbDir <- GM_createDB(netDir, pheno_all$ID, - outDir,numCores=numCores) - - predRes <- list() - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - # get feature selected net names - pTally <- read.delim( - sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), - sep="\t",h=T,as.is=T) - - # feature selected nets pass cutoff threshold - pTally <- pTally[which(pTally[,2]>=cutoff),1] - cat(sprintf("%s: %i pathways\n",g,length(pTally))) - - # query of all training samples for this class - qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & - pheno_all$TT_STATUS%in%"TRAIN")] - - qFile <- sprintf("%s/%s_query",pDir2,g) - GM_writeQueryFile(qSamps,incNets=pTally, - nrow(pheno_all),qFile) - resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) - predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), - pheno_all,g) - } - - predClass <- GM_OneVAll_getClass(predRes) - out <- merge(x=pheno_all,y=predClass,by="ID") - outFile <- sprintf("%s/predictionResults.txt",pDir) - write.table(out,file=outFile,sep="\t",col=T,row=F, - quote=F) - - acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) - cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", - nrow(out), acc*100)) - - require(ROCR) - ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, - out$STATUS=="SURVIVEYES") - save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) - } - - #cleanup to save disk space - system(sprintf("rm -r %s/dataset %s/tmp %s/networks", - outDir,outDir,outDir)) - system(sprintf("rm -r %s/dataset %s/networks", - outDir,outDir)) -} - pheno_all$TT_STATUS <- NA - rng_t1 <- Sys.time() - cat(sprintf("Time for one train/test split:")) - print(rng_t1-rng_t0) - -}, error=function(ex){ - print(ex) -}, finally={ - sink(NULL) -}) diff --git a/misc/PanCancer/noPrune/LUSC_noPrune.R b/misc/PanCancer/noPrune/LUSC_noPrune.R deleted file mode 100644 index aa1fd85d..00000000 --- a/misc/PanCancer/noPrune/LUSC_noPrune.R +++ /dev/null @@ -1,346 +0,0 @@ -#' PanCancer binarized survival: LUSC: Feature selection with one net per -#' datatype -#' 10-fold CV predictor design - -rm(list=ls()) -require(netDx) -require(netDx.examples) -source("../runLM.R") - -numCores <- 8L -GMmemory <- 4L -trainProp <- 0.8 - -inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" -outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" - -dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/noPrune_%s",outRoot,dt) - -# ---------------------------------------------------------------- -# helper functions - -# takes average of normdiff of each row in x -normDiff2 <- function(x) { - # normalized difference - # x is vector of values, one per patient (e.g. ages) - normDiff <- function(x) { - #if (nrow(x)>=1) x <- x[1,] - nm <- colnames(x) - x <- as.numeric(x) - n <- length(x) - rngX <- max(x,na.rm=T)-min(x,na.rm=T) - - out <- matrix(NA,nrow=n,ncol=n); - # weight between i and j is - # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) - for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) - rownames(out) <- nm; colnames(out)<- nm - out - } - - sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) - for (k in 1:nrow(x)) { - tmp <- normDiff(x[k,,drop=FALSE]) - sim <- sim + tmp - rownames(sim) <- rownames(tmp) - colnames(sim) <- colnames(tmp) - } - sim <- sim/nrow(x) - sim -} - -# ----------------------------------------------------------- -# process input -inFiles <- list( - clinical=sprintf("%s/LUSC_clinical_core.txt",inDir), - survival=sprintf("%s/LUSC_binary_survival.txt",inDir) - ) -datFiles <- list( - rna=sprintf("%s/LUSC_mRNA_core.txt",inDir), - prot=sprintf("%s/LUSC_RPPA_core.txt",inDir), - mir=sprintf("%s/LUSC_miRNA_core.txt",inDir), - cnv=sprintf("%s/LUSC_CNV_core.txt",inDir) -) - -pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) -colnames(pheno)[1] <- "ID" - -surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) -colnames(surv)[1:2] <- c("ID","STATUS_INT") -survStr <- rep(NA,nrow(surv)) -survStr[surv$STATUS_INT<1] <- "SURVIVENO" -survStr[surv$STATUS_INT>0] <- "SURVIVEYES" -surv$STATUS <- survStr -pheno <- merge(x=pheno,y=surv,by="ID") -pheno$X <- NULL - -cat("Collecting patient data:\n") -dats <- list() #input data in different slots -cat("\t* Clinical\n") -clinical <- pheno -rownames(clinical) <- clinical[,1]; -# ======================= -# LUSC-specific variables -clinical$stage <- as.vector(clinical$stage) -clinical$stage[clinical$stage=="Stage IA"| clinical$stage=="Stage IB"] <- "I" -clinical$stage[clinical$stage=="Stage IIA"| clinical$stage=="Stage IIB"| clinical$stage=="Stage II"] <- "II" -clinical$stage[clinical$stage=="Stage IIIA"| clinical$stage=="Stage IIIB"] <- "III" -clinical$stage <- as.factor(clinical$stage) -clinical <- clinical[, -which(colnames(clinical)=="gender")] -clinical <- t(clinical[,c("age","stage")]) -clinical[1,] <- as.integer(clinical[1,]) -clinical[2,] <- as.integer(as.factor(clinical[2,])) -class(clinical) <- "numeric" -# ======================= -dats$clinical <- clinical; rm(clinical) - -# create master input net -for (nm in names(datFiles)) { - cat(sprintf("\t* %s\n",nm)) - tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) - if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] - rownames(tmp) <- tmp[,1] - tmp <- t(tmp[,-1]) - class(tmp) <- "numeric" - if (nm == "rna") tmp <- log(tmp+1) - dats[[nm]] <- tmp -} - -cat("\t Ordering column names\n") -# include only data for patients in classifier -dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID)]}) -dats <- lapply(dats, function(x) { - midx <- match(pheno$ID,colnames(x)) - x <- x[,midx] - x -}) - -# confirm patient order the same for all input nets -pname <- colnames(dats[[1]]) -for (k in 2:length(dats)) { - if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { - cat(sprintf("Patient order doesn't match for %s\n", - names(dats)[k])) - browser() - } -} - -# input nets for each category -netSets <- lapply(dats, function(x) rownames(x)) - -# compile data -alldat <- do.call("rbind",dats) -pheno_all <- pheno - -combList <- list( - clinicalArna=c("clinical_cont","rna.profile"), - clinicalAprot=c("clinical_cont","prot.profile"), - clinical="clinical_cont", - mir="mir.profile", - rna="rna.profile", - prot="prot.profile", - cnv="cnv.profile", - clinicalAmir=c("clinical_cont","mir.profile"), - clinicalAcnv=c("clinical_cont","cnv.profile"), - all="all" -) - -cat(sprintf("Clinical variables are: { %s }\n", - paste(rownames(dats$clinical),sep=",",collapse=","))) -rm(pheno) - -# ---------------------------------------------------------- -# build classifier -if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) -dir.create(megaDir) - -logFile <- sprintf("%s/log.txt",megaDir) -sink(logFile,split=TRUE) -tryCatch({ - - -# first loop - over train/test splits -for (rngNum in 1:100) { - rng_t0 <- Sys.time() - cat(sprintf("-------------------------------\n")) - cat(sprintf("RNG seed = %i\n", rngNum)) - cat(sprintf("-------------------------------\n")) - outDir <- sprintf("%s/rng%i",megaDir,rngNum) - dir.create(outDir) - - pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, - setSeed=rngNum*5) - write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", - col=T,row=F,quote=F) - # -------------------------------------------- - # feature selection - train only - pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") - dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), - drop=F]) - - alldat_train <- do.call("rbind",dats_train) - netSets_iter <- netSets - - netDir <- sprintf("%s/networks",outDir) - nonclin <- setdiff(names(netSets),"clinical") - netList <- makePSN_NamedMatrix(alldat_train, - rownames(alldat_train),netSets_iter[nonclin], - netDir,verbose=FALSE,numCores=numCores, - writeProfiles=TRUE,simMetric="pearson") - netList2 <- makePSN_NamedMatrix(alldat_train, - rownames(alldat_train),netSets_iter["clinical"], - netDir,simMetric="custom",customFunc=normDiff2,writeProfiles=FALSE, - verbose=FALSE,numCores=numCores, - sparsify=TRUE,append=TRUE) - netList <- c(netList,netList2) - cat(sprintf("Total of %i nets\n", length(netList))) - - # now create database - dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores, - simMetric="pearson") - - # second loop - over combinations of input data - for (cur in names(combList)) { - t0 <- Sys.time() - cat(sprintf("CombList = %s\n",cur)) - pDir <- sprintf("%s/%s",outDir, cur) - dir.create(pDir) - - # run featsel once per subtype - subtypes <- unique(pheno$STATUS) - # run 10-fold cv per subtype - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) - dir.create(pDir2) - - cat(sprintf("\n******\nSubtype %s\n",g)) - pheno_subtype <- pheno - ## label patients not in the current class as residual - nong <- which(!pheno_subtype$STATUS %in% g) - pheno_subtype$STATUS[nong] <- "nonpred" - ## sanity check - print(table(pheno_subtype$STATUS,useNA="always")) - resDir <- sprintf("%s/GM_results",pDir2) - ## query for feature selection comprises of training - ## samples from the class of interest - trainPred <- pheno_subtype$ID[ - which(pheno_subtype$STATUS %in% g)] - - # Cross validation - GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, - nrow(pheno_subtype),incNets=combList[[cur]], - verbose=T, numCores=numCores, - GMmemory=GMmemory) - - # patient similarity ranks - prank <- dir(path=resDir,pattern="PRANK$") - # network ranks - nrank <- dir(path=resDir,pattern="NRANK$") - cat(sprintf("Got %i prank files\n",length(prank))) - - # Compute network score - pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) - head(pTally) - # write to file - tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) - write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) - } - - ## Create the mega database with all patients and all nets. - ## This will be used to predict test samples by subsetting just for feature - ## selected nets in a given round - ## Note that this is useful for all train/test splits because we can always - ## change which samples are query and can always subset based on which nets - ## are feature selected in a given round. - netDir <- sprintf("%s/test_networks",megaDir) - nonclin <- setdiff(names(netSets_iter),"clinical") - netList <- makePSN_NamedMatrix(alldat, - rownames(alldat),netSets_iter[nonclin],netDir, - verbose=FALSE,numCores=numCores,writeProfiles=TRUE, - simMetric="pearson") - netList2 <- makePSN_NamedMatrix(alldat, - rownames(alldat),netSets_iter["clinical"], - netDir,simMetric="custom",customFunc=normDiff2, - writeProfiles=FALSE, - verbose=FALSE,numCores=numCores, - sparsify=TRUE,append=TRUE) - netList <- c(netList,netList2) - cat(sprintf("Total of %i nets\n", length(netList))) - - # now create database - megadbDir <- GM_createDB(netDir, pheno_all$ID, - megaDir,numCores=numCores, - simMetric="pearson") - - for (cutoff in 7:9) { - predRes <- list() - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - # get feature selected net names - pTally <- read.delim( - sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), - sep="\t",h=T,as.is=T) - - # feature selected nets pass cutoff threshold - pTally <- pTally[which(pTally[,2]>=cutoff),1] - cat(sprintf("%s: %i pathways\n",g,length(pTally))) - - if (length(pTally)>=1) { - curD <- sprintf("%s/cutoff%i",pDir2,cutoff) - dir.create(curD) - # query of all training samples for this class - qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & - pheno_all$TT_STATUS%in%"TRAIN")] - - qFile <- sprintf("%s/%s_query",curD,g) - # only include the nets that were feature selected - GM_writeQueryFile(qSamps,incNets=pTally, - nrow(pheno_all),qFile) - resFile <- runGeneMANIA(megadbDir$dbDir,qFile,resDir=curD) - predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), - pheno_all,g) - } else { - predRes[[g]] <- NA - } - } - oD <- sprintf("%s/cutoff%i",pDir,cutoff) - dir.create(oD) - outFile <- sprintf("%s/predictionResults.txt",oD) - if (any(is.na(predRes))) { - cat("One or more groups had zero feature selected nets\n") - cat("# no feature-selected nets.\n",file=outFile) - } else { - predClass <- GM_OneVAll_getClass(predRes) - out <- merge(x=pheno_all,y=predClass,by="ID") - write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) - - acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) - cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", - nrow(out), acc*100)) - - require(ROCR) - ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, - out$STATUS=="SURVIVEYES") - save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",oD)) - } - } -} - #cleanup to save disk space - system(sprintf("rm -r %s/dataset %s/tmp %s/networks", - outDir,outDir,outDir)) - system(sprintf("rm -r %s/dataset %s/networks", - outDir,outDir)) - -} - pheno_all$TT_STATUS <- NA - rng_t1 <- Sys.time() - cat(sprintf("Time for one train/test split:")) - print(rng_t1-rng_t0) - -}, error=function(ex){ - print(ex) -}, finally={ - sink(NULL) -}) diff --git a/misc/PanCancer/noPrune/OV_noPrune.R b/misc/PanCancer/noPrune/OV_noPrune.R deleted file mode 100644 index d363f176..00000000 --- a/misc/PanCancer/noPrune/OV_noPrune.R +++ /dev/null @@ -1,303 +0,0 @@ -#' PanCancer binarized survival: KIRC: Feature selection with one net per -# datatype -#' 10-fold CV predictor design -rm(list=ls()) -require(netDx) -require(netDx.examples) - -numCores <- 8L -GMmemory <- 4L -trainProp <- 0.8 -cutoff <- 9 - -inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/input" -outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output" - -dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/noPrune_%s",outRoot,dt) - -# ---------------------------------------------------------------- -# helper functions - -# normalized difference -# x is vector of values, one per patient (e.g. ages) -normDiff <- function(x) { - #if (nrow(x)>=1) x <- x[1,] - nm <- colnames(x) - x <- as.numeric(x) - n <- length(x) - rngX <- max(x,na.rm=T)-min(x,na.rm=T) - - out <- matrix(NA,nrow=n,ncol=n); - # weight between i and j is - # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) - for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) - rownames(out) <- nm; colnames(out)<- nm - out -} - -# ----------------------------------------------------------- -# process input -inFiles <- list( - clinical=sprintf("%s/OV_clinical_core.txt",inDir), - survival=sprintf("%s/OV_binary_survival.txt",inDir) - ) -datFiles <- list( - rna=sprintf("%s/OV_mRNA_core.txt",inDir), - prot=sprintf("%s/OV_RPPA_core.txt",inDir), - mir=sprintf("%s/OV_miRNA_core.txt",inDir), - dnam=sprintf("%s/OV_methylation_core.txt",inDir), - cnv=sprintf("%s/OV_CNV_core.txt",inDir) -) - -pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) -colnames(pheno)[1] <- "ID" - -surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) -colnames(surv)[1:2] <- c("ID","STATUS_INT") -survStr <- rep(NA,nrow(surv)) -survStr[surv$STATUS_INT<1] <- "SURVIVENO" -survStr[surv$STATUS_INT>0] <- "SURVIVEYES" -surv$STATUS <- survStr -pheno <- merge(x=pheno,y=surv,by="ID") -pheno$X <- NULL -# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) -pheno_nosurv <- pheno[1:4] - -cat("Collecting patient data:\n") -dats <- list() #input data in different slots -cat("\t* Clinical\n") -clin <- pheno -rownames(clin) <- clin[,1]; -clin <- t(clin[,2,drop=FALSE]) -dats$clinical <- clin; rm(clin) - -# create master input net -for (nm in names(datFiles)) { - cat(sprintf("\t* %s\n",nm)) - tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) - if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] - rownames(tmp) <- tmp[,1] - tmp <- t(tmp[,-1]) - class(tmp) <- "numeric" - dats[[nm]] <- tmp -} - -cat("\t Ordering column names\n") -# include only data for patients in classifier -dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID), drop = FALSE]}) -dats <- lapply(dats, function(x) { - midx <- match(pheno$ID,colnames(x)) - x <- x[,midx, drop = FALSE] - x -}) - - -# confirm patient order the same for all input nets -pname <- colnames(dats[[1]]) -for (k in 2:length(dats)) { - if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { - cat(sprintf("Patient order doesn't match for %s\n", - names(dats)[k])) - browser() - } -} - -# input nets for each category -netSets <- lapply(dats, function(x) rownames(x)) - -# compile data -alldat <- do.call("rbind",dats) -pheno_all <- pheno - -combList <- list( - clinical="clinical_cont", - mir="mir.profile", - rna="rna.profile", - prot="prot.profile", - cnv="cnv.profile", - dnam="dnam.profile", - clinicalArna=c("clinical_cont","rna.profile"), - clinicalAmir=c("clinical_cont","mir.profile"), - clinicalAprot=c("clinical_cont","prot.profile"), - clinicalAdnam=c("clinical_cont","dnam.profile"), - clinicalAcnv=c("clinical_cont","cnv.profile"), - all="all") - -rm(pheno,pheno_nosurv) - - -# ---------------------------------------------------------- -# build classifier -if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) -dir.create(megaDir) - -logFile <- sprintf("%s/log.txt",megaDir) -sink(logFile,split=TRUE) -tryCatch({ - -# first loop - over train/test splits -for (rngNum in 1:15) { - rng_t0 <- Sys.time() - cat(sprintf("-------------------------------\n")) - cat(sprintf("RNG seed = %i\n", rngNum)) - cat(sprintf("-------------------------------\n")) - outDir <- sprintf("%s/rng%i",megaDir,rngNum) - dir.create(outDir) - - pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, - setSeed=rngNum*5) - write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", - col=T,row=F,quote=F) - # -------------------------------------------- - # feature selection - train only - pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") - - dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), - drop=F]) - alldat_train <- do.call("rbind",dats_train) - netSets_iter <- netSets - netDir <- sprintf("%s/networks",outDir) - nonclin <- setdiff(names(netSets),"clinical") - - netList <- makePSN_NamedMatrix(alldat_train, - rownames(alldat_train),netSets_iter[nonclin],netDir, - verbose=FALSE,numCores=numCores,writeProfiles=TRUE) - - netList2 <- makePSN_NamedMatrix(alldat_train, - rownames(alldat_train),netSets_iter["clinical"], - netDir,simMetric="custom",customFunc=normDiff, - verbose=FALSE,numCores=numCores,writeProfiles=FALSE, - sparsify=TRUE,append=TRUE) - - netList <- c(netList,netList2) - cat(sprintf("Total of %i nets\n", length(netList))) - - # now create database - dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) - - # second loop - over combinations of input data - for (cur in names(combList)) { - t0 <- Sys.time() - cat(sprintf("%s\n",cur)) - pDir <- sprintf("%s/%s",outDir, cur) - dir.create(pDir) - - # run featsel once per subtype - subtypes <- unique(pheno$STATUS) - # run 10-fold cv per subtype - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) - dir.create(pDir2) - - cat(sprintf("\n******\nSubtype %s\n",g)) - pheno_subtype <- pheno - ## label patients not in the current class as residual - nong <- which(!pheno_subtype$STATUS %in% g) - pheno_subtype$STATUS[nong] <- "nonpred" - ## sanity check - print(table(pheno_subtype$STATUS,useNA="always")) - resDir <- sprintf("%s/GM_results",pDir2) - ## query for feature selection comprises of training - ## samples from the class of interest - trainPred <- pheno_subtype$ID[ - which(pheno_subtype$STATUS %in% g)] - - # Cross validation - GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, - nrow(pheno_subtype),incNets=combList[[cur]], - verbose=T, numCores=numCores, - GMmemory=GMmemory) - - # patient similarity ranks - prank <- dir(path=resDir,pattern="PRANK$") - # network ranks - nrank <- dir(path=resDir,pattern="NRANK$") - cat(sprintf("Got %i prank files\n",length(prank))) - - # Compute network score - pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) - head(pTally) - # write to file - tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) - write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) - } - - # -------- - # pruneTrain: make test database - test_netDir <- sprintf("%s/test_networks",outDir) - nonclin <- setdiff(names(netSets_iter),"clinical") - ### netSets_iter has univariate filtering for curr round - netList <- makePSN_NamedMatrix(alldat, - rownames(alldat),netSets_iter[nonclin],test_netDir, - verbose=FALSE,numCores=numCores,writeProfiles=TRUE) - netList2 <- makePSN_NamedMatrix(alldat, - rownames(alldat),netSets_iter["clinical"], - test_netDir,simMetric="custom",customFunc=normDiff, - verbose=FALSE,numCores=numCores, writeProfiles=FALSE, - sparsify=TRUE,append=TRUE) - netList <- c(netList,netList2) - cat(sprintf("Total of %i nets\n", length(netList))) - # now create database - testdbDir <- GM_createDB(test_netDir, pheno_all$ID, - outDir,numCores=numCores) - - predRes <- list() - for (g in subtypes) { - pDir2 <- sprintf("%s/%s",pDir,g) - # get feature selected net names - pTally <- read.delim( - sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), - sep="\t",h=T,as.is=T) - - # feature selected nets pass cutoff threshold - pTally <- pTally[which(pTally[,2]>=cutoff),1] - # pTally <- sub(".profile","",pTally) - # pTally <- sub("_cont","",pTally) - cat(sprintf("%s: %i pathways\n",g,length(pTally))) - - # query of all training samples for this class - qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & - pheno_all$TT_STATUS%in%"TRAIN")] - - qFile <- sprintf("%s/%s_query",pDir2,g) - GM_writeQueryFile(qSamps,incNets=pTally - ,nrow(pheno_all),qFile) - resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) - predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), - pheno_all,g) - } - - predClass <- GM_OneVAll_getClass(predRes) - out <- merge(x=pheno_all,y=predClass,by="ID") - outFile <- sprintf("%s/predictionResults.txt",pDir) - write.table(out,file=outFile,sep="\t",col=T,row=F,quote=F) - - acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) - cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", - nrow(out), acc*100)) - - require(ROCR) - ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, - out$STATUS=="SURVIVEYES") - save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) - } - - #cleanup to save disk space - system(sprintf("rm -r %s/dataset %s/tmp %s/networks", - outDir,outDir,outDir)) - system(sprintf("rm -r %s/dataset %s/networks", - outDir,outDir)) - -} - pheno_all$TT_STATUS <- NA - rng_t1 <- Sys.time() - cat(sprintf("Time for one train/test split:")) - print(rng_t1-rng_t0) - -}, error=function(ex){ - print(ex) -}, finally={ - sink(NULL) -}) From 4b48dea901326b8a5680eae13b333f80a8ff1d8a Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Fri, 11 May 2018 07:19:10 -0400 Subject: [PATCH 095/124] removed --- misc/PanCancer/pruneVersion/diff_kernels/pearscale/test.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 misc/PanCancer/pruneVersion/diff_kernels/pearscale/test.txt diff --git a/misc/PanCancer/pruneVersion/diff_kernels/pearscale/test.txt b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/test.txt deleted file mode 100644 index bc1efa7c..00000000 --- a/misc/PanCancer/pruneVersion/diff_kernels/pearscale/test.txt +++ /dev/null @@ -1 +0,0 @@ -/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pearscale_180507/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pearscale_180507/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pearscale_180507/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pearscale_180507/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output/pearscale_impute_180508 \ No newline at end of file From 20dbbb8c7fe3032e5f66e3cd9469b49ff63bb0b1 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Sat, 12 May 2018 09:38:37 -0400 Subject: [PATCH 096/124] variation that uses old sparsification method --- .../noPrune/OV_noPrune_sp1_pipeline.R | 113 ++++++++ .../PanCancer/noPrune/PanCancer_noPrune_sp1.R | 247 ++++++++++++++++++ 2 files changed, 360 insertions(+) create mode 100644 misc/PanCancer/noPrune/OV_noPrune_sp1_pipeline.R create mode 100644 misc/PanCancer/noPrune/PanCancer_noPrune_sp1.R diff --git a/misc/PanCancer/noPrune/OV_noPrune_sp1_pipeline.R b/misc/PanCancer/noPrune/OV_noPrune_sp1_pipeline.R new file mode 100644 index 00000000..7485d401 --- /dev/null +++ b/misc/PanCancer/noPrune/OV_noPrune_sp1_pipeline.R @@ -0,0 +1,113 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design +rm(list=ls()) + + +inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/input" +outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/OV/output" + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/noprune_sp1_%s",outRoot,dt) + +# ----------------------------------------------------------- +# process input +inFiles <- list( + clinical=sprintf("%s/OV_clinical_core.txt",inDir), + survival=sprintf("%s/OV_binary_survival.txt",inDir) + ) +datFiles <- list( + rna=sprintf("%s/OV_mRNA_core.txt",inDir), + prot=sprintf("%s/OV_RPPA_core.txt",inDir), + mir=sprintf("%s/OV_miRNA_core.txt",inDir), + dnam=sprintf("%s/OV_methylation_core.txt",inDir), + cnv=sprintf("%s/OV_CNV_core.txt",inDir) +) + +pheno <- read.delim(inFiles$clinical,sep="\t",h=T,as.is=T) +colnames(pheno)[1] <- "ID" + +surv <- read.delim(inFiles$survival,sep="\t",h=T,as.is=T) +colnames(surv)[1:2] <- c("ID","STATUS_INT") +survStr <- rep(NA,nrow(surv)) +survStr[surv$STATUS_INT<1] <- "SURVIVENO" +survStr[surv$STATUS_INT>0] <- "SURVIVEYES" +surv$STATUS <- survStr +pheno <- merge(x=pheno,y=surv,by="ID") +pheno$X <- NULL +# pheno$gender <- ifelse(pheno$gender=="FEMALE",1, 0) +pheno_nosurv <- pheno[1:4] + +cat("Collecting patient data:\n") +dats <- list() #input data in different slots +cat("\t* Clinical\n") +clin <- pheno +rownames(clin) <- clin[,1]; +clin <- t(clin[,2,drop=FALSE]) +dats$clinical <- clin; rm(clin) + +# create master input net +for (nm in names(datFiles)) { + cat(sprintf("\t* %s\n",nm)) + tmp <- read.delim(datFiles[[nm]],sep="\t",h=T,as.is=T) + if (colnames(tmp)[ncol(tmp)]=="X") tmp <- tmp[,-ncol(tmp)] + rownames(tmp) <- tmp[,1] + tmp <- t(tmp[,-1]) + class(tmp) <- "numeric" + dats[[nm]] <- tmp +} + +cat("\t Ordering column names\n") +# include only data for patients in classifier +dats <- lapply(dats, function(x) { x[,which(colnames(x)%in%pheno$ID), drop = FALSE]}) +dats <- lapply(dats, function(x) { + midx <- match(pheno$ID,colnames(x)) + x <- x[,midx, drop = FALSE] + x +}) + + +# confirm patient order the same for all input nets +pname <- colnames(dats[[1]]) +for (k in 2:length(dats)) { + if (all.equal(colnames(dats[[k]]),pname)!=TRUE) { + cat(sprintf("Patient order doesn't match for %s\n", + names(dats)[k])) + browser() + } +} + +# input nets for each category +netSets <- lapply(dats, function(x) rownames(x)) + +# compile data +alldat <- do.call("rbind",dats) +pheno_all <- pheno + +combList <- list( + clinical="clinical_cont", + mir="mir.profile", + rna="rna.profile", + prot="prot.profile", + cnv="cnv.profile", + dnam="dnam.profile", + clinicalArna=c("clinical_cont","rna.profile"), + clinicalAmir=c("clinical_cont","mir.profile"), + clinicalAprot=c("clinical_cont","prot.profile"), + clinicalAdnam=c("clinical_cont","dnam.profile"), + clinicalAcnv=c("clinical_cont","cnv.profile"), + all="all") + +rm(pheno,pheno_nosurv) +rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) + +# ----------------------------------------------------------- +# run predictor +source("PanCancer_noPrune_sp1.R") +runPredictor(mega_combList=combList,rngVals=1:15,netSets=netSets, + dats=dats,pheno_all=pheno_all,megaDir=megaDir, + cutoffSet=9,maxEdge=6000,spCutoff=0.3) + + + + diff --git a/misc/PanCancer/noPrune/PanCancer_noPrune_sp1.R b/misc/PanCancer/noPrune/PanCancer_noPrune_sp1.R new file mode 100644 index 00000000..2fa1a4fc --- /dev/null +++ b/misc/PanCancer/noPrune/PanCancer_noPrune_sp1.R @@ -0,0 +1,247 @@ +#' PanCancer binarized survival: KIRC: Feature selection with one net per +# datatype +#' 10-fold CV predictor design + +# ---------------------------------------------------------------- +# helper functions +# takes average of normdiff of each row in x +normDiff2 <- function(x) { + # normalized difference + # x is vector of values, one per patient (e.g. ages) + normDiff <- function(x) { + #if (nrow(x)>=1) x <- x[1,] + nm <- colnames(x) + x <- as.numeric(x) + n <- length(x) + rngX <- max(x,na.rm=T)-min(x,na.rm=T) + + out <- matrix(NA,nrow=n,ncol=n); + # weight between i and j is + # wt(i,j) = 1 - (abs(x[i]-x[j])/(max(x)-min(x))) + for (j in 1:n) out[,j] <- 1-(abs((x-x[j])/rngX)) + rownames(out) <- nm; colnames(out)<- nm + out + } + + sim <- matrix(0,nrow=ncol(x),ncol=ncol(x)) + for (k in 1:nrow(x)) { + tmp <- normDiff(x[k,,drop=FALSE]) + sim <- sim + tmp + rownames(sim) <- rownames(tmp) + colnames(sim) <- colnames(tmp) + } + sim <- sim/nrow(x) + sim +} + +# ---------------------------------------------------------------- +runPredictor <- function(mega_combList,rngVals,netSets,dats,pheno_all,megaDir, + cutoffSet,maxEdge,spCutoff) { + +require(netDx) +require(netDx.examples) + +numCores <- 8L +GMmemory <- 4L +trainProp <- 0.8 +cutoff <- cutoffSet +cat(sprintf("FS cutoff = %i\n", cutoff)) + +if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) +dir.create(megaDir) + +logFile <- sprintf("%s/log.txt",megaDir) +sink(logFile,split=TRUE) +tryCatch({ + +# first loop - over train/test splits +for (rngNum in rngVals) { + rng_t0 <- Sys.time() + cat(sprintf("-------------------------------\n")) + cat(sprintf("RNG seed = %i\n", rngNum)) + cat(sprintf("-------------------------------\n")) + outDir <- sprintf("%s/rng%i",megaDir,rngNum) + dir.create(outDir) + + pheno_all$TT_STATUS <- splitTestTrain(pheno_all,pctT=trainProp, + setSeed=rngNum*5) + write.table(pheno_all,file=sprintf("%s/tt_split.txt",outDir),sep="\t", + col=T,row=F,quote=F) + # -------------------------------------------- + # feature selection - train only + pheno <- subset(pheno_all, TT_STATUS %in% "TRAIN") + + dats_train <- lapply(dats, function(x) x[,which(colnames(x) %in% pheno$ID), + drop=F]) + alldat_train <- do.call("rbind",dats_train) + netSets_iter <- netSets + + netDir <- sprintf("%s/networks",outDir) + cat(sprintf("Making test nets for rng%i\n", rngNum)) + netLen <- unlist(lapply(netSets_iter,length)) + pearnet <- which(netLen>=6) + othernet <- setdiff(1:length(netSets_iter),pearnet) + netList <- c(); netList2 <- c() + if (any(othernet)) { + netList <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[othernet],netDir, + simMetric="custom",customFunc=normDiff2, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=FALSE,cutoff=spCutoff, + sparsify_edgeMax=maxEdge, + verbose=FALSE,numCores=numCores) + } + if (any(pearnet)) { + netList2 <- makePSN_NamedMatrix(alldat_train, + rownames(alldat_train),netSets_iter[pearnet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) + } + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + dbDir <- GM_createDB(netDir, pheno$ID, outDir,numCores=numCores) + + # second loop - over combinations of input data + for (cur in names(combList)) { + t0 <- Sys.time() + cat(sprintf("%s\n",cur)) + pDir <- sprintf("%s/%s",outDir, cur) + dir.create(pDir) + + # run featsel once per subtype + subtypes <- unique(pheno$STATUS) + # run 10-fold cv per subtype + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + if (file.exists(pDir2)) unlink(pDir2,recursive=TRUE) + dir.create(pDir2) + + cat(sprintf("\n******\nSubtype %s\n",g)) + pheno_subtype <- pheno + ## label patients not in the current class as residual + nong <- which(!pheno_subtype$STATUS %in% g) + pheno_subtype$STATUS[nong] <- "nonpred" + ## sanity check + print(table(pheno_subtype$STATUS,useNA="always")) + resDir <- sprintf("%s/GM_results",pDir2) + ## query for feature selection comprises of training + ## samples from the class of interest + trainPred <- pheno_subtype$ID[ + which(pheno_subtype$STATUS %in% g)] + + # Cross validation + GM_runCV_featureSet(trainPred, resDir, dbDir$dbDir, + nrow(pheno_subtype),incNets=combList[[cur]], + verbose=T, numCores=numCores, + GMmemory=GMmemory) + + # patient similarity ranks + prank <- dir(path=resDir,pattern="PRANK$") + # network ranks + nrank <- dir(path=resDir,pattern="NRANK$") + cat(sprintf("Got %i prank files\n",length(prank))) + + # Compute network score + pTally <- GM_networkTally(paste(resDir,nrank,sep="/")) + head(pTally) + # write to file + tallyFile <- sprintf("%s/%s_pathway_CV_score.txt",resDir,g) + write.table(pTally,file=tallyFile,sep="\t",col=T,row=F,quote=F) + } + + netDir <- sprintf("%s/test_networks",outDir) + netLen <- unlist(lapply(netSets_iter,length)) + pearnet <- which(netLen>=6) + othernet <- setdiff(1:length(netSets_iter),pearnet) + netList <- c(); netList2 <- c() + alldat <- do.call("rbind",dats) + if (any(othernet)) { + netList <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[othernet],netDir, + simMetric="custom",customFunc=normDiff2, + writeProfiles=FALSE, + sparsify=TRUE,useSparsify2=FALSE,cutoff=spCutoff, + sparsify_edgeMax=maxEdge, + verbose=FALSE,numCores=numCores) + } + if (any(pearnet)) { + netList2 <- makePSN_NamedMatrix(alldat, + rownames(alldat),netSets_iter[pearnet],netDir, + writeProfiles=TRUE, + verbose=FALSE,numCores=numCores,append=TRUE) + } + netList <- c(netList,netList2) + cat(sprintf("Total of %i nets\n", length(netList))) + + # now create database + testdbDir <- GM_createDB(netDir, pheno_all$ID, + outDir,numCores=numCores) + + predRes <- list() + for (g in subtypes) { + pDir2 <- sprintf("%s/%s",pDir,g) + # get feature selected net names + pTally <- read.delim( + sprintf("%s/GM_results/%s_pathway_CV_score.txt",pDir2,g), + sep="\t",h=T,as.is=T) + + # feature selected nets pass cutoff threshold + pTally <- pTally[which(pTally[,2]>=cutoff),1] + cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { + + # query of all training samples for this class + qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & + pheno_all$TT_STATUS%in%"TRAIN")] + + qFile <- sprintf("%s/%s_query",pDir2,g) + GM_writeQueryFile(qSamps,incNets=pTally, + nrow(pheno_all),qFile) + resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) + predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), + pheno_all,g) + } else { + predRes[[g]] <- NA + } + } + + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + outFile <- sprintf("%s/predictionResults.txt",pDir) + write.table(out,file=outFile,sep="\t",col=T,row=F, + quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + } + } + + #cleanup to save disk space + system(sprintf("rm -r %s/dataset %s/tmp %s/networks", + outDir,outDir,outDir)) + system(sprintf("rm -r %s/dataset %s/networks", + outDir,outDir)) +} + pheno_all$TT_STATUS <- NA + rng_t1 <- Sys.time() + cat(sprintf("Time for one train/test split:")) + print(rng_t1-rng_t0) + +}, error=function(ex){ + print(ex) +}, finally={ + sink(NULL) +}) +} From 5a8b0c2a3bf3767d849309eb5b42c2c5e9f1d479 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Mon, 14 May 2018 14:58:48 -0400 Subject: [PATCH 097/124] updated survival plot/logrank test etc. code for dell --- .../multiCutoff/plotSurvivalForest.R | 133 ++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 misc/PanCancer/multiCutoff/plotSurvivalForest.R diff --git a/misc/PanCancer/multiCutoff/plotSurvivalForest.R b/misc/PanCancer/multiCutoff/plotSurvivalForest.R new file mode 100644 index 00000000..9f30e540 --- /dev/null +++ b/misc/PanCancer/multiCutoff/plotSurvivalForest.R @@ -0,0 +1,133 @@ +# average KM curves for KIRC +rm(list=ls()) + +require(rms) +require(survival) +require(survminer) +source("survival_plots/plot.survfit.custom.R") + +tumourType <- "GBM" + +rootDir <- "/home/shraddhapai/BaderLab" +if (tumourType=="KIRC") { + dataDir <- sprintf("%s/PanCancer_KIRC/output/noprune_sp0.3_180511",rootDir) + survFile <- sprintf("%s/PanCancer_KIRC/input/KIRC_OS_core.txt",rootDir) + clinFile <- sprintf("%s/PanCancer_KIRC/input/KIRC_clinical_core.txt",rootDir) + repIter <- 14 +} else if (tumourType=="GBM") { + dataDir <- sprintf("%s/2017_PanCancer/GBM/output/noprune_impute_sp0.3_180511", + rootDir) + survFile <- sprintf("%s/2017_PanCancer/GBM/input/GBM_OS_core.txt",rootDir) + clinFile <- sprintf("%s/2017_PanCancer/GBM/input/GBM_clinical_core.txt", + rootDir) + repIter <-6 +} else if (tumourType=="OV") { + dataDir <- sprintf("%s/2017_TCGA_OV/output/OV_oneNetPer_170425",rootDir) + survFile <- sprintf("%s/2017_TCGA_OV/input/OV_OS_core.txt",rootDir) + clinFile <- sprintf("%s/2017_TCGA_OV/input/OV_clinical_core.txt",rootDir) +} + +survDat <- read.delim(survFile,sep="\t",h=T,as.is=T) +clinDat <- read.delim(clinFile,sep="\t",h=T,as.is=T) +pheno <- merge(x=survDat,y=clinDat,by="feature") + +plotDF <- list() # compiles survival curves across all iterations +megaDF <- list() +hratio <- c() # cum hazards ratio for all iterations +keepCoxph <- c() # for representative iter +keepSurv <- c() # for representative iter +for (k in 1:20) { + print(k) + if (tumourType=="KIRC") { + dat <- read.delim(sprintf("%s/rng%i/clinicalAdnam/predictionResults.txt", + dataDir,k),sep="\t",h=T,as.is=T) + } else if (tumourType =="GBM") { + dat <- read.delim(sprintf("%s/rng%i/clinical/predictionResults.txt", + dataDir,k),sep="\t",h=T,as.is=T) + } + + colnames(dat)[1] <- "feature" + dat <- merge(x=dat,y=pheno,by="feature") + + # force first entry to be YES and second to be NO so we can tell them apart + # in the output. + dat$PRED_CLASS <- factor(dat$PRED_CLASS, + levels=c("SURVIVEYES","SURVIVENO")) + megaDF[[k]] <- dat + + dat$SurvObj <- with(dat, Surv(OS_OS, STATUS_INT == 0)) + + # get cum hazards ratio for this split + model <- coxph(SurvObj~PRED_CLASS, data=dat) + hratio <- c(hratio,summary(model)$coef[1,2]) + + fit <- npsurv(SurvObj ~ PRED_CLASS, data = dat, + conf.type = "log-log") + + #par(mfrow=c(1,2)) +### out <- plot.survfit.custom(fit) +### #plot(0,0,type='n',xlim=c(0,max(out$ends$x)),ylim=c(0,1)) +### out[[1]] <- as.data.frame(out[[1]]) +### out[[2]] <- as.data.frame(out[[2]]) +### +### newdf <- out[[1]]; newdf$PRED_CLASS <- "SURVIVEYES"; newdf$split <- k +### newdf2 <- out[[2]]; newdf2$PRED_CLASS <- "SURVIVENO"; newdf2$split <- k +### +### plotDF[[k]] <- rbind(newdf,newdf2) + + if (k == repIter) { + keepCoxph <- model + keepSurv <- dat + } +} + +cat("Plot representative iter (separately found to have auroc closest to average auroc\n") +require(forestmodel) +fit <- npsurv(SurvObj~PRED_CLASS,data=keepSurv,conf.type="log-log") +pdf(sprintf("%s_survPlot.pdf",tumourType)) +p <- ggsurvplot(fit,pval=TRUE,conf.int=TRUE,palette=c("blue","red"), + legend.title="Survival type") +print(p) +p2 <- forest_model(keepCoxph) +print(p2) +dev.off() + +# approach 1: pool all results and make a single KM curve +res <- do.call("rbind",megaDF) +res$SurvObj <- with(res, Surv(OS_OS,STATUS_INT==0)) +fit <- npsurv(SurvObj ~ PRED_CLASS, data=res,conf.type="log-log") + +idx <- which(hratio > 50) +if (any(idx)) hratio <- hratio[-idx] +hratio <- data.frame(group="tumour",hratio=hratio) +p <- ggplot(hratio,aes(group,y=hratio)) + geom_boxplot() + ylim(c(0,quantile(hratio$hratio,0.98))+3) +p <- p + ggtitle(sprintf("Cum hazard ratio (one per split)(N=%i)",nrow(hratio))) +p <- p + geom_hline(yintercept=1,lty=2) +p <- p + theme(axis.text=element_text(size=12), + axis.title=element_text(size=14,face="bold")) + +p1 <- survminer::ggsurvplot(fit,data=res,conf.int=TRUE) +p2 <- p + +#### approach 2 : pool all step functions +###res <- do.call("rbind",plotDF) +#### compute ci +###out <- list() +###for (k in unique(res$PRED_CLASS)) { +### res2 <- subset(res,PRED_CLASS==k) +### ub <- c(); lb <- c(); xx <- unique(res2$xx); muy <- c() +### for (x in xx) { +### yy <- res2$yy[which(res2$xx == x)] +### mu <- mean(yy); offset <- sd(yy)/sqrt(length(yy)) +### lb <- c(lb, mu-offset) +### ub <- c(ub, mu+offset) +### muy <- c(muy, mu) +### } +### out[[k]] <- data.frame(x=xx,y=muy,lb=lb,ub=ub,PRED_CLASS=k) +###} +###blah <- do.call("rbind",out) +###p3 <- ggplot(blah,(aes(x=x,y=y,colour=PRED_CLASS))) + geom_line() + geom_ribbon(aes(ymin=lb,ymax=ub),alpha=0.2) + ggtitle("Manual mean+CI of compiled KM") + ylab("% survival") +xlab("time (months)") + +pdf(sprintf("%s.pdf",tumourType)) +print(p1); print(p2); #print(p3) +dev.off() From 5f3199d379551e098817f7f4465000a2a02f94f2 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Mon, 14 May 2018 14:59:12 -0400 Subject: [PATCH 098/124] clean, all sets use 20 iters --- misc/PanCancer/multiCutoff/GBM_getRes.R | 19 ++++++++++-- misc/PanCancer/multiCutoff/KIRC_getRes.R | 13 +++++--- misc/PanCancer/multiCutoff/LUSC_getRes.R | 39 +++++++++++++++++------- misc/PanCancer/multiCutoff/OV_getRes.R | 14 +++++++-- 4 files changed, 65 insertions(+), 20 deletions(-) diff --git a/misc/PanCancer/multiCutoff/GBM_getRes.R b/misc/PanCancer/multiCutoff/GBM_getRes.R index ee2afa0e..7a141476 100644 --- a/misc/PanCancer/multiCutoff/GBM_getRes.R +++ b/misc/PanCancer/multiCutoff/GBM_getRes.R @@ -6,7 +6,8 @@ require(reshape2) GBM_getRes <- function() { mainD <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output" dirSet <- list( - base="noPrune_180423", +# base="noPrune_180423", + baserep="noprune_impute_sp0.3_180511", # ridge_fix="ridge_AbsFix_180426", # lassoGenes_sp1="lassoGenes_incClin_180426", # pamrGenes="pamrGenes_incClin_180427", @@ -55,21 +56,32 @@ cutoff <-9 # } else if (any(grep("pimp",curdir))){ # rngDir <- paste("rng",1:14,sep="") # } else { - if (curdir=="base") rngMax<- 15 else rngMax <- 20 + #if (curdir=="base") rngMax<- 15 +#if (curdir=="baserep") rngMax <- 20 + rngMax <- 20 rngDir <- paste("rng",1:rngMax,sep="") #dir(path=dataDir,pattern="rng") # } numSplits[[curdir]] <- length(rngDir) cat(sprintf("Got %i rng files\n",length(rngDir))) rngDir <- sprintf("%s/%s",dataDir,rngDir) + if (curdir %in% "baserep") { + c7 <- sprintf("%s/%s/predictionResults.txt", + rngDir,settype) + } else { c7 <- sprintf("%s/%s/cutoff%i/predictionResults.txt", rngDir,settype,cutoff) + } torm <- c() for (idx in 1:length(c7)) { + if (file.exists(c7[idx])){ dat <- read.delim(c7[idx],sep="\t",h=T,as.is=T) x1 <- sum(dat$STATUS=="SURVIVEYES") x2 <- sum(dat$STATUS=="SURVIVENO") if (x1<1 & x2<1) torm <- c(torm, idx) + } else { + torm <- c(torm,idx) + } } cat(sprintf("%i: removing %i\n", cutoff,length(torm))) if (length(torm)>0) c7 <- c7[-torm] @@ -82,6 +94,9 @@ cutoff <-9 auc_set[[settype]] <- y1 ctr <- ctr+1 } +if (curdir %in% "baserep") { + browser() +} mega_auc[[curdir]] <- unlist(lapply(auc_set,mean)) } diff --git a/misc/PanCancer/multiCutoff/KIRC_getRes.R b/misc/PanCancer/multiCutoff/KIRC_getRes.R index 00a3e332..8494d596 100644 --- a/misc/PanCancer/multiCutoff/KIRC_getRes.R +++ b/misc/PanCancer/multiCutoff/KIRC_getRes.R @@ -8,7 +8,9 @@ KIRC_getRes <- function() { mainD <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" dirSet <- list( - base="noPrune_180423" +# base="noPrune_180423", + baserep="noprune_sp0.3_180511" +# base_splow="noprune_180510" # lasso="lasso_180426", # lassocl="lassoclean_180509", # pamr="pamr_180426", @@ -25,17 +27,17 @@ mega_auc <- list() for (curdir in names(dirSet)) { dataDir <- sprintf("%s/%s",mainD,dirSet[[curdir]]) rngMax <- 20 - if (any(grep("base",curdir))) { - rngMax <- 15 -} # } else if (any(grep("lasso",curdir))) { # rngMax <- 16 # } auc_set <- list() +auc_var <- list() for (settype in settypes) { rngDir <- paste(sprintf("%s/rng",dataDir), 1:rngMax,sep="") + auc_var[[settype]] <- c() + colctr <- 1 cutoff <- 9 if (curdir %in% c("euc6K","lassocl","pamrcl")) { @@ -61,12 +63,15 @@ colctr <- 1 y1 <- unlist(lapply(x,function(i) i$auroc)) auc_set[[settype]] <- y1 + auc_var[[settype]] <- c(auc_var[[settype]], sd(y1)/sqrt(length(y1))) } mega_auc[[curdir]] <- unlist(lapply(auc_set,mean)) } pdf(sprintf("KIRC_%s.pdf",format(Sys.Date(),"%y%m%d")),width=11,height=6); boxplot(mega_auc,main="KIRC",cex.axis=1.7,cex.main=2,las=1); dev.off() +browser() + return(mega_auc) } diff --git a/misc/PanCancer/multiCutoff/LUSC_getRes.R b/misc/PanCancer/multiCutoff/LUSC_getRes.R index dc568548..53dd1d47 100644 --- a/misc/PanCancer/multiCutoff/LUSC_getRes.R +++ b/misc/PanCancer/multiCutoff/LUSC_getRes.R @@ -5,13 +5,14 @@ require(reshape2) LUSC_getRes <- function() { mainD <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output" dirSet <- list( - base="noPrune_180423", +# base="noPrune_180423", + baserep="noPrune_sp0.3_180511", # lasso="lasso_180426", # lassoGenes="lassoGenes_180426", # pamrGenes_sp2="pamrGenes_180427", # pamrGenes_sp1="pamrGenes_sp1_180427", # rbfclean="rbfclean_0.20_180507", - euc6K="eucclean_180504", + #euc6K="eucclean_180504", # eucimpute="eucscale_impute_180507", # pearscale="pearscale_180507", plassoc1="pearscale_lasso_topClin1_180509" @@ -20,31 +21,44 @@ settypes <- c("clinical","mir","rna","prot","cnv", "clinicalArna","clinicalAmir","clinicalAprot","clinicalAcnv","all") mega_auc <- list() +auc_var <- list() for (curdir in names(dirSet)) { dataDir <- sprintf("%s/%s",mainD,dirSet[[curdir]]) - if (curdir %in% "base") rngMax <- 15 - else rngMax <- 20 +# if (curdir %in% "base") rngMax <- 15 +# else if (curdir %in% "baserep") rngMax <- 20 + rngMax <- 20 auc_set <- list() for (settype in settypes) { + if (is.null(auc_var[[settype]])) { auc_var[[settype]] <- c() } + if (curdir %in% "lassoGenes") { rngDir <- paste(sprintf("%s/rng",dataDir), 3:rngMax,sep="") - } else if (any(grep("euc",curdir))) { - rngDir <- paste(sprintf("%s/rng",dataDir), 1:9,sep="") + #} else if (any(grep("euc",curdir))) { + #rngDir <- paste(sprintf("%s/rng",dataDir), 1:9,sep="") } else { rngDir <- paste(sprintf("%s/rng",dataDir), 1:rngMax,sep="") } for (cutoff in 9) { - c7 <- sprintf("%s/%s/cutoff%i/predictionResults.txt", + if (curdir %in% "baserep") { + c7 <- sprintf("%s/%s/predictionResults.txt", + rngDir,settype) + } else { + c7 <- sprintf("%s/%s/cutoff%i/predictionResults.txt", rngDir,settype,cutoff) + } torm <- c() for (idx in 1:length(c7)) { - dat <- read.delim(c7[idx],sep="\t",h=T,as.is=T) - x1 <- sum(dat$STATUS=="SURVIVEYES") - x2 <- sum(dat$STATUS=="SURVIVENO") - if (x1<1 & x2<1) torm <- c(torm, idx) + if (file.exists(c7[idx])) { + dat <- read.delim(c7[idx],sep="\t",h=T,as.is=T) + x1 <- sum(dat$STATUS=="SURVIVEYES") + x2 <- sum(dat$STATUS=="SURVIVENO") + if (x1<1 & x2<1) torm <- c(torm, idx) + } else { + torm <- c(torm,idx) + } } cat(sprintf("%i: removing %i\n", cutoff,length(torm))) if (length(torm)>0) c7 <- c7[-torm] @@ -59,6 +73,7 @@ for (cutoff in 9) { } } mega_auc[[curdir]] <- unlist(lapply(auc_set,mean)) +auc_var[[settype]] <- c(auc_var[[settype]], sd(unlist(auc_set))/sqrt(length(auc_set))) } dt <- format(Sys.Date(),"%y%m%d") pdf(sprintf("LUSC_%s.pdf",dt),width=24,height=6); @@ -66,5 +81,7 @@ boxplot(mega_auc,main="LUSC",cex.axis=1.7,cex.main=2,las=1); abline(h=0.5) dev.off() +browser() + return(mega_auc) } diff --git a/misc/PanCancer/multiCutoff/OV_getRes.R b/misc/PanCancer/multiCutoff/OV_getRes.R index 46f44bd7..200d551e 100644 --- a/misc/PanCancer/multiCutoff/OV_getRes.R +++ b/misc/PanCancer/multiCutoff/OV_getRes.R @@ -10,9 +10,11 @@ settypes <- c("clinical","mir","rna","prot","cnv","dnam", "clinicalArna","clinicalAmir","clinicalAprot","clinicalAdnam", "clinicalAcnv","all") dirSet <- list( - base="noPrune_180423", +# base="noPrune_180423", + baserep="noprune_sp0.3_180511", + baserep1="noprune_sp1_180512", # prune="pruneTrain_180419", - lasso="lasso_180426", +# lasso="lasso_180426", euc6K="eucscale_180504" # rbfclean="rbfclean_0.2_180507" ) @@ -22,7 +24,9 @@ for (curdir in names(dirSet)) { # if (curdir %in% c("lasso","pamr","euc6K","rbfclean")) rngMax <- 20 # else if (curdir %in% "prune") rngMax <- 14 - if (curdir=="base") rngMax <- 15 else rngMax <- 20 +# if (curdir=="base") rngMax <- 15 +# else if (curdir=="baserep") rngMax <- 20 + rngMax <- 20 cat(sprintf("***** %s *****\n", curdir)) dataDir <- sprintf("%s/%s",mainD,dirSet[[curdir]]) @@ -44,10 +48,14 @@ for (curdir in names(dirSet)) { } torm <- c() for (idx in 1:length(c7)) { + if (file.exists(c7[idx])){ dat <- read.delim(c7[idx],sep="\t",h=T,as.is=T) x1 <- sum(dat$STATUS=="SURVIVEYES") x2 <- sum(dat$STATUS=="SURVIVENO") if (x1<1 & x2<1) torm <- c(torm, idx) + } else { + torm <- c(torm,idx) + } } cat(sprintf("%i: removing %i\n", cutoff,length(torm))) if (length(torm)>0) c7 <- c7[-torm] From 6362dd9d88baf748657d805556850b9ad9fb29ba Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Mon, 14 May 2018 14:59:26 -0400 Subject: [PATCH 099/124] writes stats to file --- misc/PanCancer/multiCutoff/compileRes.R | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/misc/PanCancer/multiCutoff/compileRes.R b/misc/PanCancer/multiCutoff/compileRes.R index ed41bfdd..fe2baefa 100644 --- a/misc/PanCancer/multiCutoff/compileRes.R +++ b/misc/PanCancer/multiCutoff/compileRes.R @@ -11,22 +11,41 @@ rm(list=ls()) } source("KIRC_getRes.R") +dt <- format(Sys.Date(),"%y%m%d") + kirc <- KIRC_getRes() +tmp <- melt(do.call("rbind",kirc)) +tmp <- cbind(tmp,"KIRC") +write.table(tmp,file=sprintf("KIRCstats_%s.txt",dt),sep="\t",col=T,row=T,quote=F) +rm(tmp) + kirc2 <- .getBest(kirc) kirc2 <- cbind(kirc2,"KIRC") source("GBM_getRes.R") gbm <- GBM_getRes() +tmp <- melt(do.call("rbind",gbm)) +tmp <- cbind(tmp,"GBM") +write.table(tmp,file=sprintf("GBMstats_%s.txt",dt),sep="\t",col=T,row=T,quote=F) +rm(tmp) gbm2 <- .getBest(gbm) gbm2 <- cbind(gbm2,"GBM") source("OV_getRes.R") ov <- OV_getRes() +tmp <- melt(do.call("rbind",ov)) +tmp <- cbind(tmp,"OV") +write.table(tmp,file=sprintf("OVstats_%s.txt",dt),sep="\t",col=T,row=T,quote=F) +rm(tmp) ov2 <- .getBest(ov) ov2 <- cbind(ov2,"OV") source("LUSC_getRes.R") lusc <- LUSC_getRes() +tmp <- melt(do.call("rbind",lusc)) +tmp <- cbind(tmp,"LUSC") +write.table(tmp,file=sprintf("LUSCstats_%s.txt",dt),sep="\t",col=T,row=T,quote=F) +rm(tmp) lusc2 <- .getBest(lusc) lusc2 <- cbind(lusc2,"LUSC") @@ -35,7 +54,6 @@ comb <- as.data.frame(comb) rownames(comb)<- NULL comb[,1] <- as.numeric(as.character(comb[,1])) colnames(comb) <- c("val","method","datatype","tumour") -dt <- format(Sys.Date(),"%y%m%d") write.table(comb,file=sprintf("netDx_bestModel_%s.txt",dt),sep="\t",col=T,row=F,quote=F) # convert into table form for comparison with Yuan et al. From 8cbc20029d417f2323c6bf28731333d432433d5d Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Mon, 14 May 2018 15:00:37 -0400 Subject: [PATCH 100/124] handles cases where no nets pass fs --- .../noPrune/PanCancer_noPrune_impute.R | 37 ++++++++++++------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/misc/PanCancer/noPrune/PanCancer_noPrune_impute.R b/misc/PanCancer/noPrune/PanCancer_noPrune_impute.R index e6570d38..bd806e35 100644 --- a/misc/PanCancer/noPrune/PanCancer_noPrune_impute.R +++ b/misc/PanCancer/noPrune/PanCancer_noPrune_impute.R @@ -216,6 +216,7 @@ for (rngNum in rngVals) { # feature selected nets pass cutoff threshold pTally <- pTally[which(pTally[,2]>=cutoff),1] cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { # query of all training samples for this class qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & @@ -227,22 +228,30 @@ for (rngNum in rngVals) { resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), pheno_all,g) + } else { + predRes[[g]] <- NA + } } - predClass <- GM_OneVAll_getClass(predRes) - out <- merge(x=pheno_all,y=predClass,by="ID") - outFile <- sprintf("%s/predictionResults.txt",pDir) - write.table(out,file=outFile,sep="\t",col=T,row=F, - quote=F) - - acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) - cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", - nrow(out), acc*100)) - - require(ROCR) - ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, - out$STATUS=="SURVIVEYES") - save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + outFile <- sprintf("%s/predictionResults.txt",pDir) + write.table(out,file=outFile,sep="\t",col=T,row=F, + quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + } } #cleanup to save disk space From 1ac9cdba3740596467fd6a5a86590f9434ba49b4 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Mon, 14 May 2018 15:00:50 -0400 Subject: [PATCH 101/124] handles cases where no nets pass fs --- misc/PanCancer/noPrune/PanCancer_noPrune.R | 41 ++++++++++++++-------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/misc/PanCancer/noPrune/PanCancer_noPrune.R b/misc/PanCancer/noPrune/PanCancer_noPrune.R index 31ed79dd..1c114126 100644 --- a/misc/PanCancer/noPrune/PanCancer_noPrune.R +++ b/misc/PanCancer/noPrune/PanCancer_noPrune.R @@ -44,7 +44,9 @@ require(netDx.examples) numCores <- 8L GMmemory <- 4L trainProp <- 0.8 -cutoff <- 9 +cutoff <- cutoffSet +cat(sprintf("FS cutoff = %i\n", cutoff)) + if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) dir.create(megaDir) @@ -188,6 +190,7 @@ for (rngNum in rngVals) { # feature selected nets pass cutoff threshold pTally <- pTally[which(pTally[,2]>=cutoff),1] cat(sprintf("%s: %i pathways\n",g,length(pTally))) + if (length(pTally)>=1) { # query of all training samples for this class qSamps <- pheno_all$ID[which(pheno_all$STATUS %in% g & @@ -199,22 +202,30 @@ for (rngNum in rngVals) { resFile <- runGeneMANIA(testdbDir$dbDir,qFile,resDir=pDir2) predRes[[g]] <- GM_getQueryROC(sprintf("%s.PRANK",resFile), pheno_all,g) + } else { + predRes[[g]] <- NA + } } - predClass <- GM_OneVAll_getClass(predRes) - out <- merge(x=pheno_all,y=predClass,by="ID") - outFile <- sprintf("%s/predictionResults.txt",pDir) - write.table(out,file=outFile,sep="\t",col=T,row=F, - quote=F) - - acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) - cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", - nrow(out), acc*100)) - - require(ROCR) - ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, - out$STATUS=="SURVIVEYES") - save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + if (any(is.na(predRes))) { + cat("One or more groups had zero feature selected nets\n") + cat("# no feature-selected nets.\n",file=outFile) + }else { + predClass <- GM_OneVAll_getClass(predRes) + out <- merge(x=pheno_all,y=predClass,by="ID") + outFile <- sprintf("%s/predictionResults.txt",pDir) + write.table(out,file=outFile,sep="\t",col=T,row=F, + quote=F) + + acc <- sum(out$STATUS==out$PRED_CLASS)/nrow(out) + cat(sprintf("Accuracy on %i blind test subjects = %2.1f%%\n", + nrow(out), acc*100)) + + require(ROCR) + ROCR_pred <- prediction(out$SURVIVEYES_SCORE-out$SURVIVENO, + out$STATUS=="SURVIVEYES") + save(predRes,ROCR_pred,file=sprintf("%s/predRes.Rdata",pDir)) + } } #cleanup to save disk space From 184a5362be8d1b286d9e1228b8a5b55ddc3b11f8 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Mon, 14 May 2018 15:01:00 -0400 Subject: [PATCH 102/124] minor formatting --- misc/PanCancer/noPrune/KIRC_noPrune_pipeline.R | 2 +- misc/PanCancer/noPrune/LUSC_noPrune_pipeline.R | 2 +- misc/PanCancer/noPrune/OV_noPrune_pipeline.R | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/misc/PanCancer/noPrune/KIRC_noPrune_pipeline.R b/misc/PanCancer/noPrune/KIRC_noPrune_pipeline.R index b1f2c25a..a5314bba 100644 --- a/misc/PanCancer/noPrune/KIRC_noPrune_pipeline.R +++ b/misc/PanCancer/noPrune/KIRC_noPrune_pipeline.R @@ -114,7 +114,7 @@ rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) # ----------------------------------------------------------- # run predictor source("PanCancer_noPrune.R") -runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, +runPredictor(mega_combList=combList,rngVals=9:20,netSets=netSets, dats=dats,pheno_all=pheno_all,megaDir=megaDir, cutoffSet=9,maxEdge=6000,spCutoff=0.3) diff --git a/misc/PanCancer/noPrune/LUSC_noPrune_pipeline.R b/misc/PanCancer/noPrune/LUSC_noPrune_pipeline.R index 8719c5d8..bb3b4715 100644 --- a/misc/PanCancer/noPrune/LUSC_noPrune_pipeline.R +++ b/misc/PanCancer/noPrune/LUSC_noPrune_pipeline.R @@ -116,4 +116,4 @@ rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) source("PanCancer_noPrune.R") runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, dats=dats,pheno_all=pheno_all,megaDir=megaDir, - cutoffSet=8,maxEdge=6000,spCutoff=0.3) + cutoffSet=9,maxEdge=6000,spCutoff=0.3) diff --git a/misc/PanCancer/noPrune/OV_noPrune_pipeline.R b/misc/PanCancer/noPrune/OV_noPrune_pipeline.R index ce581db3..4166924a 100644 --- a/misc/PanCancer/noPrune/OV_noPrune_pipeline.R +++ b/misc/PanCancer/noPrune/OV_noPrune_pipeline.R @@ -106,7 +106,7 @@ rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) source("PanCancer_noPrune.R") runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, dats=dats,pheno_all=pheno_all,megaDir=megaDir, - cutoffSet=8,maxEdge=6000,spCutoff=0.3) + cutoffSet=9,maxEdge=6000,spCutoff=0.3) From fde989a2aaeaa664c42fdc87105efb8848f04849 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Wed, 23 May 2018 06:58:10 -0400 Subject: [PATCH 103/124] one-net with lasso prefiltering --- misc/Ependymoma/netDx_prunedOneNet.R | 45 ++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 misc/Ependymoma/netDx_prunedOneNet.R diff --git a/misc/Ependymoma/netDx_prunedOneNet.R b/misc/Ependymoma/netDx_prunedOneNet.R new file mode 100644 index 00000000..541cda81 --- /dev/null +++ b/misc/Ependymoma/netDx_prunedOneNet.R @@ -0,0 +1,45 @@ +# Ependymoma +rm(list=ls()) + +require(netDx) +require(netDx.examples) + +rootDir <- "/home/shraddhapai/BaderLab/2017_Ependymoma" +inDir <- sprintf("%s/input/netDx_prepared",rootDir) +outDir <- sprintf("%s/output",rootDir) +load(sprintf("%s/Ependymoma_cohortMerged_180125.Rdata",inDir)) + +# exclude ST +idx <- which(pheno$STATUS=="ST") +pheno <- pheno[-idx,] +xpr <- xpr[,-idx] + +makeNets <- function(dataList, groupList, netDir,...) { + netList <- c() + # make RNA nets: group by pathway + if (!is.null(groupList[["rna"]])) { + netList <- makePSN_NamedMatrix(dataList$rna, + rownames(dataList$rna), + groupList[["rna"]],netDir,verbose=FALSE, + writeProfiles=TRUE,...) + netList <- unlist(netList) + cat(sprintf("Made %i RNA pathway nets\n", length(netList))) + } + cat(sprintf("Total of %i nets\n", length(netList))) + return(netList) +} + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/Epen_prunedOneNet_%s",outDir,dt) +if (!file.exists(megaDir)) dir.create(megaDir) + +gps <- list(rna=list(rna=rownames(xpr))) +dats <- list(rna=xpr) +pheno$STATUS <- as.character(droplevels(pheno$STATUS)) + +runPredictor_nestedCV(pheno, + dataList=dats,groupList=gps, + makeNetFunc=makeNets, ### custom network creation function + outDir=sprintf("%s/pred",megaDir), + numCores=8L,nFoldCV=10L, CVcutoff=9L,numSplits=10L,startAt=1L, + preFilter=TRUE) From b59c813a44ac4fcc3e5a9a9a29dd19470593eebd Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Wed, 23 May 2018 17:55:46 -0400 Subject: [PATCH 104/124] clean version --- misc/PanCancer/corrFeatWithOutcome.R | 156 +++++++++++++++++++++++++++ misc/PanCancer/corrFeat_test.R | 49 +++++++++ 2 files changed, 205 insertions(+) create mode 100644 misc/PanCancer/corrFeatWithOutcome.R create mode 100644 misc/PanCancer/corrFeat_test.R diff --git a/misc/PanCancer/corrFeatWithOutcome.R b/misc/PanCancer/corrFeatWithOutcome.R new file mode 100644 index 00000000..8e303b96 --- /dev/null +++ b/misc/PanCancer/corrFeatWithOutcome.R @@ -0,0 +1,156 @@ +#' correlate feature with outcome +#' +#' @details Shows patient-level data for features of interest. For complex +#' features such as pathways, shows the patient-level PC projections, and +#' correlates these PC projects with outcome. It is a compact representation +#' of patient-level pathway activity. +#' @param pheno (data.frame) must have ID and STATUS +#' @param datList (list) keys are datatypes, and values are data.frames or +#' matrix with patients in columns and values in rows +#' @param groupList (list) unit groupings. keys are datatypes, values are lists +#' of unit groups. e.g. for pathway grouping in rna, groupList[["rna"]] would +#' be a list with pathway names as keys and genes as values. +#' @param inputNets (data.frame) contents of inputNets.txt. Two-column table +#' with column 1 having datatype and column 2 having net name. +#' @param selFeatures (char) Features to correlate. Typically these would +#' be high-scoring features from the predictor. Names should match column 2 +#' of inputNets +#' @param numPCs (integer) how many principal components to show data for +#' @param filePfx (char) prefix for output pdfs. +#' @import plotrix +#' @return No value. Side effect of creating plots corr +#' @export +corrFeatWithOutcome <- function(pheno, datList, groupList,inputNets, + selFeatures,numPCs=3,filePfx="corrFeat",plotLevels) { + + dt <- format(Sys.Date(),"%y%m%d") + if (!class(pheno$STATUS)=="factor") pheno$STATUS <- factor(pheno$STATUS) + + resMat <- matrix(0, nrow=length(selFeatures), ncol=numPCs*2) + isDone <- rep(FALSE, length(selFeatures)) + rownames(resMat) <- selFeatures + #rownames(resMat) <- gsub("_"," ",sub(".profile$|_cont$","",selFeatures)) + #rownames(resMat) <- toTitleCase(rownames(resMat)) + plotList <- list();plotCtr <- 1 + + for (idx in 1:length(selFeatures)) { + curF <- selFeatures[idx] + print(curF) + netType <- inputNets[which(inputNets[,2]==curF),1] + myDat <- datList[[netType]] + mySet <- groupList[[netType]][[curF]] + dat <- myDat[mySet,] + dat <- na.omit(dat) + + dat <- dat[,colSums(is.na(dat))==0] + if (ncol(dat)>=3) { + pr <- prcomp(na.omit(t(dat))) + pr <- pr$x[,1:numPCs] + } else { + cat(sprintf("\t\t%s: Has < 3 values!!\n",curF)) + pr <- dat + } + + maxDim <- min(ncol(pr),numPCs) + numPCs <- maxDim + tmp <- data.frame(pr[,1:maxDim],STATUS=pheno$STATUS) + colnames(tmp)[1:maxDim] <- paste("PC",1:maxDim,sep="") + + combs <- combn(maxDim,2) + + for (k in 1:numPCs) { + y <- cor.test(pr[,k],as.integer(pheno$STATUS),method="spearman") + resMat[idx,k] <- y$estimate + resMat[idx,3+k] <- -log10(y$p.value) # y$p.value) + } + + for (k in 1:ncol(combs)) { + i <- combs[1,k]; j <- combs[2,k] + cat(sprintf("[%i %i]\n",i,j)) + + # draw decision boundary in automated manner + tmp2 <- tmp[,c("STATUS",sprintf("PC%i",i),sprintf("PC%i",j))] + colnames(tmp2) <- c("y","x1","x2") + mdl <- glm(y ~ ., data=tmp2,family=binomial) + slope <- coef(mdl)[2]/(-coef(mdl)[3]) + intercept <- coef(mdl)[1]/(-coef(mdl)[3]) + + showLeg_Flag <- (k == ncol(combs)) + if (ncol(combs) > 4) { + pt <- 0.8; lwd <- 1 ;cex <-5 + } else { + pt <- 2; lwd <- 2; cex <- 12 + } + + p <- ggplot(tmp,aes_string(x=sprintf("PC%i",i), + y=sprintf("PC%i",j))) + p <- p + geom_point(aes(colour=factor(STATUS,levels=plotLevels)),alpha=0.6, + size=pt,show.legend=showLeg_Flag) + p <- p + geom_abline(intercept=intercept,slope=slope, + colour="gray50",lwd=lwd) + p <- p + ggtitle(sprintf("%s\ncor=%1.2f (p<%1.2e)", + rownames(resMat)[idx],resMat[idx,i],10^-resMat[idx,3+i])) + p <- p + theme(# legend.position="none", + axis.ticks=element_blank(), + axis.text=element_blank(), + plot.title=element_text(size=cex)) + + plotList[[plotCtr]] <- p + plotCtr <- plotCtr+1 + } +} + +# now plot the PC projections with colour-coded status + + +nr <- 3; nc <- choose(numPCs,2) +pdf(sprintf("%s_PCview_%s.pdf", filePfx,dt),height=11,width=11) +tryCatch({ + for (sidx in seq(1,length(plotList),nr*nc)) { + eidx <- sidx+((nr*nc)-1); + cat(sprintf("%i-%i\n",sidx,eidx)) + if (eidx>length(plotList)) eidx <- length(plotList) + multiplot(plotlist=plotList[sidx:eidx], + layout=matrix(1:(nr*nc),ncol=nc,byrow=TRUE)) +} +},error=function(ex) {print(ex)},finally={dev.off()}) + +resMat <- resMat[order(abs(resMat[,1]),decreasing=TRUE),] +pdf(sprintf("%s_PCtable_%s.pdf",filePfx,dt),height=11,width=11) +tryCatch({ +#colnames(resMat)[1:3] <- paste("PC ",1:3,sep="") +par(mar = c(0.5, 35, 6.5, 0.5)) + plotrix::color2D.matplot(resMat[,1:3],show.values=TRUE,axes=F, + xlab="",ylab="",vcex=2,vcol='black', + cs1=c(1,1,0),cs2=c(0,1,0),cs3=c(0,1,1)) + axis(3,at=seq_len(3)-0.5,labels=colnames(resMat)[4:6], + tick=F,cex.axis=1,line=-1) + axis(2,at=seq_len(nrow(resMat))-0.5, + labels=sub(".profile","", + sub("_cont","",rev(rownames(resMat)))),tick=F, + las=1,cex.axis=1) +},error=function(ex){print(ex)},finally={dev.off()}) + +outFile <- sprintf("corrFeat_table_%s.txt",dt) +write.table(resMat,file=outFile,sep="\t",col=T,row=F,quote=F) + +return(resMat) +} + +toTitleCase <- function(str) { + str <- tolower(str) + sp <- gregexpr(" ",str) + str2 <- sapply(1:length(str), function(i) { + z <- str[i] + z <- paste(toupper(substr(z,1,1)),substr(z,2,nchar(z)),sep="") + if (!sp[[i]][1]==-1) { + for (idx in sp[[i]]) { + z <- gsub(paste("^(.{",idx,"}).",sep=""), + paste("\\1",toupper(substr(z,idx+1,idx+1)),sep=""),z); + } + } + z + }) + str2 <- unlist(str2) + str2 +} diff --git a/misc/PanCancer/corrFeat_test.R b/misc/PanCancer/corrFeat_test.R new file mode 100644 index 00000000..9d7f1370 --- /dev/null +++ b/misc/PanCancer/corrFeat_test.R @@ -0,0 +1,49 @@ +# testing BRCA pathway-level correlation view +rm(list=ls()) + +require(netDx) +require(netDx.examples) +data(TCGA_BRCA) + +subtypes<- c("LumA") +pheno$STATUS[which(!pheno$STATUS %in% subtypes)] <- "other" +pheno$STATUS <- factor(pheno$STATUS, levels=c("other","LumA")) +subtypes <- c(subtypes,"other") # add residual + +rootDir <- "/home/shraddhapai/BaderLab/2017_BRCA" +pathFile <- sprintf("%s/anno/Human_AllPathways_February_01_2018_symbol.gmt", + rootDir) +pathwayList <- readPathways(pathFile) +head(pathwayList) + +gps <- list(rna=pathwayList) +dats <- list(rna=xpr) + +inputNets <- data.frame(datatype="rna",netName=names(pathwayList)) + +inDir <- sprintf("%s/output/BRCA_part2_180223",rootDir) +predClasses <- c("LumA","other") +featScores <- getFeatureScores(inDir,predClasses=predClasses) +featSelNet <- lapply(featScores, function(x) { + callFeatSel(x, fsCutoff=10, fsPctPass=0.7) +}) + +###selFeat <- c( +### "ACTIVATION_OF_ATR_IN_RESPONSE_TO_REPLICATION_STRESS.profile", +### "ADENOSINE_DEOXYRIBONUCLEOTIDES__I_DE_NOVO__I__BIOSYNTHESIS.profile", +### "MISMATCH_REPAIR__MMR__DIRECTED_BY_MSH2:MSH3__MUTSBETA_.profile", +### "MITOTIC_SPINDLE_CHECKPOINT.profile", +### "RESOLUTION_OF_D-LOOP_STRUCTURES.profile" +###) +set.seed(123) +selFeat <- featSelNet[["LumA"]] +selFeat <- selFeat[sample(1:length(selFeat),10,replace=F)] +selFeat <- c(selFeat, "MISMATCH_REPAIR__MMR__DIRECTED_BY_MSH2:MSH3__MUTSBETA_.profile") +selFeat <- sub(".profile","",selFeat) + +source("multiplot.R") +require(plotrix) +require(ggplot2) +source("corrFeatWithOutcome.R"); +resMat <- corrFeatWithOutcome(pheno,dats,gps,inputNets,selFeat,numPCs=3,filePfx="LumA_top10",plotLevels=c("LumA","other")) + From 22e35a014dc720951e70210c7e7d049d0799a725 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Wed, 23 May 2018 17:56:10 -0400 Subject: [PATCH 105/124] variations of prefiltering --- misc/Ependymoma/Epen_plotResults.R | 12 ++-- misc/Ependymoma/buggy/netDx_prunedOneNet.R | 2 +- misc/Ependymoma/netDx_nopruneOneNet.R | 45 ++++++++++++ .../Ependymoma/netDx_prunedOneNet_outsideCV.R | 68 +++++++++++++++++++ misc/Ependymoma/netDx_prunedOneNet_univ.R | 59 ++++++++++++++++ 5 files changed, 181 insertions(+), 5 deletions(-) create mode 100644 misc/Ependymoma/netDx_nopruneOneNet.R create mode 100644 misc/Ependymoma/netDx_prunedOneNet_outsideCV.R create mode 100644 misc/Ependymoma/netDx_prunedOneNet_univ.R diff --git a/misc/Ependymoma/Epen_plotResults.R b/misc/Ependymoma/Epen_plotResults.R index bdae47a4..4889261c 100644 --- a/misc/Ependymoma/Epen_plotResults.R +++ b/misc/Ependymoma/Epen_plotResults.R @@ -23,7 +23,10 @@ pathwayList <- readPathways(pathFile) #setName <- "Epen_prunedOneNet_0.001_180409" #setName <- "Epen_prunedPathway_0.01_180409" -setName <- "Epen_prunedOneNet_0.001_180410" +#setName <- "Epen_prunedOneNet_180522" +#setName <- "Epen_prunedOneNet_0.001_180523" +#setName <- "Epen_nopruneOneNet_180523" +setName <- "Epen_lassoOutsideCV_180523" inDir <- sprintf("%s/output/%s/pred",rootDir,setName) outDir <- sprintf("%s/output/%s/plot",rootDir,setName) @@ -36,11 +39,12 @@ auroc <- unlist(lapply(predPerf, function(x) x$auroc)) aupr <- unlist(lapply(predPerf, function(x) x$aupr)) acc <- unlist(lapply(predPerf, function(x) x$accuracy)) +den <- sqrt(length(auroc)) cat("--------------\n") cat(sprintf("Performance: %s\n",setName)) -cat(sprintf("AUROC = %1.2f +/- %1.2f\n",mean(auroc),sd(auroc))) -cat(sprintf("AUPR = %1.2f +/- %1.2f\n",mean(aupr),sd(aupr))) -cat(sprintf("Accuracy = %1.2f +/- %1.2f\n",mean(acc),sd(acc))) +cat(sprintf("AUROC = %1.2f +/- %1.2f\n",mean(auroc),sd(auroc)/den)) +cat(sprintf("AUPR = %1.2f +/- %1.2f\n",mean(aupr),sd(aupr)/den)) +cat(sprintf("Accuracy = %1.2f +/- %1.2f\n",mean(acc),sd(acc)/den)) cat("--------------\n") diff --git a/misc/Ependymoma/buggy/netDx_prunedOneNet.R b/misc/Ependymoma/buggy/netDx_prunedOneNet.R index 9c35681c..0f36fdf3 100644 --- a/misc/Ependymoma/buggy/netDx_prunedOneNet.R +++ b/misc/Ependymoma/buggy/netDx_prunedOneNet.R @@ -43,7 +43,7 @@ pheno$STATUS <- droplevels(pheno$STATUS) ### BEGIN PRUNING CODE # apply pruning to proteomic data curwd <- getwd() -setwd("../PanCancer") +setwd("../../PanCancer") source("LMprune.R") source("runLM.R") source("silh.R") diff --git a/misc/Ependymoma/netDx_nopruneOneNet.R b/misc/Ependymoma/netDx_nopruneOneNet.R new file mode 100644 index 00000000..2c1cb09d --- /dev/null +++ b/misc/Ependymoma/netDx_nopruneOneNet.R @@ -0,0 +1,45 @@ +# Ependymoma +rm(list=ls()) + +require(netDx) +require(netDx.examples) + +rootDir <- "/home/shraddhapai/BaderLab/2017_Ependymoma" +inDir <- sprintf("%s/input/netDx_prepared",rootDir) +outDir <- sprintf("%s/output",rootDir) +load(sprintf("%s/Ependymoma_cohortMerged_180125.Rdata",inDir)) + +# exclude ST +idx <- which(pheno$STATUS=="ST") +pheno <- pheno[-idx,] +xpr <- xpr[,-idx] + +makeNets <- function(dataList, groupList, netDir,...) { + netList <- c() + # make RNA nets: group by pathway + if (!is.null(groupList[["rna"]])) { + netList <- makePSN_NamedMatrix(dataList$rna, + rownames(dataList$rna), + groupList[["rna"]],netDir,verbose=FALSE, + writeProfiles=TRUE,...) + netList <- unlist(netList) + cat(sprintf("Made %i RNA pathway nets\n", length(netList))) + } + cat(sprintf("Total of %i nets\n", length(netList))) + return(netList) +} + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/Epen_nopruneOneNet_%s",outDir,dt) +if (!file.exists(megaDir)) dir.create(megaDir) + +gps <- list(rna=list(rna=rownames(xpr))) +dats <- list(rna=xpr) +pheno$STATUS <- as.character(droplevels(pheno$STATUS)) + +runPredictor_nestedCV(pheno, + dataList=dats,groupList=gps, + makeNetFunc=makeNets, ### custom network creation function + outDir=sprintf("%s/pred",megaDir), + numCores=8L,nFoldCV=10L, CVcutoff=9L,numSplits=10L,startAt=1L, + preFilter=FALSE) diff --git a/misc/Ependymoma/netDx_prunedOneNet_outsideCV.R b/misc/Ependymoma/netDx_prunedOneNet_outsideCV.R new file mode 100644 index 00000000..16fabd76 --- /dev/null +++ b/misc/Ependymoma/netDx_prunedOneNet_outsideCV.R @@ -0,0 +1,68 @@ +# Ependymoma +rm(list=ls()) + +require(netDx) +require(netDx.examples) +require(glmnet) + +rootDir <- "/home/shraddhapai/BaderLab/2017_Ependymoma" +inDir <- sprintf("%s/input/netDx_prepared",rootDir) +outDir <- sprintf("%s/output",rootDir) +load(sprintf("%s/Ependymoma_cohortMerged_180125.Rdata",inDir)) + +# exclude ST +idx <- which(pheno$STATUS=="ST") +pheno <- pheno[-idx,] +xpr <- xpr[,-idx] + +makeNets <- function(dataList, groupList, netDir,...) { + netList <- c() + # make RNA nets: group by pathway + if (!is.null(groupList[["rna"]])) { + netList <- makePSN_NamedMatrix(dataList$rna, + rownames(dataList$rna), + groupList[["rna"]],netDir,verbose=FALSE, + writeProfiles=TRUE,...) + netList <- unlist(netList) + cat(sprintf("Made %i RNA pathway nets\n", length(netList))) + } + cat(sprintf("Total of %i nets\n", length(netList))) + return(netList) +} + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/Epen_lassoOutsideCV_%s",outDir,dt) +if (!file.exists(megaDir)) dir.create(megaDir) + +gps <- list(rna=list(rna=rownames(xpr))) +dats <- list(rna=xpr) +pheno$STATUS <- as.character(droplevels(pheno$STATUS)) + + #### Test of doing something WRONG - lasso outside cv loop + cat("Prefiltering enabled\n") + for (nm in names(dats)) { + if (nrow(dats[[nm]])<2) # only has one var, take it. + vars <- rownames(dats[[nm]]) + else { + set.seed(123) + fit <- cv.glmnet(x=t(na.omit(dats[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=1) # lasso + wt <- abs(coef(fit,s="lambda.min")[,1]) + vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") + } + if (length(vars)>0) { + tmp <- dats[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats[[nm]] <- tmp + gps[[nm]] <- list(rna=rownames(dats[[nm]])) + } + cat(sprintf("%s: %s pruned\n",nm,length(vars))) + } + +browser() +runPredictor_nestedCV(pheno, + dataList=dats,groupList=gps, + makeNetFunc=makeNets, ### custom network creation function + outDir=sprintf("%s/pred",megaDir), + numCores=8L,nFoldCV=10L, CVcutoff=9L,numSplits=10L,startAt=1L, + preFilter=FALSE) diff --git a/misc/Ependymoma/netDx_prunedOneNet_univ.R b/misc/Ependymoma/netDx_prunedOneNet_univ.R new file mode 100644 index 00000000..0f2253b0 --- /dev/null +++ b/misc/Ependymoma/netDx_prunedOneNet_univ.R @@ -0,0 +1,59 @@ +# Ependymoma +rm(list=ls()) + +require(netDx) +require(netDx.examples) + +rootDir <- "/home/shraddhapai/BaderLab/2017_Ependymoma" +inDir <- sprintf("%s/input/netDx_prepared",rootDir) +outDir <- sprintf("%s/output",rootDir) +pathFile <-sprintf("%s/anno/Human_AllPathways_February_01_2018_symbol.gmt", + rootDir) +load(sprintf("%s/Ependymoma_cohortMerged_180125.Rdata",inDir)) + +# exclude ST +idx <- which(pheno$STATUS=="ST") +pheno <- pheno[-idx,] +xpr <- xpr[,-idx] + +makeNets <- function(dataList, groupList, netDir,...) { + netList <- c() + # make RNA nets: group by pathway + if (!is.null(groupList[["rna"]])) { + netList <- makePSN_NamedMatrix(dataList$rna, + rownames(dataList$rna), + groupList[["rna"]],netDir,verbose=FALSE, + writeProfiles=TRUE,...) + netList <- unlist(netList) + cat(sprintf("Made %i RNA pathway nets\n", length(netList))) + } + cat(sprintf("Total of %i nets\n", length(netList))) + return(netList) +} + +dt <- format(Sys.Date(),"%y%m%d") +megaDir <- sprintf("%s/Epen_prunedOneNet_0.001_%s",outDir,dt) +if (!file.exists(megaDir)) dir.create(megaDir) + +gps <- list(rna=list(rna=rownames(xpr))) +dats <- list(rna=xpr) +pheno$STATUS <- droplevels(pheno$STATUS) +pheno$STATUS <- as.character(pheno$STATUS) + +#### ----------------------------------------------------- +### BEGIN PRUNING CODE +# apply pruning to proteomic data +curwd <- getwd() +setwd("../PanCancer") +source("LMprune.R") +source("runLM.R") +source("silh.R") +require(cluster) +setwd(curwd) +#### ---------------------------------------------------------- + +runPredictor_nestedCV_univ(pheno, + dataList=dats,groupList=gps, + makeNetFunc=makeNets, ### custom network creation function + outDir=sprintf("%s/pred",megaDir), + numCores=8L,nFoldCV=10L, CVcutoff=9L,numSplits=10L,startAt=1L,preFilter=TRUE) From 1ecea10201234851e148e9bb60bb04f668bd1b98 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 24 May 2018 16:30:12 -0400 Subject: [PATCH 106/124] moved --- .../pruneVersion/corrFeatWithOutcome.R | 149 ------------------ 1 file changed, 149 deletions(-) delete mode 100644 misc/PanCancer/pruneVersion/corrFeatWithOutcome.R diff --git a/misc/PanCancer/pruneVersion/corrFeatWithOutcome.R b/misc/PanCancer/pruneVersion/corrFeatWithOutcome.R deleted file mode 100644 index 3752c115..00000000 --- a/misc/PanCancer/pruneVersion/corrFeatWithOutcome.R +++ /dev/null @@ -1,149 +0,0 @@ -#' correlate feature with outcome -#' -#' @details Shows patient-level data for features of interest. For complex -#' features such as pathways, shows the patient-level PC projections, and -#' correlates these PC projects with outcome. It is a compact representation -#' of patient-level pathway activity. -#' @param pheno (data.frame) must have ID and STATUS -#' @param datList (list) keys are datatypes, and values are data.frames or -#' matrix with patients in columns and values in rows -#' @param groupList (list) unit groupings. keys are datatypes, values are lists -#' of unit groups. e.g. for pathway grouping in rna, groupList[["rna"]] would -#' be a list with pathway names as keys and genes as values. -#' @param inputNets (data.frame) contents of inputNets.txt. Two-column table -#' with column 1 having datatype and column 2 having net name. -#' @param selFeatures (char) Features to correlate. Typically these would -#' be high-scoring features from the predictor. Names should match column 2 -#' of inputNets -#' @param numPCs (integer) how many principal components to show data for -#' @param filePfx (char) prefix for output pdfs. -#' @import plotrix -#' @return No value. Side effect of creating plots corr -#' @export -corrFeatWithOutcome <- function(pheno, datList, groupList,inputNets, - selFeatures,numPCs=3,filePfx="corrFeat") { - - if (!class(pheno$STATUS)=="factor") pheno$STATUS <- factor(pheno$STATUS) - - resMat <- matrix(0, nrow=length(selFeatures), ncol=numPCs*2) - isDone <- rep(FALSE, length(selFeatures)) - rownames(resMat) <- selFeatures - #rownames(resMat) <- gsub("_"," ",sub(".profile$|_cont$","",selFeatures)) - #rownames(resMat) <- toTitleCase(rownames(resMat)) - plotList <- list();plotCtr <- 1 - - for (idx in 1:length(selFeatures)) { - curF <- selFeatures[idx] - print(curF) - netType <- inputNets[which(inputNets[,2]==curF),1] - myDat <- datList[[netType]] - mySet <- groupList[[netType]][[curF]] - dat <- myDat[mySet,] - dat <- na.omit(dat) - - dat <- dat[,colSums(is.na(dat))==0] - if (ncol(dat)>=3) { - pr <- prcomp(na.omit(t(dat))) - pr <- pr$x[,1:numPCs] - } else { - cat(sprintf("\t\t%s: Has < 3 values!!\n",curF)) - pr <- dat - } - - maxDim <- min(ncol(pr),numPCs) - numPCs <- maxDim - tmp <- data.frame(pr[,1:maxDim],STATUS=pheno$STATUS) - colnames(tmp)[1:maxDim] <- paste("PC",1:maxDim,sep="") - - combs <- combn(maxDim,2) - - for (k in 1:numPCs) { - y <- cor.test(pr[,k],as.integer(pheno$STATUS),method="spearman") - resMat[idx,k] <- y$estimate - resMat[idx,3+k] <- -log10(y$p.value) # y$p.value) - } - - for (k in 1:ncol(combs)) { - i <- combs[1,k]; j <- combs[2,k] - cat(sprintf("[%i %i]\n",i,j)) - - # draw decision boundary in automated manner - tmp2 <- tmp[,c("STATUS",sprintf("PC%i",i),sprintf("PC%i",j))] - colnames(tmp2) <- c("y","x1","x2") - mdl <- glm(y ~ ., data=tmp2,family=binomial) - slope <- coef(mdl)[2]/(-coef(mdl)[3]) - intercept <- coef(mdl)[1]/(-coef(mdl)[3]) - - showLeg_Flag <- (k == ncol(combs)) - if (ncol(combs) > 4) { - pt <- 0.8; lwd <- 1 ;cex <-5 - } else { - pt <- 2; lwd <- 2; cex <- 12 - } - - p <- ggplot(tmp,aes_string(x=sprintf("PC%i",i), - y=sprintf("PC%i",j))) - p <- p + geom_point(aes(colour=factor(STATUS)),alpha=0.6, - size=pt,show.legend=showLeg_Flag) - p <- p + geom_abline(intercept=intercept,slope=slope, - colour="gray50",lwd=lwd) - p <- p + ggtitle(sprintf("%s\ncor=%1.2f (p<%1.2e)", - rownames(resMat)[idx],resMat[idx,i],10^-resMat[idx,3+i])) - p <- p + theme(# legend.position="none", - axis.ticks=element_blank(), - axis.text=element_blank(), - plot.title=element_text(size=cex)) - - plotList[[plotCtr]] <- p - plotCtr <- plotCtr+1 - } -} - -# now plot the PC projections with colour-coded status - -nr <- 3; nc <- choose(numPCs,2) -pdf(sprintf("%s_PCview.pdf", filePfx),height=11,width=11) -tryCatch({ - for (sidx in seq(1,length(plotList),nr*nc)) { - eidx <- sidx+((nr*nc)-1); - cat(sprintf("%i-%i\n",sidx,eidx)) - if (eidx>length(plotList)) eidx <- length(plotList) - multiplot(plotlist=plotList[sidx:eidx], - layout=matrix(1:(nr*nc),ncol=nc,byrow=TRUE)) -} -},error=function(ex) {print(ex)},finally={dev.off()}) - -pdf(sprintf("%s_PCtable.pdf",filePfx),height=11,width=11) -tryCatch({ -#colnames(resMat)[1:3] <- paste("PC ",1:3,sep="") -par(mar = c(0.5, 35, 6.5, 0.5)) - plotrix::color2D.matplot(resMat[,1:3],show.values=TRUE,axes=F, - xlab="",ylab="",vcex=2,vcol='black', - cs1=c(1,1,0),cs2=c(0,1,0),cs3=c(0,1,1)) - axis(3,at=seq_len(3)-0.5,labels=colnames(resMat)[4:6], - tick=F,cex.axis=1,line=-1) - axis(2,at=seq_len(nrow(resMat))-0.5, - labels=sub(".profile","", - sub("_cont","",rev(rownames(resMat)))),tick=F, - las=1,cex.axis=1) -},error=function(ex){print(ex)},finally={dev.off()}) - -} - -toTitleCase <- function(str) { - str <- tolower(str) - sp <- gregexpr(" ",str) - str2 <- sapply(1:length(str), function(i) { - z <- str[i] - z <- paste(toupper(substr(z,1,1)),substr(z,2,nchar(z)),sep="") - if (!sp[[i]][1]==-1) { - for (idx in sp[[i]]) { - z <- gsub(paste("^(.{",idx,"}).",sep=""), - paste("\\1",toupper(substr(z,idx+1,idx+1)),sep=""),z); - } - } - z - }) - str2 <- unlist(str2) - str2 -} From 15cac12e0df822fe01630c33e53273ce27ef61a6 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 24 May 2018 16:30:35 -0400 Subject: [PATCH 107/124] replaced --- misc/PanCancer/plotCombinedKM_HRatio.R | 107 ------------------------- 1 file changed, 107 deletions(-) delete mode 100644 misc/PanCancer/plotCombinedKM_HRatio.R diff --git a/misc/PanCancer/plotCombinedKM_HRatio.R b/misc/PanCancer/plotCombinedKM_HRatio.R deleted file mode 100644 index 0665cd6e..00000000 --- a/misc/PanCancer/plotCombinedKM_HRatio.R +++ /dev/null @@ -1,107 +0,0 @@ -# average KM curves for KIRC -rm(list=ls()) - -require(rms) -require(survival) -require(survminer) -source("plot.survfit.custom.R") - -tumourType <- "OV" - -rootDir <- "/Users/shraddhapai/Dropbox/netDx/BaderLab" -if (tumourType=="KIRC") { - dataDir <- sprintf("%s/2017_TCGA_KIRC/output/KIRC_oneNetPer_normDiff_170518",rootDir) - survFile <- sprintf("%s/2017_TCGA_KIRC/input/KIRC_OS_core.txt",rootDir) - clinFile <- sprintf("%s/2017_TCGA_KIRC/input/KIRC_clinical_core.txt",rootDir) -} else if (tumourType=="LUSC") { - dataDir <- sprintf("%s/2017_TCGA_LUSC/output/LUSC_oneNetPer_170425",rootDir) - survFile <- sprintf("%s/2017_TCGA_LUSC/input/LUSC_OS_core.txt",rootDir) - clinFile <- sprintf("%s/2017_TCGA_LUSC/input/LUSC_clinical_core.txt",rootDir) -} else if (tumourType=="OV") { - dataDir <- sprintf("%s/2017_TCGA_OV/output/OV_oneNetPer_170425",rootDir) - survFile <- sprintf("%s/2017_TCGA_OV/input/OV_OS_core.txt",rootDir) - clinFile <- sprintf("%s/2017_TCGA_OV/input/OV_clinical_core.txt",rootDir) -} - -survDat <- read.delim(survFile,sep="\t",h=T,as.is=T) -clinDat <- read.delim(clinFile,sep="\t",h=T,as.is=T) -pheno <- merge(x=survDat,y=clinDat,by="feature") - -plotDF <- list() # compiles survival curves across all iterations -megaDF <- list() -hratio <- c() # cum hazards ratio for all iterations -for (k in 1:100) { - print(k) - dat <- read.delim(sprintf("%s/rng%i/clinical/predictionResults.txt", - dataDir,k),sep="\t",h=T,as.is=T) - colnames(dat)[1] <- "feature" - dat <- merge(x=dat,y=pheno,by="feature") - - # force first entry to be YES and second to be NO so we can tell them apart - # in the output. - dat$PRED_CLASS <- factor(dat$PRED_CLASS,levels=c("SURVIVEYES","SURVIVENO")) - - megaDF[[k]] <- dat - - dat$SurvObj <- with(dat, Surv(OS_OS, STATUS_INT == 0)) - - # get cum hazards ratio for this split - model <- coxph(SurvObj~PRED_CLASS, data=dat) - hratio <- c(hratio,summary(model)$coef[1,2]) - - fit <- npsurv(SurvObj ~ PRED_CLASS, data = dat, - conf.type = "log-log") - - par(mfrow=c(1,2)) - out <- plot.survfit.custom(fit) - plot(0,0,type='n',xlim=c(0,max(out$ends$x)),ylim=c(0,1)) - out[[1]] <- as.data.frame(out[[1]]) - out[[2]] <- as.data.frame(out[[2]]) -# lines(out[[1]]$xx,out[[1]]$yy,col='green') -# lines(out[[2]]$xx,out[[2]]$yy,col='red') - - newdf <- out[[1]]; newdf$PRED_CLASS <- "SURVIVEYES"; newdf$split <- k - newdf2 <- out[[2]]; newdf2$PRED_CLASS <- "SURVIVENO"; newdf2$split <- k - - plotDF[[k]] <- rbind(newdf,newdf2) -} - -# approach 1: pool all results and make a single KM curve -res <- do.call("rbind",megaDF) -res$SurvObj <- with(res, Surv(OS_OS,STATUS_INT==0)) -fit <- npsurv(SurvObj ~ PRED_CLASS, data=res,conf.type="log-log") - -idx <- which(hratio > 50) -if (any(idx)) hratio <- hratio[-idx] -hratio <- data.frame(group="tumour",hratio=hratio) -p <- ggplot(hratio,aes(group,y=hratio)) + geom_boxplot() + ylim(c(0,quantile(hratio$hratio,0.98))+3) -p <- p + ggtitle(sprintf("Cum hazard ratio (one per split)(N=%i)",nrow(hratio))) -p <- p + geom_hline(yintercept=1,lty=2) -p <- p + theme(axis.text=element_text(size=12), - axis.title=element_text(size=14,face="bold")) - -p1 <- survminer::ggsurvplot(fit,data=res,conf.int=TRUE) -p2 <- p - -# approach 2 : pool all step functions -res <- do.call("rbind",plotDF) -# compute ci -out <- list() -for (k in unique(res$PRED_CLASS)) { - res2 <- subset(res,PRED_CLASS==k) - ub <- c(); lb <- c(); xx <- unique(res2$xx); muy <- c() - for (x in xx) { - yy <- res2$yy[which(res2$xx == x)] - mu <- mean(yy); offset <- sd(yy)/sqrt(length(yy)) - lb <- c(lb, mu-offset) - ub <- c(ub, mu+offset) - muy <- c(muy, mu) - } - out[[k]] <- data.frame(x=xx,y=muy,lb=lb,ub=ub,PRED_CLASS=k) -} -blah <- do.call("rbind",out) -p3 <- ggplot(blah,(aes(x=x,y=y,colour=PRED_CLASS))) + geom_line() + geom_ribbon(aes(ymin=lb,ymax=ub),alpha=0.2) + ggtitle("Manual mean+CI of compiled KM") + ylab("% survival") +xlab("time (months)") - -pdf(sprintf("%s.pdf",tumourType)) -print(p1); print(p2); print(p3) -dev.off() From f89a8c23526a7fda4b9bfbb709f198f7eb099047 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 24 May 2018 16:30:47 -0400 Subject: [PATCH 108/124] outdated --- .../outdated/plotCombinedKM_HRatio.R | 107 ++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 misc/PanCancer/outdated/plotCombinedKM_HRatio.R diff --git a/misc/PanCancer/outdated/plotCombinedKM_HRatio.R b/misc/PanCancer/outdated/plotCombinedKM_HRatio.R new file mode 100644 index 00000000..0665cd6e --- /dev/null +++ b/misc/PanCancer/outdated/plotCombinedKM_HRatio.R @@ -0,0 +1,107 @@ +# average KM curves for KIRC +rm(list=ls()) + +require(rms) +require(survival) +require(survminer) +source("plot.survfit.custom.R") + +tumourType <- "OV" + +rootDir <- "/Users/shraddhapai/Dropbox/netDx/BaderLab" +if (tumourType=="KIRC") { + dataDir <- sprintf("%s/2017_TCGA_KIRC/output/KIRC_oneNetPer_normDiff_170518",rootDir) + survFile <- sprintf("%s/2017_TCGA_KIRC/input/KIRC_OS_core.txt",rootDir) + clinFile <- sprintf("%s/2017_TCGA_KIRC/input/KIRC_clinical_core.txt",rootDir) +} else if (tumourType=="LUSC") { + dataDir <- sprintf("%s/2017_TCGA_LUSC/output/LUSC_oneNetPer_170425",rootDir) + survFile <- sprintf("%s/2017_TCGA_LUSC/input/LUSC_OS_core.txt",rootDir) + clinFile <- sprintf("%s/2017_TCGA_LUSC/input/LUSC_clinical_core.txt",rootDir) +} else if (tumourType=="OV") { + dataDir <- sprintf("%s/2017_TCGA_OV/output/OV_oneNetPer_170425",rootDir) + survFile <- sprintf("%s/2017_TCGA_OV/input/OV_OS_core.txt",rootDir) + clinFile <- sprintf("%s/2017_TCGA_OV/input/OV_clinical_core.txt",rootDir) +} + +survDat <- read.delim(survFile,sep="\t",h=T,as.is=T) +clinDat <- read.delim(clinFile,sep="\t",h=T,as.is=T) +pheno <- merge(x=survDat,y=clinDat,by="feature") + +plotDF <- list() # compiles survival curves across all iterations +megaDF <- list() +hratio <- c() # cum hazards ratio for all iterations +for (k in 1:100) { + print(k) + dat <- read.delim(sprintf("%s/rng%i/clinical/predictionResults.txt", + dataDir,k),sep="\t",h=T,as.is=T) + colnames(dat)[1] <- "feature" + dat <- merge(x=dat,y=pheno,by="feature") + + # force first entry to be YES and second to be NO so we can tell them apart + # in the output. + dat$PRED_CLASS <- factor(dat$PRED_CLASS,levels=c("SURVIVEYES","SURVIVENO")) + + megaDF[[k]] <- dat + + dat$SurvObj <- with(dat, Surv(OS_OS, STATUS_INT == 0)) + + # get cum hazards ratio for this split + model <- coxph(SurvObj~PRED_CLASS, data=dat) + hratio <- c(hratio,summary(model)$coef[1,2]) + + fit <- npsurv(SurvObj ~ PRED_CLASS, data = dat, + conf.type = "log-log") + + par(mfrow=c(1,2)) + out <- plot.survfit.custom(fit) + plot(0,0,type='n',xlim=c(0,max(out$ends$x)),ylim=c(0,1)) + out[[1]] <- as.data.frame(out[[1]]) + out[[2]] <- as.data.frame(out[[2]]) +# lines(out[[1]]$xx,out[[1]]$yy,col='green') +# lines(out[[2]]$xx,out[[2]]$yy,col='red') + + newdf <- out[[1]]; newdf$PRED_CLASS <- "SURVIVEYES"; newdf$split <- k + newdf2 <- out[[2]]; newdf2$PRED_CLASS <- "SURVIVENO"; newdf2$split <- k + + plotDF[[k]] <- rbind(newdf,newdf2) +} + +# approach 1: pool all results and make a single KM curve +res <- do.call("rbind",megaDF) +res$SurvObj <- with(res, Surv(OS_OS,STATUS_INT==0)) +fit <- npsurv(SurvObj ~ PRED_CLASS, data=res,conf.type="log-log") + +idx <- which(hratio > 50) +if (any(idx)) hratio <- hratio[-idx] +hratio <- data.frame(group="tumour",hratio=hratio) +p <- ggplot(hratio,aes(group,y=hratio)) + geom_boxplot() + ylim(c(0,quantile(hratio$hratio,0.98))+3) +p <- p + ggtitle(sprintf("Cum hazard ratio (one per split)(N=%i)",nrow(hratio))) +p <- p + geom_hline(yintercept=1,lty=2) +p <- p + theme(axis.text=element_text(size=12), + axis.title=element_text(size=14,face="bold")) + +p1 <- survminer::ggsurvplot(fit,data=res,conf.int=TRUE) +p2 <- p + +# approach 2 : pool all step functions +res <- do.call("rbind",plotDF) +# compute ci +out <- list() +for (k in unique(res$PRED_CLASS)) { + res2 <- subset(res,PRED_CLASS==k) + ub <- c(); lb <- c(); xx <- unique(res2$xx); muy <- c() + for (x in xx) { + yy <- res2$yy[which(res2$xx == x)] + mu <- mean(yy); offset <- sd(yy)/sqrt(length(yy)) + lb <- c(lb, mu-offset) + ub <- c(ub, mu+offset) + muy <- c(muy, mu) + } + out[[k]] <- data.frame(x=xx,y=muy,lb=lb,ub=ub,PRED_CLASS=k) +} +blah <- do.call("rbind",out) +p3 <- ggplot(blah,(aes(x=x,y=y,colour=PRED_CLASS))) + geom_line() + geom_ribbon(aes(ymin=lb,ymax=ub),alpha=0.2) + ggtitle("Manual mean+CI of compiled KM") + ylab("% survival") +xlab("time (months)") + +pdf(sprintf("%s.pdf",tumourType)) +print(p1); print(p2); print(p3) +dev.off() From f30c12279d2bf09c9fd888bcacccffae5ea4d0bc Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 24 May 2018 16:31:41 -0400 Subject: [PATCH 109/124] now has option to prefilter with lasso --- netDx/R/runPredictor_nestedCV.R | 47 ++++++++++++++++++++++++++++-- netDx/man/runPredictor_nestedCV.Rd | 12 +++++++- 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/netDx/R/runPredictor_nestedCV.R b/netDx/R/runPredictor_nestedCV.R index 1feed14c..f18c5057 100644 --- a/netDx/R/runPredictor_nestedCV.R +++ b/netDx/R/runPredictor_nestedCV.R @@ -42,11 +42,17 @@ #' some designs, each split can result in using 1Gb of data. #' @param startAt (integer) which of the splits to start at (e.g. if the #' job aborted part-way through) +#' @param preFilter (logical) if TRUE uses lasso to prefilter dataList within +#' cross-validation loop. Only variables that pass lasso get included. The +#' current option is not recommended for pathway-level features as most genes +#' will be eliminated by lasso. Future variations may allow other prefiltering +#' options that are more lenient. #' @examples see examples/NestedCV_MultiData.Rmd for example use. +#' @import glmnet #' @export runPredictor_nestedCV <- function(pheno,dataList,groupList,outDir,makeNetFunc, nFoldCV=10L,trainProp=0.8,numSplits=10L,numCores,CVmemory=4L,CVcutoff=9L, - keepAllData=FALSE,startAt=1L) { + keepAllData=FALSE,startAt=1L, preFilter=FALSE) { ### tests# pheno$ID and $status must exist if (missing(dataList)) stop("dataList must be supplied.\n") @@ -111,6 +117,35 @@ for (rngNum in startAt:numSplits) { dats_train <- lapply(dataList,function(x) { x[,which(colnames(x) %in% pheno$ID)]}) + # prefilter with lasso + if (preFilter) { + set.seed(123) + cat("Prefiltering enabled\n") + for (nm in names(dats_train)) { + if (nrow(dats_train[[nm]])<2) # only has one var, take it. + vars <- rownames(dats_train[[nm]]) + else { + fit <- cv.glmnet(x=t(na.omit(dats_train[[nm]])), + y=factor(pheno$STATUS), family="binomial", alpha=1) # lasso + wt <- abs(coef(fit,s="lambda.min")[,1]) + vars <- setdiff(names(wt)[which(wt>.Machine$double.eps)],"(Intercept)") + } + if (length(vars)>0) { + tmp <- dats_train[[nm]] + tmp <- tmp[which(rownames(tmp) %in% vars),,drop=FALSE] + dats_train[[nm]] <- tmp + } else { + # leave dats_train as is, make a single net + } + cat(sprintf("rngNum %i: %s: %s pruned\n",rngNum,nm,length(vars))) + } + } + + cat("# datapoints to make training nets\n") + for (nm in names(dats_train)) { + cat(sprintf("rngNum %i: %s: %i measures\n", rngNum,nm,nrow(dats_train[[nm]]))) + } + netDir <- sprintf("%s/networks",outDir) createPSN_MultiData(dataList=dats_train,groupList=groupList, netDir=netDir,customFunc=makeNetFunc,numCores=numCores) @@ -156,7 +191,15 @@ for (rngNum in startAt:numSplits) { cat(sprintf("%s: %i networks\n",g,length(pTally))) netDir <- sprintf("%s/networks",pDir) - createPSN_MultiData(dataList=dataList,groupList=groupList, + dats_tmp <- list() + for (nm in names(dataList)) { + passed <- rownames(dats_train[[nm]]) + tmp <- dataList[[nm]] + # only variables passing prefiltering should be used to make PSN + dats_tmp[[nm]] <- tmp[which(rownames(tmp) %in% passed),] + } + + createPSN_MultiData(dataList=dats_tmp,groupList=groupList, netDir=sprintf("%s/networks",pDir), customFunc=makeNetFunc,numCores=numCores, filterSet=pTally) diff --git a/netDx/man/runPredictor_nestedCV.Rd b/netDx/man/runPredictor_nestedCV.Rd index 9da161a8..83806b03 100644 --- a/netDx/man/runPredictor_nestedCV.Rd +++ b/netDx/man/runPredictor_nestedCV.Rd @@ -6,7 +6,8 @@ \usage{ runPredictor_nestedCV(pheno, dataList, groupList, outDir, makeNetFunc, nFoldCV = 10L, trainProp = 0.8, numSplits = 10L, numCores, - CVmemory = 4L, CVcutoff = 9L, keepAllData = FALSE) + CVmemory = 4L, CVcutoff = 9L, keepAllData = FALSE, startAt = 1L, + preFilter = FALSE) } \arguments{ \item{pheno}{(data.frame) sample metadata, must have ID and STATUS columns} @@ -47,6 +48,15 @@ in a given split} \item{keepAllData}{(logical) if TRUE keeps all intermediate files, even those not needed for assessing the predictor. Use very cautiously as for some designs, each split can result in using 1Gb of data.} + +\item{startAt}{(integer) which of the splits to start at (e.g. if the +job aborted part-way through)} + +\item{preFilter}{(logical) if TRUE uses lasso to prefilter dataList within +cross-validation loop. Only variables that pass lasso get included. The +current option is not recommended for pathway-level features as most genes +will be eliminated by lasso. Future variations may allow other prefiltering +options that are more lenient.} } \description{ Run nested cross-validation on data From 3e1852528d4e86955aec9ffa6633775271b85f47 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 24 May 2018 16:32:13 -0400 Subject: [PATCH 110/124] sparsify_edgeMax var --- netDx/man/makePSN_NamedMatrix.Rd | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/netDx/man/makePSN_NamedMatrix.Rd b/netDx/man/makePSN_NamedMatrix.Rd index ce868e60..685d626f 100644 --- a/netDx/man/makePSN_NamedMatrix.Rd +++ b/netDx/man/makePSN_NamedMatrix.Rd @@ -5,8 +5,9 @@ \title{Create patient networks from full matrix of named measurements} \usage{ makePSN_NamedMatrix(xpr, nm, namedSets, outDir, simMetric = "pearson", - cutoff = 0.3, verbose = TRUE, numCores = 1L, writeProfiles = TRUE, - sparsify = FALSE, useSparsify2 = FALSE, append = FALSE, ...) + verbose = TRUE, numCores = 1L, writeProfiles = TRUE, sparsify = FALSE, + useSparsify2 = FALSE, cutoff = 0.3, sparsify_edgeMax = 1000, + append = FALSE, ...) } \arguments{ \item{xpr}{(matrix) rows are measurements, columns are samples. Columns @@ -27,9 +28,6 @@ that are input to network generation} for details. If writeProfiles is set to TRUE, must be one of pearson (Pearson correlation) or MI (correlation by mutual information).} -\item{cutoff}{(numeric) patients with similarity smaller than this value -are not included in the corresponding interaction network} - \item{verbose}{(logical) print detailed messages} \item{numCores}{(integer) number of cores for parallel network generation} @@ -45,6 +43,11 @@ with default parameters. Only used when writeProfiles=FALSE} \item{useSparsify2}{(logical). Currently for testing only. A cleaner sparsification routine.} +\item{cutoff}{(numeric) patients with similarity smaller than this value +are not included in the corresponding interaction network} + +\item{sparsify_edgeMax}{(numeric).} + \item{append}{(logical) if TRUE does not overwrite netDir.} \item{...}{passed to \code{getSimilarity()}} From 8772c82e0b18daaeca66692daa3bec7b909b0174 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 24 May 2018 16:32:20 -0400 Subject: [PATCH 111/124] NAMESPACE --- .../PanCancer_topX_pearscale_impute2.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename misc/PanCancer/pruneVersion/diff_kernels/pearscale/{PanCancer_topX_pearscale_impute.R => outdated/PanCancer_topX_pearscale_impute2.R} (100%) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_topX_pearscale_impute.R b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/outdated/PanCancer_topX_pearscale_impute2.R similarity index 100% rename from misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_topX_pearscale_impute.R rename to misc/PanCancer/pruneVersion/diff_kernels/pearscale/outdated/PanCancer_topX_pearscale_impute2.R From b5cfb073d5f41e387395aa73549e94769d418fac Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 24 May 2018 16:32:32 -0400 Subject: [PATCH 112/124] updated --- netDx/NAMESPACE | 2 ++ 1 file changed, 2 insertions(+) diff --git a/netDx/NAMESPACE b/netDx/NAMESPACE index a59aa805..66718a42 100644 --- a/netDx/NAMESPACE +++ b/netDx/NAMESPACE @@ -55,6 +55,7 @@ export(resampling_pickBestCutoff_CNV) export(resampling_predTest_CNV) export(runGeneMANIA) export(runPredictor_nestedCV) +export(runPredictor_nestedCV_univ) export(simpleCap) export(sparsify2) export(sparsifyNet) @@ -72,6 +73,7 @@ import(ROCR) import(bigmemory) import(foreach) import(ggplot2) +import(glmnet) import(httr) import(parallel) import(pracma) From 46ce0cfb6975dd79b92a6dcf1aa2a1f52c8dc730 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Thu, 24 May 2018 16:33:09 -0400 Subject: [PATCH 113/124] option to includeAllNodes --- netDx/man/sparsify2.Rd | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/netDx/man/sparsify2.Rd b/netDx/man/sparsify2.Rd index eee93753..4173bb6a 100644 --- a/netDx/man/sparsify2.Rd +++ b/netDx/man/sparsify2.Rd @@ -5,7 +5,7 @@ \title{cleaner sparsification routine} \usage{ sparsify2(W, outFile = "tmp.txt", cutoff = 0.3, maxInt = 50, - EDGE_MAX = 1000) + EDGE_MAX = 1000, includeAllNodes = TRUE, verbose = TRUE) } \arguments{ \item{W}{(matrix) similarity matrix} @@ -17,6 +17,8 @@ sparsify2(W, outFile = "tmp.txt", cutoff = 0.3, maxInt = 50, \item{maxInt}{(numeric) max num edges per node.} \item{EDGE_MAX}{(numeric) max num edges in network} + +\item{includeAllNodes}{(logical) if TRUE, ensures at least one edge is present for each patient. This feature is required when sparsification excludes test patients that are required to be classified. If the sparsification rules exclude all edges for a patient and this flag is set, then the strongest edge for each missing patient is added to the net. Note that this condition results in the total number of edges potentially exceeding EDGE_MAX} } \value{ writes SIF content to text file (node1,node2,edge weight) From bfd08bb64f4b1283f7237a9ae5e3b54bc00f58d1 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Wed, 30 May 2018 10:27:52 -0400 Subject: [PATCH 114/124] tests after sparse bug fix --- misc/PanCancer/noPrune/GBM_noPrune_pipeline.R | 2 +- misc/PanCancer/noPrune/KIRC_noPrune_pipeline.R | 8 ++++---- misc/PanCancer/noPrune/LUSC_noPrune_pipeline.R | 9 ++++++--- misc/PanCancer/noPrune/PanCancer_noPrune.R | 6 +++--- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/misc/PanCancer/noPrune/GBM_noPrune_pipeline.R b/misc/PanCancer/noPrune/GBM_noPrune_pipeline.R index 9079ac26..31b03e53 100644 --- a/misc/PanCancer/noPrune/GBM_noPrune_pipeline.R +++ b/misc/PanCancer/noPrune/GBM_noPrune_pipeline.R @@ -113,6 +113,6 @@ rm(rootDir,survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles) source("PanCancer_noPrune_impute.R") runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, dats=dats,pheno_all=pheno_all,megaDir=megaDir, - cutoffSet=9,maxEdge=6000,spCutoff=0.3) + cutoffSet=9,maxEdge=3000,spCutoff=0.3) diff --git a/misc/PanCancer/noPrune/KIRC_noPrune_pipeline.R b/misc/PanCancer/noPrune/KIRC_noPrune_pipeline.R index a5314bba..4ff92b36 100644 --- a/misc/PanCancer/noPrune/KIRC_noPrune_pipeline.R +++ b/misc/PanCancer/noPrune/KIRC_noPrune_pipeline.R @@ -7,7 +7,6 @@ inDir <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/input" outRoot <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/noprune_sp0.3_%s",outRoot,dt) # ----------------------------------------------------------- @@ -109,13 +108,14 @@ combList <- list( all="all") rm(pheno,pheno_nosurv) -rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) +rm(survStr,surv,tmp,nm,inDir,k,inFiles,datFiles,pname) # ----------------------------------------------------------- # run predictor source("PanCancer_noPrune.R") -runPredictor(mega_combList=combList,rngVals=9:20,netSets=netSets, +megaDir <- sprintf("%s/noprune_sp0.3_maxInt30_%s",outRoot,dt) +runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, dats=dats,pheno_all=pheno_all,megaDir=megaDir, - cutoffSet=9,maxEdge=6000,spCutoff=0.3) + cutoffSet=9,maxEdge=6000,spCutoff=0.3,maxInt=30) diff --git a/misc/PanCancer/noPrune/LUSC_noPrune_pipeline.R b/misc/PanCancer/noPrune/LUSC_noPrune_pipeline.R index bb3b4715..d54fccf1 100644 --- a/misc/PanCancer/noPrune/LUSC_noPrune_pipeline.R +++ b/misc/PanCancer/noPrune/LUSC_noPrune_pipeline.R @@ -8,7 +8,6 @@ inDir <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/input/" outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/noPrune_sp0.3_%s",outRoot,dt) # ----------------------------------------------------------- # process input @@ -109,11 +108,15 @@ combList <- list( cat(sprintf("Clinical variables are: { %s }\n", paste(rownames(dats$clinical),sep=",",collapse=","))) rm(pheno) -rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) +rm(survStr,surv,tmp,nm,inDir,k,inFiles,datFiles,pname) # ----------------------------------------------------------- # run predictor source("PanCancer_noPrune.R") +maxEdge <- 6000 +maxInt <- 40 +megaDir <- sprintf("%s/noPrune_sp0.3_maxEdge%i_maxInt%i_%s", + outRoot,maxEdge/1000,maxInt,dt) runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, dats=dats,pheno_all=pheno_all,megaDir=megaDir, - cutoffSet=9,maxEdge=6000,spCutoff=0.3) + cutoffSet=9,maxEdge=maxEdge,maxInt=maxInt,spCutoff=0.3) diff --git a/misc/PanCancer/noPrune/PanCancer_noPrune.R b/misc/PanCancer/noPrune/PanCancer_noPrune.R index 1c114126..f514a896 100644 --- a/misc/PanCancer/noPrune/PanCancer_noPrune.R +++ b/misc/PanCancer/noPrune/PanCancer_noPrune.R @@ -36,7 +36,7 @@ normDiff2 <- function(x) { # ---------------------------------------------------------------- runPredictor <- function(mega_combList,rngVals,netSets,dats,pheno_all,megaDir, - cutoffSet,maxEdge,spCutoff) { + cutoffSet,spCutoff,maxInt=50L,maxEdge=6000) { require(netDx) require(netDx.examples) @@ -88,7 +88,7 @@ for (rngNum in rngVals) { simMetric="custom",customFunc=normDiff2, writeProfiles=FALSE, sparsify=TRUE,useSparsify2=TRUE,cutoff=spCutoff, - sparsify_edgeMax=maxEdge, + sparsify_edgeMax=maxEdge,sparsify_maxInt=maxInt, verbose=FALSE,numCores=numCores) } if (any(pearnet)) { @@ -163,7 +163,7 @@ for (rngNum in rngVals) { simMetric="custom",customFunc=normDiff2, writeProfiles=FALSE, sparsify=TRUE,useSparsify2=TRUE,cutoff=spCutoff, - sparsify_edgeMax=maxEdge, + sparsify_edgeMax=maxEdge,sparsify_maxInt=maxInt, verbose=FALSE,numCores=numCores) } if (any(pearnet)) { From 08109c364bc8a9ff7ce040b96005a38d605fd12b Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Wed, 30 May 2018 10:28:11 -0400 Subject: [PATCH 115/124] after sparse bug fix --- misc/PanCancer/multiCutoff/GBM_getRes.R | 15 ++++++++------- misc/PanCancer/multiCutoff/KIRC_getRes.R | 10 ++++++++-- misc/PanCancer/multiCutoff/LUSC_getRes.R | 15 ++++++++++----- misc/PanCancer/multiCutoff/OV_getRes.R | 8 +++++--- 4 files changed, 31 insertions(+), 17 deletions(-) diff --git a/misc/PanCancer/multiCutoff/GBM_getRes.R b/misc/PanCancer/multiCutoff/GBM_getRes.R index 7a141476..c39f030d 100644 --- a/misc/PanCancer/multiCutoff/GBM_getRes.R +++ b/misc/PanCancer/multiCutoff/GBM_getRes.R @@ -7,15 +7,19 @@ GBM_getRes <- function() { mainD <- "/home/shraddhapai/BaderLab/2017_PanCancer/GBM/output" dirSet <- list( # base="noPrune_180423", - baserep="noprune_impute_sp0.3_180511", +# baserep="noprune_impute_sp0.3_180511", + baserep_sparser="noprune_impute_sp0.3_180528", # ridge_fix="ridge_AbsFix_180426", # lassoGenes_sp1="lassoGenes_incClin_180426", # pamrGenes="pamrGenes_incClin_180427", # euc_6K="eucclean_180503", - eucimpute="eucclean_impute_180507", +# eucimpute="eucclean_impute_180507", +# euci_new="eucclean_impute_180525", + euci_sparser="eucclean_impute_maxEdge3000_top50_180529", #rbfclean="rbfclean_0.20_180507", # pearscale="pearscale_180507", - pearimpute="pearscale_impute_180508" +# pearimpute="pearscale_impute_180508", + pearimp_new="pearscale_impute_180525" # pimp_40c2="pearimp_topX40_topClin2_180509", # pimp_40c3="pearimp_topX40_topClin3_180509", # pimp_100c3="pearimp_topX100_topClin3_180509", @@ -65,7 +69,7 @@ cutoff <-9 cat(sprintf("Got %i rng files\n",length(rngDir))) rngDir <- sprintf("%s/%s",dataDir,rngDir) - if (curdir %in% "baserep") { + if (curdir %in% c("baserep","baserep_new","baserep_sparser")) { c7 <- sprintf("%s/%s/predictionResults.txt", rngDir,settype) } else { @@ -94,9 +98,6 @@ cutoff <-9 auc_set[[settype]] <- y1 ctr <- ctr+1 } -if (curdir %in% "baserep") { - browser() -} mega_auc[[curdir]] <- unlist(lapply(auc_set,mean)) } diff --git a/misc/PanCancer/multiCutoff/KIRC_getRes.R b/misc/PanCancer/multiCutoff/KIRC_getRes.R index 8494d596..f0a19154 100644 --- a/misc/PanCancer/multiCutoff/KIRC_getRes.R +++ b/misc/PanCancer/multiCutoff/KIRC_getRes.R @@ -9,7 +9,9 @@ mainD <- "/home/shraddhapai/BaderLab/PanCancer_KIRC/output" dirSet <- list( # base="noPrune_180423", - baserep="noprune_sp0.3_180511" +# baserep="noprune_sp0.3_180511", + #baserep_new="noprune_sp0.3_180528", + baserep_sparser="noprune_sp0.3_maxInt30_180528" # base_splow="noprune_180510" # lasso="lasso_180426", # lassocl="lassoclean_180509", @@ -34,7 +36,12 @@ dataDir <- sprintf("%s/%s",mainD,dirSet[[curdir]]) auc_set <- list() auc_var <- list() for (settype in settypes) { + + if (curdir %in% "baserep_new") { + rngDir <- paste(sprintf("%s/rng",dataDir), 9:rngMax,sep="") + } else { rngDir <- paste(sprintf("%s/rng",dataDir), 1:rngMax,sep="") + } auc_var[[settype]] <- c() @@ -70,7 +77,6 @@ colctr <- 1 pdf(sprintf("KIRC_%s.pdf",format(Sys.Date(),"%y%m%d")),width=11,height=6); boxplot(mega_auc,main="KIRC",cex.axis=1.7,cex.main=2,las=1); dev.off() -browser() return(mega_auc) } diff --git a/misc/PanCancer/multiCutoff/LUSC_getRes.R b/misc/PanCancer/multiCutoff/LUSC_getRes.R index 53dd1d47..bafa8411 100644 --- a/misc/PanCancer/multiCutoff/LUSC_getRes.R +++ b/misc/PanCancer/multiCutoff/LUSC_getRes.R @@ -6,7 +6,11 @@ LUSC_getRes <- function() { mainD <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output" dirSet <- list( # base="noPrune_180423", - baserep="noPrune_sp0.3_180511", +# baserep="noPrune_sp0.3_180511", +# baserep_new="noPrune_sp0.3_180527", + baserep_new2="noPrune_sp0.3_maxEdge6_maxInt40_180529", +# plasso_303k="pearscale_lasso_topClin1_max30_3K_180528", +# plasso_306k="pearscale_lasso_topClin1_max30_180528", # lasso="lasso_180426", # lassoGenes="lassoGenes_180426", # pamrGenes_sp2="pamrGenes_180427", @@ -15,7 +19,10 @@ dirSet <- list( #euc6K="eucclean_180504", # eucimpute="eucscale_impute_180507", # pearscale="pearscale_180507", - plassoc1="pearscale_lasso_topClin1_180509" + +#plassoc1="pearscale_lasso_topClin1_180509", +# plassoc1_new="pearscale_lasso_topClin1_180528", + lassoc1_new="pearscale_lasso_topClin1_max40_6K_180529" ) settypes <- c("clinical","mir","rna","prot","cnv", "clinicalArna","clinicalAmir","clinicalAprot","clinicalAcnv","all") @@ -42,7 +49,7 @@ for (settype in settypes) { for (cutoff in 9) { - if (curdir %in% "baserep") { + if (curdir %in% c("baserep","baserep_new","baserep_new2")) { c7 <- sprintf("%s/%s/predictionResults.txt", rngDir,settype) } else { @@ -81,7 +88,5 @@ boxplot(mega_auc,main="LUSC",cex.axis=1.7,cex.main=2,las=1); abline(h=0.5) dev.off() -browser() - return(mega_auc) } diff --git a/misc/PanCancer/multiCutoff/OV_getRes.R b/misc/PanCancer/multiCutoff/OV_getRes.R index 200d551e..8c8b1100 100644 --- a/misc/PanCancer/multiCutoff/OV_getRes.R +++ b/misc/PanCancer/multiCutoff/OV_getRes.R @@ -11,11 +11,13 @@ settypes <- c("clinical","mir","rna","prot","cnv","dnam", "clinicalAcnv","all") dirSet <- list( # base="noPrune_180423", - baserep="noprune_sp0.3_180511", + #baserep="noprune_sp0.3_180511", + baserep="noprune_sp0.3_180527", baserep1="noprune_sp1_180512", # prune="pruneTrain_180419", # lasso="lasso_180426", - euc6K="eucscale_180504" +# euc6K="eucscale_180504", + euc_correct="eucscale_180528" # rbfclean="rbfclean_0.2_180507" ) @@ -39,7 +41,7 @@ for (curdir in names(dirSet)) { cat(sprintf("Got %i rng files\n",length(rngDir))) cutoff <- 9 - if (curdir %in% c("euc6K","rbfclean")) { + if (curdir %in% c("euc6K","rbfclean","euc_correct")) { c7 <- sprintf("%s/%s/cutoff9/predictionResults.txt", rngDir,settype,cutoff) } else { From e2aeab4e2c84829b6198ba6f4e069ee830b5b006 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Wed, 30 May 2018 10:28:29 -0400 Subject: [PATCH 116/124] after sparse bug fix --- .../diff_kernels/eucscale/GBM_eucscale_impute.R | 11 ++++++++--- .../diff_kernels/eucscale/PanCancer_eucscale_impute.R | 10 ++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/eucscale/GBM_eucscale_impute.R b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/GBM_eucscale_impute.R index 44a495af..04e11049 100644 --- a/misc/PanCancer/pruneVersion/diff_kernels/eucscale/GBM_eucscale_impute.R +++ b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/GBM_eucscale_impute.R @@ -11,8 +11,6 @@ inDir <- sprintf("%s/input",rootDir) outRoot <- sprintf("%s/output",rootDir) dt <- format(Sys.Date(),"%y%m%d") -megaDir <- sprintf("%s/eucclean_impute_%s",outRoot,dt) -cat(megaDir, file="test.txt",append=TRUE) # ----------------------------------------------------------- # process input @@ -107,13 +105,20 @@ pheno_all <- pheno # cleanup rm(pheno,pheno_nosurv) -rm(rootDir,survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles) +rm(rootDir,survStr,surv,tmp,nm,inDir,k,inFiles,datFiles) # ----------------------------------------------------------- # run predictor source("PanCancer_eucscale_impute.R") +maxInt <- 50 +maxEdge <- 3000 +megaDir <- sprintf("%s/eucclean_impute_maxEdge%i_top%i_%s", + outRoot,maxEdge,maxInt,dt) +cat(megaDir, file="test.txt",append=TRUE) + runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, dats=dats,pheno_all=pheno_all,megaDir=megaDir, + maxEdge=maxEdge,maxInt=maxInt, cutoffSet=9) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/eucscale/PanCancer_eucscale_impute.R b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/PanCancer_eucscale_impute.R index 225138f9..83413345 100644 --- a/misc/PanCancer/pruneVersion/diff_kernels/eucscale/PanCancer_eucscale_impute.R +++ b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/PanCancer_eucscale_impute.R @@ -54,7 +54,7 @@ normalize <- function(X) { # ---------------------------------------------------------------- runPredictor <- function(mega_combList,rngVals,netSets,dats,pheno_all,megaDir, - cutoffSet) { + cutoffSet,maxInt=50,maxEdge=6000) { require(netDx) require(netDx.examples) require(glmnet) @@ -62,7 +62,6 @@ require(glmnet) numCores <- 8L GMmemory <- 4L trainProp <- 0.8 -maxEdge <- 6000 ### max edge after sparsification if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) dir.create(megaDir) @@ -70,6 +69,9 @@ dir.create(megaDir) logFile <- sprintf("%s/log.txt",megaDir) sink(logFile,split=TRUE) tryCatch({ +cat("Sparsify:\n") +cat(sprintf("maxEdge = %i\n", maxEdge)) +cat(sprintf("maxInt = %i\n",maxInt)) alldat <- do.call("rbind",dats) @@ -151,7 +153,7 @@ for (rngNum in rngVals) { simMetric="custom",customFunc=sim.eucscale, writeProfiles=FALSE, sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, - sparsify_edgeMax=maxEdge, + sparsify_edgeMax=maxEdge,sparsify_maxInt=maxInt, verbose=FALSE,numCores=numCores) cat(sprintf("Total of %i nets\n", length(netList))) @@ -230,7 +232,7 @@ for (rngNum in rngVals) { simMetric="custom",customFunc=sim.eucscale, writeProfiles=FALSE, sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, - sparsify_edgeMax=maxEdge, + sparsify_edgeMax=maxEdge,sparsify_maxInt=maxInt, verbose=TRUE,numCores=numCores) cat(sprintf("Total of %i nets\n", length(netList))) # now create database From 89d64bd60f7e903bbd466903f8b1f8a9befe3e2a Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Wed, 30 May 2018 10:28:34 -0400 Subject: [PATCH 117/124] after sparse bug fix --- .../pruneVersion/diff_kernels/eucscale/OV_eucscale_pipeline.R | 1 - 1 file changed, 1 deletion(-) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/eucscale/OV_eucscale_pipeline.R b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/OV_eucscale_pipeline.R index 2f989f79..9a3e1c1b 100644 --- a/misc/PanCancer/pruneVersion/diff_kernels/eucscale/OV_eucscale_pipeline.R +++ b/misc/PanCancer/pruneVersion/diff_kernels/eucscale/OV_eucscale_pipeline.R @@ -101,7 +101,6 @@ combList <- list( rm(pheno,pheno_nosurv) rm(survStr,surv,tmp,nm,outRoot,inDir,dt,k,inFiles,datFiles,pname) -browser() # ----------------------------------------------------------- # run predictor source("PanCancer_eucscale.R") From b52dcda22694a32392aad78f8a7106c0e1b78fc2 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Wed, 30 May 2018 10:31:42 -0400 Subject: [PATCH 118/124] sparsification bug fix --- netDx/DESCRIPTION | 2 +- netDx/NAMESPACE | 1 - netDx/R/makePSN_NamedMatrix.R | 3 ++- netDx/R/sparsify2.R | 5 +++-- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/netDx/DESCRIPTION b/netDx/DESCRIPTION index 0e302543..d2b2a4b1 100644 --- a/netDx/DESCRIPTION +++ b/netDx/DESCRIPTION @@ -1,6 +1,6 @@ Package: netDx Title: Learns Patient Binary Classification based on Similarity Networks -Version: 1.0.22 +Version: 1.0.23 Authors@R: c(person("Shraddha", "Pai", email = "shraddha.pai@utoronto.ca", role = c("aut", "cre")), person("Ahmad","Shah", role="aut"), person("Shirley","Hui",role="aut"), diff --git a/netDx/NAMESPACE b/netDx/NAMESPACE index 66718a42..17bc6a2b 100644 --- a/netDx/NAMESPACE +++ b/netDx/NAMESPACE @@ -55,7 +55,6 @@ export(resampling_pickBestCutoff_CNV) export(resampling_predTest_CNV) export(runGeneMANIA) export(runPredictor_nestedCV) -export(runPredictor_nestedCV_univ) export(simpleCap) export(sparsify2) export(sparsifyNet) diff --git a/netDx/R/makePSN_NamedMatrix.R b/netDx/R/makePSN_NamedMatrix.R index 83941026..46138eec 100644 --- a/netDx/R/makePSN_NamedMatrix.R +++ b/netDx/R/makePSN_NamedMatrix.R @@ -60,6 +60,7 @@ makePSN_NamedMatrix <- function(xpr, nm, namedSets, outDir, simMetric="pearson",verbose=TRUE, numCores=1L,writeProfiles=TRUE, sparsify=FALSE,useSparsify2=FALSE,cutoff=0.3,sparsify_edgeMax=1000, + sparsify_maxInt=50, append=FALSE,...){ if (!append) { if (file.exists(outDir)) unlink(outDir,recursive=TRUE) @@ -132,7 +133,7 @@ makePSN_NamedMatrix <- function(xpr, nm, namedSets, outDir, tryCatch({ spmat <- sparsify2(pat_pairs,cutoff=cutoff, EDGE_MAX=sparsify_edgeMax, - outFile) + outFile=outFile,maxInt=sparsify_maxInt) },error=function(ex) { cat("sparse caught error\n"); browser() }) diff --git a/netDx/R/sparsify2.R b/netDx/R/sparsify2.R index 3afb1f08..cff7309c 100644 --- a/netDx/R/sparsify2.R +++ b/netDx/R/sparsify2.R @@ -19,7 +19,8 @@ sparsify2 <- function(W, outFile="tmp.txt",cutoff=0.3,maxInt=50,EDGE_MAX=1000, if (maxInt > ncol(W)) maxInt <- ncol(W) - diag(W) <- 0; + # don't want same patient edge twice, nor self-similarity + W[upper.tri(W,diag=TRUE)] <- NA W[W < cutoff] <- NA x <- list() for (i in 1:nrow(W)) { x[[i]] <- sort(W[i,],decreasing=TRUE,na.last=TRUE)} @@ -31,7 +32,7 @@ sparsify2 <- function(W, outFile="tmp.txt",cutoff=0.3,maxInt=50,EDGE_MAX=1000, } mmat <- na.omit(melt(W)) mmat <- mmat[order(mmat[,3],decreasing=TRUE),] - + maxEdge <- nrow(mmat) if (maxEdge>EDGE_MAX) maxEdge <- EDGE_MAX mmat <- mmat[1:maxEdge,] From e6fe9b5e5bbc9c674f120aa06073d5e12f06b430 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Wed, 30 May 2018 10:32:02 -0400 Subject: [PATCH 119/124] post-sparse bug fix --- .../diff_kernels/pearscale/LUSC_top_pearscale.R | 7 +++++-- .../pearscale/PanCancer_topClin_pearscale_impute.R | 7 +++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/pearscale/LUSC_top_pearscale.R b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/LUSC_top_pearscale.R index fc1be1ae..a1bec99a 100644 --- a/misc/PanCancer/pruneVersion/diff_kernels/pearscale/LUSC_top_pearscale.R +++ b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/LUSC_top_pearscale.R @@ -113,7 +113,10 @@ source("PanCancer_topClin_pearscale_impute.R") outRoot <- "/home/shraddhapai/BaderLab/2017_PanCancer/LUSC/output/" dt <- format(Sys.Date(),"%y%m%d") topClin<-1 -megaDir <- sprintf("%s/pearscale_lasso_topClin%i_%s",outRoot,topClin,dt) +maxEdge <- 6000 +maxInt <- 40 +megaDir <- sprintf("%s/pearscale_lasso_topClin%i_max%i_%iK_%s", + outRoot,topClin,maxInt,maxEdge/1000,dt) runPredictor(mega_combList=combList,rngVals=1:20,netSets=netSets, dats=dats,pheno_all=pheno_all,megaDir=megaDir, - cutoffSet=9) + cutoffSet=9,maxEdge=maxEdge, maxInt=maxInt) diff --git a/misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_topClin_pearscale_impute.R b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_topClin_pearscale_impute.R index e50b93b8..097c9d5d 100644 --- a/misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_topClin_pearscale_impute.R +++ b/misc/PanCancer/pruneVersion/diff_kernels/pearscale/PanCancer_topClin_pearscale_impute.R @@ -55,7 +55,7 @@ normalize <- function(X) { # ---------------------------------------------------------------- runPredictor <- function(mega_combList,rngVals,netSets,dats,pheno_all,megaDir, - cutoffSet) { + cutoffSet,maxEdge=6000,maxInt=30) { require(netDx) require(netDx.examples) require(glmnet) @@ -63,7 +63,6 @@ require(glmnet) numCores <- 8L GMmemory <- 4L trainProp <- 0.8 -maxEdge <- 6000 ### max edge after sparsification if (file.exists(megaDir)) unlink(megaDir,recursive=TRUE) dir.create(megaDir) @@ -157,7 +156,7 @@ for (rngNum in rngVals) { simMetric="custom",customFunc=sim.pearscale, writeProfiles=FALSE, sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, - sparsify_edgeMax=maxEdge, + sparsify_edgeMax=maxEdge,sparsify_maxInt=maxInt, verbose=FALSE,numCores=numCores) cat(sprintf("Total of %i nets\n", length(netList))) @@ -235,7 +234,7 @@ for (rngNum in rngVals) { simMetric="custom",customFunc=sim.pearscale, writeProfiles=FALSE, sparsify=TRUE,useSparsify2=TRUE,cutoff=.Machine$double.eps, - sparsify_edgeMax=maxEdge, + sparsify_edgeMax=maxEdge,sparsify_maxInt=maxInt, verbose=TRUE,numCores=numCores) cat(sprintf("Total of %i nets\n", length(netList))) # now create database From 0443e0a98d934fbc76f519bdc9a0dce13a84c196 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Wed, 20 Jun 2018 16:55:57 -0400 Subject: [PATCH 120/124] sparsify1 decommissioned; sparsify2/faster sparsify3 new options --- netDx/NAMESPACE | 1 + netDx/R/makePSN_NamedMatrix.R | 59 ++++++++++++---------- netDx/R/sparsify3.R | 84 ++++++++++++++++++++++++++++++++ netDx/man/makePSN_NamedMatrix.Rd | 9 ++-- netDx/man/sparsify3.Rd | 34 +++++++++++++ 5 files changed, 154 insertions(+), 33 deletions(-) create mode 100644 netDx/R/sparsify3.R create mode 100644 netDx/man/sparsify3.Rd diff --git a/netDx/NAMESPACE b/netDx/NAMESPACE index 17bc6a2b..7d5fd7eb 100644 --- a/netDx/NAMESPACE +++ b/netDx/NAMESPACE @@ -57,6 +57,7 @@ export(runGeneMANIA) export(runPredictor_nestedCV) export(simpleCap) export(sparsify2) +export(sparsify3) export(sparsifyNet) export(splitTestTrain) export(splitTestTrain_partition) diff --git a/netDx/R/makePSN_NamedMatrix.R b/netDx/R/makePSN_NamedMatrix.R index 46138eec..60dcbfbb 100644 --- a/netDx/R/makePSN_NamedMatrix.R +++ b/netDx/R/makePSN_NamedMatrix.R @@ -40,10 +40,8 @@ #' create interaction networks. If TRUE, this function writes subsets #' of the original data corresponding to networks to file (profiles). #' If FALSE, uses getSimilarity() and writes interaction networks. -#' @param sparsify (logical) sparsify networks by calling sparsifyNets() -#' with default parameters. Only used when writeProfiles=FALSE -#' @param useSparsify2 (logical). Currently for testing only. A cleaner -#' sparsification routine. +#' @param useSparsify2 (logical). Cleaner sparsification +#' sparsification routine. If FALSE, uses new matrix-based sparsify3 #' @param sparsify_edgeMax (numeric). #' @param append (logical) if TRUE does not overwrite netDir. #' @param ... passed to \code{getSimilarity()} @@ -77,7 +75,7 @@ makePSN_NamedMatrix <- function(xpr, nm, namedSets, outDir, stop("writeProfiles must only be TRUE with simMetric set to pearson or MI. For all other metrics, set writeProfiles=FALSE") } - if (!sparsify & useSparsify2) { stop("if useSparsify=TRUE then sparsify must also be set to TRUE\n")} + #if (!sparsify & useSparsify2) { stop("if useSparsify=TRUE then sparsify must also be set to TRUE\n")} cl <- makeCluster(numCores,outfile=sprintf("%s/makePSN_log.txt",outDir)) registerDoParallel(cl) @@ -107,26 +105,26 @@ makePSN_NamedMatrix <- function(xpr, nm, namedSets, outDir, cat(sprintf("%s: sim is null\n",curSet)) browser() } - if (!useSparsify2) {# prepare for internal sparsifier - idx <- which(upper.tri(sim,diag=F)) - ij <- matrix_getIJ(dim(sim),idx) - - # make interaction network - pat_pairs <- data.frame(p1=rownames(sim)[ij[,1]], - p2=colnames(sim)[ij[,2]], - similarity=sim[idx]) - - too_weak <- which(pat_pairs[,3] < cutoff | - is.na(pat_pairs[,3])) - if (any(too_weak)) { - if (verbose) - cat(sprintf("\t%i weak connections\n", - length(too_weak))) - pat_pairs <- pat_pairs[-too_weak,] - } - } else { # stick to sim matrix +### if (!useSparsify2) {# prepare for internal sparsifier +### idx <- which(upper.tri(sim,diag=F)) +### ij <- matrix_getIJ(dim(sim),idx) +### +### # make interaction network +### pat_pairs <- data.frame(p1=rownames(sim)[ij[,1]], +### p2=colnames(sim)[ij[,2]], +### similarity=sim[idx]) +### +### too_weak <- which(pat_pairs[,3] < cutoff | +### is.na(pat_pairs[,3])) +### if (any(too_weak)) { +### if (verbose) +### cat(sprintf("\t%i weak connections\n", +### length(too_weak))) +### pat_pairs <- pat_pairs[-too_weak,] +### } +### } else { # stick to sim matrix pat_pairs <- sim - } +### } if (sparsify) { if (useSparsify2) { @@ -138,9 +136,16 @@ makePSN_NamedMatrix <- function(xpr, nm, namedSets, outDir, cat("sparse caught error\n"); browser() }) } else { - cat("using original sparsifier method\n") - sparsifyNet(pat_pairs,outFile,numPatients=nrow(sim), - verbose=FALSE) + cat("sparsify3\n") + tryCatch({ + sp_t0 <- Sys.time() + spmat <- sparsify3(pat_pairs,cutoff=cutoff, + EDGE_MAX=sparsify_edgeMax, + outFile=outFile,maxInt=sparsify_maxInt) + print(Sys.time()-sp_t0) + },error=function(ex) { + cat("sparse caught error\n"); browser() + }) } } else { write.table(pat_pairs, file=outFile,sep="\t", diff --git a/netDx/R/sparsify3.R b/netDx/R/sparsify3.R new file mode 100644 index 00000000..378582af --- /dev/null +++ b/netDx/R/sparsify3.R @@ -0,0 +1,84 @@ +#' cleaner sparsification routine - faster, matrix-based version +#' +#' @details Sparsifies similarity matrix to keep strongest edges. +#' Sets diagonal and edges < cutoff to NA. Keeps strongest maxInt edges +#' per node. Ties are ignored. Keeps a max of EDGE_MAX edges in the network. +#' @param W (matrix) similarity matrix +#' @param outFile (char) path to file to write sparsified network +#' @param cutoff (numeric) edges with weight smaller than this are set to NA +#' @param maxInt (numeric) max num edges per node. +#' @param EDGE_MAX (numeric) max num edges in network +#' @param includeAllNodes (logical) if TRUE, ensures at least one edge is present for each patient. This feature is required when sparsification excludes test patients that are required to be classified. If the sparsification rules exclude all edges for a patient and this flag is set, then the strongest edge for each missing patient is added to the net. Note that this condition results in the total number of edges potentially exceeding EDGE_MAX +#' @return writes SIF content to text file (node1,node2,edge weight) +#' @import reshape2 +#' @export +sparsify3 <- function(W, outFile="tmp.txt",cutoff=0.3,maxInt=50,EDGE_MAX=1000, + includeAllNodes=TRUE,verbose=TRUE,numCores=4L) { + + if (verbose) cat(sprintf("sparsify2:maxInt=%i;EDGE_MAX=%i;cutoff=%1.2e;includeAllNodes=%s",maxInt,EDGE_MAX,cutoff,includeAllNodes)) + + if (maxInt > ncol(W)) maxInt <- ncol(W) + + if (class(W)!="matrix") W <- as.matrix(W) + diag(W) <- NA + mytop <- cbind(colnames(W),colnames(W)[apply(W,1,which.max)], + apply(W,1,max,na.rm=TRUE)) + # don't want same patient edge twice, nor self-similarity + W[upper.tri(W,diag=TRUE)] <- NA + W[W < cutoff] <- NA + maxind <- min(ncol(W),maxInt) + + # effectively empty out the slots that are not the top interactions + # create a "switch off" matrix with NA in non-top edges + W_order <- t(apply(W,1,order,decreasing=TRUE,na.last=TRUE)) + W_order[which(W_order > 50)] <- NA + W_order[which(W_order <= 50)] <- .Machine$double.eps + W2 <- W + W_order # NA for non-top edges, unchanged for top edges + mmat <- na.omit(melt(W2,varnames=names(dimnames(W2)))) + + maxEdge <- nrow(mmat) + if (maxEdge>EDGE_MAX) maxEdge <- EDGE_MAX + mmat <- mmat[1:maxEdge,] + + # we should guarantee an edge from all patients- in this case + # the edge_max would be violated unless we come up with a better rule + if (includeAllNodes) { + mmat[,1] <- as.character(mmat[,1]) + mmat[,2] <- as.character(mmat[,2]) + univ <- c(mmat[,1],mmat[,2]) + missing <- setdiff(rownames(W), univ) + #cat(sprintf("missing = { %s }\n",paste(missing, collapse=","))) + if (length(missing)>0) { + cat(sprintf("Sparsify2: found %i missing patients; adding strongest edge\n", + length(missing))) + for (k in missing) { # add the strongest edge for the patient + tmp <- mytop[which(mytop[,1]%in% k),] + x <- as.numeric(tmp[3]) + if (x < cutoff) { + cat("\tMissing edge is below cutoff; setting to cutoff\n") + x <- cutoff + } + mmat <- rbind(mmat, c(k, tmp[2],x)) + } + } + } + head(mmat) + mmat <- na.omit(mmat) # boundary case where cutoff exceeds net max + mmat[,3] <- as.numeric(mmat[,3]) + mmat[,3] <- round(mmat[,3],digits=4) + write.table(mmat,file=outFile,sep="\t",col=F,row=F,quote=F) + return(mmat) + +### the code below converts the SIF format back to a matrix,potentially +### for debugging. +### W2 <- dcast(mmat,Var2~Var1,value.var="value") +### rownames(W2) <- W2[,1]; W2 <- W2[,-1] +### W2 <- W2[,colnames(W)] +### W2 <- W2[colnames(W),] +### n <- ncol(W); +### sp <- nrow(mmat)/(n*(n-1))/2 +### cat(sprintf("%i -> %i edges (%i%% sparsity)\n", +### sum(!is.na(W)), nrow(mmat), round(sp*100))) +### return(W2); +} + diff --git a/netDx/man/makePSN_NamedMatrix.Rd b/netDx/man/makePSN_NamedMatrix.Rd index 685d626f..9e6a93bd 100644 --- a/netDx/man/makePSN_NamedMatrix.Rd +++ b/netDx/man/makePSN_NamedMatrix.Rd @@ -7,7 +7,7 @@ makePSN_NamedMatrix(xpr, nm, namedSets, outDir, simMetric = "pearson", verbose = TRUE, numCores = 1L, writeProfiles = TRUE, sparsify = FALSE, useSparsify2 = FALSE, cutoff = 0.3, sparsify_edgeMax = 1000, - append = FALSE, ...) + sparsify_maxInt = 50, append = FALSE, ...) } \arguments{ \item{xpr}{(matrix) rows are measurements, columns are samples. Columns @@ -37,11 +37,8 @@ create interaction networks. If TRUE, this function writes subsets of the original data corresponding to networks to file (profiles). If FALSE, uses getSimilarity() and writes interaction networks.} -\item{sparsify}{(logical) sparsify networks by calling sparsifyNets() -with default parameters. Only used when writeProfiles=FALSE} - -\item{useSparsify2}{(logical). Currently for testing only. A cleaner -sparsification routine.} +\item{useSparsify2}{(logical). Cleaner sparsification +sparsification routine. If FALSE, uses new matrix-based sparsify3} \item{cutoff}{(numeric) patients with similarity smaller than this value are not included in the corresponding interaction network} diff --git a/netDx/man/sparsify3.Rd b/netDx/man/sparsify3.Rd new file mode 100644 index 00000000..b7827d93 --- /dev/null +++ b/netDx/man/sparsify3.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sparsify3.R +\name{sparsify3} +\alias{sparsify3} +\title{cleaner sparsification routine - faster, matrix-based version} +\usage{ +sparsify3(W, outFile = "tmp.txt", cutoff = 0.3, maxInt = 50, + EDGE_MAX = 1000, includeAllNodes = TRUE, verbose = TRUE, + numCores = 4L) +} +\arguments{ +\item{W}{(matrix) similarity matrix} + +\item{outFile}{(char) path to file to write sparsified network} + +\item{cutoff}{(numeric) edges with weight smaller than this are set to NA} + +\item{maxInt}{(numeric) max num edges per node.} + +\item{EDGE_MAX}{(numeric) max num edges in network} + +\item{includeAllNodes}{(logical) if TRUE, ensures at least one edge is present for each patient. This feature is required when sparsification excludes test patients that are required to be classified. If the sparsification rules exclude all edges for a patient and this flag is set, then the strongest edge for each missing patient is added to the net. Note that this condition results in the total number of edges potentially exceeding EDGE_MAX} +} +\value{ +writes SIF content to text file (node1,node2,edge weight) +} +\description{ +cleaner sparsification routine - faster, matrix-based version +} +\details{ +Sparsifies similarity matrix to keep strongest edges. +Sets diagonal and edges < cutoff to NA. Keeps strongest maxInt edges +per node. Ties are ignored. Keeps a max of EDGE_MAX edges in the network. +} From 392865a5ae1486f71e253a0e9a854462ef802328 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Wed, 20 Jun 2018 16:56:31 -0400 Subject: [PATCH 121/124] trims edge weights to 4 sig digits --- netDx/R/sparsify2.R | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/netDx/R/sparsify2.R b/netDx/R/sparsify2.R index cff7309c..785c1544 100644 --- a/netDx/R/sparsify2.R +++ b/netDx/R/sparsify2.R @@ -24,12 +24,15 @@ sparsify2 <- function(W, outFile="tmp.txt",cutoff=0.3,maxInt=50,EDGE_MAX=1000, W[W < cutoff] <- NA x <- list() for (i in 1:nrow(W)) { x[[i]] <- sort(W[i,],decreasing=TRUE,na.last=TRUE)} +cat("past sorting\n") names(x) <- rownames(W) for (k in 1:length(x)) { + print(k) cur <- x[[k]] tokeep <- names(cur)[1:min(length(cur),maxInt)] W[k,which(!colnames(W)%in% tokeep)] <- NA } +cat("got past b\n") mmat <- na.omit(melt(W)) mmat <- mmat[order(mmat[,3],decreasing=TRUE),] @@ -44,7 +47,7 @@ sparsify2 <- function(W, outFile="tmp.txt",cutoff=0.3,maxInt=50,EDGE_MAX=1000, mmat[,2] <- as.character(mmat[,2]) univ <- c(mmat[,1],mmat[,2]) missing <- setdiff(rownames(W), univ) - cat(sprintf("missing = { %s }\n",paste(missing, collapse=","))) + #cat(sprintf("missing = { %s }\n",paste(missing, collapse=","))) if (length(missing)>0) { cat(sprintf("Sparsify2: found %i missing patients; adding strongest edge\n", length(missing))) @@ -59,6 +62,8 @@ sparsify2 <- function(W, outFile="tmp.txt",cutoff=0.3,maxInt=50,EDGE_MAX=1000, } } head(mmat) + mmat[,3] <- as.numeric(mmat[,3]) + mmat[,3] <- round(mmat[,3],digits=4) write.table(mmat,file=outFile,sep="\t",col=F,row=F,quote=F) return(mmat) From 13fa5d794d9ad1503263f52237fc8e48a011f5d0 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Wed, 20 Jun 2018 16:57:00 -0400 Subject: [PATCH 122/124] removes TT_STATUS if found --- netDx/R/nWay_netSum.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/netDx/R/nWay_netSum.R b/netDx/R/nWay_netSum.R index 9e9eb082..0a85eebd 100644 --- a/netDx/R/nWay_netSum.R +++ b/netDx/R/nWay_netSum.R @@ -71,7 +71,10 @@ Nway_netSum <- function(netmat=NULL, phenoDF,predClass,outDir,netDir, predClass=predClass, setSeed=seed_resampling, verbose=TRUE) p_full <- netmat pheno_full <- phenoDF - pheno_full <- pheno_full[,-which(colnames(pheno_full)%in%"TT_STATUS")] + if (any(colnames(pheno_full)%in% "TT_STATUS")) { + cat("** Warning, found TT_STATUS column. netDx adds its own column so this one will be removed **\n") + pheno_full <- pheno_full[,-which(colnames(pheno_full)%in%"TT_STATUS")] + } pScore <- list() cliqueNets <- list() From f27a66910485df9e45a38b27e369fd1d5fb10d46 Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Fri, 22 Jun 2018 17:24:46 -0400 Subject: [PATCH 123/124] added option to return rows with na values for highly variable feature situations (pathway-filtering) --- netDx/R/getFeatureScores.R | 9 ++++++--- netDx/R/writeEmapInput.R | 5 +++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/netDx/R/getFeatureScores.R b/netDx/R/getFeatureScores.R index 84fe23cf..1fa4cd98 100644 --- a/netDx/R/getFeatureScores.R +++ b/netDx/R/getFeatureScores.R @@ -9,6 +9,9 @@ #' if inDir is a list, it should have one key per class. The value should be the #' corresponding set of filenames for pathway_CV_score.txt #' @param predClasses (char) possible STATUS for patients +#' @param getFullCons (logical) if TRUE, does not remove rows with NA. +#' Recommended only when the number of input features is extensively +#' pruned by first-pass feature selection. #' @return (list) one key per patient class. Value is matrix of network #' scores across all train/test splits. Each score is the output of #' the inner fold of CV. @@ -17,7 +20,7 @@ #' path.package("netDx.examples")) #' netScores <- getFeatureScores(inDir, predClasses = c("SURVIVEYES","SURVIVENO")) #' @export -getFeatureScores <- function(inDir,predClasses) { +getFeatureScores <- function(inDir,predClasses,getFullCons=FALSE) { if (missing(inDir)) stop("inDir not provided"); if (missing(predClasses)) stop("predClasses missing; please specify classes"); @@ -44,7 +47,6 @@ getFeatureScores <- function(inDir,predClasses) { colnames(tmp)[1] <- "PATHWAY_NAME" netColl[[scoreFile]] <- tmp } - spos <- gregexpr("\\/",fList) # get the name of the iteration (rngX) assuming directory structure # rngX//GM_results>/pathway_CV_score.txt @@ -58,12 +60,13 @@ getFeatureScores <- function(inDir,predClasses) { # filter for nets meeting cutoff criteria cat("* Computing consensus\n") + cons <- getNetConsensus(netColl); x1 <- nrow(cons) na_sum <- rowSums(is.na(cons)) full_cons <- cons cons <- cons[which(na_sum < 1),] - out[[gp]] <- cons + if (getFullCons) out[[gp]] <- full_cons else out[[gp]] <- cons } return(out) } diff --git a/netDx/R/writeEmapInput.R b/netDx/R/writeEmapInput.R index 6777bc80..146d6c0c 100644 --- a/netDx/R/writeEmapInput.R +++ b/netDx/R/writeEmapInput.R @@ -40,7 +40,8 @@ #' outPfx=sprintf("%s/%s",outDir,gp)) #' @export writeEMapInput <- function(featScores, namedSets,netInfo,outPfx="curr", - pctPass=0.70,minScore=1,maxScore=10,trimFromName=c(".profile","_cont"),verbose=FALSE) { + pctPass=0.70,minScore=1,maxScore=10,trimFromName=c(".profile","_cont"), + verbose=FALSE) { netNames <- featScores[,1]; featScores <- featScores[,-1] @@ -60,7 +61,7 @@ writeEMapInput <- function(featScores, namedSets,netInfo,outPfx="curr", for (tr in trimFromName) netNames <- sub(tr,"",netNames) df1 <- data.frame(netName=netNames, maxScore=maxNetS) - colnames(netInfo) <- c("netType","netName") + #colnames(netInfo) <- c("netType","netName") df2 <- merge(x=df1,y=netInfo,by="netName") # write node attributes From 4dfc8591add81f81e7ca74eba3bbddba3ac82aed Mon Sep 17 00:00:00 2001 From: shraddhapai Date: Tue, 26 Jun 2018 16:58:57 -0400 Subject: [PATCH 124/124] dummy to resolve merge conflict --- misc/PanCancer/plotCombinedKM_HRatio.R | 131 +++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 misc/PanCancer/plotCombinedKM_HRatio.R diff --git a/misc/PanCancer/plotCombinedKM_HRatio.R b/misc/PanCancer/plotCombinedKM_HRatio.R new file mode 100644 index 00000000..c94d5666 --- /dev/null +++ b/misc/PanCancer/plotCombinedKM_HRatio.R @@ -0,0 +1,131 @@ +# average KM curves for KIRC +rm(list=ls()) + +require(rms) +require(survival) +require(survminer) +source("survival_plots/plot.survfit.custom.R") + +tumourType <- "GBM" + +rootDir <- "/Users/shraddhapai/Dropbox/netDx/BaderLab" +if (tumourType=="KIRC") { + dataDir <- sprintf("%s/2017_TCGA_KIRC/output/pruned_180204",rootDir) + survFile <- sprintf("%s/2017_TCGA_KIRC/input/KIRC_OS_core.txt",rootDir) + clinFile <- sprintf("%s/2017_TCGA_KIRC/input/KIRC_clinical_core.txt",rootDir) + repIter <- 17 +} else if (tumourType=="GBM") { + dataDir <- sprintf("%s/2017_TCGA_GBM/output/pruned_180204",rootDir) + survFile <- sprintf("%s/2017_TCGA_GBM/input/GBM_OS_core.txt",rootDir) + clinFile <- sprintf("%s/2017_TCGA_GBM/input/GBM_clinical_core.txt",rootDir) + repIter <- 21 +} else if (tumourType=="OV") { + dataDir <- sprintf("%s/2017_TCGA_OV/output/OV_oneNetPer_170425",rootDir) + survFile <- sprintf("%s/2017_TCGA_OV/input/OV_OS_core.txt",rootDir) + clinFile <- sprintf("%s/2017_TCGA_OV/input/OV_clinical_core.txt",rootDir) +} + +survDat <- read.delim(survFile,sep="\t",h=T,as.is=T) +clinDat <- read.delim(clinFile,sep="\t",h=T,as.is=T) +pheno <- merge(x=survDat,y=clinDat,by="feature") + +plotDF <- list() # compiles survival curves across all iterations +megaDF <- list() +hratio <- c() # cum hazards ratio for all iterations +keepCoxph <- c() # for representative iter +keepSurv <- c() # for representative iter +for (k in 1:100) { + print(k) + if (tumourType=="KIRC") { + dat <- read.delim(sprintf("%s/rng%i/clinicalAdnam/predictionResults.txt", + dataDir,k),sep="\t",h=T,as.is=T) + } else if (tumourType =="GBM") { + dat <- read.delim(sprintf("%s/rng%i/all/cutoff9/predictionResults.txt", + dataDir,k),sep="\t",h=T,as.is=T) + } + + colnames(dat)[1] <- "feature" + dat <- merge(x=dat,y=pheno,by="feature") + + # force first entry to be YES and second to be NO so we can tell them apart + # in the output. + dat$PRED_CLASS <- factor(dat$PRED_CLASS, + levels=c("SURVIVEYES","SURVIVENO")) + + megaDF[[k]] <- dat + + dat$SurvObj <- with(dat, Surv(OS_OS, STATUS_INT == 0)) + + # get cum hazards ratio for this split + model <- coxph(SurvObj~PRED_CLASS, data=dat) + hratio <- c(hratio,summary(model)$coef[1,2]) + + fit <- npsurv(SurvObj ~ PRED_CLASS, data = dat, + conf.type = "log-log") + + #par(mfrow=c(1,2)) + out <- plot.survfit.custom(fit) + #plot(0,0,type='n',xlim=c(0,max(out$ends$x)),ylim=c(0,1)) + out[[1]] <- as.data.frame(out[[1]]) + out[[2]] <- as.data.frame(out[[2]]) + + newdf <- out[[1]]; newdf$PRED_CLASS <- "SURVIVEYES"; newdf$split <- k + newdf2 <- out[[2]]; newdf2$PRED_CLASS <- "SURVIVENO"; newdf2$split <- k + + plotDF[[k]] <- rbind(newdf,newdf2) + + if (k == repIter) { + keepCoxph <- model + keepSurv <- dat + } +} + +cat("Plot representative iter (separately found to have auroc closest to average auroc\n") +require(forestmodel) +fit <- npsurv(SurvObj~PRED_CLASS,data=keepSurv,conf.type="log-log") +pdf(sprintf("%s_survPlot.pdf",tumourType)) +p <- ggsurvplot(fit,pval=TRUE,conf.int=TRUE,palette=c("blue","red"),legend.title="Survival type") +print(p) +p2 <- forest_model(keepCoxph) +print(p2) +dev.off() + +# approach 1: pool all results and make a single KM curve +res <- do.call("rbind",megaDF) +res$SurvObj <- with(res, Surv(OS_OS,STATUS_INT==0)) +fit <- npsurv(SurvObj ~ PRED_CLASS, data=res,conf.type="log-log") + +idx <- which(hratio > 50) +if (any(idx)) hratio <- hratio[-idx] +hratio <- data.frame(group="tumour",hratio=hratio) +p <- ggplot(hratio,aes(group,y=hratio)) + geom_boxplot() + ylim(c(0,quantile(hratio$hratio,0.98))+3) +p <- p + ggtitle(sprintf("Cum hazard ratio (one per split)(N=%i)",nrow(hratio))) +p <- p + geom_hline(yintercept=1,lty=2) +p <- p + theme(axis.text=element_text(size=12), + axis.title=element_text(size=14,face="bold")) + +p1 <- survminer::ggsurvplot(fit,data=res,conf.int=TRUE) +p2 <- p + +# approach 2 : pool all step functions +res <- do.call("rbind",plotDF) +# compute ci +out <- list() +for (k in unique(res$PRED_CLASS)) { + res2 <- subset(res,PRED_CLASS==k) + ub <- c(); lb <- c(); xx <- unique(res2$xx); muy <- c() + for (x in xx) { + yy <- res2$yy[which(res2$xx == x)] + mu <- mean(yy); offset <- sd(yy)/sqrt(length(yy)) + lb <- c(lb, mu-offset) + ub <- c(ub, mu+offset) + muy <- c(muy, mu) + } + out[[k]] <- data.frame(x=xx,y=muy,lb=lb,ub=ub,PRED_CLASS=k) +} +blah <- do.call("rbind",out) +p3 <- ggplot(blah,(aes(x=x,y=y,colour=PRED_CLASS))) + geom_line() + geom_ribbon(aes(ymin=lb,ymax=ub),alpha=0.2) + ggtitle("Manual mean+CI of compiled KM") + ylab("% survival") +xlab("time (months)") + +pdf(sprintf("%s.pdf",tumourType)) +print(p1); print(p2); print(p3) +dev.off()