## Task

* Prostate dataset from the ElemStatLearn package
* Column "lpsa" is the continuous response
* Column "train" is the originally used split into train/test
* Test if the split into train and test set is "lucky":
  compare with models fit on a random split

Source as R-markdown: [Prostate-Example.Rmd](Prostate-Example.Rmd).

```{r,options,include=FALSE}
options(BBmisc.ProgressBar.style = "off")
# preload packages to reduce output in markdown
library(BatchExperiments)
library(ElemStatLearn)
library(glmnet)
library(plyr)
```

## Set up problems and algorithms

```{r,prob_algos}
library(BatchExperiments)
library(ElemStatLearn)
library(glmnet)
library(plyr)

# Load data and get a quick overview
data("prostate", package = "ElemStatLearn")
str(prostate)
pairs(prostate, col = c("red", "darkgreen")[prostate$train + 1])

# Create registry
file.dir <- tempfile("prostate_")
reg <- makeExperimentRegistry("prostate_example", file.dir = file.dir,
  packages = "glmnet", seed = 1)

# we explicitly use the multicore backend here,
# regardless of what you have in your config
setConfig(conf = list(cluster.functions = makeClusterFunctionsMulticore(4)))

# Subsample function: returns a train/test partition of the data.
# With orig = TRUE the published "train" column defines the split;
# otherwise the train indicator is randomly permuted, yielding a
# random split of the same size as the original one.
ss <- function(static, orig = FALSE) {
  train.ind <- if (orig) static$train else sample(static$train)
  list(train = subset(static, train.ind, select = -train),
       test = subset(static, !train.ind, select = -train))
}

# Fit a simple linear model on the training part and
# return R^2 measured on the test part
linear <- function(dynamic) {
  model <- lm(lpsa ~ ., data = dynamic$train)
  y <- dynamic$test$lpsa
  y.hat <- predict(model, newdata = dynamic$test)
  1 - sum((y - y.hat)^2) / sum((y - mean(y))^2) # R^2
}

# Fit a regularized regression model, see ?glmnet.
# alpha = 1 gives the lasso penalty, alpha = 0 gives ridge.
# Returns R^2 on the test part.
regularized <- function(dynamic, alpha) {
  x <- as.matrix(subset(dynamic$train, select = -lpsa))
  y <- dynamic$train[["lpsa"]]
  model <- cv.glmnet(x, y, alpha = alpha)
  x <- as.matrix(subset(dynamic$test, select = -lpsa))
  y <- dynamic$test$lpsa
  y.hat <- predict(model, newx = x)
  1 - sum((y - y.hat)^2) / sum((y - mean(y))^2) # R^2
}

# Add problem and algorithms to the registry
addProblem(reg, "prostate", static = prostate, dynamic = ss, seed = 1)
addAlgorithm(reg, "linear", fun = linear)
addAlgorithm(reg, "lasso", fun = regularized)
addAlgorithm(reg, "ridge", fun = regularized)

# Add experiments using the original split
pd <- makeDesign("prostate", design = data.frame(orig = TRUE))
ad <- list(makeDesign("linear"),
           makeDesign("lasso", design = data.frame(alpha = 1)),
           makeDesign("ridge", design = data.frame(alpha = 0)))
addExperiments(reg, pd, ad)

# Add experiments using random splits, 100 replications
pd <- makeDesign("prostate", design = data.frame(orig = FALSE))
addExperiments(reg, pd, ad, repl = 100)
```

## Submit jobs to the cluster

```{r,submit}
# Quick overview: defined experiments
summarizeExperiments(reg)

# Chunk jobs together - they are pretty fast
chunked <- chunk(getJobIds(reg), n.chunks = 10, shuffle = TRUE)
submitJobs(reg, chunked)

# Check computational status
showStatus(reg)
```

## Analysis

```{r,analysis}
# Wait for the jobs first
waitForJobs(reg)

# Get grouped mean values
df <- reduceResultsExperiments(reg, fun = function(job, res) list(R2 = res))
ddply(df, c("orig", "algo"), summarise, n = length(R2), mean = mean(R2))

# Extract R2 of original split
ids <- findExperiments(reg, prob.pars = (orig == TRUE))
df <- reduceResultsExperiments(reg, ids, fun = function(job, res) list(R2 = res))
orig <- unlist(split(df$R2, df$algo))

# Extract R2 of subsampled splits
ids <- findExperiments(reg, prob.pars = (orig == FALSE))
df <- reduceResultsExperiments(reg, ids, fun = function(job, res) list(R2 = res))
subsampled <- as.data.frame(split(df$R2, df$algo))

# boxplot of the results
boxplot(subsampled)
points(1:3, orig[names(subsampled)], col = "red", pch = 19)
```