## Task

* Prostate dataset from the ElemStatLearn package
* Column "lpsa" is the continuous response
* Column "train" is the originally used split into train/test
* Test if the split into train and test set is "lucky":
  compare with models fit on a random split

Source as R-markdown: [Prostate-Example.Rmd](Prostate-Example.Rmd).

```{r,options,include=FALSE}
options(BBmisc.ProgressBar.style = "off")
# preload packages to reduce output in markdown
library(BatchExperiments)
library(ElemStatLearn)
library(glmnet)
library(plyr)
```

## Set up problems and algorithms

```{r,prob_algos}
library(BatchExperiments)
library(ElemStatLearn)
library(glmnet)
library(plyr)

# Load data and get a quick overview
data("prostate", package = "ElemStatLearn")
str(prostate)
pairs(prostate, col = c("red", "darkgreen")[prostate$train + 1])

# Create registry
file.dir <- tempfile("prostate_")
reg <- makeExperimentRegistry("prostate_example", file.dir = file.dir,
  packages = "glmnet", seed = 1)

# we explicitly use the multicore backend here,
# regardless of what you have in your config
setConfig(conf = list(cluster.functions = makeClusterFunctionsMulticore(4)))

# Subsample function: returns a train/test partition of the data.
# With orig = TRUE the published "train" column defines the split;
# otherwise the train indicator is randomly permuted, yielding a
# random split of the same size as the original one.
ss <- function(static, orig = FALSE) {
  train.ind <- if (orig) static$train else sample(static$train)
  list(train = subset(static, train.ind, select = -train),
       test = subset(static, !train.ind, select = -train))
}

# Fit a simple linear model on the training part and
# return R^2 measured on the test part
linear <- function(dynamic) {
  model <- lm(lpsa ~ ., data = dynamic$train)
  y <- dynamic$test$lpsa
  y.hat <- predict(model, newdata = dynamic$test)
  1 - sum((y - y.hat)^2) / sum((y - mean(y))^2) # R^2
}

# Fit a regularized regression model, see ?glmnet.
# alpha = 1 gives the lasso penalty, alpha = 0 gives ridge.
# Returns R^2 on the test part.
regularized <- function(dynamic, alpha) {
  x <- as.matrix(subset(dynamic$train, select = -lpsa))
  y <- dynamic$train[["lpsa"]]
  model <- cv.glmnet(x, y, alpha = alpha)
  x <- as.matrix(subset(dynamic$test, select = -lpsa))
  y <- dynamic$test$lpsa
  y.hat <- predict(model, newx = x)
  1 - sum((y - y.hat)^2) / sum((y - mean(y))^2) # R^2
}

# Add problem and algorithms to the registry
addProblem(reg, "prostate", static = prostate, dynamic = ss, seed = 1)
addAlgorithm(reg, "linear", fun = linear)
addAlgorithm(reg, "lasso", fun = regularized)
addAlgorithm(reg, "ridge", fun = regularized)

# Add experiments using the original split
pd <- makeDesign("prostate", design = data.frame(orig = TRUE))
ad <- list(makeDesign("linear"),
           makeDesign("lasso", design = data.frame(alpha = 1)),
           makeDesign("ridge", design = data.frame(alpha = 0)))
addExperiments(reg, pd, ad)

# Add experiments using random splits, 100 replications
pd <- makeDesign("prostate", design = data.frame(orig = FALSE))
addExperiments(reg, pd, ad, repl = 100)
```

## Submit jobs to the cluster

```{r,submit}
# Quick overview: defined experiments
summarizeExperiments(reg)

# Chunk jobs together - they are pretty fast
chunked <- chunk(getJobIds(reg), n.chunks = 10, shuffle = TRUE)
submitJobs(reg, chunked)

# Check computational status
showStatus(reg)
```

## Analysis

```{r,analysis}
# Wait for the jobs first
waitForJobs(reg)

# Get grouped mean values
df <- reduceResultsExperiments(reg, fun = function(job, res) list(R2 = res))
ddply(df, c("orig", "algo"), summarise, n = length(R2), mean = mean(R2))

# Extract R2 of original split
ids <- findExperiments(reg, prob.pars = (orig == TRUE))
df <- reduceResultsExperiments(reg, ids, fun = function(job, res) list(R2 = res))
orig <- unlist(split(df$R2, df$algo))

# Extract R2 of subsampled splits
ids <- findExperiments(reg, prob.pars = (orig == FALSE))
df <- reduceResultsExperiments(reg, ids, fun = function(job, res) list(R2 = res))
subsampled <- as.data.frame(split(df$R2, df$algo))

# boxplot of the results
boxplot(subsampled)
points(1:3, orig[names(subsampled)], col = "red", pch = 19)
```