% simulation2.Rnw

\documentclass{article}
\usepackage[left=20mm, right=15mm, top=15mm, bottom=20mm]{geometry}

\begin{document}
%\VignetteEngine{knitr::knitr}
<<setup>>=
## knitr chunk defaults for the whole document: no caching, code is echoed,
## messages are suppressed
library("knitr")
opts_chunk$set(cache = FALSE, echo = TRUE, message = FALSE)

## packages used below: partykit (cforest), sandwich (estfun), plyr (ldply),
## mvtnorm (rmvnorm), ggplot2 (all figures)
library("partykit")
library("sandwich")
library("plyr")
library("mvtnorm")
library("ggplot2")
## black-and-white ggplot2 theme with a larger base font for all figures
theme_set(theme_bw(base_size = 18))


## project helper scripts; presumably they define person_mods, varimp,
## comp_loglik.lm and prune_forest used in later chunks -- TODO confirm
source("basis/personalised_models.R")
source("basis/variable_importance.R")
source("basis/dependence_plots.R")
source("basis/pruning.R")

## fix the random number generator state for the chunks that follow
set.seed(123)
@

\section{Simulation of data}
<<datasimulation>>=
##' Logistic curve with slope parameter
##'
##' @param k numeric slope (steepness) of the curve
##' @param x numeric vector at which the curve is evaluated
##' @return numeric vector 1 / (1 + exp(-k * x)), i.e. values between 0 and 1
logistf <- function(k, x) {
  decay <- exp(-k * x)
  1 / (1 + decay)
}

##' Simulate data with a treatment effect that depends on z1
##'
##' Draws p standard normal covariates with pairwise correlation 0.2, maps them
##' to (-pi, pi) via the normal distribution function, assigns treatment "A"
##' with a probability that is a logistic function of the confounder and
##' generates y = 1.9 + (0.2 + beta * logistf(1.5, z1)) * I(a = A) + error.
##'
##' @param n number of observations
##' @param p number of covariates z1, ..., zp
##' @param beta scaling of the z1-dependent part of the treatment effect
##' @param sd standard deviation of the normal error term
##' @param confounderind number of z which is the confounder
##' @param Pa_logis is P(treatment = A) logistic function (TRUE) or 1-logistic function (FALSE)
##' @return data.frame with columns y, a (factor with levels "C", "A") and z1, ..., zp
sim_data <- function(n = 500, p = 10, beta = 3, sd = 1, confounderind = 1, Pa_logis = TRUE){
  
  ## fail early with a clear message instead of an obscure subscript error
  ## (or a zero-length treatment vector for confounderind = 0) further down
  stopifnot(length(confounderind) == 1, confounderind >= 1, confounderind <= p)
  
  ## z variables are correlated: unit variances, all correlations 0.2
  sigma <- diag(p) 
  sigma[sigma == 0] <- 0.2
  ztemp <- rmvnorm(n, sigma = sigma)
  ## transform to the interval (-pi, pi)
  z <- (pnorm(ztemp) * 2 * pi) - pi  
  colnames(z) <- paste0("z", seq_len(ncol(z)))
  z1 <- z[, 1]
  
  prob_a <- logistf(1.5, z[, confounderind]) 
  if (!Pa_logis) prob_a <- 1 - prob_a
  # plot(z[, confounderind], prob_a)
  
  lev <- c("C", "A")
  ## one draw per observation, in the original order, so that the random
  ## number stream is the same as with the former sapply() call
  a_drawn <- vapply(prob_a, 
                    function(pr) sample(lev, size = 1, prob = c(1 - pr, pr)), 
                    character(1))
  a <- factor(a_drawn, labels = lev, levels = lev)
  # a <- rep(factor(lev, labels = lev, levels = lev), length = n)
  
  ## the treatment effect depends on z1 only, whichever z is the confounder
  treated <- a %in% "A"
  y <- 1.9 + 0.2 * treated + beta * logistf(1.5, z1) * treated + rnorm(n, 0, sd)
  
  data.frame(y = y, a = a, z)
}

@


\section{The function needed for fitting the cforest}
<<lm_fittingfunction>>=
##' Transformation function for cforest based on the linear model y ~ a
##'
##' @param data data.frame with the response y and the treatment factor a
##' @param weights case weights; observations with weight 0 are not used in the fit
##' @param parm columns of estfun(mod) to return
##' @return matrix with nrow(data) rows holding the selected score columns;
##'   rows of observations with weight 0 are 0
lmf <- function(data, weights, parm = c(1,2)) {
  
  in_node <- weights > 0
  zero_scores <- function(k) matrix(0, nrow = nrow(data), ncol = k)
  
  ## a treatment arm with fewer than two observations: return all-zero scores,
  ## we don't want to split further...
  arm_sizes <- table(data[["a"]][in_node])
  if (any(arm_sizes < 2)) {
    return(zero_scores(length(parm)))
  }
  
  mod <- lm(y ~ a, weights = weights, data = data, subset = weights > 0)
  scores <- as.matrix(estfun(mod)[, parm])
  
  ## put the scores back into a matrix with one row per row of data
  out <- zero_scores(ncol(scores))
  out[in_node, ] <- scores
  out
}
@


\section{Computing individualised models}

\Sexpr{knit_child("personalised_models.Rnw")}

<<sind_mods>>=
## one simulated training data set
beta <- 3
train <- sim_data(n = 600, p = 10, beta = beta)

## all columns except outcome and treatment enter the right-hand side
Z <- setdiff(names(train), c("y", "a"))
rhs <- paste(Z, collapse = "+")
cfm <- as.formula(paste("y + a ~", rhs))

## forest with lmf as transformation, trees grown on subsamples drawn
## without replacement
lmforest <- cforest(
  cfm, data = train, ytrafo = lmf, ntree = 100, cores = 1,
  perturb = list(replace = FALSE)
)

## personalised linear models for the training observations
pmods <- person_mods(
  object = lmforest, basemod = "lm", newdata = train,
  OOB = FALSE, parallel = TRUE
)
@


\section{Variable importance}
<<svarimp, out.width = "0.7\\textwidth", out.height = "0.7\\textwidth">>=
###### variable importance ######
## importance is computed on the pruned forest
lmforest_pruned <- prune_forest(
  lmforest, endpoint = "other", treatmentvar = "a", minobs = 2
)
VI <- varimp(
  forest = lmforest_pruned, loglik = comp_loglik.lm,
  basemod = "lm", OOB = TRUE, parallel = TRUE
)
VI

## keep the variables in the order z1, ..., zp
VI$variable <- factor(VI$variable, levels = Z)
## plotmath labels (z with subscript) for the axis, named by variable
text.z <- sapply(Z, function(x) bquote(z[.(gsub("z", "", x))]), simplify = FALSE)

save(VI, file = "sim2_varimp.rda")

## horizontal bar chart of the importances
ggplot(VI, aes(y = VI, x = variable)) +
  geom_bar(stat = "identity", width = .1) +
  coord_flip() +
  scale_x_discrete(labels = text.z) +
  theme(panel.grid.major.y = element_blank())

@

\section{Comparison to regular forest}
<<cforest2>>=

set.seed(22)
## Use the cores that are actually available instead of a hard-coded 50.
## partykit documents that cores is passed on to mclapply, which cannot use
## more than one core on Windows.
ncores_cf <- if (.Platform$OS.type == "windows") {
  1L
} else {
  max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
}
condforest <- cforest(y ~ ., data = train, ntree = 100, cores = ncores_cf, 
                      perturb = list(replace = FALSE))

## counterfactual copies of the training data:
## everybody treated (t1) and everybody under control (t0)
t0 <- t1 <- train
t1$a[] <- "A"
t0$a[] <- "C"

## treatment effect of the regular forest: difference of the predictions
## under treatment and under control
trt_eff_condforest <- predict(condforest, newdata = t1, type = "response") -
  predict(condforest, newdata = t0, type = "response")


## treatment effect of the personalised models: coefficient of a = "A"
trt_eff_pmods <- vapply(pmods, function(m) coef(m)[["aA"]], numeric(1))


trt_eff <- data.frame(trt_eff = c(trt_eff_pmods, trt_eff_condforest), 
                      type = rep(c("pmods", "cforest"), each = nrow(train)),
                      z1 = rep(train$z1, times = 2))

save(trt_eff, file = "sim2_compareforests.rda")

## estimated effects (points) against the true effect 0.2 + beta * logistf(1.5, z1) (line)
ggplot(data = trt_eff) +
  geom_line(aes(x = z1, y = (0.2 + beta * logistf(1.5, z1)))) + 
  geom_point(aes(x = z1, y = trt_eff, color = type)) +
  ylab(expression(beta(z[1]))) +  xlab(bquote(z[.(gsub("z", "", Z[1]))])) 


@


\section{Function which fits the forest and computes personalised models for simulated data}

<<sim_complogliks>>=
##' simulation for cforest check
##'
##' Grows a cforest with lmf as transformation on the training data, computes
##' personalised linear models for the test observations and returns their
##' coefficients together with the test data.
##'
##' @param i unused (lets the function be called via lapply over iterations)
##' @param data list of 2 list(train, test). If NULL training data of size n and test data of size m are obtained.
##' @param n size of the simulated training data (only used if data is NULL)
##' @param m size of the simulated test data (only used if data is NULL)
##' @param p,beta,confounderind,Pa_logis passed on to sim_data (only used if data is NULL)
##' @return data.frame with the coefficients of the personalised models next
##'   to the columns of the test data
sim_wi <- function(i, data = NULL, n = 600, m = 600, p = 10, beta = 3,
                   confounderind = 1, Pa_logis = TRUE){
  
  if (is.null(data)) {
    data <- sim_data(n = (n + m), p = p, beta = beta, confounderind = confounderind, Pa_logis = Pa_logis)
    ## seq_len() keeps the split correct for n = 0 or m = 0
    train <- data[seq_len(n), ]
    test <- data[n + seq_len(m), ]
  } else {
    train <- data$train
    test <- data$test
  }
  
  Z <- names(train)[!(names(train) %in% c("y", "a"))]
  
  
  ### linear model random forest ################
  ## detectCores() lives in the parallel package and may return NA or 1;
  ## always end up with at least one core. partykit documents that cores is
  ## passed on to mclapply, which cannot use more than one core on Windows.
  ncores <- if (.Platform$OS.type == "windows") {
    1L
  } else {
    max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
  }
  cfm <- as.formula(paste("y + a ~", paste(Z, collapse = "+")))
  lmforest <-  cforest(cfm, data = train, ytrafo = lmf, ntree = 100, cores = ncores, 
                       perturb = list(replace = FALSE))
  
  mods <- person_mods(object = lmforest, basemod = "lm", newdata = test, ncores = ncores)
  
  ## one row of coefficients per test observation
  person.mods <- cbind(t(sapply(mods, coef)), test)
  person.mods
}
@

\section{Results}
<<sim, out.width = "\\textwidth", out.height = "\\textheight">>=
## all combinations of treatment assignment mechanism and confounder
scenarios <- expand.grid(Pa_logis = c(TRUE, FALSE), confounderind = c(1, 2))

## 100 simulation runs per scenario. The rows are indexed directly:
## apply() would turn the data.frame into a numeric matrix, so Pa_logis would
## arrive as 0/1 and both arguments as named vectors.
pms <- lapply(seq_len(nrow(scenarios)), function(s) {
  lapply(1:100, sim_wi, 
         confounderind = scenarios$confounderind[s], 
         Pa_logis = scenarios$Pa_logis[s])
})


# personalized models
library("plyr")
## per scenario: stack the results of all runs and record the run number
pd <- lapply(pms, function(pm) { 
  ldply(seq_along(pm), 
        function(i) {
          pm[[i]]$iteration <- i
          pm[[i]]
        }) 
})
save(Z, pd, file = "sim2_dependenceplotdat.rda")


##' Scatter plot of a personalised model coefficient against one covariate
##'
##' @param pd data.frame containing the columns named by variable and treatment
##' @param variable name of the column on the x-axis, e.g. "z1"
##' @param treatment name of the column on the y-axis, e.g. "aA"
##' @return ggplot object with fixed axis limits
dependenceplot.sim <- function(pd, variable, treatment) {
  ## index of the covariate, used as subscript in the axis labels
  idx <- gsub("z", "", variable)
  x_label <- bquote(z[.(idx)])
  y_label <- bquote(bar(beta)(z[.(idx)]))
  
  ggplot(data = pd, aes_string(x = variable, y = treatment)) + 
    geom_point(alpha = 0.1) + 
    xlab(x_label) + 
    ylab(y_label) + 
    xlim(-pi, pi) + 
    ylim(-0.9, 3.9)
}
## per scenario: dependence plots for the first two covariates
p <- lapply(pd, function(pdd) {
  lapply(Z[1:2], dependenceplot.sim, pd = pdd, treatment = "aA")
})

library("gridExtra")


scenarios

## true treatment effect as a function of z1, added to every z1 panel
true_effect <- geom_line(aes(x = z1, y = (0.2 + beta * logistf(1.5, z1))))

## panels in the order scenario 1 (z1, z2), scenario 2 (z1, z2), ...
panels <- vector("list", 8)
for (k in 1:4) {
  panels[[2 * k - 1]] <- p[[k]][[1]] + true_effect
  panels[[2 * k]] <- p[[k]][[2]]
}
do.call(grid.arrange, c(panels, list(ncol = 2)))
@


\end{document}