sim_variance_plot.R

############################################################################################
# Plot relative variance from static setting (T=1) simulations                             #
############################################################################################

library(tidyverse)
library(ggplot2)
library(data.table)
library(latex2exp)
library(dplyr)
library(grid)
library(gtable)

results_dir <- './sim_results/'
if(!dir.exists(results_dir)){
  print(paste0('create folder for results plot at: ', results_dir))
  dir.create(results_dir)
}

# Define parameters

options(echo=TRUE)
args <- commandArgs(trailingOnly = TRUE) # args <- c("outputs/20230427", 6, 'FALSE', 'FALSE') args <- c("outputs/20231002", 6, 'FALSE', 'FALSE')
output.path <- as.character(args[1])
J <- as.numeric(args[2])
covars40 <- as.character(args[3])
covars100 <- as.character(args[4])

estimand <- "ate"
m <- choose(J, 2)
n <- ifelse(J==6, 10000, 5000)
outcome.type <- 'binomial'

overlap.setting <- c("adequate","inadequate","rct")
gamma.setting <- c("zero","low","yang")

estimators <- c("tmle","tmle_glm", "tmle_bin", "gcomp", "iptw", "iptw_bin", "aiptw", "aiptw_bin")

estimator.color <- c("#F8766D","#A3A500","#00BF7D","#00B0F6","#E76BF3") #"#FF6C90"

n.estimators <- as.numeric(length(estimators))

filenames <- c(list.files(path=output.path, pattern = ".rds", full.names = TRUE))

filenames <- filenames[grep(paste0("J_",J),filenames)]
filenames <- filenames[grep(paste0("n_",n,"_"),filenames)]
filenames <- filenames[grep(paste0("outcome_type_",outcome.type),filenames)]
if(any( duplicated(substring(filenames, 18)))){
  print("removing duplicate filenames")
  filenames <- filenames[-which(duplicated(substring(filenames, 18)))]
}

results <- list() # structure is: [[filename]][[comparison]][[metric]]
for(f in filenames){
  print(f)
  result.matrix <- readRDS(f)
  R <- ncol(result.matrix )
  if(isTRUE(grep("use_SL_FALSE",f)==1)){
    estimator <- estimators[grep("glm", estimators)]
    rownames(result.matrix)[grep("tmle",rownames(result.matrix))] <- gsub("tmle","tmle_glm", rownames(result.matrix)[grep("tmle",rownames(result.matrix))])  }else{
      estimator <- setdiff(estimators,estimators[grep("glm", estimators)])
    }
  var <- matrix(NA, R, n.estimators)
  colnames(var) <- paste0("var_",estimand,"_",estimators)
  for(j in 1:m){
    for(i in paste0("var_",estimand,"_",estimator)){
      var[,which(i==colnames(var))] <- unlist(lapply(result.matrix[i,], "[", j))[1:R]
    }
    results[[f]][[j]] <- list("var"=var,"R"=R)
  }
}

# Create New lists
# structure is: [[estimator]][[filename]][[comparison]]

var <- list()
for(i in 1:length(paste0("var_",estimand,"_",estimators))){
  var[[i]] <- lapply(filenames, function(f) lapply(1:m, function(j) results[[f]][[j]]$var[,i]))
}

# Create dataframe for plot
results.df <- data.frame("var"=unlist(var),
                         "filename"=c(rep(unlist(sapply(1:length(filenames), function(i) rep(filenames[i], length.out=results[[i]][[1]]$R*m))), n.estimators)))


results.df$Estimator <- c(rep("TMLE-multi. (SL)",length.out=length(c(unlist(var[[1]])))), 
                          rep("TMLE-multi. (GLM)",length.out=length(c(unlist(var[[2]])))), 
                          rep("TMLE-bin. (SL)",length.out=length(c(unlist(var[[3]])))),
                          rep("G-Comp. (SL)",length.out=length(c(unlist(var[[4]])))),
                          rep("IPTW-multi. (SL)",length.out=length(c(unlist(var[[5]])))),
                          rep("IPTW-bin. (SL)",length.out=length(c(unlist(var[[6]])))),
                          rep("AIPTW-multi. (SL)",length.out=length(c(unlist(var[[7]])))),
                          rep("AIPTW-bin. (SL)",length.out=length(c(unlist(var[[8]])))))

if(J==3){
  results.df$comparison <- factor(rep(unlist(sapply(1:length(filenames), function (i) c(rep("32",length.out=results[[i]][[1]]$R), rep("31",length.out=results[[i]][[1]]$R), rep("21",length.out=results[[i]][[1]]$R)))), n.estimators))                        
}else if(J==6){
  results.df$comparison <- factor(rep(unlist(sapply(1:length(filenames), function (i) c(rep("65",length.out=results[[i]][[1]]$R),
                                                                                        rep("64",length.out=results[[i]][[1]]$R),
                                                                                        rep("63",length.out=results[[i]][[1]]$R),
                                                                                        rep("62",length.out=results[[i]][[1]]$R),
                                                                                        rep("61",length.out=results[[i]][[1]]$R),
                                                                                        rep("54",length.out=results[[i]][[1]]$R),
                                                                                        rep("53",length.out=results[[i]][[1]]$R),
                                                                                        rep("52",length.out=results[[i]][[1]]$R),
                                                                                        rep("51",length.out=results[[i]][[1]]$R),
                                                                                        rep("43",length.out=results[[i]][[1]]$R),
                                                                                        rep("42",length.out=results[[i]][[1]]$R),
                                                                                        rep("41",length.out=results[[i]][[1]]$R),
                                                                                        rep("32",length.out=results[[i]][[1]]$R),
                                                                                        rep("31",length.out=results[[i]][[1]]$R),
                                                                                        rep("21",length.out=results[[i]][[1]]$R)))), n.estimators))                        
}
results.df$J <- ifelse(J==3,3,6)
results.df$overlap.setting <- NA
results.df$gamma.setting <- NA

for(s in overlap.setting){
  print(s)
  results.df[grep(s, results.df$filename),]$overlap.setting <- s
}

for(s in gamma.setting){
  print(s)
  results.df[grep(s, results.df$filename),]$gamma.setting <- s
}

results.df <- results.df[rowSums(is.na(results.df[,1:3])) ==0,]

# create relative precision variable and filter Estimators

results.df <- results.df %>%
  group_by(comparison,overlap.setting,gamma.setting,J) %>% 
  mutate(oracle.var= mean(var[Estimator=="TMLE-multi. (GLM)"])) %>% 
  group_by(Estimator,comparison,overlap.setting,gamma.setting,J) %>% 
  mutate(rel.var = mean(oracle.var/var))

results.df <- results.df %>%
  filter(Estimator %in% c("TMLE-multi. (SL)",
                          "TMLE-bin. (SL)",
                          "G-Comp. (SL)",
                          "IPTW-multi. (SL)",
                          "IPTW-bin. (SL)"))

n.estimators <- length(unique(results.df$Estimator))

# reshape and plot
results.df$id <- with(results.df, paste(overlap.setting, gamma.setting, J, sep = "_"))
results_long <- reshape2::melt(results.df[!colnames(results.df) %in% c("J","id","filename")], id.vars=c("Estimator","comparison","overlap.setting","gamma.setting"))  # convert to long format

variable_names3 <- list(
  'low'="Low event rate",
  'yang'="Moderate event rate",
  'zero'= "No treatment effect",
  'adequate'= "Adequate overlap",
  'inadequate'= "Inadequate overlap",
  "rct"= "RCT") #  

labeller3 <- function(variable,value){
  return(variable_names3[value])
}

if(estimand=="att"){
  xlabel <- TeX('$ATT_{j,j^*}$')
}else if(estimand=="ate"){
  xlabel <- TeX('$ATE_{j,j^*}$')
}

## rel.var bar plot (avg. across comparisons)

rel.var.df <- data.frame(results.df) %>% 
  select(c(rel.var,Estimator,overlap.setting,gamma.setting)) %>% 
  group_by(Estimator,overlap.setting,gamma.setting) %>% 
  mutate(rel.var = mean(rel.var)) %>%
  dplyr::slice(n()) %>%
  select(c(rel.var,Estimator,overlap.setting,gamma.setting))

sim.results.rel.var.avg <- ggplot(data=rel.var.df,
                               aes(x=Estimator, y=rel.var, fill=forcats::fct_rev(Estimator)))  + geom_col() +
  facet_grid(overlap.setting ~  gamma.setting, scales = "free", labeller=labeller3)   + ylab("Average precision relative to TMLE-multi. (GLM) over all pairwise comparisons") +  
  scale_fill_manual(values= estimator.color) +
  scale_x_discrete(labels=NULL, limits = rev) +
  theme(legend.position = "none",legend.margin=margin(1,5,5,5), legend.justification="center",
        legend.box.margin=margin(0,0,0,0),legend.text=element_text(size=14), legend.key.width = unit(0.75, "cm"),legend.spacing.x = unit(0.75, 'cm'), legend.spacing.y = unit(0.75, 'cm')) +  
  theme(plot.title = element_text(hjust = 0.5, family="serif", size=16)) +
  theme(axis.title=element_text(family="serif", size=16)) +
  theme(axis.text.y=element_text(family="serif", size=16)) +
  theme(axis.text.x=element_text(family="serif", size=14, angle = 0, vjust = 0.5, hjust=0.25)) +
  theme(legend.text=element_text(family="serif", size=14)) +
  theme(legend.title=element_text(family="serif", size=14)) +
  theme(strip.text.x = element_text(family="serif", size=16)) +
  theme(strip.text.y = element_text(family="serif", size=16)) +
  theme(axis.title.y = element_text(margin = margin(t = 0, r = 20, b = 0, l =0))) +
  theme(axis.title.x = element_text(margin = margin(t = 20, r = 0, b = 0, l =0))) +
  theme(panel.spacing = unit(1, "lines")) +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

# Labels 
labelR <- "Treatment setting" #TeX("$\\beta_j$ treatment model coefficients")
labelT <- "Outcome setting" #TeX("$\\gamma_j$ outcome model coefficients")

# Construct the new strip grobs
stripR <- gTree(name = "Strip_right", children = gList(
  rectGrob(gp = gpar(col = NA, fill = "grey85")),
  textGrob(labelR, rot = -90, gp = gpar(fontsize=22, col = "grey10"))))

stripT <- gTree(name = "Strip_top", children = gList(
  rectGrob(gp = gpar(col = NA, fill = "grey85")),
  textGrob(labelT, gp = gpar(fontsize=22, col = "grey10"))))

# Get the ggplot grob
z.rel.var.avg <- ggplotGrob(sim.results.rel.var.avg)

# Get the positions of the strips in the gtable: t = top, l = left, ...
posR <- subset(z.rel.var.avg$layout, grepl("strip-r", name), select = t:r)
posT <- subset(z.rel.var.avg$layout, grepl("strip-t", name), select = t:r)

# Add a new column to the right of current right strips, 
# and a new row on top of current top strips
width <- z.rel.var.avg$widths[max(posR$r)]    # width of current right strips
height <- z.rel.var.avg$heights[min(posT$t)]  # height of current top strips

z.rel.var.avg <- gtable_add_cols(z.rel.var.avg, width, max(posR$r))  
z.rel.var.avg <- gtable_add_rows(z.rel.var.avg, height, min(posT$t)-1)

# Position the grobs in the gtable
z.rel.var.avg <- gtable_add_grob(z.rel.var.avg, stripR, t = min(posR$t)+1, l = max(posR$r) + 1, b = max(posR$b)+1, name = "strip-right")
z.rel.var.avg <- gtable_add_grob(z.rel.var.avg, stripT, t = min(posT$t), l = min(posT$l), r = max(posT$r), name = "strip-top")

# Add small gaps between strips
z.rel.var.avg <- gtable_add_cols(z.rel.var.avg, unit(1/5, "line"), max(posR$r))
z.rel.var.avg <- gtable_add_rows(z.rel.var.avg, unit(1/5, "line"), min(posT$t))

# Draw it
grid.newpage()
grid.draw(z.rel.var.avg)

ggsave(paste0("sim_results/static_simulation_rel_var_avg_estimand_",estimand,"_J_",J,"_n_",n,"_outcome_",outcome.type,"_covars_40_",covars40,"_covars_100_",covars100,"_R_",results[[1]][[1]]$R,".png"),plot = z.rel.var.avg,scale=1.75)