diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6fa0854
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+data/*
+docs/*
+fig/*
+picrust2/*
+Rmd/*
+.DS_Store
diff --git a/R/alpha_diversity.R b/R/alpha_diversity.R
new file mode 100644
index 0000000..c4eb076
--- /dev/null
+++ b/R/alpha_diversity.R
@@ -0,0 +1,791 @@
+# for reproducibility
+set.seed(1)
+# this script contains step 1 of https://osf.io/s45mu (alpha diversity)
+library(mia)
+library(glue)
+library(tidyverse)
+library(rstanarm)
+library(mice)
+library(brms)
+library(here)
+library(HDInterval)
+source(here("R/helper_functions.R"))
+
+
+
+
+# Data Preparation
+################################################################################
+if (!file.exists(here::here("data/data_imp.Rds"))) {
+
+
+  # import of biomfile and meta data can be found in the import script
+  load(here::here("data/data.Rds"))
+
+  mdata$siblings <- ifelse(
+    mdata$siblings == 0, 0, ifelse(
+      mdata$siblings >= 1, 1, NA))
+  # before estimating AD, I bring the mdata in long format where appropriate 
+  l_mdata <- pivot_longer(
+    mdata, 
+    matches("age_week\\d+"), 
+    names_to = "week",
+    names_prefix = "age_week",
+    values_to = "age") %>%
+    mutate(sample_id = ifelse(week == "2", glue("{skippy_id}_1"), ifelse(week == "5", glue("{skippy_id}_2"), ifelse(week == "52", glue("{skippy_id}_3"), NA))))
+
+
+  # obtain AD and combine dfs
+  tse <- estimateDiversity(
+    tse,
+    assay_name = "counts",
+    index = c("shannon", "faith", "inverse_simpson"),
+    name = c("shannon", "faith", "inverse_simpson")
+  )
+  tse <- estimateRichness(
+    tse,
+    assay_name = "counts",
+    index = "chao1",
+    name = "chao1"
+  )
+  ad <- colData(tse) %>% 
+    as.data.frame() %>%
+    select(sample_id, chao1, inverse_simpson, shannon, faith)
+
+  df <- ad %>%
+    dplyr::full_join(l_mdata, by = "sample_id") %>%
+    arrange(skippy_id, week)
+
+
+
+  # variables I choose to retain for imputation based on expected predictive 
+  # value for microbiota data imputation
+  vars <- c(
+    "sample_id",
+    "skippy_id",
+    "week",
+    "age",
+    "chao1",
+    "inverse_simpson",
+    "shannon",
+    "faith",
+    "condition",
+    "csection",
+    "birthweight",
+    "siblings",
+    "sex",
+    "bfexcl",
+    "bfperc_w1",
+    "bfperc_w2",
+    "bfperc_w3",
+    "bfperc_w4",
+    "bfperc_w5",
+    "apgar_5",
+    "ges_age",
+    "edlevel",
+    "parity",
+    "weaning",
+    "antibiotic_1year",
+    "antibiotic_week2",
+    "antibiotic_week5"
+  )
+  mdata <- dplyr::rename(
+    mdata, 
+    constipation_week52 = constipation_1year,
+    diarrhea_week52 = diarrhea_1year,
+    antibiotic_week52 = antibiotic_1year
+  )
+
+
+
+  # furthermore I include constipation and diarrhea but first i need to put 
+  # them into long format
+  c_and_d <- select(
+    mdata,
+    skippy_id, 
+    matches("constipation_week\\d+$"), 
+    matches("diarrhea_week\\d+$"),
+    matches("antibiotic_week\\d+$")
+    ) %>%
+    pivot_longer(
+    cols = contains("week"),
+    names_to = c(".value", "week"),
+    names_pattern = "(\\w+)_week(\\d+)")
+
+
+
+  d <- dplyr::left_join(
+  select(df, all_of(vars)),
+  c_and_d, 
+  by = c("skippy_id", "week")) %>%
+  arrange(sample_id) %>%
+  select(
+    sample_id, skippy_id, week, age, chao1, inverse_simpson, shannon, faith, condition,
+    csection, birthweight, siblings, sex, contains("bf"), constipation, antibiotic, 
+    diarrhea, everything(), -contains("antibiotic_w"))
+
+  save(d, file = here::here("data/table1.Rds"))
+  # now we are ready for imputation, we start with m = 5 to make the script run 
+  # but for the final estimates we will increase number of imputations
+  # if skippy_id and week are numeric the imputation model function better, other-
+  # wise the age var imputation is not good.
+  nvars <- c("skippy_id", "week")
+  d <- mutate(d, 
+    across(all_of(nvars), function(x) as.numeric(x)),
+    age = age - week * 7
+  )
+  imp <- mice(d, m = 50)
+
+  # for the analysis we need to change dtypes for some vars
+  # variables to standardize
+
+  svars <- c(
+    "age", 
+    "age_dev",
+    "chao1",
+    "inverse_simpson",
+    "faith",
+    "shannon", 
+    "birthweight", 
+    "ges_age",
+    "apgar_5",
+    "edlevel"
+  )
+  fvars <- c(
+    "week", 
+    "condition", 
+    "csection", 
+    "siblings", 
+    "sex", 
+    "antibiotic"
+  )
+  implist <- map(1:5, function(x) {
+    dtemp <- complete(imp, x)
+    dtemp <- mutate(dtemp, 
+      across(all_of(fvars), function(x) as.factor(x)),
+      # across(all_of(svars), function(x) scale(x)[, 1]),
+      age_dev = age,
+      age = as.numeric(levels(week)) * 7 + age_dev,
+      age_s = scale(age)[, 1],
+      age_y = age/365,
+      age_dev_s = scale(age_dev)[, 1],
+      chao1_s = scale(chao1)[, 1],
+      faith_s = scale(faith)[, 1],
+      inverse_simpson_s = scale(inverse_simpson)[, 1],
+      shannon_s = scale(shannon)[, 1],
+      birthweight_s = scale(birthweight)[, 1],
+      ges_age_s = scale(ges_age)[, 1],
+      apgar_5_s = scale(apgar_5)[, 1],
+      edlevel_s = scale(edlevel)[, 1],
+      skippy_id = as.integer(skippy_id)
+    )
+    return(dtemp)
+  })
+
+  save(d, implist, file = here::here("data/data_imp.Rds"))
+ } else {
+  load(file = here::here("data/data_imp.Rds"))
+}
+
+
+
+# Figure out model structure for the ITT analyses 
+################################################################################
+
+
+# I decide which time variable we will use based on LOO 
+f1 <- bf(shannon_s ~ week * condition + (1|skippy_id))
+m1 <- brm_multiple(
+  data = implist,
+  formula = f1,
+  file = here::here("data/m1.Rds")
+)
+loo_m1 <- add_criterion(
+  m1,
+  "loo",
+  file = here::here("data/loo_m1"),
+  moment_match = FALSE
+)
+loo_m1
+
+f2 <- bf(shannon_s ~ age_y * condition + (1|skippy_id))
+m2 <- brm_multiple(
+  data = implist,
+  formula = f2,
+  file = here::here("data/m2.Rds")
+)
+
+loo_m2 <- add_criterion(
+  m2,
+  "loo",
+  file = here::here("data/loo_m2"),
+  moment_match = FALSE
+)
+f3 <- bf(shannon_s ~ week * condition + age_dev + (1|skippy_id))
+m3 <- brm_multiple(
+  data = implist,
+  formula = f3,
+  file = here::here("data/m3.Rds")
+)
+loo_m3 <- add_criterion(
+  m3,
+  "loo",
+  file = here::here("data/loo_m3"),
+  moment_match = FALSE
+)
+
+
+loo_comp <- loo_compare(loo_m1, loo_m2, loo_m3)
+loo_comp
+
+# the model with age has best fit. Therefore, we will use these as our main
+# models. We will double check if results differ as compared to using week 
+# But especially for the frequentist models this setup makes it easier as well.
+
+
+# first we must decide which covariates we want to include. For ITT we can 
+# decide totally based on model fit as long as it does not interfere with 
+# causal chain of our DAG. E.g. we must exclude BF although it prob help to 
+# predict AD as SSC --> BF and we want ACE of SSC. However, variables such as 
+# age, csection, siblings etc. can be included based on whether they improve 
+# model fit. We can use loo for this to get a quick answer: 
+
+
+# I consider the following variables for selection 
+coefs <- c(
+  "csection", 
+  "birthweight_s", 
+  "siblings",
+  "sex",
+  "apgar_5_s",
+  "ges_age_s",
+  "edlevel_s"
+)
+
+
+
+
+# I will use the following algorithm:
+# For each dataset in implist:
+  # Calculate LOO for base model
+  # Then for each var in coefs:
+    # calculate LOO for base model + coef 
+    # If LOO indicates the new model is a better fit
+    # keep that coef in list 
+
+loo_comp <- map_dfr(coefs, function(coef) {
+  map_dfr(1:5, function(i) {
+    map_dfr(list(c(2, 5), 52), function(n_week) {
+      if (52 %in% n_week) {
+        # base model 
+        f1 <- bf(shannon_s ~ age * condition)
+        m1 <- brm(
+          data = filter(implist[[i]], week == 52) %>%
+                    mutate(age = scale(age)[, 1]),
+          formula = f1,
+          file = here::here(glue("data/m1_age_imp{i}_{n_week[1]}.Rds"))
+        )
+        loo_m1 <- add_criterion(
+          m1,
+          "loo",
+          file = here::here(glue("data/loo_m1_age_imp{i}_{n_week[1]}")),
+          moment_match = FALSE
+        )
+
+        f2 <- bf(glue("shannon_s ~ age * condition + {coef}"))
+        m2 <- brm(
+          data = filter(implist[[i]], week == 52) %>%
+                    mutate(age = scale(age)[, 1]),
+          formula = f2,
+          file = here::here(glue("data/m2_age_imp{i}_{coef}_{n_week[1]}.Rds"))
+        )
+
+        loo_m2 <- add_criterion(
+          m2,
+          "loo",
+          file = here::here(glue("data/loo_m2_age_imp{i}_{coef}_{n_week[1]}")),
+          moment_match = FALSE
+        )
+        lcomp <- loo_compare(loo_m2, loo_m1)
+        score <- ifelse(rownames(lcomp)[1] == "loo_m1", 0, 1)
+      } else {
+        # base model 
+        f1 <- bf(shannon_s ~ age * condition + (1|skippy_id))
+        m1 <- brm(
+          data = filter(implist[[i]], week != 52) %>%
+                    mutate(age = scale(age)[, 1]),
+          formula = f1,
+          file = here::here(glue("data/m1_age_imp{i}_{n_week[1]}.Rds"))
+        )
+        loo_m1 <- add_criterion(
+          m1,
+          "loo",
+          file = here::here(glue("data/loo_m1_age_imp{i}_{n_week[1]}")),
+          moment_match = FALSE
+        )
+
+        f2 <- bf(glue("shannon_s ~ age * condition + {coef} + (1|skippy_id)"))
+        m2 <- brm(
+          data = filter(implist[[i]], week != 52) %>%
+                    mutate(age = scale(age)[, 1]),
+          formula = f2,
+          file = here::here(glue("data/m2_age_imp{i}_{coef}_{n_week[1]}.Rds"))
+        )
+
+        loo_m2 <- add_criterion(
+          m2,
+          "loo",
+          file = here::here(glue("data/loo_m2_age_imp{i}_{coef}_{n_week[1]}")),
+          moment_match = FALSE
+        )
+        lcomp <- loo_compare(loo_m2, loo_m1)
+        score <- ifelse(rownames(lcomp)[1] == "loo_m1", 0, 1)
+      }
+      tibble(
+        model = ifelse(n_week == 52, "1year", "2 and 5 weeks"),
+        coef = coef,
+        imp = i,
+        score = score
+      )      
+    })
+  })
+})
+
+# siblings was the only covariate that improved out of sample 
+# predictions. Therefore, we will only keep this variable
+group_by(loo_comp, coef, model) %>%
+  summarise(ss = sum(score))
+
+# lastly: should we include random effects for age?
+loo_comp <- map_dfr(1:5, function(i) {
+  # base model 
+  f1 <- bf(shannon_s ~ age * condition + (1|skippy_id))
+  m1 <- brm(
+    data = implist[[i]],
+    formula = f1,
+    file = here::here(glue("data/m1_age_imp{i}.Rds"))
+  )
+  loo_m1 <- add_criterion(
+    m1,
+    "loo",
+    file = here::here(glue("data/loo_m1_age_imp{i}")),
+    moment_match = FALSE
+  )
+
+  f2 <- bf(glue("shannon_s ~ age * condition + (1 + age|skippy_id)"))
+  m2 <- brm(
+    data = implist[[i]],
+    formula = f2,
+    file = here::here(glue("data/m2_age_imp{i}_rftime.Rds"))
+  )
+
+  loo_m2 <- add_criterion(
+    m2,
+    "loo",
+    file = here::here(glue("data/loo_m2_age_imp{i}_rftime")),
+    moment_match = FALSE
+  )
+  lcomp <- loo_compare(loo_m2, loo_m1)
+  score <- ifelse(rownames(lcomp)[1] == "loo_m1", 0, 1)
+  tibble(
+    imp = i,
+    score = score
+  )
+})
+# results indicate that varying time effects arent necessary
+loo_comp
+
+
+
+
+
+# ITT analyses with MI 
+#############################################################################
+
+
+# start with posterior predictive checks and model diagnostics before we inter 
+# pret the results. In the end I will use the final models for interpretation.
+# because some loo estimates indicate outliers I will use robust regression.
+# Shannon is least prone to non-normality and yet we got some warnings while 
+# there is no disadvantage of using robust regression:
+
+# "inverse_simpson_s" cannot be modelled with student()
+indeces <- c("shannon_s", "chao1_s", "faith_s")
+ad_models <- map(indeces, function(index) {
+  f <- bf(glue("{index} ~ age_s * condition + siblings + (1|skippy_id)"))
+  m <- brm_multiple(
+    family = student(),
+    data = implist,
+    formula = f,
+    file = here::here(glue("data/m_age_{index}_bez.Rds"))
+  )
+})
+ad_models
+save(ad_models, file = here::here("data/ad_models.Rds"))
+# now for each model we perform posterior predictive checks and have a look 
+# at residuals 
+pp_checks <- map(ad_models, function(m) {
+  p <- pp_check(m)
+})
+pp_checks
+# load helper function to diagnose lms 
+source(here::here("R/ml_helper.R"))
+names(ad_models) <- indeces
+lm_diags <- map(indeces, function(index) {
+  lm_diag(ad_models[[index]], ad_models[[index]]$data, index, id = "skippy_id")
+})
+lm_diags
+
+# check also models split by time points (this only makes really a difference for siblings effect)
+ad_models2 <- map(c("2and5", "52"), function(weekchr) {
+  if (weekchr == "2and5") {
+    ad_models <- map(indeces, function(index) {
+      f <- bf(glue("{index} ~ age * condition + siblings + (1|skippy_id)"))
+      m <- brm_multiple(
+        family = student(),
+        data = map(implist, ~filter(.x, week != 52)),
+        formula = f,
+        file = here::here(glue("data/m_age_{index}_{weekchr}_bez.Rds"))
+      )
+    })
+  } else {
+    ad_models <- map(indeces, function(index) {
+      f <- bf(glue("{index} ~ condition + age + siblings"))
+      m <- brm_multiple(
+        family = student(),
+        data = map(implist, ~filter(.x, week == 52)),
+        formula = f,
+        file = here::here(glue("data/m_{index}_{weekchr}_bez.Rds"))
+      )
+    })
+  }
+  ad_models
+})
+ad_models2
+post <- posterior_samples(ad_models2[[1]][[1]])
+mean(post$b_siblings1 < 0)
+length(unlist(ad_models2, recursive = FALSE))
+# now for each model we perform posterior predictive checks and have a look 
+# at residuals 
+ad_models2 <- unlist(ad_models2, recursive = FALSE)
+
+length(ad_models2)
+pp_checks2 <- map(ad_models2, function(m) {
+  p <- pp_check(m)
+})
+pp_checks2
+
+
+lm_diags2 <- map(c("2and5", "52"), function(weekchr) {
+  if (weekchr == "2and5") {
+    lm_diags <- map(indeces, function(index) {
+      lm_diag(
+        ad_models2[1:4][[index]], 
+        filter(implist[[1]], week != 52), 
+        index, id = "skippy_id")
+    })
+  } else {
+    lm_diags <- map(indeces, function(index) {
+      lm_diag(
+        ad_models2[5:8][[index]], 
+        filter(implist[[1]], week== 52), 
+        index, id = "skippy_id")
+    })
+  }
+  lm_diags
+})
+lm_diags2[[1]][[1]]
+lm_diags2[[2]][[1]]
+
+
+
+
+# create a plot for AD 
+if (!is.factor(d$week)) {
+  d <-mutate(d, 
+        week = as.factor(week), condition = as.factor(condition),
+        group = ifelse(week == 2 & condition == 0, "W2 CAU", ifelse(
+          week == 2 & condition == 1, "W2 SSC", ifelse(
+            week == 5 & condition == 0, "W5 CAU", ifelse(
+              week == 5 & condition == 1, "W5 SSC", ifelse(
+                week == 52 & condition == 0, "W52 CAU", ifelse(
+                  week == 52 & condition == 1, "W52 SSC", NA)))))),
+        group = as.factor(group),
+        condition_label = ifelse(condition == 0, "CAU", ifelse(
+          condition == 1, "SSC", NA)),
+        week_label = glue::glue("Week {week}")
+        )
+}
+  d <-mutate(d, 
+        week = as.factor(week), condition = as.factor(condition),
+        group = ifelse(week == 2 & condition == 0, "W2 CAU", ifelse(
+          week == 2 & condition == 1, "W2 SSC", ifelse(
+            week == 5 & condition == 0, "W5 CAU", ifelse(
+              week == 5 & condition == 1, "W5 SSC", ifelse(
+                week == 52 & condition == 0, "W52 CAU", ifelse(
+                  week == 52 & condition == 1, "W52 SSC", NA)))))),
+        group = as.factor(group),
+        condition_label = ifelse(condition == 0, "CAU", ifelse(
+          condition == 1, "SSC", NA)),
+        week_label = glue::glue("Week {week}")
+        )
+# alternative 1
+adplots <- map(str_remove(indeces, "_s$"), function(index) {
+  d %>% 
+    ggplot(aes_string("group", index, fill = "condition")) +
+      geom_boxplot() +
+      #ggbeeswarm::geom_beeswarm(size = 3, cex = 1) +
+      geom_jitter(width = 0.1, size = 2, alpha = 0.7) +
+      # scale_fill_manual(values = c("#fc8d62", "#8da0cb")) +
+      scale_fill_manual(values = c("#ffffff", "#c0c1c2")) +
+      theme_bw(base_size = 25) +
+      theme(legend.position = "none") +
+      xlab("") + ylab(str_to_title(index))
+})
+# alternative 2
+adplots <- map(str_remove(indeces, "_s$"), function(index) {
+  d %>%
+    ggplot(aes_string("condition_label", index, fill = "condition_label")) +
+      geom_boxplot() +
+      #ggbeeswarm::geom_beeswarm(size = 3, cex = 1) +
+      geom_jitter(width = 0.1, size = 2, alpha = 0.7) +
+      # scale_fill_manual(values = c("#fc8d62", "#8da0cb")) +
+      scale_fill_manual(values = c("#ffffff", "#c0c1c2")) +
+      facet_wrap(~week_label, strip.position = "bottom") +
+      #scale_fill_manual(values = c("#fc8d62", "#8da0cb")) +
+      theme_bw(base_size = 25) +
+      theme(
+        legend.position = "none",
+        strip.placement = "outside",
+        strip.background = element_blank()) +
+      xlab("") + ylab(str_to_title(index))
+})
+
+
+
+save(adplots, file = here::here("data/adplots.Rds"))
+
+
+
+
+# ITT analyses complete case 
+#############################################################################
+if (!"shannon_s" %in% colnames(d)) {
+  d <- mutate(d, 
+   age_s = scale(age)[, 1],
+   chao1_s = scale(chao1)[, 1],
+   faith_s = scale(faith)[, 1],
+   inverse_simpson_s = scale(inverse_simpson)[, 1],
+   shannon_s = scale(shannon)[, 1],
+   skippy_id = as.integer(skippy_id),
+   siblings = as.factor(siblings)
+  )
+}
+
+
+ad_models_cc <- map(c("2and5", "52"), function(weekchr) {
+  if (weekchr == "2and5") {
+    ad_models <- map(indeces, function(index) {
+      f <- bf(glue("{index} ~ age * condition + siblings + (1|skippy_id)"))
+      m <- brm(
+        family = student(),
+        data = filter(d, week != 52) %>% 
+                              mutate(age = scale(age)[, 1]),
+        formula = f,
+        file = here::here(glue("data/m_age_{index}_{weekchr}_cc_bez.Rds"))
+      )
+    })
+  } else {
+    ad_models <- map(indeces, function(index) {
+      f <- bf(glue("{index} ~ condition + age + siblings"))
+      m <- brm(
+        family = studetnt(),
+        data = filter(d, week == 52) %>% 
+                              mutate(age = scale(age)[, 1]),
+        formula = f,
+        file = here::here(glue("data/m_age_{index}_{weekchr}_cc_bez.Rds"))
+      )
+    })
+  }
+  ad_models
+})
+parameters <- c(
+  "b_age", 
+  "b_condition1",
+  "b_siblings1",
+  "b_age:condition1"
+)
+
+
+
+tbs_cc <- map2_dfr(c("2and5", "52"), ad_models_cc, function(weekchr, models) {
+  if (weekchr == "2and5") {
+    tb <- map2_dfr(indeces, models, function(index, model) {
+      summarise_posterior(model, parameters, 2) %>%
+      mutate(model = weekchr, index = index, indicator = (lower <=0 & upper <=0) | (lower >=0 & upper >=0))
+    })
+  } else {
+    tb <- map2_dfr(indeces, models, function(index, model) {
+      summarise_posterior(model, parameters[-length(parameters)], 2) %>%
+      mutate(model = weekchr, index = index, indicator = (lower <=0 & upper <=0) | (lower >=0 & upper >=0))
+    })
+  }
+})
+filter(tbs_cc, indicator)
+
+filter(tbs_cc, indicator)
+
+
+
+# results do not differ meaningully between complete case analyses and mi
+# for inverse simpson the models are not a good fit in either case. I ran 
+# wilcoxon tests here:
+filter(d, group %in% c("W52 CAU", "W52 SSC")) %>%
+  mutate(group = fct_drop(group)) %>%
+  wilcox.test(inverse_simpson ~ group, data = .)
+
+
+
+
+# PP analyses with MI 
+#############################################################################
+
+# obtain ids that were selected for PP analyses 
+pp_indicator <- foreign::read.spss(here::here("data/raw_data/kelly141022/Data_ITT_PP_ExploratoryDRselections.sav"), to.data.frame = TRUE)
+count(pp_indicator, PP, SSC)
+pp_indicator <- select(pp_indicator, skippy_id = ID, pp = PP)
+# add pp info to existing data 
+if (!"pp" %in% colnames(d)) {
+  d <- left_join(d, pp_indicator, by = "skippy_id")
+}
+# 60 that are in PP and condition 0; 18 that are condition 1 and pp. Fits...
+count(d, condition, pp)
+d_pp <- filter(d, pp == 1)
+implist_pp <- map(implist, function(dimp) {
+  dimp_new <- left_join(dimp, pp_indicator, by = "skippy_id") %>%
+                filter(pp == 1)
+  dimp_new
+})
+
+
+covariates <- c(
+  "siblings", 
+  "birthweight_s", 
+  "ges_age_s", 
+  "edlevel_s", 
+  "csection", 
+  "sex"
+)
+
+model_str <- "age_s * condition"
+for (coef in covariates) {
+  model_str <- glue("{model_str} + {coef}")
+}
+
+# "inverse_simpson_s" cannot be modelled with student()
+indeces <- c("shannon_s", "chao1_s", "faith_s")
+ad_models_pp <- map(indeces, function(index) {
+  f <- bf(glue("{index} ~ {model_str} + (1|skippy_id)"))
+  m <- brm_multiple(
+    family = student(),
+    data = implist_pp,
+    formula = f,
+    file = here::here(glue("data/m_age_{index}_pp_bez.Rds"))
+  )
+})
+ad_models_pp
+
+indeces <- c("shannon_s", "inverse_simpson_s", "chao1_s", "faith_s")
+ad_models_pp <- map(c("2and5", "52"), function(weekchr) {
+  if (weekchr == "2and5") {
+    ad_models <- map(indeces, function(index) {
+      f <- bf(glue("{index} ~ {model_str} + (1|skippy_id)"))
+      m <- brm_multiple(
+        family = student(),
+        data = map(implist_pp, ~filter(.x, week != 52) %>% 
+                              mutate(age = scale(age)[, 1])),
+        formula = f,
+        file = here::here(glue("data/m_age_{index}_{weekchr}_sn_pp_bez.Rds"))
+      )
+    })
+  } else {
+    ad_models <- map(indeces, function(index) {
+      f <- bf(glue("{index} ~ {model_str}"))
+      if (index %in% c("chao1_s", "inverse_simpon_s")) {
+        family <- skew_normal()
+      } else {
+        family <- student()
+      }
+      m <- brm_multiple(
+        family = family,
+        data = map(implist_pp, ~filter(.x, week == 52) %>% 
+                              mutate(age = scale(age)[, 1])),
+        formula = f,
+        file = here::here(glue("data/m_age_{index}_{weekchr}_sn_pp_bez.Rds"))
+      )
+    })
+  }
+  ad_models
+})
+
+
+
+pp_checks_pp <- map(ad_models_pp, function(m) {
+  p <- pp_check(m)
+})
+pp_checks_pp
+
+tbs_pp <- map2_dfr(c("2and5", "52"), ad_models_pp, function(weekchr, models) {
+  if (weekchr == "2and5") {
+    tb <- map2_dfr(indeces, models, function(index, model) {
+      summarise_posterior(model, parameters, 2) %>%
+      mutate(model = weekchr, index = index, indicator = (lower <=0 & upper <=0) | (lower >=0 & upper >=0))
+    })
+  } else {
+    tb <- map2_dfr(indeces, models, function(index, model) {
+      summarise_posterior(model, parameters[-length(parameters)], 2) %>%
+      mutate(model = weekchr, index = index, indicator = (lower <=0 & upper <=0) | (lower >=0 & upper >=0))
+    })
+  }
+})
+filter(tbs_pp, indicator)
+
+
+# results are similar to the ITT analyses. No effects on alpha diversity 
+
+
+# create a supplementary table with all beta coefficients
+
+ad_stbl <- map2_dfr(c("2and5", "52"), ad_models_pp, function(weekchr, models) {
+  if (weekchr == "2and5") {
+    tb <- map2_dfr(indeces, models, function(index, model) {
+      summarise_posterior(model, contains("b_"), 3) %>%
+      mutate(model = weekchr, index = index, indicator = (lower <=0 & upper <=0) | (lower >=0 & upper >=0))
+    })
+  } else {
+    tb <- map2_dfr(indeces, models, function(index, model) {
+      summarise_posterior(model, contains("b_"), 3) %>%
+      mutate(model = weekchr, index = index, indicator = (lower <=0 & upper <=0) | (lower >=0 & upper >=0))
+    })
+  }
+})
+ad_stbl <- group_by(ad_stbl, model, index) %>% nest()
+
+ad_stbl_pp <- pmap(list(ad_stbl[[1]], ad_stbl[[2]], ad_stbl[[3]]), function(time, index, tbl){
+  time_t <- ifelse(time == "2and5", "week 2 and 5", "1 year")
+  index_t <- str_to_title(str_remove(index, "_s$"))
+  caption <- glue("Alpha diversity model using samples obtained at {time_t} and {index_t}")
+  knitr::kable(tbl, caption = caption)
+})
+
+ad_stbl_pp
+
+
+
+
+
+
+
+
+
+
diff --git a/R/beta_diversity_ait.R b/R/beta_diversity_ait.R
new file mode 100644
index 0000000..5fc6b74
--- /dev/null
+++ b/R/beta_diversity_ait.R
@@ -0,0 +1,1223 @@
+# thanks to Gavin for helping out with this part:
+# https://stats.stackexchange.com/questions/590510/repeated-measures-permanova-nowhere-to-find
+
+set.seed(1)
+library(mia)
+library(tidyverse)
+library(tidySummarizedExperiment)
+library(vegan)
+library(permute)
+library(glue)
+
+
+# import of biomfile and meta data can be found in the import script
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+# add metadata to tse
+colData(tse) <- colData(tse) %>%
+  as.data.frame() %>%
+  left_join(select(d, age, sample_id), by = "sample_id") %>%
+  column_to_rownames("sample_id") %>%
+  DataFrame()
+
+colData(tse)$age <- colData(tse)$age + as.numeric(as.character(colData(tse)$week)) * 7
+colData(tse)$age_s <- scale(colData(tse)$age)[, 1]
+
+tse <- agglomerateByRank(tse, rank = "genus")
+
+###############################################################################
+#########################           1. ITT       ##############################
+###############################################################################
+
+
+####################### 1.1 Complete Case Analysis ############################
+
+### First I fit a model to all samples 
+
+# we use Aitchison distance
+tse <- transformSamples(x = tse, method = "clr", pseudocount = 1, name = "clr")
+# extract relevant meta data and omit na as adonis doesnt accept them.
+meta <- colData(tse) %>% as.data.frame() %>%
+          rownames_to_column("sample_id") %>%
+          select(sample_id, skippy_id, condition, age_s, siblings) %>%
+                  na.omit()
+# we need to account for non-independence of data in the infancy model 
+ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+  .$skippy_id
+h <- how(plots = Plots(strata = ids, type = "none"),
+           nperm = 999)
+# according to omitted NAs I need to select stool samples 
+asv <- t(assay(tse, "clr"))
+asv <- asv[meta$sample_id, ]
+
+# fit and inspect model 
+permanova <- adonis2(asv ~ condition + age_s + condition:age_s,
+                     # by = "margin", # each term analyzed individually
+                     data = meta,
+                     method = "euclidean",
+                     # h does not work if trend is in data (therefore use 999), see Gavins post
+                     permutations = 999
+                   )
+
+permanova
+
+
+
+# Perform dbRDA
+dbrda <- dbrda(asv ~ age_s + age_s*condition, data = meta)
+# Perform permutational analysis
+permanova2 <- anova.cca(dbrda,
+                        by = "margin", # each term analyzed individually
+                        method = "euclidean",
+                        permutations = 999)
+permanova2
+
+
+
+### Now split models by infancy and 1 year olds 
+
+## first infancy
+tse_inf <- filter(tse, week != 52)
+# extract relevant meta data and omit na as adonis doesnt accept them.
+meta <- colData(tse_inf) %>% as.data.frame() %>%
+          rownames_to_column("sample_id") %>%
+          select(sample_id, skippy_id, condition, age_s, siblings) %>%
+                  na.omit()
+# we need to account for non-independence of data in the infancy model 
+ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+  .$skippy_id
+h <- how(plots = Plots(strata = ids, type = "none"),
+           nperm = 999)
+# according to omitted NAs I need to select stool samples 
+asv <- t(assay(tse_inf, "clr"))
+asv <- asv[meta$sample_id, ]
+
+# fit and inspect model 
+permanova <- adonis2(asv ~ condition + age_s + condition:age_s,
+                     # by = "margin", # each term analyzed individually
+                     data = meta,
+                     method = "euclidean",
+                     permutations = 999
+                   )
+
+permanova
+
+# Perform dbRDA
+dbrda <- dbrda(asv ~ condition + age_s, data = meta)
+# Perform permutational analysis
+permanova2 <- anova.cca(dbrda,
+                        by = "margin", # each term analyzed individually
+                        method = "euclidean",
+                        permutations = 999)
+permanova2
+
+
+
+
+## then 1 year olds 
+tse_y <- filter(tse, week == 52)
+# extract relevant meta data and omit na as adonis doesnt accept them.
+meta <- colData(tse_y) %>% as.data.frame() %>%
+          rownames_to_column("sample_id") %>%
+          select(sample_id, skippy_id, condition, age_s, siblings) %>%
+                  na.omit()
+
+# according to omitted NAs I need to select stool samples 
+asv <- t(assay(tse_y, "clr"))
+asv <- asv[meta$sample_id, ]
+
+# fit and inspect model 
+permanova <- adonis2(asv ~ condition + age_s,
+                     # by = "margin", # each term analyzed individually
+                     data = meta,
+                     method = "euclidean",
+                     permutations = 999
+                   )
+
+permanova
+
+# Perform dbRDA
+dbrda <- dbrda(asv ~ condition + age_s, data = meta)
+# Perform permutational analysis
+permanova2 <- anova.cca(dbrda,
+                        by = "margin", # each term analyzed individually
+                        method = "euclidean",
+                        permutations = 999)
+permanova2
+
+
+
+
+######################## 1. 2Multiple imputation  #############################
+
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+
+
+permanovas <- map2_dfr(implist, 1:length(implist), function(dimp, imp) {
+  # Steps are repeated as from the beginning in the script above
+  
+  # step 1
+  # we use Aitchison distance
+  tse <- agglomerateByRank(tse, rank = "genus")
+  tse_map <- transformSamples(x = tse, method = "clr", pseudocount = 1, name = "clr")
+  colData(tse_map) <- colData(tse_map) %>%
+    as.data.frame() %>%
+    select(sample_id) %>%
+    left_join(
+      select(dimp, condition, siblings, age, sample_id, week, skippy_id), 
+      by = "sample_id") %>%
+    column_to_rownames("sample_id") %>%
+    mutate(
+      age = age + as.numeric(as.character(week)),
+      age_s = scale(age)[, 1]
+    ) %>%
+    DataFrame()
+    
+
+    # step 2
+
+    ## first all samples
+    # extract relevant meta data and omit na as adonis doesnt accept them.
+    meta <- colData(tse_map) %>% as.data.frame() %>%
+              rownames_to_column("sample_id") %>%
+              select(sample_id, skippy_id, condition, age_s, siblings) %>%
+                      na.omit()
+    # we need to account for non-independence of data in the infancy model 
+    ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+      .$skippy_id
+    h <- how(plots = Plots(strata = ids, type = "none"),
+               nperm = 999)
+    # according to omitted NAs I need to select stool samples 
+    asv <- t(assay(tse_map, "clr"))
+    asv <- asv[meta$sample_id, ]
+
+    # fit and inspect model 
+    permanova_all <- adonis2(asv ~ condition + age_s + age_s:condition,
+                         # by = "margin", # each term analyzed individually
+                         data = meta,
+                         method = "euclidean",
+                         permutations = 999
+                       )
+
+
+    ## first infancy
+    tse_inf <- filter(tse_map, week != 52)
+    # extract relevant meta data and omit na as adonis doesnt accept them.
+    meta <- colData(tse_inf) %>% as.data.frame() %>%
+              rownames_to_column("sample_id") %>%
+              select(sample_id, skippy_id, condition, age_s, siblings) %>%
+                      na.omit()
+    # we need to account for non-independence of data in the infancy model 
+    ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+      .$skippy_id
+    h <- how(plots = Plots(strata = ids, type = "none"),
+               nperm = 999)
+    # according to omitted NAs I need to select stool samples 
+    asv <- t(assay(tse_inf, "clr"))
+    asv <- asv[meta$sample_id, ]
+
+    # fit and inspect model 
+    permanova_inf <- adonis2(asv ~ condition + age_s + condition:age_s,
+                         # by = "margin", # each term analyzed individually
+                         data = meta,
+                         method = "euclidean",
+                         permutations = 999
+                       )
+
+
+    ## then 1 year olds 
+    tse_y <- filter(tse_map, week == 52)
+    # extract relevant meta data and omit na as adonis doesnt accept them.
+    meta <- colData(tse_y) %>% as.data.frame() %>%
+              rownames_to_column("sample_id") %>%
+              select(sample_id, skippy_id, condition, age_s, siblings) %>%
+                      na.omit()
+
+    # according to omitted NAs I need to select stool samples 
+    asv <- t(assay(tse_y, "clr"))
+    asv <- asv[meta$sample_id, ]
+
+    # fit and inspect model 
+    permanova_y <- adonis2(asv ~ condition + age_s,
+                         # by = "margin", # each term analyzed individually
+                         data = meta,
+                         method = "euclidean",
+                         permutations = 999
+                       )
+    permanova_all$time <- "all"
+    permanova_inf$time <- "infancy"
+    permanova_y$time <- "year1"
+    permanova <- bind_rows(
+      as.data.frame(permanova_all) %>% rownames_to_column("parameter"), 
+      as.data.frame(permanova_inf) %>% rownames_to_column("parameter"), 
+      as.data.frame(permanova_y) %>% rownames_to_column("parameter")
+      )
+    permanova$imp <- imp 
+    
+    #list(infancy = permanova_inf, year1 = permanova_y, imp = imp)
+    permanova
+})
+permanovas
+
+
+
+
+######################## 1. 2 WITH BREASTFEEDING  #############################
+
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+
+permanovas_bf <- map2_dfr(implist, 1:length(implist), function(dimp, imp) {
+  # Steps are repeated as from the beginning in the script above
+  
+  # step 1
+  # we use Aitchison distance
+  tse <- agglomerateByRank(tse, rank = "genus")
+  tse_map <- transformSamples(x = tse, method = "clr", pseudocount = 1, name = "clr")
+  colData(tse_map) <- colData(tse_map) %>%
+    as.data.frame() %>%
+    select(sample_id) %>%
+    left_join(
+      select(
+        dimp, condition, siblings, age, sample_id, 
+        week, skippy_id, bfexcl), 
+      by = "sample_id") %>%
+    column_to_rownames("sample_id") %>%
+    mutate(
+      age = age + as.numeric(as.character(week)),
+      age_s = scale(age)[, 1]
+    ) %>%
+    DataFrame()
+    
+
+    # step 2
+
+    ## first all samples
+    # extract relevant meta data and omit na as adonis doesnt accept them.
+    meta <- colData(tse_map) %>% as.data.frame() %>%
+              rownames_to_column("sample_id") %>%
+              select(sample_id, skippy_id, condition, age_s, siblings, bfexcl) %>%
+                      na.omit()
+    # we need to account for non-independence of data in the infancy model 
+    ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+      .$skippy_id
+    h <- how(plots = Plots(strata = ids, type = "none"),
+               nperm = 999)
+    # according to omitted NAs I need to select stool samples 
+    asv <- t(assay(tse_map, "clr"))
+    asv <- asv[meta$sample_id, ]
+
+    # fit and inspect model 
+    permanova_all <- adonis2(asv ~ bfexcl + condition + age_s + condition:age_s,
+                         # by = "margin", # each term analyzed individually
+                         data = meta,
+                         method = "euclidean",
+                         permutations = 999,
+                         # by = "margin"
+                       )
+
+
+    ## first infancy
+    tse_inf <- filter(tse_map, week != 52)
+    # extract relevant meta data and omit na as adonis doesnt accept them.
+    meta <- colData(tse_inf) %>% as.data.frame() %>%
+              rownames_to_column("sample_id") %>%
+              select(sample_id, skippy_id, condition, age_s, siblings, bfexcl) %>%
+                      na.omit()
+    # we need to account for non-independence of data in the infancy model 
+    ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+      .$skippy_id
+    h <- how(plots = Plots(strata = ids, type = "none"),
+               nperm = 999)
+    # according to omitted NAs I need to select stool samples 
+    asv <- t(assay(tse_inf, "clr"))
+    asv <- asv[meta$sample_id, ]
+
+    # fit and inspect model 
+    permanova_inf <- adonis2(asv ~ bfexcl + condition + age_s + condition:age_s,
+                         # by = "margin", # each term analyzed individually
+                         data = meta,
+                         method = "euclidean",
+                         permutations = 999
+                       )
+
+
+    ## then 1 year olds 
+    tse_y <- filter(tse_map, week == 52)
+    # extract relevant meta data and omit na as adonis doesnt accept them.
+    meta <- colData(tse_y) %>% as.data.frame() %>%
+              rownames_to_column("sample_id") %>%
+              select(sample_id, skippy_id, condition, age_s, siblings, bfexcl) %>%
+                      na.omit()
+
+    # according to omitted NAs I need to select stool samples 
+    asv <- t(assay(tse_y, "clr"))
+    asv <- asv[meta$sample_id, ]
+
+    # fit and inspect model 
+    permanova_y <- adonis2(asv ~ bfexcl + condition + age_s,
+                         # by = "margin", # each term analyzed individually
+                         data = meta,
+                         method = "euclidean",
+                         permutations = 999
+                       )
+    permanova_all$time <- "all"
+    permanova_inf$time <- "infancy"
+    permanova_y$time <- "year1"
+    permanova <- bind_rows(
+      as.data.frame(permanova_all) %>% rownames_to_column("parameter"), 
+      as.data.frame(permanova_inf) %>% rownames_to_column("parameter"), 
+      as.data.frame(permanova_y) %>% rownames_to_column("parameter")
+      )
+    permanova$imp <- imp 
+
+    #list(all = permanova_all, infancy = permanova_inf, year1 = permanova_y, imp = imp)
+    permanova
+})
+permanovas_bf
+
+
+
+
+
+
+
+
+
+
+###############################################################################
+#########################         2. PP          ##############################
+###############################################################################
+
+# import of biomfile and meta data can be found in the import script
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+tse <- agglomerateByRank(tse, rank = "genus")
+
+# obtain ids that were selected for PP analyses 
+pp_indicator <- foreign::read.spss(here::here("data/raw_data/kelly141022/Data_ITT_PP_ExploratoryDRselections.sav"), to.data.frame = TRUE)
+count(pp_indicator, PP, SSC)
+pp_indicator <- select(pp_indicator, skippy_id = ID, pp = PP)
+# add pp info to existing data 
+if (!"pp" %in% colnames(d)) {
+  d <- left_join(d, pp_indicator, by = "skippy_id") %>%
+    mutate(
+      birthweight_s = scale(birthweight)[, 1],
+      ges_age_s = scale(ges_age)[, 1],
+      edlevel_s = scale(edlevel)[, 1],
+    )
+}
+# 60 that are in PP and condition 0; 18 that are condition 1 and pp. Fits...
+count(d, condition, pp)
+d_pp <- filter(d, pp == 1)
+implist_pp <- map(implist, function(dimp) {
+  dimp_new <- left_join(dimp, pp_indicator, by = "skippy_id") %>%
+                filter(pp == 1)
+  dimp_new
+})
+
+# add metadata to tse
+colData(tse) <- colData(tse) %>%
+  as.data.frame() %>%
+  select(sample_id) %>%
+  left_join(select(d_pp, skippy_id, siblings, week, age, condition, birthweight_s, ges_age_s, edlevel_s, csection, sex, sample_id, pp), by = "sample_id") %>%
+  column_to_rownames("sample_id") %>%
+  DataFrame()
+colnames(colData(tse))
+colData(tse)$age <- colData(tse)$age + as.numeric(as.character(colData(tse)$week)) * 7
+colData(tse)$age_s <- scale(colData(tse)$age)[, 1]
+tse_pp <- filter(tse, pp == 1)
+
+colData(tse_pp)$csection
+####################### 2.1 Complete Case Analysis ############################
+
+### First I fit a model to all samples 
+
+# we use Aitchison distance
+tse_pp <- transformSamples(x = tse_pp, method = "clr", pseudocount = 1, name = "clr")
+# extract relevant meta data and omit na as adonis doesnt accept them.
+meta <- colData(tse_pp) %>% as.data.frame() %>%
+          rownames_to_column("sample_id") %>%
+          select(sample_id, skippy_id, condition, age_s, siblings, birthweight_s, ges_age_s,
+                  edlevel_s, csection, sex) %>%
+                  na.omit()
+# we need to account for non-independence of data in the infancy model 
+ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+  .$skippy_id
+h <- how(plots = Plots(strata = ids, type = "none"),
+           nperm = 999)
+# according to omitted NAs I need to select stool samples 
+asv <- t(assay(tse_pp, "clr"))
+asv <- asv[meta$sample_id, ]
+
+
+# fit and inspect model 
+permanova <- adonis2(asv ~ condition + age_s + condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings,
+                     # by = "margin", # each term analyzed individually
+                     data = meta,
+                     method = "euclidean",
+                     # does not work if trend is in data (therefore use 999)
+                     permutations = 999
+                   )
+
+permanova
+
+# Perform dbRDA
+dbrda <- dbrda(asv ~ condition + age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings, data = meta)
+# Perform permutational analysis
+permanova2 <- anova.cca(dbrda,
+                        by = "margin", # each term analyzed individually
+                        method = "euclidean",
+                        permutations = 999)
+permanova2
+
+
+
+### Now split models by infancy and 1 year olds 
+
+## first infancy
+tse_inf <- filter(tse_pp, week != 52)
+# extract relevant meta data and omit na as adonis doesnt accept them.
+meta <- colData(tse_inf) %>% as.data.frame() %>%
+          rownames_to_column("sample_id") %>%
+          select(sample_id, skippy_id, condition, age_s, siblings, birthweight_s, ges_age_s,
+                  edlevel_s, csection, sex) %>%
+                  na.omit()
+# we need to account for non-independence of data in the infancy model 
+ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+  .$skippy_id
+h <- how(plots = Plots(strata = ids, type = "none"),
+           nperm = 999)
+# according to omitted NAs I need to select stool samples 
+asv <- t(assay(tse_inf, "clr"))
+asv <- asv[meta$sample_id, ]
+
+# fit and inspect model 
+permanova <- adonis2(asv ~ condition + age_s * condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings,
+                     # by = "margin", # each term analyzed individually
+                     data = meta,
+                     method = "euclidean",
+                     permutations = 999
+                   )
+
+permanova
+
+# Perform dbRDA
+dbrda <- dbrda(asv ~ condition + age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings, data = meta)
+# Perform permutational analysis
+permanova2 <- anova.cca(dbrda,
+                        by = "margin", # each term analyzed individually
+                        method = "euclidean",
+                        permutations = 999)
+permanova2
+
+
+
+
+## then 1 year olds 
+tse_y <- filter(tse_pp, week == 52)
+# extract relevant meta data and omit na as adonis doesnt accept them.
+meta <- colData(tse_y) %>% as.data.frame() %>%
+          rownames_to_column("sample_id") %>%
+          select(sample_id, skippy_id, condition, age_s, siblings, birthweight_s, ges_age_s,
+                  edlevel_s, csection, sex) %>%
+                  na.omit()
+
+# according to omitted NAs I need to select stool samples 
+asv <- t(assay(tse_y, "clr"))
+asv <- asv[meta$sample_id, ]
+
+# fit and inspect model 
+permanova <- adonis2(asv ~ condition + age_s + condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings,
+                     # by = "margin", # each term analyzed individually
+                     data = meta,
+                     method = "euclidean",
+                     permutations = 999
+                   )
+
+permanova
+
+# Perform dbRDA
+dbrda <- dbrda(asv ~ condition + age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings, data = meta)
+# Perform permutational analysis
+permanova2 <- anova.cca(dbrda,
+                        by = "margin", # each term analyzed individually
+                        method = "euclidean",
+                        permutations = 999)
+permanova2
+
+# same as above
+
+
+######################## 2.2 Multiple imputation  #############################
+
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+tse <- agglomerateByRank(tse, rank = "genus")
+permanovas_pp <- map2_dfr(implist_pp, 1:length(implist_pp), function(dimp, imp) {
+  # Steps are repeated as from the beginning in the script above
+  
+  # step 1
+  # we use Aitchison distance
+  tse_map <- transformSamples(x = tse, method = "clr", pseudocount = 1, name = "clr")
+  colData(tse_map) <- colData(tse_map) %>%
+    as.data.frame() %>%
+    select(sample_id) %>%
+    left_join(
+      select(dimp, condition, siblings, age, birthweight_s, ges_age_s, edlevel_s, csection, sex, sample_id, week, skippy_id, pp), 
+      by = "sample_id") %>%
+    column_to_rownames("sample_id") %>%
+    mutate(
+      age = age + as.numeric(as.character(week)),
+      age_s = scale(age)[, 1]
+    ) %>%
+    DataFrame()
+    
+    tse_map <- filter(tse_map, pp == 1)
+    
+    
+    # step 2
+    ## first all samples
+    # extract relevant meta data and omit na as adonis doesnt accept them.
+    meta <- colData(tse_map) %>% as.data.frame() %>%
+              rownames_to_column("sample_id") %>%
+              select(sample_id, skippy_id, condition, age_s, siblings, birthweight_s, ges_age_s,
+                      edlevel_s, csection, sex) %>%
+                      na.omit()
+    # we need to account for non-independence of data in the infancy model 
+    ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+      .$skippy_id
+    h <- how(plots = Plots(strata = ids, type = "none"),
+               nperm = 999)
+    # according to omitted NAs I need to select stool samples 
+    asv <- t(assay(tse_map, "clr"))
+    asv <- asv[meta$sample_id, ]
+
+    # fit and inspect model 
+    permanova_all <- adonis2(asv ~ condition + age_s + condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings,
+                         # by = "margin", # each term analyzed individually
+                         data = meta,
+                         method = "euclidean",
+                         permutations = 999
+                       )
+
+    ## then infancy
+    tse_inf <- filter(tse_map, week != 52)
+    # extract relevant meta data and omit na as adonis doesnt accept them.
+    meta <- colData(tse_inf) %>% as.data.frame() %>%
+              rownames_to_column("sample_id") %>%
+              select(sample_id, skippy_id, condition, age_s, siblings, birthweight_s, ges_age_s, 
+                      edlevel_s, csection, sex) %>%
+                      na.omit()
+    # we need to account for non-independence of data in the infancy model 
+    ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+      .$skippy_id
+    h <- how(plots = Plots(strata = ids, type = "none"),
+               nperm = 999)
+    # according to omitted NAs I need to select stool samples 
+    asv <- t(assay(tse_inf, "clr"))
+    asv <- asv[meta$sample_id, ]
+
+    # fit and inspect model 
+    permanova_inf <- adonis2(asv ~ condition + age_s + condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings,
+                         # by = "margin", # each term analyzed individually
+                         data = meta,
+                         method = "euclidean",
+                         permutations = 999
+                       )
+
+
+    ## then 1 year olds 
+    tse_y <- filter(tse_map, week == 52)
+    # extract relevant meta data and omit na as adonis doesnt accept them.
+    meta <- colData(tse_y) %>% as.data.frame() %>%
+              rownames_to_column("sample_id") %>%
+              select(sample_id, skippy_id, condition, age_s, siblings, birthweight_s, ges_age_s,
+                      edlevel_s, csection, sex) %>%
+                      na.omit()
+
+    # according to omitted NAs I need to select stool samples 
+    asv <- t(assay(tse_y, "clr"))
+    asv <- asv[meta$sample_id, ]
+
+    # fit and inspect model 
+    permanova_y <- adonis2(asv ~ condition + age_s + condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings,
+                         # by = "margin", # each term analyzed individually
+                         data = meta,
+                         method = "euclidean",
+                         permutations = 999
+                       )
+    permanova_all$time <- "all"
+    permanova_inf$time <- "infancy"
+    permanova_y$time <- "year1"
+    permanova <- bind_rows(
+      as.data.frame(permanova_all) %>% rownames_to_column("parameter"), 
+      as.data.frame(permanova_inf) %>% rownames_to_column("parameter"), 
+      as.data.frame(permanova_y) %>% rownames_to_column("parameter")
+      )
+    permanova$imp <- imp 
+    permanova
+    #list(infancy = permanova_inf, year1 = permanova_y, imp = imp)
+})
+
+permanovas_pp
+
+# same as above
+
+
+
+
+
+
+##############################################################################
+#################################PLOT & TABLE ################################
+##############################################################################
+
+
+# for the tables I use first imputed dataset of ITT analyses
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+tse <- agglomerateByRank(tse, rank = "genus")
+
+tse_plot <- transformSamples(x = tse, method = "clr", pseudocount = 1, name = "clr")
+colData(tse_plot) <- colData(tse_plot) %>%
+  as.data.frame() %>%
+  select(sample_id) %>%
+  left_join(
+    select(
+      implist[[1]], 
+      condition, siblings, age,
+      sample_id, week, skippy_id, bfexcl), 
+    by = "sample_id") %>%
+  column_to_rownames("sample_id") %>%
+  mutate(
+    age = age + as.numeric(as.character(week)),
+    age_s = scale(age)[, 1]
+  ) %>%
+  DataFrame()
+  
+
+
+## first all samples
+# extract relevant meta data and omit na as adonis doesnt accept them.
+meta_plot <- colData(tse_plot) %>% as.data.frame() %>%
+          rownames_to_column("sample_id") %>%
+          select(sample_id, skippy_id, condition, age_s, siblings, bfexcl) %>%
+                  na.omit()
+# we need to account for non-independence of data in the infancy model 
+ids <- meta_plot %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+  .$skippy_id
+h <- how(plots = Plots(strata = ids, type = "none"),
+            nperm = 999)
+# according to omitted NAs I need to select stool samples 
+asv <- t(assay(tse_plot, "clr"))
+asv <- asv[meta_plot$sample_id, ]
+
+# fit and inspect model 
+permanova_all <- adonis2(asv ~ condition * age_s,
+                      # by = "margin", # each term analyzed individually
+                      data = meta_plot,
+                      method = "euclidean",
+                      permutations = 999
+                    )
+beta_table <- permanova_all %>% 
+  as.data.frame() %>%
+  rownames_to_column("parameter") %>%
+  mutate(
+    `R2 (%)` = glue("{round(R2 * 100, 2)}"),
+    P = as.character(`Pr(>F)`),
+    P = str_remove(P, "^0"),
+    parameter = str_to_title(parameter),
+    parameter = ifelse(parameter == "Condition", "SSC", ifelse(
+      parameter == "Age_s", "Age", ifelse(
+        parameter == "Condition:age_s", "SSC x Age", parameter
+      )))
+    ) %>%
+  mutate(across(where(is.numeric), round, 2)) %>%
+  mutate_all(function(x) ifelse(is.na(x), "", x)) %>%
+  select(parameter, Df, SumOfSqs, "R2 (%)", F, P)
+
+colnames(beta_table) <- str_to_title(colnames(beta_table))
+
+# fit and inspect model 
+permanova_all <- adonis2(
+  asv ~ bfexcl + condition * age_s,
+  # by = "margin", # each term analyzed individually
+  data = meta_plot,
+  method = "euclidean",
+  permutations = 999)
+
+beta_table_bf <- permanova_all %>%
+  as.data.frame() %>%
+  rownames_to_column("parameter") %>%
+  mutate(
+    `R2 (%)` = glue("{round(R2 * 100, 2)}"),
+    P = as.character(`Pr(>F)`),
+    P = str_remove(P, "^0"),
+    parameter = str_to_title(parameter),
+    parameter = ifelse(parameter == "Condition", "SSC", ifelse(
+      parameter == "Age_s", "Age", ifelse(
+        parameter == "Condition:age_s", "SSC x Age", ifelse(
+          parameter == "Bfexcl", "Breastfeeding", parameter
+      ))))
+    ) %>%
+  mutate(across(where(is.numeric), round, 2)) %>%
+  mutate_all(function(x) ifelse(is.na(x), "", x)) %>%
+  select(parameter, Df, SumOfSqs, "R2 (%)", F, P)
+
+colnames(beta_table_bf) <- str_to_title(colnames(beta_table_bf))
+
+beta_table
+beta_table_bf
+
+
+
+
+
+
+
+
+# same but split by infants and year1 
+
+# for the tables I use first imputed dataset of ITT analyses
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+tse <- agglomerateByRank(tse, rank = "genus")
+tse_inf <- filter(tse, week != 52)
+
+tse_plot <- transformSamples(x = tse_inf, method = "clr", pseudocount = 1, name = "clr")
+colData(tse_plot) <- colData(tse_plot) %>%
+  as.data.frame() %>%
+  select(sample_id) %>%
+  left_join(
+    select(
+      implist[[1]], 
+      condition, siblings, age,
+      sample_id, week, skippy_id, bfexcl), 
+    by = "sample_id") %>%
+  column_to_rownames("sample_id") %>%
+  mutate(
+    age = age + as.numeric(as.character(week)),
+    age_s = scale(age)[, 1]
+  ) %>%
+  DataFrame()
+  
+
+# step 2
+
+# extract relevant meta data and omit na as adonis doesnt accept them.
+meta_plot <- colData(tse_plot) %>% as.data.frame() %>%
+          rownames_to_column("sample_id") %>%
+          select(sample_id, skippy_id, condition, age_s, siblings, bfexcl) %>%
+                  na.omit()
+# we need to account for non-independence of data in the infancy model 
+ids <- meta_plot %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+  .$skippy_id
+h <- how(plots = Plots(strata = ids, type = "none"),
+            nperm = 999)
+# according to omitted NAs I need to select stool samples 
+asv <- t(assay(tse_plot, "clr"))
+asv <- asv[meta_plot$sample_id, ]
+
+# fit and inspect model 
+permanova_inf <- adonis2(asv ~ condition * age_s,
+                      # by = "margin", # each term analyzed individually
+                      data = meta_plot,
+                      method = "euclidean",
+                      permutations = 999
+                    )
+beta_table_inf <- permanova_inf %>% 
+  as.data.frame() %>%
+  rownames_to_column("parameter") %>%
+  mutate(
+    `R2 (%)` = glue("{round(R2 * 100, 2)}"),
+    P = as.character(`Pr(>F)`),
+    P = str_remove(P, "^0"),
+    parameter = str_to_title(parameter),
+    parameter = ifelse(parameter == "Condition", "SSC", ifelse(
+      parameter == "Age_s", "Age", ifelse(
+        parameter == "Condition:age_s", "SSC x Age", parameter
+      )))
+    ) %>%
+  mutate(across(where(is.numeric), round, 2)) %>%
+  mutate_all(function(x) ifelse(is.na(x), "", x)) %>%
+  select(parameter, Df, SumOfSqs, "R2 (%)", F, P)
+
+colnames(beta_table_inf) <- str_to_title(colnames(beta_table_inf))
+
+# fit and inspect model 
+permanova_inf <- adonis2(
+  asv ~ bfexcl + condition * age_s,
+  # by = "margin", # each term analyzed individually
+  data = meta_plot,
+  method = "euclidean",
+  permutations = 999)
+
+beta_table_bf_inf <- permanova_inf %>%
+  as.data.frame() %>%
+  rownames_to_column("parameter") %>%
+  mutate(
+    `R2 (%)` = glue("{round(R2 * 100, 2)}"),
+    P = as.character(`Pr(>F)`),
+    P = str_remove(P, "^0"),
+    parameter = str_to_title(parameter),
+    parameter = ifelse(parameter == "Condition", "SSC", ifelse(
+      parameter == "Age_s", "Age", ifelse(
+        parameter == "Condition:age_s", "SSC x Age", ifelse(
+          parameter == "Bfexcl", "Breastfeeding", parameter
+      ))))
+    ) %>%
+  mutate(across(where(is.numeric), round, 2)) %>%
+  mutate_all(function(x) ifelse(is.na(x), "", x)) %>%
+  select(parameter, Df, SumOfSqs, "R2 (%)", F, P)
+
+colnames(beta_table_bf_inf) <- str_to_title(colnames(beta_table_bf_inf))
+
+beta_table_inf
+beta_table_bf_inf
+
+
+
+
+
+# now lastly year 1
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+tse <- agglomerateByRank(tse, rank = "genus")
+tse_year1 <- filter(tse, week == 52)
+
+tse_plot <- transformSamples(x = tse_year1, method = "clr", pseudocount = 1, name = "clr")
+colData(tse_plot) <- colData(tse_plot) %>%
+  as.data.frame() %>%
+  select(sample_id) %>%
+  left_join(
+    select(
+      implist[[1]], 
+      condition, siblings, age,
+      sample_id, week, skippy_id, bfexcl), 
+    by = "sample_id") %>%
+  column_to_rownames("sample_id") %>%
+  mutate(
+    age = age + as.numeric(as.character(week)),
+    age_s = scale(age)[, 1]
+  ) %>%
+  DataFrame()
+  
+
+# step 2
+
+## first all samples
+# extract relevant meta data and omit na as adonis doesnt accept them.
+meta_plot <- colData(tse_plot) %>% as.data.frame() %>%
+          rownames_to_column("sample_id") %>%
+          select(sample_id, skippy_id, condition, age_s, siblings, bfexcl) %>%
+                  na.omit()
+# we need to account for non-independence of data in the infancy model 
+ids <- meta_plot %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+  .$skippy_id
+h <- how(plots = Plots(strata = ids, type = "none"),
+            nperm = 999)
+# according to omitted NAs I need to select stool samples 
+asv <- t(assay(tse_plot, "clr"))
+asv <- asv[meta_plot$sample_id, ]
+
+# fit and inspect model 
+permanova_year1 <- adonis2(asv ~ condition + age_s,
+                      # by = "margin", # each term analyzed individually
+                      data = meta_plot,
+                      method = "euclidean",
+                      permutations = 999
+                    )
+beta_table_year1 <- permanova_year1 %>% 
+  as.data.frame() %>%
+  rownames_to_column("parameter") %>%
+  mutate(
+    `R2 (%)` = glue("{round(R2 * 100, 2)}"),
+    P = as.character(`Pr(>F)`),
+    P = str_remove(P, "^0"),
+    parameter = str_to_title(parameter),
+    parameter = ifelse(parameter == "Condition", "SSC", ifelse(
+      parameter == "Age_s", "Age", ifelse(
+        parameter == "Condition:age_s", "SSC x Age", parameter
+      )))
+    ) %>%
+  mutate(across(where(is.numeric), round, 2)) %>%
+  mutate_all(function(x) ifelse(is.na(x), "", x)) %>%
+  select(parameter, Df, SumOfSqs, "R2 (%)", F, P)
+
+colnames(beta_table_year1) <- str_to_title(colnames(beta_table_year1))
+
+# fit and inspect model 
+permanova_year1 <- adonis2(
+  asv ~ bfexcl + condition + age_s,
+  # by = "margin", # each term analyzed individually
+  data = meta_plot,
+  method = "euclidean",
+  permutations = 999)
+
+beta_table_bf_year1 <- permanova_year1 %>%
+  as.data.frame() %>%
+  rownames_to_column("parameter") %>%
+  mutate(
+    `R2 (%)` = glue("{round(R2 * 100, 2)}"),
+    P = as.character(`Pr(>F)`),
+    P = str_remove(P, "^0"),
+    parameter = str_to_title(parameter),
+    parameter = ifelse(parameter == "Condition", "SSC", ifelse(
+      parameter == "Age_s", "Age", ifelse(
+        parameter == "Condition:age_s", "SSC x Age", ifelse(
+          parameter == "Bfexcl", "Breastfeeding", parameter
+      ))))
+    ) %>%
+  mutate(across(where(is.numeric), round, 2)) %>%
+  mutate_all(function(x) ifelse(is.na(x), "", x)) %>%
+  select(parameter, Df, SumOfSqs, "R2 (%)", F, P)
+
+colnames(beta_table_bf_year1) <- str_to_title(colnames(beta_table_bf_year1))
+
+beta_table_year1
+beta_table_bf_year1
+
+
+
+
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+source("https://raw.githubusercontent.com/HenrikEckermann/in_use/master/mb_helper.R")
+
+
+pseq <- makePhyloseqFromTreeSE(tse)
+pseq_clr <- microbiome::transform(pseq, transform = "clr")
+sample_data(pseq_clr) <- sd_to_df(pseq_clr) %>%
+  mutate(
+    Infancy = ifelse(week == 52, "Late", "Early"),
+    Condition = ifelse(condition == 1, "SSC", ifelse(condition == 0, "CAU", NA))
+  ) %>%
+  df_to_sd()
+
+
+# all samples
+bp <- biplot(
+  pseq_clr, 
+  color = "Condition", 
+  point_size = 5, 
+  otu_alpha = 0,
+  colors = c("#909090", "#000000"),
+  shape = "Infancy"
+)
+
+save(bp, beta_table, beta_table_bf, beta_table_inf, beta_table_bf_inf, beta_table_year1, beta_table_bf_year1, file = here::here("data/beta_plot_table.Rds"))
+
+
+sids <- colData(tidySummarizedExperiment::filter(tse, week != "52")) %>% rownames()
+bp <- biplot(
+  pseq_clr, 
+  color = "condition", 
+  point_size = 5, 
+  otu_alpha = 0,
+  colors = c("#909090", "#000000"),
+  filter_samples = sids
+)
+
+bp[[1]]
+
+sids <- colData(tidySummarizedExperiment::filter(tse, week == "52")) %>% rownames()
+bp <- biplot(
+  pseq_clr, 
+  color = "condition", 
+  point_size = 5, 
+  otu_alpha = 0,
+  colors = c("#909090", "#000000"),
+  filter_samples = sids
+  )
+
+bp[[2]]
+
+
+sids <- colData(tidySummarizedExperiment::filter(tse, week != "52")) %>% rownames()
+sids
+filter <- dplyr::filter
+bp_series <- biplot(
+  pseq_clr,
+  color = "condition", 
+  point_size = 5,
+  otu_alpha = 0,
+  connect_series = "week",
+  subject_id = "skippy_id",
+  colors = c("#909090", "#000000"),
+  filter_samples = sids
+  )
+
+
+bp_series[[1]]
+
+
+
+# make also a plot with Bray Curtis
+
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+
+library(scater)
+# Bray-Curtis is usually applied to relative abundances
+tse_ra <- transformCounts(tse, method = "relabundance")
+tse_ra <- mutate(
+    tse_ra, 
+    Infancy = ifelse(week == 52, "Late", "Early"),
+    Condition = factor(ifelse(condition == 1, "SSC", ifelse(condition == 0, "CAU", NA)), levels = c("SSC", "CAU"))
+)
+# Perform PCoA
+tse_ra <- runMDS(tse_ra, FUN = vegan::vegdist, method = "bray", name = "PCoA_BC", exprs_values = "relabundance")
+# Create ggplot object
+p <- plotReducedDim(
+  tse_ra, 
+  "PCoA_BC", 
+  colour_by = "Condition",
+  shape_by = "Infancy", 
+  point_size = 5, 
+  point_alpha = 1,
+  theme_size = 25)
+p
+?"scater-plot-args"
+plotReducedDim(tse_ra, "PCoA_BC", colour_by = "Condition", other_fields = list(point_size = 3))
+
+# Add explained variance for each axis
+e <- attr(reducedDim(tse, "PCoA_BC"), "eig");
+rel_eig <- e/sum(e[e>0])          
+p <- p + labs(x = paste("PCoA 1 (", round(100 * rel_eig[[1]],1), "%", ")", sep = ""),
+              y = paste("PCoA 2 (", round(100 * rel_eig[[2]],1), "%", ")", sep = ""))
+
+print(p)
+
+
+
+
+
+###############################################################################
+################################ DR analyses ##################################
+###############################################################################
+
+
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+
+dr <- foreign::read.spss(
+  here::here("data/kelly_documents/data_itt_pp_dr.sav"),
+  to.data.frame = TRUE
+  ) %>%
+  select(skippy_id = ID, ITT, SSC = TotalSSCwk1wk5) %>%
+  mutate(SSC_s = scale(SSC)[, 1])
+
+implist <- map(implist, function(imp) {
+  imp_new <- imp %>%
+    left_join(
+      select(dr, skippy_id, SSC_s),
+      by = "skippy_id")
+  #mice::complete(mice::mice(imp_new))
+})
+
+permanovas <- map2_dfr(implist, 1:length(implist), function(dimp, imp) {
+  # Steps are repeated as from the beginning in the script above
+  
+  # step 1
+  # we use Aitchison distance
+  tse <- agglomerateByRank(tse, rank = "genus")
+  tse_map <- transformSamples(x = tse, method = "clr", pseudocount = 1, name = "clr")
+  colData(tse_map) <- colData(tse_map) %>%
+    as.data.frame() %>%
+    select(sample_id) %>%
+    left_join(
+      select(dimp, condition, siblings, age, sample_id, week, skippy_id, SSC_s), 
+      by = "sample_id") %>%
+    column_to_rownames("sample_id") %>%
+    mutate(
+      age = age + as.numeric(as.character(week)),
+      age_s = scale(age)[, 1]
+    ) %>%
+    DataFrame()
+    
+
+    # step 2
+
+    ## first all samples
+    # extract relevant meta data and omit na as adonis doesnt accept them.
+    meta <- colData(tse_map) %>% as.data.frame() %>%
+              rownames_to_column("sample_id") %>%
+              select(sample_id, skippy_id, condition, age_s, siblings, SSC_s) %>%
+                      na.omit()
+    # we need to account for non-independence of data in the infancy model 
+    ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+      .$skippy_id
+    h <- how(plots = Plots(strata = ids, type = "none"),
+               nperm = 999)
+    # according to omitted NAs I need to select stool samples 
+    asv <- t(assay(tse_map, "clr"))
+    asv <- asv[meta$sample_id, ]
+
+    # fit and inspect model 
+    permanova_all <- adonis2(asv ~ SSC_s + age_s + age_s:SSC_s,
+                         # by = "margin", # each term analyzed individually
+                         data = meta,
+                         method = "euclidean",
+                         permutations = 999
+                       )
+
+    ## first infancy
+    tse_inf <- filter(tse_map, week != 52)
+    # extract relevant meta data and omit na as adonis doesnt accept them.
+    meta <- colData(tse_inf) %>% as.data.frame() %>%
+              rownames_to_column("sample_id") %>%
+              select(sample_id, skippy_id, condition, age_s, siblings, SSC_s) %>%
+                      na.omit()
+    # we need to account for non-independence of data in the infancy model 
+    ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+      .$skippy_id
+    h <- how(plots = Plots(strata = ids, type = "none"),
+               nperm = 999)
+    # according to omitted NAs I need to select stool samples 
+    asv <- t(assay(tse_inf, "clr"))
+    asv <- asv[meta$sample_id, ]
+
+    # fit and inspect model 
+    permanova_inf <- adonis2(asv ~ SSC_s + age_s + SSC_s:age_s,
+                         # by = "margin", # each term analyzed individually
+                         data = meta,
+                         method = "euclidean",
+                         permutations = 999
+                       )
+
+
+    ## then 1 year olds 
+    tse_y <- filter(tse_map, week == 52)
+    # extract relevant meta data and omit na as adonis doesnt accept them.
+    meta <- colData(tse_y) %>% as.data.frame() %>%
+              rownames_to_column("sample_id") %>%
+              select(sample_id, skippy_id, condition, age_s, siblings, SSC_s) %>%
+                      na.omit()
+
+    # according to omitted NAs I need to select stool samples 
+    asv <- t(assay(tse_y, "clr"))
+    asv <- asv[meta$sample_id, ]
+
+    # fit and inspect model 
+    permanova_y <- adonis2(asv ~ SSC_s + age_s,
+                         # by = "margin", # each term analyzed individually
+                         data = meta,
+                         method = "euclidean",
+                         permutations = 999
+                       )
+    permanova_all$time <- "all"
+    permanova_inf$time <- "infancy"
+    permanova_y$time <- "year1"
+    permanova <- bind_rows(
+      as.data.frame(permanova_all) %>% rownames_to_column("parameter"), 
+      as.data.frame(permanova_inf) %>% rownames_to_column("parameter"), 
+      as.data.frame(permanova_y) %>% rownames_to_column("parameter")
+      )
+    permanova$imp <- imp 
+    
+    #list(infancy = permanova_inf, year1 = permanova_y, imp = imp)
+    permanova
+})
+permanovas
diff --git a/R/beta_diversity_bray.R b/R/beta_diversity_bray.R
new file mode 100644
index 0000000..20552f8
--- /dev/null
+++ b/R/beta_diversity_bray.R
@@ -0,0 +1,794 @@
+# thanks to Gavin for helping out with this part:
+# https://stats.stackexchange.com/questions/590510/repeated-measures-permanova-nowhere-to-find
+
+set.seed(1)
+library(mia)
+library(tidyverse)
+library(tidySummarizedExperiment)
+library(vegan)
+library(permute)
+
+
+
+# import of biomfile and meta data can be found in the import script
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+tse <- agglomerateByRank(tse, rank = "genus")
+tse <- transformSamples(x = tse, method = "relabundance", pseudocount = 1, name = "relabundance")
+# add metadata to tse
+colData(tse) <- colData(tse) %>%
+  as.data.frame() %>%
+  left_join(select(d, age, sample_id), by = "sample_id") %>%
+  column_to_rownames("sample_id") %>%
+  DataFrame()
+
+colData(tse)$age <- colData(tse)$age + as.numeric(as.character(colData(tse)$week)) * 7
+colData(tse)$age_s <- scale(colData(tse)$age)[, 1]
+
+###############################################################################
+#########################           1. ITT       ##############################
+###############################################################################
+
+
+####################### 1.1 Complete Case Analysis ############################
+
+### First I fit a model to all samples 
+
+
+# extract relevant meta data and omit na as adonis doesnt accept them.
+meta <- colData(tse) %>% as.data.frame() %>%
+          rownames_to_column("sample_id") %>%
+          select(sample_id, skippy_id, condition, age_s, siblings) %>%
+                  na.omit()
+# we need to account for non-independence of data in the infancy model 
+ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+  .$skippy_id
+h <- how(plots = Plots(strata = ids, type = "none"),
+           nperm = 999)
+# according to omitted NAs I need to select stool samples 
+asv <- t(assay(tse, "counts"))
+asv <- asv[rownames(asv) %in% meta$sample_id, ]
+
+# fit and inspect model 
+permanova <- adonis2(asv ~ condition + age_s + condition:age_s,
+                     # by = "margin", # each term analyzed individually
+                     data = meta,
+                     method = "bray",
+                     # does not work if trend is in data (therefore use 999)
+                     permutations = 999 
+                   )
+
+permanova
+
+# Perform dbRDA
+dbrda <- dbrda(asv ~ condition + age_s, data = meta)
+# Perform permutational analysis
+permanova2 <- anova.cca(dbrda,
+                        by = "margin", # each term analyzed individually
+                        method = "bray",
+                        permutations = 999)
+permanova2
+
+
+
+### Now split models by infancy and 1 year olds 
+
+## first infancy
+tse_inf <- filter(tse, week != 52)
+# extract relevant meta data and omit na as adonis doesnt accept them.
+meta <- colData(tse_inf) %>% as.data.frame() %>%
+          rownames_to_column("sample_id") %>%
+          select(sample_id, skippy_id, condition, age_s, siblings) %>%
+                  na.omit()
+# we need to account for non-independence of data in the infancy model 
+ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+  .$skippy_id
+h <- how(plots = Plots(strata = ids, type = "none"),
+           nperm = 999)
+# according to omitted NAs I need to select stool samples 
+asv <- t(assay(tse_inf, "counts"))
+asv <- asv[rownames(asv) %in% meta$sample_id, ]
+
+# fit and inspect model 
+permanova <- adonis2(asv ~ condition + age_s + condition:age_s,
+                     # by = "margin", # each term analyzed individually
+                     data = meta,
+                     method = "bray",
+                     permutations = 999
+                   )
+
+permanova
+
+# Perform dbRDA
+dbrda <- dbrda(asv ~ condition + age_s, data = meta)
+# Perform permutational analysis
+permanova2 <- anova.cca(dbrda,
+                        by = "margin", # each term analyzed individually
+                        method = "bray",
+                        permutations = 999)
+permanova2
+
+
+
+
+## then 1 year olds 
+tse_y <- filter(tse, week == 52)
+# extract relevant meta data and omit na as adonis doesnt accept them.
+meta <- colData(tse_y) %>% as.data.frame() %>%
+          rownames_to_column("sample_id") %>%
+          select(sample_id, skippy_id, condition, age_s, siblings) %>%
+                  na.omit()
+
+# according to omitted NAs I need to select stool samples 
+asv <- t(assay(tse_y, "counts"))
+asv <- asv[rownames(asv) %in% meta$sample_id, ]
+
+# fit and inspect model 
+permanova <- adonis2(asv ~ condition + age_s + condition:age_s,
+                     # by = "margin", # each term analyzed individually
+                     data = meta,
+                     method = "bray",
+                     permutations = 999
+                   )
+
+permanova
+
+# Perform dbRDA
+dbrda <- dbrda(asv ~ condition + age_s, data = meta)
+# Perform permutational analysis
+permanova2 <- anova.cca(dbrda,
+                        by = "margin", # each term analyzed individually
+                        method = "bray",
+                        permutations = 999)
+permanova2
+
+
+
+
+######################## 1.2 Multiple imputation  #############################
+
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+tse <- agglomerateByRank(tse, rank = "genus")
+permanovas <- map2_dfr(implist, 1:length(implist), function(dimp, imp) {
+  # Steps are repeated as from the beginning in the script above
+  tse_map <- transformSamples(x = tse, method = "relabundance", pseudocount = 1, name = "relabundance")
+  # step 1
+  colData(tse_map) <- colData(tse_map) %>%
+    as.data.frame() %>%
+    select(sample_id) %>%
+    left_join(
+      select(dimp, condition, siblings, age, sample_id, week, skippy_id), 
+      by = "sample_id") %>%
+    column_to_rownames("sample_id") %>%
+    mutate(
+      age = age + as.numeric(as.character(week)),
+      age_s = scale(age)[, 1]
+    ) %>%
+    DataFrame()
+
+    ## first all samples
+    # extract relevant meta data and omit na as adonis doesnt accept them.
+    meta <- colData(tse_map) %>% as.data.frame() %>%
+              rownames_to_column("sample_id") %>%
+              select(sample_id, skippy_id, condition, age_s, siblings) %>%
+                      na.omit()
+    # we need to account for non-independence of data in the infancy model 
+    ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+      .$skippy_id
+    h <- how(plots = Plots(strata = ids, type = "none"),
+               nperm = 999)
+    # according to omitted NAs I need to select stool samples 
+    asv <- t(assay(tse_map, "counts"))
+    asv <- asv[meta$sample_id, ]
+
+    # fit and inspect model 
+    permanova_all <- adonis2(asv ~ condition + age_s + condition:age_s,
+                         # by = "margin", # each term analyzed individually
+                         data = meta,
+                         method = "bray",
+                         permutations = 999
+                       )
+
+    
+    # step 2
+    ## first infancy
+    tse_inf <- filter(tse_map, week != 52)
+    # extract relevant meta data and omit na as adonis doesnt accept them.
+    meta <- colData(tse_inf) %>% as.data.frame() %>%
+              rownames_to_column("sample_id") %>%
+              select(sample_id, skippy_id, condition, age_s, siblings) %>%
+                      na.omit()
+    # we need to account for non-independence of data in the infancy model 
+    ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+      .$skippy_id
+    h <- how(plots = Plots(strata = ids, type = "none"),
+               nperm = 999)
+    # according to omitted NAs I need to select stool samples 
+    asv <- t(assay(tse_inf, "counts"))
+    asv <- asv[rownames(asv) %in% meta$sample_id, ]
+
+    # fit and inspect model
+    permanova_inf <- adonis2(asv ~ condition + age_s + condition:age_s,
+                         # by = "margin", # each term analyzed individually
+                         data = meta,
+                         method = "bray",
+                         permutations = 999
+                       )
+
+
+    ## then 1 year olds
+    tse_y <- filter(tse_map, week == 52)
+    # extract relevant meta data and omit na as adonis doesnt accept them.
+    meta <- colData(tse_y) %>% as.data.frame() %>%
+              rownames_to_column("sample_id") %>%
+              select(sample_id, skippy_id, condition, age_s, siblings) %>%
+                      na.omit()
+
+    # according to omitted NAs I need to select stool samples 
+    asv <- t(assay(tse_y, "counts"))
+    asv <- asv[rownames(asv) %in% meta$sample_id, ]
+
+    # fit and inspect model 
+    permanova_y <- adonis2(asv ~ condition + age_s + condition:age_s,
+                         # by = "margin", # each term analyzed individually
+                         data = meta,
+                         method = "bray",
+                         permutations = 999
+                       )
+
+    permanova_all$time <- "all"
+    permanova_inf$time <- "infancy"
+    permanova_y$time <- "year1"
+    permanova <- bind_rows(
+      as.data.frame(permanova_all) %>% rownames_to_column("parameter"),
+      as.data.frame(permanova_inf) %>% rownames_to_column("parameter"), 
+      as.data.frame(permanova_y) %>% rownames_to_column("parameter")
+      )
+    permanova$imp <- imp
+    permanova
+    #list(infancy = permanova_inf, year1 = permanova_y, imp = imp)
+})
+
+permanovas
+
+
+
+
+######################## 1.2 WITH BREASTFEEDING  #############################
+
+
+
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+
+permanovas_bf <- map2_dfr(implist, 1:length(implist), function(dimp, imp) {
+  # Steps are repeated as from the beginning in the script above
+  
+  # step 1
+  # we use Aitchison distance
+  tse_map <- agglomerateByRank(tse, rank = "genus")
+  tse_map <- transformSamples(x = tse_map, method = "relabundance", pseudocount = 1, name = "relabundance")
+  colData(tse_map) <- colData(tse_map) %>%
+    as.data.frame() %>%
+    select(sample_id) %>%
+    left_join(
+      select(
+        dimp, condition, siblings, age, 
+        sample_id, 
+        week, skippy_id, bfexcl), 
+      by = "sample_id") %>%
+    column_to_rownames("sample_id") %>%
+    mutate(
+      age = age + as.numeric(as.character(week)),
+      age_s = scale(age)[, 1]
+    ) %>%
+    DataFrame()
+    
+
+    # step 2
+
+    ## first all samples
+    # extract relevant meta data and omit na as adonis doesnt accept them.
+    meta <- colData(tse_map) %>% as.data.frame() %>%
+              rownames_to_column("sample_id") %>%
+              select(sample_id, skippy_id, condition, age_s, siblings,
+                      bfexcl) %>%
+                      na.omit()
+    # we need to account for non-independence of data in the infancy model 
+    ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+      .$skippy_id
+    h <- how(plots = Plots(strata = ids, type = "none"),
+               nperm = 999)
+    # according to omitted NAs I need to select stool samples 
+    asv <- t(assay(tse_map, "counts"))
+    asv <- asv[meta$sample_id, ]
+
+    # fit and inspect model 
+    permanova_all <- adonis2(asv ~ bfexcl + siblings  + condition + age_s + condition:age_s,
+                         # by = "margin", # each term analyzed individually
+                         data = meta,
+                         method = "euclidean",
+                         permutations = 999,
+                         by = "margin"
+                       )
+
+
+    ## first infancy
+    tse_inf <- filter(tse_map, week != 52)
+    # extract relevant meta data and omit na as adonis doesnt accept them.
+    meta <- colData(tse_inf) %>% as.data.frame() %>%
+              rownames_to_column("sample_id") %>%
+              select(sample_id, skippy_id, condition, age_s, siblings) %>%
+                      na.omit()
+    # we need to account for non-independence of data in the infancy model 
+    ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+      .$skippy_id
+    h <- how(plots = Plots(strata = ids, type = "none"),
+               nperm = 999)
+    # according to omitted NAs I need to select stool samples 
+    asv <- t(assay(tse_inf, "counts"))
+    asv <- asv[meta$sample_id, ]
+
+    # fit and inspect model 
+    permanova_inf <- adonis2(asv ~ condition + age_s + condition:age_s,
+                         # by = "margin", # each term analyzed individually
+                         data = meta,
+                         method = "euclidean",
+                         permutations = 999
+                       )
+
+    ## then 1 year olds 
+    tse_y <- filter(tse_map, week == 52)
+    # extract relevant meta data and omit na as adonis doesnt accept them.
+    meta <- colData(tse_y) %>% as.data.frame() %>%
+              rownames_to_column("sample_id") %>%
+              select(sample_id, skippy_id, condition, age_s, siblings) %>%
+                      na.omit()
+
+    # according to omitted NAs I need to select stool samples 
+    asv <- t(assay(tse_y, "counts"))
+    asv <- asv[meta$sample_id, ]
+
+    # fit and inspect model 
+    permanova_y <- adonis2(asv ~ condition + age_s,
+                         # by = "margin", # each term analyzed individually
+                         data = meta,
+                         method = "euclidean",
+                         permutations = 999
+                       )
+    permanova_all$time <- "all"
+    permanova_inf$time <- "infancy"
+    permanova_y$time <- "year1"
+    permanova <- bind_rows(
+      as.data.frame(permanova_all) %>% rownames_to_column("parameter"), 
+      as.data.frame(permanova_inf) %>% rownames_to_column("parameter"), 
+      as.data.frame(permanova_y) %>% rownames_to_column("parameter")
+      )
+    permanova$imp <- imp 
+     permanova
+    # #list(infancy = permanova_inf, year1 = permanova_y, imp = imp)
+})
+permanovas_bf
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+###############################################################################
+#########################         2. PP          ##############################
+###############################################################################
+
+# import of biomfile and meta data can be found in the import script
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+tse <- agglomerateByRank(tse, rank = "genus")
+tse <- transformSamples(x = tse, method = "relabundance", pseudocount = 1, name = "relabundance")
+
+# obtain ids that were selected for PP analyses 
+pp_indicator <- foreign::read.spss(here::here("data/raw_data/kelly141022/Data_ITT_PP_ExploratoryDRselections.sav"), to.data.frame = TRUE)
+count(pp_indicator, PP, SSC)
+pp_indicator <- select(pp_indicator, skippy_id = ID, pp = PP)
+# add pp info to existing data 
+if (!"pp" %in% colnames(d)) {
+  d <- left_join(d, pp_indicator, by = "skippy_id")
+}
+# 60 that are in PP and condition 0; 18 that are condition 1 and pp. Fits...
+count(d, condition, pp)
+d_pp <- filter(d, pp == 1)
+implist_pp <- map(implist, function(dimp) {
+  dimp_new <- left_join(dimp, pp_indicator, by = "skippy_id") %>%
+                filter(pp == 1)
+  dimp_new
+})
+
+# add metadata to tse
+colData(tse) <- colData(tse) %>%
+  as.data.frame() %>%
+  left_join(select(d_pp, age, sample_id, pp), by = "sample_id") %>%
+  column_to_rownames("sample_id") %>%
+  DataFrame()
+
+colData(tse)$age <- colData(tse)$age + as.numeric(as.character(colData(tse)$week)) * 7
+colData(tse)$age_s <- scale(colData(tse)$age)[, 1]
+tse_pp <- filter(tse, pp == 1)
+
+####################### 2.1 Complete Case Analysis ############################
+
+### First I fit a model to all samples 
+
+
+
+# extract relevant meta data and omit na as adonis doesnt accept them.
+meta <- colData(tse_pp) %>% as.data.frame() %>%
+          rownames_to_column("sample_id") %>%
+          select(sample_id, skippy_id, condition, age_s, siblings) %>%
+                  na.omit()
+# we need to account for non-independence of data in the infancy model 
+ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+  .$skippy_id
+h <- how(plots = Plots(strata = ids, type = "none"),
+           nperm = 999)
+# according to omitted NAs I need to select stool samples 
+asv <- t(assay(tse_pp, "counts"))
+asv <- asv[rownames(asv) %in% meta$sample_id, ]
+
+# fit and inspect model 
+permanova <- adonis2(asv ~  condition + age_s + condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings,
+                     # by = "margin", # each term analyzed individually
+                     data = meta,
+                     method = "bray",
+                     # does not work if trend is in data (therefore use 999)
+                     permutations = 999 
+                   )
+
+permanova
+
+# Perform dbRDA
+dbrda <- dbrda(asv ~  condition + age_s + condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings, data = meta)
+# Perform permutational analysis
+permanova2 <- anova.cca(dbrda,
+                        by = "margin", # each term analyzed individually
+                        method = "bray",
+                        permutations = 999)
+permanova2
+
+
+
+### Now split models by infancy and 1 year olds 
+
+## first infancy
+tse_inf <- filter(tse_pp, week != 52)
+# extract relevant meta data and omit na as adonis doesnt accept them.
+meta <- colData(tse_inf) %>% as.data.frame() %>%
+          rownames_to_column("sample_id") %>%
+          select(sample_id, skippy_id, condition, age_s, siblings) %>%
+                  na.omit()
+# we need to account for non-independence of data in the infancy model 
+ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+  .$skippy_id
+h <- how(plots = Plots(strata = ids, type = "none"),
+           nperm = 999)
+# according to omitted NAs I need to select stool samples 
+asv <- t(assay(tse_inf, "counts"))
+asv <- asv[rownames(asv) %in% meta$sample_id, ]
+
+# fit and inspect model 
+permanova <- adonis2(asv ~  condition + age_s + condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings,
+                     # by = "margin", # each term analyzed individually
+                     data = meta,
+                     method = "bray",
+                     permutations = 999
+                   )
+
+permanova
+
+# Perform dbRDA
+dbrda <- dbrda(asv ~  condition + age_s + condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings, data = meta)
+# Perform permutational analysis
+permanova2 <- anova.cca(dbrda,
+                        by = "margin", # each term analyzed individually
+                        method = "bray",
+                        permutations = 999)
+permanova2
+
+
+
+
+## then 1 year olds 
+tse_y <- filter(tse_pp, week == 52)
+# extract relevant meta data and omit na as adonis doesnt accept them.
+meta <- colData(tse_y) %>% as.data.frame() %>%
+          rownames_to_column("sample_id") %>%
+          select(sample_id, skippy_id, condition, age_s, siblings) %>%
+                  na.omit()
+
+# according to omitted NAs I need to select stool samples 
+asv <- t(assay(tse_y, "counts"))
+asv <- asv[rownames(asv) %in% meta$sample_id, ]
+
+# fit and inspect model 
+permanova <- adonis2(asv ~  condition + age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings,
+                     # by = "margin", # each term analyzed individually
+                     data = meta,
+                     method = "bray",
+                     permutations = 999
+                   )
+
+permanova
+
+# Perform dbRDA
+dbrda <- dbrda(asv ~  condition + age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings, data = meta)
+# Perform permutational analysis
+permanova2 <- anova.cca(dbrda,
+                        by = "margin", # each term analyzed individually
+                        method = "bray",
+                        permutations = 999)
+permanova2
+
+# no effects in PP either!
+
+
+######################## 2.2 Multiple imputation  #############################
+
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+tse <- agglomerateByRank(tse, rank = "genus")
+permanovas_pp <- map2_dfr(implist_pp, 1:length(implist_pp), function(dimp, imp) {
+  # Steps are repeated as from the beginning in the script above
+  
+  # step 1
+  # we use Aitchison distance
+  tse_map <- transformSamples(x = tse, method = "relabundance", pseudocount = 1, name = "relabundance")
+  colData(tse_map) <- colData(tse_map) %>%
+    as.data.frame() %>%
+    select(sample_id) %>%
+    left_join(
+      select(dimp, condition, siblings, age, birthweight_s, ges_age_s, edlevel_s, csection, sex, sample_id, week, skippy_id, pp), 
+      by = "sample_id") %>%
+    column_to_rownames("sample_id") %>%
+    mutate(
+      age = age + as.numeric(as.character(week)),
+      age_s = scale(age)[, 1]
+    ) %>%
+    DataFrame()
+  
+  tse_map <- filter(tse_map, pp == 1)
+  
+  
+  # step 2
+  ## first all samples
+  # extract relevant meta data and omit na as adonis doesnt accept them.
+  meta <- colData(tse_map) %>% as.data.frame() %>%
+    rownames_to_column("sample_id") %>%
+    select(sample_id, skippy_id, condition, age_s, siblings,
+           birthweight_s, ges_age_s,
+           edlevel_s, csection, sex) %>%
+    na.omit()
+  # we need to account for non-independence of data in the infancy model 
+  ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+    .$skippy_id
+  h <- how(plots = Plots(strata = ids, type = "none"),
+           nperm = 999)
+  # according to omitted NAs I need to select stool samples 
+  asv <- t(assay(tse_map, "counts"))
+  asv <- asv[meta$sample_id, ]
+  
+  # fit and inspect model 
+  permanova_all <- adonis2(asv ~ siblings  + condition + age_s + condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex,
+                           # by = "margin", # each term analyzed individually
+                           data = meta,
+                           method = "bray",
+                           permutations = 999
+  )
+  
+  ## then infancy
+  tse_inf <- filter(tse_map, week != 52)
+  # extract relevant meta data and omit na as adonis doesnt accept them.
+  meta <- colData(tse_inf) %>% as.data.frame() %>%
+    rownames_to_column("sample_id") %>%
+    select(sample_id, skippy_id, condition, age_s, siblings,
+           birthweight_s, ges_age_s, 
+           edlevel_s, csection, sex) %>%
+    na.omit()
+  # we need to account for non-independence of data in the infancy model 
+  ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+    .$skippy_id
+  h <- how(plots = Plots(strata = ids, type = "none"),
+           nperm = 999)
+  # according to omitted NAs I need to select stool samples 
+  asv <- t(assay(tse_inf, "counts"))
+  asv <- asv[meta$sample_id, ]
+  
+  # fit and inspect model 
+  permanova_inf <- adonis2(asv ~ siblings  + condition + age_s + condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex,
+                           # by = "margin", # each term analyzed individually
+                           data = meta,
+                           method = "bray",
+                           permutations = 999
+  )
+  
+  
+  ## then 1 year olds 
+  tse_y <- filter(tse_map, week == 52)
+  # extract relevant meta data and omit na as adonis doesnt accept them.
+  meta <- colData(tse_y) %>% as.data.frame() %>%
+    rownames_to_column("sample_id") %>%
+    select(sample_id, skippy_id, condition, age_s, siblings,
+           birthweight_s, ges_age_s,
+           edlevel_s, csection, sex) %>%
+    na.omit()
+  
+  # according to omitted NAs I need to select stool samples 
+  asv <- t(assay(tse_y, "counts"))
+  asv <- asv[meta$sample_id, ]
+  
+  # fit and inspect model 
+  permanova_y <- adonis2(asv ~ siblings  + condition + age_s + condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex,
+                         # by = "margin", # each term analyzed individually
+                         data = meta,
+                         method = "bray",
+                         permutations = 999
+  )
+  permanova_all$time <- "all"
+  permanova_inf$time <- "infancy"
+  permanova_y$time <- "year1"
+  permanova <- bind_rows(
+    as.data.frame(permanova_all) %>% rownames_to_column("parameter"), 
+    as.data.frame(permanova_inf) %>% rownames_to_column("parameter"), 
+    as.data.frame(permanova_y) %>% rownames_to_column("parameter")
+  )
+  permanova$imp <- imp 
+  permanova
+  #list(infancy = permanova_inf, year1 = permanova_y, imp = imp)
+})
+
+permanovas_pp
+
+
+
+
+
+###############################################################################
+################################ DR analyses ##################################
+###############################################################################
+
+
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+
+dr <- foreign::read.spss(
+  here::here("data/kelly_documents/data_itt_pp_dr.sav"),
+  to.data.frame = TRUE
+  ) %>%
+  select(skippy_id = ID, ITT, SSC = TotalSSCwk1wk5) %>%
+  mutate(SSC_s = scale(SSC)[, 1])
+
+implist <- map(implist, function(imp) {
+  imp_new <- imp %>%
+    left_join(
+      select(dr, skippy_id, SSC_s),
+      by = "skippy_id")
+  #mice::complete(mice::mice(imp_new))
+})
+
+permanovas <- map2_dfr(implist, 1:length(implist), function(dimp, imp) {
+  # Steps are repeated as from the beginning in the script above
+  tse_map <- transformSamples(x = tse, method = "relabundance", pseudocount = 1, name = "relabundance")
+  # step 1
+  colData(tse_map) <- colData(tse_map) %>%
+    as.data.frame() %>%
+    select(sample_id) %>%
+    left_join(
+      select(dimp, condition, siblings, age, sample_id, week, skippy_id, SSC_s), 
+      by = "sample_id") %>%
+    column_to_rownames("sample_id") %>%
+    mutate(
+      age = age + as.numeric(as.character(week)),
+      age_s = scale(age)[, 1]
+    ) %>%
+    DataFrame()
+
+    ## first all samples
+    # extract relevant meta data and omit na as adonis doesnt accept them.
+    meta <- colData(tse_map) %>% as.data.frame() %>%
+              rownames_to_column("sample_id") %>%
+              select(sample_id, skippy_id, condition, age_s, siblings,
+                      SSC_s) %>%
+                      na.omit()
+    # we need to account for non-independence of data in the infancy model 
+    ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+      .$skippy_id
+    h <- how(plots = Plots(strata = ids, type = "none"),
+               nperm = 999)
+    # according to omitted NAs I need to select stool samples 
+    asv <- t(assay(tse_map, "counts"))
+    asv <- asv[meta$sample_id, ]
+
+    # fit and inspect model 
+    permanova_all <- adonis2(asv ~  SSC_s + age_s + SSC_s:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings,
+                         # by = "margin", # each term analyzed individually
+                         data = meta,
+                         method = "bray",
+                         permutations = 999
+                       )
+
+   
+    # step 2
+    ## first infancy
+    tse_inf <- filter(tse_map, week != 52)
+    # extract relevant meta data and omit na as adonis doesnt accept them.
+    meta <- colData(tse_inf) %>% as.data.frame() %>%
+              rownames_to_column("sample_id") %>%
+              select(sample_id, skippy_id, condition, age_s, siblings,
+                      SSC_s) %>%
+                      na.omit()
+    # we need to account for non-independence of data in the infancy model 
+    ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>%
+      .$skippy_id
+    h <- how(plots = Plots(strata = ids, type = "none"),
+               nperm = 999)
+    # according to omitted NAs I need to select stool samples 
+    asv <- t(assay(tse_inf, "counts"))
+    asv <- asv[rownames(asv) %in% meta$sample_id, ]
+
+    # fit and inspect model
+    permanova_inf <- adonis2(asv ~  SSC_s + age_s + SSC_s:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings,
+                         # by = "margin", # each term analyzed individually
+                         data = meta,
+                         method = "bray",
+                         permutations = 999
+                       )
+
+
+    ## then 1 year olds
+    tse_y <- filter(tse_map, week == 52)
+    # extract relevant meta data and omit na as adonis doesnt accept them.
+    meta <- colData(tse_y) %>% as.data.frame() %>%
+              rownames_to_column("sample_id") %>%
+              select(sample_id, skippy_id, condition, age_s, siblings,
+                      SSC_s) %>%
+                      na.omit()
+
+    # according to omitted NAs I need to select stool samples 
+    asv <- t(assay(tse_y, "counts"))
+    asv <- asv[rownames(asv) %in% meta$sample_id, ]
+
+    # fit and inspect model 
+    permanova_y <- adonis2(asv ~  SSC_s + age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings,
+                         # by = "margin", # each term analyzed individually
+                         data = meta,
+                         method = "bray",
+                         permutations = 999
+                       )
+
+    permanova_all$time <- "all"
+    permanova_inf$time <- "infancy"
+    permanova_y$time <- "year1"
+    permanova <- bind_rows(
+      as.data.frame(permanova_all) %>% rownames_to_column("parameter"),
+      as.data.frame(permanova_inf) %>% rownames_to_column("parameter"), 
+      as.data.frame(permanova_y) %>% rownames_to_column("parameter")
+      )
+    permanova$imp <- imp
+    permanova
+    #list(infancy = permanova_inf, year1 = permanova_y, imp = imp)
+})
+
+permanovas
diff --git a/R/bibo_import_biom.R b/R/bibo_import_biom.R
new file mode 100644
index 0000000..ca2b57b
--- /dev/null
+++ b/R/bibo_import_biom.R
@@ -0,0 +1,41 @@
+# options(repr.plot.width=15, repr.plot.height=15)
+
+library(tidyverse)
+library(mia)
+library(glue)
+library(tidySummarizedExperiment)
+library(miaViz)
+library(scater)
+library(readxl)
+
+# list.files("data/raw_data")
+# biom_file <- here::here("data/raw_data/bibo_SILVA_138.biom")
+# file.exists(biom_file)
+# # because of some issue in NGT2 we need to replace NaN in the biom file with 0
+# biom_lines <- read_lines(biom_file)
+# sum(str_count(biom_lines, "NaN"))
+# biom_lines <- str_replace_all(biom_lines, "NaN", "0.0")
+# sum(str_count(biom_lines, "NaN"))
+# write_lines(
+#   biom_lines,
+#   here::here("data/raw_data/bibo_SILVA_138_rpl.biom")
+# )
+
+# now we should be able to use read_biom 
+biom_file <- here::here("data/raw_data/bibo_SILVA_138_rpl.biom")
+bibo <- loadFromBiom(biom_file)
+colnames(rowData(bibo)) <- c(
+  "kingdom", "phylum", "class", "order", "family", "genus", "species"
+)
+colData(bibo)[["sample_id"]] <- rownames(colData(bibo))
+tse_bibo <- filter(bibo, !str_detect(sample_id, "MOCK"))
+
+
+rns <- str_c(rowData(tse_bibo)$family, rowData(tse_bibo)$genus, sep = "_")
+rns[duplicated(rns)]
+rownames(tse_bibo) <- rns
+rownames(tse_bibo)
+
+
+save(tse_bibo, file = here::here("data/tse_bibo.Rds"))
+
diff --git a/R/bingo_import_biom.R b/R/bingo_import_biom.R
new file mode 100644
index 0000000..05e5f6d
--- /dev/null
+++ b/R/bingo_import_biom.R
@@ -0,0 +1,98 @@
+# options(repr.plot.width=15, repr.plot.height=15)
+
+library(tidyverse)
+library(mia)
+library(glue)
+library(tidySummarizedExperiment)
+library(miaViz)
+library(scater)
+library(readxl)
+
+
+
+### BINGO 1
+
+# biom_file <- here::here("data/raw_data/bingo_infancy_SILVA_138.biom")
+# file.exists(biom_file)
+# # because of some issue in NGT2 we need to replace NaN in the biom file with 0
+# biom_lines <- read_lines(biom_file)
+# sum(str_count(biom_lines, "NaN"))
+# biom_lines <- str_replace_all(biom_lines, "NaN", "0.0")
+# sum(str_count(biom_lines, "NaN"))
+# write_lines(
+#   biom_lines,
+#   here::here("data/raw_data/bingo_infancy_SILVA_138_rpl.biom")
+# )
+
+# now we should be able to use read_biom 
+biom_file <- here::here("data/raw_data/bingo_infancy_SILVA_138_rpl.biom")
+
+
+xl <- read_excel("bingo_infancy_mapping.xlsx", sheet = "baby metadata good samples")
+colnames(xl)
+sample_data <- select(
+  xl, 
+  sample_id = `#SampleID`, 
+  id = ID_participants, 
+  age = Actual_Age_Collection_days)
+bingo1 <- loadFromBiom(biom_file)
+
+# use practical names for the ranks instead 
+colnames(rowData(bingo1)) <- c(
+  "kingdom", "phylum", "class", "order", "family", "genus", "species"
+)
+
+# add sample ids as column
+colData(bingo1)[["sample_id"]] <- rownames(colData(bingo1))
+colData(bingo1) <- colData(bingo1) %>% as.data.frame() %>%
+  left_join(sample_data, by = "sample_id") %>%
+  column_to_rownames("sample_id") %>%
+  mutate(sample_id = rownames(colData(bingo1))) %>%
+  DataFrame()
+
+
+tse_bingo1 <- filter(bingo1, !str_detect(sample_id, "MOCK"))
+rns <- str_c(rowData(tse_bingo1)$family, rowData(tse_bingo1)$genus, sep = "_")
+rns[duplicated(rns)]
+rownames(tse_bingo1) <- rns
+save(tse_bingo1, file = here::here("data/tse_bingo1.Rds"))
+
+
+
+### BINGO 2
+
+
+# biom_file <- here::here("data/raw_data/bingo_year13_SILVA_138.biom")
+# file.exists(biom_file)
+# # because of some issue in NGT2 we need to replace NaN in the biom file with 0
+# biom_lines <- read_lines(biom_file)
+# sum(str_count(biom_lines, "NaN"))
+# biom_lines <- str_replace_all(biom_lines, "NaN", "0.0")
+# sum(str_count(biom_lines, "NaN"))
+# write_lines(
+#   biom_lines,
+#   here::here("data/raw_data/bingo_year13_SILVA_138_rpl.biom")
+# )
+
+# now we should be able to use read_biom 
+biom_file <- here::here("data/raw_data/bingo_year13_SILVA_138_rpl.biom")
+bingo2 <- loadFromBiom(biom_file)
+colnames(rowData(bingo2)) <- c(
+  "kingdom", "phylum", "class", "order", "family", "genus", "species"
+)
+
+
+# add sample ids as column
+colData(bingo2)[["sample_id"]] <- rownames(colData(bingo2))
+colData(bingo2) <- colData(bingo2) %>% as.data.frame() %>%
+  left_join(sample_data, by = "sample_id") %>%
+  column_to_rownames("sample_id") %>%
+  mutate(sample_id = rownames(colData(bingo2))) %>%
+  DataFrame()
+
+
+tse_bingo2 <- filter(bingo2, !str_detect(sample_id, "MOCK"))
+rns <- str_c(rowData(tse_bingo2)$family, rowData(tse_bingo2)$genus, sep = "_")
+rns[duplicated(rns)]
+rownames(tse_bingo2) <- rns
+save(tse_bingo2, file = here::here("data/tse_bingo2.Rds"))
\ No newline at end of file
diff --git a/R/daa_ancombc_bez.R b/R/daa_ancombc_bez.R
new file mode 100644
index 0000000..2effe1d
--- /dev/null
+++ b/R/daa_ancombc_bez.R
@@ -0,0 +1,1461 @@
+set.seed(1)
+library(mia)
+library(ANCOMBC)
+library(tidyverse)
+library(tidySummarizedExperiment)
+library(glue)
+
+
+
+
+###############################################################################
+#########################           1. ITT       ##############################
+###############################################################################
+
+# import of biomfile and meta data can be found in the import script
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+
+# for analyses we apply prevalence fitlering and analyze at genus level
+tse <- agglomerateByRank(tse, rank = "genus")
+tse <- subsetByPrevalentTaxa(tse, detection = 0.001, prevalence = 0.1)
+fvars <- c("siblings", "condition")
+# add metadata to tse
+colData(tse) <- colData(tse) %>%
+  as.data.frame() %>%
+  select(-siblings) %>%
+  left_join(select(d, age, sample_id, siblings), by = "sample_id") %>%
+  column_to_rownames("sample_id") %>%
+  mutate(across(all_of(fvars), function(x) as.factor(x))) %>%
+  DataFrame()
+colData(tse)$age <- colData(tse)$age + as.numeric(colData(tse)$week) * 7
+colData(tse)$age_s <- scale(colData(tse)$age)[, 1]
+
+####################### 1.1 Complete Case Analysis ############################
+
+
+
+# model includes random intercepts and all samples 
+if (!file.exists(here::here("data/daa_cc_mlm.Rds"))) {
+  output <- ancombc2(
+    data = tse, 
+    assay_name = "counts", 
+    tax_level = "genus",
+    fix_formula = "condition * age_s",
+    rand_formula = "(1 | skippy_id)",
+    p_adj_method = "holm", 
+    pseudo = 0, 
+    pseudo_sens = TRUE,
+    prv_cut = 0.10, 
+    lib_cut = 1000, 
+    s0_perc = 0.05,
+    group = "condition", 
+    struc_zero = TRUE, 
+    neg_lb = TRUE,
+    alpha = 0.2, 
+    n_cl = 2, 
+    verbose = TRUE,
+    global = FALSE, 
+    pairwise = FALSE, 
+    dunnet = FALSE, 
+    trend = FALSE,
+    iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+    em_control = list(tol = 1e-5, max_iter = 100),
+    lme_control = lme4::lmerControl()
+    )
+    save(output, file = here::here("data/daa_cc_mlm.Rds"))
+ } else {
+   load(here::here("data/daa_cc_mlm.Rds"))
+}
+   
+# structural zeros 
+tab_zero = output$zero_ind
+sum(tab_zero[, 2]) + sum(tab_zero[, 3])
+# sensitivity scores 
+tab_sens = output$pseudo_sens_tab
+head(tab_sens)
+pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>%
+  filter(value >5)
+
+res_prim <- output$res
+colnames(res_prim)
+effects <- c("age_s", "condition1", "condition1:age_s")
+map(effects, function(effect) {
+  select(res_prim, 
+    taxon, 
+    lfc = glue::glue("lfc_{effect}"),
+    se = glue::glue("se_{effect}"),
+    indicator = glue::glue("diff_{effect}")) %>%
+  filter(indicator) %>%
+  mutate(effect = effect)
+})
+
+res_prim %>% select(taxon, contains("condition1")) %>%
+  arrange(q_condition1, p_condition1) %>%
+  head(15)
+
+
+# model includes random intercepts and excludes 1 year samples
+if (!file.exists(here::here("data/daa_cc_mlm_infancy.Rds"))) {
+  output <- ancombc2(
+    data = filter(tse, week != 52), 
+    assay_name = "counts", 
+    tax_level = "genus",
+    fix_formula = "condition * age_s",
+    rand_formula = "(1 | skippy_id)",
+    p_adj_method = "holm", 
+    pseudo = 0, 
+    pseudo_sens = TRUE,
+    prv_cut = 0.10, 
+    lib_cut = 1000, 
+    s0_perc = 0.05,
+    group = "condition", 
+    struc_zero = TRUE, 
+    neg_lb = TRUE,
+    alpha = 0.2, 
+    n_cl = 2, 
+    verbose = TRUE,
+    global = FALSE, 
+    pairwise = FALSE, 
+    dunnet = FALSE, 
+    trend = FALSE,
+    iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+    em_control = list(tol = 1e-5, max_iter = 100),
+    lme_control = lme4::lmerControl()
+    )
+    save(output, file = here::here("data/daa_cc_mlm_infancy.Rds"))
+ } else {
+   load(here::here("data/daa_cc_mlm_infancy.Rds"))
+}
+   
+# structural zeros 
+tab_zero = output$zero_ind
+sum(tab_zero[, 2]) + sum(tab_zero[, 3])
+# sensitivity scores 
+tab_sens = output$pseudo_sens_tab
+head(tab_sens)
+pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>%
+ filter(value >5)
+
+res_prim <- output$res
+colnames(res_prim)
+effects <- c("age_s", "condition1", "condition1:age_s")
+map(effects, function(effect) {
+ select(res_prim, 
+   taxon, 
+   lfc = glue::glue("lfc_{effect}"),
+   se = glue::glue("se_{effect}"),
+   indicator = glue::glue("diff_{effect}")) %>%
+ filter(indicator) %>%
+ mutate(effect = effect)
+})
+
+res_prim %>% select(taxon, contains("condition1")) %>%
+  arrange(p_condition1) %>%
+  head(15)
+
+# model includes 1 years samples only 
+if (!file.exists(here::here("data/daa_cc_year1.Rds"))) {
+  output <- ancombc2(
+    data = filter(tse, week == 52), 
+    assay_name = "counts", 
+    tax_level = "genus",
+    fix_formula = "condition * age_s",
+    # rand_formula = "(1 | skippy_id)",
+    p_adj_method = "holm", 
+    pseudo = 0, 
+    pseudo_sens = TRUE,
+    prv_cut = 0.10, 
+    lib_cut = 1000, 
+    s0_perc = 0.05,
+    group = "condition", 
+    struc_zero = TRUE, 
+    neg_lb = TRUE,
+    alpha = 0.2, 
+    n_cl = 2, 
+    verbose = TRUE,
+    global = FALSE, 
+    pairwise = FALSE, 
+    dunnet = FALSE, 
+    trend = FALSE,
+    iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+    em_control = list(tol = 1e-5, max_iter = 100),
+    lme_control = lme4::lmerControl()
+    )
+    save(output, file = here::here("data/daa_cc_year1.Rds"))
+ } else {
+   load(here::here("data/daa_cc_year1.Rds"))
+ }
+   
+# structural zeros 
+tab_zero = output$zero_ind
+sum(tab_zero[, 2]) + sum(tab_zero[, 3])
+ # sensitivity scores 
+tab_sens = output$pseudo_sens_tab
+head(tab_sens)
+pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>%
+   filter(value >5)
+ 
+res_prim <- output$res
+colnames(res_prim)
+effects <- c("age_s", "condition1", "condition1:age_s")
+map(effects, function(effect) {
+   select(res_prim, 
+     taxon, 
+     lfc = glue::glue("lfc_{effect}"),
+     se = glue::glue("se_{effect}"),
+     indicator = glue::glue("diff_{effect}")) %>%
+   filter(indicator) %>%
+   mutate(effect = effect)
+})
+
+
+
+
+
+######################## 1.2 Multiple imputation  #############################
+
+
+
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+# for analyses we apply prevalence fitlering
+tse <- agglomerateByRank(tse, rank = "genus")
+tse <- subsetByPrevalentTaxa(tse, detection = 0.001, prevalence = 0.1)
+
+models_imp <- map2(implist, length(implist), function(dimp, imp) {
+  tse_map <- tse
+  fvars <- c("siblings", "condition")
+  # add metadata to tse
+  colData(tse_map) <- colData(tse_map) %>%
+    as.data.frame() %>%
+    select(-siblings) %>%
+    left_join(select(dimp, age, sample_id, siblings), by = "sample_id") %>%
+    column_to_rownames("sample_id") %>%
+    mutate(across(all_of(fvars), function(x) as.factor(x))) %>%
+    DataFrame()
+  colData(tse_map)$age <- colData(tse_map)$age + as.numeric(colData(tse_map)$week) * 7
+  colData(tse_map)$age_s <- scale(colData(tse_map)$age)[, 1]
+
+
+  # model includes random intercepts and excludes 1 year samples
+  if (!file.exists(here::here(glue("data/daa_cc_mlm_all_imp{imp}2_bez.Rds")))) {
+    output <- ancombc2(
+      data = filter(tse_map), 
+      assay_name = "counts", 
+      tax_level = "genus",
+      fix_formula = "condition * age_s",
+      rand_formula = "(1 | skippy_id)",
+      p_adj_method = "holm", 
+      pseudo = 0,
+      pseudo_sens = TRUE,
+      prv_cut = 0.10,
+      lib_cut = 1000,
+      s0_perc = 0.05,
+      group = "condition",
+      struc_zero = TRUE,
+      neg_lb = TRUE,
+      alpha = 0.2,
+      n_cl = 2, 
+      verbose = TRUE,
+      global = FALSE,
+      pairwise = FALSE,
+      dunnet = FALSE,
+      trend = FALSE,
+      iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+      em_control = list(tol = 1e-5, max_iter = 100),
+      lme_control = lme4::lmerControl(),
+      mdfdr_control = list(fwer_ctrl_method = "fdr", B = 100),
+      trend_control = list(
+        contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), 
+        node = list(2), 
+        solver = "ECOS",B = 100)
+      )
+      save(output, file = here::here(glue("data/daa_cc_mlm_all_imp{imp}2_bez.Rds")))
+   } else {
+     load(here::here(glue("data/daa_cc_mlm_all_imp{imp}2_bez.Rds")))
+  }
+     
+  # structural zeros 
+  tab_zero_all = output$zero_ind
+  # sensitivity scores 
+  tab_sens_all = output$pseudo_sens_tab
+  sens_scores_all <- pivot_longer(tab_sens_all, -taxon, names_to = "contrast", values_to = "value") %>%
+     filter(value >5)
+  res_prim_all <- output$res
+  
+  effects_all <- map(effects, function(effect) {
+     select(res_prim_all, 
+       taxon, 
+       lfc = glue::glue("lfc_{effect}"),
+       se = glue::glue("se_{effect}"),
+       indicator = glue::glue("diff_{effect}")) %>%
+     filter(indicator) %>%
+     mutate(effect = effect)
+   })
+
+
+  
+  # model includes random intercepts and excludes 1 year samples
+  if (!file.exists(here::here(glue("data/daa_cc_mlm_infancy_imp{imp}_bez.Rds")))) {
+    output <- ancombc2(
+      data = filter(tse_map, week != 52), 
+      assay_name = "counts", 
+      tax_level = "genus",
+      fix_formula = "condition * age_s",
+      rand_formula = "(1 | skippy_id)",
+      p_adj_method = "holm", 
+      pseudo = 0, 
+      pseudo_sens = TRUE,
+      prv_cut = 0.10, 
+      lib_cut = 1000, 
+      s0_perc = 0.05,
+      group = "condition", 
+      struc_zero = TRUE, 
+      neg_lb = TRUE,
+      alpha = 0.2, 
+      n_cl = 2, 
+      verbose = TRUE,
+      global = FALSE, 
+      pairwise = FALSE, 
+      dunnet = FALSE, 
+      trend = FALSE,
+      iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+      em_control = list(tol = 1e-5, max_iter = 100),
+      lme_control = lme4::lmerControl(),
+      mdfdr_control = list(fwer_ctrl_method = "fdr", B = 100),
+      trend_control = list(
+        contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), 
+        node = list(2), 
+        solver = "ECOS",B = 100)
+      )
+      save(output, file = here::here(glue("data/daa_cc_mlm_infancy_imp{imp}_bez.Rds")))
+   } else {
+     load(here::here(glue("data/daa_cc_mlm_infancy_imp{imp}_bez.Rds")))
+  }
+     
+  # structural zeros 
+  tab_zero_infancy = output$zero_ind
+  # sensitivity scores 
+  tab_sens_infancy = output$pseudo_sens_tab
+  sens_scores_infancy <- pivot_longer(tab_sens_infancy, -taxon, names_to = "contrast", values_to = "value") %>%
+     filter(value >5)
+  res_prim_infancy <- output$res
+  
+  effects_infancy <- map(effects, function(effect) {
+     select(res_prim_infancy, 
+       taxon, 
+       lfc = glue::glue("lfc_{effect}"),
+       se = glue::glue("se_{effect}"),
+       indicator = glue::glue("diff_{effect}")) %>%
+     filter(indicator) %>%
+     mutate(effect = effect)
+   })
+  
+  
+  # model includes 1 years samples only 
+  if (!file.exists(here::here(glue("data/daa_cc_year1_imp{imp}_bez.Rds")))) {
+    output <- ancombc2(
+      data = filter(tse_map, week == 52), 
+      assay_name = "counts", 
+      tax_level = "genus",
+      fix_formula = "condition * age_s",
+      # rand_formula = "(1 | skippy_id)",
+      p_adj_method = "holm", 
+      pseudo = 0, 
+      pseudo_sens = TRUE,
+      prv_cut = 0.10, 
+      lib_cut = 1000, 
+      s0_perc = 0.05,
+      group = "condition", 
+      struc_zero = TRUE, 
+      neg_lb = TRUE,
+      alpha = 0.2, 
+      n_cl = 2, 
+      verbose = TRUE,
+      global = FALSE, 
+      pairwise = FALSE, 
+      dunnet = FALSE, 
+      trend = FALSE,
+      iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+      em_control = list(tol = 1e-5, max_iter = 100),
+      lme_control = lme4::lmerControl(),
+      mdfdr_control = list(fwer_ctrl_method = "fdr", B = 100),
+      trend_control = list(
+        contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), 
+        node = list(2), 
+        solver = "ECOS",B = 100)
+      )
+      save(output, file = here::here(glue("data/daa_cc_year1_imp{imp}_bez.Rds")))
+   } else {
+     load(here::here(glue("data/daa_cc_year1_imp{imp}_bez.Rds")))
+   }
+     
+  # structural zeros 
+  tab_zero_year1 = output$zero_ind
+  # sensitivity scores 
+  tab_sens_year1 = output$pseudo_sens_tab
+  sens_scores_year1 <- pivot_longer(tab_sens_year1, -taxon, names_to = "contrast", values_to = "value") %>%
+     filter(value >5)
+  res_prim_year1 <- output$res
+  effects <- c("age_s", "condition1")
+  effects_year1 <- map(effects, function(effect) {
+     select(res_prim_year1, 
+       taxon, 
+       lfc = glue::glue("lfc_{effect}"),
+       se = glue::glue("se_{effect}"),
+       indicator = glue::glue("diff_{effect}")) %>%
+     filter(indicator) %>%
+     mutate(effect = effect)
+   })
+
+  list(
+      all = list(
+        tab_zero_all,
+        sens_scores_all,
+        res_prim_all,
+        effects_all
+    ),
+    infancy = list(
+      tab_zero_infancy,
+      sens_scores_infancy,
+      res_prim_infancy,
+      effects_infancy
+    ),
+    year1 = list(
+      tab_zero_year1,
+      sens_scores_year1,
+      res_prim_year1,
+      effects_year1
+    )
+  )
+})
+
+# you must have effects defined before running the map command
+models_imp[[1]]$all$effects_all
+models_imp[[1]]$infancy$effects_infancy
+
+ancom_tables_itt <- models_imp[[1]]$all[[3]] %>% 
+  select(taxon, lfc_condition1, se_condition1, p_condition1, q_condition1) %>%
+  arrange(q_condition1, desc(abs(lfc_condition1))) %>%
+  mutate(across(where(is.numeric), round, 3))
+colnames(ancom_tables_itt) <- str_remove(colnames(ancom_tables_itt), "_condition1") 
+
+ancom_tables_itt2 <- models_imp[[1]]$all[[3]] %>% 
+  select(taxon, "lfc_condition1:age_s", "se_condition1:age_s", "p_condition1:age_s", "q_condition1:age_s") %>%
+  arrange(`q_condition1:age_s`, desc(abs(`lfc_condition1:age_s`))) %>%
+  mutate(across(where(is.numeric), round, 3))
+
+colnames(ancom_tables_itt2) <- str_remove(colnames(ancom_tables_itt2), "_condition1:age_s") 
+
+save(ancom_tables_itt, file = here::here("data/ancom_tables_itt.Rds"))
+
+ancom_tables_itt
+ancom_tables_itt2
+# ancom did not identify any as significant but effect sizes are in line
+
+
+
+
+
+
+
+t1 <- select(
+  models_imp[[1]]$all[[3]],
+  taxon,
+  lfc_age_s) %>%
+  pivot_longer(
+    contains("lfc"), 
+    names_to = "variable", 
+    values_to = "lfc", 
+    names_prefix = "lfc_")
+  
+
+t2 <- select(
+  models_imp[[1]]$all[[3]],
+  taxon,
+  p_age_s
+  ) %>%
+  pivot_longer(
+    contains("p_"), 
+    names_to = "variable", 
+    values_to = "p", 
+    names_prefix = "p_")
+t2
+t3 <- select(
+  models_imp[[1]]$all[[3]],
+  taxon,
+  q_age_s) %>%
+  pivot_longer(
+    contains("q_"), 
+    names_to = "variable", 
+    values_to = "q", 
+    names_prefix = "q_")
+t3
+
+ancombc_remaining <- full_join(t1, t2, by = c("taxon", "variable")) %>%
+  full_join(t3, by = c("taxon", "variable")) %>%
+  filter(p <= 0.05) %>%
+  mutate(
+    across(where(is.numeric), round, 3),
+    taxon = str_remove(taxon, "genus:")
+    ) %>%
+  arrange(variable, q) 
+save(ancombc_remaining, file = here::here("data/ancombc_remaining.Rds"))
+
+
+
+
+# same for early infancy samples:
+ancom_tables_itt_infancy <- models_imp[[1]]$infancy[[3]] %>% 
+  select(taxon, lfc_condition1, se_condition1, p_condition1, q_condition1) %>%
+  arrange(q_condition1, desc(abs(lfc_condition1))) %>%
+  mutate(across(where(is.numeric), round, 3))
+colnames(ancom_tables_itt_infancy) <- str_remove(colnames(ancom_tables_itt), "_condition1") 
+
+ancom_tables_itt2_infancy <- models_imp[[1]]$infancy[[3]] %>% 
+  select(taxon, "lfc_condition1:age_s", "se_condition1:age_s", "p_condition1:age_s", "q_condition1:age_s") %>%
+  arrange(`q_condition1:age_s`, desc(abs(`lfc_condition1:age_s`))) %>%
+  mutate(across(where(is.numeric), round, 3))
+
+colnames(ancom_tables_itt2_infancy) <- str_remove(colnames(ancom_tables_itt2), "_condition1:age_s") 
+
+save(ancom_tables_itt_infancy, file = here::here("data/ancom_tables_itt_infancy.Rds"))
+
+ancom_tables_itt_infancy
+
+
+
+
+t1_infancy <- select(
+  models_imp[[1]]$infancy[[3]],
+  taxon,
+  lfc_age_s) %>%
+  pivot_longer(
+    contains("lfc"), 
+    names_to = "variable", 
+    values_to = "lfc", 
+    names_prefix = "lfc_")
+
+
+t2_infancy <- select(
+  models_imp[[1]]$infancy[[3]],
+  taxon,
+  p_age_s
+) %>%
+  pivot_longer(
+    contains("p_"), 
+    names_to = "variable", 
+    values_to = "p", 
+    names_prefix = "p_")
+t2
+t3_infancy <- select(
+  models_imp[[1]]$infancy[[3]],
+  taxon,
+  q_age_s) %>%
+  pivot_longer(
+    contains("q_"), 
+    names_to = "variable", 
+    values_to = "q", 
+    names_prefix = "q_")
+t3
+
+ancombc_remaining_infancy <- full_join(t1_infancy, t2_infancy, by = c("taxon", "variable")) %>%
+  full_join(t3_infancy, by = c("taxon", "variable")) %>%
+  filter(p <= 0.05) %>%
+  mutate(
+    across(where(is.numeric), round, 3),
+    taxon = str_remove(taxon, "genus:")
+  ) %>%
+  arrange(variable, q) 
+save(ancombc_remaining_infancy, file = here::here("data/ancombc_remaining_infancy.Rds"))
+
+
+
+# same for year1 samples:
+ancom_tables_itt_year1 <- models_imp[[1]]$year1[[3]] %>% 
+  select(taxon, lfc_condition1, se_condition1, p_condition1, q_condition1) %>%
+  arrange(q_condition1, desc(abs(lfc_condition1))) %>%
+  mutate(across(where(is.numeric), round, 3))
+colnames(ancom_tables_itt_year1) <- str_remove(colnames(ancom_tables_itt), "_condition1") 
+
+ancom_tables_itt2_year1 <- models_imp[[1]]$year1[[3]] %>% 
+  select(taxon, "lfc_condition1:age_s", "se_condition1:age_s", "p_condition1:age_s", "q_condition1:age_s") %>%
+  arrange(`q_condition1:age_s`, desc(abs(`lfc_condition1:age_s`))) %>%
+  mutate(across(where(is.numeric), round, 3))
+
+colnames(ancom_tables_itt2_year1) <- str_remove(colnames(ancom_tables_itt2), "_condition1:age_s") 
+
+save(ancom_tables_itt_year1, file = here::here("data/ancom_tables_itt_year1.Rds"))
+
+ancom_tables_itt_year1
+
+
+
+
+t1_year1 <- select(
+  models_imp[[1]]$year1[[3]],
+  taxon,
+  lfc_age_s) %>%
+  pivot_longer(
+    contains("lfc"), 
+    names_to = "variable", 
+    values_to = "lfc", 
+    names_prefix = "lfc_")
+
+
+t2_year1 <- select(
+  models_imp[[1]]$year1[[3]],
+  taxon,
+  p_age_s
+) %>%
+  pivot_longer(
+    contains("p_"), 
+    names_to = "variable", 
+    values_to = "p", 
+    names_prefix = "p_")
+t2
+t3_year1 <- select(
+  models_imp[[1]]$year1[[3]],
+  taxon,
+  q_age_s) %>%
+  pivot_longer(
+    contains("q_"), 
+    names_to = "variable", 
+    values_to = "q", 
+    names_prefix = "q_")
+t3
+
+ancombc_remaining_year1 <- full_join(t1_year1, t2_year1, by = c("taxon", "variable")) %>%
+  full_join(t3_year1, by = c("taxon", "variable")) %>%
+  filter(p <= 0.05) %>%
+  mutate(
+    across(where(is.numeric), round, 3),
+    taxon = str_remove(taxon, "genus:")
+  ) %>%
+  arrange(variable, q) 
+save(ancombc_remaining_year1, file = here::here("data/ancombc_remaining_year1.Rds"))
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+###############################################################################
+#########################           2. PP        ##############################
+###############################################################################
+
+# import of biomfile and meta data can be found in the import script
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+
+# obtain ids that were selected for PP analyses 
+pp_indicator <- foreign::read.spss(here::here("data/raw_data/kelly141022/Data_ITT_PP_ExploratoryDRselections.sav"), to.data.frame = TRUE)
+pp_indicator <- select(pp_indicator, skippy_id = ID, pp = PP)
+# add pp info to existing data 
+if (!"pp" %in% colnames(d)) {
+  d <- left_join(d, pp_indicator, by = "skippy_id")
+}
+# 60 that are in PP and condition 0; 18 that are condition 1 and pp. Fits...
+d_pp <- filter(d, pp == 1)
+implist_pp <- map(implist, function(dimp) {
+  dimp_new <- left_join(dimp, pp_indicator, by = "skippy_id") %>%
+                filter(pp == 1)
+  dimp_new
+})
+
+
+# for analyses we apply prevalence fitlering
+tse <- agglomerateByRank(tse, rank = "genus")
+tse <- subsetByPrevalentTaxa(tse, detection = 0.001, prevalence = 0.1)
+fvars <- c("siblings", "condition")
+# add metadata to tse
+colData(tse) <- colData(tse) %>%
+  as.data.frame() %>%
+  select(-siblings, -ges_age, -birthweight, -edlevel, -csection, -sex) %>%
+  left_join(select(d_pp, age, sample_id, siblings, pp, csection, sex, ges_age, edlevel, birthweight), by = "sample_id") %>%
+  column_to_rownames("sample_id") %>%
+  mutate(
+    across(all_of(fvars), function(x) as.factor(x)),
+    ges_age_s = scale(ges_age)[, 1],
+    birthweight_s = scale(birthweight)[, 1],
+    edlevel_s = scale(edlevel)[, 1]
+    ) %>%
+  DataFrame()
+colData(tse)$age <- colData(tse)$age + as.numeric(colData(tse)$week) * 7
+colData(tse)$age_s <- scale(colData(tse)$age)[, 1]
+tse <- filter(tse, pp == 1)
+
+
+####################### 2.1 Complete Case Analysis ############################
+
+
+
+# model includes random intercepts and all samples 
+if (!file.exists(here::here("data/daa_pp_mlm_bez.Rds"))) {
+  output <- ancombc2(
+    data = tse, 
+    assay_name = "counts", 
+    tax_level = "genus",
+    fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex",
+    rand_formula = "(1 | skippy_id)",
+    p_adj_method = "holm", 
+    pseudo = 0, 
+    pseudo_sens = TRUE,
+    prv_cut = 0.10, 
+    lib_cut = 1000, 
+    s0_perc = 0.05,
+    group = "condition", 
+    struc_zero = TRUE, 
+    neg_lb = TRUE,
+    alpha = 0.2, 
+    n_cl = 2, 
+    verbose = TRUE,
+    global = FALSE, 
+    pairwise = FALSE, 
+    dunnet = FALSE, 
+    trend = FALSE,
+    iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+    em_control = list(tol = 1e-5, max_iter = 100),
+    lme_control = lme4::lmerControl(),
+    # mdfdr_control = list(fwer_ctrl_method = "fdr", B = 100),
+    # trend_control = list(
+    #   contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), 
+    #   node = list(2), 
+    #   solver = "ECOS",B = 100)
+    # 
+    )
+    save(output, file = here::here("data/daa_pp_mlm_bez.Rds"))
+ } else {
+   load(here::here("data/daa_pp_mlm_bez.Rds"))
+ }
+   
+# structural zeros 
+tab_zero = output$zero_ind
+sum(tab_zero[, 2]) + sum(tab_zero[, 3])
+# sensitivity scores 
+tab_sens = output$pseudo_sens_tab
+head(tab_sens)
+pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>%
+  filter(value >5)
+
+res_prim <- output$res
+colnames(res_prim)
+effects <- c("age_s", "condition1", "condition1:age_s")
+map(effects, function(effect) {
+  select(res_prim, 
+    taxon, 
+    lfc = glue::glue("lfc_{effect}"),
+    se = glue::glue("se_{effect}"),
+    indicator = glue::glue("diff_{effect}")) %>%
+  filter(indicator) %>%
+  mutate(effect = effect)
+})
+
+
+# model excludes random intercepts and includes all samples
+if (!file.exists(here::here("data/daa_pp_bez.Rds"))) {
+  output <- ancombc2(
+    data = tse, 
+    assay_name = "counts", 
+    tax_level = "genus",
+    fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex",
+    #rand_formula = "(1 | skippy_id)",
+    p_adj_method = "holm", 
+    pseudo = 0, 
+    pseudo_sens = TRUE,
+    prv_cut = 0.10, 
+    lib_cut = 1000, 
+    s0_perc = 0.05,
+    group = "condition", 
+    struc_zero = TRUE, 
+    neg_lb = TRUE,
+    alpha = 0.2, 
+    n_cl = 2, 
+    verbose = TRUE,
+    global = FALSE, 
+    pairwise = FALSE, 
+    dunnet = FALSE, 
+    trend = FALSE,
+    iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+    em_control = list(tol = 1e-5, max_iter = 100),
+    lme_control = lme4::lmerControl()
+    )
+    save(output, file = here::here("data/daa_pp_bez.Rds"))
+ } else {
+   load(here::here("data/daa_pp_bez.Rds"))
+ }
+   
+ # structural zeros 
+ tab_zero = output$zero_ind
+ sum(tab_zero[, 2]) + sum(tab_zero[, 3])
+ # sensitivity scores 
+ tab_sens = output$pseudo_sens_tab
+ head(tab_sens)
+ pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>%
+   filter(value >5)
+ 
+ res_prim <- output$res
+ colnames(res_prim)
+ effects <- c("age_s", "condition1", "condition1:age_s")
+ map(effects, function(effect) {
+   select(res_prim, 
+     taxon, 
+     lfc = glue::glue("lfc_{effect}"),
+     se = glue::glue("se_{effect}"),
+     indicator = glue::glue("diff_{effect}")) %>%
+   filter(indicator) %>%
+   mutate(effect = effect)
+ })
+
+# model includes random intercepts and excludes 1 year samples
+if (!file.exists(here::here("data/daa_pp_mlm_infancy_bez.Rds"))) {
+  output <- ancombc2(
+    data = filter(tse, week != 52), 
+    assay_name = "counts", 
+    tax_level = "genus",
+    fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex",
+    rand_formula = "(1 | skippy_id)",
+    p_adj_method = "holm", 
+    pseudo = 0, 
+    pseudo_sens = TRUE,
+    prv_cut = 0.10, 
+    lib_cut = 1000, 
+    s0_perc = 0.05,
+    group = "condition", 
+    struc_zero = TRUE, 
+    neg_lb = TRUE,
+    alpha = 0.2, 
+    n_cl = 2, 
+    verbose = TRUE,
+    global = FALSE, 
+    pairwise = FALSE, 
+    dunnet = FALSE, 
+    trend = FALSE,
+    iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+    em_control = list(tol = 1e-5, max_iter = 100),
+    lme_control = lme4::lmerControl()
+    )
+    save(output, file = here::here("data/daa_pp_mlm_infancy_bez.Rds"))
+ } else {
+   load(here::here("data/daa_pp_mlm_infancy_bez.Rds"))
+}
+   
+# structural zeros 
+tab_zero = output$zero_ind
+sum(tab_zero[, 2]) + sum(tab_zero[, 3])
+# sensitivity scores 
+tab_sens = output$pseudo_sens_tab
+head(tab_sens)
+pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>%
+ filter(value >5)
+
+res_prim <- output$res
+colnames(res_prim)
+effects <- c("age_s", "condition1", "condition1:age_s")
+map(effects, function(effect) {
+ select(res_prim, 
+   taxon, 
+   lfc = glue::glue("lfc_{effect}"),
+   se = glue::glue("se_{effect}"),
+   indicator = glue::glue("diff_{effect}")) %>%
+ filter(indicator) %>%
+ mutate(effect = effect)
+})
+
+
+# model includes 1 years samples only 
+if (!file.exists(here::here("data/daa_pp_year1_bez.Rds"))) {
+  output <- ancombc2(
+    data = filter(tse, week == 52), 
+    assay_name = "counts", 
+    tax_level = "genus",
+    fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex",
+    # rand_formula = "(1 | skippy_id)",
+    p_adj_method = "holm", 
+    pseudo = 0, 
+    pseudo_sens = TRUE,
+    prv_cut = 0.10, 
+    lib_cut = 1000, 
+    s0_perc = 0.05,
+    group = "condition", 
+    struc_zero = TRUE, 
+    neg_lb = TRUE,
+    alpha = 0.2, 
+    n_cl = 2, 
+    verbose = TRUE,
+    global = FALSE, 
+    pairwise = FALSE, 
+    dunnet = FALSE, 
+    trend = FALSE,
+    iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+    em_control = list(tol = 1e-5, max_iter = 100),
+    lme_control = lme4::lmerControl()
+    )
+    save(output, file = here::here("data/daa_pp_year1_bez.Rds"))
+ } else {
+   load(here::here("data/daa_pp_year1_bez.Rds"))
+ }
+   
+# structural zeros 
+tab_zero = output$zero_ind
+sum(tab_zero[, 2]) + sum(tab_zero[, 3])
+ # sensitivity scores 
+tab_sens = output$pseudo_sens_tab
+head(tab_sens)
+pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>%
+   filter(value >5)
+ 
+res_prim <- output$res
+colnames(res_prim)
+effects <- c("age_s", "condition1", "condition1:age_s")
+map(effects, function(effect) {
+   select(res_prim, 
+     taxon, 
+     lfc = glue::glue("lfc_{effect}"),
+     se = glue::glue("se_{effect}"),
+     indicator = glue::glue("diff_{effect}")) %>%
+   filter(indicator) %>%
+   mutate(effect = effect)
+})
+
+
+
+
+
+######################## 2.2 Multiple imputation  #############################
+
+
+
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+# for analyses we apply prevalence fitlering
+tse <- agglomerateByRank(tse, rank = "genus")
+tse <- subsetByPrevalentTaxa(tse, detection = 0.001, prevalence = 0.1)
+
+models_imp <- map2(implist_pp, 1:length(implist), function(dimp, imp) {
+  tse_map <- tse 
+  fvars <- c("siblings", "condition", "csection", "sex")
+  # add metadata to tse
+  colData(tse_map) <- colData(tse_map) %>%
+    as.data.frame() %>%
+    select(-siblings, -ges_age, -birthweight, -edlevel, -csection, -sex) %>%
+    left_join(select(dimp, age, sample_id, siblings, pp, csection, sex, ges_age_s, edlevel_s, birthweight_s), by = "sample_id") %>%
+    column_to_rownames("sample_id") %>%
+    mutate(across(all_of(fvars), function(x) as.factor(x))) %>%
+    DataFrame()
+  colData(tse_map)$age <- colData(tse_map)$age + as.numeric(colData(tse_map)$week) * 7
+  colData(tse_map)$age_s <- scale(colData(tse_map)$age)[, 1]
+  
+  tse_map <- filter(tse_map, pp == 1)
+  
+  # model includes random intercepts and excludes 1 year samples
+  if (!file.exists(here::here(glue("data/daa_pp_mlm_imp{imp}_bez.Rds")))) {
+    output <- ancombc2(
+      data = tse_map, 
+      assay_name = "counts", 
+      tax_level = "genus",
+      fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex",
+      rand_formula = "(1 | skippy_id)",
+      p_adj_method = "holm", 
+      pseudo = 0, 
+      pseudo_sens = TRUE,
+      prv_cut = 0.10, 
+      lib_cut = 1000, 
+      s0_perc = 0.05,
+      group = "condition", 
+      struc_zero = TRUE, 
+      neg_lb = TRUE,
+      alpha = 0.2, 
+      n_cl = 2, 
+      verbose = TRUE,
+      global = FALSE, 
+      pairwise = FALSE, 
+      dunnet = FALSE, 
+      trend = FALSE,
+      iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+      em_control = list(tol = 1e-5, max_iter = 100),
+      lme_control = lme4::lmerControl(),
+      mdfdr_control = list(fwer_ctrl_method = "fdr", B = 100),
+      trend_control = list(
+        contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), 
+        node = list(2), 
+        solver = "ECOS",B = 100)
+    )
+    save(output, file = here::here(glue("data/daa_pp_mlm_imp{imp}_bez.Rds")))
+  } else {
+    load(here::here(glue("data/daa_pp_mlm_imp{imp}_bez.Rds")))
+  }
+  
+  # structural zeros 
+  tab_zero_all = output$zero_ind
+  # sensitivity scores 
+  tab_sens_all = output$pseudo_sens_tab
+  sens_scores_all <- pivot_longer(tab_sens_all, -taxon, names_to = "contrast", values_to = "value") %>%
+    filter(value >5)
+  res_prim_all <- output$res
+  effects_all <- map(effects, function(effect) {
+    select(res_prim_all, 
+           taxon, 
+           lfc = glue::glue("lfc_{effect}"),
+           se = glue::glue("se_{effect}"),
+           indicator = glue::glue("diff_{effect}")) %>%
+      filter(indicator) %>%
+      mutate(effect = effect)
+  })
+  
+  
+  
+  # model includes random intercepts and excludes 1 year samples
+  if (!file.exists(here::here(glue("data/daa_pp_mlm_infancy_imp{imp}_bez.Rds")))) {
+    output <- ancombc2(
+      data = filter(tse_map, week != 52), 
+      assay_name = "counts", 
+      tax_level = "genus",
+      fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex",
+      rand_formula = "(1 | skippy_id)",
+      p_adj_method = "holm", 
+      pseudo = 0, 
+      pseudo_sens = TRUE,
+      prv_cut = 0.10, 
+      lib_cut = 1000, 
+      s0_perc = 0.05,
+      group = "condition", 
+      struc_zero = TRUE, 
+      neg_lb = TRUE,
+      alpha = 0.2, 
+      n_cl = 2, 
+      verbose = TRUE,
+      global = FALSE, 
+      pairwise = FALSE, 
+      dunnet = FALSE, 
+      trend = FALSE,
+      iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+      em_control = list(tol = 1e-5, max_iter = 100),
+      lme_control = lme4::lmerControl(),
+      mdfdr_control = list(fwer_ctrl_method = "fdr", B = 100),
+      trend_control = list(
+        contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), 
+        node = list(2), 
+        solver = "ECOS",B = 100)
+      )
+      save(output, file = here::here(glue("data/daa_pp_mlm_infancy_imp{imp}_bez.Rds")))
+   } else {
+     load(here::here(glue("data/daa_pp_mlm_infancy_imp{imp}_bez.Rds")))
+  }
+     
+  # structural zeros 
+  tab_zero_infancy = output$zero_ind
+  # sensitivity scores 
+  tab_sens_infancy = output$pseudo_sens_tab
+  sens_scores_infancy <- pivot_longer(tab_sens_infancy, -taxon, names_to = "contrast", values_to = "value") %>%
+     filter(value >5)
+  res_prim_infancy <- output$res
+  effects_infancy <- map(effects, function(effect) {
+     select(res_prim_infancy, 
+       taxon, 
+       lfc = glue::glue("lfc_{effect}"),
+       se = glue::glue("se_{effect}"),
+       indicator = glue::glue("diff_{effect}")) %>%
+     filter(indicator) %>%
+     mutate(effect = effect)
+   })
+  
+  
+  # model includes 1 years samples only 
+  if (!file.exists(here::here(glue("data/daa_pp_year1_imp{imp}_bez.Rds")))) {
+    output <- ancombc2(
+      data = filter(tse_map, week == 52), 
+      assay_name = "counts", 
+      tax_level = "genus",
+      fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex",
+      # rand_formula = "(1 | skippy_id)",
+      p_adj_method = "holm", 
+      pseudo = 0, 
+      pseudo_sens = TRUE,
+      prv_cut = 0.10, 
+      lib_cut = 1000, 
+      s0_perc = 0.05,
+      group = "condition", 
+      struc_zero = TRUE, 
+      neg_lb = TRUE,
+      alpha = 0.2, 
+      n_cl = 2, 
+      verbose = TRUE,
+      global = FALSE, 
+      pairwise = FALSE, 
+      dunnet = FALSE, 
+      trend = FALSE,
+      iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+      em_control = list(tol = 1e-5, max_iter = 100),
+      lme_control = lme4::lmerControl(),
+      mdfdr_control = list(fwer_ctrl_method = "fdr", B = 100),
+      trend_control = list(
+        contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), 
+        node = list(2), 
+        solver = "ECOS",B = 100)
+      )
+      save(output, file = here::here(glue("data/daa_pp_year1_imp{imp}_bez.Rds")))
+   } else {
+     load(here::here(glue("data/daa_pp_year1_imp{imp}_bez.Rds")))
+   }
+     
+  # structural zeros 
+  tab_zero_year1 = output$zero_ind
+  # sensitivity scores 
+  tab_sens_year1 = output$pseudo_sens_tab
+  sens_scores_year1 <- pivot_longer(tab_sens_year1, -taxon, names_to = "contrast", values_to = "value") %>%
+     filter(value >5)
+  res_prim_year1 <- output$res
+  effects_year1 <- map(effects, function(effect) {
+     select(res_prim_year1, 
+       taxon, 
+       lfc = glue::glue("lfc_{effect}"),
+       se = glue::glue("se_{effect}"),
+       indicator = glue::glue("diff_{effect}")) %>%
+     filter(indicator) %>%
+     mutate(effect = effect)
+   })
+
+  list(
+    all = list(
+      tab_zero_all,
+      sens_scores_all,
+      res_prim_all,
+      effects_all
+    ),
+    infancy = list(
+      tab_zero_infancy,
+      sens_scores_infancy,
+      res_prim_infancy,
+      effects_infancy
+    ),
+    year1 = list(
+      tab_zero_year1,
+      sens_scores_year1,
+      res_prim_year1,
+      effects_year1
+    )
+  )
+  
+})
+
+
+
+
+
+ancom_tables_pp <- models_imp[[1]]$all[[3]] %>% 
+  select(taxon, lfc_condition1, se_condition1, p_condition1, q_condition1) %>%
+  arrange(q_condition1, desc(abs(lfc_condition1))) %>%
+  mutate(across(where(is.numeric), round, 3))
+colnames(ancom_tables_pp) <- str_remove(colnames(ancom_tables_pp), "_condition1") 
+
+ancom_tables_pp2 <- models_imp[[1]]$all[[3]] %>% 
+  select(taxon, "lfc_condition1:age_s", "se_condition1:age_s", "p_condition1:age_s", "q_condition1:age_s") %>%
+  arrange(`q_condition1:age_s`, desc(abs(`lfc_condition1:age_s`))) %>%
+  mutate(across(where(is.numeric), round, 3))
+
+colnames(ancom_tables_pp2) <- str_remove(colnames(ancom_tables_pp2), "_condition1:age_s") 
+
+save(ancom_tables_pp, file = here::here("data/ancom_tables_pp.Rds"))
+
+ancom_tables_pp
+ancom_tables_pp2
+
+
+
+
+
+
+
+
+t1 <- select(
+  models_imp[[1]]$all[[3]],
+  taxon,
+  lfc_age_s) %>%
+  pivot_longer(
+    contains("lfc"), 
+    names_to = "variable", 
+    values_to = "lfc", 
+    names_prefix = "lfc_")
+
+
+t2 <- select(
+  models_imp[[1]]$all[[3]],
+  taxon,
+  p_age_s
+) %>%
+  pivot_longer(
+    contains("p_"), 
+    names_to = "variable", 
+    values_to = "p", 
+    names_prefix = "p_")
+t2
+t3 <- select(
+  models_imp[[1]]$all[[3]],
+  taxon,
+  q_age_s) %>%
+  pivot_longer(
+    contains("q_"), 
+    names_to = "variable", 
+    values_to = "q", 
+    names_prefix = "q_")
+t3
+
+ancombc_remaining_pp <- full_join(t1, t2, by = c("taxon", "variable")) %>%
+  full_join(t3, by = c("taxon", "variable")) %>%
+  filter(p <= 0.05) %>%
+  mutate(
+    across(where(is.numeric), round, 3),
+    taxon = str_remove(taxon, "genus:")
+  ) %>%
+  arrange(variable, q) 
+save(ancombc_remaining_pp, file = here::here("data/ancombc_remaining_pp.Rds"))
+
+
+
+
+# same for early infancy samples:
+ancom_tables_pp_infancy <- models_imp[[1]]$infancy[[3]] %>% 
+  select(taxon, lfc_condition1, se_condition1, p_condition1, q_condition1) %>%
+  arrange(q_condition1, desc(abs(lfc_condition1))) %>%
+  mutate(across(where(is.numeric), round, 3))
+colnames(ancom_tables_pp_infancy) <- str_remove(colnames(ancom_tables_pp), "_condition1") 
+
+ancom_tables_pp2_infancy <- models_imp[[1]]$infancy[[3]] %>% 
+  select(taxon, "lfc_condition1:age_s", "se_condition1:age_s", "p_condition1:age_s", "q_condition1:age_s") %>%
+  arrange(`q_condition1:age_s`, desc(abs(`lfc_condition1:age_s`))) %>%
+  mutate(across(where(is.numeric), round, 3))
+
+colnames(ancom_tables_pp2_infancy) <- str_remove(colnames(ancom_tables_pp2), "_condition1:age_s") 
+
+save(ancom_tables_pp_infancy, file = here::here("data/ancom_tables_pp_infancy.Rds"))
+
+ancom_tables_pp_infancy
+
+
+
+
+t1_infancy <- select(
+  models_imp[[1]]$infancy[[3]],
+  taxon,
+  lfc_age_s) %>%
+  pivot_longer(
+    contains("lfc"), 
+    names_to = "variable", 
+    values_to = "lfc", 
+    names_prefix = "lfc_")
+
+
+t2_infancy <- select(
+  models_imp[[1]]$infancy[[3]],
+  taxon,
+  p_age_s
+) %>%
+  pivot_longer(
+    contains("p_"), 
+    names_to = "variable", 
+    values_to = "p", 
+    names_prefix = "p_")
+t2
+t3_infancy <- select(
+  models_imp[[1]]$infancy[[3]],
+  taxon,
+  q_age_s) %>%
+  pivot_longer(
+    contains("q_"), 
+    names_to = "variable", 
+    values_to = "q", 
+    names_prefix = "q_")
+t3
+
+ancombc_remaining_infancy_pp <- full_join(t1_infancy, t2_infancy, by = c("taxon", "variable")) %>%
+  full_join(t3_infancy, by = c("taxon", "variable")) %>%
+  filter(p <= 0.05) %>%
+  mutate(
+    across(where(is.numeric), round, 3),
+    taxon = str_remove(taxon, "genus:")
+  ) %>%
+  arrange(variable, q) 
+save(ancombc_remaining_infancy_pp, file = here::here("data/ancombc_remaining_infancy_pp.Rds"))
+
+
+
+# same for year1 samples:
+ancom_tables_pp_year1 <- models_imp[[1]]$year1[[3]] %>% 
+  select(taxon, lfc_condition1, se_condition1, p_condition1, q_condition1) %>%
+  arrange(q_condition1, desc(abs(lfc_condition1))) %>%
+  mutate(across(where(is.numeric), round, 3))
+colnames(ancom_tables_pp_year1) <- str_remove(colnames(ancom_tables_pp), "_condition1") 
+
+ancom_tables_pp2_year1 <- models_imp[[1]]$year1[[3]] %>% 
+  select(taxon, "lfc_condition1:age_s", "se_condition1:age_s", "p_condition1:age_s", "q_condition1:age_s") %>%
+  arrange(`q_condition1:age_s`, desc(abs(`lfc_condition1:age_s`))) %>%
+  mutate(across(where(is.numeric), round, 3))
+
+colnames(ancom_tables_pp2_year1) <- str_remove(colnames(ancom_tables_pp2), "_condition1:age_s") 
+
+save(ancom_tables_pp_year1, file = here::here("data/ancom_tables_pp_year1.Rds"))
+
+ancom_tables_pp_year1
+
+
+
+
+t1_year1 <- select(
+  models_imp[[1]]$year1[[3]],
+  taxon,
+  lfc_age_s) %>%
+  pivot_longer(
+    contains("lfc"), 
+    names_to = "variable", 
+    values_to = "lfc", 
+    names_prefix = "lfc_")
+
+
+t2_year1 <- select(
+  models_imp[[1]]$year1[[3]],
+  taxon,
+  p_age_s
+) %>%
+  pivot_longer(
+    contains("p_"), 
+    names_to = "variable", 
+    values_to = "p", 
+    names_prefix = "p_")
+t2
+t3_year1 <- select(
+  models_imp[[1]]$year1[[3]],
+  taxon,
+  q_age_s) %>%
+  pivot_longer(
+    contains("q_"), 
+    names_to = "variable", 
+    values_to = "q", 
+    names_prefix = "q_")
+t3
+
+ancombc_remaining_year1_pp <- full_join(t1_year1, t2_year1, by = c("taxon", "variable")) %>%
+  full_join(t3_year1, by = c("taxon", "variable")) %>%
+  filter(p <= 0.05) %>%
+  mutate(
+    across(where(is.numeric), round, 3),
+    taxon = str_remove(taxon, "genus:")
+  ) %>%
+  arrange(variable, q) 
+save(ancombc_remaining_year1_pp, file = here::here("data/ancombc_remaining_year1_pp.Rds"))
+
+
+
+
+
+
+
+
+
+
+
+
+
+###############################################################################
+################################# PLOTS #######################################
+###############################################################################
+
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+tse <- agglomerateByRank(tse, rank = "genus")
+tse <- transformCounts(tse, method = "relabundance")
+
+asv_tab <- t(assay(tse, "relabundance")) %>% as.data.frame() %>%
+  select(all_of(c("genus:Parabacteroides", "genus:Flavonifractor", "genus:Faecalibacterium"))) %>%
+  rownames_to_column("sample_id")
+colnames(asv_tab) <- str_remove(colnames(asv_tab), "genus:")
+meta <- colData(tse) %>% as.data.frame() %>%
+  mutate(
+    condition_label = ifelse(condition == 1, "SSC", ifelse(condition == 0, "CAU", NA)),
+    week_label = glue::glue("Week {week}")
+    ) %>%
+  select(sample_id, condition_label, week_label)
+
+ds <- meta %>% full_join(asv_tab, by = "sample_id") %>%
+  pivot_longer(
+    all_of(c("Parabacteroides", "Flavonifractor", "Faecalibacterium")),
+    names_to = "Genus", values_to = "Abundance"  
+  ) %>%
+  group_by(Genus) %>%
+  nest()
+
+
+daa_plots <- map2(ds[[1]], ds[[2]], function(genus, d) {
+    p <- d %>%
+    ggplot(aes_string("condition_label", "Abundance", fill = "condition_label")) +
+      geom_boxplot(outlier.alpha = 0) +
+      #ggbeeswarm::geom_beeswarm(size = 3, cex = 1) +
+      geom_jitter(width = 0.1, size = 2.5, alpha = 0.5) +
+      scale_fill_manual(values = c("#fc8d62", "#8da0cb")) +
+      theme_bw(base_size = 25) +
+      theme(
+        legend.position = "none") +
+      xlab("") + ylab(str_to_title(genus))
+  if (genus == "Parabacteroides") {
+    p <- p + facet_wrap(~week_label, strip.position = "bottom") +
+      theme(
+        legend.position = "none",
+        strip.placement = "outside",
+        strip.background = element_blank())
+  }
+  p
+})
+daa_plots
+daa_plots <- map2(ds[[1]], ds[[2]], function(genus, d) {
+    if (genus == "Parabacteroides") {
+      df <- d 
+    } else {
+      df <- filter(d, week_label == "Week 52")
+    }
+    p <- df %>%
+    ggplot(aes_string("condition_label", "Abundance", fill = "condition_label")) +
+      geom_boxplot(outlier.alpha = 0) +
+      #ggbeeswarm::geom_beeswarm(size = 3, cex = 1) +
+      geom_jitter(width = 0.1, size = 2.5, alpha = 0.5) +
+      scale_fill_manual(values = c("#fc8d62", "#8da0cb")) +
+      theme_bw(base_size = 25) +
+      theme(
+        legend.position = "none") +
+      xlab("") + ylab(str_to_title(genus))
+  if (genus == "Parabacteroides") {
+    p <- p + facet_wrap(~week_label, strip.position = "bottom") +
+      theme(
+        legend.position = "none",
+        strip.placement = "outside",
+        strip.background = element_blank())
+  }
+  p
+})
+daa_plots[[1]]
+
+library(patchwork)
+daa_plot <-  daa_plots[[1]] /
+  (daa_plots[[2]] + daa_plots[[3]]) 
+
+daa_plot
+save(daa_plot, file = here::here("data/daa_plots.Rds"))
+
+
+
diff --git a/R/daa_linda_bez.R b/R/daa_linda_bez.R
new file mode 100644
index 0000000..c929bdd
--- /dev/null
+++ b/R/daa_linda_bez.R
@@ -0,0 +1,520 @@
+set.seed(1)
+library(mia)
+library(LinDA)
+library(tidyverse)
+library(tidySummarizedExperiment)
+library(glue)
+
+
+
+
+###############################################################################
+#########################           1. ITT       ##############################
+###############################################################################
+
+# import of biomfile and meta data can be found in the import script
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+
+
+# for analyses we apply prevalence fitlering
+tse <- agglomerateByRank(tse, rank = "genus")
+tse <- subsetByPrevalentTaxa(tse, detection = 0.001, prevalence = 0.1)
+fvars <- c("siblings", "condition")
+# add metadata to tse
+colData(tse) <- colData(tse) %>%
+  as.data.frame() %>%
+  select(-siblings) %>%
+  left_join(select(d, age, sample_id, siblings), by = "sample_id") %>%
+  column_to_rownames("sample_id") %>%
+  mutate(across(all_of(fvars), function(x) as.factor(x))) %>%
+  DataFrame()
+colData(tse)$age <- colData(tse)$age + as.numeric(colData(tse)$week) * 7
+colData(tse)$age_s <- scale(colData(tse)$age)[, 1]
+
+####################### 1.1 Complete Case Analysis ############################
+
+
+# model includes random intercepts and all samples 
+asv_tab <- as.data.frame(assay(tse, "counts"))
+meta <- colData(tse) %>% as.data.frame() %>%
+  select(skippy_id, age_s, all_of(fvars))
+head(meta)
+asv_tab <- asv_tab[, rownames(meta)]
+head(asv_tab[, 1:5])
+linda_obj <- linda(
+  asv_tab, 
+  meta, 
+  formula = '~condition * age_s + (1|skippy_id)', 
+  alpha = 0.4,
+  prev.cut = 0.1, 
+  lib.cut = 1000, 
+  winsor.quan = 0.97
+)
+
+filter(linda_obj$output$condition1, reject)
+
+
+
+
+
+# model includes random intercepts and excludes 1 year samples
+tse_infancy <- filter(tse, week != 52)
+asv_tab <- as.data.frame(assay(tse_infancy, "counts"))
+meta <- colData(tse_infancy) %>% as.data.frame() %>%
+  select(skippy_id, age_s, all_of(fvars))
+asv_tab <- asv_tab[, rownames(meta)]
+dim(na.omit(meta))
+linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + (1|skippy_id)', alpha = 0.4,
+                   prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
+# linda.plot(linda_obj, c('condition'), 
+#            titles = c('Condition: n v.s. y'), alpha = 0.4, lfc.cut = 1,
+#            legend = TRUE, directory = NULL, width = 11, height = 8)
+
+filter(linda_obj$output$condition1, reject)
+
+
+
+
+# model includes 1 years samples only 
+tse_year1 <- filter(tse, week == 52)
+asv_tab <- as.data.frame(assay(tse_year1, "counts"))
+meta <- colData(tse_year1) %>% as.data.frame() %>%
+  select(skippy_id, age_s, all_of(fvars))
+asv_tab <- asv_tab[, rownames(meta)]
+linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s', alpha = 0.4,
+                   prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
+res <- filter(linda_obj$output$condition1, reject) %>%
+  rownames_to_column("taxid")
+rd <- rowData(tse_year1) %>% 
+  as.data.frame() %>% 
+  rownames_to_column("taxid") %>%
+  filter(taxid %in% res$taxid)
+res <- left_join(res, rd, by = "taxid")
+filter(linda_obj$output$condition1, reject)
+
+
+
+
+
+######################## 1. 2Multiple imputation  #############################
+
+
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+# for analyses we apply prevalence fitlering
+tse <- agglomerateByRank(tse, rank = "genus")
+tse <- subsetByPrevalentTaxa(tse, detection = 0.001, prevalence = 0.1)
+
+models_imp <- map2(implist, 1:length(implist), function(dimp, imp) {
+  tse_map <- tse 
+  fvars <- c("siblings", "condition")
+  # add metadata to tse
+  colData(tse_map) <- colData(tse_map) %>%
+    as.data.frame() %>%
+    select(-siblings) %>%
+    left_join(select(dimp, age, sample_id, siblings), by = "sample_id") %>%
+    column_to_rownames("sample_id") %>%
+    mutate(across(all_of(fvars), function(x) as.factor(x))) %>%
+    DataFrame()
+  colData(tse_map)$age <- colData(tse_map)$age + as.numeric(colData(tse_map)$week) * 7
+  colData(tse_map)$age_s <- scale(colData(tse_map)$age)[, 1]
+  
+  # all samples
+  asv_tab <- as.data.frame(assay(tse_map, "counts"))
+  meta <- colData(tse_map) %>% as.data.frame() %>%
+    select(skippy_id, age_s, all_of(fvars))
+    asv_tab <- asv_tab[, rownames(meta)]
+
+  linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + (1|skippy_id)', alpha = 0.4,
+                     prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
+  
+
+  res_all <- linda_obj$output$condition1
+  res_all2 <- linda_obj$output[["condition1:age_s"]]
+  
+  # model includes random intercepts and excludes 1 year samples
+  tse_infancy <- filter(tse_map, week != 52)
+  asv_tab <- as.data.frame(assay(tse_infancy, "counts"))
+  meta <- colData(tse_infancy) %>% as.data.frame() %>%
+    select(skippy_id, age_s, all_of(fvars))
+  asv_tab <- asv_tab[, rownames(meta)]
+  linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + (1|skippy_id)', alpha = 0.4,
+                     prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
+  
+  res_infancy <- linda_obj$output$condition1
+  res_infancy2 <- linda_obj$output[["condition1:age_s"]]
+
+  
+  
+  # model includes 1 years samples only 
+  tse_year1 <- filter(tse_map, week == 52)
+  asv_tab <- as.data.frame(assay(tse_year1, "counts"))
+  meta <- colData(tse_year1) %>% as.data.frame() %>%
+    select(skippy_id, age_s, all_of(fvars))
+  asv_tab <- asv_tab[, rownames(meta)]
+  linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s', alpha = 0.4,
+                     prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
+
+  res_year1 <- linda_obj$output$condition1
+  res_year12 <- linda_obj$output[["condition1:age_s"]]
+
+  list(res_all, res_infancy, res_year1, res_all2, res_infancy2, res_year12)
+})
+
+
+tables_linda_itt <- map(models_imp, function(x) {
+  x[[1]] %>%
+    rownames_to_column("taxon") %>%
+    select(taxon, log2FoldChange, lfcSE, pvalue, padj) %>%
+    arrange(padj, desc(abs(log2FoldChange))) %>%
+    mutate(across(where(is.numeric), function(x) round(x, 3)))
+})
+tables_linda_itt
+
+tables_linda_itt2 <- map(models_imp, function(x) {
+  x[[4]] %>%
+    rownames_to_column("taxon") %>%
+    select(taxon, log2FoldChange, lfcSE, pvalue, padj) %>%
+    arrange(padj, desc(abs(log2FoldChange))) %>%
+    mutate(across(where(is.numeric), function(x) round(x, 3)))
+})
+tables_linda_itt2
+
+
+
+
+
+tables_linda_itt_infancy <- map(models_imp, function(x) {
+  x[[2]] %>%
+    rownames_to_column("taxon") %>%
+    select(taxon, log2FoldChange, lfcSE, pvalue, padj) %>%
+    arrange(padj, desc(abs(log2FoldChange))) %>%
+    mutate(across(where(is.numeric), function(x) round(x, 3)))
+})
+ 
+
+tables_linda_itt_year1 <- map(models_imp, function(x) {
+  x[[3]] %>%
+    rownames_to_column("taxon") %>%
+    select(taxon, log2FoldChange, lfcSE, pvalue, padj) %>%
+    arrange(padj, desc(abs(log2FoldChange))) %>%
+    mutate(across(where(is.numeric), function(x) round(x, 3)))
+})
+
+tables_linda_itt
+tables_linda_itt2
+tables_linda_itt_infancy
+tables_linda_itt_year1
+
+linda_identified <- map2_dfr(
+  list(
+    tables_linda_itt,
+    tables_linda_itt2,
+    tables_linda_itt_infancy,
+    tables_linda_itt_year1
+    ),
+  c(
+    "main effect",
+    "interaction term",
+    "early",
+    "late"
+  ),
+  function(table, term) {
+    filter(table[[1]], padj <= 0.2)  %>%
+      mutate(term = term)
+  }) %>%
+  arrange(desc(abs(log2FoldChange)), padj)
+
+# delete duplicate row
+linda_identified <- linda_identified[-4, ]
+linda_identified
+
+
+save(
+  tables_linda_itt, 
+  tables_linda_itt2, 
+  linda_identified, 
+  file = here::here("data/tables_linda_itt.Rds")
+)
+
+save(
+  tables_linda_itt_infancy, 
+  file = here::here("data/tables_linda_itt_infancy.Rds")
+)
+
+save(
+  tables_linda_itt_year1, 
+  file = here::here("data/tables_linda_itt_year1.Rds")
+) 
+
+
+
+
+
+
+
+
+
+
+
+models_imp <- map2(implist, 1:length(implist), function(dimp, imp) {
+  tse_map <- tse 
+  fvars <- c("siblings", "condition")
+  # add metadata to tse
+  colData(tse_map) <- colData(tse_map) %>%
+    as.data.frame() %>%
+    select(-siblings, -bfexcl) %>%
+    left_join(select(dimp, age, sample_id, siblings, bfexcl), by = "sample_id") %>%
+    column_to_rownames("sample_id") %>%
+    mutate(across(all_of(fvars), function(x) as.factor(x))) %>%
+    DataFrame()
+  colData(tse_map)$age <- colData(tse_map)$age + as.numeric(colData(tse_map)$week) * 7
+  colData(tse_map)$age_s <- scale(colData(tse_map)$age)[, 1]
+  
+  
+  # model includes 1 years samples only 
+  tse_year1 <- filter(tse_map, week == 52)
+  asv_tab <- as.data.frame(assay(tse_year1, "counts"))
+  meta <- colData(tse_year1) %>% as.data.frame() %>%
+    select(skippy_id, age_s, all_of(fvars), bfexcl)
+  asv_tab <- asv_tab[, rownames(meta)]
+  linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + bfexcl', alpha = 0.4,
+                     prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
+  res <- filter(linda_obj$output$condition1, reject) %>%
+    rownames_to_column("taxid")
+  if (dim(res)[1] >= 1) {
+    rd <- rowData(tse_year1) %>% 
+     as.data.frame() %>% 
+     rownames_to_column("taxid") %>%
+     filter(taxid %in% res$taxid)
+    res_year1 <- left_join(res, rd, by = "taxid") %>% 
+     mutate(time = "year1")
+  }
+  
+  out <- list()
+  if(exists("res_year1")) {
+    out[[1]] <- res_year1 
+  } 
+  out
+})
+
+
+models_imp
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+###############################################################################
+#########################           2. PP        ##############################
+###############################################################################
+
+# import of biomfile and meta data can be found in the import script
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+
+tse <- agglomerateByRank(tse, rank = "genus")
+tse <- subsetByPrevalentTaxa(tse, detection = 0.001, prevalence = 0.1)
+
+# obtain ids that were selected for PP analyses 
+pp_indicator <- foreign::read.spss(here::here("data/raw_data/kelly141022/Data_ITT_PP_ExploratoryDRselections.sav"), to.data.frame = TRUE)
+pp_indicator <- select(pp_indicator, skippy_id = ID, pp = PP)
+# add pp info to existing data 
+if (!"pp" %in% colnames(d)) {
+  d <- left_join(d, pp_indicator, by = "skippy_id")
+}
+# 60 that are in PP and condition 0; 18 that are condition 1 and pp. Fits...
+d_pp <- filter(d, pp == 1)
+implist_pp <- map(implist, function(dimp) {
+  dimp_new <- left_join(dimp, pp_indicator, by = "skippy_id") %>%
+                filter(pp == 1)
+  dimp_new
+})
+
+
+fvars <- c("siblings", "condition", "csection", "sex")
+# add metadata to tse
+colData(tse) <- colData(tse) %>%
+  as.data.frame() %>%
+  select(-siblings, -ges_age, -birthweight, -edlevel, -csection, -sex) %>%
+  left_join(select(d_pp, age, sample_id, siblings, pp, csection, sex, ges_age, edlevel, birthweight), by = "sample_id") %>%
+  column_to_rownames("sample_id") %>%
+  mutate(
+    across(all_of(fvars), function(x) as.factor(x)),
+    ges_age_s = scale(ges_age)[, 1],
+    birthweight_s = scale(birthweight)[, 1],
+    edlevel_s = scale(edlevel)[, 1]
+    ) %>%
+  DataFrame()
+colData(tse)$age <- colData(tse)$age + as.numeric(colData(tse)$week) * 7
+colData(tse)$age_s <- scale(colData(tse)$age)[, 1]
+tse <- filter(tse, pp == 1)
+
+####################### 2.1 Complete Case Analysis ############################
+
+
+# model includes random intercepts and all samples 
+asv_tab <- as.data.frame(assay(tse, "counts"))
+meta <- colData(tse) %>% as.data.frame() %>%
+  select(skippy_id, age_s, all_of(fvars), ges_age_s, birthweight_s, edlevel_s)
+asv_tab <- asv_tab[, rownames(meta)]
+linda_obj <- linda(
+  asv_tab, 
+  meta, 
+  formula = '~condition * age_s + siblings + csection + sex + ges_age_s + birthweight_s + edlevel_s + (1|skippy_id)', alpha = 0.4,
+  prev.cut = 0.1, 
+  lib.cut = 1000, 
+  winsor.quan = 0.97
+)
+
+
+filter(linda_obj$output$condition1, reject)
+
+
+fvars <- c("siblings", "condition", "csection", "sex")
+# model includes random intercepts and excludes 1 year samples
+tse_infancy <- filter(tse, week != 52)
+asv_tab <- as.data.frame(assay(tse_infancy, "counts"))
+meta <- colData(tse_infancy) %>% as.data.frame() %>%
+  select(skippy_id, age_s, all_of(fvars), ges_age_s, edlevel_s, birthweight_s)
+dim(na.omit(meta))
+asv_tab <- asv_tab[, rownames(meta)]
+linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + edlevel_s + ges_age_s + birthweight_s + sex + csection + (1|skippy_id)', alpha = 0.4,
+                   prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
+# linda.plot(linda_obj, c('condition'), 
+#            titles = c('Condition: n v.s. y'), alpha = 0.4, lfc.cut = 1,
+#            legend = TRUE, directory = NULL, width = 11, height = 8)
+
+filter(linda_obj$output$condition1, reject)
+
+
+
+
+# model includes 1 years samples only 
+tse_year1 <- filter(tse, week == 52)
+asv_tab <- as.data.frame(assay(tse_year1, "counts"))
+meta <- colData(tse_year1) %>% as.data.frame() %>%
+  select(skippy_id, age_s, all_of(fvars), ges_age_s, birthweight_s, edlevel_s)
+asv_tab <- asv_tab[, rownames(meta)]
+linda_obj <- linda(
+  asv_tab, 
+  meta, 
+  formula = '~condition * age_s + siblings + csection + sex + ges_age_s + birthweight_s + edlevel_s', alpha = 0.4,
+  prev.cut = 0.1, 
+  lib.cut = 1000, 
+  winsor.quan = 0.97
+)
+res <- filter(linda_obj$output$condition1, reject) %>%
+  rownames_to_column("taxid")
+rd <- rowData(tse_year1) %>% 
+  as.data.frame() %>% 
+  rownames_to_column("taxid") %>%
+  filter(taxid %in% res$taxid)
+res <- left_join(res, rd, by = "taxid")
+
+filter(linda_obj$output$condition1, reject)
+
+
+
+
+
+######################## 2.2 Multiple imputation  #############################
+
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+# for analyses we apply prevalence fitlering
+tse <- agglomerateByRank(tse, rank = "genus")
+tse <- subsetByPrevalentTaxa(tse, detection = 0.001, prevalence = 0.1)
+
+models_imp <- map2(implist_pp, 1:length(implist), function(dimp, imp) {
+  tse_map <- tse 
+  fvars <- c("siblings", "condition", "csection", "sex")
+  # add metadata to tse
+  colData(tse_map) <- colData(tse_map) %>%
+    as.data.frame() %>%
+    select(-siblings, -ges_age, -birthweight, -edlevel, -csection, -sex) %>%
+    left_join(select(dimp, age, sample_id, siblings, pp, csection, sex, ges_age_s, edlevel_s, birthweight_s), by = "sample_id") %>%
+    column_to_rownames("sample_id") %>%
+    mutate(across(all_of(fvars), function(x) as.factor(x))) %>%
+    DataFrame()
+  colData(tse_map)$age <- colData(tse_map)$age + as.numeric(colData(tse_map)$week) * 7
+  colData(tse_map)$age_s <- scale(colData(tse_map)$age)[, 1]
+  tse_map <- filter(tse_map, pp == 1)
+  
+  # all samples
+  asv_tab <- as.data.frame(assay(tse_map, "counts"))
+  meta <- colData(tse_map) %>% as.data.frame() %>%
+    select(skippy_id, age_s, all_of(fvars), edlevel_s, birthweight_s, ges_age_s)
+  asv_tab <- asv_tab[, rownames(meta)]
+  linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + csection + sex + ges_age_s + birthweight_s + edlevel_s + (1|skippy_id)', alpha = 0.4,
+                     prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
+  
+
+  res_all <- linda_obj$output$condition1
+  res_all2 <- linda_obj$output[["condition1:age_s"]]
+  
+  # model includes random intercepts and excludes 1 year samples
+  tse_infancy <- filter(tse_map, week != 52)
+  asv_tab <- as.data.frame(assay(tse_infancy, "counts"))
+  meta <- colData(tse_infancy) %>% as.data.frame() %>%
+    select(skippy_id, age_s, all_of(fvars), edlevel_s, birthweight_s, ges_age_s)
+  asv_tab <- asv_tab[, rownames(meta)]
+  linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + csection + sex + ges_age_s + birthweight_s + edlevel_s + (1|skippy_id)', alpha = 0.4,
+                     prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
+  
+  res_infancy <- linda_obj$output$condition1
+  res_infancy2 <- linda_obj$output[["condition1:age_s"]]
+
+  
+  
+  # model includes 1 years samples only 
+  tse_year1 <- filter(tse_map, week == 52)
+  asv_tab <- as.data.frame(assay(tse_year1, "counts"))
+  meta <- colData(tse_year1) %>% as.data.frame() %>%
+    select(skippy_id, age_s, all_of(fvars), edlevel_s, birthweight_s, ges_age_s)
+  asv_tab <- asv_tab[, rownames(meta)]
+  linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + csection + sex + ges_age_s + birthweight_s + edlevel_s', alpha = 0.4,
+                     prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
+
+  res_year1 <- linda_obj$output$condition1
+  res_year12 <- linda_obj$output[["condition1:age_s"]]
+
+  list(res_all, res_infancy, res_year1, res_all2, res_infancy2, res_year12)
+})
+
+models_imp[[1]][[1]]
+tables_linda_pp <- map(models_imp, function(x) {
+  x[[3]] %>%
+    rownames_to_column("taxon") %>%
+    select(taxon, log2FoldChange, lfcSE, pvalue, padj) %>%
+    arrange(padj, desc(abs(log2FoldChange))) %>%
+    mutate(across(where(is.numeric), function(x) round(x, 3))) 
+})
+tables_linda_pp
+
+save(tables_linda_pp, file = here::here("data/tables_linda_pp.Rds"))
+
+
+
diff --git a/R/daa_maaslin2_bez.R b/R/daa_maaslin2_bez.R
new file mode 100644
index 0000000..ad4d519
--- /dev/null
+++ b/R/daa_maaslin2_bez.R
@@ -0,0 +1,495 @@
+set.seed(1)
+library(mia)
+library(Maaslin2)
+library(tidyverse)
+library(tidySummarizedExperiment)
+library(glue)
+
+
+
+
+###############################################################################
+#########################           1. ITT       ##############################
+###############################################################################
+
+# import of biomfile and meta data can be found in the import script
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+
+
+# for analyses we apply prevalence fitlering
+tse <- agglomerateByRank(tse, rank = "genus")
+tse <- subsetByPrevalentTaxa(tse, detection = 0.001, prevalence = 0.1)
+fvars <- c("siblings", "condition")
+# add metadata to tse
+colData(tse) <- colData(tse) %>%
+  as.data.frame() %>%
+  select(-siblings) %>%
+  left_join(select(d, age, sample_id, siblings), by = "sample_id") %>%
+  column_to_rownames("sample_id") %>%
+  mutate(across(all_of(fvars), function(x) as.factor(x))) %>%
+  DataFrame()
+colData(tse)$age <- colData(tse)$age + as.numeric(colData(tse)$week) * 7
+colData(tse)$age_s <- scale(colData(tse)$age)[, 1]
+
+####################### 1.1 Complete Case Analysis ############################
+
+
+
+asv_tab <- t(assay(tse))
+meta <- colData(tse) %>% as.data.frame() %>%
+  select(skippy_id, age_s, all_of(fvars))
+asv_tab <- asv_tab[rownames(meta),]
+
+# you can specifiy different GLMs/normalizations/transforms. We used similar
+# settings as in Nearing et al. (2021) here:
+fit_data <- Maaslin2(
+  asv_tab,
+  meta,
+  output = here::here("data/maaslin/1"),
+  transform = "AST",
+  fixed_effects = c("condition", "age_s"),
+  random_effects = "skippy_id", 
+  reference = "0",  
+  normalization = "TSS",
+  standardize = FALSE,
+  min_prevalence = 0 # prev filterin already done
+)
+filter(fit_data$results, qval <= 0.5, metadata == "condition")
+
+# model includes random intercepts and excludes 1 year samples
+tse_infancy <- filter(tse, week != 52)
+asv_tab <- t(assay(tse))
+meta <- colData(tse_infancy) %>% as.data.frame() %>%
+  select(skippy_id, age_s, all_of(fvars))
+asv_tab <- asv_tab[rownames(meta), ]
+
+fit_data <- Maaslin2(
+  asv_tab,
+  meta,
+  output = here::here("data/maaslin/1"),
+  transform = "AST",
+  fixed_effects = c("condition", "age_s"),
+  random_effects = "skippy_id", 
+  reference = "0",  
+  normalization = "TSS",
+  standardize = FALSE,
+  min_prevalence = 0 # prev filterin already done
+)
+filter(fit_data$results, qval <= 0.4, metadata == "condition")
+
+
+
+
+# model includes 1 years samples only 
+tse_year1 <- filter(tse, week == 52)
+asv_tab <- t(assay(tse))
+meta <- colData(tse_year1) %>% as.data.frame() %>%
+  select(skippy_id, age_s, all_of(fvars))
+asv_tab <- asv_tab[rownames(meta), ]
+
+
+fit_data <- Maaslin2(
+  asv_tab,
+  meta,
+  output = here::here("data/maaslin/1"),
+  transform = "AST",
+  fixed_effects = c("condition", "age_s"),
+  #random_effects = "skippy_id", 
+  reference = "0",  
+  normalization = "TSS",
+  standardize = FALSE,
+  min_prevalence = 0 # prev filterin already done
+)
+filter(fit_data$results, qval <= 0.4, metadata == "condition")
+
+
+
+
+######################## 1. 2Multiple imputation  #############################
+
+
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+# for analyses we apply prevalence fitlering
+tse <- agglomerateByRank(tse, rank = "genus")
+tse <- subsetByPrevalentTaxa(tse, detection = 0.001, prevalence = 0.1)
+
+
+if (!file.exists(here::here("data/maaslin2_itt_mi_bez.Rds"))) {
+  models_imp <- map2(implist, 1:length(implist), function(dimp, imp) {
+    tse_map <- tse
+    fvars <- c("siblings", "condition")
+    # add metadata to tse
+    colData(tse_map) <- colData(tse_map) %>%
+      as.data.frame() %>%
+      select(-siblings) %>%
+      left_join(select(dimp, age, sample_id, siblings), by = "sample_id") %>%
+      column_to_rownames("sample_id") %>%
+      mutate(across(all_of(fvars), function(x) as.factor(x))) %>%
+      DataFrame()
+    colData(tse_map)$age <- colData(tse_map)$age + as.numeric(colData(tse_map)$week) * 7
+    colData(tse_map)$age_s <- scale(colData(tse_map)$age)[, 1]
+
+    # all samples
+    asv_tab <- t(assay(tse_map))
+    meta <- colData(tse_map) %>% as.data.frame() %>%
+      select(skippy_id, age_s, all_of(fvars))
+    asv_tab <- asv_tab[rownames(meta), ]
+
+    fit_data <- Maaslin2(
+      asv_tab,
+      meta,
+      output = here::here("data/maaslin/1"),
+      transform = "AST",
+      fixed_effects = c("condition", "age_s"),
+      random_effects = "skippy_id", 
+      reference = "0",  
+      normalization = "TSS",
+      standardize = FALSE,
+      min_prevalence = 0 # prev filterin already done
+    )
+    
+    res_all <- fit_data$results
+
+
+
+    
+    # model includes random intercepts and excludes 1 year samples
+    tse_infancy <- filter(tse_map, week != 52)
+    asv_tab <- t(assay(tse_infancy))
+    meta <- colData(tse_infancy) %>% as.data.frame() %>%
+      select(skippy_id, age_s, all_of(fvars))
+    asv_tab <- asv_tab[rownames(meta), ]
+
+
+    fit_data <- Maaslin2(
+      asv_tab,
+      meta,
+      output = here::here("data/maaslin/1"),
+      transform = "AST",
+      fixed_effects = c("condition", "age_s"),
+      random_effects = "skippy_id", 
+      reference = "0",  
+      normalization = "TSS",
+      standardize = FALSE,
+      min_prevalence = 0 # prev filterin already done
+    )
+
+    res_inf <- fit_data$results
+
+
+    # model includes 1 years samples only 
+    tse_year1 <- filter(tse_map, week == 52)
+    asv_tab <- t(assay(tse_year1))
+    meta <- colData(tse_year1) %>% as.data.frame() %>%
+      select(skippy_id, age_s, all_of(fvars))
+    asv_tab <- asv_tab[rownames(meta), ]
+
+    fit_data <- Maaslin2(
+      asv_tab,
+      meta,
+      output = here::here("data/maaslin/1"),
+      transform = "AST",
+      fixed_effects = c("condition", "age_s"),
+      #random_effects = "skippy_id", 
+      reference = "0",  
+      normalization = "TSS",
+      standardize = FALSE,
+      min_prevalence = 0 # prev filterin already done
+    )
+
+    res_year1 <- fit_data$results
+
+    list(res_all, res_inf, res_year1)
+  })
+  save(models_imp, file = here::here("data/maaslin2_itt_mi_bez.Rds"))
+ } else {
+  load(here::here("data/maaslin2_itt_mi_bez.Rds"))
+ }
+
+
+# change [[1]] to 2-5 to inspect the other imputations
+maaslin2_tables_itt <- map(models_imp[[1]], function(x) {
+    x %>% filter(metadata == "condition") %>%
+      select(feature, coef, stderr, pval, qval) %>%
+      arrange(qval, desc(abs(coef))) %>%
+      mutate(
+        across(where(is.numeric), round, 3),
+        feature = str_replace(feature, "\\.\\.", "."),
+        feature = str_replace(feature, "\\.", ":")
+        )
+})
+maaslin2_tables_itt
+
+# since MaAsLin does not support interactions I need to evaluate early and late infancy separately
+maaslin2_identified <- map2_dfr(maaslin2_tables_itt, c("all", "early", "late"), function(table, term) {
+  filter(table, qval <= 0.2) %>%
+  mutate(term = term) %>%
+  arrange(coef, qval)
+})
+
+maaslin2_identified
+save(maaslin2_tables_itt, maaslin2_identified, file = here::here("data/maaslin2_tables_itt.Rds"))
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+###############################################################################
+#########################           2. PP        ##############################
+###############################################################################
+
+# import of biomfile and meta data can be found in the import script
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+
+tse <- agglomerateByRank(tse, rank = "genus")
+tse <- subsetByPrevalentTaxa(tse, detection = 0.001, prevalence = 0.1)
+
+# obtain ids that were selected for PP analyses 
+pp_indicator <- foreign::read.spss(here::here("data/raw_data/kelly141022/Data_ITT_PP_ExploratoryDRselections.sav"), to.data.frame = TRUE)
+pp_indicator <- select(pp_indicator, skippy_id = ID, pp = PP)
+# add pp info to existing data 
+if (!"pp" %in% colnames(d)) {
+  d <- left_join(d, pp_indicator, by = "skippy_id")
+}
+# 60 that are in PP and condition 0; 18 that are condition 1 and pp. Fits...
+d_pp <- filter(d, pp == 1)
+implist_pp <- map(implist, function(dimp) {
+  dimp_new <- left_join(dimp, pp_indicator, by = "skippy_id") %>%
+                filter(pp == 1)
+  dimp_new
+})
+
+
+fvars <- c("siblings", "condition", "csection", "sex")
+# add metadata to tse
+colData(tse) <- colData(tse) %>%
+  as.data.frame() %>%
+  select(-siblings, -ges_age, -birthweight, -edlevel, -csection, -sex) %>%
+  left_join(select(d_pp, age, sample_id, siblings, pp, csection, sex, ges_age, edlevel, birthweight), by = "sample_id") %>%
+  column_to_rownames("sample_id") %>%
+    mutate(
+    across(all_of(fvars), function(x) as.factor(x)),
+    ges_age_s = scale(ges_age)[, 1],
+    birthweight_s = scale(birthweight)[, 1],
+    edlevel_s = scale(edlevel)[, 1]
+    ) %>%
+  DataFrame()
+colData(tse)$age <- colData(tse)$age + as.numeric(colData(tse)$week) * 7
+colData(tse)$age_s <- scale(colData(tse)$age)[, 1]
+tse <- filter(tse, pp == 1)
+
+####################### 2.1 Complete Case Analysis ############################
+
+
+# model includes random intercepts and all samples 
+asv_tab <- t(assay(tse))
+meta <- colData(tse) %>% as.data.frame() %>%
+  select(skippy_id, age_s, all_of(fvars))
+asv_tab <- asv_tab[rownames(meta), ]
+
+fit_data <- Maaslin2(
+  asv_tab,
+  meta,
+  output = here::here("data/maaslin/1"),
+  transform = "AST",
+  fixed_effects = c(
+    "condition", 
+    "siblings", 
+    "age_s",
+    "csection",
+    "sex",
+    "ges_age_s",
+    "birthweight_s",
+    "edlevel_s"
+    ),
+  random_effects = "skippy_id", 
+  reference = "0",
+  normalization = "TSS",
+  standardize = FALSE,
+  min_prevalence = 0 # prev filterin already done
+)
+filter(fit_data$results, qval <= 0.1, metadata == "condition")
+
+
+
+
+
+
+
+######################## 2.2 Multiple imputation  #############################
+
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+# for analyses we apply prevalence fitlering
+tse <- agglomerateByRank(tse, rank = "genus")
+tse <- subsetByPrevalentTaxa(tse, detection = 0.001, prevalence = 0.1)
+
+if (!file.exists(here::here("data/maaslin2_pp_mi_bez.Rds"))) {
+  models_imp <- map2(implist_pp, 1:length(implist), function(dimp, imp) {
+    tse_map <- tse 
+    fvars <- c("siblings", "condition", "csection", "sex")
+    # add metadata to tse
+    colData(tse_map) <- colData(tse_map) %>%
+      as.data.frame() %>%
+      select(-siblings, -ges_age, -birthweight, -edlevel, -csection, -sex) %>%
+      left_join(select(dimp, age, sample_id, siblings, pp, csection, sex, ges_age_s, edlevel_s, birthweight_s), by = "sample_id") %>%
+      column_to_rownames("sample_id") %>%
+      mutate(across(all_of(fvars), function(x) as.factor(x))) %>%
+      DataFrame()
+    colData(tse_map)$age <- colData(tse_map)$age + as.numeric(colData(tse_map)$week) * 7
+    colData(tse_map)$age_s <- scale(colData(tse_map)$age)[, 1]
+    tse_map <- filter(tse_map, pp == 1)
+    
+    # all samples
+    asv_tab <- t(assay(tse_map))
+    meta <- colData(tse_map) %>% as.data.frame() %>%
+      select(skippy_id, age_s, all_of(fvars), edlevel_s, birthweight_s, ges_age_s)
+    asv_tab <- asv_tab[rownames(meta), ]
+
+    fit_data <- Maaslin2(
+      asv_tab,
+      meta,
+      output = here::here("data/maaslin/1"),
+      transform = "AST",
+      fixed_effects = c(
+        "condition", 
+        "siblings", 
+        "age_s",
+        "csection",
+        "sex",
+        "ges_age_s",
+        "birthweight_s",
+        "edlevel_s"
+        ),
+      random_effects = "skippy_id", 
+      reference = "0",  
+      normalization = "TSS",
+      standardize = FALSE,
+      min_prevalence = 0 # prev filterin already done
+    )
+      
+    res_all <- fit_data$results
+
+
+    # model includes random intercepts and excludes 1 year samples
+    tse_infancy <- filter(tse_map, week != 52)
+    asv_tab <- t(assay(tse_infancy))
+    meta <- colData(tse_infancy) %>% as.data.frame() %>%
+      select(skippy_id, age_s, all_of(fvars), edlevel_s, birthweight_s, ges_age_s)
+    asv_tab <- asv_tab[rownames(meta), ]
+
+
+    fit_data <- Maaslin2(
+      asv_tab,
+      meta,
+      output = here::here("data/maaslin/1"),
+      transform = "AST",
+      fixed_effects = c(
+        "condition", 
+        "siblings", 
+        "age_s",
+        "csection",
+        "sex",
+        "ges_age_s",
+        "birthweight_s",
+        "edlevel_s"
+        ),
+      random_effects = "skippy_id", 
+      reference = "0",  
+      normalization = "TSS",
+      standardize = FALSE,
+      min_prevalence = 0 # prev filterin already done
+    )
+
+    res_inf <- fit_data$results
+    
+    
+      # model includes 1 years samples only 
+    tse_year1 <- filter(tse_map, week == 52)
+    asv_tab <- t(assay(tse_year1))
+    meta <- colData(tse_year1) %>% as.data.frame() %>%
+      select(skippy_id, age_s, all_of(fvars), edlevel_s, birthweight_s, ges_age_s)
+    asv_tab <- asv_tab[rownames(meta), ]
+
+    fit_data <- Maaslin2(
+      asv_tab,
+      meta,
+      output = here::here("data/maaslin/1"),
+      transform = "AST",
+      fixed_effects = c(
+        "condition", 
+        "siblings", 
+        "age_s",
+        "csection",
+        "sex",
+        "ges_age_s",
+        "birthweight_s",
+        "edlevel_s"
+        ),
+      #random_effects = "skippy_id", 
+      reference = "0",  
+      normalization = "TSS",
+      standardize = FALSE,
+      min_prevalence = 0 # prev filterin already done
+    )
+
+    res_year1 <- fit_data$results
+    
+    list(res_all, res_inf, res_year1)
+  })
+  save(models_imp, file = here::here("data/maaslin2_pp_mi_bez.Rds"))
+ } else {
+  load(here::here("data/maaslin2_pp_mi_bez.Rds"))
+ }
+
+
+
+
+
+# switch the 1 to 2-5 to check other imputations
+maaslin2_tables_itt_pp <- map(models_imp[[1]], function(x) {
+    x %>% filter(metadata == "condition") %>%
+      select(feature, coef, stderr, pval, qval) %>%
+      arrange(qval, desc(abs(coef))) %>%
+      mutate(
+        across(where(is.numeric), round, 3),
+        feature = str_replace(feature, "\\.\\.", "."),
+        feature = str_replace(feature, "\\.", ":")
+        )
+})
+maaslin2_tables_itt_pp
+save(maaslin2_tables_itt_pp, file = here::here("data/maaslin2_tables_itt_pp.Rds"))
\ No newline at end of file
diff --git a/R/gbm.R b/R/gbm.R
new file mode 100644
index 0000000..8e8329d
--- /dev/null
+++ b/R/gbm.R
@@ -0,0 +1,86 @@
+library(tidyverse)
+library(omixerRpm)
+
+# load the KEGG orthologue table 
+ko <- read.delim(
+    "picrust2/data/KO_metagenome_out/pred_metagenome_unstrat.tsv",
+    header = TRUE
+    )
+
+# pick most recent database 
+listDB()
+db <- loadDB(name = listDB()[1])
+head(ko)
+# calculate GBM abundance and store in df
+# threads = 16, java.mem = 16
+gbm <- rpm(x = ko, module.db = db, )
+gbm <- asDataFrame(gbm, "abundance")
+head(gbm)
+
+
+# swap ids back to the sample ids I use in the other datasets
+mapfiles <- list.files(here::here("data/map_files"), full.names = TRUE)
+id_swap <- map_dfr(mapfiles, ~readxl::read_excel(.x)) %>%
+  select(sample_id = Seq_ID, internal_sample_id) %>%
+  mutate(
+    sample_id = str_replace(sample_id, "SKIPPY_", ""),
+    sample_id = str_replace(sample_id, "_2", "_1"),
+    sample_id = str_replace(sample_id, "_52", "_3"),
+    sample_id = str_replace(sample_id, "_5", "_2"),
+    sample_id = str_replace(sample_id, "245_1_15-1-17", "245_1"),
+    sample_id = str_replace(sample_id, "245_1_22-1-17", "245_1_2"),
+    sample_id = str_replace(sample_id, "269_1_15_mei_2017", "269_1"),
+    sample_id = str_replace(sample_id, "269_1_18_mei_2017", "269_1_2")
+    )
+
+
+modules <- select(gbm, Module, Description)
+gbm <- select(gbm, -contains("MOCK"), -Description)
+colnames(gbm) <- str_remove(colnames(gbm), "X")
+gbm <- column_to_rownames(gbm, "Module") %>% 
+    t() %>% 
+    as.data.frame() %>%
+    mutate_all(function(x) as.integer(x)) %>%
+    rownames_to_column("internal_sample_id") %>%
+    left_join(id_swap, by = "internal_sample_id") %>%
+    select(-internal_sample_id)
+head(gbm)
+dim(gbm)
+
+
+# store the gbm in a tse to then perform DAA using similar script as before
+load(here::here("data/data_imp.Rds"))
+load(here::here("data/data.Rds"))
+
+fvars <- c("siblings", "condition")
+# add metadata to tse
+colData(tse) <- colData(tse) %>%
+  as.data.frame() %>%
+  select(-siblings) %>%
+  left_join(select(d, age, sample_id, siblings), by = "sample_id") %>%
+  column_to_rownames("sample_id") %>%
+  mutate(across(all_of(fvars), function(x) as.factor(x))) %>%
+  DataFrame()
+colData(tse)$age <- colData(tse)$age + as.numeric(colData(tse)$week) * 7
+colData(tse)$age_s <- scale(colData(tse)$age)[, 1]
+gbm <- gbm %>%
+    filter(sample_id %in% colnames(tse)) %>%
+    column_to_rownames("sample_id") %>%
+    t()
+
+colnames(gbm)[!colnames(gbm) %in% colnames(tse)]
+colnames(tse)[!colnames(tse) %in% colnames(gbm)]
+md <- colData(tse) %>%
+    as.data.frame() %>%
+    rownames_to_column("sample_id") %>%
+    arrange(factor(sample_id, levels = colnames(gbm))) %>%
+    mutate(sid = sample_id) %>%
+    column_to_rownames("sid") %>%
+    DataFrame()
+
+tse_pc <- TreeSummarizedExperiment(assays = gbm, colData = md)
+assayNames(tse_pc) <- "counts"
+save(tse_pc, modules, file = here::here("data/data_pc.Rds"))
+
+
+gbm %>% dim()
diff --git a/R/h_skippy_import_biom.R b/R/h_skippy_import_biom.R
new file mode 100644
index 0000000..9c62ad7
--- /dev/null
+++ b/R/h_skippy_import_biom.R
@@ -0,0 +1,386 @@
+library(tidyverse)
+library(mia)
+library(glue)
+library(tidySummarizedExperiment)
+library(miaViz)
+library(scater)
+library(readxl)
+library(lubridate)
+
+# extract skippy id and week from map files
+raw_path <- here::here("data/map_files")
+map_files <- list.files(raw_path, pattern = "2021_\\d{4}.*.xlsx$")
+map_files %>% length()
+lib_nums <- str_extract(map_files, "\\d\\d\\d\\d_00\\d\\d")
+library_number_merged <- c(1:6)
+map_file_merged <- map2_dfr(
+  map_files,
+library_number_merged,
+function(filename, lnm) {
+  # extract library number
+  lib_num <- str_extract(filename, "\\d\\d\\d\\d_00\\d\\d")
+  # read provided map file
+  xl <- read_excel(glue("{raw_path}/{filename}"))
+  map_file <- xl %>%
+    mutate(
+      LibraryNumber = lnm,
+      ProjectName = ifelse(is.na(ProjectName), "unspecified", ProjectName)
+    ) %>%
+    filter(!ProjectName == "Shime-uncooled_ileal_digesta") %>%
+    select(
+      "sample_id" = internal_sample_id,
+      id = Seq_ID,
+      ProjectName
+    ) %>%
+      filter(!is.na(`sample_id`))
+  return(map_file)
+})
+# make sample_ids unique as they are in the tse object later
+map_file_merged <- map_file_merged %>%
+  mutate(
+  namealt = glue("{sample_id}_{1:dim(map_file_merged)[1]}"),
+  sample_id = ifelse(str_detect(sample_id, "MOCK"), namealt, sample_id))
+
+
+# the following provided duplicates and need extra labeling later
+# when I create the new sample names (see further below)
+duplicates <- c(
+  "245_2_22-1-17",
+  "240_52_11-2-18",
+  "269_2_18_mei_2017",
+  "276_5_26052017"
+)
+
+# create a dataframe that has id from wur, our id, an indicator whether we have
+# 2 samples from an id
+sample_data <- str_match(map_file_merged$id, ".*(\\d\\d\\d)(_|-)(\\d+).*") %>%
+  as_tibble() %>%
+  select(id = V1, skippy_id = V2, week = V4) %>%
+  full_join(map_file_merged, by = "id")  %>%
+  filter(!is.na(sample_id)) %>%
+  mutate(duplicated = ifelse(id %in% duplicates, TRUE, FALSE)) %>%
+  select(wur_id = id, skippy_id, sample_id, week, duplicated)
+filter(sample_data, duplicated)
+
+# import the biome file to combine with above metadata
+
+# read_biom does not work with the output of NGT2 in its current version
+# we must replace certain NaNs with 0 (not relevant for our analysis)
+biom_file <- here::here("data/raw_data/skippy_non_concat_SILVA_138_rpl.biom")
+if (!file.exists(biom_file)) {
+  biom_file_old <- here::here("data/raw_data/skippy_non_concat_SILVA_138.biom")
+  biom_lines <- read_lines(biom_file_old)
+  biom_lines <- str_replace_all(biom_lines, "NaN", "0.0")
+  write_lines(
+    biom_lines,
+    here::here("data/raw_data/skippy_non_concat_SILVA_138_rpl.biom")
+  )
+}
+
+skippy <- loadFromBiom(biom_file)
+# add phylogenetic tree made with NGT2 (clustal omega)
+tree_file <- here::here(
+  "data/raw_data/skippy_non_concat_tree_files/all_otus.tree"
+)
+tree <- ape::read.tree(tree_file)
+# tree can only be added to tse but not se object
+skippy <- as(skippy, "TreeSummarizedExperiment")
+rowTree(skippy) <- tree
+# use practical names for the ranks instead
+colnames(rowData(skippy)) <- c(
+  "kingdom", "phylum", "class", "order", "family", "genus", "species"
+)
+
+# add sample ids as column
+colData(skippy)[["sample_id"]] <- rownames(colData(skippy))
+# now we can add our sample data we created above to the tse
+colData(skippy) <- colData(skippy) %>%
+  as.data.frame() %>%
+  left_join(sample_data, by = "sample_id") %>%
+  mutate(neg_control = as.factor(ifelse(is.na(week), 1, 0))) %>%
+  column_to_rownames("sample_id") %>%
+  mutate(sample_id = rownames(colData(skippy))) %>%
+  DataFrame()
+
+# we will now store MOCK samples in a separate tse
+mock_samples <- filter(skippy, str_detect(sample_id, "MOCK"))
+mock_samples <- transformSamples(mock_samples, method = "clr", pseudocount = 1)
+# check correlations between mock samples
+assay(mock_samples, "clr") %>% cor()
+
+
+# also check they cluster together closely when using ordination
+skippy <- mutate(skippy,
+  mock = as.factor(ifelse(str_detect(sample_id, "MOCK"), 1, 0))
+)
+colData(skippy)[["library_size"]] <- colSums(assay(skippy))
+# perform NMDS coordination method
+skippy <- runNMDS(
+  skippy,
+  FUN = vegan::vegdist,
+  name = "NMDS"
+)
+# plot results of a 2-component NMDS on tse,
+plotReducedDim(skippy, "NMDS", colour_by = "mock")
+plotReducedDim(skippy, "NMDS", colour_by = "library_size")
+# MOCK samples do cluster together but particularly 1 outlier is there
+# now drop the MOCK samples
+tse <- filter(skippy, !str_detect(sample_id, "MOCK"))
+# fix names of taxa 
+rowData(tse) <- rowData(tse) %>%
+  as_tibble() %>%
+  mutate(across(everything(), function(x) str_remove(x, ".*[kpcofgs]__"))) %>%
+  DataFrame()
+
+# which are the negative control samples?
+colData(tse) %>% as.data.frame() %>% filter(!week %in% c(2, 5, 52))
+# the low ls are negative controls as we can see in above table
+colData(tse) %>%
+as.data.frame() %>%
+  arrange(library_size)
+# exclude negative controls and create unique sample ids indicative of week 
+tse <- filter(tse, neg_control == 0) %>%
+  mutate(
+    sample_ids_new = glue(
+      "{skippy_id}_{ifelse(week == 2, 1, 
+        ifelse(week == 5, 2, ifelse(week == 52, 3, 'CHECK')))}{
+          ifelse(duplicated, '_2', '')}"))
+
+# we retain 331 samples at this step
+# lets change row and column names to better descriptions
+colnames(tse) <- colData(tse)[, "sample_ids_new"]
+save(tse, file = here::here("data/tse.Rds"))
+
+
+
+# store a file that lists all samples we have
+colData(tse) %>%
+  as.data.frame() %>%
+  select(sample_ids_new, skippy_id, week, wur_id) %>%
+  arrange(sample_ids_new) %>%
+  write_excel_csv(here::here("data/processed_samples.csv"))
+
+# test if we need to rarefy or not: do the methods in ad_tjazi and mia 
+# produce different results?
+tse <- estimateDiversity(
+  tse,
+  assay_name = "counts",
+  index = c("shannon", "faith", "inverse_simpson"),
+  name = c("shannon", "faith", "inverse_simpson")
+)
+
+ad_tjazi <- Tjazi::get_asymptotic_alpha(assay(tse)) %>%
+  rownames_to_column("sample_id")
+
+colData(tse) %>%
+  as.data.frame() %>%
+  select(
+    library_size,
+    sample_id = sample_ids_new,
+    shannon,
+    inverse_simpson,
+    faith) %>%
+  full_join(ad_tjazi, by = "sample_id") %>%
+  select_if(is_numeric) %>%
+  cor()
+
+  # colData(tse) %>% as.data.frame() %>%
+  #   select(library_size, sample_id = sample_ids_new, shannon, inverse_simpson, faith) %>%
+  #   full_join(ad_tjazi, by = "sample_id") %>%
+  #   select_if(is_numeric) %>%
+  #   select(-library_size, -Chao1) %>%
+  #   pivot_longer(everything(), names_to = "index", values_to = "diversity") %>%
+  #   ggplot(aes(index, diversity)) +
+  #     geom_boxplot() +
+  #     geom_jitter()
+
+# no correlation with library size + perfect correlation of shannon -->
+# we can pick any method 
+
+
+# here I check if if end up with all the samples as per documented by Kelly:
+samples <- tibble(sample_id = colnames(tse)) %>%
+  mutate(
+    week = str_match(sample_id, "_(\\d+)$")[, 2],
+    id = str_match(sample_id, "^(\\d\\d\\d)")[, 2]
+  ) %>%
+  arrange(id, week)
+doc <- read_excel(here::here("data/StoolSamples_20162017_SKIPPY.xlsx"))
+doc <- select(doc,
+  id = ID,
+  w2 = "Stoolsample week 2",
+  w5 = "Stoolsample week 5",
+  w52 = "Stoolsample week 52")  %>%
+  pivot_longer(matches("w\\d"), names_to = "week", values_to = "planned") %>%
+  mutate(
+    week = str_extract(week, "\\d+"),
+    week = ifelse(
+      week == 52, 3, ifelse(
+        week == 5, 2, ifelse(
+          week == 2, 1, NA)))
+  )
+samples$id <- as.double(samples$id)
+samples$week <- as.double(samples$week)
+full_join(samples, doc, by = c("id", "week")) %>%
+ filter(is.na(sample_id), planned == 1) %>%
+ arrange(id, week)
+missing_ids <- full_join(samples, doc, by = c("id", "week")) %>%
+  filter(is.na(sample_id), planned == 1) %>%
+  arrange(id, week) %>%
+  .$id
+missing_ids %in% samples$id
+filter(samples, id %in% missing_ids) %>%
+  arrange(id, week)
+
+
+# there is a discrepancy in the samples that should be there and that
+# are there. I will search through all map files once more but this time
+# without filtering to make sure that I didnt filter them out before
+
+raw_path <- here::here("data/map_files")
+map_files <- list.files(raw_path, pattern = "2021_\\d{4}.*.xlsx$")
+map_files %>% length()
+lib_nums <- str_extract(map_files, "\\d\\d\\d\\d_00\\d\\d")
+library_number_merged <- c(1:6)
+map_file_merged <- map2_dfr(
+  map_files,
+  library_number_merged,
+  function(filename, lnm) {
+  # extract library number
+  lib_num <- str_extract(filename, "\\d\\d\\d\\d_00\\d\\d")
+  # read provided map file
+  xl <- read_excel(glue("{raw_path}/{filename}"))
+  map_file <- xl %>%
+    mutate(
+      LibraryNumber = lnm,
+      ProjectName = ifelse(is.na(ProjectName), "unspecified", ProjectName)
+    ) %>%
+    select(
+      "sample_id" = internal_sample_id,
+      id = Seq_ID,
+      ProjectName
+    ) %>%
+      filter(!is.na(`sample_id`))
+
+    return(map_file)}) %>%
+  mutate(id2 = str_match(id, ".*(\\d\\d\\d).*")[, 2])
+filter(map_file_merged, id2 %in% missing_ids)
+
+# according to documentation all samples except 226 week 52 are correctly
+# missing as they have never been sent to WUR
+
+n_total <- length(colnames(tse))
+duplicate_ids <- colnames(tse)[map_int(colnames(tse), ~str_length(.x)) > 5]
+n_dupl <- length(colnames(tse)[map_int(colnames(tse), ~str_length(.x)) > 5])
+n_total - n_dupl
+# how many unique samples per time point?
+colData(tse) %>% 
+  as.data.frame() %>%
+  filter(!sample_ids_new %in% duplicate_ids) %>%
+  count(week)
+
+
+# after Kelly's latest prereg email I check now which of the ITT individuals 
+# are in the data
+
+library(foreign)
+sav <- read.spss(here::here("data/kelly_documents/data_itt_pp_dr.sav"), to.data.frame = TRUE) %>% select(id = ID, itt = ITT) %>%
+  filter(itt != "Excl") %>%
+  mutate(id = as.character(id))
+  
+df <- colData(tse) %>%
+  as.data.frame() %>%
+  select(wur_id, id = skippy_id, week, duplicated) %>%
+  full_join(sav, by = "id")
+ids <- filter(df, is.na(wur_id)) %>%
+  .$id
+cat(glue("{ids},"))
+filter(df, id %in% ids)
+
+df %>% filter(id %in% sav$id, !duplicated) %>% count(week)
+
+
+# after all has been checked, I will now double check that only the correct
+# ids remain in the tse + that I add any necessary variables for the project
+load(file = here::here("data/tse.Rds"))
+
+colData(tse) %>%
+  as.data.frame() %>%
+  filter(skippy_id %in% sav$id, !duplicated) %>%
+  count(week)
+tse <- filter(tse, skippy_id %in% sav$id, !duplicated)
+
+# apparently, there were several date files and this file has more complete data
+# than the mdata below for the birthdate variable:
+xl <- read_excel(here::here("data/Birthdates_SKIPPY.xlsx")) %>%
+  mutate(birthdate = dmy(GeboortedatumBaby)) %>%
+  select(id = ID, birthdate) %>%
+  distinct()
+# fix wrong year entry for id 272 (typo)
+xl[xl$id == 272, "birthdate"] <- ymd("2017-04-24")
+
+
+# import metadata
+mdata <- read_csv2(here::here("data/skippy_stool_data_cleaned.csv")) %>%
+  mutate_all(function(x) ifelse(x == 99999, NA, ifelse(x == 88888, NA, x))) %>%
+  filter(id %in% sav$id) %>%
+  left_join(xl, by = "id") %>%
+  # dates must be fixed in order to calculate age
+  mutate(across(contains("dat_"), function(x) {
+    x_new <- str_replace(x, "okt", "october")
+    x_new <- str_replace(x_new, "mrt", "march")
+    x_new <- str_replace(x_new, "mei", "may")
+    x_new <- str_replace(x_new, "15-jan-17/22-jan-17", "18/jan/17")
+    x_new <- dmy(x_new)
+    return(x_new)
+  }))
+
+# again a typo for dat_week2. Easier to fix manually in base R
+# dat_week2 = ifelse(id == 300, dmy("25-06-2017"), dmy(dat_week2)),
+mdata[mdata$id == 300, "dat_week2"] <- ymd("2017-06-25")
+
+mdata <- mutate(mdata,
+  datbirth_infant = dmy(datbirth_infant),
+  # now that dates are fine we can calculate age
+  checkold_week2 = as.numeric(dat_week2 - datbirth_infant),
+  checkold_week5 = as.numeric(dat_week5 - datbirth_infant),
+  checkold_week52 = as.numeric(dat_1year - datbirth_infant),
+  age_week2 = as.numeric(dat_week2 - birthdate),
+  age_week5 = as.numeric(dat_week5 - birthdate),
+  age_week52 = as.numeric(dat_1year - birthdate),
+  id = as.character(id)) %>%
+  select(
+    skippy_id = id, condition, csection, birthweight,
+    siblings, sex, bfexcl, contains("bf"), everything())
+
+filter(
+  mdata,
+  age_week2 != checkold_week2 |
+    age_week5 != checkold_week5 |
+      age_week52 != checkold_week52) %>%
+  select(skippy_id, birthdate, datbirth_infant)
+
+# now we add all the meta variables to the tse
+colData(tse) <- colData(tse) %>%
+  as.data.frame() %>%
+  rownames_to_column("sid") %>%
+  left_join(mdata, by = "skippy_id") %>%
+  select(-wur_id, -duplicated, -neg_control, -mock, -sample_id) %>%
+  select(everything(), sample_id = sample_ids_new) %>%
+  column_to_rownames("sid") %>%
+  DataFrame()
+
+
+# this resulting file is our starting point for the analyses
+# i will also save it with the mdata and sav as we need that for multiple
+# imputation
+save(tse, sav, mdata, file = here::here("data/data.Rds"))
+select(
+  mdata, 
+  birthdate, 
+  datbirth_infant, 
+  skippy_id, 
+  age_week2, 
+  age_week5, 
+  age_week52
+)
\ No newline at end of file
diff --git a/R/helper_functions.R b/R/helper_functions.R
new file mode 100644
index 0000000..9108a63
--- /dev/null
+++ b/R/helper_functions.R
@@ -0,0 +1,42 @@
+library(HDInterval)
+library(tidyverse)
+
+
+# calculate contrasts for brms models
+calc_contrast <- function(tb, str_vs_str = c("con0", "con1"), ctname = str_vs_str[2]) {
+  group1 <- select(tb, contains(str_vs_str[1])) %>%
+              pivot_longer(
+              everything(), 
+              names_to = "subgroups", 
+              values_to = "value"
+            ) 
+  group2 <- select(tb, contains(str_vs_str[2])) %>%
+              pivot_longer(
+              everything(), 
+              names_to = "subgroups", 
+              values_to = "value"
+            ) 
+  contrast <- group2$value - group1$value 
+  tb <- tibble(
+      name = ctname,
+      beta = median(contrast),
+      lower = hdi(contrast)[1],
+      upper = hdi(contrast)[2]) %>%
+    mutate(across(where(is.numeric), round, 2))
+  return(tb)
+}
+
+
+summarise_posterior <- function(model, parameters, n_digits = 3) {
+  as_draws_df(model) %>%
+    select(all_of(parameters)) %>%
+    pivot_longer(everything(), names_to = "parameter", values_to = "value") %>%
+    group_by(parameter) %>%
+    summarise(
+      m = median(value),
+      sd = sd(value),
+      lower = hdi(value)[1],
+      upper = hdi(value)[2]
+    ) %>%
+    mutate(across(where(is.numeric), round, n_digits))
+}
\ No newline at end of file
diff --git a/R/mb_age.R b/R/mb_age.R
new file mode 100644
index 0000000..f63234f
--- /dev/null
+++ b/R/mb_age.R
@@ -0,0 +1,1293 @@
+library(glue)
+library(mia)
+library(readxl)
+library(lubridate)
+library(here)
+library(ranger)
+library(tidyverse)
+library(brms)
+library(ggbeeswarm)
+library(ComplexHeatmap)
+library(circlize)
+
+full_join <- dplyr::full_join
+
+# double check by using two approaches.
+# APPROACH 1: combine to one TSE, then extract data:
+for (file in list.files(here("data/"), pattern = "tse_\\w+.Rds")) {
+  load(here(glue("data/{file}")))
+}
+
+lnsk <- length(rownames(tse_skippy))
+lnbib <- length(rownames(tse_bibo))
+lnbin1 <- length(rownames(tse_bingo1))
+lnbin2 <- length(rownames(tse_bingo2))
+rnskippy <- glue("x{1:lnsk}")
+rnbibo <- glue("x{(lnsk + 1): (lnsk +lnbib)}")
+rnbingo1 <- glue("x{(lnsk + lnbib + 1): (lnsk +lnbib + lnbin1)}")
+rnbingo2 <- glue("x{(lnsk + lnbib + lnbin1 + 1): (lnsk +lnbib + lnbin1 + 
+  lnbin2)}")
+
+tses <- map2(
+  list(rnskippy, rnbibo, rnbingo1, rnbingo2),
+  list(tse_skippy, tse_bibo, tse_bingo1, tse_bingo2),
+  function(rn, tse) {
+    rownames(tse) <- rn
+    tse
+  }
+)
+
+tses[[1]] %>% colnames()
+skbib <- mergeSEs(tses[[1]], tses[[2]], missing_values = 0)
+bin <- mergeSEs(tses[[3]], tses[[4]], missing_values = 0)
+assay(bin)
+tse <- mergeSEs(skbib, bin, missing_values = 0)
+rowData(tse) %>% dim()
+atest <- assay(tse)
+
+plot(density(colSums(atest)))
+# now we need to replace "g__", "f__" etc. by NA in order to aggregate correctly
+rowData(tse) <- rowData(tse) %>%
+  as_tibble() %>%
+  mutate(across(everything(), function(x) str_remove(x, ".*[kpcofgs]__"))) %>%
+  DataFrame()
+
+test <- agglomerateByRank(tse, rank = "Genus")
+plot(density(colSums(assay(tse))))
+plot(density(assay(test)[1, ]))
+
+# APPROACH 2: extract data from each tse and then combine
+tses <- map(
+  list(tse_skippy, tse_bibo, tse_bingo1, tse_bingo2),
+  function(tse) {
+    new_tse <- tse
+    rowData(new_tse) <- rowData(new_tse) %>%
+      as_tibble() %>%
+      mutate(
+        across(everything(),
+        function(x) str_remove(x, ".*[kpcofgs]__"))) %>%
+      DataFrame()
+    new_tse <- agglomerateByRank(new_tse, rank = "genus")
+    new_tse <- transformSamples(new_tse, method = "relabundance")
+    as.data.frame(assay(new_tse, "relabundance")) %>% rownames_to_column("taxon")
+  }
+)
+
+# we can see that there are different numbers of genera in each df
+map(tses, ~dim(.x))
+df <- full_join(tses[[1]], tses[[2]], by = "taxon") %>%
+  full_join(tses[[3]], by = "taxon") %>%
+  full_join(tses[[4]], by = "taxon") %>%
+  mutate_all(function(x) ifelse(is.na(x), 0, x))
+
+
+# Plots are identical (checked them with the counts but now we need relab)
+# the next step is to get the age at sample collection for each study
+
+# SKIPPY
+load(file = here::here("data/data_imp.Rds"))
+skage <- select(d, skippy_id, sample_id, week, age) %>%
+  mutate(age = age + as.numeric(as.character(week)) * 7) %>%
+  select(sample_id, age)
+mapfiles <- list.files(here::here("data/map_files"), full.names = TRUE)
+id_swap <- map_dfr(mapfiles, ~read_excel(.x)) %>%
+  select(sample_id = Seq_ID, internal_sample_id) %>%
+  mutate(
+    sample_id = str_replace(sample_id, "SKIPPY_", ""),
+    sample_id = str_replace(sample_id, "_2", "_1"),
+    sample_id = str_replace(sample_id, "_52", "_3"),
+    sample_id = str_replace(sample_id, "_5", "_2"),
+    sample_id = str_replace(sample_id, "245_1_15-1-17", "245_5"),
+    sample_id = str_replace(sample_id, "245_1_22-1-17", "245_5_2"),
+    sample_id = str_replace(sample_id, "269_1_15_mei_2017", "269_5"),
+    sample_id = str_replace(sample_id, "269_1_18_mei_2017", "269_5_2")
+    )
+
+id_swap %>% filter(str_detect(sample_id, "269_"))
+skage <- full_join(skage, id_swap, by = "sample_id") %>%
+  select(sample_id = internal_sample_id, age) %>%
+  na.omit()
+mean(skage$sample_id %in% id_swap$sample_id)
+
+
+# BINGO
+load(here::here("data/tse_bingo1.Rds"))
+bin_age0 <- colData(tse_bingo1) %>%
+  as.data.frame() %>%
+  mutate(
+    # id = str_extract(id, "\\d\\d\\d"),
+    time = ifelse(
+      str_detect(sample_id, "k1"), "week2", ifelse(
+        str_detect(sample_id, "k2"), "week6", ifelse(
+          str_detect(sample_id, "k3"), "week12", NA
+        )
+      )
+    )
+  ) %>%
+  select(sample_id, age)
+
+
+bin_age1 <- read_excel(here("data/BINGO1y_ActualAge.xlsx")) %>%
+  select(id = FamilyID, age = "1y-actual age_corrected in days") %>%
+  filter(!is.na(age))
+
+# we need to add the correct sample ids to the file
+map_files <- c(
+  "2019_11_06_Sequencing sample collection_2019_0084.xlsx",
+  "2019_11_06_Sequencing sample collection_2019_0085.xlsx",
+  "2020_01_14_Sequencing sample collection_2020_0005.xlsx"
+)
+bingo_map <- map_dfr(map_files, function(mapfile) {
+  xl <- read_excel(glue(here("data/bingo_mapfiles/{mapfile}"))) %>%
+    select(internal_sample_id, ProjectName, Seq_ID) %>%
+    filter(ProjectName == "BINGO")
+})
+
+bingo_map_1 <- filter(bingo_map, str_detect(Seq_ID, "1_")) %>%
+  mutate(id = str_pad(str_remove(Seq_ID, "1_"), 3, side = "left", "0")) %>%
+  select(sample_id = internal_sample_id, id)
+bin_age1 <- full_join(bin_age1, bingo_map_1, by = "id") %>%
+  select(-id)
+
+bin_age2 <- read_csv(here("data/bingo_age_1_3_years.csv")) %>%
+  select(
+    ID,
+    birthdate = "Geboortedatum baby",
+    coldate1 = "Poop collection tube arrives",
+    coldate2 = "Poop collection date"
+  ) %>%
+  filter(!is.na(ID), ID != "Test") %>%
+  mutate(
+    coldate2 = str_remove(coldate2, "^\\w+,\\s"),
+    coldate1 = mdy(coldate1),
+    coldate2 = mdy(coldate2),
+    birthdate = dmy(birthdate),
+    age = ifelse(
+      is.na(coldate2),
+      coldate1 - birthdate,
+      coldate2 - birthdate)
+  ) %>%
+  select(id = ID, age)
+
+bingo_map_2 <- filter(bingo_map, str_detect(Seq_ID, "3_")) %>%
+  mutate(id = str_pad(str_remove(Seq_ID, "3_"), 3, side = "left", "0")) %>%
+  select(sample_id = internal_sample_id, id)
+bin_age2 <- full_join(bin_age2, bingo_map_2, by = "id") %>%
+  select(-id)
+
+bin_age <- bind_rows(bin_age0, bin_age1, bin_age2) %>%
+  na.omit()
+bin_age
+
+
+
+# BIBO
+# first add proper sample ids
+raw_path <- here::here("data/bibo_mapfiles/")
+map_files <- list.files(raw_path, pattern = ".xlsx$")
+lib_nums <- str_extract(map_files, "\\d\\d\\d\\d_00\\d\\d")
+library_number_merged <- seq_along(map_files)
+map_file_merged <- map2_dfr(
+  map_files,
+  library_number_merged,
+  function(filename, lnm) {
+  # extract library number
+  lib_num <- str_extract(filename, "\\d\\d\\d\\d_00\\d\\d")
+
+  # read provided map file
+  xl <- read_excel(glue("{raw_path}/{filename}"))
+
+  # create new map file for NGtax2.0
+  map_file <- xl %>%
+    filter(
+      !grepl("L\\d\\d_NC_\\d", `Seq_ID`),
+      Seq_ID != "empty",
+      ProjectName == "BIBO"
+    ) %>%
+    select(sample_id = internal_sample_id, Seq_ID)
+
+  return(map_file)
+})
+
+#---code book: a, 1m; b, 3m; c, 4m; d, 6y; e, 10y.
+bibo_map <- map_file_merged %>%
+  mutate(
+    id = str_extract(Seq_ID, "\\d+"),
+    time = str_extract(Seq_ID, "\\w{1}"),
+    time = ifelse(time == "a", 28,
+            ifelse(time == "b", 75, ifelse(time == "c", 105, NA))),
+    id = as.numeric(id),
+    time = as.numeric(time)
+  ) %>%
+  filter(time %in% c(28, 75, 105))
+bibo_age_path <- "data/bibo_age_per_sample_infancy.xlsx"
+bibo_age <- read_excel(bibo_age_path) %>%
+  select(
+    id = ID,
+    day28 = "28 days",
+    day75 = "CC -2 days",
+    day105 = "CC+28 days"
+  ) %>%
+  mutate(id = as.integer(id)) %>%
+  filter(!is.na(id)) %>%
+  pivot_longer(contains("day"), names_to = "time", values_to = "age") %>%
+  mutate(
+    id = as.numeric(id),
+    time = as.numeric(str_extract(time, "\\d+"))
+  )
+
+bibo_age <- dplyr::full_join(bibo_age, bibo_map, by = c("id", "time")) %>%
+  select(sample_id, age) %>%
+  filter(!is.na(sample_id))
+
+
+bibo_age
+
+
+# now we got all ages. We need to create a df that has sample ids as rownames
+# and genera as colnames + age.
+
+df <- full_join(tses[[1]], tses[[2]], by = "taxon") %>%
+  full_join(tses[[3]], by = "taxon") %>%
+  full_join(tses[[4]], by = "taxon") %>%
+  mutate_all(function(x) ifelse(is.na(x), 0, x)) %>%
+  column_to_rownames("taxon")
+colnames(df)
+# first we do prevalence filtering.
+prev <- 0.1
+# for that level we will loose % of all genera -->
+mean(rowSums(df[, -1]) <= 0.1)
+df <- df[c(rowSums(df[, -1]) > 0.1), ]
+df <- as.data.frame(t(df)) %>%
+  rownames_to_column("sample_id")
+
+
+
+
+
+
+
+sample_ages <- bind_rows(bin_age, bibo_age, skage) %>% na.omit()
+filter(sample_ages) %>%
+  ggplot(aes(age)) +
+    geom_histogram(bins = 25)
+
+
+d <- full_join(df, sample_ages, by = "sample_id") %>%
+  na.omit()
+
+
+# now we can train the model
+colnames(d) <- make.names(colnames(d))
+# in the paper they use ntree = 10k and mtry = p/3
+p <- ((ncol(d) - 2) / 3) %>% round()
+# i exclude the skippy samples for training
+traindata <- filter(d, !sample_id %in% skage$sample_id)
+# also leave out samples that are far above the age we work with here.
+traindata <- filter(traindata, age <= 1000)
+testdata <- filter(d, sample_id %in% skage$sample_id)
+model <- ranger(
+  formula = age ~ .,
+  data = select(traindata, -sample_id),
+  num.trees = 1e4,
+  mtry = p,
+  importance = "permutation"
+)
+model
+
+# calculate correlation of predictions and actual values
+pred <- predict(model, data = select(testdata, -sample_id))
+testdata$pred <- pred$predictions
+r <- cor.test(testdata$pred, testdata$age)$estimate
+r
+source(here::here("R/ml_helper.R"))
+
+if (!file.exists(here::here("data/rvalues.Rds"))) {
+  rvalues <- rf_null(
+    y = "age",
+    features = select(traindata, -sample_id, -age) %>% colnames(),
+    train = traindata,
+    test = testdata,
+    n_perm = 500,
+    ntree = 500
+  )
+  save(rvalues, file = here::here("data/rvalues.Rds"))
+ } else {
+  load(here::here("data/rvalues.Rds"))
+}
+
+# pvalue
+mean(rvalues > r)
+
+
+out <- select(testdata, internal_sample_id = sample_id, pred) %>%
+  full_join(id_swap, by = "internal_sample_id") %>%
+  filter(!is.na(pred)) %>%
+  select(sample_id, pred)
+out
+
+
+
+# now we have the data to calculate the microbiota age and MAZ
+
+# calculate microbiota for age z score
+# first we need the median microbiota age of healthy children that are in the
+# same age (same month). I.E. month 1 (2 weeks), month 2 (5 weeks) and month 12
+# (52 weeks).
+
+traindata$mbage <- predict(model, data = select(traindata, -sample_id))$predictions
+median_mbage <- select(traindata, sample_id, age, mbage) %>%
+  mutate(month = as.integer(age / (365 / 12))) %>%
+  group_by(month) %>%
+  summarise(md = median(mbage), sd = sd(mbage), n = n())
+median_mbage
+age_range <- ggplot(traindata, aes(age)) +
+  geom_histogram(bins = 50, color = "white", fill = "#606060") +
+  theme_bw(base_size = 25) +
+  scale_x_continuous(breaks = seq(0, 600, 100)) +
+  scale_y_continuous(breaks = seq(0, 110, 10)) +
+  xlab("Age") + ylab("n")
+age_range
+
+
+# the formula is: microbiota age - median microbiota age of healthy children
+# of same echronological age / sd of the healthy childrens mage
+maz <- select(testdata, internal_sample_id = sample_id, age, mbage = pred) %>%
+  left_join(id_swap, by = "internal_sample_id") %>%
+  select(-internal_sample_id) %>%
+  mutate(
+    month = as.integer(age / (365 / 12))
+  )
+
+
+maz$maz <- NA
+
+for (i in seq_along(maz$mbage)) {
+  maz$maz[i] <- as.numeric((maz$mbage[i] -
+            median_mbage[median_mbage$month == maz$month[i], "md"]) /
+              median_mbage[median_mbage$month == maz$month[i], "sd"])
+}
+
+# for analyses it seems better to use microbiota age rather than MAZ because to calculate 
+# the sd, more coverage in certain months would have been needed. However, including age in
+# the regression model is comparable in meaning and works fine.
+
+
+
+
+
+
+
+###############################################################################
+#########################           1. ITT       ##############################
+###############################################################################
+
+
+
+####################### 1.1 Complete Case Analysis ############################
+
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+
+# for analyses we apply prevalence fitlering and analyze at genus level
+fvars <- c("siblings", "condition")
+# add metadata to tse
+d <- colData(tse) %>%
+  as.data.frame() %>%
+  select(-siblings) %>%
+  left_join(select(d, age, sample_id, siblings), by = "sample_id") %>%
+  mutate(across(all_of(fvars), function(x) as.factor(x))) 
+d$age <- d$age + as.numeric(d$week) * 7
+d$age <- scale(d$age)[, 1]
+
+if(!file.exists(here::here("data/mbagedataa.Rds"))) {
+  d_cc <- full_join(select(d, -age), maz, by = "sample_id") %>%
+  mutate(
+    condition_label = ifelse(condition == 0, "CAU", ifelse(
+      condition == 1, "SSC", NA)),
+    week_label = glue::glue("Week {week}")
+  ) %>% 
+  group_by(week) %>%
+  mutate(
+    maz_s = scale(maz)[, 1],
+    maz_c = maz - median(maz, na.rm = TRUE)
+    ) %>%
+  ungroup()
+  save(d_cc, file = here::here("data/mbagedataa.Rds"))
+ } else {
+  load(here::here("data/mbagedataa.Rds"))
+}
+
+# create plot
+mba_plot <- ggplot(d_cc, aes(condition_label, mbage, fill = condition_label)) +
+  geom_boxplot(outlier.alpha = 0) +
+  #geom_beeswarm(alpha = 0.4) + 
+  geom_jitter(width = 0.1, size = 2) + 
+  #stat_summary(fun.y=mean, geom="point", shape=20, size=14, color="red", fill="red") +
+  facet_wrap(~week_label, strip.position = "bottom") +
+  scale_fill_manual(values = c("#ffffff", "#c0c1c2")) +
+  theme_bw(base_size = 25) +
+  theme(
+    legend.position = "none",
+    strip.placement = "outside",
+    strip.background = element_blank(),
+    strip.text.x = element_text(size = 20)
+    ) +
+  xlab("") + ylab("Microbiota Age")
+mba_plot
+save(mba_plot, age_range, file = here::here("data/mba_out.Rds"))
+
+# how is bfexcl distruted
+d_cc$bfexcl
+dim(d_cc)
+select(d_cc, skippy_id, bfexcl) %>% arrange(skippy_id) %>%
+  distinct() %>%
+  ggplot(aes(bfexcl)) +
+  geom_histogram()
+
+
+
+# check optimal model structure and then fit the models
+coefs <- c(
+  "csection", 
+  "birthweight_s", 
+  "siblings",
+  "sex",
+  "apgar_5_s",
+  "ges_age_s",
+  "edlevel_s"
+)
+
+
+loo_comp <- map_dfr(coefs, function(coef) {
+  map_dfr(1:5, function(i) {
+    map_dfr(list(c(2, 5), 52), function(n_week) {
+      if (52 %in% n_week) {
+        # base model 
+        f1 <- bf(mbage ~ age * condition)
+        m1 <- brm(
+          data = filter(implist[[i]], week == 52),
+          formula = f1,
+          file = here::here(glue("data/m1_age_imp{i}_{n_week[1]}_mbage.Rds"))
+        )
+        loo_m1 <- add_criterion(
+          m1,
+          "loo",
+          file = here::here(glue("data/loo_m1_age_imp{i}_{n_week[1]}_mbage")),
+          moment_match = FALSE
+        )
+
+        f2 <- bf(glue("mbage ~ age * condition + {coef}"))
+        m2 <- brm(
+          data = filter(implist[[i]], week == 52),
+          formula = f2,
+          file = here::here(glue("data/m2_age_imp{i}_{coef}_{n_week[1]}_mbage.Rds"))
+        )
+
+        loo_m2 <- add_criterion(
+          m2,
+          "loo",
+          file = here::here(glue("data/loo_m2_age_imp{i}_{coef}_{n_week[1]}_mbage")),
+          moment_match = FALSE
+        )
+        lcomp <- loo_compare(loo_m2, loo_m1)
+        score <- ifelse(rownames(lcomp)[1] == "loo_m1", 0, 1)
+      } else {
+        # base model 
+        f1 <- bf(mbage ~ age * condition + (1|skippy_id))
+        m1 <- brm(
+          data = filter(implist[[i]], week != 52),
+          formula = f1,
+          file = here::here(glue("data/m1_age_imp{i}_{n_week[1]}_mbage.Rds"))
+        )
+        loo_m1 <- add_criterion(
+          m1,
+          "loo",
+          file = here::here(glue("data/loo_m1_age_imp{i}_{n_week[1]}_mbage")),
+          moment_match = FALSE
+        )
+
+        f2 <- bf(glue("mbage ~ age * condition + {coef} + (1|skippy_id)"))
+        m2 <- brm(
+          data = filter(implist[[i]], week != 52),
+          formula = f2,
+          file = here::here(glue("data/m2_age_imp{i}_{coef}_{n_week[1]}_mbage.Rds"))
+        )
+
+        loo_m2 <- add_criterion(
+          m2,
+          "loo",
+          file = here::here(glue("data/loo_m2_age_imp{i}_{coef}_{n_week[1]}_mbage")),
+          moment_match = FALSE
+        )
+        lcomp <- loo_compare(loo_m2, loo_m1)
+        score <- ifelse(rownames(lcomp)[1] == "loo_m1", 0, 1)
+      }
+      tibble(
+        model = ifelse(n_week == 52, "1year", "2 and 5 weeks"),
+        coef = coef,
+        imp = i,
+        score = score
+      )      
+    })
+  })
+})
+
+group_by(loo_comp, coef, model) %>%
+  summarise(ss = sum(score))
+
+
+
+
+
+
+
+
+
+
+formula <- bf(
+  mbage ~ condition * age + siblings + sex + csection + (1 | skippy_id)
+)
+model_infancy_cc <- brm(
+  family = student(),
+  formula = formula,
+  data = filter(d_cc, week != 52),
+  file = here::here("data/mbage_infancy_cc_cov")
+)
+
+formula <- bf(mbage ~ age + condition + siblings + sex + csection)
+model_year1_cc <- brm(
+  family = student(),
+  formula = formula,
+  data = filter(d_cc, week == 52),
+  file = here::here("data/mbage_year1_cc_cov")
+)
+
+formula <- bf(mbage ~ condition * age + siblings + sex + csection)
+model_all_cc <- brm(
+  family = student(),
+  formula = formula,
+  data = d_cc,
+  file = here::here("data/mbage_all_cc_cov")
+)
+
+summary(model_infancy_cc)
+summary(model_year1_cc)
+summary(model_all_cc)
+
+# sensitivity analyses (no covs)
+formula <- bf(mbage ~ age + condition)
+model_year1_cc_bez <- brm(
+  family = student(),
+  formula = formula,
+  data = filter(d_cc, week == 52),
+  file = here::here("data/mbage_year1_cc_bezcov")
+)
+summary(model_year1_cc_bez)
+
+
+
+
+
+
+######################## 1.2 Multiple imputation  #############################
+
+
+implist <- map(implist, function(imp) {
+  imp_new <- imp %>%
+    left_join(
+      select(d_cc, sample_id, mbage),
+      by = "sample_id") 
+  mice::complete(mice::mice(imp_new))
+})
+
+
+# fit model
+formula <- bf(
+  mbage ~ age * condition + siblings + sex + csection + (1 | skippy_id)
+)
+model_infancy <- brm_multiple(
+  family = student(),
+  formula = formula,
+  data = map(implist, ~filter(.x, week != 52)),
+  file = here::here("data/mbage_infancy_cov")
+)
+
+formula <- bf(
+  mbage ~ age * condition + siblings + sex + csection + bfexcl + (1 | skippy_id)
+)
+model_infancy_bf <- brm_multiple(
+  family = student(),
+  formula = formula,
+  data = map(implist, ~filter(.x, week != 52)),
+  file = here::here("data/mbage_infancy_cov_bf")
+)
+
+formula <- bf(mbage ~ age + condition + siblings + sex + csection)
+model_year1 <- brm_multiple(
+  family = student(),
+  formula = formula,
+  data = map(implist, ~filter(.x, week == 52)),
+  file = here::here("data/mbage_year1_cov")
+)
+
+post <- posterior_samples(model_year1)
+HDInterval::hdi(post$b_condition1, prob = 0.95)
+mean(post$b_condition1 < 0)
+summary(model_year1)
+
+# sensitivity analyses without covs
+formula <- bf(mbage ~ age + condition)
+model_year1_bez <- brm_multiple(
+  family = student(),
+  formula = formula,
+  data = map(implist, ~filter(.x, week == 52)),
+  file = here::here("data/mbage_year1_bez")
+)
+
+post <- posterior_samples(model_year1_bez)
+HDInterval::hdi(post$b_condition1, prob = 0.95)
+mean(post$b_condition1 < 0)
+
+
+formula <- bf(mbage ~ age * condition + siblings + sex + csection + (1 | skippy_id))
+model_all <- brm_multiple(
+  family = student(),
+  formula = formula,
+  data = implist,
+  file = here::here("data/mbage_all_cov")
+)
+
+summary(model_infancy)
+summary(model_year1)
+summary(model_all)
+
+formula <- bf(mbage ~ age + condition + siblings + sex + csection + bfexcl)
+model_year1_bf <- brm_multiple(
+  family = student(),
+  formula = formula,
+  data = map(implist, ~filter(.x, week == 52)),
+  file = here::here("data/mbage_year1_cov_bf")
+)
+summary(model_year1_bf)
+post <- posterior_samples(model_year1_bf)
+HDInterval::hdi(post$b_condition1, prob = 0.95)
+mean(post$b_condition1 < 0)
+HDInterval::hdi(post$b_bfexcl, prob = 0.95)
+mean(post$b_bfexcl < 0)
+implist[[1]]$bfexcl
+
+
+formula <- bf(mbage ~ age + condition + siblings + sex + csection + weaning)
+model_year1_weaning <- brm_multiple(
+  family = student(),
+  formula = formula,
+  data = map(implist, ~filter(.x, week == 52)),
+  file = here::here("data/mbage_year1_cov_w")
+)
+summary(model_year1_weaning)
+
+
+implist_w <- map(implist, function(d) {
+  d %>% mutate(months_bf_weaning = weaning - bfexcl)
+})
+formula <- bf(mbage ~ age + condition + siblings + sex + csection + bfexcl + months_bf_weaning)
+model_year1_weaning <- brm_multiple(
+  family = student(),
+  formula = formula,
+  data = map(implist_w, ~filter(.x, week == 52)),
+  file = here::here("data/mbage_year1_cov_months_bf_weaning")
+)
+summary(model_year1_weaning)
+
+
+save(model_infancy, model_infancy_bf, model_year1, model_all, model_year1_bf, file = here::here("data/mbage_models_itt.Rds"))
+
+
+
+
+
+
+###############################################################################
+#########################           2. PP        ##############################
+###############################################################################
+
+fvars <- c("siblings", "condition", "csection", "sex")
+# obtain ids that were selected for PP analyses 
+pp_indicator <- foreign::read.spss(here::here("data/raw_data/kelly141022/Data_ITT_PP_ExploratoryDRselections.sav"), to.data.frame = TRUE)
+pp_indicator <- select(pp_indicator, skippy_id = ID, pp = PP) %>%
+  mutate(skippy_id = as.character(skippy_id))
+# add pp info to existing data 
+if (!"pp" %in% colnames(d_cc)) {
+  d_cc <- left_join(d_cc, pp_indicator, by = "skippy_id")
+}
+d_pp <- filter(d_cc, pp == 1) %>%
+    mutate(
+    across(all_of(fvars), function(x) as.factor(x)),
+    ges_age_s = scale(ges_age)[, 1],
+    birthweight_s = scale(birthweight)[, 1],
+    edlevel_s = scale(edlevel)[, 1]
+    )
+pp_indicator <- mutate(pp_indicator, skippy_id = as.integer(skippy_id))
+implist_pp <- map(implist, function(dimp) {
+  dimp_new <- left_join(dimp, pp_indicator, by = "skippy_id") %>%
+                filter(pp == 1)
+  dimp_new
+})
+
+
+
+
+####################### 2.1 Complete Case Analysis ############################
+
+
+formula <- bf(
+  mbage ~ age * condition + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex + (1 | skippy_id)
+)
+model_infancy_cc_pp <- brm(
+  family = student(),
+  formula = formula,
+  data = filter(d_pp, week != 52),
+  file = here::here("data/mbage_infancy_cc_pp")
+)
+
+formula <- bf(mbage ~ age + condition + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex)
+model_year1_cc_pp <- brm(
+  family = student(),
+  formula = formula,
+  data = filter(d_pp, week == 52),
+  file = here::here("data/mbage_year1_cc_pp")
+)
+
+summary(model_infancy_cc_pp)
+summary(model_year1_cc_pp)
+
+
+
+
+
+
+
+######################## 2.2 Multiple imputation  #############################
+
+
+
+
+# fit model
+formula <- bf(
+  mbage ~ age * condition + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex + (1 | skippy_id)
+)
+model_infancy <- brm_multiple(
+  family = student(),
+  formula = formula,
+  data = map(implist_pp, ~filter(.x, week != 52)),
+  file = here::here("data/mbage_infancy_pp_cov")
+)
+
+formula <- bf(
+  mbage ~ age * condition + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex + bfexcl + (1 | skippy_id)
+)
+model_infancy_bf <- brm_multiple(
+  family = student(),
+  formula = formula,
+  data = map(implist_pp, ~filter(.x, week != 52)),
+  file = here::here("data/mbage_infancy_pp_cov_bf")
+)
+
+formula <- bf(mbage ~ age + condition + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex)
+model_year1 <- brm_multiple(
+  family = student(),
+  formula = formula,
+  data = map(implist_pp, ~filter(.x, week == 52)),
+  file = here::here("data/mbage_year1_pp_cov")
+)
+
+summary(model_infancy)
+summary(model_year1)
+
+# same results here. In conclusion, ssc might have effect on microbiota age
+# such that infants microbiota is longer infant like, potentially because of
+# longer breastfeeding but this does not explain effect entirely.
+
+
+formula <- bf(mbage ~ age + condition + siblings + bfexcl + birthweight_s + ges_age_s + edlevel_s + csection + sex)
+model_year1_bf <- brm_multiple(
+  family = student(),
+  formula = formula,
+  data = map(implist_pp, ~filter(.x, week == 52)),
+  file = here::here("data/mbage_year1_bf_pp_cov")
+)
+summary(model_year1_bf)
+
+save(model_infancy, model_infancy_bf, model_year1, model_year1_bf, file = here::here("data/mbage_models_pp.Rds"))
+
+
+
+# we can argue that there is evidence for an effect of SSC on MBA
+# therefore we will look at the DR analysis as well here:
+
+dr <- foreign::read.spss(
+  here::here("data/kelly_documents/data_itt_pp_dr.sav"),
+  to.data.frame = TRUE
+  ) %>%
+  select(skippy_id = ID, ITT, SSC = TotalSSCwk1wk5) %>%
+  mutate(SSC_s = scale(SSC)[, 1])
+head(dr)
+ggplot(dr, aes(SSC)) +
+  geom_histogram()
+
+
+implist <- map(implist, function(imp) {
+  imp_new <- imp %>%
+    dplyr::left_join(
+      select(dr, skippy_id, SSC_s),
+      by = "skippy_id")
+  mice::complete(mice::mice(imp_new))
+})
+
+
+dr$skippy_id <- as.character(dr$skippy_id)
+df <- left_join(d_cc, select(dr, skippy_id, SSC_s, SSC), by = "skippy_id") 
+implist[[1]] %>% colnames()
+
+# fit model
+formula <- bf(
+  mbage ~ SSC_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex + age + (1 | skippy_id)
+)
+model_infancy <- brm_multiple(
+  family = student(),
+  formula = formula,
+  data = map(implist, ~filter(.x, week != 52)),
+  file = here::here("data/mbage_infancy_dr_cov")
+)
+
+formula <- bf(mbage ~ age + SSC_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex)
+model_year1 <- brm_multiple(
+  family = student(),
+  formula = formula,
+  data = map(implist, ~filter(.x, week == 52)),
+  file = here::here("data/mbage_year1_dr_cov")
+)
+
+
+
+summary(model_infancy)
+summary(model_year1)
+
+
+formula <- bf(mbage ~ age + SSC_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex + bfexcl)
+model_year1_bf <- brm_multiple(
+  family = student(),
+  formula = formula,
+  data = map(implist, ~filter(.x, week == 52)),
+  file = here::here("data/maz_year1_bf_dr_cov")
+)
+summary(model_year1_bf)
+
+
+# only within SSC
+
+formula <- bf(mbage ~  age + SSC_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex)
+model_year1_within <- brm_multiple(
+  family = student(),
+  formula = formula,
+  data = map(implist, ~filter(.x, week == 52, condition == 1)),
+  file = here::here("data/mbage_year1_dr_within_cov")
+)
+
+
+
+summary(model_year1_within)
+
+
+
+
+formula <- bf(mbage ~  age + SSC_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex)
+model_year1_out <- brm_multiple(
+  family = student(),
+  formula = formula,
+  data = map(implist, ~filter(.x, week == 52, condition == 0)),
+  file = here::here("data/mbage_year1_dr_out_cov")
+)
+
+summary(model_year1_out)
+
+save(model_year1, model_year1_within, model_year1_bf, file = here::here("data/mbage_models_dr.Rds"))
+
+
+filter(d_cc, is.na(condition))
+ssc_plot1 <- full_join(d_cc, dr, by = "skippy_id") %>% 
+  select(skippy_id, condition, SSC) %>%
+   mutate(
+    condition_label = ifelse(condition == 0, "CAU", ifelse(
+      condition == 1, "SSC", NA)),
+    SSC_h = SSC/60) %>% 
+  distinct(.keep_all = TRUE) %>%
+  filter(skippy_id %in% d_cc$skippy_id) %>%
+  ggplot(aes(condition_label, SSC_h, fill = condition_label)) +
+  geom_boxplot(outlier.alpha = 0) +
+  #geom_beeswarm() +
+  geom_jitter(width = 0.1, size = 2) + 
+  #scale_fill_manual(values = c("#fc8d62", "#8da0cb")) +
+  scale_fill_manual(values = c("#ffffff", "#c0c1c2")) +
+  theme_bw(base_size = 25) +
+  theme(
+  legend.position = "none",
+  strip.placement = "outside",
+  strip.background = element_blank()) +
+xlab("") + ylab("SSC Total Hours")
+
+
+ssc_plot2 <- full_join(d_cc, dr, by = "skippy_id") %>% 
+  select(skippy_id, condition, SSC, mbage, week) %>%
+   mutate(
+    condition_label = ifelse(condition == 0, "CAU", ifelse(
+      condition == 1, "SSC", NA)),
+    SSC_h = SSC/60) %>% 
+  filter(skippy_id %in% d_cc$skippy_id, week == 52) %>%
+  ggplot(aes(SSC_h, mbage)) +
+  geom_point(size = 4) +
+  geom_smooth(method = "lm", se = TRUE) +
+  geom_vline(xintercept = 1455.561/60, linetype = "dashed") +
+  scale_fill_manual(values = c("#ffffff", "#c0c1c2")) +
+  #scale_fill_manual(values = c("#fc8d62", "#8da0cb")) +
+  #scale_color_manual(values = c("#fc8d62", "#8da0cb")) +
+  theme_bw(base_size = 25) +
+  theme(
+  legend.position = "none",
+  strip.placement = "outside",
+  strip.background = element_blank()) +
+xlab("SSC Total Hours") + ylab("Microbiota Age")
+ssc_plot1
+ssc_plot2
+
+save(ssc_plot1, ssc_plot2, file = here::here("data/sscplot.Rds"))
+
+
+
+
+
+implist <- map(implist, ~mutate(.x, upper = ifelse(SSC_s <= -0.5, "no", "yes")))
+
+
+formula <- bf(mbage ~ age + upper + siblings + sex + csection)
+model_year1_cat <- brm_multiple(
+  family = student(),
+  formula = formula,
+  data = map(implist, ~filter(.x, week == 52)),
+  file = here::here("data/mbage_year1_dr_cat_cov")
+)
+
+model_year1_cat
+implist[[1]] %>% colnames()
+test <- implist[[1]]
+test$rs <- resid(lm(mbage ~ age, data = implist[[1]]))
+
+ggplot(test, aes(SSC_s, rs)) +
+  geom_point() +
+  geom_smooth()
+
+
+
+
+
+
+
+
+
+
+################################################################################
+############################## make heatmap from DAA and RF ####################
+################################################################################
+
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+source(here::here("R/ml_helper.R"))
+
+files <- c(
+  here::here("data/maaslin2_tables_itt.Rds"),
+  # here::here("data/ancom_tables_itt.Rds"),
+  here::here("data/tables_linda_itt.Rds")
+)
+for (file in files) load(file)
+
+linda_identified
+maaslin2_identified
+
+
+plot_importance(model, top_n = 20)
+
+rf_feats <- as.character(arrange(extract_importance(model, n = 20), desc(importance))$features)
+plot_importance(model, top_n = 25) + theme_bw(base_size = 25)
+# plot relabundance of the features but only for those who are nonzero in SKIPPY
+rf_feats <- map_chr(rf_feats, function(feat) {
+  out <- NA
+  if(sum(testdata[[feat]]) > 0) {
+    out <- feat
+  }
+  out}) %>%
+  na.omit() %>%
+  as.character()
+rf_feats
+
+
+msl <- make.names(maaslin2_identified$feature) %>% str_replace("genus.Eubacterium._hallii_group", "genus..Eubacterium._hallii_group")
+lnd <- make.names(linda_identified$taxon) %>% str_replace("genus.Eubacterium._hallii_group", "genus..Eubacterium._hallii_group")
+msl
+feats <- union(union(rf_feats, lnd), msl)
+feats
+
+# now extract clr values for the heatmap
+tse <- agglomerateByRank(tse, rank = "genus")
+tse <- transformSamples(tse, method = "relabundance")
+tse <- transformSamples(tse, abund_values = "relabundance", method = "clr", pseudocount = 1)
+hmd <- as.data.frame(assay(tse, "clr")) %>% rownames_to_column("taxon")
+hmd$taxon <- make.names(hmd$taxon)
+hmd <- filter(hmd, taxon %in% feats) %>%
+  mutate(
+    RF = ifelse(taxon %in% rf_feats, 1, 0),
+    Maaslin2 = ifelse(taxon %in% msl, 1, 0),
+    LinDA = ifelse(taxon %in% lnd, 1, 0),
+) 
+side_vars <- select(hmd, taxon, RF, Maaslin2, LinDA) %>% 
+  mutate(score = RF + Maaslin2 + LinDA) %>%
+  arrange(desc(score))
+  # the genus must be non-zero and identifiable in SKIPPY. 
+
+side_vars
+hmd <- select(hmd, -RF, -Maaslin2, -LinDA) %>% 
+  column_to_rownames("taxon") %>%
+  t() %>%
+  as.data.frame() %>%
+  mutate(across(all_of(feats), function(x) scale(x)[, 1])) %>%
+  rownames_to_column("sample_id")
+temp <- colData(tse) %>% 
+  as.data.frame() %>%
+  mutate(Infancy = ifelse(week == 52, "Late Infancy", "Early Infancy")) %>%
+  select(sample_id, condition, Infancy)
+hmd <- left_join(hmd, temp, by = "sample_id")
+
+
+side_vars
+hmd <- arrange(
+  hmd,
+  Infancy,
+  condition,
+  genus.Faecalibacterium,
+  genus.Megasphaera,
+  genus.Bacteroides,
+  genus.Flavonifractor,
+  genus.Rothia,
+  genus..Eubacterium._hallii_group,
+  genus.Bifidobacterium,
+  genus.Parabacteroides,
+  genus.Enterococcus,
+  genus.Erysipelatoclostridium,
+  genus.Blautia,
+  family.Lachnospiraceae,
+  genus.Butyricicoccus,
+  genus.Actinomyces,
+  genus.Staphylococcus,
+  genus.Anaerostipes,
+  genus.Dialister,
+  genus.Lacticaseibacillus,
+  genus.Lachnospira,
+  genus.Lachnospiraceae_UCG.004,
+  genus.Monoglobus
+)
+
+sample_id <- hmd$sample_id
+clu <- hmd$condition
+age <- hmd$Infancy
+
+method_label <- mutate(
+  side_vars, 
+  p1 = ifelse(RF, "R, ", ""), 
+  p2 = ifelse(Maaslin2, "M, ", ""), 
+  p3 = ifelse(LinDA, "L, ", ""),
+  label = glue("{p1}{p2}{p3}"),
+  label = str_replace(label, ",\\s+$", ""),
+  rownames = glue('"{taxon}"^"{label}",'))
+
+n <- length(method_label$label)
+split <- method_label$label[n:1]
+split
+method_label
+df_fin <- hmd[, side_vars$taxon]
+df_fin <- t(df_fin)
+# sort only based on small matrix 
+colnames(df_fin) <- clu
+
+mat <- df_fin
+# Legend color
+nvec <- seq(-1, 1, length.out = 8)
+col_fun <- colorRamp2(nvec, 
+                     c(
+                       "#2166ac", 
+                       "#4393c3", 
+                       "#92c5de", 
+                       "#d1e5f0", 
+                       "#fddbc7", 
+                       "#f4a582", 
+                       "#d6604d", 
+                       "#b2182b"
+                   ))
+
+
+# Annotation
+ha <- HeatmapAnnotation(
+  SSC = clu,
+  col = list(
+    SSC = c("0"="#B0B0B0", "1"="#000000")
+    ),
+  annotation_name_side = "left"
+  )
+
+
+# Heatmap
+png(here::here("fig/rf_feats.png"),
+  width = 50, 
+  height = 40, 
+  units = "cm", 
+  res= 900)
+
+  Heatmap(mat,
+      width = unit(32, "cm"),
+      height = unit(32, "cm"),
+      top_annotation = ha,
+      name = "RA", 
+      col = col_fun,
+      column_title = "",
+      column_title_gp = gpar(fontsize = 18, fontface = "bold"),
+      row_title = "",
+      row_title_side = "right",
+      row_title_gp = gpar(fontsize = 18, fontface = "bold"),
+      row_names_gp = gpar(fontsize = 15, fontface = "italic"),
+      row_names_side = "left",
+      show_column_names = F,
+      border = T,
+      # rect_gp = gpar(col = "white", lwd = 0),
+      cluster_rows = F,
+      cluster_columns = F,
+      show_heatmap_legend = TRUE,
+      heatmap_legend_param = list(ncol =1),
+      # row_split = split,
+      # row_title_rot = 0,
+      # row_order = side_vars$taxon,
+      row_labels = expression(
+        "Faecalibacterium"^"R, M, L",
+        "Megasphaera"^"M, L",
+        "Bacteroides"^"M, L",
+        "Flavonifractor"^"M, L",
+        "Rothia"^"M, L",
+        "Eubacterium hallii group"^"R, M",
+        "Bifidobacterium"^"R",
+        "Parabacteroides"^"L",
+        "Enterococcus"^"L",
+        "Erysipelatoclostridium"^"L",
+        "Blautia"^"R",
+        "family:Lachnospiraceae"^"R",
+        "Butyricicoccus"^"R",
+        "Actinomyces"^"R",
+        "Staphylococcus"^"R",
+        "Anaerostipes"^"R",
+        "Dialister"^"R",
+        "Lacticaseibacillus"^"M",
+        "Lachnospira"^"R",
+        "Lachnospiraceae UCG 004"^"R",
+        "Monoglobus"^"R"
+      ),
+      column_split = age,
+    )
+  dev.off()
+
+pdf(here::here("fig/rf_feats.pdf"), width = 25, height = 20)
+  Heatmap(mat,
+      width = unit(32, "cm"),
+      height = unit(32, "cm"),
+      top_annotation = ha,
+      name = "RA", 
+      col = col_fun,
+      column_title = "",
+      column_title_gp = gpar(fontsize = 18, fontface = "bold"),
+      row_title = "",
+      row_title_side = "right",
+      row_title_gp = gpar(fontsize = 18, fontface = "bold"),
+      row_names_gp = gpar(fontsize = 15, fontface = "italic"),
+      row_names_side = "left",
+      show_column_names = F,
+      border = T,
+      # rect_gp = gpar(col = "white", lwd = 0),
+      cluster_rows = F,
+      cluster_columns = F,
+      show_heatmap_legend = TRUE,
+      heatmap_legend_param = list(ncol =1),
+      # row_split = split,
+      # row_title_rot = 0,
+      # row_order = side_vars$taxon,
+      row_labels = expression(
+        "Faecalibacterium"^"R, M, L",
+        "Megasphaera"^"M, L",
+        "Bacteroides"^"M, L",
+        "Flavonifractor"^"M, L",
+        "Rothia"^"M, L",
+        "Eubacterium hallii group"^"R, M",
+        "Bifidobacterium"^"R",
+        "Parabacteroides"^"L",
+        "Enterococcus"^"L",
+        "Erysipelatoclostridium"^"L",
+        "Blautia"^"R",
+        "family:Lachnospiraceae"^"R",
+        "Butyricicoccus"^"R",
+        "Actinomyces"^"R",
+        "Staphylococcus"^"R",
+        "Anaerostipes"^"R",
+        "Dialister"^"R",
+        "Lacticaseibacillus"^"M",
+        "Lachnospira"^"R",
+        "Lachnospiraceae UCG 004"^"R",
+        "Monoglobus"^"R"
+      ),
+      column_split = age,
+    )
+  dev.off()
+
+side_vars
+
+
+# further data exploration for discussion
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+
+tse <- agglomerateByRank(tse, rank = "genus")
+tse2 <- tidySummarizedExperiment::filter(tse, week == 2)
+tse5 <- tidySummarizedExperiment::filter(tse, week == 5)
+tse52 <- tidySummarizedExperiment::filter(tse, week == 52)
+prevs <- map(list(tse2, tse5, tse52), function(tse) {
+  prev <- getPrevalence(
+    tse, 
+    detection = 1, 
+    sort = TRUE, 
+    assay.type = "counts",
+    as_relative = FALSE
+    )
+    prev[names(prev) == "genus:Faecalibacterium"]
+})
+prevs
+
+cd <- colData(tse) %>% as.data.frame() %>%
+  select(skippy_id, sample_id, week, condition, bfexcl, siblings)
+a <- as.data.frame(t(assay(tse, "counts"))) %>% 
+  rownames_to_column("sample_id") %>%
+  select(sample_id, "genus:Faecalibacterium")
+
+df <- full_join(cd, a, by = "sample_id") %>%
+  rename(faecalibacterium = `genus:Faecalibacterium`) %>%
+  mutate(
+    present = ifelse(faecalibacterium > 0, 1, 0),
+    l_faecalibacterium = log((faecalibacterium + 1))
+    )
+group_by(df, week, condition) %>%
+  summarise(m = mean(present))
+
+summary(lm(l_faecalibacterium ~ siblings + bfexcl, data = df))
diff --git a/R/ml_helper.R b/R/ml_helper.R
new file mode 100644
index 0000000..9ab353c
--- /dev/null
+++ b/R/ml_helper.R
@@ -0,0 +1,881 @@
+library(tidyverse)
+library(furrr)
+library(mlr) 
+library(tuneRanger)
+library(ranger)
+library(patchwork)
+
+
+#########################
+###     ML General    ### --------------------------------------
+#########################
+
+# get very rough estimate of a good ntree/computation trade off 
+plot_ntree <- function(x, y, ntree = 1e4) {
+  ntreeplot <- randomForest::randomForest(
+    x = x,
+    y = y,
+    ntree = ntree
+  )
+  plot(ntreeplot)
+}
+
+# returns df of our eval metrics (logloss and F1)
+model_eval <- function(
+  model, 
+  testdata, 
+  features,
+  y,  
+  model_type = "ranger", 
+  classification = TRUE,
+  null_test = FALSE,
+  null_dist = if(null_test) null_dist else NULL
+  ) {
+    
+    if (classification) {
+      
+      # we need the groups as 0/1 values
+      y_true <- as.numeric(testdata[[y]]) -1
+      
+      # obtain predictions (this is little different for each algorithm)
+      # randomForest
+      if (model_type == "randomForest") {
+        y_pred_resp <- predict(model, newdata = testdata, type = "response")
+        y_pred_resp <- as.numeric(y_pred_resp) -1
+        y_pred_prob <- predict(model, newdata = testdata, type = "prob")[, 2]
+      # ranger
+      } else if (model_type == "ranger") {
+        y_pred_resp <- predict(model, data = testdata)$predictions
+        y_pred_resp <- as.numeric(y_pred_resp) -1
+        # to obtain prob you need to fit a prob tree to begin with
+        y_pred_prob <- predict(model, data = testdata)$predictions
+      }
+      # for xgb models we need a xgb.DMatrix
+      } else if (model_type == "XGBoost") {
+        testdata_xgb <- select(testdata, all_of(features)) %>% as.matrix()
+        testdata_xgb <- xgb.DMatrix(data = testdata_xgb, label = y_true)
+        y_pred_prob <- predict(model, testdata_xgb)
+        y_pred_resp <- ifelse(y_pred_prob == 0.5, 
+          rbinom(n = 1, size = 1, p = 0.5), ifelse(y_pred_prob > 0.5,
+            1, 0))
+
+      # logloss 
+      log_l <- MLmetrics::LogLoss(y_pred_prob, y_true)
+      
+      # F1 scores
+      f_one <- MLmetrics::F1_Score(
+        factor(y_true, levels = c("0", "1")), 
+        factor(y_pred_resp, levels = c("0", "1"))
+      )
+      metric <- tibble(logloss = log_l, F1 = f_one)
+      return(metric)
+    } else {
+      if (model_type == "randomForest") {
+        y_pred <- predict(model, newdata = testdata, type = "response")
+      # ranger
+      } else if (model_type == "ranger") {
+        y_pred <- predict(model, data = testdata)$predictions
+      }
+
+      p <- cor.test(testdata[[y]], y_pred)
+      if (null_test) {
+        p_value <- mean(null_dist > p[4]$estimate)
+      }
+      p <- round(p[4]$estimate, 3)
+      rsq <- mean(model$rsq) %>% round(3)
+      
+      metric <- tibble(
+        r = p, 
+        rsq = rsq,
+        p_value = p_value)
+      return(metric)
+    }
+    
+    
+    
+}
+
+
+# returns null distribution or pearson cor for given test data 
+rf_null <- function(
+  y,
+  features,
+  train = train,
+  test = test,
+  n_perm = 500,
+  ntree = 500
+  ) {
+    
+    p_null <- future_map_dbl(c(1:n_perm), function(iter) {
+      # permute outcome   
+      train_perm = train 
+      test_perm <- test
+      train_perm$y_perm <- sample(
+        train[[y]],
+        replace = FALSE,
+        size = dim(train)[1])
+      train_perm <- select(train_perm, -all_of(y))
+      
+      test_perm$y_perm <- sample(
+        test[[y]],
+        replace = FALSE,
+        size = dim(test)[1])
+      test_perm <- select(test_perm, -all_of(y))
+    
+      # fit null model ranger
+      null_model <- ranger::ranger(
+        y = train_perm$y_perm,
+        x = select(train, all_of(features)),
+        num.trees  = ntree,
+        importance = "none",
+        probability = FALSE,
+        mtry = 54 # adapted for this specific manuscript, see mbage script
+      )
+      
+      # obtain pearson 
+      y_pred <- predict(null_model, data = test_perm)$predictions
+      pearson_r <- cor.test(test_perm$y_perm, y_pred)
+      return(pearson_r$estimate[[1]])
+    })
+    return(p_null)  
+}
+
+
+# # returns a list of lists where each list has a fitted model and the
+# # corresponding testdata as items 
+# fit_cv <- function(
+#   data, 
+#   features,
+#   y,
+#   method = "cv",
+#   p = ifelse(method == "resample", 0.8, NULL),
+#   k = 10,
+#   model_type = "ranger",
+#   null_test = FALSE,
+#   n_perm = if (null_test) 500 else NULL,
+#   ...
+#   ) {
+# 
+#     dots <- list(...)
+# 
+#     # cv/resample 
+#     if (method == "cv") {
+#       train_indeces <- caret::createFolds(
+#         data[[y]], 
+#         k = k,
+#         returnTrain = TRUE)
+# 
+#     } else if (method == "resample") {
+#       train_indeces <- caret::createDataPartition(
+#         data[[y]], 
+#         p = p, 
+#         times = k)
+#     }
+# 
+# 
+#     # this will return a list of lists that each contain a fitted model and 
+#     # the corresponding test dataset 
+#     models_and_testdata <- map(train_indeces, function(ind) {
+#       train <- data[ind, ]
+#       test <- data[-ind, ]
+# 
+# 
+# 
+#       # fit randomForest 
+#       if (model_type == "randomForest") {
+#         model <- randomForest::randomForest(
+#           y = train[[y]],
+#           x = select(train, all_of(features)),
+#           ntree = dots$ntree,
+#           importance = "permutation"
+#         )
+#       } else if (model_type == "ranger") {
+#         model <- ranger::ranger(
+#           y = train[[y]],
+#           x = select(train, all_of(features)),
+#           ntree = dots$ntree,
+#           importance = "permutation",
+#           probability = probability
+#         )
+#       } else if (model_type == "XGBoost") {
+#         # prepare xgb data matrix object
+#         labels_train <- train[[y]] %>% as.numeric() -1 # one-hot-coding
+#         labels_test <- test[[y]] %>% as.numeric() -1
+#         train_xgb <- select(train, all_of(features)) %>% as.matrix()
+#         test_xgb <- select(test, all_of(features)) %>% as.matrix()
+#         train_xgb <- xgb.DMatrix(data = train_xgb, label = labels_train)
+#         test_xgb <- xgb.DMatrix(data = test_xgb, label = labels_test)
+# 
+#         # set model parameters (this should be put in ... at some point)
+#         params <- list(
+#           booster = "gbtree",
+#           objective = "binary:logistic",
+#           eta = 0.3,
+#           gamma = 0,
+#           max_depth = 6,
+#           min_child_weight = 1,
+#           subsample = 1,
+#           colsample_bytree = 1
+#         )
+# 
+#         # fit model 
+#         model <- xgb.train(
+#           params = params,
+#           data = train_xgb, 
+#           nrounds = 10,
+#           watchlist = list(val = test_xgb, train = train_xgb),
+#           print_every_n = 10, 
+#           early_stop_round = 10,
+#           maximize = FALSE,
+#           eval_metric = "logloss",
+#           verbose = 0
+#         )
+#       }
+# 
+#       if (null_test) {
+#         null_dist <- rf_null(
+#           y,
+#           features,
+#           train,
+#           test,
+#           ntree = dots$ntree,
+#           n_perm = n_perm
+#         )
+# 
+#         return(list(model, test, null_dist))
+#       } else {
+#         # return fitted model and corresponding test data set
+#         list(model, test)
+#       }
+# 
+#     })
+#     return(models_and_testdata)
+# }
+
+
+
+
+
+
+
+# summarises eval metrics 
+summarize_metrics <- function(
+  models_and_data, 
+  y, 
+  model_type = "ranger", 
+  features = features, 
+  classification = TRUE
+  ) {
+    map_dfr(models_and_data, function(model_and_data) {
+      model <- model_and_data[[1]]
+      testdata <- model_and_data[[2]]
+      model_eval(model, testdata, features = features, y = y, model_type = model_type, classification = classification) 
+    }) %>%
+      gather(metric, value) %>%
+      group_by(metric) %>%
+      summarise(mean = mean(value), sd = sd(value)) %>%
+      mutate_if(is.numeric, round, 2)
+}
+  
+plot_importance <- function(model, top_n = NULL) {
+  var_imp <- importance(model, type = 1)
+  var_imp <- var_imp %>% as.data.frame() 
+  imp_name <- colnames(var_imp)[1]
+  
+  var_imp <- var_imp %>%
+    rownames_to_column("features") %>%
+    select(features, importance = all_of(imp_name)) %>%
+    arrange(importance) %>%
+    mutate(features = factor(features, level = features))
+  if (!is.null(top_n)) {
+    var_imp <- tail(var_imp, top_n)
+  }
+  ggplot(var_imp, aes(features, importance)) +
+    geom_col() +
+    coord_flip() 
+
+}
+
+extract_importance <- function(model, n = 10) {
+  var_imp <- importance(model, type = 1)
+  var_imp <- var_imp %>% as.data.frame() 
+  imp_name <- colnames(var_imp)[1]
+  var_imp <- var_imp %>%
+    rownames_to_column("features") %>%
+    select(features, importance = all_of(imp_name)) %>%
+    arrange(importance) %>%
+    mutate(features = factor(features, level = features)) %>%
+    tail(n)
+  return(var_imp)  
+}
+
+
+
+
+#########################
+###   Random Forests  ### --------------------------------------
+#########################
+
+
+# Feature selection based on RF importance scores.
+# models_and_data is a list of list where each list contains a model object [1]
+# and the corresponding testdata [2] According to workflow in this script
+select_features <- function(
+  models_and_data, 
+  id_name = "id", 
+  n_features = 50) {
+  top_predictors <- map(models_and_data, function(model_and_data) {
+    model <- model_and_data[[1]]
+  
+  
+    top_predictors <- importance(model, type = 1) %>%
+      as.data.frame()
+    colnames(top_predictors) <- "importance"
+    #imp_name <- colnames(top_predictors)[1]
+    top_predictors <- top_predictors %>%
+      rownames_to_column(id_name) %>%
+      arrange(desc(importance)) %>%
+      select(all_of(id_name)) %>%
+      head(n_features)
+    }
+  )
+  
+  # only intersection of all k model is used
+  selected_features <- Reduce(intersect, top_predictors)
+  return(selected_features)
+}
+
+
+
+tune_rf <- function(
+  data, 
+  features, 
+  y, 
+  regression = TRUE,
+  measure = NULL, 
+  iters = 70, 
+  iters.warmup = 30,
+  time.budget = NULL, 
+  num.threads = NULL, 
+  ntree = 1000,
+  parameters = list(
+    replace = FALSE, 
+    respect.unordered.factors = "order"
+  ), 
+  tune.parameters = c(
+    "mtry", 
+    "min.node.size",
+    "sample.fraction"
+  ), 
+  save.file.path = NULL, 
+  build.final.model = FALSE,
+  show.info = getOption("mlrMBO.show.info", TRUE)
+  ) {
+    # names must be compatible with mlr 
+    d <- select(data, all_of(features), all_of(y))
+    colnames(d) <- make.names(colnames(d), unique = T)
+    if (regression) {
+      tune_task <- makeRegrTask(
+        data = d,
+        target = y
+      )
+    } else {
+      tune_task <- makeClassifTask(
+        data = d,
+        target = y
+      )
+    }
+
+    #estimateTimeTuneRanger(tune_task)
+    res <- tuneRanger(
+      tune_task, 
+      measure = measure, 
+      iters = iters, 
+      iters.warmup = iters.warmup,
+      time.budget = time.budget, 
+      num.threads = num.threads, 
+      num.trees = ntree,
+      parameters = parameters, 
+      tune.parameters = tune.parameters,
+      save.file.path = save.file.path, 
+      build.final.model = build.final.model,
+      show.info = show.info
+    )
+    res
+}
+
+
+
+
+rf_cv <- function(
+  data, 
+  features,
+  y,
+  method = "cv",
+  p = ifelse(method == "resample", 0.8, NULL),
+  k = 10,
+  ntree = 500,
+  null_test = FALSE,
+  n_perm = if (null_test) 500 else NULL,
+  regression = TRUE, 
+  probability = ifelse(regression, FALSE, TRUE),
+  ...
+  ) {
+    
+    # additional arguments for ranger 
+    dots <- list(...)
+    ranger_params <- list()
+    if ("mtry" %in% names(dots)) {
+      ranger_params[["mtry"]] <- dots[["mtry"]]
+    }  else {
+      ranger_params[["mtry"]] <- NULL
+    }
+    if ("min.node.size" %in% names(dots)) {
+      ranger_params[["min.node.size"]] <- dots[["min.node.size"]]
+    }  else {
+      ranger_params[["min.node.size"]] <- NULL
+    }
+    if ("replace" %in% names(dots)) {
+      ranger_params[["replace"]] <- dots[["replace"]]
+    }  else {
+      ranger_params[["replace"]] <- FALSE
+    }   
+    if ("sample.fraction" %in% names(dots)) {
+      ranger_params[["sample.fraction"]] <- dots[["sample.fraction"]]
+    }  else {
+      ranger_params[["sample.fraction"]] <- ifelse(ranger_params[["replace"]], 1, 0.632)
+    } 
+    if ("splitrule" %in% names(dots)) {
+      ranger_params[["splitrule"]] <- dots[["splitrule"]]
+    }  else {
+      ranger_params[["splitrule"]] <- NULL
+    } 
+    if ("num.random.splits" %in% names(dots)) {
+      ranger_params[["num.random.splits"]] <- dots[["num.random.splits"]]
+    }  else {
+      ranger_params[["num.random.splits"]] <- 1
+    } 
+    if ("scale.permutation.importance" %in% names(dots)) {
+      ranger_params[["scale.permutation.importance"]] <- dots[["scale.permutation.importance"]]
+    }  else {
+      ranger_params[["scale.permutation.importance"]] <- FALSE
+    }
+    if ("importance" %in% names(dots)) {
+      ranger_params[["importance"]] <- dots[["importance"]]
+    }  else {
+      ranger_params[["importance"]] <- "permutation"
+    }
+    
+    
+
+    # cv/resample 
+    if (method == "cv") {
+      train_indeces <- caret::createFolds(
+        data[[y]], 
+        k = k,
+        returnTrain = TRUE)
+      
+    } else if (method == "resample") {
+      train_indeces <- caret::createDataPartition(
+        data[[y]], 
+        p = p, 
+        times = k)
+    }
+    
+
+    map(train_indeces, function(ind) {
+      train <- data[ind, ]
+      test <- data[-ind, ]
+      model <- ranger::ranger(
+        y = train[[y]],
+        x = select(train, all_of(features)),
+        num.trees  = ntree,
+        importance = ranger_params[["importance"]],
+        probability = probability,
+        mtry = ranger_params[["mtry"]],
+        min.node.size = ranger_params[["min.node.size"]],
+        replace = ranger_params[["replace"]],
+        sample.fraction = ranger_params[["sample.fraction"]],
+        splitrule = ranger_params[["splitrule"]],
+        num.random.splits = ranger_params[["num.random.splits"]],
+        scale.permutation.importance = ranger_params[["scale.permutation.importance"]]        
+      )
+      
+      if (null_test) {
+        null_dist <- rf_null(
+          y,
+          features,
+          train,
+          test,
+          ntree = ntree,
+          n_perm = n_perm
+        )
+        
+        return(list(model, test, null_dist))
+      } else {
+        # return fitted model and corresponding test data set
+        list(model, test)
+      }
+      })
+    }
+
+
+# repeated cv/resampling
+rf_rcv <- function(
+  data, 
+  features,
+  y,
+  method = "cv",
+  p = ifelse(method == "resample", 0.8, NULL),
+  k = 10,
+  ntree = 500,
+  null_test = FALSE,
+  n_perm = if (null_test) 500 else NULL,
+  regression = TRUE,
+  probability = ifelse(regression, FALSE, TRUE),
+  repeated = 10) {
+    
+    all_model_and_data <- map(c(1:repeated), function(rep) {
+      model_and_data <- rf_cv(
+        data, 
+        features,
+        y,
+        method = method,
+        p = p,
+        k = k,
+        ntree = ntree,
+        null_test = null_test,
+        n_perm = n_perm,
+        regression = regression,
+        probability = probability
+      )
+    })
+    flatten(all_model_and_data)
+  }
+
+rf_model_fit <- function(
+  models_and_data, 
+  y, 
+  regression = TRUE, 
+  null_test = FALSE
+  ) {
+    p <- map(models_and_data, function(model_and_data) {
+      
+      model <- model_and_data[[1]]
+      test <- model_and_data[[2]]
+      if (null_test) {
+        null_dist <- model_and_data[[3]]
+      }
+      if (regression) {
+        y_pred <- predict(model, data = test)$predictions
+        p <- cor.test(test[[y]], y_pred)
+        rsq <- model$r.squared %>% round(3)
+        if (null_test) {
+          p_value <- mean(null_dist > p[4]$estimate)
+          list(round(p[4]$estimate, 3), rsq, p_value)
+        } else {
+          list(round(p[4]$estimate, 3), rsq)
+        }
+        
+      } else {
+        y_true <- as.numeric(test[[y]]) -1
+        # only works if you specified ranger as probability tree
+        y_pred_prob <- predict(model, data = test)$predictions
+        log_l <- MLmetrics::LogLoss(y_pred_prob[, 2], y_true)
+        oob <- model$prediction.error
+        metric <- tibble(oob = oob, log_l = log_l) 
+        list(metric)
+      }
+    })
+    p
+}
+
+rf_summary <- function(
+  data, 
+  features,
+  y,
+  p = 0.8, 
+  k = 10,
+  ntree = 500,
+  regression = TRUE,
+  null_test = FALSE,
+  probability = ifelse(regression, FALSE, TRUE)
+  ) {
+    model_and_data <- rf_cv(
+      data,
+      features,
+      y,
+      p = p, 
+      k = k,
+      ntree = ntree,
+      null_test = null_test
+    )
+    metric <- rf_model_fit(
+      model_and_data, 
+      y = y, 
+      regression = regression,
+      null_test = null_test
+    )
+    if (regression) {
+      p <- map_dfr(metric, function(list) {
+        list[[1]]
+       }) %>% gather(sample, value) %>%
+        summarise(mean = mean(value), median = median(value), sd = sd(value))
+        
+      rsq <- map_dfr(metric, function(list) {
+        list[[2]]
+       }) %>% gather(sample, value) %>%
+        summarise(mean = mean(value), median = median(value), sd = sd(value))
+        
+      if (null_test) {
+        df1 <- map_dfr(metric, function(list) {
+          list[[1]]
+          }) %>% gather(sample, r)
+        df2 <- map_dfr(metric, function(list) {
+          list[[3]]
+          }) %>% gather(sample, p_value)
+        p_value <- bind_cols(df1, df2) %>% filter(r == median(r)) %>%
+          .$p_value
+        p_value <- p_value[1]
+      }
+      
+      if (null_test) {
+        list("p" = p, "rsq" = rsq, "p_value" = p_value)
+      } else {
+        list("p" = p, "rsq" = rsq)
+      }
+      
+      
+    } else {
+      map_dfr(metric, ~bind_rows(.x)) %>%
+      gather(statistic, value) %>% 
+      group_by(statistic) %>%
+      summarise(
+        median = median(value), 
+        sd = sd(value), 
+        lower = quantile(value, 0.025), 
+        upper = quantile(value, 0.975)
+      ) %>%
+      mutate_if(is.numeric, round, 2)
+    }
+  }
+
+
+
+
+get_oob <- function(model_and_data, summarise = TRUE) {
+  metric <- map_dfc(model_and_data, function(md) {
+    md[[1]]$prediction.error
+  }) %>% pivot_longer(everything(), names_to = "fold", values_to = "oob")
+    
+  if (summarise) {
+    metric <- metric %>%
+      summarise(
+        median = median(oob),
+        mean = mean(oob),
+        sd = sd(oob),
+        lower = quantile(oob, 0.025),
+        upper = quantile(oob, 0.975)
+        )
+      }
+  metric
+}
+
+# thanks to Artem Sokolov: https://stackoverflow.com/questions/45676745/how-to-calculate-the-auc-value-for-a-ranger-rf-model
+auc_roc <- function(scores, labels){
+  stopifnot( length(scores) == length(labels) )
+  jp <- which( labels > 0 ); np <- length( jp )
+  jn <- which( labels <= 0); nn <- length( jn )
+  s0 <- sum( rank(scores)[jp] )
+  (s0 - np*(np+1) / 2) / (np*nn)
+}
+
+
+get_auc <- function(model_and_data, y, summarise = TRUE) {
+  metric <- map_dfr(model_and_data, function(md) {
+    fit <- md[[1]]
+    test <- md[[2]]
+    y_pred <- predict(fit, data = test)$predictions[, 2]
+    y_true <- as.numeric(test[[y]]) - 1
+    list(auc = auc_roc(y_pred, y_true))
+    }) %>% pivot_longer(everything(), names_to = "fold", values_to = "auc")
+    
+  if (summarise) {
+    metric <- metric %>%
+      summarise(
+        median = median(auc),
+        mean = mean(auc),
+        sd = sd(auc),
+        lower = ifelse(berryFunctions::is.error(quantile(auc, 0.025)), NA, quantile(auc, 0.025)),
+        upper = ifelse(berryFunctions::is.error(quantile(auc, 0.975)), NA, quantile(auc, 0.975))
+      )
+    }
+  metric
+}
+
+get_pearson <- function(model_and_data, y, summarise = TRUE) {
+  metric <- map_dfr(model_and_data, function(md) {
+    fit <- md[[1]]
+    test <- md[[2]]
+    y_pred <- predict(fit, data = test)$predictions
+    y_true <- test[[y]]
+    p <- cor.test(y_true, y_pred)
+    list(pearson = round(p[4]$estimate, 3))
+  }) %>% pivot_longer(everything(), names_to = "fold", values_to = "pearson")
+    
+  if (summarise) {
+    metric <- metric %>%
+      summarise(
+        median = median(pearson),
+        mean = mean(pearson),
+        sd = sd(pearson),
+        lower = ifelse(berryFunctions::is.error(quantile(pearson, 0.025)), NA, quantile(pearson, 0.025)),
+        upper = ifelse(berryFunctions::is.error(quantile(pearson, 0.975)), NA, quantile(pearson, 0.975))
+      )
+    }
+  metric
+}
+
+
+get_rsq <- function(model_and_data, y, summarise = TRUE) {
+  metric <- map_dfr(model_and_data, function(md) {
+    fit <- md[[1]]
+    list(rsq = fit$r.squared)
+  }) %>% pivot_longer(everything(), names_to = "fold", values_to = "rsq")
+    
+  if (summarise) {
+    metric <- metric %>%
+      summarise(
+        median = median(rsq),
+        mean = mean(rsq),
+        sd = sd(rsq),
+        lower = quantile(rsq, 0.025),
+        upper = quantile(rsq, 0.975)
+      )
+    }
+  metric
+}
+
+
+
+
+
+#########################
+###    Regression     ### --------------------------------------
+#########################
+
+# to plot simple regression or counterfactual plots 
+# model is brms model (might work with other lm models too)
+# specify x2 for counterfactual plots
+plot_regression <- function(
+  model, x, y, 
+  points = TRUE, 
+  counterfactual = FALSE, 
+  x2 = NULL) {
+    
+    
+    n <- length(model$data[[x2]])
+    if (counterfactual) {
+      newdata <- tibble(
+        x_rep = seq(
+          from = min(model$data[[x]]), 
+          to = max(model$data[[x]]), 
+          length.out = n),
+        x2_rep = mean(model$data[[x2]])
+      )
+      colnames(newdata) <- c(x, x2)
+    } else {
+      newdata <- tibble(
+        x_rep = seq(
+          from = min(model$data[[x]]), 
+          to = max(model$data[[x]]), 
+          length.out = n)
+        )
+      colnames(newdata) <- c(x)
+    }
+
+    df <- fitted(model, newdata = newdata) %>% 
+      as_tibble() %>%  
+      rename(
+        f_ll = Q2.5,
+        f_ul = Q97.5
+    ) 
+    y_pred <- predict(model, newdata = newdata) %>% 
+             as_tibble() %>%
+             transmute(p_ll = Q2.5, p_ul = Q97.5)
+    df <- bind_cols(newdata, y_pred, df)
+      
+    if(!counterfactual) {
+      p <- ggplot(df, aes_string(x, "Estimate")) +
+          geom_smooth(aes(ymin = f_ll, ymax = f_ul), stat = "identity")
+          
+    } else if(counterfactual) {
+
+        p <- ggplot(df, aes_string(x = x, y = "Estimate")) +
+              geom_ribbon(aes(ymin = p_ll, ymax = p_ul), alpha = 1/5) +
+              geom_smooth(aes(ymin = f_ll, ymax = f_ul), stat = "identity") +
+              coord_cartesian(xlim = range(model$data[[x]]))
+    }
+    
+    # add real data points
+    if(points) {
+      p <- p + geom_point(data = model$data, aes_string(x, y))
+    }
+    
+    return(p)
+}
+
+
+
+# diagnostic plots for frequentist regression (lm or lme4)
+lm_diag <- function(model, data, Y, id = "id", brms = TRUE) {
+  # need some helper function defined elsewhere
+  source("https://raw.githubusercontent.com/HenrikEckermann/in_use/master/reporting.R")
+  diag_df <- data %>%
+    mutate( 
+      sresid = if (brms) scale(residuals(model, re_formula = NA)[, 1])[, 1] else resid(model), 
+      fitted = if (brms) fitted(model, re_formula = NA)[, 1] else fitted(model)
+    ) %>% 
+    mutate(sresid = scale(sresid)[, 1])
+  
+
+  # distribution of the scaled residuals
+  p_resid <- ggplot(diag_df, aes(sresid)) +
+      geom_density() +
+      ylab('Density') + xlab('Standardized Redsiduals') +
+      theme_minimal()
+
+  ## qq plot (source code for gg_qq in script)
+  qq <- 
+    gg_qq(diag_df$sresid)+ 
+    theme_minimal() + 
+    xlab('Theoretical') + ylab('Sample')
+
+  # fitted vs sresid 
+  fit_resid <- 
+    ggplot(diag_df, aes(fitted, sresid)) +
+      geom_point(alpha = 0.6) +
+      geom_smooth(method = "loess", se = F, color = "#f94c39") +
+      geom_point(
+        data = filter(diag_df, abs(sresid) > 3.5), 
+        aes(fitted, sresid), color='red'
+      ) +
+      ggrepel::geom_text_repel(
+        data = filter(diag_df, abs(sresid) > 3.5), 
+        aes_string("fitted", y = "sresid", label = id), size = 3
+      ) +
+      ylab('Standardized Residuals') + xlab('Fitted Values') +
+      scale_y_continuous(breaks=c(-4, -3, -2, -1, 0, 1, 2, 3, 4))+
+      theme_minimal()
+
+  # Fitted vs observed
+  fit_obs <- 
+    ggplot(diag_df, aes_string("fitted", glue("{Y}"))) +
+      geom_point(alpha = 0.6) +
+      geom_smooth(method = "loess", se = F, color = '#f94c39') +
+      ylab(glue("Observed {Y}")) + xlab('Fitted Values') +
+      theme_minimal()
+      
+  (p_resid + qq) /
+    (fit_resid + fit_obs)
+}
diff --git a/R/pc_daa_ancombc_bez.R b/R/pc_daa_ancombc_bez.R
new file mode 100644
index 0000000..0a0cc00
--- /dev/null
+++ b/R/pc_daa_ancombc_bez.R
@@ -0,0 +1,1431 @@
+set.seed(1)
+library(mia)
+library(ANCOMBC)
+library(tidyverse)
+library(tidySummarizedExperiment)
+library(glue)
+
+
+
+###############################################################################
+#########################           1. ITT       ##############################
+###############################################################################
+
+# import of biomfile and meta data can be found in the import script
+load(here::here("data/data_pc.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+
+####################### 1.1 Complete Case Analysis ############################
+
+
+# model includes random intercepts and all samples 
+if (!file.exists(here::here("data/daa_cc_mlm_pc_bez.Rds"))) {
+  output <- ancombc2(
+    data = tse_pc,
+    assay_name = "counts",
+    tax_level = NULL,
+    fix_formula = "condition * age_s + siblings",
+    rand_formula = "(1 | skippy_id)",
+    p_adj_method = "holm",
+    pseudo = 0,
+    pseudo_sens = TRUE,
+    prv_cut = 0.10, 
+    lib_cut = 1000, 
+    s0_perc = 0.05,
+    group = "condition",
+    struc_zero = TRUE,
+    neg_lb = TRUE,
+    alpha = 0.4,
+    n_cl = 2,
+    verbose = TRUE,
+    global = FALSE,
+    pairwise = FALSE,
+    dunnet = FALSE,
+    trend = FALSE,
+    iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+    em_control = list(tol = 1e-5, max_iter = 100),
+    lme_control = lme4::lmerControl(),
+    mdfdr_control = list(fwer_ctrl_method = "holm", B = 100),
+    trend_control = list(
+      contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)),
+      node = list(2),
+      solver = "ECOS",B = 100)
+    )
+    save(output, file = here::here("data/daa_cc_mlm_pc_bez.Rds"))
+ } else {
+   load(here::here("data/daa_cc_mlm_pc_bez.Rds"))
+}
+   
+# structural zeros 
+tab_zero = output$zero_ind
+sum(tab_zero[, 2]) + sum(tab_zero[, 3])
+# sensitivity scores 
+tab_sens = output$pseudo_sens_tab
+head(tab_sens)
+pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>%
+  filter(value >5)
+
+res_prim = output$res
+colnames(res_prim)
+effects <- c("age_s", "condition1", "condition1:age_s", "siblings1")
+map(effects, function(effect) {
+  select(res_prim, 
+    taxon, 
+    lfc = glue::glue("lfc_{effect}"),
+    se = glue::glue("se_{effect}"),
+    indicator = glue::glue("diff_{effect}")) %>%
+  filter(indicator) %>%
+  mutate(effect = effect)
+})
+
+
+# model excludes random intercepts and includes all samples
+if (!file.exists(here::here("data/daa_cc_pc_bez.Rds"))) {
+  output <- ancombc2(
+    data = tse_pc, 
+    assay_name = "counts", 
+    tax_level = NULL,
+    fix_formula = "condition * age_s + siblings",
+    #rand_formula = "(1 | skippy_id)",
+    p_adj_method = "holm", 
+    pseudo = 0, 
+    pseudo_sens = TRUE,
+    prv_cut = 0.10, 
+    lib_cut = 1000, 
+    s0_perc = 0.05,
+    group = "condition", 
+    struc_zero = TRUE, 
+    neg_lb = TRUE,
+    alpha = 0.4, 
+    n_cl = 2, 
+    verbose = TRUE,
+    global = FALSE, 
+    pairwise = FALSE, 
+    dunnet = FALSE, 
+    trend = FALSE,
+    iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+    em_control = list(tol = 1e-5, max_iter = 100),
+    lme_control = lme4::lmerControl(),
+    mdfdr_control = list(fwer_ctrl_method = "holm", B = 100),
+    trend_control = list(
+      contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), 
+      node = list(2), 
+      solver = "ECOS",B = 100)
+    )
+    save(output, file = here::here("data/daa_cc_pc_bez.Rds"))
+ } else {
+   load(here::here("data/daa_cc_pc_bez.Rds"))
+ }
+   
+ # structural zeros 
+ tab_zero = output$zero_ind
+ sum(tab_zero[, 2]) + sum(tab_zero[, 3])
+ # sensitivity scores 
+ tab_sens = output$pseudo_sens_tab
+ head(tab_sens)
+ pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>%
+   filter(value >5)
+ 
+ res_prim = output$res
+ colnames(res_prim)
+ effects <- c("age_s", "condition1", "condition1:age_s", "siblings1")
+ map(effects, function(effect) {
+   select(res_prim, 
+     taxon, 
+     lfc = glue::glue("lfc_{effect}"),
+     se = glue::glue("se_{effect}"),
+     indicator = glue::glue("diff_{effect}")) %>%
+   filter(indicator) %>%
+   mutate(effect = effect)
+ })
+
+# model includes random intercepts and excludes 1 year samples
+if (!file.exists(here::here("data/daa_cc_mlm_infancy_pc_bez.Rds"))) {
+  output <- ancombc2(
+    data = filter(tse_pc, week != 52), 
+    assay_name = "counts", 
+    tax_level = NULL,
+    fix_formula = "condition * age_s + siblings",
+    rand_formula = "(1 | skippy_id)",
+    p_adj_method = "holm", 
+    pseudo = 0, 
+    pseudo_sens = TRUE,
+    prv_cut = 0.10, 
+    lib_cut = 1000, 
+    s0_perc = 0.05,
+    group = "condition", 
+    struc_zero = TRUE, 
+    neg_lb = TRUE,
+    alpha = 0.4, 
+    n_cl = 2, 
+    verbose = TRUE,
+    global = FALSE, 
+    pairwise = FALSE, 
+    dunnet = FALSE, 
+    trend = FALSE,
+    iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+    em_control = list(tol = 1e-5, max_iter = 100),
+    lme_control = lme4::lmerControl(),
+    mdfdr_control = list(fwer_ctrl_method = "holm", B = 100),
+    trend_control = list(
+      contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), 
+      node = list(2), 
+      solver = "ECOS",B = 100)
+    )
+    save(output, file = here::here("data/daa_cc_mlm_infancy_pc_bez.Rds"))
+ } else {
+   load(here::here("data/daa_cc_mlm_infancy_pc_bez.Rds"))
+}
+   
+# structural zeros 
+tab_zero = output$zero_ind
+sum(tab_zero[, 2]) + sum(tab_zero[, 3])
+# sensitivity scores 
+tab_sens = output$pseudo_sens_tab
+head(tab_sens)
+pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>%
+ filter(value >5)
+
+res_prim = output$res
+colnames(res_prim)
+effects <- c("age_s", "condition1", "condition1:age_s", "siblings1")
+map(effects, function(effect) {
+ select(res_prim, 
+   taxon, 
+   lfc = glue::glue("lfc_{effect}"),
+   se = glue::glue("se_{effect}"),
+   indicator = glue::glue("diff_{effect}")) %>%
+ filter(indicator) %>%
+ mutate(effect = effect)
+})
+
+
+# model includes 1 years samples only 
+if (!file.exists(here::here("data/daa_cc_year1_pc_bez.Rds"))) {
+  output <- ancombc2(
+    data = filter(tse_pc, week == 52), 
+    assay_name = "counts", 
+    tax_level = NULL,
+    fix_formula = "condition * age_s + siblings",
+    # rand_formula = "(1 | skippy_id)",
+    p_adj_method = "holm", 
+    pseudo = 0, 
+    pseudo_sens = TRUE,
+    prv_cut = 0.10, 
+    lib_cut = 1000, 
+    s0_perc = 0.05,
+    group = "condition", 
+    struc_zero = TRUE, 
+    neg_lb = TRUE,
+    alpha = 0.4, 
+    n_cl = 2, 
+    verbose = TRUE,
+    global = FALSE, 
+    pairwise = FALSE, 
+    dunnet = FALSE, 
+    trend = FALSE,
+    iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+    em_control = list(tol = 1e-5, max_iter = 100),
+    lme_control = lme4::lmerControl(),
+    mdfdr_control = list(fwer_ctrl_method = "holm", B = 100),
+    trend_control = list(
+      contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), 
+      node = list(2), 
+      solver = "ECOS",B = 100)
+    )
+    save(output, file = here::here("data/daa_cc_year1_pc_bez.Rds"))
+ } else {
+   load(here::here("data/daa_cc_year1_pc_bez.Rds"))
+ }
+   
+# structural zeros 
+tab_zero = output$zero_ind
+sum(tab_zero[, 2]) + sum(tab_zero[, 3])
+ # sensitivity scores 
+tab_sens = output$pseudo_sens_tab
+head(tab_sens)
+pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>%
+   filter(value >5)
+ 
+res_prim = output$res
+colnames(res_prim)
+effects <- c("age_s", "condition1", "condition1:age_s", "siblings1")
+map(effects, function(effect) {
+   select(res_prim, 
+     taxon, 
+     lfc = glue::glue("lfc_{effect}"),
+     se = glue::glue("se_{effect}"),
+     indicator = glue::glue("diff_{effect}")) %>%
+   filter(indicator) %>%
+   mutate(effect = effect)
+})
+
+# for the samples at 1 year we find an effect on two metabolites:
+# 1 MGB016 35.18912 10.45095      TRUE condition1
+# 2 MGB010 35.02913 10.42313      TRUE condition1
+
+
+
+
+######################## 1.2 Multiple imputation  #############################
+
+
+
+load(here::here("data/data_pc.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+# for analyses we apply prevalence fitlering
+tse_pc <- subsetByPrevalentTaxa(tse_pc, detection = 0, prevalence = 0.1)
+
+models_imp <- map2(implist, 1:length(implist), function(dimp, imp) {
+  tse_pc_map <- tse_pc
+
+  fvars <- c("constipation", "siblings", "diarrhea", "condition")
+  # add metadata to tse
+  colData(tse_pc_map) <- colData(tse_pc_map) %>%
+    as.data.frame() %>%
+    select(-all_of(fvars), -contains("age")) %>%
+    left_join(select(dimp, age, sample_id, constipation, diarrhea, siblings, condition), by = "sample_id") %>%
+    column_to_rownames("sample_id") %>%
+    mutate(across(all_of(fvars), function(x) as.factor(x))) %>%
+    DataFrame()
+  colData(tse_pc_map)$age <- colData(tse_pc_map)$age + as.numeric(colData(tse_pc_map)$week) * 7
+  colData(tse_pc_map)$age_s <- scale(colData(tse_pc_map)$age)[, 1]
+
+  # model includes random intercepts and excludes 1 year samples
+  if (!file.exists(here::here(glue("data/daa_cc_mlm_all_imp{imp}_pc_bez.Rds")))) {
+    output <- ancombc2(
+      data = tse_pc_map, 
+      assay_name = "counts", 
+      tax_level = NULL,
+      fix_formula = "condition * age_s + siblings",
+      rand_formula = "(1 | skippy_id)",
+      p_adj_method = "holm", 
+      pseudo = 0, 
+      pseudo_sens = TRUE,
+      prv_cut = 0.10, 
+      lib_cut = 1000, 
+      s0_perc = 0.05,
+      group = "condition", 
+      struc_zero = TRUE, 
+      neg_lb = TRUE,
+      alpha = 0.4, 
+      n_cl = 2, 
+      verbose = TRUE,
+      global = FALSE, 
+      pairwise = FALSE, 
+      dunnet = FALSE, 
+      trend = FALSE,
+      iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+      em_control = list(tol = 1e-5, max_iter = 100),
+      lme_control = lme4::lmerControl(),
+      mdfdr_control = list(fwer_ctrl_method = "holm", B = 100),
+      trend_control = list(
+        contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), 
+        node = list(2), 
+        solver = "ECOS",B = 100)
+      )
+      save(output, file = here::here(glue("data/daa_cc_mlm_all_imp{imp}_pc_bez.Rds")))
+   } else {
+     load(here::here(glue("data/daa_cc_mlm_all_imp{imp}_pc_bez.Rds")))
+  }
+     
+  # structural zeros 
+  tab_zero_all = output$zero_ind
+  # sensitivity scores 
+  tab_sens_all = output$pseudo_sens_tab
+  sens_scores_all <- pivot_longer(tab_sens_all, -taxon, names_to = "contrast", values_to = "value") %>%
+     filter(value >5)
+  res_prim_all = output$res
+  effects_all <- map(effects, function(effect) {
+     select(res_prim_all, 
+       taxon, 
+       lfc = glue::glue("lfc_{effect}"),
+       se = glue::glue("se_{effect}"),
+       indicator = glue::glue("diff_{effect}")) %>%
+     filter(indicator) %>%
+     mutate(effect = effect)
+   })
+
+
+
+  # model includes random intercepts and excludes 1 year samples
+  if (!file.exists(here::here(glue("data/daa_cc_mlm_infancy_imp{imp}_pc_bez.Rds")))) {
+    output <- ancombc2(
+      data = filter(tse_pc_map, week != 52), 
+      assay_name = "counts", 
+      tax_level = NULL,
+      fix_formula = "condition * age_s + siblings",
+      rand_formula = "(1 | skippy_id)",
+      p_adj_method = "holm", 
+      pseudo = 0, 
+      pseudo_sens = TRUE,
+      prv_cut = 0.10, 
+      lib_cut = 1000, 
+      s0_perc = 0.05,
+      group = "condition", 
+      struc_zero = TRUE, 
+      neg_lb = TRUE,
+      alpha = 0.4, 
+      n_cl = 2, 
+      verbose = TRUE,
+      global = FALSE, 
+      pairwise = FALSE, 
+      dunnet = FALSE, 
+      trend = FALSE,
+      iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+      em_control = list(tol = 1e-5, max_iter = 100),
+      lme_control = lme4::lmerControl(),
+      mdfdr_control = list(fwer_ctrl_method = "holm", B = 100),
+      trend_control = list(
+        contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), 
+        node = list(2), 
+        solver = "ECOS",B = 100)
+      )
+      save(output, file = here::here(glue("data/daa_cc_mlm_infancy_imp{imp}_pc_bez.Rds")))
+   } else {
+     load(here::here(glue("data/daa_cc_mlm_infancy_imp{imp}_pc_bez.Rds")))
+  }
+     
+  # structural zeros 
+  tab_zero_infancy = output$zero_ind
+  # sensitivity scores 
+  tab_sens_infancy = output$pseudo_sens_tab
+  sens_scores_infancy <- pivot_longer(tab_sens_infancy, -taxon, names_to = "contrast", values_to = "value") %>%
+     filter(value >5)
+  res_prim_infancy = output$res
+  effects_infancy <- map(effects, function(effect) {
+     select(res_prim_infancy, 
+       taxon, 
+       lfc = glue::glue("lfc_{effect}"),
+       se = glue::glue("se_{effect}"),
+       indicator = glue::glue("diff_{effect}")) %>%
+     filter(indicator) %>%
+     mutate(effect = effect)
+   })
+
+
+
+
+  # model includes 1 years samples only
+  if (!file.exists(here::here(glue("data/daa_cc_year1_imp{imp}_pc_bez.Rds")))) {
+    output <- ancombc2(
+      data = filter(tse_pc_map, week == 52), 
+      assay_name = "counts", 
+      tax_level = NULL,
+      fix_formula = "condition * age_s + siblings",
+      # rand_formula = "(1 | skippy_id)",
+      p_adj_method = "holm", 
+      pseudo = 0, 
+      pseudo_sens = TRUE,
+      prv_cut = 0.10, 
+      lib_cut = 1000, 
+      s0_perc = 0.05,
+      group = "condition", 
+      struc_zero = TRUE, 
+      neg_lb = TRUE,
+      alpha = 0.4, 
+      n_cl = 2, 
+      verbose = TRUE,
+      global = FALSE, 
+      pairwise = FALSE, 
+      dunnet = FALSE, 
+      trend = FALSE,
+      iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+      em_control = list(tol = 1e-5, max_iter = 100),
+      lme_control = lme4::lmerControl(),
+      mdfdr_control = list(fwer_ctrl_method = "holm", B = 100),
+      trend_control = list(
+        contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), 
+        node = list(2), 
+        solver = "ECOS",B = 100)
+      )
+      save(output, file = here::here(glue("data/daa_cc_year1_imp{imp}_pc_bez.Rds")))
+   } else {
+     load(here::here(glue("data/daa_cc_year1_imp{imp}_pc_bez.Rds")))
+   }
+     
+  # structural zeros 
+  tab_zero_year1 = output$zero_ind
+  # sensitivity scores 
+  tab_sens_year1 = output$pseudo_sens_tab
+  sens_scores_year1 <- pivot_longer(tab_sens_year1, -taxon, names_to = "contrast", values_to = "value") %>%
+     filter(value >5)
+  res_prim_year1 = output$res
+  effects_year1 <- map(effects, function(effect) {
+     select(res_prim_year1, 
+       taxon, 
+       lfc = glue::glue("lfc_{effect}"),
+       se = glue::glue("se_{effect}"),
+       indicator = glue::glue("diff_{effect}")) %>%
+     filter(indicator) %>%
+     mutate(effect = effect)
+   })
+
+  list(
+      all = list(
+        tab_zero_all,
+        sens_scores_all,
+        res_prim_all,
+        effects_all
+    ),
+    infancy = list(
+      tab_zero_infancy,
+      sens_scores_infancy,
+      res_prim_infancy,
+      effects_infancy
+    ),
+    year1 = list(
+      tab_zero_year1,
+      sens_scores_year1,
+      res_prim_year1,
+      effects_year1
+    )
+  )
+  
+})
+
+ancom_tables_itt <- models_imp[[1]]$all[[3]] %>% 
+  select(taxon, lfc_condition1, se_condition1, p_condition1, q_condition1) %>%
+  arrange(q_condition1, desc(abs(lfc_condition1))) %>%
+  mutate(across(where(is.numeric), round, 3))
+colnames(ancom_tables_itt) <- str_remove(colnames(ancom_tables_itt), "_condition1") 
+
+ancom_tables_itt2 <- models_imp[[1]]$all[[3]] %>% 
+  select(taxon, "lfc_condition1:age_s", "se_condition1:age_s", "p_condition1:age_s", "q_condition1:age_s") %>%
+  arrange(`q_condition1:age_s`, desc(abs(`lfc_condition1:age_s`))) %>%
+  mutate(across(where(is.numeric), round, 3))
+
+colnames(ancom_tables_itt2) <- str_remove(colnames(ancom_tables_itt2), "_condition1:age_s") 
+
+save(ancom_tables_itt, file = here::here("data/ancom_tables_itt_pc.Rds"))
+
+ancom_tables_itt
+ancom_tables_itt2
+
+
+
+
+
+
+
+
+
+
+t1 <- select(
+  models_imp[[1]]$all[[3]],
+  taxon,
+  lfc_age_s,
+  lfc_siblings1,
+
+) %>%
+  pivot_longer(
+    contains("lfc"), 
+    names_to = "variable", 
+    values_to = "lfc", 
+    names_prefix = "lfc_")
+  
+
+t2 <- select(
+  models_imp[[1]]$all[[3]],
+  taxon,
+  p_age_s,
+  p_siblings1,
+
+
+  ) %>%
+  pivot_longer(
+    contains("p_"), 
+    names_to = "variable", 
+    values_to = "p", 
+    names_prefix = "p_")
+t2
+t3 <- select(
+  models_imp[[1]]$all[[3]],
+  taxon,
+  q_age_s,
+  q_siblings1,
+
+
+  ) %>%
+  pivot_longer(
+    contains("q_"), 
+    names_to = "variable", 
+    values_to = "q", 
+    names_prefix = "q_")
+t3
+
+ancombc_remaining <- full_join(t1, t2, by = c("taxon", "variable")) %>%
+  full_join(t3, by = c("taxon", "variable")) %>%
+  filter(p <= 0.05) %>%
+  mutate(
+    across(where(is.numeric), round, 3),
+    taxon = str_remove(taxon, "genus:")
+    ) %>%
+  arrange(variable, q) 
+save(ancombc_remaining, file = here::here("data/ancombc_remaining_pc.Rds"))
+
+
+
+
+# same for early infancy samples:
+ancom_tables_itt_infancy <- models_imp[[1]]$infancy[[3]] %>% 
+  select(taxon, lfc_condition1, se_condition1, p_condition1, q_condition1) %>%
+  arrange(q_condition1, desc(abs(lfc_condition1))) %>%
+  mutate(across(where(is.numeric), round, 3))
+colnames(ancom_tables_itt_infancy) <- str_remove(colnames(ancom_tables_itt), "_condition1") 
+
+ancom_tables_itt2_infancy <- models_imp[[1]]$infancy[[3]] %>% 
+  select(taxon, "lfc_condition1:age_s", "se_condition1:age_s", "p_condition1:age_s", "q_condition1:age_s") %>%
+  arrange(`q_condition1:age_s`, desc(abs(`lfc_condition1:age_s`))) %>%
+  mutate(across(where(is.numeric), round, 3))
+
+colnames(ancom_tables_itt2_infancy) <- str_remove(colnames(ancom_tables_itt2), "_condition1:age_s") 
+
+save(ancom_tables_itt_infancy, file = here::here("data/ancom_tables_itt_infancy_pc.Rds"))
+
+ancom_tables_itt_infancy
+ancom_tables_itt2_infancy
+
+
+
+
+t1_infancy <- select(
+  models_imp[[1]]$infancy[[3]],
+  taxon,
+  lfc_age_s,
+  lfc_siblings1,
+
+) %>%
+  pivot_longer(
+    contains("lfc"), 
+    names_to = "variable", 
+    values_to = "lfc", 
+    names_prefix = "lfc_")
+
+
+t2_infancy <- select(
+  models_imp[[1]]$infancy[[3]],
+  taxon,
+  p_age_s,
+  p_siblings1,
+
+
+) %>%
+  pivot_longer(
+    contains("p_"), 
+    names_to = "variable", 
+    values_to = "p", 
+    names_prefix = "p_")
+t2
+t3_infancy <- select(
+  models_imp[[1]]$infancy[[3]],
+  taxon,
+  q_age_s,
+  q_siblings1,
+
+
+) %>%
+  pivot_longer(
+    contains("q_"), 
+    names_to = "variable", 
+    values_to = "q", 
+    names_prefix = "q_")
+t3
+
+ancombc_remaining_infancy <- full_join(t1_infancy, t2_infancy, by = c("taxon", "variable")) %>%
+  full_join(t3_infancy, by = c("taxon", "variable")) %>%
+  filter(p <= 0.05) %>%
+  mutate(
+    across(where(is.numeric), round, 3),
+    taxon = str_remove(taxon, "genus:")
+  ) %>%
+  arrange(variable, q) 
+save(ancombc_remaining_infancy, file = here::here("data/ancombc_remaining_infancy_pc.Rds"))
+
+
+
+# same for year1 samples:
+ancom_tables_itt_year1 <- models_imp[[1]]$year1[[3]] %>% 
+  select(taxon, lfc_condition1, se_condition1, p_condition1, q_condition1) %>%
+  arrange(q_condition1, desc(abs(lfc_condition1))) %>%
+  mutate(across(where(is.numeric), round, 3))
+colnames(ancom_tables_itt_year1) <- str_remove(colnames(ancom_tables_itt), "_condition1") 
+
+ancom_tables_itt2_year1 <- models_imp[[1]]$year1[[3]] %>% 
+  select(taxon, "lfc_condition1:age_s", "se_condition1:age_s", "p_condition1:age_s", "q_condition1:age_s") %>%
+  arrange(`q_condition1:age_s`, desc(abs(`lfc_condition1:age_s`))) %>%
+  mutate(across(where(is.numeric), round, 3))
+
+colnames(ancom_tables_itt2_year1) <- str_remove(colnames(ancom_tables_itt2), "_condition1:age_s") 
+
+save(ancom_tables_itt_year1, file = here::here("data/ancom_tables_itt_year1_pc.Rds"))
+
+ancom_tables_itt_year1
+ancom_tables_itt2_year1
+
+
+
+
+
+t1_year1 <- select(
+  models_imp[[1]]$year1[[3]],
+  taxon,
+  lfc_age_s,
+  lfc_siblings1,
+
+) %>%
+  pivot_longer(
+    contains("lfc"), 
+    names_to = "variable", 
+    values_to = "lfc", 
+    names_prefix = "lfc_")
+
+
+t2_year1 <- select(
+  models_imp[[1]]$year1[[3]],
+  taxon,
+  p_age_s,
+  p_siblings1,
+
+
+) %>%
+  pivot_longer(
+    contains("p_"), 
+    names_to = "variable", 
+    values_to = "p", 
+    names_prefix = "p_")
+t2
+t3_year1 <- select(
+  models_imp[[1]]$year1[[3]],
+  taxon,
+  q_age_s,
+  q_siblings1,
+
+
+) %>%
+  pivot_longer(
+    contains("q_"), 
+    names_to = "variable", 
+    values_to = "q", 
+    names_prefix = "q_")
+t3
+
+ancombc_remaining_year1 <- full_join(t1_year1, t2_year1, by = c("taxon", "variable")) %>%
+  full_join(t3_year1, by = c("taxon", "variable")) %>%
+  filter(p <= 0.05) %>%
+  mutate(
+    across(where(is.numeric), round, 3),
+    taxon = str_remove(taxon, "genus:")
+  ) %>%
+  arrange(variable, q) 
+save(ancombc_remaining_year1, file = here::here("data/ancombc_remaining_year1_pc.Rds"))
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+###############################################################################
+#########################           2. PP        ##############################
+###############################################################################
+
+# import of biomfile and meta data can be found in the import script
+load(here::here("data/data_pc.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+
+# obtain ids that were selected for PP analyses 
+pp_indicator <- foreign::read.spss(here::here("data/raw_data/kelly141022/Data_ITT_PP_ExploratoryDRselections.sav"), to.data.frame = TRUE)
+pp_indicator <- select(pp_indicator, skippy_id = ID, pp = PP)
+# add pp info to existing data 
+if (!"pp" %in% colnames(d)) {
+  d <- left_join(d, pp_indicator, by = "skippy_id")
+}
+# 60 that are in PP and condition 0; 18 that are condition 1 and pp. Fits...
+d_pp <- filter(d, pp == 1)
+implist_pp <- map(implist, function(dimp) {
+  dimp_new <- left_join(dimp, pp_indicator, by = "skippy_id") %>%
+                filter(pp == 1)
+  dimp_new
+})
+
+
+# for analyses we apply prevalence fitlering
+tse_pc <- subsetByPrevalentTaxa(tse_pc, detection = 0, prevalence = 0.1)
+colData(tse_pc) <- colData(tse_pc) %>%
+  as.data.frame() %>%
+  #rownames_to_column("sample_id") %>%
+  left_join(select(d_pp, sample_id, pp), by = "sample_id") %>%
+  column_to_rownames("sample_id") %>%
+  DataFrame()
+tse_pc <- filter(tse_pc, pp == 1)
+
+
+####################### 2.1 Complete Case Analysis ############################
+
+
+
+
+
+# model includes random intercepts and all samples 
+if (!file.exists(here::here("data/daa_pp_mlm_pc_bez.Rds"))) {
+  output <- ancombc2(
+    data = tse_pc, 
+    assay_name = "counts", 
+    tax_level = NULL,
+    fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex",
+    rand_formula = "(1 | skippy_id)",
+    p_adj_method = "holm", 
+    pseudo = 0, 
+    pseudo_sens = TRUE,
+    prv_cut = 0.10, 
+    lib_cut = 1000, 
+    s0_perc = 0.05,
+    group = "condition", 
+    struc_zero = TRUE, 
+    neg_lb = TRUE,
+    alpha = 0.4, 
+    n_cl = 2, 
+    verbose = TRUE,
+    global = FALSE, 
+    pairwise = FALSE, 
+    dunnet = FALSE, 
+    trend = FALSE,
+    iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+    em_control = list(tol = 1e-5, max_iter = 100),
+    lme_control = lme4::lmerControl(),
+    mdfdr_control = list(fwer_ctrl_method = "holm", B = 100),
+    trend_control = list(
+      contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), 
+      node = list(2), 
+      solver = "ECOS",B = 100)
+    )
+    save(output, file = here::here("data/daa_pp_mlm_pc_bez.Rds"))
+ } else {
+   load(here::here("data/daa_pp_mlm_pc_bez.Rds"))
+ }
+
+# structural zeros 
+tab_zero = output$zero_ind
+sum(tab_zero[, 2]) + sum(tab_zero[, 3])
+# sensitivity scores 
+tab_sens = output$pseudo_sens_tab
+head(tab_sens)
+pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>%
+  filter(value >5)
+
+res_prim = output$res
+colnames(res_prim)
+effects <- c("age_s", "condition1", "condition1:age_s", "siblings1")
+map(effects, function(effect) {
+  select(res_prim, 
+    taxon, 
+    lfc = glue::glue("lfc_{effect}"),
+    se = glue::glue("se_{effect}"),
+    indicator = glue::glue("diff_{effect}")) %>%
+  filter(indicator) %>%
+  mutate(effect = effect)
+})
+
+
+
+# model includes random intercepts and excludes 1 year samples
+if (!file.exists(here::here("data/daa_pp_mlm_infancy_pc_bez.Rds"))) {
+  output <- ancombc2(
+    data = filter(tse_pc, week != 52), 
+    assay_name = "counts", 
+    tax_level = NULL,
+    fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex",
+    rand_formula = "(1 | skippy_id)",
+    p_adj_method = "holm", 
+    pseudo = 0, 
+    pseudo_sens = TRUE,
+    prv_cut = 0.10, 
+    lib_cut = 1000, 
+    s0_perc = 0.05,
+    group = "condition", 
+    struc_zero = TRUE, 
+    neg_lb = TRUE,
+    alpha = 0.4, 
+    n_cl = 2, 
+    verbose = TRUE,
+    global = FALSE, 
+    pairwise = FALSE, 
+    dunnet = FALSE, 
+    trend = FALSE,
+    iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+    em_control = list(tol = 1e-5, max_iter = 100),
+    lme_control = lme4::lmerControl(),
+    mdfdr_control = list(fwer_ctrl_method = "holm", B = 100),
+    trend_control = list(
+      contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), 
+      node = list(2), 
+      solver = "ECOS",B = 100)
+    )
+    save(output, file = here::here("data/daa_pp_mlm_infancy_pc_bez.Rds"))
+ } else {
+   load(here::here("data/daa_pp_mlm_infancy_pc_bez.Rds"))
+}
+   
+# structural zeros 
+tab_zero = output$zero_ind
+sum(tab_zero[, 2]) + sum(tab_zero[, 3])
+# sensitivity scores 
+tab_sens = output$pseudo_sens_tab
+head(tab_sens)
+pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>%
+ filter(value >5)
+
+res_prim = output$res
+colnames(res_prim)
+effects <- c("age_s", "condition1", "condition1:age_s", "siblings1")
+map(effects, function(effect) {
+ select(res_prim, 
+   taxon, 
+   lfc = glue::glue("lfc_{effect}"),
+   se = glue::glue("se_{effect}"),
+   indicator = glue::glue("diff_{effect}")) %>%
+ filter(indicator) %>%
+ mutate(effect = effect)
+})
+
+
+# model includes 1 years samples only 
+if (!file.exists(here::here("data/daa_pp_year1_pc_bez.Rds"))) {
+  output <- ancombc2(
+    data = filter(tse_pc, week == 52), 
+    assay_name = "counts", 
+    tax_level = NULL,
+    fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex",
+    # rand_formula = "(1 | skippy_id)",
+    p_adj_method = "holm", 
+    pseudo = 0, 
+    pseudo_sens = TRUE,
+    prv_cut = 0.10, 
+    lib_cut = 1000, 
+    s0_perc = 0.05,
+    group = "condition", 
+    struc_zero = TRUE, 
+    neg_lb = TRUE,
+    alpha = 0.4, 
+    n_cl = 2, 
+    verbose = TRUE,
+    global = FALSE, 
+    pairwise = FALSE, 
+    dunnet = FALSE, 
+    trend = FALSE,
+    iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+    em_control = list(tol = 1e-5, max_iter = 100),
+    lme_control = lme4::lmerControl(),
+    mdfdr_control = list(fwer_ctrl_method = "holm", B = 100),
+    trend_control = list(
+      contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), 
+      node = list(2), 
+      solver = "ECOS",B = 100)
+    )
+    save(output, file = here::here("data/daa_pp_year1_pc_bez.Rds"))
+ } else {
+   load(here::here("data/daa_pp_year1_pc_bez.Rds"))
+ }
+   
+# structural zeros 
+tab_zero = output$zero_ind
+sum(tab_zero[, 2]) + sum(tab_zero[, 3])
+ # sensitivity scores 
+tab_sens = output$pseudo_sens_tab
+head(tab_sens)
+pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>%
+   filter(value >5)
+ 
+res_prim = output$res
+colnames(res_prim)
+effects <- c("age_s", "condition1", "condition1:age_s", "siblings1")
+map(effects, function(effect) {
+   select(res_prim, 
+     taxon, 
+     lfc = glue::glue("lfc_{effect}"),
+     se = glue::glue("se_{effect}"),
+     indicator = glue::glue("diff_{effect}")) %>%
+   filter(indicator) %>%
+   mutate(effect = effect)
+})
+
+
+
+
+
+######################## 2.2 Multiple imputation  #############################
+
+
+
+load(here::here("data/data_pc.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+# for analyses we apply prevalence fitlering
+tse <- subsetByPrevalentTaxa(tse_pc, detection = 0, prevalence = 0.1)
+
+models_imp <- map2(implist_pp, 1:length(implist), function(dimp, imp) {
+  tse_map <- tse
+  fvars <- c("constipation", "siblings", "diarrhea", "condition", "csection", "sex")
+  # add metadata to tse
+  colData(tse_map) <- colData(tse_map) %>%
+    as.data.frame() %>%
+    select(-age_s, -age, -siblings, -ges_age, -birthweight, -edlevel, -csection, -sex, -constipation, -diarrhea) %>%
+    left_join(select(dimp, age, sample_id, constipation, diarrhea, siblings, pp, csection, sex, ges_age_s, edlevel_s, birthweight_s), by = "sample_id") %>%
+    column_to_rownames("sample_id") %>%
+    mutate(across(all_of(fvars), function(x) as.factor(x))) %>%
+    DataFrame()
+  colData(tse_map)$age <- colData(tse_map)$age + as.numeric(colData(tse_map)$week) * 7
+  colData(tse_map)$age_s <- scale(colData(tse_map)$age)[, 1]
+  
+  tse_map <- filter(tse_map, pp == 1)
+  
+  # model includes random intercepts and excludes 1 year samples
+  if (!file.exists(here::here(glue("data/daa_pp_mlm_imp{imp}_pc_bez.Rds")))) {
+    output <- ancombc2(
+      data = tse_map, 
+      assay_name = "counts", 
+      #tax_level = "genus",
+      fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex",
+      rand_formula = "(1 | skippy_id)",
+      p_adj_method = "fdr", 
+      pseudo = 0, 
+      pseudo_sens = TRUE,
+      prv_cut = 0.10, 
+      lib_cut = 1000, 
+      s0_perc = 0.05,
+      group = "condition", 
+      struc_zero = TRUE, 
+      neg_lb = TRUE,
+      alpha = 0.4, 
+      n_cl = 2, 
+      verbose = TRUE,
+      global = FALSE, 
+      pairwise = FALSE, 
+      dunnet = FALSE, 
+      trend = FALSE,
+      iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+      em_control = list(tol = 1e-5, max_iter = 100),
+      lme_control = lme4::lmerControl(),
+      mdfdr_control = list(fwer_ctrl_method = "fdr", B = 100),
+      trend_control = list(
+        contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), 
+        node = list(2), 
+        solver = "ECOS",B = 100)
+    )
+    save(output, file = here::here(glue("data/daa_pp_mlm_imp{imp}_pc_bez.Rds")))
+  } else {
+    load(here::here(glue("data/daa_pp_mlm_imp{imp}_pc_bez.Rds")))
+  }
+  
+  # structural zeros 
+  tab_zero_all = output$zero_ind
+  # sensitivity scores 
+  tab_sens_all = output$pseudo_sens_tab
+  sens_scores_all <- pivot_longer(tab_sens_all, -taxon, names_to = "contrast", values_to = "value") %>%
+    filter(value >5)
+  res_prim_all <- output$res
+  effects_all <- map(effects, function(effect) {
+    select(res_prim_all, 
+           taxon, 
+           lfc = glue::glue("lfc_{effect}"),
+           se = glue::glue("se_{effect}"),
+           indicator = glue::glue("diff_{effect}")) %>%
+      filter(indicator) %>%
+      mutate(effect = effect)
+  })
+  
+  
+  
+  # model includes random intercepts and excludes 1 year samples
+  if (!file.exists(here::here(glue("data/daa_pp_mlm_infancy_imp{imp}_pc_bez.Rds")))) {
+    output <- ancombc2(
+      data = filter(tse_map, week != 52), 
+      assay_name = "counts", 
+      #tax_level = "genus",
+      fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex",
+      rand_formula = "(1 | skippy_id)",
+      p_adj_method = "fdr", 
+      pseudo = 0, 
+      pseudo_sens = TRUE,
+      prv_cut = 0.10, 
+      lib_cut = 1000, 
+      s0_perc = 0.05,
+      group = "condition", 
+      struc_zero = TRUE, 
+      neg_lb = TRUE,
+      alpha = 0.4, 
+      n_cl = 2, 
+      verbose = TRUE,
+      global = FALSE, 
+      pairwise = FALSE, 
+      dunnet = FALSE, 
+      trend = FALSE,
+      iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+      em_control = list(tol = 1e-5, max_iter = 100),
+      lme_control = lme4::lmerControl(),
+      mdfdr_control = list(fwer_ctrl_method = "fdr", B = 100),
+      trend_control = list(
+        contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), 
+        node = list(2), 
+        solver = "ECOS",B = 100)
+      )
+      save(output, file = here::here(glue("data/daa_pp_mlm_infancy_imp{imp}_pc_bez.Rds")))
+   } else {
+     load(here::here(glue("data/daa_pp_mlm_infancy_imp{imp}_pc_bez.Rds")))
+  }
+     
+  # structural zeros 
+  tab_zero_infancy = output$zero_ind
+  # sensitivity scores 
+  tab_sens_infancy = output$pseudo_sens_tab
+  sens_scores_infancy <- pivot_longer(tab_sens_infancy, -taxon, names_to = "contrast", values_to = "value") %>%
+     filter(value >5)
+  res_prim_infancy <- output$res
+  effects_infancy <- map(effects, function(effect) {
+     select(res_prim_infancy, 
+       taxon, 
+       lfc = glue::glue("lfc_{effect}"),
+       se = glue::glue("se_{effect}"),
+       indicator = glue::glue("diff_{effect}")) %>%
+     filter(indicator) %>%
+     mutate(effect = effect)
+   })
+  
+  
+  # model includes 1 years samples only 
+  if (!file.exists(here::here(glue("data/daa_pp_year1_imp{imp}_pc_bez.Rds")))) {
+    output <- ancombc2(
+      data = filter(tse_map, week == 52), 
+      assay_name = "counts", 
+      #tax_level = "genus",
+      fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex",
+      # rand_formula = "(1 | skippy_id)",
+      p_adj_method = "fdr", 
+      pseudo = 0, 
+      pseudo_sens = TRUE,
+      prv_cut = 0.10, 
+      lib_cut = 1000, 
+      s0_perc = 0.05,
+      group = "condition", 
+      struc_zero = TRUE, 
+      neg_lb = TRUE,
+      alpha = 0.4, 
+      n_cl = 2, 
+      verbose = TRUE,
+      global = FALSE, 
+      pairwise = FALSE, 
+      dunnet = FALSE, 
+      trend = FALSE,
+      iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
+      em_control = list(tol = 1e-5, max_iter = 100),
+      lme_control = lme4::lmerControl(),
+      mdfdr_control = list(fwer_ctrl_method = "fdr", B = 100),
+      trend_control = list(
+        contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), 
+        node = list(2), 
+        solver = "ECOS",B = 100)
+      )
+      save(output, file = here::here(glue("data/daa_pp_year1_imp{imp}_pc_bez.Rds")))
+   } else {
+     load(here::here(glue("data/daa_pp_year1_imp{imp}_pc_bez.Rds")))
+   }
+     
+  # structural zeros 
+  tab_zero_year1 = output$zero_ind
+  # sensitivity scores 
+  tab_sens_year1 = output$pseudo_sens_tab
+  sens_scores_year1 <- pivot_longer(tab_sens_year1, -taxon, names_to = "contrast", values_to = "value") %>%
+     filter(value >5)
+  res_prim_year1 <- output$res
+  effects_year1 <- map(effects, function(effect) {
+     select(res_prim_year1, 
+       taxon, 
+       lfc = glue::glue("lfc_{effect}"),
+       se = glue::glue("se_{effect}"),
+       indicator = glue::glue("diff_{effect}")) %>%
+     filter(indicator) %>%
+     mutate(effect = effect)
+   })
+
+  list(
+    all = list(
+      tab_zero_all,
+      sens_scores_all,
+      res_prim_all,
+      effects_all
+    ),
+    infancy = list(
+      tab_zero_infancy,
+      sens_scores_infancy,
+      res_prim_infancy,
+      effects_infancy
+    ),
+    year1 = list(
+      tab_zero_year1,
+      sens_scores_year1,
+      res_prim_year1,
+      effects_year1
+    )
+  )
+  
+})
+
+
+
+
+
+
+ancom_tables_pp <- models_imp[[1]]$all[[3]] %>% 
+  select(taxon, lfc_condition1, se_condition1, p_condition1, q_condition1) %>%
+  arrange(q_condition1, desc(abs(lfc_condition1))) %>%
+  mutate(across(where(is.numeric), round, 3))
+colnames(ancom_tables_pp) <- str_remove(colnames(ancom_tables_pp), "_condition1") 
+
+ancom_tables_pp2 <- models_imp[[1]]$all[[3]] %>% 
+  select(taxon, "lfc_condition1:age_s", "se_condition1:age_s", "p_condition1:age_s", "q_condition1:age_s") %>%
+  arrange(`q_condition1:age_s`, desc(abs(`lfc_condition1:age_s`))) %>%
+  mutate(across(where(is.numeric), round, 3))
+
+colnames(ancom_tables_pp2) <- str_remove(colnames(ancom_tables_pp2), "_condition1:age_s") 
+
+save(ancom_tables_pp, file = here::here("data/ancom_tables_pp_pc.Rds"))
+
+ancom_tables_pp
+ancom_tables_pp2
+
+
+
+
+
+
+
+
+
+t1 <- select(
+  models_imp[[1]]$all[[3]],
+  taxon,
+  lfc_age_s,
+  lfc_siblings1,
+
+) %>%
+  pivot_longer(
+    contains("lfc"), 
+    names_to = "variable", 
+    values_to = "lfc", 
+    names_prefix = "lfc_")
+
+
+t2 <- select(
+  models_imp[[1]]$all[[3]],
+  taxon,
+  p_age_s,
+  p_siblings1,
+
+
+) %>%
+  pivot_longer(
+    contains("p_"), 
+    names_to = "variable", 
+    values_to = "p", 
+    names_prefix = "p_")
+t2
+t3 <- select(
+  models_imp[[1]]$all[[3]],
+  taxon,
+  q_age_s,
+  q_siblings1,
+
+
+) %>%
+  pivot_longer(
+    contains("q_"), 
+    names_to = "variable", 
+    values_to = "q", 
+    names_prefix = "q_")
+t3
+
+ancombc_remaining_pp <- full_join(t1, t2, by = c("taxon", "variable")) %>%
+  full_join(t3, by = c("taxon", "variable")) %>%
+  filter(p <= 0.05) %>%
+  mutate(
+    across(where(is.numeric), round, 3),
+    taxon = str_remove(taxon, "genus:")
+  ) %>%
+  arrange(variable, q) 
+save(ancombc_remaining_pp, file = here::here("data/ancombc_remaining_pp_pc.Rds"))
+
+
+
+
+# same for early infancy samples:
+ancom_tables_pp_infancy <- models_imp[[1]]$infancy[[3]] %>% 
+  select(taxon, lfc_condition1, se_condition1, p_condition1, q_condition1) %>%
+  arrange(q_condition1, desc(abs(lfc_condition1))) %>%
+  mutate(across(where(is.numeric), round, 3))
+colnames(ancom_tables_pp_infancy) <- str_remove(colnames(ancom_tables_pp), "_condition1") 
+
+ancom_tables_pp2_infancy <- models_imp[[1]]$infancy[[3]] %>% 
+  select(taxon, "lfc_condition1:age_s", "se_condition1:age_s", "p_condition1:age_s", "q_condition1:age_s") %>%
+  arrange(`q_condition1:age_s`, desc(abs(`lfc_condition1:age_s`))) %>%
+  mutate(across(where(is.numeric), round, 3))
+
+colnames(ancom_tables_pp2_infancy) <- str_remove(colnames(ancom_tables_pp2), "_condition1:age_s") 
+
+save(ancom_tables_pp_infancy, file = here::here("data/ancom_tables_pp_infancy_pc.Rds"))
+
+ancom_tables_pp_infancy
+
+
+
+
+t1_infancy <- select(
+  models_imp[[1]]$infancy[[3]],
+  taxon,
+  lfc_age_s,
+  lfc_siblings1,
+
+) %>%
+  pivot_longer(
+    contains("lfc"), 
+    names_to = "variable", 
+    values_to = "lfc", 
+    names_prefix = "lfc_")
+
+
+t2_infancy <- select(
+  models_imp[[1]]$infancy[[3]],
+  taxon,
+  p_age_s,
+  p_siblings1,
+
+
+) %>%
+  pivot_longer(
+    contains("p_"), 
+    names_to = "variable", 
+    values_to = "p", 
+    names_prefix = "p_")
+t2
+t3_infancy <- select(
+  models_imp[[1]]$infancy[[3]],
+  taxon,
+  q_age_s,
+  q_siblings1,
+
+
+) %>%
+  pivot_longer(
+    contains("q_"), 
+    names_to = "variable", 
+    values_to = "q", 
+    names_prefix = "q_")
+t3
+
+ancombc_remaining_infancy_pp <- full_join(t1_infancy, t2_infancy, by = c("taxon", "variable")) %>%
+  full_join(t3_infancy, by = c("taxon", "variable")) %>%
+  filter(p <= 0.05) %>%
+  mutate(
+    across(where(is.numeric), round, 3),
+    taxon = str_remove(taxon, "genus:")
+  ) %>%
+  arrange(variable, q) 
+save(ancombc_remaining_infancy_pp, file = here::here("data/ancombc_remaining_infancy_pp_pc.Rds"))
+
+
+
+# same for year1 samples:
+ancom_tables_pp_year1 <- models_imp[[1]]$year1[[3]] %>% 
+  select(taxon, lfc_condition1, se_condition1, p_condition1, q_condition1) %>%
+  arrange(q_condition1, desc(abs(lfc_condition1))) %>%
+  mutate(across(where(is.numeric), round, 3))
+colnames(ancom_tables_pp_year1) <- str_remove(colnames(ancom_tables_pp), "_condition1") 
+
+ancom_tables_pp2_year1 <- models_imp[[1]]$year1[[3]] %>% 
+  select(taxon, "lfc_condition1:age_s", "se_condition1:age_s", "p_condition1:age_s", "q_condition1:age_s") %>%
+  arrange(`q_condition1:age_s`, desc(abs(`lfc_condition1:age_s`))) %>%
+  mutate(across(where(is.numeric), round, 3))
+
+colnames(ancom_tables_pp2_year1) <- str_remove(colnames(ancom_tables_pp2), "_condition1:age_s") 
+
+save(ancom_tables_pp_year1, file = here::here("data/ancom_tables_pp_year1_pc.Rds"))
+
+ancom_tables_pp_year1
+ancom_tables_pp2_year1
+
+
+
+
+
+
+t1_year1 <- select(
+  models_imp[[1]]$year1[[3]],
+  taxon,
+  lfc_age_s,
+  lfc_siblings1,
+
+) %>%
+  pivot_longer(
+    contains("lfc"), 
+    names_to = "variable", 
+    values_to = "lfc", 
+    names_prefix = "lfc_")
+
+
+t2_year1 <- select(
+  models_imp[[1]]$year1[[3]],
+  taxon,
+  p_age_s,
+  p_siblings1,
+
+
+) %>%
+  pivot_longer(
+    contains("p_"), 
+    names_to = "variable", 
+    values_to = "p", 
+    names_prefix = "p_")
+t2
+t3_year1 <- select(
+  models_imp[[1]]$year1[[3]],
+  taxon,
+  q_age_s,
+  q_siblings1,
+
+
+) %>%
+  pivot_longer(
+    contains("q_"), 
+    names_to = "variable", 
+    values_to = "q", 
+    names_prefix = "q_")
+t3
+
+ancombc_remaining_year1_pp <- full_join(t1_year1, t2_year1, by = c("taxon", "variable")) %>%
+  full_join(t3_year, by = c("taxon", "variable")) %>%
+  filter(p <= 0.05) %>%
+  mutate(
+    across(where(is.numeric), round, 3),
+    taxon = str_remove(taxon, "genus:")
+  ) %>%
+  arrange(variable, q) 
+save(ancombc_remaining_year1_pp, file = here::here("data/ancombc_remaining_year1_pp_pc.Rds"))
+
+
+
+
+
+
+
+
diff --git a/R/pc_daa_maaslin2_bez.R b/R/pc_daa_maaslin2_bez.R
new file mode 100644
index 0000000..e183f73
--- /dev/null
+++ b/R/pc_daa_maaslin2_bez.R
@@ -0,0 +1,568 @@
+set.seed(1)
+library(mia)
+library(Maaslin2)
+library(tidyverse)
+library(tidySummarizedExperiment)
+library(glue)
+
+
+
+
+###############################################################################
+#########################           1. ITT       ##############################
+###############################################################################
+
+# import of biomfile and meta data can be found in the import script
+load(here::here("data/data_pc.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+
+# for analyses we apply prevalence fitlering
+tse <- subsetByPrevalentTaxa(tse_pc, detection = 0, prevalence = 0.1)
+fvars <- c("siblings", "condition")
+# add metadata to tse
+colData(tse) <- colData(tse) %>%
+  as.data.frame() %>%
+  select(-all_of(fvars), -age, -age_s) %>%
+  left_join(select(d, age, sample_id, constipation, diarrhea, siblings, condition), by = "sample_id") %>%
+  column_to_rownames("sample_id") %>%
+  mutate(across(all_of(fvars), function(x) as.factor(x))) %>%
+  DataFrame()
+colData(tse)$age <- colData(tse)$age + as.numeric(colData(tse)$week) * 7
+colData(tse)$age_s <- scale(colData(tse)$age)[, 1]
+
+####################### 1.1 Complete Case Analysis ############################
+
+
+
+asv_tab <- t(assay(tse))
+meta <- colData(tse) %>% as.data.frame() %>%
+  select(skippy_id, age_s, all_of(fvars))
+asv_tab <- asv_tab[rownames(meta),]
+
+# you can specifiy different GLMs/normalizations/transforms. We used similar
+# settings as in Nearing et al. (2021) here:
+fit_data <- Maaslin2(
+  asv_tab,
+  meta,
+  output = here::here("data/maaslin/1"),
+  transform = "AST",
+  fixed_effects = c("condition", "siblings", "age_s"),
+  random_effects = "skippy_id", 
+  reference = "0",  
+  normalization = "TSS",
+  standardize = FALSE,
+  min_prevalence = 0 # prev filterin already done
+)
+filter(fit_data$results, qval <= 0.5, metadata == "condition")
+
+# model includes random intercepts and excludes 1 year samples
+tse_infancy <- filter(tse, week != 52)
+asv_tab <- t(assay(tse))
+meta <- colData(tse_infancy) %>% as.data.frame() %>%
+  select(skippy_id, age_s, all_of(fvars))
+asv_tab <- asv_tab[rownames(meta), ]
+
+fit_data <- Maaslin2(
+  asv_tab,
+  meta,
+  output = here::here("data/maaslin/1"),
+  transform = "AST",
+  fixed_effects = c("condition", "siblings", "age_s"),
+  random_effects = "skippy_id", 
+  reference = "0",  
+  normalization = "TSS",
+  standardize = FALSE,
+  min_prevalence = 0 # prev filterin already done
+)
+filter(fit_data$results, qval <= 0.4, metadata == "condition")
+
+
+
+
+# model includes 1 years samples only 
+tse_year1 <- filter(tse, week == 52)
+asv_tab <- t(assay(tse))
+meta <- colData(tse_year1) %>% as.data.frame() %>%
+  select(skippy_id, age_s, all_of(fvars))
+asv_tab <- asv_tab[rownames(meta), ]
+
+
+fit_data <- Maaslin2(
+  asv_tab,
+  meta,
+  output = here::here("data/maaslin/1"),
+  transform = "AST",
+  fixed_effects = c("condition", "siblings", "age_s"),
+  #random_effects = "skippy_id", 
+  reference = "0",  
+  normalization = "TSS",
+  standardize = FALSE,
+  min_prevalence = 0 # prev filterin already done
+)
+filter(fit_data$results, qval <= 0.4, metadata == "condition")
+
+
+
+
+######################## 1. 2Multiple imputation  #############################
+
+
+load(here::here("data/data_pc.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+# for analyses we apply prevalence fitlering
+tse <- subsetByPrevalentTaxa(tse_pc, detection = 0, prevalence = 0.1)
+
+
+
+if (!file.exists(here::here("data/maaslin2_itt_mi_pc_bez.Rds"))) {
+  models_imp <- map2(implist, 1:length(implist), function(dimp, imp) {
+    tse_map <- tse
+    fvars <- c("siblings", "condition")
+    # add metadata to tse
+    colData(tse_map) <- colData(tse_map) %>%
+      as.data.frame() %>%
+      select(-all_of(fvars), -age,-age_s) %>%
+      left_join(select(dimp, age, sample_id, constipation, diarrhea, siblings, condition), by = "sample_id") %>%
+      column_to_rownames("sample_id") %>%
+      mutate(across(all_of(fvars), function(x) as.factor(x))) %>%
+      DataFrame()
+    colData(tse_map)$age <- colData(tse_map)$age + as.numeric(colData(tse_map)$week) * 7
+    colData(tse_map)$age_s <- scale(colData(tse_map)$age)[, 1]
+
+    # all samples
+    asv_tab <- t(assay(tse_map))
+    meta <- colData(tse_map) %>% as.data.frame() %>%
+      select(skippy_id, age_s, all_of(fvars))
+    asv_tab <- asv_tab[rownames(meta), ]
+
+    fit_data <- Maaslin2(
+      asv_tab,
+      meta,
+      output = here::here("data/maaslin/1"),
+      transform = "AST",
+      fixed_effects = c("condition", "siblings", "age_s"),
+      random_effects = "skippy_id", 
+      reference = "0",  
+      normalization = "TSS",
+      standardize = FALSE,
+      min_prevalence = 0 # prev filterin already done
+    )
+    
+    res_all <- fit_data$results
+
+
+
+    
+    # model includes random intercepts and excludes 1 year samples
+    tse_infancy <- filter(tse_map, week != 52)
+    asv_tab <- t(assay(tse_infancy))
+    meta <- colData(tse_infancy) %>% as.data.frame() %>%
+      select(skippy_id, age_s, all_of(fvars))
+    asv_tab <- asv_tab[rownames(meta), ]
+
+
+    fit_data <- Maaslin2(
+      asv_tab,
+      meta,
+      output = here::here("data/maaslin/1"),
+      transform = "AST",
+      fixed_effects = c("condition", "siblings", "age_s"),
+      random_effects = "skippy_id", 
+      reference = "0",  
+      normalization = "TSS",
+      standardize = FALSE,
+      min_prevalence = 0 # prev filterin already done
+    )
+
+    res_inf <- fit_data$results
+
+
+    # model includes 1 years samples only 
+    tse_year1 <- filter(tse_map, week == 52)
+    asv_tab <- t(assay(tse_year1))
+    meta <- colData(tse_year1) %>% as.data.frame() %>%
+      select(skippy_id, age_s, all_of(fvars))
+    asv_tab <- asv_tab[rownames(meta), ]
+
+    fit_data <- Maaslin2(
+      asv_tab,
+      meta,
+      output = here::here("data/maaslin/1"),
+      transform = "AST",
+      fixed_effects = c("condition", "siblings", "age_s"),
+      #random_effects = "skippy_id", 
+      reference = "0",  
+      normalization = "TSS",
+      standardize = FALSE,
+      min_prevalence = 0 # prev filterin already done
+    )
+
+    res_year1 <- fit_data$results
+
+    list(res_all, res_inf, res_year1)
+  })
+  save(models_imp, file = here::here("data/maaslin2_itt_mi_pc_bez.Rds"))
+ } else {
+  load(here::here("data/maaslin2_itt_mi_pc_bez.Rds"))
+ }
+
+
+# change [[1]] to 2-5 to inspect the other imputations
+maaslin2_tables_itt <- map(models_imp[[1]], function(x) {
+    x %>% filter(metadata == "condition") %>%
+      select(feature, coef, stderr, pval, qval) %>%
+      arrange(qval, desc(abs(coef))) %>%
+      mutate(
+        across(where(is.numeric), round, 3),
+        feature = str_replace(feature, "\\.\\.", "."),
+        feature = str_replace(feature, "\\.", ":")
+        )
+})
+modules
+colnames(modules) <- c("gbm", "description")
+
+# since MaAsLin does not support interactions I need to evaluate early and late infancy separately
+maaslin2_identified <- map2_dfr(maaslin2_tables_itt, c("all", "early", "late"), function(table, term) {
+  filter(table, qval <= 0.2) %>%
+  mutate(term = term) %>%
+  arrange(coef, qval)
+  }) %>%
+  rename(gbm = feature) %>%
+  left_join(modules, by = "gbm") %>%
+  select(-gbm) %>%
+  select(gbm = description, everything())
+
+maaslin2_identified
+save(maaslin2_tables_itt, maaslin2_identified,  file = here::here("data/maaslin2_tables_itt_pc.Rds"))
+
+map2_dfr(maaslin2_tables_itt, c("all", "early", "late"), function(table, term) {
+  filter(table, qval <= 0.2) %>%
+  mutate(term = term) %>%
+  arrange(coef, qval)
+  }) %>%
+  rename(gbm = feature) %>%
+  left_join(modules, by = "gbm") 
+
+
+
+# plot significant features 
+# import of biomfile and meta data can be found in the import script
+load(here::here("data/data_pc.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+
+# for analyses we apply prevalence fitlering
+tse <- subsetByPrevalentTaxa(tse_pc, detection = 0, prevalence = 0.1)
+fvars <- c("siblings", "condition")
+# add metadata to tse
+colData(tse) <- colData(tse) %>%
+  as.data.frame() %>%
+  select(-all_of(fvars), -age, -age_s) %>%
+  left_join(select(d, age, sample_id, constipation, diarrhea, siblings, condition), by = "sample_id") %>%
+  column_to_rownames("sample_id") %>%
+  mutate(across(all_of(fvars), function(x) as.factor(x))) %>%
+  DataFrame()
+colData(tse)$age <- colData(tse)$age + as.numeric(colData(tse)$week) * 7
+colData(tse)$age_s <- scale(colData(tse)$age)[, 1]
+
+tse <- transformSamples(tse, method = "relabundance")
+tse <- transformSamples(tse, abund_values = "relabundance", method = "log10", pseudocount = 1)
+gbms_identified <- as.data.frame(t(assay(tse, "log10"))) %>%
+  rownames_to_column("sample_id") %>%
+  select(
+    sample_id,
+    MGB027,
+    MGB052, 
+    MGB045
+)
+# colnames(gbms_identified) <- c(
+#   "sample_id", 
+#   "Nitric oxide degradation I (NO dioxygenase)",
+#   "Butyrate synthesis I",
+#   "Acetate synthesis III"
+# )
+gbms_meta <- as.data.frame(colData(tse)) %>%
+  rownames_to_column("sample_id") %>%
+  mutate(condition_label = ifelse(condition == 0, "CAU", ifelse(
+    condition == 1, "SSC", NA)),
+    week_label = glue::glue("Week {week}")
+    ) %>%
+  select(sample_id, condition_label, week_label)
+
+gbms_data <- full_join(gbms_identified, gbms_meta, by = "sample_id") 
+plots <- map2(list(
+  "MGB027",
+  "MGB052", 
+  "MGB045"),
+  list(
+  "Nitric oxide degradation I",
+  "Butyrate synthesis I",
+  "Acetate synthesis III"
+  ),
+  function(gbm_name, description) {
+    #pivot_longer(-c(sample_id, condition_label, week_label), names_to = "GBM", values_to = "values") %>%
+    ggplot(gbms_data, aes_string(x = "condition_label", y = gbm_name, fill = "condition_label")) +
+      geom_boxplot(outlier.alpha = 0) +
+      # ggbeeswarm::geom_beeswarm(alpha = 0.4) +
+      geom_jitter(width = 0.1, size = 2) + 
+      #stat_summary(fun.y=mean, geom="point", shape=20, size=14, color="red", fill="red") +
+      facet_wrap(~week_label) +
+      #scale_fill_manual(values = c("#fc8d62", "#8da0cb")) +
+      scale_fill_manual(values = c("#ffffff", "#c0c1c2")) +
+      theme_bw(base_size = 25) +
+      theme(
+        legend.position = "none",
+        strip.placement = "outside",
+        strip.background = element_blank()) +
+      xlab("") +
+      ylab(description)
+  }
+)
+plots[[1]]
+
+save(plots, file = here::here("data/gbm_plots.Rds"))
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+###############################################################################
+#########################           2. PP        ##############################
+###############################################################################
+
+# import of biomfile and meta data can be found in the import script
+load(here::here("data/data_pc.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+
+tse <- subsetByPrevalentTaxa(tse_pc, detection = 0, prevalence = 0.1)
+
+
+# obtain ids that were selected for PP analyses 
+pp_indicator <- foreign::read.spss(here::here("data/raw_data/kelly141022/Data_ITT_PP_ExploratoryDRselections.sav"), to.data.frame = TRUE)
+pp_indicator <- select(pp_indicator, skippy_id = ID, pp = PP)
+# add pp info to existing data 
+if (!"pp" %in% colnames(d)) {
+  d <- left_join(d, pp_indicator, by = "skippy_id")
+}
+# 60 that are in PP and condition 0; 18 that are condition 1 and pp. Fits...
+d_pp <- filter(d, pp == 1)
+implist_pp <- map(implist, function(dimp) {
+  dimp_new <- left_join(dimp, pp_indicator, by = "skippy_id") %>%
+                filter(pp == 1)
+  dimp_new
+})
+
+
+fvars <- c("siblings", "condition", "csection", "sex")
+# add metadata to tse
+colData(tse) <- colData(tse) %>%
+  as.data.frame() %>%
+  select(-siblings, -ges_age, -birthweight, -edlevel, -csection, -sex, -age) %>%
+  left_join(select(d_pp, age, sample_id, constipation, diarrhea, siblings, pp, csection, sex, ges_age, edlevel, birthweight), by = "sample_id") %>%
+  column_to_rownames("sample_id") %>%
+    mutate(
+    across(all_of(fvars), function(x) as.factor(x)),
+    ges_age_s = scale(ges_age)[, 1],
+    birthweight_s = scale(birthweight)[, 1],
+    edlevel_s = scale(edlevel)[, 1]
+    ) %>%
+  DataFrame()
+colData(tse)$age <- colData(tse)$age + as.numeric(colData(tse)$week) * 7
+colData(tse)$age_s <- scale(colData(tse)$age)[, 1]
+tse <- filter(tse, pp == 1)
+
+####################### 2.1 Complete Case Analysis ############################
+
+
+# model includes random intercepts and all samples 
+asv_tab <- t(assay(tse))
+meta <- colData(tse) %>% as.data.frame() %>%
+  select(skippy_id, age_s, all_of(fvars))
+asv_tab <- asv_tab[rownames(meta), ]
+
+fit_data <- Maaslin2(
+  asv_tab,
+  meta,
+  output = here::here("data/maaslin/1"),
+  transform = "AST",
+  fixed_effects = c(
+    "condition", 
+    "siblings", 
+    "age_s",
+    "csection",
+    "sex",
+    "ges_age_s",
+    "birthweight_s",
+    "edlevel_s"
+    ),
+  random_effects = "skippy_id", 
+  reference = "0",
+  normalization = "TSS",
+  standardize = FALSE,
+  min_prevalence = 0 # prev filterin already done
+)
+filter(fit_data$results, qval <= 0.1, metadata == "condition")
+
+
+
+
+
+
+
+######################## 2.2 Multiple imputation  #############################
+
+load(here::here("data/data_pc.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+# for analyses we apply prevalence fitlering
+tse <- subsetByPrevalentTaxa(tse_pc, detection = 0, prevalence = 0.1)
+
+
+if (!file.exists(here::here("data/maaslin2_pp_mi_pc_bez.Rds"))) {
+  models_imp <- map2(implist_pp, 1:length(implist), function(dimp, imp) {
+    tse_map <- tse 
+    fvars <- c("siblings", "condition", "csection", "sex")
+    # add metadata to tse
+    colData(tse_map) <- colData(tse_map) %>%
+      as.data.frame() %>%
+      select(-ges_age, -birthweight, -edlevel, -all_of(fvars), -age, -age_s) %>%
+      left_join(select(dimp, age, sample_id, constipation, diarrhea, siblings, condition, pp, csection, sex, ges_age_s, edlevel_s, birthweight_s), by = "sample_id") %>%
+      column_to_rownames("sample_id") %>%
+      mutate(across(all_of(fvars), function(x) as.factor(x))) %>%
+      DataFrame()
+    colData(tse_map)$age <- colData(tse_map)$age + as.numeric(colData(tse_map)$week) * 7
+    colData(tse_map)$age_s <- scale(colData(tse_map)$age)[, 1]
+    tse_map <- filter(tse_map, pp == 1)
+    
+    # all samples
+    asv_tab <- t(assay(tse_map))
+    meta <- colData(tse_map) %>% as.data.frame() %>%
+      select(skippy_id, age_s, all_of(fvars), edlevel_s, birthweight_s, ges_age_s)
+    asv_tab <- asv_tab[rownames(meta), ]
+
+    fit_data <- Maaslin2(
+      asv_tab,
+      meta,
+      output = here::here("data/maaslin/1"),
+      transform = "AST",
+      fixed_effects = c(
+        "condition", 
+        "siblings", 
+        "age_s",
+        "csection",
+        "sex",
+        "ges_age_s",
+        "birthweight_s",
+        "edlevel_s"
+        ),
+      random_effects = "skippy_id", 
+      reference = "0",  
+      normalization = "TSS",
+      standardize = FALSE,
+      min_prevalence = 0 # prev filterin already done
+    )
+      
+    res_all <- fit_data$results
+
+
+    # model includes random intercepts and excludes 1 year samples
+    tse_infancy <- filter(tse_map, week != 52)
+    asv_tab <- t(assay(tse_infancy))
+    meta <- colData(tse_infancy) %>% as.data.frame() %>%
+      select(skippy_id, age_s, all_of(fvars), edlevel_s, birthweight_s, ges_age_s)
+    asv_tab <- asv_tab[rownames(meta), ]
+
+
+    fit_data <- Maaslin2(
+      asv_tab,
+      meta,
+      output = here::here("data/maaslin/1"),
+      transform = "AST",
+      fixed_effects = c(
+        "condition", 
+        "siblings", 
+        "age_s",
+        "csection",
+        "sex",
+        "ges_age_s",
+        "birthweight_s",
+        "edlevel_s"
+        ),
+      random_effects = "skippy_id", 
+      reference = "0",  
+      normalization = "TSS",
+      standardize = FALSE,
+      min_prevalence = 0 # prev filterin already done
+    )
+
+    res_inf <- fit_data$results
+    
+    
+      # model includes 1 years samples only 
+    tse_year1 <- filter(tse_map, week == 52)
+    asv_tab <- t(assay(tse_year1))
+    meta <- colData(tse_year1) %>% as.data.frame() %>%
+      select(skippy_id, age_s, all_of(fvars), edlevel_s, birthweight_s, ges_age_s)
+    asv_tab <- asv_tab[rownames(meta), ]
+
+    fit_data <- Maaslin2(
+      asv_tab,
+      meta,
+      output = here::here("data/maaslin/1"),
+      transform = "AST",
+      fixed_effects = c(
+        "condition", 
+        "siblings", 
+        "age_s",
+        "csection",
+        "sex",
+        "ges_age_s",
+        "birthweight_s",
+        "edlevel_s"
+        ),
+      #random_effects = "skippy_id", 
+      reference = "0",  
+      normalization = "TSS",
+      standardize = FALSE,
+      min_prevalence = 0 # prev filterin already done
+    )
+
+    res_year1 <- fit_data$results
+    
+    list(res_all, res_inf, res_year1)
+  })
+  save(models_imp, file = here::here("data/maaslin2_pp_mi_pc_bez.Rds"))
+ } else {
+  load(here::here("data/maaslin2_pp_mi_pc_bez.Rds"))
+ }
+
+
+
+
+
+# switch the 1 to 2-5 to check other imputations
+maaslin2_tables_pp <- map(models_imp[[1]], function(x) {
+    x %>% filter(metadata == "condition") %>%
+      select(feature, coef, stderr, pval, qval) %>%
+      arrange(qval, desc(abs(coef))) %>%
+      mutate(
+        across(where(is.numeric), round, 3),
+        feature = str_replace(feature, "\\.\\.", "."),
+        feature = str_replace(feature, "\\.", ":")
+        )
+})
+
+save(maaslin2_tables_pp, file = here::here("data/maaslin2_tables_pp_pc.Rds"))
+
+
+
diff --git a/R/pc_linda_bez.R b/R/pc_linda_bez.R
new file mode 100644
index 0000000..bb080ce
--- /dev/null
+++ b/R/pc_linda_bez.R
@@ -0,0 +1,525 @@
+set.seed(1)
+library(mia)
+library(LinDA)
+library(tidyverse)
+library(tidySummarizedExperiment)
+library(glue)
+
+
+###############################################################################
+#########################           1. ITT       ##############################
+###############################################################################
+
+# import of biomfile and meta data can be found in the import script
+load(here::here("data/data_pc.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+
+# for analyses we apply prevalence fitlering
+tse_pc <- subsetByPrevalentTaxa(tse_pc, detection = 0, prevalence = 0.1)
+
+####################### 1.1 Complete Case Analysis ############################
+
+
+# model includes random intercepts and all samples 
+asv_tab <- as.data.frame(assay(tse_pc, "counts"))
+vars <- c("age_s", "condition", "siblings", "constipation", "diarrhea", "skippy_id")
+meta <- colData(tse_pc) %>% as.data.frame() %>%
+  select(all_of(vars))
+linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + (1|skippy_id)', alpha = 0.1,
+                   prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
+
+
+filter(linda_obj$output$condition1, reject)
+filter(linda_obj$output$siblings1, reject)
+ 
+
+# model includes random intercepts and excludes 1 year samples
+tse_pc_infancy <- filter(tse_pc, week != 52)
+asv_tab <- as.data.frame(assay(tse_pc_infancy, "counts"))
+meta <- colData(tse_pc_infancy) %>% as.data.frame() %>%
+  select(all_of(vars))
+dim(na.omit(meta))
+linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + (1|skippy_id)', alpha = 0.1,
+                   prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
+# linda.plot(linda_obj, c('condition'), 
+#            titles = c('Condition: n v.s. y'), alpha = 0.1, lfc.cut = 1,
+#            legend = TRUE, directory = NULL, width = 11, height = 8)
+
+filter(linda_obj$output$condition1, reject)
+filter(linda_obj$output$siblings1, reject)
+
+
+
+# model includes 1 years samples only 
+tse_pc_year1 <- filter(tse_pc, week == 52)
+asv_tab <- as.data.frame(assay(tse_pc_year1, "counts"))
+meta <- colData(tse_pc_year1) %>% as.data.frame() %>%
+  select(all_of(vars))
+linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings', alpha = 0.1,
+                   prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
+res <- filter(linda_obj$output$condition1, reject) %>%
+  rownames_to_column("taxid")
+rd <- rowData(tse_pc_year1) %>% 
+  as.data.frame() %>% 
+  rownames_to_column("taxid") %>%
+  filter(taxid %in% res$taxid)
+res <- left_join(res, rd, by = "taxid")
+filter(linda_obj$output$condition1, reject)
+filter(linda_obj$output$siblings1, reject)
+
+
+
+######################## 1. 2Multiple imputation  #############################
+
+
+load(here::here("data/data_pc.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+# for analyses we apply prevalence fitlering
+tse_pc <- subsetByPrevalentTaxa(tse_pc, detection = 0, prevalence = 0.1)
+
+models_imp <- map2(implist, 1:length(implist), function(dimp, imp) {
+  tse_pc_map <- tse_pc 
+  fvars <- c("constipation", "siblings", "diarrhea", "condition")
+  # add metadata to tse_pc
+  colData(tse_pc_map) <- colData(tse_pc_map) %>%
+    as.data.frame() %>%
+    select(-all_of(fvars), -contains("age")) %>%
+    left_join(select(dimp, age, sample_id, constipation, diarrhea, siblings, condition), by = "sample_id") %>%
+    column_to_rownames("sample_id") %>%
+    mutate(across(all_of(fvars), function(x) as.factor(x))) %>%
+    DataFrame()
+  colData(tse_pc_map)$age <- colData(tse_pc_map)$age + as.numeric(colData(tse_pc_map)$week) * 7
+  colData(tse_pc_map)$age_s <- scale(colData(tse_pc_map)$age)[, 1]
+
+  # all samples
+  asv_tab <- as.data.frame(assay(tse_pc_map, "counts"))
+  meta <- colData(tse_pc_map) %>% as.data.frame() %>%
+    select(skippy_id, age_s, all_of(fvars))
+    asv_tab <- asv_tab[, rownames(meta)]
+
+  linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + (1|skippy_id)', alpha = 0.4,
+                     prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
+  
+  res_all <- linda_obj$output$condition1
+  res_all2 <- linda_obj$output[["condition1:age_s"]]
+  
+  # model includes random intercepts and excludes 1 year samples
+  tse_pc_infancy <- filter(tse_pc_map, week != 52)
+  asv_tab <- as.data.frame(assay(tse_pc_infancy, "counts"))
+  meta <- colData(tse_pc_infancy) %>% as.data.frame() %>%
+    select(skippy_id, age_s, all_of(fvars))
+  linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + (1|skippy_id)', alpha = 0.1,
+                     prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
+
+
+  res_infancy <- linda_obj$output$condition1
+  res_infancy2 <- linda_obj$output[["condition1:age_s"]]
+  
+  # model includes 1 years samples only 
+  tse_pc_year1 <- filter(tse_pc_map, week == 52)
+  asv_tab <- as.data.frame(assay(tse_pc_year1, "counts"))
+  meta <- colData(tse_pc_year1) %>% as.data.frame() %>%
+    select(skippy_id, age_s, all_of(fvars))
+  linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings', alpha = 0.1,
+                     prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
+
+  
+  res_year1 <- linda_obj$output$condition1
+  res_year12 <- linda_obj$output[["condition1:age_s"]]
+
+  list(res_all, res_infancy, res_year1, res_all2, res_infancy2, res_year12)
+})
+
+tables_linda_itt <- map(models_imp, function(x) {
+  x[[1]] %>%
+    rownames_to_column("taxon") %>%
+    select(taxon, log2FoldChange, lfcSE, pvalue, padj) %>%
+    arrange(padj, desc(abs(log2FoldChange))) %>%
+    mutate(across(where(is.numeric), function(x) round(x, 3)))
+})
+tables_linda_itt
+
+tables_linda_itt2 <- map(models_imp, function(x) {
+  x[[4]] %>%
+    rownames_to_column("taxon") %>%
+    select(taxon, log2FoldChange, lfcSE, pvalue, padj) %>%
+    arrange(padj, desc(abs(log2FoldChange))) %>%
+    mutate(across(where(is.numeric), function(x) round(x, 3)))
+})
+tables_linda_itt2
+
+
+
+
+
+tables_linda_itt_infancy <- map(models_imp, function(x) {
+  x[[2]] %>%
+    rownames_to_column("taxon") %>%
+    select(taxon, log2FoldChange, lfcSE, pvalue, padj) %>%
+    arrange(padj, desc(abs(log2FoldChange))) %>%
+    mutate(across(where(is.numeric), function(x) round(x, 3)))
+})
+ 
+
+tables_linda_itt_year1 <- map(models_imp, function(x) {
+  x[[3]] %>%
+    rownames_to_column("taxon") %>%
+    select(taxon, log2FoldChange, lfcSE, pvalue, padj) %>%
+    arrange(padj, desc(abs(log2FoldChange))) %>%
+    mutate(across(where(is.numeric), function(x) round(x, 3)))
+})
+
+tables_linda_itt
+tables_linda_itt2
+tables_linda_itt_infancy
+tables_linda_itt_year1
+
+linda_identified <- map2_dfr(
+  list(
+    tables_linda_itt,
+    tables_linda_itt2,
+    tables_linda_itt_infancy,
+    tables_linda_itt_year1
+    ),
+  c(
+    "main effect",
+    "interaction term",
+    "early",
+    "late"
+  ),
+  function(table, term) {
+    filter(table[[1]], padj <= 0.2)  %>%
+      mutate(term = term)
+  }) %>%
+  arrange(desc(abs(log2FoldChange)), padj)
+
+# # delete duplicate row
+# linda_identified <- linda_identified[-4, ]
+linda_identified
+
+
+save(
+  tables_linda_itt, 
+  tables_linda_itt2, 
+  linda_identified, 
+  file = here::here("data/tables_linda_itt_pc.Rds")
+)
+
+save(
+  tables_linda_itt_infancy, 
+  file = here::here("data/tables_linda_itt_infancy_pc.Rds")
+)
+
+save(
+  tables_linda_itt_year1, 
+  file = here::here("data/tables_linda_itt_year1_pc.Rds")
+) 
+
+
+
+
+
+
+
+
+models_imp <- map2(implist, 1:length(implist), function(dimp, imp) {
+  tse_pc_map <- tse_pc 
+  fvars <- c("constipation", "siblings", "diarrhea", "condition")
+  # add metadata to tse_pc
+  colData(tse_pc_map) <- colData(tse_pc_map) %>%
+    as.data.frame() %>%
+    select(-all_of(fvars), -contains("age"), -bfexcl) %>%
+    left_join(select(dimp, age, sample_id, constipation, diarrhea, siblings, bfexcl, condition), by = "sample_id") %>%
+    column_to_rownames("sample_id") %>%
+    mutate(across(all_of(fvars), function(x) as.factor(x))) %>%
+    DataFrame()
+  colData(tse_pc_map)$age <- colData(tse_pc_map)$age + as.numeric(colData(tse_pc_map)$week) * 7
+  colData(tse_pc_map)$age_s <- scale(colData(tse_pc_map)$age)[, 1]
+  
+  
+  # model includes 1 years samples only 
+  tse_pc_year1 <- filter(tse_pc_map, week == 52)
+  asv_tab <- as.data.frame(assay(tse_pc_year1, "counts"))
+  meta <- colData(tse_pc_year1) %>% as.data.frame() %>%
+    select(skippy_id, age_s, all_of(fvars), bfexcl)
+  linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + bfexcl', alpha = 0.1,
+                     prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
+  res <- filter(linda_obj$output$condition1, reject) %>%
+    rownames_to_column("taxid")
+  if (dim(res)[1] >= 1) {
+    rd <- rowData(tse_pc_year1) %>% 
+     as.data.frame() %>% 
+     rownames_to_column("taxid") %>%
+     filter(taxid %in% res$taxid)
+    res_year1 <- left_join(res, rd, by = "taxid") %>% 
+     mutate(time = "year1")
+  }
+  
+  out <- list()
+  if(exists("res_year1")) {
+    out[[1]] <- res_year1 
+  } 
+  out
+})
+
+
+models_imp
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+###############################################################################
+#########################           2. PP        ##############################
+###############################################################################
+
+# import of biomfile and meta data can be found in the import script
+load(here::here("data/data_pc.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+
+tse <- subsetByPrevalentTaxa(tse_pc, detection = 0, prevalence = 0.1)
+
+
+# obtain ids that were selected for PP analyses 
+pp_indicator <- foreign::read.spss(here::here("data/raw_data/kelly141022/Data_ITT_PP_ExploratoryDRselections.sav"), to.data.frame = TRUE)
+pp_indicator <- select(pp_indicator, skippy_id = ID, pp = PP)
+# add pp info to existing data 
+if (!"pp" %in% colnames(d)) {
+  d <- left_join(d, pp_indicator, by = "skippy_id")
+}
+# 60 that are in PP and condition 0; 18 that are condition 1 and pp. Fits...
+d_pp <- filter(d, pp == 1)
+implist_pp <- map(implist, function(dimp) {
+  dimp_new <- left_join(dimp, pp_indicator, by = "skippy_id") %>%
+                filter(pp == 1)
+  dimp_new
+})
+
+
+fvars <- c("constipation", "siblings", "diarrhea", "condition", "csection", "sex")
+# add metadata to tse
+colData(tse) <- colData(tse) %>%
+  as.data.frame() %>%
+  select(-ges_age, -birthweight, -edlevel, -csection, -all_of(fvars), -age, -age_s) %>%
+  left_join(select(d_pp, age, sample_id, constipation, condition, diarrhea, siblings, pp, csection, sex, ges_age, edlevel, birthweight), by = "sample_id") %>%
+  column_to_rownames("sample_id") %>%
+  mutate(
+    across(all_of(fvars), function(x) as.factor(x)),
+    ges_age_s = scale(ges_age)[, 1],
+    birthweight_s = scale(birthweight)[, 1],
+    edlevel_s = scale(edlevel)[, 1]
+    ) %>%
+  DataFrame()
+colData(tse)$age <- colData(tse)$age + as.numeric(colData(tse)$week) * 7
+colData(tse)$age_s <- scale(colData(tse)$age)[, 1]
+tse <- filter(tse, pp == 1)
+
+####################### 2.1 Complete Case Analysis ############################
+
+
+# model includes random intercepts and all samples 
+asv_tab <- as.data.frame(assay(tse, "counts"))
+meta <- colData(tse) %>% as.data.frame() %>%
+  select(skippy_id, age_s, all_of(fvars), ges_age_s, birthweight_s, edlevel_s)
+asv_tab <- asv_tab[, rownames(meta)]
+linda_obj <- linda(
+  asv_tab, 
+  meta, 
+  formula = '~condition * age_s + siblings + csection + sex + ges_age_s + birthweight_s + edlevel_s + (1|skippy_id)', alpha = 0.4,
+  prev.cut = 0.1, 
+  lib.cut = 1000, 
+  winsor.quan = 0.97
+)
+
+
+filter(linda_obj$output$condition1, reject)
+filter(linda_obj$output$siblings1, reject)
+
+
+# model includes random intercepts and excludes 1 year samples
+tse_infancy <- filter(tse, week != 52)
+asv_tab <- as.data.frame(assay(tse_infancy, "counts"))
+meta <- colData(tse_infancy) %>% as.data.frame() %>%
+  select(skippy_id, age_s, all_of(fvars), ges_age_s, edlevel_s, birthweight_s)
+dim(na.omit(meta))
+asv_tab <- asv_tab[, rownames(meta)]
+
+linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + edlevel_s + ges_age_s + birthweight_s + sex + csection + (1|skippy_id)', alpha = 0.4,
+                   prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
+# linda.plot(linda_obj, c('condition'), 
+#            titles = c('Condition: n v.s. y'), alpha = 0.4, lfc.cut = 1,
+#            legend = TRUE, directory = NULL, width = 11, height = 8)
+
+filter(linda_obj$output$condition1, reject)
+filter(linda_obj$output$siblings1, reject)
+
+
+
+# model includes 1 years samples only 
+tse_year1 <- filter(tse, week == 52)
+asv_tab <- as.data.frame(assay(tse_year1, "counts"))
+meta <- colData(tse_year1) %>% as.data.frame() %>%
+  select(skippy_id, age_s, all_of(fvars), ges_age_s, birthweight_s, edlevel_s)
+asv_tab <- asv_tab[, rownames(meta)]
+linda_obj <- linda(
+  asv_tab, 
+  meta, 
+  formula = '~condition * age_s + siblings + csection + sex + ges_age_s + birthweight_s + edlevel_s', alpha = 0.4,
+  prev.cut = 0.1, 
+  lib.cut = 1000, 
+  winsor.quan = 0.97
+)
+res <- filter(linda_obj$output$condition1, reject) %>%
+  rownames_to_column("taxid")
+rd <- rowData(tse_year1) %>% 
+  as.data.frame() %>% 
+  rownames_to_column("taxid") %>%
+  filter(taxid %in% res$taxid)
+res <- left_join(res, rd, by = "taxid")
+filter(linda_obj$output$siblings1, reject)
+filter(linda_obj$output$condition1, reject)
+
+
+
+
+
+######################## 2.2 Multiple imputation  #############################
+
+load(here::here("data/data_pc.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+# for analyses we apply prevalence fitlering
+tse <- subsetByPrevalentTaxa(tse_pc, detection = 0, prevalence = 0.1)
+
+
+models_imp <- map2(implist_pp, 1:length(implist), function(dimp, imp) {
+  tse_map <- tse 
+  fvars <- c("constipation", "siblings", "diarrhea", "condition", "csection", "sex")
+  # add metadata to tse
+  colData(tse_map) <- colData(tse_map) %>%
+    as.data.frame() %>%
+    select(-all_of(fvars), -age, -age_s, -ges_age, -birthweight, -edlevel, -csection) %>%
+    left_join(select(dimp, age, sample_id, condition, constipation, diarrhea, siblings, pp, csection, sex, ges_age_s, edlevel_s, birthweight_s), by = "sample_id") %>%
+    column_to_rownames("sample_id") %>%
+    mutate(across(all_of(fvars), function(x) as.factor(x))) %>%
+    DataFrame()
+  colData(tse_map)$age <- colData(tse_map)$age + as.numeric(colData(tse_map)$week) * 7
+  colData(tse_map)$age_s <- scale(colData(tse_map)$age)[, 1]
+  tse_map <- filter(tse_map, pp == 1)
+  
+  # all samples
+  asv_tab <- as.data.frame(assay(tse_map, "counts"))
+  meta <- colData(tse_map) %>% as.data.frame() %>%
+    select(skippy_id, age_s, all_of(fvars), edlevel_s, birthweight_s, ges_age_s)
+  asv_tab <- asv_tab[, rownames(meta)]
+  linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + csection + sex + ges_age_s + birthweight_s + edlevel_s + (1|skippy_id)', alpha = 0.4,
+                     prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
+  
+
+  res_all <- linda_obj$output$condition1
+  res_all2 <- linda_obj$output[["condition1:age_s"]]
+  
+  # model includes random intercepts and excludes 1 year samples
+  tse_infancy <- filter(tse_map, week != 52)
+  asv_tab <- as.data.frame(assay(tse_infancy, "counts"))
+  meta <- colData(tse_infancy) %>% as.data.frame() %>%
+    select(skippy_id, age_s, all_of(fvars), edlevel_s, birthweight_s, ges_age_s)
+  asv_tab <- asv_tab[, rownames(meta)]
+  linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + csection + sex + ges_age_s + birthweight_s + edlevel_s + (1|skippy_id)', alpha = 0.4,
+                     prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
+  
+  res_infancy <- linda_obj$output$condition1
+  res_infancy2 <- linda_obj$output[["condition1:age_s"]]
+
+  
+  
+  # model includes 1 years samples only 
+  tse_year1 <- filter(tse_map, week == 52)
+  asv_tab <- as.data.frame(assay(tse_year1, "counts"))
+  meta <- colData(tse_year1) %>% as.data.frame() %>%
+    select(skippy_id, age_s, all_of(fvars), edlevel_s, birthweight_s, ges_age_s)
+  asv_tab <- asv_tab[, rownames(meta)]
+  linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + csection + sex + ges_age_s + birthweight_s + edlevel_s', alpha = 0.4,
+                     prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97)
+
+  res_year1 <- linda_obj$output$condition1
+  res_year12 <- linda_obj$output[["condition1:age_s"]]
+
+  list(res_all, res_infancy, res_year1, res_all2, res_infancy2, res_year12)
+})
+
+tables_linda_pp <- map(models_imp, function(x) {
+  x[[1]] %>%
+    rownames_to_column("taxon") %>%
+    select(taxon, log2FoldChange, lfcSE, pvalue, padj) %>%
+    arrange(padj, desc(abs(log2FoldChange))) %>%
+    mutate(across(where(is.numeric), function(x) round(x, 3))) 
+})
+tables_linda_pp
+
+save(tables_linda_pp, file = here::here("data/tables_linda_pp_pc.Rds"))
+
+
+
+tables_linda_pp_infancy <- map(models_imp, function(x) {
+  x[[2]] %>%
+    rownames_to_column("taxon") %>%
+    select(taxon, log2FoldChange, lfcSE, pvalue, padj) %>%
+    arrange(padj, desc(abs(log2FoldChange))) %>%
+    mutate(across(where(is.numeric), function(x) round(x, 3))) 
+})
+tables_linda_pp_infancy
+
+save(tables_linda_pp_infancy, file = here::here("data/tables_linda_pp_infancy_pc.Rds"))
+
+
+
+tables_linda_pp_year1 <- map(models_imp, function(x) {
+  x[[3]] %>%
+    rownames_to_column("taxon") %>%
+    select(taxon, log2FoldChange, lfcSE, pvalue, padj) %>%
+    arrange(padj, desc(abs(log2FoldChange))) %>%
+    mutate(across(where(is.numeric), function(x) round(x, 3))) 
+})
+tables_linda_pp_year1
+
+save(tables_linda_pp_year1, file = here::here("data/tables_linda_pp_year1_pc.Rds"))
+
diff --git a/R/skippy_import_biom_mba.R b/R/skippy_import_biom_mba.R
new file mode 100644
index 0000000..43d9a12
--- /dev/null
+++ b/R/skippy_import_biom_mba.R
@@ -0,0 +1,135 @@
+# tse_bingo, tse_bibo and tse_skippy must go through same steps in terms
+# of preprocessing
+library(tidyverse)
+library(mia)
+library(glue)
+library(tidySummarizedExperiment)
+library(scater)
+library(readxl)
+
+
+# biom_file <- here::here("data/raw_data/skippy_non_concat_SILVA_138.biom")
+# file.exists(biom_file)
+# # because of some issue in NGT2 we need to replace NaN in the biom file with 0
+# biom_lines <- read_lines(biom_file)
+# sum(str_count(biom_lines, "NaN"))
+# biom_lines <- str_replace_all(biom_lines, "NaN", "0.0")
+# sum(str_count(biom_lines, "NaN"))
+# write_lines(
+#   biom_lines,
+#   here::here("data/raw_data/skippy_non_concat_SILVA_138_rpl.biom")
+# )
+
+# now we should be able to use read_biom 
+biom_file <- here::here("data/raw_data/skippy_non_concat_SILVA_138_rpl.biom")
+
+
+# extract skippy id and week from map files
+raw_path <- here::here("data/map_files")
+map_files <- list.files(raw_path, pattern = ".xlsx$")
+map_files %>% length()
+lib_nums <- str_extract(map_files, "\\d\\d\\d\\d_00\\d\\d")
+LibraryNumber_merged <- c(1:6)
+map_file_merged <- map2_dfr(map_files[1:6], LibraryNumber_merged, function(filename, lnm) {
+  # extract library number
+  lib_num <- str_extract(filename, "\\d\\d\\d\\d_00\\d\\d")
+  # read provided map file
+  xl <- read_excel(glue("{raw_path}/{filename}"))
+  map_file <- xl %>% 
+    mutate(
+      LibraryNumber = lnm,
+      ProjectName = ifelse(is.na(ProjectName), "unspecified", ProjectName)
+    ) %>% 
+    filter(!ProjectName == "Shime-uncooled_ileal_digesta") %>%
+    select(
+      "sample_id" = internal_sample_id,
+      id = Seq_ID,
+      ProjectName   
+    ) %>%
+      filter(!is.na(`sample_id`))
+    
+    return(map_file)
+})
+# make sample_ids unique as they are in the tse object later 
+map_file_merged <- map_file_merged %>% 
+  mutate(
+  namealt = glue("{sample_id}_{1:dim(map_file_merged)[1]}"),
+  sample_id = ifelse(str_detect(sample_id, "MOCK"), namealt, sample_id))
+
+# extract skippy id and week
+# the following provided duplicates and need extra labeling later 
+# when I create the new sample names (see further below)
+duplicates <- c(
+  "245_2_22-1-17", 
+  "240_52_11-2-18",
+  "269_2_18_mei_2017", 
+  "276_5_26052017"
+)
+
+sample_data <- str_match(map_file_merged$id, ".*(\\d\\d\\d)(_|-)(\\d+).*") %>%
+  as_tibble() %>%
+  select(id = V1, skippy_id = V2, week = V4) %>%
+  full_join(map_file_merged, by = "id")  %>%
+  filter(!is.na(sample_id)) %>%
+  mutate(duplicated = ifelse(id %in% duplicates, TRUE, FALSE)) %>%
+  select(wur_id = id, skippy_id, sample_id, week, duplicated)
+filter(sample_data, duplicated)
+
+
+skippy <- loadFromBiom(biom_file)
+colnames(rowData(skippy)) <- c(
+  "kingdom", "phylum", "class", "order", "family", "genus", "species"
+)
+
+# add sample ids as column
+colData(skippy)[["sample_id"]] <- rownames(colData(skippy))
+rownames(colData(skippy))
+colData(skippy)[["sample_id"]]
+
+
+
+colData(skippy) <- colData(skippy) %>% as.data.frame() %>%
+  left_join(sample_data, by = "sample_id") %>%
+  mutate(neg_control = as.factor(ifelse(is.na(week), 1, 0))) %>%
+  column_to_rownames("sample_id") %>%
+  mutate(sample_id = rownames(colData(skippy))) %>%
+  DataFrame()
+
+# we will now store MOCK samples in a separate tse 
+mock_samples <- filter(skippy, str_detect(sample_id, "MOCK"))
+
+# also check they cluster together closely when using ordination
+colData(skippy) %>% dim()
+skippy <- mutate(skippy, 
+  mock = as.factor(ifelse(str_detect(sample_id, "MOCK"), 1, 0))
+)
+colData(skippy)[["library_size"]] <- colSums(assay(skippy))
+# perform NMDS coordination method
+
+# MOCK samples do cluster together but particularly 1 outlier is there
+tse_skippy <- filter(skippy, !str_detect(sample_id, "MOCK"))
+
+
+# library sizes (we need to first get orignal IDs in order to filter out 
+# negative control)
+colData(tse_skippy) %>% as.data.frame() %>% filter(!week %in% c(2, 5, 52))
+
+# the low ls are negative controls as we can see in above table
+colData(tse_skippy) %>% as.data.frame() %>%
+  arrange(library_size)
+# exclude negative controls
+tse_skippy <- filter(tse_skippy, neg_control == 0) %>%
+  mutate(sample_ids_new = glue("{skippy_id}_{ifelse(week == 2, 1, ifelse(week == 5, 2, ifelse(week == 52, 3, 'CHECK')))}{ifelse(duplicated, '_2', '')}"))
+
+
+
+# we retain 331 samples at this step 
+# lets change row and column names to better descriptions 
+colnames(tse_skippy) <- colData(tse_skippy)[, "sample_ids_new"]
+colData(tse_skippy)[, "sample_ids_new"]
+tse_skippy <- filter(skippy, !str_detect(sample_id, "MOCK"))
+rns <- str_c(rowData(tse_skippy)$family, rowData(tse_skippy)$genus, sep = "_")
+rns[duplicated(rns)]
+rownames(tse_skippy) <- rns
+save(tse_skippy, file = here::here("data/tse_skippy.Rds"))
+
diff --git a/R/tables.R b/R/tables.R
new file mode 100644
index 0000000..f12c400
--- /dev/null
+++ b/R/tables.R
@@ -0,0 +1,109 @@
+# this code is from Jennifer but I (Henrik) checked it
+
+library(table1)
+library(dplyr)
+library(tidyverse)
+library(lubridate)
+
+load(here::here("data/data_imp.Rds"))
+load(here::here("data/data.Rds"))
+
+bf <- readxl::read_excel(f, sheet = "Feeding", na = "\\") %>% 
+  select(skippy_id = ID, bfexcl = ExclusiveBF) %>%
+  mutate(skippy_id = as.character(skippy_id))
+
+itt_var <-  read_csv2(here::here("data/skippy_stool_data_cleaned.csv")) %>%
+  mutate_all(function(x) ifelse(x == 99999, NA, ifelse(x == 88888, NA, x))) %>%
+  filter(id %in% sav$id) %>%
+  mutate(across(contains("dat_"), function(x) {
+    x_new <- str_replace(x, "okt", "october")
+    x_new <- str_replace(x_new, "mrt", "march")
+    x_new <- str_replace(x_new, "mei", "may")
+    x_new <- str_replace(x_new, "15-jan-17/22-jan-17", "18/jan/17")
+    x_new <- dmy(x_new)
+    return(x_new)
+  }),
+  datbirth_infant = dmy(datbirth_infant),
+  datbirth_mom = dmy(datbirth_mom),
+  age_week2 = as.numeric(dat_week2 - datbirth_infant),
+  age_week5 = as.numeric(dat_week5 - datbirth_infant),
+  age_week52 = as.numeric(dat_1year - datbirth_infant),
+  age_mom = as.numeric((dat_week2 - datbirth_mom)/365),
+  stdat_week2 = as.numeric(dat_week2 - dmy('01-01-2016')),
+  stdat_week5 = as.numeric(dat_week5 - dmy('01-01-2016')),
+  stdat_week52 = as.numeric(dat_1year - dmy('01-01-2016')),
+  id = as.character(id)) %>%
+  select(
+    skippy_id = id, matches("age_week\\d+"), condition, csection, birthweight, 
+    siblings, sex, antibiotic_week2,
+    antibiotic_week5, antibiotic_1year, apgar_5, ges_age, age_mom, smoking,
+    drinking, weaning, parity, stdat_week2, stdat_week5, stdat_week52) %>%
+  left_join(bf, by = "skippy_id")
+
+# there is a typo in the maternal date of birth of ID 259, so her age will be removed 
+itt_var$age_mom[52] <- NA
+
+itt_var$condition <- factor(itt_var$condition,
+                            levels = c(0, 1),
+                            labels = c("CAU", "SSC"))
+itt_var$sex <- factor(itt_var$sex,
+                      levels = c(0, 1),
+                      labels = c("Male", "Female"))
+itt_var$csection <- factor(itt_var$csection,
+                           levels = c("0", "1"),
+                           labels = c("No", "Yes"))
+
+itt_var$siblings <- as.integer(itt_var$siblings)
+itt_var$parity <- as.integer(itt_var$parity)
+
+
+
+
+itt_var$antibiotic_week2 <- factor(itt_var$antibiotic_week2,
+                                     levels = c("0", "1"),
+                                     labels = c("No", "Yes"))
+itt_var$antibiotic_week5 <- factor(itt_var$antibiotic_week5,
+                                     levels = c("0", "1"),
+                                     labels = c("No", "Yes"))
+itt_var$antibiotic_1year <- factor(itt_var$antibiotic_1year,
+                                     levels = c("0", "1"),
+                                     labels = c("No", "Yes"))
+
+#change labels. Of course there is a better way, but due to time...
+label(itt_var$sex) <- "Sex"
+label(itt_var$ges_age) <- "Gestational age"
+label(itt_var$csection) <- "C-section"
+label(itt_var$siblings)<- "Siblings"
+label(itt_var$parity) <- "Parity"
+label(itt_var$birthweight) <- "Birth weight"
+label(itt_var$condition) <- "Condition"
+label(itt_var$bfexcl) <- "Exclusive breastfeeding duration"
+
+
+
+label(itt_var$age_week2) <- "Age week 2"
+label(itt_var$age_week5) <- "Age week 5"
+label(itt_var$age_week52) <- "Age week 52"
+
+label(itt_var$antibiotic_week2) <- "Antibiotics week 2"
+label(itt_var$antibiotic_week5) <- "Antibiotics week 5"
+label(itt_var$antibiotic_1year) <- "Antibiotics week 52"
+
+units(itt_var$ges_age)       <- "weeks"
+units(itt_var$birthweight) <- "grams"
+units(itt_var$bfexcl) <- "months"
+units(itt_var$age_week2) <- "days"
+units(itt_var$age_week5) <- "days"
+units(itt_var$age_week52) <- "days"
+
+
+# produce table
+tbl1 <- table1(~ sex + csection + ges_age + birthweight + siblings + bfexcl +
+         age_week2 + age_week5 + age_week52 + antibiotic_week2 +
+         antibiotic_week5 + antibiotic_1year | condition, data = itt_var,
+         caption = "Demographics Table",
+         )
+
+save(tbl1, file = here::here("data/tbl1.Rds"))
+
+tbl1
diff --git a/R/volatility.R b/R/volatility.R
new file mode 100644
index 0000000..3526743
--- /dev/null
+++ b/R/volatility.R
@@ -0,0 +1,586 @@
+library(tidyverse)
+library(mia)
+library(glue)
+library(vegan) 
+library(brms)
+library(bayesplot)
+library(posterior)
+library(tidybayes)
+
+# import of biomfile and meta data can be found in the import script
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+
+tse <- transformSamples(tse, method = "clr", name = "clr", pseudocount = 1)
+asv <- t(assay(tse, "clr"))
+ait <- vegdist(asv, method = "euclidean")
+ait <- as.matrix(ait)
+ids <- sort(unique(d$skippy_id))
+vol <- map_dfr(ids, function(id) {
+  
+  # each subject has maximal 3 samples:
+  s1 <- glue("{id}_1")
+  s2 <- glue("{id}_2")
+  s3 <- glue("{id}_3")
+  
+  # we can only obtain vol if there is no missing sample in a pair 
+  vol1 <- ifelse(s1 %in% rownames(ait) & s2 %in% rownames(ait), ait[s1, s2], NA)
+  vol2 <- ifelse(s2 %in% rownames(ait) & s3 %in% rownames(ait), ait[s2, s3], NA)
+  
+  tibble(
+    skippy_id = id,
+    time = c("2-5", "5-52"),
+    vol = c(vol1, vol2)
+  )
+})
+
+
+vol_by_comp <- group_by(vol, time) %>% nest()
+# visualize distributions of volatility per time point pair 
+voldist <- map(vol_by_comp[[2]], function(df) {
+  ggplot(df, aes(vol)) +
+    geom_density()
+})
+voldist[[2]]
+vol1 <- vol_by_comp[[2]][[1]]
+vol2 <- vol_by_comp[[2]][[2]]
+colnames(vol1) <- c("skippy_id", "vol1")
+colnames(vol2) <- c("skippy_id", "vol2")
+
+
+
+
+###############################################################################
+#########################           1. ITT       ##############################
+###############################################################################
+
+
+
+
+####################### 1.1 Complete Case Analysis ############################
+
+d <- dplyr::left_join(d, vol1, by = "skippy_id") %>%
+      dplyr::left_join(vol2, by = "skippy_id") %>%
+      filter(week == 2) %>%
+      mutate(across(contains("vol"), function(x) scale(x)[, 1]))
+fvars <- c("siblings", "condition")
+d <- mutate(
+  d, 
+  across(all_of(fvars), function(x) as.factor(x)),
+  age = week * 7 + age,
+  ges_age_s = scale(ges_age)[, 1],
+  edlevel_s = scale(edlevel)[, 1],
+  birthweight_s = scale(birthweight)[, 1],
+  condition_label = ifelse(condition == 0, "CAU", ifelse(
+    condition == 1, "SSC", NA))
+)
+
+implist_vol <- map(implist, function(dimp) {
+  df <- dplyr::left_join(dimp, vol1, by = "skippy_id") %>%
+        dplyr::left_join(vol2, by = "skippy_id") %>%
+        filter(week == 2) %>%
+        mutate(across(contains("vol"), function(x) scale(x)[, 1]))
+  # the vol columns will be imputed
+  imp <- mice::mice(df, m = 1)
+  complete(imp)
+})
+dim(implist_vol[[1]])
+
+# make a plot
+d %>%
+  pivot_longer(contains("vol"), names_to = "Time", values_to = "Volatility") %>%
+  mutate(Time = ifelse(Time == "vol1", "2-5 weeks", "5-52 weeks")) %>%
+  ggplot(aes(condition_label, Volatility, fill = condition_label)) +
+    geom_boxplot(outlier.alpha = 0) +
+    # ggbeeswarm::geom_beeswarm(alpha = 0.4) +
+    geom_jitter(width = 0.1) +
+    #stat_summary(fun.y=mean, geom="point", shape=20, size=14, color="red", fill="red") +
+    scale_fill_manual(values = c("#ffffff", "#c0c1c2")) +
+    #scale_fill_manual(values = c("#fc8d62", "#8da0cb")) +
+    theme_bw(base_size = 25) +
+    theme(
+      legend.position = "none",
+      strip.placement = "outside",
+      strip.background = element_blank()) +
+    xlab("")
+
+
+vol_plot <- d %>%
+  pivot_longer(contains("vol"), names_to = "Time", values_to = "Volatility") %>%
+  mutate(Time = ifelse(Time == "vol1", "2-5 weeks", "5-52 weeks")) %>%
+  ggplot(aes(condition_label, Volatility, fill = condition_label)) +
+    geom_boxplot(outlier.alpha = 0) +
+    # ggbeeswarm::geom_beeswarm(alpha = 0.4) +
+    geom_jitter(width = 0.1, size = 2) + 
+    #stat_summary(fun.y=mean, geom="point", shape=20, size=14, color="red", fill="red") +
+    facet_wrap(~Time, strip.position = "bottom") +
+    scale_fill_manual(values = c("#ffffff", "#c0c1c2")) +
+    #scale_fill_manual(values = c("#fc8d62", "#8da0cb")) +
+    theme_bw(base_size = 25) +
+    theme(
+      legend.position = "none",
+      strip.placement = "outside",
+      strip.background = element_blank()) +
+    xlab("") 
+vol_plot
+
+save(vol_plot, file = here::here("data/vol_out.Rds"))
+
+
+
+
+
+
+# determine model structure
+
+
+# I consider the following variables for selection 
+coefs <- c(
+  "csection", 
+  "birthweight_s", 
+  "birthweight_s + ges_age_s",
+  "siblings",
+  "sex",
+  "apgar_5_s",
+  "ges_age_s",
+  "edlevel_s",
+  "age_s"
+)
+
+
+# I will use the following algorithm:
+# For each dataset in implist:
+  # Calculate LOO for base model
+  # Then for each var in coefs:
+    # calculate LOO for base model + coef 
+    # If LOO indicates the new model is a better fit
+    # keep that coef in list 
+
+loo_comp <- map_dfr(coefs, function(coef) {
+  map_dfr(1:5, function(i) {
+    # base model 
+    f1 <- bf(Volatility ~ condition + (1|skippy_id))
+    m1 <- brm(
+      data = pivot_longer(implist_vol[[i]], contains("vol"), names_to = "Time", values_to = "Volatility"),
+      formula = f1,
+      file = here::here(glue("data/m1_imp{i}_vol.Rds"))
+    )
+    loo_m1 <- add_criterion(
+      m1,
+      "loo",
+      file = here::here(glue("data/loo_m1_imp{i}_vol")),
+      moment_match = FALSE
+    )
+
+    f2 <- bf(glue("Volatility ~ condition + {coef} + (1|skippy_id)"))
+    m2 <- brm(
+      data = pivot_longer(implist_vol[[i]], contains("vol"), names_to = "Time", values_to = "Volatility"),
+      formula = f2,
+      file = here::here(glue("data/m2_imp{i}_{coef}_vol.Rds"))
+    )
+
+    loo_m2 <- add_criterion(
+      m2,
+      "loo",
+      file = here::here(glue("data/loo_m2_imp{i}_{coef}_vol")),
+      moment_match = FALSE
+    )
+    lcomp <- loo_compare(loo_m2, loo_m1)
+    score <- ifelse(rownames(lcomp)[1] == "loo_m1", 0, 1)
+
+  tibble(
+    coef = coef,
+    imp = i,
+    score = score
+  )
+  }) 
+})
+
+group_by(loo_comp, coef) %>%
+  summarise(ss = sum(score))
+
+
+
+model1 <- brm(
+  family = student(),
+  formula = vol1 ~ condition + ges_age_s + birthweight_s + edlevel_s + csection,
+  data = d,
+  file = here::here("data/vol1_cc.Rds")
+)
+summary(model1)
+hypothesis(model1, "condition1 < 0")
+
+
+
+
+model1_bez <- brm(
+  family = student(),
+  formula = vol1 ~ condition,
+  data = d,
+  file = here::here("data/vol1_cc_bez.Rds")
+)
+summary(model1_bez)
+hypothesis(model1_bez, "condition1 < 0")
+
+
+
+model2 <- brm(
+  family = student(),
+  formula = vol2 ~ condition + ges_age_s + birthweight_s + edlevel_s + csection,
+  data = d,
+  file = here::here("data/vol2_cc.Rds")
+)
+
+summary(model2)
+hypothesis(model2, "condition1 < 0")
+
+
+
+mlm <- brm(
+  family = student(),
+  formula = Volatility ~ condition + ges_age_s + birthweight_s + edlevel_s + csection + (1 | skippy_id),
+  data = pivot_longer(d, contains("vol"), names_to = "Time", values_to = "Volatility"),
+  file = here::here("data/vol1_cc_mlm.Rds")
+)
+summary(mlm)
+hypothesis(mlm, "condition1 < Intercept")
+
+
+
+################### 1.2 Multiple Imputation Analysis ##########################
+
+
+
+model1 <- brm_multiple(
+  family = student(),
+  formula = vol1 ~ condition + ges_age_s + birthweight_s + edlevel_s + csection,
+  data = implist_vol,
+  file = here::here("data/vol1_imp.Rds")
+)
+summary(model1)
+hypothesis(model1, "condition1 < 0")
+
+hypothesis(model1, "ges_age_s < 0")
+post <- posterior_samples(model1)
+mean(post$b_ges_age_s<0)
+
+
+model1_bez <- brm_multiple(
+  family = student(),
+  formula = vol1 ~ condition,
+  data = implist_vol,
+  file = here::here("data/vol1_imp_bez.Rds")
+)
+summary(model1_bez)
+hypothesis(model1_bez, "condition1 < 0")
+
+
+post <- posterior_samples(model1)
+mean(post$b_condition1<0)
+
+model1_bf <- brm_multiple(
+  family = student(),
+  formula = vol1 ~ condition + ges_age_s + birthweight_s + edlevel_s + csection + bfexcl,
+  data = implist_vol,
+  file = here::here("data/vol1_imp_bf.Rds")
+)
+
+summary(model1_bf)
+hypothesis(model1_bf, "condition1 < 0")
+
+model2 <- brm_multiple(
+  family = student(),
+  formula = vol2 ~ condition + ges_age_s + birthweight_s + edlevel_s + csection,
+  data = implist_vol,
+  file = here::here("data/vol2_imp.Rds")
+)
+
+summary(model2)
+hypothesis(model2, "condition1 < 0")
+
+model2_bf <- brm_multiple(
+  family = student(),
+  formula = vol2 ~ condition + ges_age_s + birthweight_s + edlevel_s + csection + bfexcl,
+  data = implist_vol,
+  file = here::here("data/vol2_imp_bf.Rds")
+)
+
+summary(model2_bf)
+hypothesis(model2_bf, "bfexcl < 0")
+post <- posterior_samples(model2_bf)
+mean(post$b_bfexcl<0)
+
+
+mlm <- brm_multiple(
+  family = student(),
+  formula = Volatility ~ condition + ges_age_s + birthweight_s + csection + edlevel_s + (1 | skippy_id),
+  data = map(implist_vol, ~pivot_longer(.x, contains("vol"), names_to = "Time", values_to = "Volatility")),
+  file = here::here("data/vol1_imp_mlm.Rds")
+)
+
+summary(mlm)
+hypothesis(mlm, "condition1 < 0")
+# file.remove(here::here("data/vol1_imp_mlm.Rds"))
+
+mlm2 <- brm_multiple(
+  family = student(),
+  formula = Volatility ~ condition + (1 | skippy_id),
+  data = map(implist_vol, ~pivot_longer(.x, contains("vol"), names_to = "Time", values_to = "Volatility")),
+  file = here::here("data/vol1_imp_mlm2.Rds")
+)
+
+summary(mlm2)
+hypothesis(mlm2, "condition1 < 0")
+
+
+
+save(model1, model1_bez, model1_bf, model2, model2_bf, file = here::here("data/volmodels_itt.Rds"))
+
+
+
+
+
+
+###############################################################################
+#########################           2. PP        ##############################
+###############################################################################
+
+# import of biomfile and meta data can be found in the import script
+load(here::here("data/data.Rds"))
+load(file = here::here("data/data_imp.Rds"))
+
+# obtain ids that were selected for PP analyses 
+pp_indicator <- foreign::read.spss(here::here("data/raw_data/kelly141022/Data_ITT_PP_ExploratoryDRselections.sav"), to.data.frame = TRUE)
+pp_indicator <- select(pp_indicator, skippy_id = ID, pp = PP)
+# add pp info to existing data 
+if (!"pp" %in% colnames(d)) {
+  d <- dplyr::left_join(d, pp_indicator, by = "skippy_id")
+}
+
+
+
+####################### 2.1 Complete Case Analysis ############################
+
+d <- dplyr::left_join(d, vol1, by = "skippy_id") %>%
+      dplyr::left_join(vol2, by = "skippy_id") %>%
+      filter(week == 2) %>%
+      mutate(across(contains("vol"), function(x) scale(x)[, 1]))
+fvars <- c("siblings", "condition", "csection", "sex")
+d <- mutate(
+  d, 
+  across(all_of(fvars), function(x) as.factor(x)),
+  age = week * 7 + age,
+  ges_age_s = scale(ges_age)[, 1],
+  edlevel_s = scale(edlevel)[, 1],
+  birthweight_s = scale(birthweight)[, 1]) %>% 
+  filter(pp == 1)
+
+model1 <- brm(
+  family = student(),
+  formula = vol1 ~ condition + ges_age_s + birthweight_s + edlevel_s + csection,
+  data = d,
+  file = here::here("data/vol1_cc_pp.Rds")
+)
+summary(model1)
+hypothesis(model1, "condition1 < 0")
+
+
+
+model2 <- brm(
+  family = student(),
+  formula = vol2 ~ condition + ges_age_s + birthweight_s + edlevel_s + csection,
+  data = d,
+  file = here::here("data/vol2_cc_pp.Rds")
+)
+
+summary(model2)
+hypothesis(model2, "condition1 < 0")
+
+
+
+mlm <- brm(
+  family = student(),
+  formula = Volatility ~ condition + ges_age_s + birthweight_s + edlevel_s + csection + (1 | skippy_id),
+  data = pivot_longer(d, contains("vol"), names_to = "Time", values_to = "Volatility"),
+  file = here::here("data/vol1_cc_mlm_pp.Rds")
+)
+summary(mlm)
+hypothesis(mlm, "condition1 < Intercept")
+
+
+################### 2.2 Multiple Imputation Analysis ##########################
+
+implist_pp <- map(implist, function(dimp) {
+  dimp_new <- dplyr::left_join(dimp, pp_indicator, by = "skippy_id") %>%
+                filter(pp == 1)
+  dimp_new
+})
+
+implist_vol <- map(implist_pp, function(dimp) {
+  df <- dplyr::left_join(dimp, vol1, by = "skippy_id") %>%
+        dplyr::left_join(vol2, by = "skippy_id") %>%
+        filter(week == 2, pp == 1) %>%
+        mutate(across(contains("vol"), function(x) scale(x)[, 1]))
+  # the vol columns will be imputed
+  imp <- mice::mice(df, m = 1)
+  complete(imp)
+})
+
+model1 <- brm_multiple(
+  family = student(),
+  formula = vol1 ~ condition + ges_age_s + birthweight_s + edlevel_s + csection,
+  data = implist_vol,
+  file = here::here("data/vol1_imp_pp.Rds")
+)
+
+
+dim(implist_vol[[1]])
+summary(model1)
+hypothesis(model1, "condition1 < 0")
+
+
+model1_bf <- brm_multiple(
+  family = student(),
+  formula = vol1 ~ condition + ges_age_s + birthweight_s + edlevel_s + csection + bfexcl,
+  data = implist_vol,
+  file = here::here("data/vol1_imp_bf_pp.Rds")
+)
+
+summary(model1_bf)
+hypothesis(model1_bf, "condition1 < 0")
+
+model2 <- brm_multiple(
+  family = student(),
+  formula = vol2 ~ condition + ges_age_s + birthweight_s + edlevel_s + csection,
+  data = implist_vol,
+  file = here::here("data/vol2_imp_pp.Rds")
+)
+
+summary(model2)
+hypothesis(model2, "condition1 < 0")
+
+model2_bf <- brm_multiple(
+  family = student(),
+  formula = vol2 ~ condition + ges_age_s + birthweight_s + edlevel_s + csection + bfexcl,
+  data = implist_vol,
+  file = here::here("data/vol2_imp_bf_pp.Rds")
+)
+
+summary(model2_bf)
+
+
+mlm <- brm_multiple(
+  family = student(),
+  formula = Volatility ~ condition + ges_age_s + birthweight_s + edlevel_s + csection + (1 | skippy_id),
+  data = map(implist_vol, ~pivot_longer(.x, contains("vol"), names_to = "Time", values_to = "Volatility")),
+  file = here::here("data/vol1_imp_mlm_pp.Rds")
+)
+
+summary(mlm)
+hypothesis(mlm, "condition1 < 0")
+
+save(model1, model1_bf, model2, model2_bf, file = here::here("data/volmodels_pp.Rds"))
+
+mlm2 <- brm_multiple(
+  family = student(),
+  formula = Volatility ~ condition + (1 | skippy_id),
+  data = map(implist_vol, ~pivot_longer(.x, contains("vol"), names_to = "Time", values_to = "Volatility")),
+  file = here::here("data/vol1_imp_mlm2_pp.Rds")
+)
+
+summary(mlm2)
+hypothesis(mlm2, "condition1 < 0")
+
+
+
+
+
+
+
+
+# we can argue that there is evidence for an effect of SSC on MBA
+# therefore we will look at the DR analysis as well here:
+
+dr <- foreign::read.spss(
+  here::here("data/kelly_documents/data_itt_pp_dr.sav"),
+  to.data.frame = TRUE
+  ) %>%
+  select(skippy_id = ID, ITT, SSC = TotalSSCwk1wk5) %>%
+  mutate(SSC_s = scale(SSC)[, 1])
+
+
+implist_vol <- map(implist, function(dimp) {
+  df <- dplyr::left_join(dimp, vol1, by = "skippy_id") %>%
+        dplyr::left_join(vol2, by = "skippy_id") %>%
+        filter(week == 2) %>%
+        mutate(across(contains("vol"), function(x) scale(x)[, 1])) %>%
+        dplyr::left_join(select(dr, skippy_id, SSC_s), by = "skippy_id")
+  # the vol columns will be imputed
+  imp <- mice::mice(df, m = 1)
+  complete(imp)
+})
+colnames(implist_vol[[1]])
+
+model1 <- brm_multiple(
+  family = student(),
+  formula = vol1 ~ SSC_s + ges_age_s + birthweight_s + edlevel_s + csection,
+  data = implist_vol,
+  file = here::here("data/vol1_imp_dr.Rds")
+)
+
+summary(model1)
+hypothesis(model1, "SSC_s < 0")
+
+
+model2 <- brm_multiple(
+  family = student(),
+  formula = vol2 ~ SSC_s + ges_age_s + birthweight_s + edlevel_s + csection,
+  data = implist_vol,
+  file = here::here("data/vol2_imp_dr.Rds")
+)
+
+
+summary(model2)
+dim(model2$data)
+
+
+# only within SSC
+
+formula <- bf(vol1 ~ SSC_s + ges_age_s + birthweight_s + edlevel_s + csection)
+model_vol1_within <- brm_multiple(
+  family = student(),
+  formula = formula,
+  data = map(implist_vol, ~filter(.x, condition == 1)),
+  file = here::here("data/vol1_dr_within_cov")
+)
+
+save(model_vol1_within, file = here::here("data/volmodels_dr.Rds"))
+
+summary(model_vol1_within)
+
+formula <- bf(vol1 ~ SSC_s + ges_age_s + birthweight_s + edlevel_s + csection)
+model_vol1_out <- brm_multiple(
+  family = student(),
+  formula = formula,
+  data = map(implist_vol, ~filter(.x, condition == 0)),
+  file = here::here("data/vol1_dr_out_cov")
+)
+
+summary(model_vol1_out)
+
+
+implist_vol <- map(implist_vol, ~mutate(.x, upper = ifelse(SSC_s <= -0.5, "no", "yes")))
+
+colnames(implist_vol[[1]])
+formula <- formula <- bf(vol1 ~ upper + ges_age_s + birthweight_s + edlevel_s + csection)
+model_year1_cat <- brm_multiple(
+  family = student(),
+  formula = formula,
+  data = implist_vol,
+  file = here::here("data/vol1_dr_cat_cov")
+)
+
+summary(model_year1_cat)
+hypothesis(model_year1_cat, "upperyes < 0")
+dim(implist_vol[[1]])
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..adef5df
--- /dev/null
+++ b/README.md
@@ -0,0 +1,5 @@
+# Daily Skin-to-Skin Contact Alters Microbiota Development in Healthy Full-Term Infants
+
+
+
+This repository contains R code for the statistical analysis of a research project that investigated the effect of a skin-to-skin intervention on microbiota development.