diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6fa0854 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +data/* +docs/* +fig/* +picrust2/* +Rmd/* +.DS_Store diff --git a/R/alpha_diversity.R b/R/alpha_diversity.R new file mode 100644 index 0000000..c4eb076 --- /dev/null +++ b/R/alpha_diversity.R @@ -0,0 +1,791 @@ +# for reproducibility +set.seed(1) +# this script contains step 1 of https://osf.io/s45mu (alpha diversity) +library(mia) +library(glue) +library(tidyverse) +library(rstanarm) +library(mice) +library(brms) +library(here) +library(HDInterval) +source(here("R/helper_functions.R")) + + + + +# Data Preparation +################################################################################ +if (!file.exists(here::here("data/data_imp.Rds"))) { + + + # import of biomfile and meta data can be found in the import script + load(here::here("data/data.Rds")) + + mdata$siblings <- ifelse( + mdata$siblings == 0, 0, ifelse( + mdata$siblings >= 1, 1, NA)) + # before estimating AD, I bring the mdata in long format where appropriate + l_mdata <- pivot_longer( + mdata, + matches("age_week\\d+"), + names_to = "week", + names_prefix = "age_week", + values_to = "age") %>% + mutate(sample_id = ifelse(week == "2", glue("{skippy_id}_1"), ifelse(week == "5", glue("{skippy_id}_2"), ifelse(week == "52", glue("{skippy_id}_3"), NA)))) + + + # obtain AD and combine dfs + tse <- estimateDiversity( + tse, + assay_name = "counts", + index = c("shannon", "faith", "inverse_simpson"), + name = c("shannon", "faith", "inverse_simpson") + ) + tse <- estimateRichness( + tse, + assay_name = "counts", + index = "chao1", + name = "chao1" + ) + ad <- colData(tse) %>% + as.data.frame() %>% + select(sample_id, chao1, inverse_simpson, shannon, faith) + + df <- ad %>% + dplyr::full_join(l_mdata, by = "sample_id") %>% + arrange(skippy_id, week) + + + + # variables I choose to retain for imputation based on expected predictive + # value for microbiota data imputation + vars <- c( + "sample_id", + "skippy_id", + "week", + "age", + "chao1", + "inverse_simpson", + "shannon", + "faith", + "condition", + "csection", + "birthweight", + "siblings", + "sex", + "bfexcl", + "bfperc_w1", + "bfperc_w2", + "bfperc_w3", + "bfperc_w4", + "bfperc_w5", + "apgar_5", + "ges_age", + "edlevel", + "parity", + "weaning", + "antibiotic_1year", + "antibiotic_week2", + "antibiotic_week5" + ) + mdata <- dplyr::rename( + mdata, + constipation_week52 = constipation_1year, + diarrhea_week52 = diarrhea_1year, + antibiotic_week52 = antibiotic_1year + ) + + + + # furthermore I include constipation and diarrhea but first i need to put + # them into long format + c_and_d <- select( + mdata, + skippy_id, + matches("constipation_week\\d+$"), + matches("diarrhea_week\\d+$"), + matches("antibiotic_week\\d+$") + ) %>% + pivot_longer( + cols = contains("week"), + names_to = c(".value", "week"), + names_pattern = "(\\w+)_week(\\d+)") + + + + d <- dplyr::left_join( + select(df, all_of(vars)), + c_and_d, + by = c("skippy_id", "week")) %>% + arrange(sample_id) %>% + select( + sample_id, skippy_id, week, age, chao1, inverse_simpson, shannon, faith, condition, + csection, birthweight, siblings, sex, contains("bf"), constipation, antibiotic, + diarrhea, everything(), -contains("antibiotic_w")) + + save(d, file = here::here("data/table1.Rds")) + # now we are ready for imputation, we start with m = 5 to make the script run + # but for the final estimates we will increase number of imputations + # if skippy_id and week are numeric the imputation model function better, other- + # wise the age var imputation is not good. + nvars <- c("skippy_id", "week") + d <- mutate(d, + across(all_of(nvars), function(x) as.numeric(x)), + age = age - week * 7 + ) + imp <- mice(d, m = 50) + + # for the analysis we need to change dtypes for some vars + # variables to standardize + + svars <- c( + "age", + "age_dev", + "chao1", + "inverse_simpson", + "faith", + "shannon", + "birthweight", + "ges_age", + "apgar_5", + "edlevel" + ) + fvars <- c( + "week", + "condition", + "csection", + "siblings", + "sex", + "antibiotic" + ) + implist <- map(1:5, function(x) { + dtemp <- complete(imp, x) + dtemp <- mutate(dtemp, + across(all_of(fvars), function(x) as.factor(x)), + # across(all_of(svars), function(x) scale(x)[, 1]), + age_dev = age, + age = as.numeric(levels(week)) * 7 + age_dev, + age_s = scale(age)[, 1], + age_y = age/365, + age_dev_s = scale(age_dev)[, 1], + chao1_s = scale(chao1)[, 1], + faith_s = scale(faith)[, 1], + inverse_simpson_s = scale(inverse_simpson)[, 1], + shannon_s = scale(shannon)[, 1], + birthweight_s = scale(birthweight)[, 1], + ges_age_s = scale(ges_age)[, 1], + apgar_5_s = scale(apgar_5)[, 1], + edlevel_s = scale(edlevel)[, 1], + skippy_id = as.integer(skippy_id) + ) + return(dtemp) + }) + + save(d, implist, file = here::here("data/data_imp.Rds")) + } else { + load(file = here::here("data/data_imp.Rds")) +} + + + +# Figure out model structure for the ITT analyses +################################################################################ + + +# I decide which time variable we will use based on LOO +f1 <- bf(shannon_s ~ week * condition + (1|skippy_id)) +m1 <- brm_multiple( + data = implist, + formula = f1, + file = here::here("data/m1.Rds") +) +loo_m1 <- add_criterion( + m1, + "loo", + file = here::here("data/loo_m1"), + moment_match = FALSE +) +loo_m1 + +f2 <- bf(shannon_s ~ age_y * condition + (1|skippy_id)) +m2 <- brm_multiple( + data = implist, + formula = f2, + file = here::here("data/m2.Rds") +) + +loo_m2 <- add_criterion( + m2, + "loo", + file = here::here("data/loo_m2"), + moment_match = FALSE +) +f3 <- bf(shannon_s ~ week * condition + age_dev + (1|skippy_id)) +m3 <- brm_multiple( + data = implist, + formula = f3, + file = here::here("data/m3.Rds") +) +loo_m3 <- add_criterion( + m3, + "loo", + file = here::here("data/loo_m3"), + moment_match = FALSE +) + + +loo_comp <- loo_compare(loo_m1, loo_m2, loo_m3) +loo_comp + +# the model with age has best fit. Therefore, we will use these as our main +# models. We will double check if results differ as compared to using week +# But especially for the frequentist models this setup makes it easier as well. + + +# first we must decide which covariates we want to include. For ITT we can +# decide totally based on model fit as long as it does not interfere with +# causal chain of our DAG. E.g. we must exclude BF although it prob help to +# predict AD as SSC --> BF and we want ACE of SSC. However, variables such as +# age, csection, siblings etc. can be included based on whether they improve +# model fit. We can use loo for this to get a quick answer: + + +# I consider the following variables for selection +coefs <- c( + "csection", + "birthweight_s", + "siblings", + "sex", + "apgar_5_s", + "ges_age_s", + "edlevel_s" +) + + + + +# I will use the following algorithm: +# For each dataset in implist: + # Calculate LOO for base model + # Then for each var in coefs: + # calculate LOO for base model + coef + # If LOO indicates the new model is a better fit + # keep that coef in list + +loo_comp <- map_dfr(coefs, function(coef) { + map_dfr(1:5, function(i) { + map_dfr(list(c(2, 5), 52), function(n_week) { + if (52 %in% n_week) { + # base model + f1 <- bf(shannon_s ~ age * condition) + m1 <- brm( + data = filter(implist[[i]], week == 52) %>% + mutate(age = scale(age)[, 1]), + formula = f1, + file = here::here(glue("data/m1_age_imp{i}_{n_week[1]}.Rds")) + ) + loo_m1 <- add_criterion( + m1, + "loo", + file = here::here(glue("data/loo_m1_age_imp{i}_{n_week[1]}")), + moment_match = FALSE + ) + + f2 <- bf(glue("shannon_s ~ age * condition + {coef}")) + m2 <- brm( + data = filter(implist[[i]], week == 52) %>% + mutate(age = scale(age)[, 1]), + formula = f2, + file = here::here(glue("data/m2_age_imp{i}_{coef}_{n_week[1]}.Rds")) + ) + + loo_m2 <- add_criterion( + m2, + "loo", + file = here::here(glue("data/loo_m2_age_imp{i}_{coef}_{n_week[1]}")), + moment_match = FALSE + ) + lcomp <- loo_compare(loo_m2, loo_m1) + score <- ifelse(rownames(lcomp)[1] == "loo_m1", 0, 1) + } else { + # base model + f1 <- bf(shannon_s ~ age * condition + (1|skippy_id)) + m1 <- brm( + data = filter(implist[[i]], week != 52) %>% + mutate(age = scale(age)[, 1]), + formula = f1, + file = here::here(glue("data/m1_age_imp{i}_{n_week[1]}.Rds")) + ) + loo_m1 <- add_criterion( + m1, + "loo", + file = here::here(glue("data/loo_m1_age_imp{i}_{n_week[1]}")), + moment_match = FALSE + ) + + f2 <- bf(glue("shannon_s ~ age * condition + {coef} + (1|skippy_id)")) + m2 <- brm( + data = filter(implist[[i]], week != 52) %>% + mutate(age = scale(age)[, 1]), + formula = f2, + file = here::here(glue("data/m2_age_imp{i}_{coef}_{n_week[1]}.Rds")) + ) + + loo_m2 <- add_criterion( + m2, + "loo", + file = here::here(glue("data/loo_m2_age_imp{i}_{coef}_{n_week[1]}")), + moment_match = FALSE + ) + lcomp <- loo_compare(loo_m2, loo_m1) + score <- ifelse(rownames(lcomp)[1] == "loo_m1", 0, 1) + } + tibble( + model = ifelse(n_week == 52, "1year", "2 and 5 weeks"), + coef = coef, + imp = i, + score = score + ) + }) + }) +}) + +# siblings was the only covariate that improved out of sample +# predictions. Therefore, we will only keep this variable +group_by(loo_comp, coef, model) %>% + summarise(ss = sum(score)) + +# lastly: should we include random effects for age? +loo_comp <- map_dfr(1:5, function(i) { + # base model + f1 <- bf(shannon_s ~ age * condition + (1|skippy_id)) + m1 <- brm( + data = implist[[i]], + formula = f1, + file = here::here(glue("data/m1_age_imp{i}.Rds")) + ) + loo_m1 <- add_criterion( + m1, + "loo", + file = here::here(glue("data/loo_m1_age_imp{i}")), + moment_match = FALSE + ) + + f2 <- bf(glue("shannon_s ~ age * condition + (1 + age|skippy_id)")) + m2 <- brm( + data = implist[[i]], + formula = f2, + file = here::here(glue("data/m2_age_imp{i}_rftime.Rds")) + ) + + loo_m2 <- add_criterion( + m2, + "loo", + file = here::here(glue("data/loo_m2_age_imp{i}_rftime")), + moment_match = FALSE + ) + lcomp <- loo_compare(loo_m2, loo_m1) + score <- ifelse(rownames(lcomp)[1] == "loo_m1", 0, 1) + tibble( + imp = i, + score = score + ) +}) +# results indicate that varying time effects arent necessary +loo_comp + + + + + +# ITT analyses with MI +############################################################################# + + +# start with posterior predictive checks and model diagnostics before we inter +# pret the results. In the end I will use the final models for interpretation. +# because some loo estimates indicate outliers I will use robust regression. +# Shannon is least prone to non-normality and yet we got some warnings while +# there is no disadvantage of using robust regression: + +# "inverse_simpson_s" cannot be modelled with student() +indeces <- c("shannon_s", "chao1_s", "faith_s") +ad_models <- map(indeces, function(index) { + f <- bf(glue("{index} ~ age_s * condition + siblings + (1|skippy_id)")) + m <- brm_multiple( + family = student(), + data = implist, + formula = f, + file = here::here(glue("data/m_age_{index}_bez.Rds")) + ) +}) +ad_models +save(ad_models, file = here::here("data/ad_models.Rds")) +# now for each model we perform posterior predictive checks and have a look +# at residuals +pp_checks <- map(ad_models, function(m) { + p <- pp_check(m) +}) +pp_checks +# load helper function to diagnose lms +source(here::here("R/ml_helper.R")) +names(ad_models) <- indeces +lm_diags <- map(indeces, function(index) { + lm_diag(ad_models[[index]], ad_models[[index]]$data, index, id = "skippy_id") +}) +lm_diags + +# check also models split by time points (this only makes really a difference for siblings effect) +ad_models2 <- map(c("2and5", "52"), function(weekchr) { + if (weekchr == "2and5") { + ad_models <- map(indeces, function(index) { + f <- bf(glue("{index} ~ age * condition + siblings + (1|skippy_id)")) + m <- brm_multiple( + family = student(), + data = map(implist, ~filter(.x, week != 52)), + formula = f, + file = here::here(glue("data/m_age_{index}_{weekchr}_bez.Rds")) + ) + }) + } else { + ad_models <- map(indeces, function(index) { + f <- bf(glue("{index} ~ condition + age + siblings")) + m <- brm_multiple( + family = student(), + data = map(implist, ~filter(.x, week == 52)), + formula = f, + file = here::here(glue("data/m_{index}_{weekchr}_bez.Rds")) + ) + }) + } + ad_models +}) +ad_models2 +post <- posterior_samples(ad_models2[[1]][[1]]) +mean(post$b_siblings1 < 0) +length(unlist(ad_models2, recursive = FALSE)) +# now for each model we perform posterior predictive checks and have a look +# at residuals +ad_models2 <- unlist(ad_models2, recursive = FALSE) + +length(ad_models2) +pp_checks2 <- map(ad_models2, function(m) { + p <- pp_check(m) +}) +pp_checks2 + + +lm_diags2 <- map(c("2and5", "52"), function(weekchr) { + if (weekchr == "2and5") { + lm_diags <- map(indeces, function(index) { + lm_diag( + ad_models2[1:4][[index]], + filter(implist[[1]], week != 52), + index, id = "skippy_id") + }) + } else { + lm_diags <- map(indeces, function(index) { + lm_diag( + ad_models2[5:8][[index]], + filter(implist[[1]], week== 52), + index, id = "skippy_id") + }) + } + lm_diags +}) +lm_diags2[[1]][[1]] +lm_diags2[[2]][[1]] + + + + +# create a plot for AD +if (!is.factor(d$week)) { + d <-mutate(d, + week = as.factor(week), condition = as.factor(condition), + group = ifelse(week == 2 & condition == 0, "W2 CAU", ifelse( + week == 2 & condition == 1, "W2 SSC", ifelse( + week == 5 & condition == 0, "W5 CAU", ifelse( + week == 5 & condition == 1, "W5 SSC", ifelse( + week == 52 & condition == 0, "W52 CAU", ifelse( + week == 52 & condition == 1, "W52 SSC", NA)))))), + group = as.factor(group), + condition_label = ifelse(condition == 0, "CAU", ifelse( + condition == 1, "SSC", NA)), + week_label = glue::glue("Week {week}") + ) +} + d <-mutate(d, + week = as.factor(week), condition = as.factor(condition), + group = ifelse(week == 2 & condition == 0, "W2 CAU", ifelse( + week == 2 & condition == 1, "W2 SSC", ifelse( + week == 5 & condition == 0, "W5 CAU", ifelse( + week == 5 & condition == 1, "W5 SSC", ifelse( + week == 52 & condition == 0, "W52 CAU", ifelse( + week == 52 & condition == 1, "W52 SSC", NA)))))), + group = as.factor(group), + condition_label = ifelse(condition == 0, "CAU", ifelse( + condition == 1, "SSC", NA)), + week_label = glue::glue("Week {week}") + ) +# alternative 1 +adplots <- map(str_remove(indeces, "_s$"), function(index) { + d %>% + ggplot(aes_string("group", index, fill = "condition")) + + geom_boxplot() + + #ggbeeswarm::geom_beeswarm(size = 3, cex = 1) + + geom_jitter(width = 0.1, size = 2, alpha = 0.7) + + # scale_fill_manual(values = c("#fc8d62", "#8da0cb")) + + scale_fill_manual(values = c("#ffffff", "#c0c1c2")) + + theme_bw(base_size = 25) + + theme(legend.position = "none") + + xlab("") + ylab(str_to_title(index)) +}) +# alternative 2 +adplots <- map(str_remove(indeces, "_s$"), function(index) { + d %>% + ggplot(aes_string("condition_label", index, fill = "condition_label")) + + geom_boxplot() + + #ggbeeswarm::geom_beeswarm(size = 3, cex = 1) + + geom_jitter(width = 0.1, size = 2, alpha = 0.7) + + # scale_fill_manual(values = c("#fc8d62", "#8da0cb")) + + scale_fill_manual(values = c("#ffffff", "#c0c1c2")) + + facet_wrap(~week_label, strip.position = "bottom") + + #scale_fill_manual(values = c("#fc8d62", "#8da0cb")) + + theme_bw(base_size = 25) + + theme( + legend.position = "none", + strip.placement = "outside", + strip.background = element_blank()) + + xlab("") + ylab(str_to_title(index)) +}) + + + +save(adplots, file = here::here("data/adplots.Rds")) + + + + +# ITT analyses complete case +############################################################################# +if (!"shannon_s" %in% colnames(d)) { + d <- mutate(d, + age_s = scale(age)[, 1], + chao1_s = scale(chao1)[, 1], + faith_s = scale(faith)[, 1], + inverse_simpson_s = scale(inverse_simpson)[, 1], + shannon_s = scale(shannon)[, 1], + skippy_id = as.integer(skippy_id), + siblings = as.factor(siblings) + ) +} + + +ad_models_cc <- map(c("2and5", "52"), function(weekchr) { + if (weekchr == "2and5") { + ad_models <- map(indeces, function(index) { + f <- bf(glue("{index} ~ age * condition + siblings + (1|skippy_id)")) + m <- brm( + family = student(), + data = filter(d, week != 52) %>% + mutate(age = scale(age)[, 1]), + formula = f, + file = here::here(glue("data/m_age_{index}_{weekchr}_cc_bez.Rds")) + ) + }) + } else { + ad_models <- map(indeces, function(index) { + f <- bf(glue("{index} ~ condition + age + siblings")) + m <- brm( + family = studetnt(), + data = filter(d, week == 52) %>% + mutate(age = scale(age)[, 1]), + formula = f, + file = here::here(glue("data/m_age_{index}_{weekchr}_cc_bez.Rds")) + ) + }) + } + ad_models +}) +parameters <- c( + "b_age", + "b_condition1", + "b_siblings1", + "b_age:condition1" +) + + + +tbs_cc <- map2_dfr(c("2and5", "52"), ad_models_cc, function(weekchr, models) { + if (weekchr == "2and5") { + tb <- map2_dfr(indeces, models, function(index, model) { + summarise_posterior(model, parameters, 2) %>% + mutate(model = weekchr, index = index, indicator = (lower <=0 & upper <=0) | (lower >=0 & upper >=0)) + }) + } else { + tb <- map2_dfr(indeces, models, function(index, model) { + summarise_posterior(model, parameters[-length(parameters)], 2) %>% + mutate(model = weekchr, index = index, indicator = (lower <=0 & upper <=0) | (lower >=0 & upper >=0)) + }) + } +}) +filter(tbs_cc, indicator) + +filter(tbs_cc, indicator) + + + +# results do not differ meaningully between complete case analyses and mi +# for inverse simpson the models are not a good fit in either case. I ran +# wilcoxon tests here: +filter(d, group %in% c("W52 CAU", "W52 SSC")) %>% + mutate(group = fct_drop(group)) %>% + wilcox.test(inverse_simpson ~ group, data = .) + + + + +# PP analyses with MI +############################################################################# + +# obtain ids that were selected for PP analyses +pp_indicator <- foreign::read.spss(here::here("data/raw_data/kelly141022/Data_ITT_PP_ExploratoryDRselections.sav"), to.data.frame = TRUE) +count(pp_indicator, PP, SSC) +pp_indicator <- select(pp_indicator, skippy_id = ID, pp = PP) +# add pp info to existing data +if (!"pp" %in% colnames(d)) { + d <- left_join(d, pp_indicator, by = "skippy_id") +} +# 60 that are in PP and condition 0; 18 that are condition 1 and pp. Fits... +count(d, condition, pp) +d_pp <- filter(d, pp == 1) +implist_pp <- map(implist, function(dimp) { + dimp_new <- left_join(dimp, pp_indicator, by = "skippy_id") %>% + filter(pp == 1) + dimp_new +}) + + +covariates <- c( + "siblings", + "birthweight_s", + "ges_age_s", + "edlevel_s", + "csection", + "sex" +) + +model_str <- "age_s * condition" +for (coef in covariates) { + model_str <- glue("{model_str} + {coef}") +} + +# "inverse_simpson_s" cannot be modelled with student() +indeces <- c("shannon_s", "chao1_s", "faith_s") +ad_models_pp <- map(indeces, function(index) { + f <- bf(glue("{index} ~ {model_str} + (1|skippy_id)")) + m <- brm_multiple( + family = student(), + data = implist_pp, + formula = f, + file = here::here(glue("data/m_age_{index}_pp_bez.Rds")) + ) +}) +ad_models_pp + +indeces <- c("shannon_s", "inverse_simpson_s", "chao1_s", "faith_s") +ad_models_pp <- map(c("2and5", "52"), function(weekchr) { + if (weekchr == "2and5") { + ad_models <- map(indeces, function(index) { + f <- bf(glue("{index} ~ {model_str} + (1|skippy_id)")) + m <- brm_multiple( + family = student(), + data = map(implist_pp, ~filter(.x, week != 52) %>% + mutate(age = scale(age)[, 1])), + formula = f, + file = here::here(glue("data/m_age_{index}_{weekchr}_sn_pp_bez.Rds")) + ) + }) + } else { + ad_models <- map(indeces, function(index) { + f <- bf(glue("{index} ~ {model_str}")) + if (index %in% c("chao1_s", "inverse_simpon_s")) { + family <- skew_normal() + } else { + family <- student() + } + m <- brm_multiple( + family = family, + data = map(implist_pp, ~filter(.x, week == 52) %>% + mutate(age = scale(age)[, 1])), + formula = f, + file = here::here(glue("data/m_age_{index}_{weekchr}_sn_pp_bez.Rds")) + ) + }) + } + ad_models +}) + + + +pp_checks_pp <- map(ad_models_pp, function(m) { + p <- pp_check(m) +}) +pp_checks_pp + +tbs_pp <- map2_dfr(c("2and5", "52"), ad_models_pp, function(weekchr, models) { + if (weekchr == "2and5") { + tb <- map2_dfr(indeces, models, function(index, model) { + summarise_posterior(model, parameters, 2) %>% + mutate(model = weekchr, index = index, indicator = (lower <=0 & upper <=0) | (lower >=0 & upper >=0)) + }) + } else { + tb <- map2_dfr(indeces, models, function(index, model) { + summarise_posterior(model, parameters[-length(parameters)], 2) %>% + mutate(model = weekchr, index = index, indicator = (lower <=0 & upper <=0) | (lower >=0 & upper >=0)) + }) + } +}) +filter(tbs_pp, indicator) + + +# results are similar to the ITT analyses. No effects on alpha diversity + + +# create a supplementary table with all beta coefficients + +ad_stbl <- map2_dfr(c("2and5", "52"), ad_models_pp, function(weekchr, models) { + if (weekchr == "2and5") { + tb <- map2_dfr(indeces, models, function(index, model) { + summarise_posterior(model, contains("b_"), 3) %>% + mutate(model = weekchr, index = index, indicator = (lower <=0 & upper <=0) | (lower >=0 & upper >=0)) + }) + } else { + tb <- map2_dfr(indeces, models, function(index, model) { + summarise_posterior(model, contains("b_"), 3) %>% + mutate(model = weekchr, index = index, indicator = (lower <=0 & upper <=0) | (lower >=0 & upper >=0)) + }) + } +}) +ad_stbl <- group_by(ad_stbl, model, index) %>% nest() + +ad_stbl_pp <- pmap(list(ad_stbl[[1]], ad_stbl[[2]], ad_stbl[[3]]), function(time, index, tbl){ + time_t <- ifelse(time == "2and5", "week 2 and 5", "1 year") + index_t <- str_to_title(str_remove(index, "_s$")) + caption <- glue("Alpha diversity model using samples obtained at {time_t} and {index_t}") + knitr::kable(tbl, caption = caption) +}) + +ad_stbl_pp + + + + + + + + + + diff --git a/R/beta_diversity_ait.R b/R/beta_diversity_ait.R new file mode 100644 index 0000000..5fc6b74 --- /dev/null +++ b/R/beta_diversity_ait.R @@ -0,0 +1,1223 @@ +# thanks to Gavin for helping out with this part: +# https://stats.stackexchange.com/questions/590510/repeated-measures-permanova-nowhere-to-find + +set.seed(1) +library(mia) +library(tidyverse) +library(tidySummarizedExperiment) +library(vegan) +library(permute) +library(glue) + + +# import of biomfile and meta data can be found in the import script +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) +# add metadata to tse +colData(tse) <- colData(tse) %>% + as.data.frame() %>% + left_join(select(d, age, sample_id), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + DataFrame() + +colData(tse)$age <- colData(tse)$age + as.numeric(as.character(colData(tse)$week)) * 7 +colData(tse)$age_s <- scale(colData(tse)$age)[, 1] + +tse <- agglomerateByRank(tse, rank = "genus") + +############################################################################### +######################### 1. ITT ############################## +############################################################################### + + +####################### 1.1 Complete Case Analysis ############################ + +### First I fit a model to all samples + +# we use Aitchison distance +tse <- transformSamples(x = tse, method = "clr", pseudocount = 1, name = "clr") +# extract relevant meta data and omit na as adonis doesnt accept them. +meta <- colData(tse) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings) %>% + na.omit() +# we need to account for non-independence of data in the infancy model +ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id +h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) +# according to omitted NAs I need to select stool samples +asv <- t(assay(tse, "clr")) +asv <- asv[meta$sample_id, ] + +# fit and inspect model +permanova <- adonis2(asv ~ condition + age_s + condition:age_s, + # by = "margin", # each term analyzed individually + data = meta, + method = "euclidean", + # h does not work if trend is in data (therefore use 999), see Gavins post + permutations = 999 + ) + +permanova + + + +# Perform dbRDA +dbrda <- dbrda(asv ~ age_s + age_s*condition, data = meta) +# Perform permutational analysis +permanova2 <- anova.cca(dbrda, + by = "margin", # each term analyzed individually + method = "euclidean", + permutations = 999) +permanova2 + + + +### Now split models by infancy and 1 year olds + +## first infancy +tse_inf <- filter(tse, week != 52) +# extract relevant meta data and omit na as adonis doesnt accept them. +meta <- colData(tse_inf) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings) %>% + na.omit() +# we need to account for non-independence of data in the infancy model +ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id +h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) +# according to omitted NAs I need to select stool samples +asv <- t(assay(tse_inf, "clr")) +asv <- asv[meta$sample_id, ] + +# fit and inspect model +permanova <- adonis2(asv ~ condition + age_s + condition:age_s, + # by = "margin", # each term analyzed individually + data = meta, + method = "euclidean", + permutations = 999 + ) + +permanova + +# Perform dbRDA +dbrda <- dbrda(asv ~ condition + age_s, data = meta) +# Perform permutational analysis +permanova2 <- anova.cca(dbrda, + by = "margin", # each term analyzed individually + method = "euclidean", + permutations = 999) +permanova2 + + + + +## then 1 year olds +tse_y <- filter(tse, week == 52) +# extract relevant meta data and omit na as adonis doesnt accept them. +meta <- colData(tse_y) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings) %>% + na.omit() + +# according to omitted NAs I need to select stool samples +asv <- t(assay(tse_y, "clr")) +asv <- asv[meta$sample_id, ] + +# fit and inspect model +permanova <- adonis2(asv ~ condition + age_s, + # by = "margin", # each term analyzed individually + data = meta, + method = "euclidean", + permutations = 999 + ) + +permanova + +# Perform dbRDA +dbrda <- dbrda(asv ~ condition + age_s, data = meta) +# Perform permutational analysis +permanova2 <- anova.cca(dbrda, + by = "margin", # each term analyzed individually + method = "euclidean", + permutations = 999) +permanova2 + + + + +######################## 1. 2Multiple imputation ############################# + +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) + + +permanovas <- map2_dfr(implist, 1:length(implist), function(dimp, imp) { + # Steps are repeated as from the beginning in the script above + + # step 1 + # we use Aitchison distance + tse <- agglomerateByRank(tse, rank = "genus") + tse_map <- transformSamples(x = tse, method = "clr", pseudocount = 1, name = "clr") + colData(tse_map) <- colData(tse_map) %>% + as.data.frame() %>% + select(sample_id) %>% + left_join( + select(dimp, condition, siblings, age, sample_id, week, skippy_id), + by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate( + age = age + as.numeric(as.character(week)), + age_s = scale(age)[, 1] + ) %>% + DataFrame() + + + # step 2 + + ## first all samples + # extract relevant meta data and omit na as adonis doesnt accept them. + meta <- colData(tse_map) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings) %>% + na.omit() + # we need to account for non-independence of data in the infancy model + ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id + h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) + # according to omitted NAs I need to select stool samples + asv <- t(assay(tse_map, "clr")) + asv <- asv[meta$sample_id, ] + + # fit and inspect model + permanova_all <- adonis2(asv ~ condition + age_s + age_s:condition, + # by = "margin", # each term analyzed individually + data = meta, + method = "euclidean", + permutations = 999 + ) + + + ## first infancy + tse_inf <- filter(tse_map, week != 52) + # extract relevant meta data and omit na as adonis doesnt accept them. + meta <- colData(tse_inf) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings) %>% + na.omit() + # we need to account for non-independence of data in the infancy model + ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id + h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) + # according to omitted NAs I need to select stool samples + asv <- t(assay(tse_inf, "clr")) + asv <- asv[meta$sample_id, ] + + # fit and inspect model + permanova_inf <- adonis2(asv ~ condition + age_s + condition:age_s, + # by = "margin", # each term analyzed individually + data = meta, + method = "euclidean", + permutations = 999 + ) + + + ## then 1 year olds + tse_y <- filter(tse_map, week == 52) + # extract relevant meta data and omit na as adonis doesnt accept them. + meta <- colData(tse_y) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings) %>% + na.omit() + + # according to omitted NAs I need to select stool samples + asv <- t(assay(tse_y, "clr")) + asv <- asv[meta$sample_id, ] + + # fit and inspect model + permanova_y <- adonis2(asv ~ condition + age_s, + # by = "margin", # each term analyzed individually + data = meta, + method = "euclidean", + permutations = 999 + ) + permanova_all$time <- "all" + permanova_inf$time <- "infancy" + permanova_y$time <- "year1" + permanova <- bind_rows( + as.data.frame(permanova_all) %>% rownames_to_column("parameter"), + as.data.frame(permanova_inf) %>% rownames_to_column("parameter"), + as.data.frame(permanova_y) %>% rownames_to_column("parameter") + ) + permanova$imp <- imp + + #list(infancy = permanova_inf, year1 = permanova_y, imp = imp) + permanova +}) +permanovas + + + + +######################## 1. 2 WITH BREASTFEEDING ############################# + +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) + +permanovas_bf <- map2_dfr(implist, 1:length(implist), function(dimp, imp) { + # Steps are repeated as from the beginning in the script above + + # step 1 + # we use Aitchison distance + tse <- agglomerateByRank(tse, rank = "genus") + tse_map <- transformSamples(x = tse, method = "clr", pseudocount = 1, name = "clr") + colData(tse_map) <- colData(tse_map) %>% + as.data.frame() %>% + select(sample_id) %>% + left_join( + select( + dimp, condition, siblings, age, sample_id, + week, skippy_id, bfexcl), + by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate( + age = age + as.numeric(as.character(week)), + age_s = scale(age)[, 1] + ) %>% + DataFrame() + + + # step 2 + + ## first all samples + # extract relevant meta data and omit na as adonis doesnt accept them. + meta <- colData(tse_map) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings, bfexcl) %>% + na.omit() + # we need to account for non-independence of data in the infancy model + ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id + h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) + # according to omitted NAs I need to select stool samples + asv <- t(assay(tse_map, "clr")) + asv <- asv[meta$sample_id, ] + + # fit and inspect model + permanova_all <- adonis2(asv ~ bfexcl + condition + age_s + condition:age_s, + # by = "margin", # each term analyzed individually + data = meta, + method = "euclidean", + permutations = 999, + # by = "margin" + ) + + + ## first infancy + tse_inf <- filter(tse_map, week != 52) + # extract relevant meta data and omit na as adonis doesnt accept them. + meta <- colData(tse_inf) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings, bfexcl) %>% + na.omit() + # we need to account for non-independence of data in the infancy model + ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id + h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) + # according to omitted NAs I need to select stool samples + asv <- t(assay(tse_inf, "clr")) + asv <- asv[meta$sample_id, ] + + # fit and inspect model + permanova_inf <- adonis2(asv ~ bfexcl + condition + age_s + condition:age_s, + # by = "margin", # each term analyzed individually + data = meta, + method = "euclidean", + permutations = 999 + ) + + + ## then 1 year olds + tse_y <- filter(tse_map, week == 52) + # extract relevant meta data and omit na as adonis doesnt accept them. + meta <- colData(tse_y) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings, bfexcl) %>% + na.omit() + + # according to omitted NAs I need to select stool samples + asv <- t(assay(tse_y, "clr")) + asv <- asv[meta$sample_id, ] + + # fit and inspect model + permanova_y <- adonis2(asv ~ bfexcl + condition + age_s, + # by = "margin", # each term analyzed individually + data = meta, + method = "euclidean", + permutations = 999 + ) + permanova_all$time <- "all" + permanova_inf$time <- "infancy" + permanova_y$time <- "year1" + permanova <- bind_rows( + as.data.frame(permanova_all) %>% rownames_to_column("parameter"), + as.data.frame(permanova_inf) %>% rownames_to_column("parameter"), + as.data.frame(permanova_y) %>% rownames_to_column("parameter") + ) + permanova$imp <- imp + + #list(all = permanova_all, infancy = permanova_inf, year1 = permanova_y, imp = imp) + permanova +}) +permanovas_bf + + + + + + + + + + +############################################################################### +######################### 2. PP ############################## +############################################################################### + +# import of biomfile and meta data can be found in the import script +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) +tse <- agglomerateByRank(tse, rank = "genus") + +# obtain ids that were selected for PP analyses +pp_indicator <- foreign::read.spss(here::here("data/raw_data/kelly141022/Data_ITT_PP_ExploratoryDRselections.sav"), to.data.frame = TRUE) +count(pp_indicator, PP, SSC) +pp_indicator <- select(pp_indicator, skippy_id = ID, pp = PP) +# add pp info to existing data +if (!"pp" %in% colnames(d)) { + d <- left_join(d, pp_indicator, by = "skippy_id") %>% + mutate( + birthweight_s = scale(birthweight)[, 1], + ges_age_s = scale(ges_age)[, 1], + edlevel_s = scale(edlevel)[, 1], + ) +} +# 60 that are in PP and condition 0; 18 that are condition 1 and pp. Fits... +count(d, condition, pp) +d_pp <- filter(d, pp == 1) +implist_pp <- map(implist, function(dimp) { + dimp_new <- left_join(dimp, pp_indicator, by = "skippy_id") %>% + filter(pp == 1) + dimp_new +}) + +# add metadata to tse +colData(tse) <- colData(tse) %>% + as.data.frame() %>% + select(sample_id) %>% + left_join(select(d_pp, skippy_id, siblings, week, age, condition, birthweight_s, ges_age_s, edlevel_s, csection, sex, sample_id, pp), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + DataFrame() +colnames(colData(tse)) +colData(tse)$age <- colData(tse)$age + as.numeric(as.character(colData(tse)$week)) * 7 +colData(tse)$age_s <- scale(colData(tse)$age)[, 1] +tse_pp <- filter(tse, pp == 1) + +colData(tse_pp)$csection +####################### 2.1 Complete Case Analysis ############################ + +### First I fit a model to all samples + +# we use Aitchison distance +tse_pp <- transformSamples(x = tse_pp, method = "clr", pseudocount = 1, name = "clr") +# extract relevant meta data and omit na as adonis doesnt accept them. +meta <- colData(tse_pp) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings, birthweight_s, ges_age_s, + edlevel_s, csection, sex) %>% + na.omit() +# we need to account for non-independence of data in the infancy model +ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id +h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) +# according to omitted NAs I need to select stool samples +asv <- t(assay(tse_pp, "clr")) +asv <- asv[meta$sample_id, ] + + +# fit and inspect model +permanova <- adonis2(asv ~ condition + age_s + condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings, + # by = "margin", # each term analyzed individually + data = meta, + method = "euclidean", + # does not work if trend is in data (therefore use 999) + permutations = 999 + ) + +permanova + +# Perform dbRDA +dbrda <- dbrda(asv ~ condition + age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings, data = meta) +# Perform permutational analysis +permanova2 <- anova.cca(dbrda, + by = "margin", # each term analyzed individually + method = "euclidean", + permutations = 999) +permanova2 + + + +### Now split models by infancy and 1 year olds + +## first infancy +tse_inf <- filter(tse_pp, week != 52) +# extract relevant meta data and omit na as adonis doesnt accept them. +meta <- colData(tse_inf) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings, birthweight_s, ges_age_s, + edlevel_s, csection, sex) %>% + na.omit() +# we need to account for non-independence of data in the infancy model +ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id +h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) +# according to omitted NAs I need to select stool samples +asv <- t(assay(tse_inf, "clr")) +asv <- asv[meta$sample_id, ] + +# fit and inspect model +permanova <- adonis2(asv ~ condition + age_s * condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings, + # by = "margin", # each term analyzed individually + data = meta, + method = "euclidean", + permutations = 999 + ) + +permanova + +# Perform dbRDA +dbrda <- dbrda(asv ~ condition + age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings, data = meta) +# Perform permutational analysis +permanova2 <- anova.cca(dbrda, + by = "margin", # each term analyzed individually + method = "euclidean", + permutations = 999) +permanova2 + + + + +## then 1 year olds +tse_y <- filter(tse_pp, week == 52) +# extract relevant meta data and omit na as adonis doesnt accept them. +meta <- colData(tse_y) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings, birthweight_s, ges_age_s, + edlevel_s, csection, sex) %>% + na.omit() + +# according to omitted NAs I need to select stool samples +asv <- t(assay(tse_y, "clr")) +asv <- asv[meta$sample_id, ] + +# fit and inspect model +permanova <- adonis2(asv ~ condition + age_s + condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings, + # by = "margin", # each term analyzed individually + data = meta, + method = "euclidean", + permutations = 999 + ) + +permanova + +# Perform dbRDA +dbrda <- dbrda(asv ~ condition + age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings, data = meta) +# Perform permutational analysis +permanova2 <- anova.cca(dbrda, + by = "margin", # each term analyzed individually + method = "euclidean", + permutations = 999) +permanova2 + +# same as above + + +######################## 2.2 Multiple imputation ############################# + +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) +tse <- agglomerateByRank(tse, rank = "genus") +permanovas_pp <- map2_dfr(implist_pp, 1:length(implist_pp), function(dimp, imp) { + # Steps are repeated as from the beginning in the script above + + # step 1 + # we use Aitchison distance + tse_map <- transformSamples(x = tse, method = "clr", pseudocount = 1, name = "clr") + colData(tse_map) <- colData(tse_map) %>% + as.data.frame() %>% + select(sample_id) %>% + left_join( + select(dimp, condition, siblings, age, birthweight_s, ges_age_s, edlevel_s, csection, sex, sample_id, week, skippy_id, pp), + by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate( + age = age + as.numeric(as.character(week)), + age_s = scale(age)[, 1] + ) %>% + DataFrame() + + tse_map <- filter(tse_map, pp == 1) + + + # step 2 + ## first all samples + # extract relevant meta data and omit na as adonis doesnt accept them. + meta <- colData(tse_map) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings, birthweight_s, ges_age_s, + edlevel_s, csection, sex) %>% + na.omit() + # we need to account for non-independence of data in the infancy model + ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id + h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) + # according to omitted NAs I need to select stool samples + asv <- t(assay(tse_map, "clr")) + asv <- asv[meta$sample_id, ] + + # fit and inspect model + permanova_all <- adonis2(asv ~ condition + age_s + condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings, + # by = "margin", # each term analyzed individually + data = meta, + method = "euclidean", + permutations = 999 + ) + + ## then infancy + tse_inf <- filter(tse_map, week != 52) + # extract relevant meta data and omit na as adonis doesnt accept them. + meta <- colData(tse_inf) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings, birthweight_s, ges_age_s, + edlevel_s, csection, sex) %>% + na.omit() + # we need to account for non-independence of data in the infancy model + ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id + h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) + # according to omitted NAs I need to select stool samples + asv <- t(assay(tse_inf, "clr")) + asv <- asv[meta$sample_id, ] + + # fit and inspect model + permanova_inf <- adonis2(asv ~ condition + age_s + condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings, + # by = "margin", # each term analyzed individually + data = meta, + method = "euclidean", + permutations = 999 + ) + + + ## then 1 year olds + tse_y <- filter(tse_map, week == 52) + # extract relevant meta data and omit na as adonis doesnt accept them. + meta <- colData(tse_y) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings, birthweight_s, ges_age_s, + edlevel_s, csection, sex) %>% + na.omit() + + # according to omitted NAs I need to select stool samples + asv <- t(assay(tse_y, "clr")) + asv <- asv[meta$sample_id, ] + + # fit and inspect model + permanova_y <- adonis2(asv ~ condition + age_s + condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings, + # by = "margin", # each term analyzed individually + data = meta, + method = "euclidean", + permutations = 999 + ) + permanova_all$time <- "all" + permanova_inf$time <- "infancy" + permanova_y$time <- "year1" + permanova <- bind_rows( + as.data.frame(permanova_all) %>% rownames_to_column("parameter"), + as.data.frame(permanova_inf) %>% rownames_to_column("parameter"), + as.data.frame(permanova_y) %>% rownames_to_column("parameter") + ) + permanova$imp <- imp + permanova + #list(infancy = permanova_inf, year1 = permanova_y, imp = imp) +}) + +permanovas_pp + +# same as above + + + + + + +############################################################################## +#################################PLOT & TABLE ################################ +############################################################################## + + +# for the tables I use first imputed dataset of ITT analyses +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) +tse <- agglomerateByRank(tse, rank = "genus") + +tse_plot <- transformSamples(x = tse, method = "clr", pseudocount = 1, name = "clr") +colData(tse_plot) <- colData(tse_plot) %>% + as.data.frame() %>% + select(sample_id) %>% + left_join( + select( + implist[[1]], + condition, siblings, age, + sample_id, week, skippy_id, bfexcl), + by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate( + age = age + as.numeric(as.character(week)), + age_s = scale(age)[, 1] + ) %>% + DataFrame() + + + +## first all samples +# extract relevant meta data and omit na as adonis doesnt accept them. +meta_plot <- colData(tse_plot) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings, bfexcl) %>% + na.omit() +# we need to account for non-independence of data in the infancy model +ids <- meta_plot %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id +h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) +# according to omitted NAs I need to select stool samples +asv <- t(assay(tse_plot, "clr")) +asv <- asv[meta_plot$sample_id, ] + +# fit and inspect model +permanova_all <- adonis2(asv ~ condition * age_s, + # by = "margin", # each term analyzed individually + data = meta_plot, + method = "euclidean", + permutations = 999 + ) +beta_table <- permanova_all %>% + as.data.frame() %>% + rownames_to_column("parameter") %>% + mutate( + `R2 (%)` = glue("{round(R2 * 100, 2)}"), + P = as.character(`Pr(>F)`), + P = str_remove(P, "^0"), + parameter = str_to_title(parameter), + parameter = ifelse(parameter == "Condition", "SSC", ifelse( + parameter == "Age_s", "Age", ifelse( + parameter == "Condition:age_s", "SSC x Age", parameter + ))) + ) %>% + mutate(across(where(is.numeric), round, 2)) %>% + mutate_all(function(x) ifelse(is.na(x), "", x)) %>% + select(parameter, Df, SumOfSqs, "R2 (%)", F, P) + +colnames(beta_table) <- str_to_title(colnames(beta_table)) + +# fit and inspect model +permanova_all <- adonis2( + asv ~ bfexcl + condition * age_s, + # by = "margin", # each term analyzed individually + data = meta_plot, + method = "euclidean", + permutations = 999) + +beta_table_bf <- permanova_all %>% + as.data.frame() %>% + rownames_to_column("parameter") %>% + mutate( + `R2 (%)` = glue("{round(R2 * 100, 2)}"), + P = as.character(`Pr(>F)`), + P = str_remove(P, "^0"), + parameter = str_to_title(parameter), + parameter = ifelse(parameter == "Condition", "SSC", ifelse( + parameter == "Age_s", "Age", ifelse( + parameter == "Condition:age_s", "SSC x Age", ifelse( + parameter == "Bfexcl", "Breastfeeding", parameter + )))) + ) %>% + mutate(across(where(is.numeric), round, 2)) %>% + mutate_all(function(x) ifelse(is.na(x), "", x)) %>% + select(parameter, Df, SumOfSqs, "R2 (%)", F, P) + +colnames(beta_table_bf) <- str_to_title(colnames(beta_table_bf)) + +beta_table +beta_table_bf + + + + + + + + +# same but split by infants and year1 + +# for the tables I use first imputed dataset of ITT analyses +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) +tse <- agglomerateByRank(tse, rank = "genus") +tse_inf <- filter(tse, week != 52) + +tse_plot <- transformSamples(x = tse_inf, method = "clr", pseudocount = 1, name = "clr") +colData(tse_plot) <- colData(tse_plot) %>% + as.data.frame() %>% + select(sample_id) %>% + left_join( + select( + implist[[1]], + condition, siblings, age, + sample_id, week, skippy_id, bfexcl), + by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate( + age = age + as.numeric(as.character(week)), + age_s = scale(age)[, 1] + ) %>% + DataFrame() + + +# step 2 + +# extract relevant meta data and omit na as adonis doesnt accept them. +meta_plot <- colData(tse_plot) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings, bfexcl) %>% + na.omit() +# we need to account for non-independence of data in the infancy model +ids <- meta_plot %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id +h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) +# according to omitted NAs I need to select stool samples +asv <- t(assay(tse_plot, "clr")) +asv <- asv[meta_plot$sample_id, ] + +# fit and inspect model +permanova_inf <- adonis2(asv ~ condition * age_s, + # by = "margin", # each term analyzed individually + data = meta_plot, + method = "euclidean", + permutations = 999 + ) +beta_table_inf <- permanova_inf %>% + as.data.frame() %>% + rownames_to_column("parameter") %>% + mutate( + `R2 (%)` = glue("{round(R2 * 100, 2)}"), + P = as.character(`Pr(>F)`), + P = str_remove(P, "^0"), + parameter = str_to_title(parameter), + parameter = ifelse(parameter == "Condition", "SSC", ifelse( + parameter == "Age_s", "Age", ifelse( + parameter == "Condition:age_s", "SSC x Age", parameter + ))) + ) %>% + mutate(across(where(is.numeric), round, 2)) %>% + mutate_all(function(x) ifelse(is.na(x), "", x)) %>% + select(parameter, Df, SumOfSqs, "R2 (%)", F, P) + +colnames(beta_table_inf) <- str_to_title(colnames(beta_table_inf)) + +# fit and inspect model +permanova_inf <- adonis2( + asv ~ bfexcl + condition * age_s, + # by = "margin", # each term analyzed individually + data = meta_plot, + method = "euclidean", + permutations = 999) + +beta_table_bf_inf <- permanova_inf %>% + as.data.frame() %>% + rownames_to_column("parameter") %>% + mutate( + `R2 (%)` = glue("{round(R2 * 100, 2)}"), + P = as.character(`Pr(>F)`), + P = str_remove(P, "^0"), + parameter = str_to_title(parameter), + parameter = ifelse(parameter == "Condition", "SSC", ifelse( + parameter == "Age_s", "Age", ifelse( + parameter == "Condition:age_s", "SSC x Age", ifelse( + parameter == "Bfexcl", "Breastfeeding", parameter + )))) + ) %>% + mutate(across(where(is.numeric), round, 2)) %>% + mutate_all(function(x) ifelse(is.na(x), "", x)) %>% + select(parameter, Df, SumOfSqs, "R2 (%)", F, P) + +colnames(beta_table_bf_inf) <- str_to_title(colnames(beta_table_bf_inf)) + +beta_table_inf +beta_table_bf_inf + + + + + +# now lastly year 1 +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) +tse <- agglomerateByRank(tse, rank = "genus") +tse_year1 <- filter(tse, week == 52) + +tse_plot <- transformSamples(x = tse_year1, method = "clr", pseudocount = 1, name = "clr") +colData(tse_plot) <- colData(tse_plot) %>% + as.data.frame() %>% + select(sample_id) %>% + left_join( + select( + implist[[1]], + condition, siblings, age, + sample_id, week, skippy_id, bfexcl), + by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate( + age = age + as.numeric(as.character(week)), + age_s = scale(age)[, 1] + ) %>% + DataFrame() + + +# step 2 + +## first all samples +# extract relevant meta data and omit na as adonis doesnt accept them. +meta_plot <- colData(tse_plot) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings, bfexcl) %>% + na.omit() +# we need to account for non-independence of data in the infancy model +ids <- meta_plot %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id +h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) +# according to omitted NAs I need to select stool samples +asv <- t(assay(tse_plot, "clr")) +asv <- asv[meta_plot$sample_id, ] + +# fit and inspect model +permanova_year1 <- adonis2(asv ~ condition + age_s, + # by = "margin", # each term analyzed individually + data = meta_plot, + method = "euclidean", + permutations = 999 + ) +beta_table_year1 <- permanova_year1 %>% + as.data.frame() %>% + rownames_to_column("parameter") %>% + mutate( + `R2 (%)` = glue("{round(R2 * 100, 2)}"), + P = as.character(`Pr(>F)`), + P = str_remove(P, "^0"), + parameter = str_to_title(parameter), + parameter = ifelse(parameter == "Condition", "SSC", ifelse( + parameter == "Age_s", "Age", ifelse( + parameter == "Condition:age_s", "SSC x Age", parameter + ))) + ) %>% + mutate(across(where(is.numeric), round, 2)) %>% + mutate_all(function(x) ifelse(is.na(x), "", x)) %>% + select(parameter, Df, SumOfSqs, "R2 (%)", F, P) + +colnames(beta_table_year1) <- str_to_title(colnames(beta_table_year1)) + +# fit and inspect model +permanova_year1 <- adonis2( + asv ~ bfexcl + condition + age_s, + # by = "margin", # each term analyzed individually + data = meta_plot, + method = "euclidean", + permutations = 999) + +beta_table_bf_year1 <- permanova_year1 %>% + as.data.frame() %>% + rownames_to_column("parameter") %>% + mutate( + `R2 (%)` = glue("{round(R2 * 100, 2)}"), + P = as.character(`Pr(>F)`), + P = str_remove(P, "^0"), + parameter = str_to_title(parameter), + parameter = ifelse(parameter == "Condition", "SSC", ifelse( + parameter == "Age_s", "Age", ifelse( + parameter == "Condition:age_s", "SSC x Age", ifelse( + parameter == "Bfexcl", "Breastfeeding", parameter + )))) + ) %>% + mutate(across(where(is.numeric), round, 2)) %>% + mutate_all(function(x) ifelse(is.na(x), "", x)) %>% + select(parameter, Df, SumOfSqs, "R2 (%)", F, P) + +colnames(beta_table_bf_year1) <- str_to_title(colnames(beta_table_bf_year1)) + +beta_table_year1 +beta_table_bf_year1 + + + + +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) +source("https://raw.githubusercontent.com/HenrikEckermann/in_use/master/mb_helper.R") + + +pseq <- makePhyloseqFromTreeSE(tse) +pseq_clr <- microbiome::transform(pseq, transform = "clr") +sample_data(pseq_clr) <- sd_to_df(pseq_clr) %>% + mutate( + Infancy = ifelse(week == 52, "Late", "Early"), + Condition = ifelse(condition == 1, "SSC", ifelse(condition == 0, "CAU", NA)) + ) %>% + df_to_sd() + + +# all samples +bp <- biplot( + pseq_clr, + color = "Condition", + point_size = 5, + otu_alpha = 0, + colors = c("#909090", "#000000"), + shape = "Infancy" +) + +save(bp, beta_table, beta_table_bf, beta_table_inf, beta_table_bf_inf, beta_table_year1, beta_table_bf_year1, file = here::here("data/beta_plot_table.Rds")) + + +sids <- colData(tidySummarizedExperiment::filter(tse, week != "52")) %>% rownames() +bp <- biplot( + pseq_clr, + color = "condition", + point_size = 5, + otu_alpha = 0, + colors = c("#909090", "#000000"), + filter_samples = sids +) + +bp[[1]] + +sids <- colData(tidySummarizedExperiment::filter(tse, week == "52")) %>% rownames() +bp <- biplot( + pseq_clr, + color = "condition", + point_size = 5, + otu_alpha = 0, + colors = c("#909090", "#000000"), + filter_samples = sids + ) + +bp[[2]] + + +sids <- colData(tidySummarizedExperiment::filter(tse, week != "52")) %>% rownames() +sids +filter <- dplyr::filter +bp_series <- biplot( + pseq_clr, + color = "condition", + point_size = 5, + otu_alpha = 0, + connect_series = "week", + subject_id = "skippy_id", + colors = c("#909090", "#000000"), + filter_samples = sids + ) + + +bp_series[[1]] + + + +# make also a plot with Bray Curtis + +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) + +library(scater) +# Bray-Curtis is usually applied to relative abundances +tse_ra <- transformCounts(tse, method = "relabundance") +tse_ra <- mutate( + tse_ra, + Infancy = ifelse(week == 52, "Late", "Early"), + Condition = factor(ifelse(condition == 1, "SSC", ifelse(condition == 0, "CAU", NA)), levels = c("SSC", "CAU")) +) +# Perform PCoA +tse_ra <- runMDS(tse_ra, FUN = vegan::vegdist, method = "bray", name = "PCoA_BC", exprs_values = "relabundance") +# Create ggplot object +p <- plotReducedDim( + tse_ra, + "PCoA_BC", + colour_by = "Condition", + shape_by = "Infancy", + point_size = 5, + point_alpha = 1, + theme_size = 25) +p +?"scater-plot-args" +plotReducedDim(tse_ra, "PCoA_BC", colour_by = "Condition", other_fields = list(point_size = 3)) + +# Add explained variance for each axis +e <- attr(reducedDim(tse, "PCoA_BC"), "eig"); +rel_eig <- e/sum(e[e>0]) +p <- p + labs(x = paste("PCoA 1 (", round(100 * rel_eig[[1]],1), "%", ")", sep = ""), + y = paste("PCoA 2 (", round(100 * rel_eig[[2]],1), "%", ")", sep = "")) + +print(p) + + + + + +############################################################################### +################################ DR analyses ################################## +############################################################################### + + +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) + +dr <- foreign::read.spss( + here::here("data/kelly_documents/data_itt_pp_dr.sav"), + to.data.frame = TRUE + ) %>% + select(skippy_id = ID, ITT, SSC = TotalSSCwk1wk5) %>% + mutate(SSC_s = scale(SSC)[, 1]) + +implist <- map(implist, function(imp) { + imp_new <- imp %>% + left_join( + select(dr, skippy_id, SSC_s), + by = "skippy_id") + #mice::complete(mice::mice(imp_new)) +}) + +permanovas <- map2_dfr(implist, 1:length(implist), function(dimp, imp) { + # Steps are repeated as from the beginning in the script above + + # step 1 + # we use Aitchison distance + tse <- agglomerateByRank(tse, rank = "genus") + tse_map <- transformSamples(x = tse, method = "clr", pseudocount = 1, name = "clr") + colData(tse_map) <- colData(tse_map) %>% + as.data.frame() %>% + select(sample_id) %>% + left_join( + select(dimp, condition, siblings, age, sample_id, week, skippy_id, SSC_s), + by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate( + age = age + as.numeric(as.character(week)), + age_s = scale(age)[, 1] + ) %>% + DataFrame() + + + # step 2 + + ## first all samples + # extract relevant meta data and omit na as adonis doesnt accept them. + meta <- colData(tse_map) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings, SSC_s) %>% + na.omit() + # we need to account for non-independence of data in the infancy model + ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id + h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) + # according to omitted NAs I need to select stool samples + asv <- t(assay(tse_map, "clr")) + asv <- asv[meta$sample_id, ] + + # fit and inspect model + permanova_all <- adonis2(asv ~ SSC_s + age_s + age_s:SSC_s, + # by = "margin", # each term analyzed individually + data = meta, + method = "euclidean", + permutations = 999 + ) + + ## first infancy + tse_inf <- filter(tse_map, week != 52) + # extract relevant meta data and omit na as adonis doesnt accept them. + meta <- colData(tse_inf) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings, SSC_s) %>% + na.omit() + # we need to account for non-independence of data in the infancy model + ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id + h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) + # according to omitted NAs I need to select stool samples + asv <- t(assay(tse_inf, "clr")) + asv <- asv[meta$sample_id, ] + + # fit and inspect model + permanova_inf <- adonis2(asv ~ SSC_s + age_s + SSC_s:age_s, + # by = "margin", # each term analyzed individually + data = meta, + method = "euclidean", + permutations = 999 + ) + + + ## then 1 year olds + tse_y <- filter(tse_map, week == 52) + # extract relevant meta data and omit na as adonis doesnt accept them. + meta <- colData(tse_y) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings, SSC_s) %>% + na.omit() + + # according to omitted NAs I need to select stool samples + asv <- t(assay(tse_y, "clr")) + asv <- asv[meta$sample_id, ] + + # fit and inspect model + permanova_y <- adonis2(asv ~ SSC_s + age_s, + # by = "margin", # each term analyzed individually + data = meta, + method = "euclidean", + permutations = 999 + ) + permanova_all$time <- "all" + permanova_inf$time <- "infancy" + permanova_y$time <- "year1" + permanova <- bind_rows( + as.data.frame(permanova_all) %>% rownames_to_column("parameter"), + as.data.frame(permanova_inf) %>% rownames_to_column("parameter"), + as.data.frame(permanova_y) %>% rownames_to_column("parameter") + ) + permanova$imp <- imp + + #list(infancy = permanova_inf, year1 = permanova_y, imp = imp) + permanova +}) +permanovas diff --git a/R/beta_diversity_bray.R b/R/beta_diversity_bray.R new file mode 100644 index 0000000..20552f8 --- /dev/null +++ b/R/beta_diversity_bray.R @@ -0,0 +1,794 @@ +# thanks to Gavin for helping out with this part: +# https://stats.stackexchange.com/questions/590510/repeated-measures-permanova-nowhere-to-find + +set.seed(1) +library(mia) +library(tidyverse) +library(tidySummarizedExperiment) +library(vegan) +library(permute) + + + +# import of biomfile and meta data can be found in the import script +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) +tse <- agglomerateByRank(tse, rank = "genus") +tse <- transformSamples(x = tse, method = "relabundance", pseudocount = 1, name = "relabundance") +# add metadata to tse +colData(tse) <- colData(tse) %>% + as.data.frame() %>% + left_join(select(d, age, sample_id), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + DataFrame() + +colData(tse)$age <- colData(tse)$age + as.numeric(as.character(colData(tse)$week)) * 7 +colData(tse)$age_s <- scale(colData(tse)$age)[, 1] + +############################################################################### +######################### 1. ITT ############################## +############################################################################### + + +####################### 1.1 Complete Case Analysis ############################ + +### First I fit a model to all samples + + +# extract relevant meta data and omit na as adonis doesnt accept them. +meta <- colData(tse) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings) %>% + na.omit() +# we need to account for non-independence of data in the infancy model +ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id +h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) +# according to omitted NAs I need to select stool samples +asv <- t(assay(tse, "counts")) +asv <- asv[rownames(asv) %in% meta$sample_id, ] + +# fit and inspect model +permanova <- adonis2(asv ~ condition + age_s + condition:age_s, + # by = "margin", # each term analyzed individually + data = meta, + method = "bray", + # does not work if trend is in data (therefore use 999) + permutations = 999 + ) + +permanova + +# Perform dbRDA +dbrda <- dbrda(asv ~ condition + age_s, data = meta) +# Perform permutational analysis +permanova2 <- anova.cca(dbrda, + by = "margin", # each term analyzed individually + method = "bray", + permutations = 999) +permanova2 + + + +### Now split models by infancy and 1 year olds + +## first infancy +tse_inf <- filter(tse, week != 52) +# extract relevant meta data and omit na as adonis doesnt accept them. +meta <- colData(tse_inf) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings) %>% + na.omit() +# we need to account for non-independence of data in the infancy model +ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id +h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) +# according to omitted NAs I need to select stool samples +asv <- t(assay(tse_inf, "counts")) +asv <- asv[rownames(asv) %in% meta$sample_id, ] + +# fit and inspect model +permanova <- adonis2(asv ~ condition + age_s + condition:age_s, + # by = "margin", # each term analyzed individually + data = meta, + method = "bray", + permutations = 999 + ) + +permanova + +# Perform dbRDA +dbrda <- dbrda(asv ~ condition + age_s, data = meta) +# Perform permutational analysis +permanova2 <- anova.cca(dbrda, + by = "margin", # each term analyzed individually + method = "bray", + permutations = 999) +permanova2 + + + + +## then 1 year olds +tse_y <- filter(tse, week == 52) +# extract relevant meta data and omit na as adonis doesnt accept them. +meta <- colData(tse_y) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings) %>% + na.omit() + +# according to omitted NAs I need to select stool samples +asv <- t(assay(tse_y, "counts")) +asv <- asv[rownames(asv) %in% meta$sample_id, ] + +# fit and inspect model +permanova <- adonis2(asv ~ condition + age_s + condition:age_s, + # by = "margin", # each term analyzed individually + data = meta, + method = "bray", + permutations = 999 + ) + +permanova + +# Perform dbRDA +dbrda <- dbrda(asv ~ condition + age_s, data = meta) +# Perform permutational analysis +permanova2 <- anova.cca(dbrda, + by = "margin", # each term analyzed individually + method = "bray", + permutations = 999) +permanova2 + + + + +######################## 1.2 Multiple imputation ############################# + +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) +tse <- agglomerateByRank(tse, rank = "genus") +permanovas <- map2_dfr(implist, 1:length(implist), function(dimp, imp) { + # Steps are repeated as from the beginning in the script above + tse_map <- transformSamples(x = tse, method = "relabundance", pseudocount = 1, name = "relabundance") + # step 1 + colData(tse_map) <- colData(tse_map) %>% + as.data.frame() %>% + select(sample_id) %>% + left_join( + select(dimp, condition, siblings, age, sample_id, week, skippy_id), + by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate( + age = age + as.numeric(as.character(week)), + age_s = scale(age)[, 1] + ) %>% + DataFrame() + + ## first all samples + # extract relevant meta data and omit na as adonis doesnt accept them. + meta <- colData(tse_map) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings) %>% + na.omit() + # we need to account for non-independence of data in the infancy model + ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id + h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) + # according to omitted NAs I need to select stool samples + asv <- t(assay(tse_map, "counts")) + asv <- asv[meta$sample_id, ] + + # fit and inspect model + permanova_all <- adonis2(asv ~ condition + age_s + condition:age_s, + # by = "margin", # each term analyzed individually + data = meta, + method = "bray", + permutations = 999 + ) + + + # step 2 + ## first infancy + tse_inf <- filter(tse_map, week != 52) + # extract relevant meta data and omit na as adonis doesnt accept them. + meta <- colData(tse_inf) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings) %>% + na.omit() + # we need to account for non-independence of data in the infancy model + ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id + h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) + # according to omitted NAs I need to select stool samples + asv <- t(assay(tse_inf, "counts")) + asv <- asv[rownames(asv) %in% meta$sample_id, ] + + # fit and inspect model + permanova_inf <- adonis2(asv ~ condition + age_s + condition:age_s, + # by = "margin", # each term analyzed individually + data = meta, + method = "bray", + permutations = 999 + ) + + + ## then 1 year olds + tse_y <- filter(tse_map, week == 52) + # extract relevant meta data and omit na as adonis doesnt accept them. + meta <- colData(tse_y) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings) %>% + na.omit() + + # according to omitted NAs I need to select stool samples + asv <- t(assay(tse_y, "counts")) + asv <- asv[rownames(asv) %in% meta$sample_id, ] + + # fit and inspect model + permanova_y <- adonis2(asv ~ condition + age_s + condition:age_s, + # by = "margin", # each term analyzed individually + data = meta, + method = "bray", + permutations = 999 + ) + + permanova_all$time <- "all" + permanova_inf$time <- "infancy" + permanova_y$time <- "year1" + permanova <- bind_rows( + as.data.frame(permanova_all) %>% rownames_to_column("parameter"), + as.data.frame(permanova_inf) %>% rownames_to_column("parameter"), + as.data.frame(permanova_y) %>% rownames_to_column("parameter") + ) + permanova$imp <- imp + permanova + #list(infancy = permanova_inf, year1 = permanova_y, imp = imp) +}) + +permanovas + + + + +######################## 1.2 WITH BREASTFEEDING ############################# + + + +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) + +permanovas_bf <- map2_dfr(implist, 1:length(implist), function(dimp, imp) { + # Steps are repeated as from the beginning in the script above + + # step 1 + # we use Aitchison distance + tse_map <- agglomerateByRank(tse, rank = "genus") + tse_map <- transformSamples(x = tse_map, method = "relabundance", pseudocount = 1, name = "relabundance") + colData(tse_map) <- colData(tse_map) %>% + as.data.frame() %>% + select(sample_id) %>% + left_join( + select( + dimp, condition, siblings, age, + sample_id, + week, skippy_id, bfexcl), + by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate( + age = age + as.numeric(as.character(week)), + age_s = scale(age)[, 1] + ) %>% + DataFrame() + + + # step 2 + + ## first all samples + # extract relevant meta data and omit na as adonis doesnt accept them. + meta <- colData(tse_map) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings, + bfexcl) %>% + na.omit() + # we need to account for non-independence of data in the infancy model + ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id + h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) + # according to omitted NAs I need to select stool samples + asv <- t(assay(tse_map, "counts")) + asv <- asv[meta$sample_id, ] + + # fit and inspect model + permanova_all <- adonis2(asv ~ bfexcl + siblings + condition + age_s + condition:age_s, + # by = "margin", # each term analyzed individually + data = meta, + method = "euclidean", + permutations = 999, + by = "margin" + ) + + + ## first infancy + tse_inf <- filter(tse_map, week != 52) + # extract relevant meta data and omit na as adonis doesnt accept them. + meta <- colData(tse_inf) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings) %>% + na.omit() + # we need to account for non-independence of data in the infancy model + ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id + h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) + # according to omitted NAs I need to select stool samples + asv <- t(assay(tse_inf, "counts")) + asv <- asv[meta$sample_id, ] + + # fit and inspect model + permanova_inf <- adonis2(asv ~ condition + age_s + condition:age_s, + # by = "margin", # each term analyzed individually + data = meta, + method = "euclidean", + permutations = 999 + ) + + ## then 1 year olds + tse_y <- filter(tse_map, week == 52) + # extract relevant meta data and omit na as adonis doesnt accept them. + meta <- colData(tse_y) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings) %>% + na.omit() + + # according to omitted NAs I need to select stool samples + asv <- t(assay(tse_y, "counts")) + asv <- asv[meta$sample_id, ] + + # fit and inspect model + permanova_y <- adonis2(asv ~ condition + age_s, + # by = "margin", # each term analyzed individually + data = meta, + method = "euclidean", + permutations = 999 + ) + permanova_all$time <- "all" + permanova_inf$time <- "infancy" + permanova_y$time <- "year1" + permanova <- bind_rows( + as.data.frame(permanova_all) %>% rownames_to_column("parameter"), + as.data.frame(permanova_inf) %>% rownames_to_column("parameter"), + as.data.frame(permanova_y) %>% rownames_to_column("parameter") + ) + permanova$imp <- imp + permanova + # #list(infancy = permanova_inf, year1 = permanova_y, imp = imp) +}) +permanovas_bf + + + + + + + + + + + + + + + + + + + + + +############################################################################### +######################### 2. PP ############################## +############################################################################### + +# import of biomfile and meta data can be found in the import script +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) +tse <- agglomerateByRank(tse, rank = "genus") +tse <- transformSamples(x = tse, method = "relabundance", pseudocount = 1, name = "relabundance") + +# obtain ids that were selected for PP analyses +pp_indicator <- foreign::read.spss(here::here("data/raw_data/kelly141022/Data_ITT_PP_ExploratoryDRselections.sav"), to.data.frame = TRUE) +count(pp_indicator, PP, SSC) +pp_indicator <- select(pp_indicator, skippy_id = ID, pp = PP) +# add pp info to existing data +if (!"pp" %in% colnames(d)) { + d <- left_join(d, pp_indicator, by = "skippy_id") +} +# 60 that are in PP and condition 0; 18 that are condition 1 and pp. Fits... +count(d, condition, pp) +d_pp <- filter(d, pp == 1) +implist_pp <- map(implist, function(dimp) { + dimp_new <- left_join(dimp, pp_indicator, by = "skippy_id") %>% + filter(pp == 1) + dimp_new +}) + +# add metadata to tse +colData(tse) <- colData(tse) %>% + as.data.frame() %>% + left_join(select(d_pp, age, sample_id, pp), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + DataFrame() + +colData(tse)$age <- colData(tse)$age + as.numeric(as.character(colData(tse)$week)) * 7 +colData(tse)$age_s <- scale(colData(tse)$age)[, 1] +tse_pp <- filter(tse, pp == 1) + +####################### 2.1 Complete Case Analysis ############################ + +### First I fit a model to all samples + + + +# extract relevant meta data and omit na as adonis doesnt accept them. +meta <- colData(tse_pp) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings) %>% + na.omit() +# we need to account for non-independence of data in the infancy model +ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id +h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) +# according to omitted NAs I need to select stool samples +asv <- t(assay(tse_pp, "counts")) +asv <- asv[rownames(asv) %in% meta$sample_id, ] + +# fit and inspect model +permanova <- adonis2(asv ~ condition + age_s + condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings, + # by = "margin", # each term analyzed individually + data = meta, + method = "bray", + # does not work if trend is in data (therefore use 999) + permutations = 999 + ) + +permanova + +# Perform dbRDA +dbrda <- dbrda(asv ~ condition + age_s + condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings, data = meta) +# Perform permutational analysis +permanova2 <- anova.cca(dbrda, + by = "margin", # each term analyzed individually + method = "bray", + permutations = 999) +permanova2 + + + +### Now split models by infancy and 1 year olds + +## first infancy +tse_inf <- filter(tse_pp, week != 52) +# extract relevant meta data and omit na as adonis doesnt accept them. +meta <- colData(tse_inf) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings) %>% + na.omit() +# we need to account for non-independence of data in the infancy model +ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id +h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) +# according to omitted NAs I need to select stool samples +asv <- t(assay(tse_inf, "counts")) +asv <- asv[rownames(asv) %in% meta$sample_id, ] + +# fit and inspect model +permanova <- adonis2(asv ~ condition + age_s + condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings, + # by = "margin", # each term analyzed individually + data = meta, + method = "bray", + permutations = 999 + ) + +permanova + +# Perform dbRDA +dbrda <- dbrda(asv ~ condition + age_s + condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings, data = meta) +# Perform permutational analysis +permanova2 <- anova.cca(dbrda, + by = "margin", # each term analyzed individually + method = "bray", + permutations = 999) +permanova2 + + + + +## then 1 year olds +tse_y <- filter(tse_pp, week == 52) +# extract relevant meta data and omit na as adonis doesnt accept them. +meta <- colData(tse_y) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings) %>% + na.omit() + +# according to omitted NAs I need to select stool samples +asv <- t(assay(tse_y, "counts")) +asv <- asv[rownames(asv) %in% meta$sample_id, ] + +# fit and inspect model +permanova <- adonis2(asv ~ condition + age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings, + # by = "margin", # each term analyzed individually + data = meta, + method = "bray", + permutations = 999 + ) + +permanova + +# Perform dbRDA +dbrda <- dbrda(asv ~ condition + age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings, data = meta) +# Perform permutational analysis +permanova2 <- anova.cca(dbrda, + by = "margin", # each term analyzed individually + method = "bray", + permutations = 999) +permanova2 + +# no effects in PP either! + + +######################## 2.2 Multiple imputation ############################# + +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) +tse <- agglomerateByRank(tse, rank = "genus") +permanovas_pp <- map2_dfr(implist_pp, 1:length(implist_pp), function(dimp, imp) { + # Steps are repeated as from the beginning in the script above + + # step 1 + # we use Aitchison distance + tse_map <- transformSamples(x = tse, method = "relabundance", pseudocount = 1, name = "relabundance") + colData(tse_map) <- colData(tse_map) %>% + as.data.frame() %>% + select(sample_id) %>% + left_join( + select(dimp, condition, siblings, age, birthweight_s, ges_age_s, edlevel_s, csection, sex, sample_id, week, skippy_id, pp), + by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate( + age = age + as.numeric(as.character(week)), + age_s = scale(age)[, 1] + ) %>% + DataFrame() + + tse_map <- filter(tse_map, pp == 1) + + + # step 2 + ## first all samples + # extract relevant meta data and omit na as adonis doesnt accept them. + meta <- colData(tse_map) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings, + birthweight_s, ges_age_s, + edlevel_s, csection, sex) %>% + na.omit() + # we need to account for non-independence of data in the infancy model + ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id + h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) + # according to omitted NAs I need to select stool samples + asv <- t(assay(tse_map, "counts")) + asv <- asv[meta$sample_id, ] + + # fit and inspect model + permanova_all <- adonis2(asv ~ siblings + condition + age_s + condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex, + # by = "margin", # each term analyzed individually + data = meta, + method = "bray", + permutations = 999 + ) + + ## then infancy + tse_inf <- filter(tse_map, week != 52) + # extract relevant meta data and omit na as adonis doesnt accept them. + meta <- colData(tse_inf) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings, + birthweight_s, ges_age_s, + edlevel_s, csection, sex) %>% + na.omit() + # we need to account for non-independence of data in the infancy model + ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id + h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) + # according to omitted NAs I need to select stool samples + asv <- t(assay(tse_inf, "counts")) + asv <- asv[meta$sample_id, ] + + # fit and inspect model + permanova_inf <- adonis2(asv ~ siblings + condition + age_s + condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex, + # by = "margin", # each term analyzed individually + data = meta, + method = "bray", + permutations = 999 + ) + + + ## then 1 year olds + tse_y <- filter(tse_map, week == 52) + # extract relevant meta data and omit na as adonis doesnt accept them. + meta <- colData(tse_y) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings, + birthweight_s, ges_age_s, + edlevel_s, csection, sex) %>% + na.omit() + + # according to omitted NAs I need to select stool samples + asv <- t(assay(tse_y, "counts")) + asv <- asv[meta$sample_id, ] + + # fit and inspect model + permanova_y <- adonis2(asv ~ siblings + condition + age_s + condition:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex, + # by = "margin", # each term analyzed individually + data = meta, + method = "bray", + permutations = 999 + ) + permanova_all$time <- "all" + permanova_inf$time <- "infancy" + permanova_y$time <- "year1" + permanova <- bind_rows( + as.data.frame(permanova_all) %>% rownames_to_column("parameter"), + as.data.frame(permanova_inf) %>% rownames_to_column("parameter"), + as.data.frame(permanova_y) %>% rownames_to_column("parameter") + ) + permanova$imp <- imp + permanova + #list(infancy = permanova_inf, year1 = permanova_y, imp = imp) +}) + +permanovas_pp + + + + + +############################################################################### +################################ DR analyses ################################## +############################################################################### + + +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) + +dr <- foreign::read.spss( + here::here("data/kelly_documents/data_itt_pp_dr.sav"), + to.data.frame = TRUE + ) %>% + select(skippy_id = ID, ITT, SSC = TotalSSCwk1wk5) %>% + mutate(SSC_s = scale(SSC)[, 1]) + +implist <- map(implist, function(imp) { + imp_new <- imp %>% + left_join( + select(dr, skippy_id, SSC_s), + by = "skippy_id") + #mice::complete(mice::mice(imp_new)) +}) + +permanovas <- map2_dfr(implist, 1:length(implist), function(dimp, imp) { + # Steps are repeated as from the beginning in the script above + tse_map <- transformSamples(x = tse, method = "relabundance", pseudocount = 1, name = "relabundance") + # step 1 + colData(tse_map) <- colData(tse_map) %>% + as.data.frame() %>% + select(sample_id) %>% + left_join( + select(dimp, condition, siblings, age, sample_id, week, skippy_id, SSC_s), + by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate( + age = age + as.numeric(as.character(week)), + age_s = scale(age)[, 1] + ) %>% + DataFrame() + + ## first all samples + # extract relevant meta data and omit na as adonis doesnt accept them. + meta <- colData(tse_map) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings, + SSC_s) %>% + na.omit() + # we need to account for non-independence of data in the infancy model + ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id + h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) + # according to omitted NAs I need to select stool samples + asv <- t(assay(tse_map, "counts")) + asv <- asv[meta$sample_id, ] + + # fit and inspect model + permanova_all <- adonis2(asv ~ SSC_s + age_s + SSC_s:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings, + # by = "margin", # each term analyzed individually + data = meta, + method = "bray", + permutations = 999 + ) + + + # step 2 + ## first infancy + tse_inf <- filter(tse_map, week != 52) + # extract relevant meta data and omit na as adonis doesnt accept them. + meta <- colData(tse_inf) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings, + SSC_s) %>% + na.omit() + # we need to account for non-independence of data in the infancy model + ids <- meta %>% mutate(skippy_id = as.factor(skippy_id)) %>% + .$skippy_id + h <- how(plots = Plots(strata = ids, type = "none"), + nperm = 999) + # according to omitted NAs I need to select stool samples + asv <- t(assay(tse_inf, "counts")) + asv <- asv[rownames(asv) %in% meta$sample_id, ] + + # fit and inspect model + permanova_inf <- adonis2(asv ~ SSC_s + age_s + SSC_s:age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings, + # by = "margin", # each term analyzed individually + data = meta, + method = "bray", + permutations = 999 + ) + + + ## then 1 year olds + tse_y <- filter(tse_map, week == 52) + # extract relevant meta data and omit na as adonis doesnt accept them. + meta <- colData(tse_y) %>% as.data.frame() %>% + rownames_to_column("sample_id") %>% + select(sample_id, skippy_id, condition, age_s, siblings, + SSC_s) %>% + na.omit() + + # according to omitted NAs I need to select stool samples + asv <- t(assay(tse_y, "counts")) + asv <- asv[rownames(asv) %in% meta$sample_id, ] + + # fit and inspect model + permanova_y <- adonis2(asv ~ SSC_s + age_s + birthweight_s + ges_age_s + edlevel_s + csection + sex + siblings, + # by = "margin", # each term analyzed individually + data = meta, + method = "bray", + permutations = 999 + ) + + permanova_all$time <- "all" + permanova_inf$time <- "infancy" + permanova_y$time <- "year1" + permanova <- bind_rows( + as.data.frame(permanova_all) %>% rownames_to_column("parameter"), + as.data.frame(permanova_inf) %>% rownames_to_column("parameter"), + as.data.frame(permanova_y) %>% rownames_to_column("parameter") + ) + permanova$imp <- imp + permanova + #list(infancy = permanova_inf, year1 = permanova_y, imp = imp) +}) + +permanovas diff --git a/R/bibo_import_biom.R b/R/bibo_import_biom.R new file mode 100644 index 0000000..ca2b57b --- /dev/null +++ b/R/bibo_import_biom.R @@ -0,0 +1,41 @@ +# options(repr.plot.width=15, repr.plot.height=15) + +library(tidyverse) +library(mia) +library(glue) +library(tidySummarizedExperiment) +library(miaViz) +library(scater) +library(readxl) + +# list.files("data/raw_data") +# biom_file <- here::here("data/raw_data/bibo_SILVA_138.biom") +# file.exists(biom_file) +# # because of some issue in NGT2 we need to replace NaN in the biom file with 0 +# biom_lines <- read_lines(biom_file) +# sum(str_count(biom_lines, "NaN")) +# biom_lines <- str_replace_all(biom_lines, "NaN", "0.0") +# sum(str_count(biom_lines, "NaN")) +# write_lines( +# biom_lines, +# here::here("data/raw_data/bibo_SILVA_138_rpl.biom") +# ) + +# now we should be able to use read_biom +biom_file <- here::here("data/raw_data/bibo_SILVA_138_rpl.biom") +bibo <- loadFromBiom(biom_file) +colnames(rowData(bibo)) <- c( + "kingdom", "phylum", "class", "order", "family", "genus", "species" +) +colData(bibo)[["sample_id"]] <- rownames(colData(bibo)) +tse_bibo <- filter(bibo, !str_detect(sample_id, "MOCK")) + + +rns <- str_c(rowData(tse_bibo)$family, rowData(tse_bibo)$genus, sep = "_") +rns[duplicated(rns)] +rownames(tse_bibo) <- rns +rownames(tse_bibo) + + +save(tse_bibo, file = here::here("data/tse_bibo.Rds")) + diff --git a/R/bingo_import_biom.R b/R/bingo_import_biom.R new file mode 100644 index 0000000..05e5f6d --- /dev/null +++ b/R/bingo_import_biom.R @@ -0,0 +1,98 @@ +# options(repr.plot.width=15, repr.plot.height=15) + +library(tidyverse) +library(mia) +library(glue) +library(tidySummarizedExperiment) +library(miaViz) +library(scater) +library(readxl) + + + +### BINGO 1 + +# biom_file <- here::here("data/raw_data/bingo_infancy_SILVA_138.biom") +# file.exists(biom_file) +# # because of some issue in NGT2 we need to replace NaN in the biom file with 0 +# biom_lines <- read_lines(biom_file) +# sum(str_count(biom_lines, "NaN")) +# biom_lines <- str_replace_all(biom_lines, "NaN", "0.0") +# sum(str_count(biom_lines, "NaN")) +# write_lines( +# biom_lines, +# here::here("data/raw_data/bingo_infancy_SILVA_138_rpl.biom") +# ) + +# now we should be able to use read_biom +biom_file <- here::here("data/raw_data/bingo_infancy_SILVA_138_rpl.biom") + + +xl <- read_excel("bingo_infancy_mapping.xlsx", sheet = "baby metadata good samples") +colnames(xl) +sample_data <- select( + xl, + sample_id = `#SampleID`, + id = ID_participants, + age = Actual_Age_Collection_days) +bingo1 <- loadFromBiom(biom_file) + +# use practical names for the ranks instead +colnames(rowData(bingo1)) <- c( + "kingdom", "phylum", "class", "order", "family", "genus", "species" +) + +# add sample ids as column +colData(bingo1)[["sample_id"]] <- rownames(colData(bingo1)) +colData(bingo1) <- colData(bingo1) %>% as.data.frame() %>% + left_join(sample_data, by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate(sample_id = rownames(colData(bingo1))) %>% + DataFrame() + + +tse_bingo1 <- filter(bingo1, !str_detect(sample_id, "MOCK")) +rns <- str_c(rowData(tse_bingo1)$family, rowData(tse_bingo1)$genus, sep = "_") +rns[duplicated(rns)] +rownames(tse_bingo1) <- rns +save(tse_bingo1, file = here::here("data/tse_bingo1.Rds")) + + + +### BINGO 2 + + +# biom_file <- here::here("data/raw_data/bingo_year13_SILVA_138.biom") +# file.exists(biom_file) +# # because of some issue in NGT2 we need to replace NaN in the biom file with 0 +# biom_lines <- read_lines(biom_file) +# sum(str_count(biom_lines, "NaN")) +# biom_lines <- str_replace_all(biom_lines, "NaN", "0.0") +# sum(str_count(biom_lines, "NaN")) +# write_lines( +# biom_lines, +# here::here("data/raw_data/bingo_year13_SILVA_138_rpl.biom") +# ) + +# now we should be able to use read_biom +biom_file <- here::here("data/raw_data/bingo_year13_SILVA_138_rpl.biom") +bingo2 <- loadFromBiom(biom_file) +colnames(rowData(bingo2)) <- c( + "kingdom", "phylum", "class", "order", "family", "genus", "species" +) + + +# add sample ids as column +colData(bingo2)[["sample_id"]] <- rownames(colData(bingo2)) +colData(bingo2) <- colData(bingo2) %>% as.data.frame() %>% + left_join(sample_data, by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate(sample_id = rownames(colData(bingo2))) %>% + DataFrame() + + +tse_bingo2 <- filter(bingo2, !str_detect(sample_id, "MOCK")) +rns <- str_c(rowData(tse_bingo2)$family, rowData(tse_bingo2)$genus, sep = "_") +rns[duplicated(rns)] +rownames(tse_bingo2) <- rns +save(tse_bingo2, file = here::here("data/tse_bingo2.Rds")) \ No newline at end of file diff --git a/R/daa_ancombc_bez.R b/R/daa_ancombc_bez.R new file mode 100644 index 0000000..2effe1d --- /dev/null +++ b/R/daa_ancombc_bez.R @@ -0,0 +1,1461 @@ +set.seed(1) +library(mia) +library(ANCOMBC) +library(tidyverse) +library(tidySummarizedExperiment) +library(glue) + + + + +############################################################################### +######################### 1. ITT ############################## +############################################################################### + +# import of biomfile and meta data can be found in the import script +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) + +# for analyses we apply prevalence fitlering and analyze at genus level +tse <- agglomerateByRank(tse, rank = "genus") +tse <- subsetByPrevalentTaxa(tse, detection = 0.001, prevalence = 0.1) +fvars <- c("siblings", "condition") +# add metadata to tse +colData(tse) <- colData(tse) %>% + as.data.frame() %>% + select(-siblings) %>% + left_join(select(d, age, sample_id, siblings), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate(across(all_of(fvars), function(x) as.factor(x))) %>% + DataFrame() +colData(tse)$age <- colData(tse)$age + as.numeric(colData(tse)$week) * 7 +colData(tse)$age_s <- scale(colData(tse)$age)[, 1] + +####################### 1.1 Complete Case Analysis ############################ + + + +# model includes random intercepts and all samples +if (!file.exists(here::here("data/daa_cc_mlm.Rds"))) { + output <- ancombc2( + data = tse, + assay_name = "counts", + tax_level = "genus", + fix_formula = "condition * age_s", + rand_formula = "(1 | skippy_id)", + p_adj_method = "holm", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.2, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl() + ) + save(output, file = here::here("data/daa_cc_mlm.Rds")) + } else { + load(here::here("data/daa_cc_mlm.Rds")) +} + +# structural zeros +tab_zero = output$zero_ind +sum(tab_zero[, 2]) + sum(tab_zero[, 3]) +# sensitivity scores +tab_sens = output$pseudo_sens_tab +head(tab_sens) +pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + +res_prim <- output$res +colnames(res_prim) +effects <- c("age_s", "condition1", "condition1:age_s") +map(effects, function(effect) { + select(res_prim, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) +}) + +res_prim %>% select(taxon, contains("condition1")) %>% + arrange(q_condition1, p_condition1) %>% + head(15) + + +# model includes random intercepts and excludes 1 year samples +if (!file.exists(here::here("data/daa_cc_mlm_infancy.Rds"))) { + output <- ancombc2( + data = filter(tse, week != 52), + assay_name = "counts", + tax_level = "genus", + fix_formula = "condition * age_s", + rand_formula = "(1 | skippy_id)", + p_adj_method = "holm", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.2, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl() + ) + save(output, file = here::here("data/daa_cc_mlm_infancy.Rds")) + } else { + load(here::here("data/daa_cc_mlm_infancy.Rds")) +} + +# structural zeros +tab_zero = output$zero_ind +sum(tab_zero[, 2]) + sum(tab_zero[, 3]) +# sensitivity scores +tab_sens = output$pseudo_sens_tab +head(tab_sens) +pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + +res_prim <- output$res +colnames(res_prim) +effects <- c("age_s", "condition1", "condition1:age_s") +map(effects, function(effect) { + select(res_prim, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) +}) + +res_prim %>% select(taxon, contains("condition1")) %>% + arrange(p_condition1) %>% + head(15) + +# model includes 1 years samples only +if (!file.exists(here::here("data/daa_cc_year1.Rds"))) { + output <- ancombc2( + data = filter(tse, week == 52), + assay_name = "counts", + tax_level = "genus", + fix_formula = "condition * age_s", + # rand_formula = "(1 | skippy_id)", + p_adj_method = "holm", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.2, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl() + ) + save(output, file = here::here("data/daa_cc_year1.Rds")) + } else { + load(here::here("data/daa_cc_year1.Rds")) + } + +# structural zeros +tab_zero = output$zero_ind +sum(tab_zero[, 2]) + sum(tab_zero[, 3]) + # sensitivity scores +tab_sens = output$pseudo_sens_tab +head(tab_sens) +pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + +res_prim <- output$res +colnames(res_prim) +effects <- c("age_s", "condition1", "condition1:age_s") +map(effects, function(effect) { + select(res_prim, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) +}) + + + + + +######################## 1.2 Multiple imputation ############################# + + + +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) +# for analyses we apply prevalence fitlering +tse <- agglomerateByRank(tse, rank = "genus") +tse <- subsetByPrevalentTaxa(tse, detection = 0.001, prevalence = 0.1) + +models_imp <- map2(implist, length(implist), function(dimp, imp) { + tse_map <- tse + fvars <- c("siblings", "condition") + # add metadata to tse + colData(tse_map) <- colData(tse_map) %>% + as.data.frame() %>% + select(-siblings) %>% + left_join(select(dimp, age, sample_id, siblings), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate(across(all_of(fvars), function(x) as.factor(x))) %>% + DataFrame() + colData(tse_map)$age <- colData(tse_map)$age + as.numeric(colData(tse_map)$week) * 7 + colData(tse_map)$age_s <- scale(colData(tse_map)$age)[, 1] + + + # model includes random intercepts and excludes 1 year samples + if (!file.exists(here::here(glue("data/daa_cc_mlm_all_imp{imp}2_bez.Rds")))) { + output <- ancombc2( + data = filter(tse_map), + assay_name = "counts", + tax_level = "genus", + fix_formula = "condition * age_s", + rand_formula = "(1 | skippy_id)", + p_adj_method = "holm", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.2, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl(), + mdfdr_control = list(fwer_ctrl_method = "fdr", B = 100), + trend_control = list( + contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), + node = list(2), + solver = "ECOS",B = 100) + ) + save(output, file = here::here(glue("data/daa_cc_mlm_all_imp{imp}2_bez.Rds"))) + } else { + load(here::here(glue("data/daa_cc_mlm_all_imp{imp}2_bez.Rds"))) + } + + # structural zeros + tab_zero_all = output$zero_ind + # sensitivity scores + tab_sens_all = output$pseudo_sens_tab + sens_scores_all <- pivot_longer(tab_sens_all, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + res_prim_all <- output$res + + effects_all <- map(effects, function(effect) { + select(res_prim_all, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) + }) + + + + # model includes random intercepts and excludes 1 year samples + if (!file.exists(here::here(glue("data/daa_cc_mlm_infancy_imp{imp}_bez.Rds")))) { + output <- ancombc2( + data = filter(tse_map, week != 52), + assay_name = "counts", + tax_level = "genus", + fix_formula = "condition * age_s", + rand_formula = "(1 | skippy_id)", + p_adj_method = "holm", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.2, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl(), + mdfdr_control = list(fwer_ctrl_method = "fdr", B = 100), + trend_control = list( + contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), + node = list(2), + solver = "ECOS",B = 100) + ) + save(output, file = here::here(glue("data/daa_cc_mlm_infancy_imp{imp}_bez.Rds"))) + } else { + load(here::here(glue("data/daa_cc_mlm_infancy_imp{imp}_bez.Rds"))) + } + + # structural zeros + tab_zero_infancy = output$zero_ind + # sensitivity scores + tab_sens_infancy = output$pseudo_sens_tab + sens_scores_infancy <- pivot_longer(tab_sens_infancy, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + res_prim_infancy <- output$res + + effects_infancy <- map(effects, function(effect) { + select(res_prim_infancy, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) + }) + + + # model includes 1 years samples only + if (!file.exists(here::here(glue("data/daa_cc_year1_imp{imp}_bez.Rds")))) { + output <- ancombc2( + data = filter(tse_map, week == 52), + assay_name = "counts", + tax_level = "genus", + fix_formula = "condition * age_s", + # rand_formula = "(1 | skippy_id)", + p_adj_method = "holm", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.2, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl(), + mdfdr_control = list(fwer_ctrl_method = "fdr", B = 100), + trend_control = list( + contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), + node = list(2), + solver = "ECOS",B = 100) + ) + save(output, file = here::here(glue("data/daa_cc_year1_imp{imp}_bez.Rds"))) + } else { + load(here::here(glue("data/daa_cc_year1_imp{imp}_bez.Rds"))) + } + + # structural zeros + tab_zero_year1 = output$zero_ind + # sensitivity scores + tab_sens_year1 = output$pseudo_sens_tab + sens_scores_year1 <- pivot_longer(tab_sens_year1, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + res_prim_year1 <- output$res + effects <- c("age_s", "condition1") + effects_year1 <- map(effects, function(effect) { + select(res_prim_year1, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) + }) + + list( + all = list( + tab_zero_all, + sens_scores_all, + res_prim_all, + effects_all + ), + infancy = list( + tab_zero_infancy, + sens_scores_infancy, + res_prim_infancy, + effects_infancy + ), + year1 = list( + tab_zero_year1, + sens_scores_year1, + res_prim_year1, + effects_year1 + ) + ) +}) + +# you must have effects defined before running the map command +models_imp[[1]]$all$effects_all +models_imp[[1]]$infancy$effects_infancy + +ancom_tables_itt <- models_imp[[1]]$all[[3]] %>% + select(taxon, lfc_condition1, se_condition1, p_condition1, q_condition1) %>% + arrange(q_condition1, desc(abs(lfc_condition1))) %>% + mutate(across(where(is.numeric), round, 3)) +colnames(ancom_tables_itt) <- str_remove(colnames(ancom_tables_itt), "_condition1") + +ancom_tables_itt2 <- models_imp[[1]]$all[[3]] %>% + select(taxon, "lfc_condition1:age_s", "se_condition1:age_s", "p_condition1:age_s", "q_condition1:age_s") %>% + arrange(`q_condition1:age_s`, desc(abs(`lfc_condition1:age_s`))) %>% + mutate(across(where(is.numeric), round, 3)) + +colnames(ancom_tables_itt2) <- str_remove(colnames(ancom_tables_itt2), "_condition1:age_s") + +save(ancom_tables_itt, file = here::here("data/ancom_tables_itt.Rds")) + +ancom_tables_itt +ancom_tables_itt2 +# ancom did not identify any as significant but effect sizes are in line + + + + + + + +t1 <- select( + models_imp[[1]]$all[[3]], + taxon, + lfc_age_s) %>% + pivot_longer( + contains("lfc"), + names_to = "variable", + values_to = "lfc", + names_prefix = "lfc_") + + +t2 <- select( + models_imp[[1]]$all[[3]], + taxon, + p_age_s + ) %>% + pivot_longer( + contains("p_"), + names_to = "variable", + values_to = "p", + names_prefix = "p_") +t2 +t3 <- select( + models_imp[[1]]$all[[3]], + taxon, + q_age_s) %>% + pivot_longer( + contains("q_"), + names_to = "variable", + values_to = "q", + names_prefix = "q_") +t3 + +ancombc_remaining <- full_join(t1, t2, by = c("taxon", "variable")) %>% + full_join(t3, by = c("taxon", "variable")) %>% + filter(p <= 0.05) %>% + mutate( + across(where(is.numeric), round, 3), + taxon = str_remove(taxon, "genus:") + ) %>% + arrange(variable, q) +save(ancombc_remaining, file = here::here("data/ancombc_remaining.Rds")) + + + + +# same for early infancy samples: +ancom_tables_itt_infancy <- models_imp[[1]]$infancy[[3]] %>% + select(taxon, lfc_condition1, se_condition1, p_condition1, q_condition1) %>% + arrange(q_condition1, desc(abs(lfc_condition1))) %>% + mutate(across(where(is.numeric), round, 3)) +colnames(ancom_tables_itt_infancy) <- str_remove(colnames(ancom_tables_itt), "_condition1") + +ancom_tables_itt2_infancy <- models_imp[[1]]$infancy[[3]] %>% + select(taxon, "lfc_condition1:age_s", "se_condition1:age_s", "p_condition1:age_s", "q_condition1:age_s") %>% + arrange(`q_condition1:age_s`, desc(abs(`lfc_condition1:age_s`))) %>% + mutate(across(where(is.numeric), round, 3)) + +colnames(ancom_tables_itt2_infancy) <- str_remove(colnames(ancom_tables_itt2), "_condition1:age_s") + +save(ancom_tables_itt_infancy, file = here::here("data/ancom_tables_itt_infancy.Rds")) + +ancom_tables_itt_infancy + + + + +t1_infancy <- select( + models_imp[[1]]$infancy[[3]], + taxon, + lfc_age_s) %>% + pivot_longer( + contains("lfc"), + names_to = "variable", + values_to = "lfc", + names_prefix = "lfc_") + + +t2_infancy <- select( + models_imp[[1]]$infancy[[3]], + taxon, + p_age_s +) %>% + pivot_longer( + contains("p_"), + names_to = "variable", + values_to = "p", + names_prefix = "p_") +t2 +t3_infancy <- select( + models_imp[[1]]$infancy[[3]], + taxon, + q_age_s) %>% + pivot_longer( + contains("q_"), + names_to = "variable", + values_to = "q", + names_prefix = "q_") +t3 + +ancombc_remaining_infancy <- full_join(t1_infancy, t2_infancy, by = c("taxon", "variable")) %>% + full_join(t3_infancy, by = c("taxon", "variable")) %>% + filter(p <= 0.05) %>% + mutate( + across(where(is.numeric), round, 3), + taxon = str_remove(taxon, "genus:") + ) %>% + arrange(variable, q) +save(ancombc_remaining_infancy, file = here::here("data/ancombc_remaining_infancy.Rds")) + + + +# same for year1 samples: +ancom_tables_itt_year1 <- models_imp[[1]]$year1[[3]] %>% + select(taxon, lfc_condition1, se_condition1, p_condition1, q_condition1) %>% + arrange(q_condition1, desc(abs(lfc_condition1))) %>% + mutate(across(where(is.numeric), round, 3)) +colnames(ancom_tables_itt_year1) <- str_remove(colnames(ancom_tables_itt), "_condition1") + +ancom_tables_itt2_year1 <- models_imp[[1]]$year1[[3]] %>% + select(taxon, "lfc_condition1:age_s", "se_condition1:age_s", "p_condition1:age_s", "q_condition1:age_s") %>% + arrange(`q_condition1:age_s`, desc(abs(`lfc_condition1:age_s`))) %>% + mutate(across(where(is.numeric), round, 3)) + +colnames(ancom_tables_itt2_year1) <- str_remove(colnames(ancom_tables_itt2), "_condition1:age_s") + +save(ancom_tables_itt_year1, file = here::here("data/ancom_tables_itt_year1.Rds")) + +ancom_tables_itt_year1 + + + + +t1_year1 <- select( + models_imp[[1]]$year1[[3]], + taxon, + lfc_age_s) %>% + pivot_longer( + contains("lfc"), + names_to = "variable", + values_to = "lfc", + names_prefix = "lfc_") + + +t2_year1 <- select( + models_imp[[1]]$year1[[3]], + taxon, + p_age_s +) %>% + pivot_longer( + contains("p_"), + names_to = "variable", + values_to = "p", + names_prefix = "p_") +t2 +t3_year1 <- select( + models_imp[[1]]$year1[[3]], + taxon, + q_age_s) %>% + pivot_longer( + contains("q_"), + names_to = "variable", + values_to = "q", + names_prefix = "q_") +t3 + +ancombc_remaining_year1 <- full_join(t1_year1, t2_year1, by = c("taxon", "variable")) %>% + full_join(t3_year1, by = c("taxon", "variable")) %>% + filter(p <= 0.05) %>% + mutate( + across(where(is.numeric), round, 3), + taxon = str_remove(taxon, "genus:") + ) %>% + arrange(variable, q) +save(ancombc_remaining_year1, file = here::here("data/ancombc_remaining_year1.Rds")) + + + + + + + + + + + + + + + + + + + + + + +############################################################################### +######################### 2. PP ############################## +############################################################################### + +# import of biomfile and meta data can be found in the import script +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) + +# obtain ids that were selected for PP analyses +pp_indicator <- foreign::read.spss(here::here("data/raw_data/kelly141022/Data_ITT_PP_ExploratoryDRselections.sav"), to.data.frame = TRUE) +pp_indicator <- select(pp_indicator, skippy_id = ID, pp = PP) +# add pp info to existing data +if (!"pp" %in% colnames(d)) { + d <- left_join(d, pp_indicator, by = "skippy_id") +} +# 60 that are in PP and condition 0; 18 that are condition 1 and pp. Fits... +d_pp <- filter(d, pp == 1) +implist_pp <- map(implist, function(dimp) { + dimp_new <- left_join(dimp, pp_indicator, by = "skippy_id") %>% + filter(pp == 1) + dimp_new +}) + + +# for analyses we apply prevalence fitlering +tse <- agglomerateByRank(tse, rank = "genus") +tse <- subsetByPrevalentTaxa(tse, detection = 0.001, prevalence = 0.1) +fvars <- c("siblings", "condition") +# add metadata to tse +colData(tse) <- colData(tse) %>% + as.data.frame() %>% + select(-siblings, -ges_age, -birthweight, -edlevel, -csection, -sex) %>% + left_join(select(d_pp, age, sample_id, siblings, pp, csection, sex, ges_age, edlevel, birthweight), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate( + across(all_of(fvars), function(x) as.factor(x)), + ges_age_s = scale(ges_age)[, 1], + birthweight_s = scale(birthweight)[, 1], + edlevel_s = scale(edlevel)[, 1] + ) %>% + DataFrame() +colData(tse)$age <- colData(tse)$age + as.numeric(colData(tse)$week) * 7 +colData(tse)$age_s <- scale(colData(tse)$age)[, 1] +tse <- filter(tse, pp == 1) + + +####################### 2.1 Complete Case Analysis ############################ + + + +# model includes random intercepts and all samples +if (!file.exists(here::here("data/daa_pp_mlm_bez.Rds"))) { + output <- ancombc2( + data = tse, + assay_name = "counts", + tax_level = "genus", + fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex", + rand_formula = "(1 | skippy_id)", + p_adj_method = "holm", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.2, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl(), + # mdfdr_control = list(fwer_ctrl_method = "fdr", B = 100), + # trend_control = list( + # contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), + # node = list(2), + # solver = "ECOS",B = 100) + # + ) + save(output, file = here::here("data/daa_pp_mlm_bez.Rds")) + } else { + load(here::here("data/daa_pp_mlm_bez.Rds")) + } + +# structural zeros +tab_zero = output$zero_ind +sum(tab_zero[, 2]) + sum(tab_zero[, 3]) +# sensitivity scores +tab_sens = output$pseudo_sens_tab +head(tab_sens) +pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + +res_prim <- output$res +colnames(res_prim) +effects <- c("age_s", "condition1", "condition1:age_s") +map(effects, function(effect) { + select(res_prim, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) +}) + + +# model excludes random intercepts and includes all samples +if (!file.exists(here::here("data/daa_pp_bez.Rds"))) { + output <- ancombc2( + data = tse, + assay_name = "counts", + tax_level = "genus", + fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex", + #rand_formula = "(1 | skippy_id)", + p_adj_method = "holm", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.2, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl() + ) + save(output, file = here::here("data/daa_pp_bez.Rds")) + } else { + load(here::here("data/daa_pp_bez.Rds")) + } + + # structural zeros + tab_zero = output$zero_ind + sum(tab_zero[, 2]) + sum(tab_zero[, 3]) + # sensitivity scores + tab_sens = output$pseudo_sens_tab + head(tab_sens) + pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + + res_prim <- output$res + colnames(res_prim) + effects <- c("age_s", "condition1", "condition1:age_s") + map(effects, function(effect) { + select(res_prim, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) + }) + +# model includes random intercepts and excludes 1 year samples +if (!file.exists(here::here("data/daa_pp_mlm_infancy_bez.Rds"))) { + output <- ancombc2( + data = filter(tse, week != 52), + assay_name = "counts", + tax_level = "genus", + fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex", + rand_formula = "(1 | skippy_id)", + p_adj_method = "holm", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.2, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl() + ) + save(output, file = here::here("data/daa_pp_mlm_infancy_bez.Rds")) + } else { + load(here::here("data/daa_pp_mlm_infancy_bez.Rds")) +} + +# structural zeros +tab_zero = output$zero_ind +sum(tab_zero[, 2]) + sum(tab_zero[, 3]) +# sensitivity scores +tab_sens = output$pseudo_sens_tab +head(tab_sens) +pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + +res_prim <- output$res +colnames(res_prim) +effects <- c("age_s", "condition1", "condition1:age_s") +map(effects, function(effect) { + select(res_prim, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) +}) + + +# model includes 1 years samples only +if (!file.exists(here::here("data/daa_pp_year1_bez.Rds"))) { + output <- ancombc2( + data = filter(tse, week == 52), + assay_name = "counts", + tax_level = "genus", + fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex", + # rand_formula = "(1 | skippy_id)", + p_adj_method = "holm", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.2, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl() + ) + save(output, file = here::here("data/daa_pp_year1_bez.Rds")) + } else { + load(here::here("data/daa_pp_year1_bez.Rds")) + } + +# structural zeros +tab_zero = output$zero_ind +sum(tab_zero[, 2]) + sum(tab_zero[, 3]) + # sensitivity scores +tab_sens = output$pseudo_sens_tab +head(tab_sens) +pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + +res_prim <- output$res +colnames(res_prim) +effects <- c("age_s", "condition1", "condition1:age_s") +map(effects, function(effect) { + select(res_prim, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) +}) + + + + + +######################## 2.2 Multiple imputation ############################# + + + +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) +# for analyses we apply prevalence fitlering +tse <- agglomerateByRank(tse, rank = "genus") +tse <- subsetByPrevalentTaxa(tse, detection = 0.001, prevalence = 0.1) + +models_imp <- map2(implist_pp, 1:length(implist), function(dimp, imp) { + tse_map <- tse + fvars <- c("siblings", "condition", "csection", "sex") + # add metadata to tse + colData(tse_map) <- colData(tse_map) %>% + as.data.frame() %>% + select(-siblings, -ges_age, -birthweight, -edlevel, -csection, -sex) %>% + left_join(select(dimp, age, sample_id, siblings, pp, csection, sex, ges_age_s, edlevel_s, birthweight_s), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate(across(all_of(fvars), function(x) as.factor(x))) %>% + DataFrame() + colData(tse_map)$age <- colData(tse_map)$age + as.numeric(colData(tse_map)$week) * 7 + colData(tse_map)$age_s <- scale(colData(tse_map)$age)[, 1] + + tse_map <- filter(tse_map, pp == 1) + + # model includes random intercepts and excludes 1 year samples + if (!file.exists(here::here(glue("data/daa_pp_mlm_imp{imp}_bez.Rds")))) { + output <- ancombc2( + data = tse_map, + assay_name = "counts", + tax_level = "genus", + fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex", + rand_formula = "(1 | skippy_id)", + p_adj_method = "holm", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.2, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl(), + mdfdr_control = list(fwer_ctrl_method = "fdr", B = 100), + trend_control = list( + contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), + node = list(2), + solver = "ECOS",B = 100) + ) + save(output, file = here::here(glue("data/daa_pp_mlm_imp{imp}_bez.Rds"))) + } else { + load(here::here(glue("data/daa_pp_mlm_imp{imp}_bez.Rds"))) + } + + # structural zeros + tab_zero_all = output$zero_ind + # sensitivity scores + tab_sens_all = output$pseudo_sens_tab + sens_scores_all <- pivot_longer(tab_sens_all, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + res_prim_all <- output$res + effects_all <- map(effects, function(effect) { + select(res_prim_all, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) + }) + + + + # model includes random intercepts and excludes 1 year samples + if (!file.exists(here::here(glue("data/daa_pp_mlm_infancy_imp{imp}_bez.Rds")))) { + output <- ancombc2( + data = filter(tse_map, week != 52), + assay_name = "counts", + tax_level = "genus", + fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex", + rand_formula = "(1 | skippy_id)", + p_adj_method = "holm", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.2, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl(), + mdfdr_control = list(fwer_ctrl_method = "fdr", B = 100), + trend_control = list( + contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), + node = list(2), + solver = "ECOS",B = 100) + ) + save(output, file = here::here(glue("data/daa_pp_mlm_infancy_imp{imp}_bez.Rds"))) + } else { + load(here::here(glue("data/daa_pp_mlm_infancy_imp{imp}_bez.Rds"))) + } + + # structural zeros + tab_zero_infancy = output$zero_ind + # sensitivity scores + tab_sens_infancy = output$pseudo_sens_tab + sens_scores_infancy <- pivot_longer(tab_sens_infancy, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + res_prim_infancy <- output$res + effects_infancy <- map(effects, function(effect) { + select(res_prim_infancy, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) + }) + + + # model includes 1 years samples only + if (!file.exists(here::here(glue("data/daa_pp_year1_imp{imp}_bez.Rds")))) { + output <- ancombc2( + data = filter(tse_map, week == 52), + assay_name = "counts", + tax_level = "genus", + fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex", + # rand_formula = "(1 | skippy_id)", + p_adj_method = "holm", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.2, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl(), + mdfdr_control = list(fwer_ctrl_method = "fdr", B = 100), + trend_control = list( + contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), + node = list(2), + solver = "ECOS",B = 100) + ) + save(output, file = here::here(glue("data/daa_pp_year1_imp{imp}_bez.Rds"))) + } else { + load(here::here(glue("data/daa_pp_year1_imp{imp}_bez.Rds"))) + } + + # structural zeros + tab_zero_year1 = output$zero_ind + # sensitivity scores + tab_sens_year1 = output$pseudo_sens_tab + sens_scores_year1 <- pivot_longer(tab_sens_year1, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + res_prim_year1 <- output$res + effects_year1 <- map(effects, function(effect) { + select(res_prim_year1, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) + }) + + list( + all = list( + tab_zero_all, + sens_scores_all, + res_prim_all, + effects_all + ), + infancy = list( + tab_zero_infancy, + sens_scores_infancy, + res_prim_infancy, + effects_infancy + ), + year1 = list( + tab_zero_year1, + sens_scores_year1, + res_prim_year1, + effects_year1 + ) + ) + +}) + + + + + +ancom_tables_pp <- models_imp[[1]]$all[[3]] %>% + select(taxon, lfc_condition1, se_condition1, p_condition1, q_condition1) %>% + arrange(q_condition1, desc(abs(lfc_condition1))) %>% + mutate(across(where(is.numeric), round, 3)) +colnames(ancom_tables_pp) <- str_remove(colnames(ancom_tables_pp), "_condition1") + +ancom_tables_pp2 <- models_imp[[1]]$all[[3]] %>% + select(taxon, "lfc_condition1:age_s", "se_condition1:age_s", "p_condition1:age_s", "q_condition1:age_s") %>% + arrange(`q_condition1:age_s`, desc(abs(`lfc_condition1:age_s`))) %>% + mutate(across(where(is.numeric), round, 3)) + +colnames(ancom_tables_pp2) <- str_remove(colnames(ancom_tables_pp2), "_condition1:age_s") + +save(ancom_tables_pp, file = here::here("data/ancom_tables_pp.Rds")) + +ancom_tables_pp +ancom_tables_pp2 + + + + + + + + +t1 <- select( + models_imp[[1]]$all[[3]], + taxon, + lfc_age_s) %>% + pivot_longer( + contains("lfc"), + names_to = "variable", + values_to = "lfc", + names_prefix = "lfc_") + + +t2 <- select( + models_imp[[1]]$all[[3]], + taxon, + p_age_s +) %>% + pivot_longer( + contains("p_"), + names_to = "variable", + values_to = "p", + names_prefix = "p_") +t2 +t3 <- select( + models_imp[[1]]$all[[3]], + taxon, + q_age_s) %>% + pivot_longer( + contains("q_"), + names_to = "variable", + values_to = "q", + names_prefix = "q_") +t3 + +ancombc_remaining_pp <- full_join(t1, t2, by = c("taxon", "variable")) %>% + full_join(t3, by = c("taxon", "variable")) %>% + filter(p <= 0.05) %>% + mutate( + across(where(is.numeric), round, 3), + taxon = str_remove(taxon, "genus:") + ) %>% + arrange(variable, q) +save(ancombc_remaining_pp, file = here::here("data/ancombc_remaining_pp.Rds")) + + + + +# same for early infancy samples: +ancom_tables_pp_infancy <- models_imp[[1]]$infancy[[3]] %>% + select(taxon, lfc_condition1, se_condition1, p_condition1, q_condition1) %>% + arrange(q_condition1, desc(abs(lfc_condition1))) %>% + mutate(across(where(is.numeric), round, 3)) +colnames(ancom_tables_pp_infancy) <- str_remove(colnames(ancom_tables_pp), "_condition1") + +ancom_tables_pp2_infancy <- models_imp[[1]]$infancy[[3]] %>% + select(taxon, "lfc_condition1:age_s", "se_condition1:age_s", "p_condition1:age_s", "q_condition1:age_s") %>% + arrange(`q_condition1:age_s`, desc(abs(`lfc_condition1:age_s`))) %>% + mutate(across(where(is.numeric), round, 3)) + +colnames(ancom_tables_pp2_infancy) <- str_remove(colnames(ancom_tables_pp2), "_condition1:age_s") + +save(ancom_tables_pp_infancy, file = here::here("data/ancom_tables_pp_infancy.Rds")) + +ancom_tables_pp_infancy + + + + +t1_infancy <- select( + models_imp[[1]]$infancy[[3]], + taxon, + lfc_age_s) %>% + pivot_longer( + contains("lfc"), + names_to = "variable", + values_to = "lfc", + names_prefix = "lfc_") + + +t2_infancy <- select( + models_imp[[1]]$infancy[[3]], + taxon, + p_age_s +) %>% + pivot_longer( + contains("p_"), + names_to = "variable", + values_to = "p", + names_prefix = "p_") +t2 +t3_infancy <- select( + models_imp[[1]]$infancy[[3]], + taxon, + q_age_s) %>% + pivot_longer( + contains("q_"), + names_to = "variable", + values_to = "q", + names_prefix = "q_") +t3 + +ancombc_remaining_infancy_pp <- full_join(t1_infancy, t2_infancy, by = c("taxon", "variable")) %>% + full_join(t3_infancy, by = c("taxon", "variable")) %>% + filter(p <= 0.05) %>% + mutate( + across(where(is.numeric), round, 3), + taxon = str_remove(taxon, "genus:") + ) %>% + arrange(variable, q) +save(ancombc_remaining_infancy_pp, file = here::here("data/ancombc_remaining_infancy_pp.Rds")) + + + +# same for year1 samples: +ancom_tables_pp_year1 <- models_imp[[1]]$year1[[3]] %>% + select(taxon, lfc_condition1, se_condition1, p_condition1, q_condition1) %>% + arrange(q_condition1, desc(abs(lfc_condition1))) %>% + mutate(across(where(is.numeric), round, 3)) +colnames(ancom_tables_pp_year1) <- str_remove(colnames(ancom_tables_pp), "_condition1") + +ancom_tables_pp2_year1 <- models_imp[[1]]$year1[[3]] %>% + select(taxon, "lfc_condition1:age_s", "se_condition1:age_s", "p_condition1:age_s", "q_condition1:age_s") %>% + arrange(`q_condition1:age_s`, desc(abs(`lfc_condition1:age_s`))) %>% + mutate(across(where(is.numeric), round, 3)) + +colnames(ancom_tables_pp2_year1) <- str_remove(colnames(ancom_tables_pp2), "_condition1:age_s") + +save(ancom_tables_pp_year1, file = here::here("data/ancom_tables_pp_year1.Rds")) + +ancom_tables_pp_year1 + + + + +t1_year1 <- select( + models_imp[[1]]$year1[[3]], + taxon, + lfc_age_s) %>% + pivot_longer( + contains("lfc"), + names_to = "variable", + values_to = "lfc", + names_prefix = "lfc_") + + +t2_year1 <- select( + models_imp[[1]]$year1[[3]], + taxon, + p_age_s +) %>% + pivot_longer( + contains("p_"), + names_to = "variable", + values_to = "p", + names_prefix = "p_") +t2 +t3_year1 <- select( + models_imp[[1]]$year1[[3]], + taxon, + q_age_s) %>% + pivot_longer( + contains("q_"), + names_to = "variable", + values_to = "q", + names_prefix = "q_") +t3 + +ancombc_remaining_year1_pp <- full_join(t1_year1, t2_year1, by = c("taxon", "variable")) %>% + full_join(t3_year1, by = c("taxon", "variable")) %>% + filter(p <= 0.05) %>% + mutate( + across(where(is.numeric), round, 3), + taxon = str_remove(taxon, "genus:") + ) %>% + arrange(variable, q) +save(ancombc_remaining_year1_pp, file = here::here("data/ancombc_remaining_year1_pp.Rds")) + + + + + + + + + + + + + +############################################################################### +################################# PLOTS ####################################### +############################################################################### + +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) +tse <- agglomerateByRank(tse, rank = "genus") +tse <- transformCounts(tse, method = "relabundance") + +asv_tab <- t(assay(tse, "relabundance")) %>% as.data.frame() %>% + select(all_of(c("genus:Parabacteroides", "genus:Flavonifractor", "genus:Faecalibacterium"))) %>% + rownames_to_column("sample_id") +colnames(asv_tab) <- str_remove(colnames(asv_tab), "genus:") +meta <- colData(tse) %>% as.data.frame() %>% + mutate( + condition_label = ifelse(condition == 1, "SSC", ifelse(condition == 0, "CAU", NA)), + week_label = glue::glue("Week {week}") + ) %>% + select(sample_id, condition_label, week_label) + +ds <- meta %>% full_join(asv_tab, by = "sample_id") %>% + pivot_longer( + all_of(c("Parabacteroides", "Flavonifractor", "Faecalibacterium")), + names_to = "Genus", values_to = "Abundance" + ) %>% + group_by(Genus) %>% + nest() + + +daa_plots <- map2(ds[[1]], ds[[2]], function(genus, d) { + p <- d %>% + ggplot(aes_string("condition_label", "Abundance", fill = "condition_label")) + + geom_boxplot(outlier.alpha = 0) + + #ggbeeswarm::geom_beeswarm(size = 3, cex = 1) + + geom_jitter(width = 0.1, size = 2.5, alpha = 0.5) + + scale_fill_manual(values = c("#fc8d62", "#8da0cb")) + + theme_bw(base_size = 25) + + theme( + legend.position = "none") + + xlab("") + ylab(str_to_title(genus)) + if (genus == "Parabacteroides") { + p <- p + facet_wrap(~week_label, strip.position = "bottom") + + theme( + legend.position = "none", + strip.placement = "outside", + strip.background = element_blank()) + } + p +}) +daa_plots +daa_plots <- map2(ds[[1]], ds[[2]], function(genus, d) { + if (genus == "Parabacteroides") { + df <- d + } else { + df <- filter(d, week_label == "Week 52") + } + p <- df %>% + ggplot(aes_string("condition_label", "Abundance", fill = "condition_label")) + + geom_boxplot(outlier.alpha = 0) + + #ggbeeswarm::geom_beeswarm(size = 3, cex = 1) + + geom_jitter(width = 0.1, size = 2.5, alpha = 0.5) + + scale_fill_manual(values = c("#fc8d62", "#8da0cb")) + + theme_bw(base_size = 25) + + theme( + legend.position = "none") + + xlab("") + ylab(str_to_title(genus)) + if (genus == "Parabacteroides") { + p <- p + facet_wrap(~week_label, strip.position = "bottom") + + theme( + legend.position = "none", + strip.placement = "outside", + strip.background = element_blank()) + } + p +}) +daa_plots[[1]] + +library(patchwork) +daa_plot <- daa_plots[[1]] / + (daa_plots[[2]] + daa_plots[[3]]) + +daa_plot +save(daa_plot, file = here::here("data/daa_plots.Rds")) + + + diff --git a/R/daa_linda_bez.R b/R/daa_linda_bez.R new file mode 100644 index 0000000..c929bdd --- /dev/null +++ b/R/daa_linda_bez.R @@ -0,0 +1,520 @@ +set.seed(1) +library(mia) +library(LinDA) +library(tidyverse) +library(tidySummarizedExperiment) +library(glue) + + + + +############################################################################### +######################### 1. ITT ############################## +############################################################################### + +# import of biomfile and meta data can be found in the import script +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) + + +# for analyses we apply prevalence fitlering +tse <- agglomerateByRank(tse, rank = "genus") +tse <- subsetByPrevalentTaxa(tse, detection = 0.001, prevalence = 0.1) +fvars <- c("siblings", "condition") +# add metadata to tse +colData(tse) <- colData(tse) %>% + as.data.frame() %>% + select(-siblings) %>% + left_join(select(d, age, sample_id, siblings), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate(across(all_of(fvars), function(x) as.factor(x))) %>% + DataFrame() +colData(tse)$age <- colData(tse)$age + as.numeric(colData(tse)$week) * 7 +colData(tse)$age_s <- scale(colData(tse)$age)[, 1] + +####################### 1.1 Complete Case Analysis ############################ + + +# model includes random intercepts and all samples +asv_tab <- as.data.frame(assay(tse, "counts")) +meta <- colData(tse) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars)) +head(meta) +asv_tab <- asv_tab[, rownames(meta)] +head(asv_tab[, 1:5]) +linda_obj <- linda( + asv_tab, + meta, + formula = '~condition * age_s + (1|skippy_id)', + alpha = 0.4, + prev.cut = 0.1, + lib.cut = 1000, + winsor.quan = 0.97 +) + +filter(linda_obj$output$condition1, reject) + + + + + +# model includes random intercepts and excludes 1 year samples +tse_infancy <- filter(tse, week != 52) +asv_tab <- as.data.frame(assay(tse_infancy, "counts")) +meta <- colData(tse_infancy) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars)) +asv_tab <- asv_tab[, rownames(meta)] +dim(na.omit(meta)) +linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + (1|skippy_id)', alpha = 0.4, + prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97) +# linda.plot(linda_obj, c('condition'), +# titles = c('Condition: n v.s. y'), alpha = 0.4, lfc.cut = 1, +# legend = TRUE, directory = NULL, width = 11, height = 8) + +filter(linda_obj$output$condition1, reject) + + + + +# model includes 1 years samples only +tse_year1 <- filter(tse, week == 52) +asv_tab <- as.data.frame(assay(tse_year1, "counts")) +meta <- colData(tse_year1) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars)) +asv_tab <- asv_tab[, rownames(meta)] +linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s', alpha = 0.4, + prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97) +res <- filter(linda_obj$output$condition1, reject) %>% + rownames_to_column("taxid") +rd <- rowData(tse_year1) %>% + as.data.frame() %>% + rownames_to_column("taxid") %>% + filter(taxid %in% res$taxid) +res <- left_join(res, rd, by = "taxid") +filter(linda_obj$output$condition1, reject) + + + + + +######################## 1. 2Multiple imputation ############################# + + +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) +# for analyses we apply prevalence fitlering +tse <- agglomerateByRank(tse, rank = "genus") +tse <- subsetByPrevalentTaxa(tse, detection = 0.001, prevalence = 0.1) + +models_imp <- map2(implist, 1:length(implist), function(dimp, imp) { + tse_map <- tse + fvars <- c("siblings", "condition") + # add metadata to tse + colData(tse_map) <- colData(tse_map) %>% + as.data.frame() %>% + select(-siblings) %>% + left_join(select(dimp, age, sample_id, siblings), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate(across(all_of(fvars), function(x) as.factor(x))) %>% + DataFrame() + colData(tse_map)$age <- colData(tse_map)$age + as.numeric(colData(tse_map)$week) * 7 + colData(tse_map)$age_s <- scale(colData(tse_map)$age)[, 1] + + # all samples + asv_tab <- as.data.frame(assay(tse_map, "counts")) + meta <- colData(tse_map) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars)) + asv_tab <- asv_tab[, rownames(meta)] + + linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + (1|skippy_id)', alpha = 0.4, + prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97) + + + res_all <- linda_obj$output$condition1 + res_all2 <- linda_obj$output[["condition1:age_s"]] + + # model includes random intercepts and excludes 1 year samples + tse_infancy <- filter(tse_map, week != 52) + asv_tab <- as.data.frame(assay(tse_infancy, "counts")) + meta <- colData(tse_infancy) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars)) + asv_tab <- asv_tab[, rownames(meta)] + linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + (1|skippy_id)', alpha = 0.4, + prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97) + + res_infancy <- linda_obj$output$condition1 + res_infancy2 <- linda_obj$output[["condition1:age_s"]] + + + + # model includes 1 years samples only + tse_year1 <- filter(tse_map, week == 52) + asv_tab <- as.data.frame(assay(tse_year1, "counts")) + meta <- colData(tse_year1) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars)) + asv_tab <- asv_tab[, rownames(meta)] + linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s', alpha = 0.4, + prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97) + + res_year1 <- linda_obj$output$condition1 + res_year12 <- linda_obj$output[["condition1:age_s"]] + + list(res_all, res_infancy, res_year1, res_all2, res_infancy2, res_year12) +}) + + +tables_linda_itt <- map(models_imp, function(x) { + x[[1]] %>% + rownames_to_column("taxon") %>% + select(taxon, log2FoldChange, lfcSE, pvalue, padj) %>% + arrange(padj, desc(abs(log2FoldChange))) %>% + mutate(across(where(is.numeric), function(x) round(x, 3))) +}) +tables_linda_itt + +tables_linda_itt2 <- map(models_imp, function(x) { + x[[4]] %>% + rownames_to_column("taxon") %>% + select(taxon, log2FoldChange, lfcSE, pvalue, padj) %>% + arrange(padj, desc(abs(log2FoldChange))) %>% + mutate(across(where(is.numeric), function(x) round(x, 3))) +}) +tables_linda_itt2 + + + + + +tables_linda_itt_infancy <- map(models_imp, function(x) { + x[[2]] %>% + rownames_to_column("taxon") %>% + select(taxon, log2FoldChange, lfcSE, pvalue, padj) %>% + arrange(padj, desc(abs(log2FoldChange))) %>% + mutate(across(where(is.numeric), function(x) round(x, 3))) +}) + + +tables_linda_itt_year1 <- map(models_imp, function(x) { + x[[3]] %>% + rownames_to_column("taxon") %>% + select(taxon, log2FoldChange, lfcSE, pvalue, padj) %>% + arrange(padj, desc(abs(log2FoldChange))) %>% + mutate(across(where(is.numeric), function(x) round(x, 3))) +}) + +tables_linda_itt +tables_linda_itt2 +tables_linda_itt_infancy +tables_linda_itt_year1 + +linda_identified <- map2_dfr( + list( + tables_linda_itt, + tables_linda_itt2, + tables_linda_itt_infancy, + tables_linda_itt_year1 + ), + c( + "main effect", + "interaction term", + "early", + "late" + ), + function(table, term) { + filter(table[[1]], padj <= 0.2) %>% + mutate(term = term) + }) %>% + arrange(desc(abs(log2FoldChange)), padj) + +# delete duplicate row +linda_identified <- linda_identified[-4, ] +linda_identified + + +save( + tables_linda_itt, + tables_linda_itt2, + linda_identified, + file = here::here("data/tables_linda_itt.Rds") +) + +save( + tables_linda_itt_infancy, + file = here::here("data/tables_linda_itt_infancy.Rds") +) + +save( + tables_linda_itt_year1, + file = here::here("data/tables_linda_itt_year1.Rds") +) + + + + + + + + + + + +models_imp <- map2(implist, 1:length(implist), function(dimp, imp) { + tse_map <- tse + fvars <- c("siblings", "condition") + # add metadata to tse + colData(tse_map) <- colData(tse_map) %>% + as.data.frame() %>% + select(-siblings, -bfexcl) %>% + left_join(select(dimp, age, sample_id, siblings, bfexcl), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate(across(all_of(fvars), function(x) as.factor(x))) %>% + DataFrame() + colData(tse_map)$age <- colData(tse_map)$age + as.numeric(colData(tse_map)$week) * 7 + colData(tse_map)$age_s <- scale(colData(tse_map)$age)[, 1] + + + # model includes 1 years samples only + tse_year1 <- filter(tse_map, week == 52) + asv_tab <- as.data.frame(assay(tse_year1, "counts")) + meta <- colData(tse_year1) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars), bfexcl) + asv_tab <- asv_tab[, rownames(meta)] + linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + bfexcl', alpha = 0.4, + prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97) + res <- filter(linda_obj$output$condition1, reject) %>% + rownames_to_column("taxid") + if (dim(res)[1] >= 1) { + rd <- rowData(tse_year1) %>% + as.data.frame() %>% + rownames_to_column("taxid") %>% + filter(taxid %in% res$taxid) + res_year1 <- left_join(res, rd, by = "taxid") %>% + mutate(time = "year1") + } + + out <- list() + if(exists("res_year1")) { + out[[1]] <- res_year1 + } + out +}) + + +models_imp + + + + + + + + + + + + + + + + + + + + + + + + + + +############################################################################### +######################### 2. PP ############################## +############################################################################### + +# import of biomfile and meta data can be found in the import script +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) + +tse <- agglomerateByRank(tse, rank = "genus") +tse <- subsetByPrevalentTaxa(tse, detection = 0.001, prevalence = 0.1) + +# obtain ids that were selected for PP analyses +pp_indicator <- foreign::read.spss(here::here("data/raw_data/kelly141022/Data_ITT_PP_ExploratoryDRselections.sav"), to.data.frame = TRUE) +pp_indicator <- select(pp_indicator, skippy_id = ID, pp = PP) +# add pp info to existing data +if (!"pp" %in% colnames(d)) { + d <- left_join(d, pp_indicator, by = "skippy_id") +} +# 60 that are in PP and condition 0; 18 that are condition 1 and pp. Fits... +d_pp <- filter(d, pp == 1) +implist_pp <- map(implist, function(dimp) { + dimp_new <- left_join(dimp, pp_indicator, by = "skippy_id") %>% + filter(pp == 1) + dimp_new +}) + + +fvars <- c("siblings", "condition", "csection", "sex") +# add metadata to tse +colData(tse) <- colData(tse) %>% + as.data.frame() %>% + select(-siblings, -ges_age, -birthweight, -edlevel, -csection, -sex) %>% + left_join(select(d_pp, age, sample_id, siblings, pp, csection, sex, ges_age, edlevel, birthweight), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate( + across(all_of(fvars), function(x) as.factor(x)), + ges_age_s = scale(ges_age)[, 1], + birthweight_s = scale(birthweight)[, 1], + edlevel_s = scale(edlevel)[, 1] + ) %>% + DataFrame() +colData(tse)$age <- colData(tse)$age + as.numeric(colData(tse)$week) * 7 +colData(tse)$age_s <- scale(colData(tse)$age)[, 1] +tse <- filter(tse, pp == 1) + +####################### 2.1 Complete Case Analysis ############################ + + +# model includes random intercepts and all samples +asv_tab <- as.data.frame(assay(tse, "counts")) +meta <- colData(tse) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars), ges_age_s, birthweight_s, edlevel_s) +asv_tab <- asv_tab[, rownames(meta)] +linda_obj <- linda( + asv_tab, + meta, + formula = '~condition * age_s + siblings + csection + sex + ges_age_s + birthweight_s + edlevel_s + (1|skippy_id)', alpha = 0.4, + prev.cut = 0.1, + lib.cut = 1000, + winsor.quan = 0.97 +) + + +filter(linda_obj$output$condition1, reject) + + +fvars <- c("siblings", "condition", "csection", "sex") +# model includes random intercepts and excludes 1 year samples +tse_infancy <- filter(tse, week != 52) +asv_tab <- as.data.frame(assay(tse_infancy, "counts")) +meta <- colData(tse_infancy) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars), ges_age_s, edlevel_s, birthweight_s) +dim(na.omit(meta)) +asv_tab <- asv_tab[, rownames(meta)] +linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + edlevel_s + ges_age_s + birthweight_s + sex + csection + (1|skippy_id)', alpha = 0.4, + prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97) +# linda.plot(linda_obj, c('condition'), +# titles = c('Condition: n v.s. y'), alpha = 0.4, lfc.cut = 1, +# legend = TRUE, directory = NULL, width = 11, height = 8) + +filter(linda_obj$output$condition1, reject) + + + + +# model includes 1 years samples only +tse_year1 <- filter(tse, week == 52) +asv_tab <- as.data.frame(assay(tse_year1, "counts")) +meta <- colData(tse_year1) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars), ges_age_s, birthweight_s, edlevel_s) +asv_tab <- asv_tab[, rownames(meta)] +linda_obj <- linda( + asv_tab, + meta, + formula = '~condition * age_s + siblings + csection + sex + ges_age_s + birthweight_s + edlevel_s', alpha = 0.4, + prev.cut = 0.1, + lib.cut = 1000, + winsor.quan = 0.97 +) +res <- filter(linda_obj$output$condition1, reject) %>% + rownames_to_column("taxid") +rd <- rowData(tse_year1) %>% + as.data.frame() %>% + rownames_to_column("taxid") %>% + filter(taxid %in% res$taxid) +res <- left_join(res, rd, by = "taxid") + +filter(linda_obj$output$condition1, reject) + + + + + +######################## 2.2 Multiple imputation ############################# + +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) +# for analyses we apply prevalence fitlering +tse <- agglomerateByRank(tse, rank = "genus") +tse <- subsetByPrevalentTaxa(tse, detection = 0.001, prevalence = 0.1) + +models_imp <- map2(implist_pp, 1:length(implist), function(dimp, imp) { + tse_map <- tse + fvars <- c("siblings", "condition", "csection", "sex") + # add metadata to tse + colData(tse_map) <- colData(tse_map) %>% + as.data.frame() %>% + select(-siblings, -ges_age, -birthweight, -edlevel, -csection, -sex) %>% + left_join(select(dimp, age, sample_id, siblings, pp, csection, sex, ges_age_s, edlevel_s, birthweight_s), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate(across(all_of(fvars), function(x) as.factor(x))) %>% + DataFrame() + colData(tse_map)$age <- colData(tse_map)$age + as.numeric(colData(tse_map)$week) * 7 + colData(tse_map)$age_s <- scale(colData(tse_map)$age)[, 1] + tse_map <- filter(tse_map, pp == 1) + + # all samples + asv_tab <- as.data.frame(assay(tse_map, "counts")) + meta <- colData(tse_map) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars), edlevel_s, birthweight_s, ges_age_s) + asv_tab <- asv_tab[, rownames(meta)] + linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + csection + sex + ges_age_s + birthweight_s + edlevel_s + (1|skippy_id)', alpha = 0.4, + prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97) + + + res_all <- linda_obj$output$condition1 + res_all2 <- linda_obj$output[["condition1:age_s"]] + + # model includes random intercepts and excludes 1 year samples + tse_infancy <- filter(tse_map, week != 52) + asv_tab <- as.data.frame(assay(tse_infancy, "counts")) + meta <- colData(tse_infancy) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars), edlevel_s, birthweight_s, ges_age_s) + asv_tab <- asv_tab[, rownames(meta)] + linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + csection + sex + ges_age_s + birthweight_s + edlevel_s + (1|skippy_id)', alpha = 0.4, + prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97) + + res_infancy <- linda_obj$output$condition1 + res_infancy2 <- linda_obj$output[["condition1:age_s"]] + + + + # model includes 1 years samples only + tse_year1 <- filter(tse_map, week == 52) + asv_tab <- as.data.frame(assay(tse_year1, "counts")) + meta <- colData(tse_year1) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars), edlevel_s, birthweight_s, ges_age_s) + asv_tab <- asv_tab[, rownames(meta)] + linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + csection + sex + ges_age_s + birthweight_s + edlevel_s', alpha = 0.4, + prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97) + + res_year1 <- linda_obj$output$condition1 + res_year12 <- linda_obj$output[["condition1:age_s"]] + + list(res_all, res_infancy, res_year1, res_all2, res_infancy2, res_year12) +}) + +models_imp[[1]][[1]] +tables_linda_pp <- map(models_imp, function(x) { + x[[3]] %>% + rownames_to_column("taxon") %>% + select(taxon, log2FoldChange, lfcSE, pvalue, padj) %>% + arrange(padj, desc(abs(log2FoldChange))) %>% + mutate(across(where(is.numeric), function(x) round(x, 3))) +}) +tables_linda_pp + +save(tables_linda_pp, file = here::here("data/tables_linda_pp.Rds")) + + + diff --git a/R/daa_maaslin2_bez.R b/R/daa_maaslin2_bez.R new file mode 100644 index 0000000..ad4d519 --- /dev/null +++ b/R/daa_maaslin2_bez.R @@ -0,0 +1,495 @@ +set.seed(1) +library(mia) +library(Maaslin2) +library(tidyverse) +library(tidySummarizedExperiment) +library(glue) + + + + +############################################################################### +######################### 1. ITT ############################## +############################################################################### + +# import of biomfile and meta data can be found in the import script +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) + + +# for analyses we apply prevalence fitlering +tse <- agglomerateByRank(tse, rank = "genus") +tse <- subsetByPrevalentTaxa(tse, detection = 0.001, prevalence = 0.1) +fvars <- c("siblings", "condition") +# add metadata to tse +colData(tse) <- colData(tse) %>% + as.data.frame() %>% + select(-siblings) %>% + left_join(select(d, age, sample_id, siblings), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate(across(all_of(fvars), function(x) as.factor(x))) %>% + DataFrame() +colData(tse)$age <- colData(tse)$age + as.numeric(colData(tse)$week) * 7 +colData(tse)$age_s <- scale(colData(tse)$age)[, 1] + +####################### 1.1 Complete Case Analysis ############################ + + + +asv_tab <- t(assay(tse)) +meta <- colData(tse) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars)) +asv_tab <- asv_tab[rownames(meta),] + +# you can specifiy different GLMs/normalizations/transforms. We used similar +# settings as in Nearing et al. (2021) here: +fit_data <- Maaslin2( + asv_tab, + meta, + output = here::here("data/maaslin/1"), + transform = "AST", + fixed_effects = c("condition", "age_s"), + random_effects = "skippy_id", + reference = "0", + normalization = "TSS", + standardize = FALSE, + min_prevalence = 0 # prev filterin already done +) +filter(fit_data$results, qval <= 0.5, metadata == "condition") + +# model includes random intercepts and excludes 1 year samples +tse_infancy <- filter(tse, week != 52) +asv_tab <- t(assay(tse)) +meta <- colData(tse_infancy) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars)) +asv_tab <- asv_tab[rownames(meta), ] + +fit_data <- Maaslin2( + asv_tab, + meta, + output = here::here("data/maaslin/1"), + transform = "AST", + fixed_effects = c("condition", "age_s"), + random_effects = "skippy_id", + reference = "0", + normalization = "TSS", + standardize = FALSE, + min_prevalence = 0 # prev filterin already done +) +filter(fit_data$results, qval <= 0.4, metadata == "condition") + + + + +# model includes 1 years samples only +tse_year1 <- filter(tse, week == 52) +asv_tab <- t(assay(tse)) +meta <- colData(tse_year1) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars)) +asv_tab <- asv_tab[rownames(meta), ] + + +fit_data <- Maaslin2( + asv_tab, + meta, + output = here::here("data/maaslin/1"), + transform = "AST", + fixed_effects = c("condition", "age_s"), + #random_effects = "skippy_id", + reference = "0", + normalization = "TSS", + standardize = FALSE, + min_prevalence = 0 # prev filterin already done +) +filter(fit_data$results, qval <= 0.4, metadata == "condition") + + + + +######################## 1. 2Multiple imputation ############################# + + +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) +# for analyses we apply prevalence fitlering +tse <- agglomerateByRank(tse, rank = "genus") +tse <- subsetByPrevalentTaxa(tse, detection = 0.001, prevalence = 0.1) + + +if (!file.exists(here::here("data/maaslin2_itt_mi_bez.Rds"))) { + models_imp <- map2(implist, 1:length(implist), function(dimp, imp) { + tse_map <- tse + fvars <- c("siblings", "condition") + # add metadata to tse + colData(tse_map) <- colData(tse_map) %>% + as.data.frame() %>% + select(-siblings) %>% + left_join(select(dimp, age, sample_id, siblings), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate(across(all_of(fvars), function(x) as.factor(x))) %>% + DataFrame() + colData(tse_map)$age <- colData(tse_map)$age + as.numeric(colData(tse_map)$week) * 7 + colData(tse_map)$age_s <- scale(colData(tse_map)$age)[, 1] + + # all samples + asv_tab <- t(assay(tse_map)) + meta <- colData(tse_map) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars)) + asv_tab <- asv_tab[rownames(meta), ] + + fit_data <- Maaslin2( + asv_tab, + meta, + output = here::here("data/maaslin/1"), + transform = "AST", + fixed_effects = c("condition", "age_s"), + random_effects = "skippy_id", + reference = "0", + normalization = "TSS", + standardize = FALSE, + min_prevalence = 0 # prev filterin already done + ) + + res_all <- fit_data$results + + + + + # model includes random intercepts and excludes 1 year samples + tse_infancy <- filter(tse_map, week != 52) + asv_tab <- t(assay(tse_infancy)) + meta <- colData(tse_infancy) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars)) + asv_tab <- asv_tab[rownames(meta), ] + + + fit_data <- Maaslin2( + asv_tab, + meta, + output = here::here("data/maaslin/1"), + transform = "AST", + fixed_effects = c("condition", "age_s"), + random_effects = "skippy_id", + reference = "0", + normalization = "TSS", + standardize = FALSE, + min_prevalence = 0 # prev filterin already done + ) + + res_inf <- fit_data$results + + + # model includes 1 years samples only + tse_year1 <- filter(tse_map, week == 52) + asv_tab <- t(assay(tse_year1)) + meta <- colData(tse_year1) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars)) + asv_tab <- asv_tab[rownames(meta), ] + + fit_data <- Maaslin2( + asv_tab, + meta, + output = here::here("data/maaslin/1"), + transform = "AST", + fixed_effects = c("condition", "age_s"), + #random_effects = "skippy_id", + reference = "0", + normalization = "TSS", + standardize = FALSE, + min_prevalence = 0 # prev filterin already done + ) + + res_year1 <- fit_data$results + + list(res_all, res_inf, res_year1) + }) + save(models_imp, file = here::here("data/maaslin2_itt_mi_bez.Rds")) + } else { + load(here::here("data/maaslin2_itt_mi_bez.Rds")) + } + + +# change [[1]] to 2-5 to inspect the other imputations +maaslin2_tables_itt <- map(models_imp[[1]], function(x) { + x %>% filter(metadata == "condition") %>% + select(feature, coef, stderr, pval, qval) %>% + arrange(qval, desc(abs(coef))) %>% + mutate( + across(where(is.numeric), round, 3), + feature = str_replace(feature, "\\.\\.", "."), + feature = str_replace(feature, "\\.", ":") + ) +}) +maaslin2_tables_itt + +# since MaAsLin does not support interactions I need to evaluate early and late infancy separately +maaslin2_identified <- map2_dfr(maaslin2_tables_itt, c("all", "early", "late"), function(table, term) { + filter(table, qval <= 0.2) %>% + mutate(term = term) %>% + arrange(coef, qval) +}) + +maaslin2_identified +save(maaslin2_tables_itt, maaslin2_identified, file = here::here("data/maaslin2_tables_itt.Rds")) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +############################################################################### +######################### 2. PP ############################## +############################################################################### + +# import of biomfile and meta data can be found in the import script +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) + +tse <- agglomerateByRank(tse, rank = "genus") +tse <- subsetByPrevalentTaxa(tse, detection = 0.001, prevalence = 0.1) + +# obtain ids that were selected for PP analyses +pp_indicator <- foreign::read.spss(here::here("data/raw_data/kelly141022/Data_ITT_PP_ExploratoryDRselections.sav"), to.data.frame = TRUE) +pp_indicator <- select(pp_indicator, skippy_id = ID, pp = PP) +# add pp info to existing data +if (!"pp" %in% colnames(d)) { + d <- left_join(d, pp_indicator, by = "skippy_id") +} +# 60 that are in PP and condition 0; 18 that are condition 1 and pp. Fits... +d_pp <- filter(d, pp == 1) +implist_pp <- map(implist, function(dimp) { + dimp_new <- left_join(dimp, pp_indicator, by = "skippy_id") %>% + filter(pp == 1) + dimp_new +}) + + +fvars <- c("siblings", "condition", "csection", "sex") +# add metadata to tse +colData(tse) <- colData(tse) %>% + as.data.frame() %>% + select(-siblings, -ges_age, -birthweight, -edlevel, -csection, -sex) %>% + left_join(select(d_pp, age, sample_id, siblings, pp, csection, sex, ges_age, edlevel, birthweight), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate( + across(all_of(fvars), function(x) as.factor(x)), + ges_age_s = scale(ges_age)[, 1], + birthweight_s = scale(birthweight)[, 1], + edlevel_s = scale(edlevel)[, 1] + ) %>% + DataFrame() +colData(tse)$age <- colData(tse)$age + as.numeric(colData(tse)$week) * 7 +colData(tse)$age_s <- scale(colData(tse)$age)[, 1] +tse <- filter(tse, pp == 1) + +####################### 2.1 Complete Case Analysis ############################ + + +# model includes random intercepts and all samples +asv_tab <- t(assay(tse)) +meta <- colData(tse) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars)) +asv_tab <- asv_tab[rownames(meta), ] + +fit_data <- Maaslin2( + asv_tab, + meta, + output = here::here("data/maaslin/1"), + transform = "AST", + fixed_effects = c( + "condition", + "siblings", + "age_s", + "csection", + "sex", + "ges_age_s", + "birthweight_s", + "edlevel_s" + ), + random_effects = "skippy_id", + reference = "0", + normalization = "TSS", + standardize = FALSE, + min_prevalence = 0 # prev filterin already done +) +filter(fit_data$results, qval <= 0.1, metadata == "condition") + + + + + + + +######################## 2.2 Multiple imputation ############################# + +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) +# for analyses we apply prevalence fitlering +tse <- agglomerateByRank(tse, rank = "genus") +tse <- subsetByPrevalentTaxa(tse, detection = 0.001, prevalence = 0.1) + +if (!file.exists(here::here("data/maaslin2_pp_mi_bez.Rds"))) { + models_imp <- map2(implist_pp, 1:length(implist), function(dimp, imp) { + tse_map <- tse + fvars <- c("siblings", "condition", "csection", "sex") + # add metadata to tse + colData(tse_map) <- colData(tse_map) %>% + as.data.frame() %>% + select(-siblings, -ges_age, -birthweight, -edlevel, -csection, -sex) %>% + left_join(select(dimp, age, sample_id, siblings, pp, csection, sex, ges_age_s, edlevel_s, birthweight_s), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate(across(all_of(fvars), function(x) as.factor(x))) %>% + DataFrame() + colData(tse_map)$age <- colData(tse_map)$age + as.numeric(colData(tse_map)$week) * 7 + colData(tse_map)$age_s <- scale(colData(tse_map)$age)[, 1] + tse_map <- filter(tse_map, pp == 1) + + # all samples + asv_tab <- t(assay(tse_map)) + meta <- colData(tse_map) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars), edlevel_s, birthweight_s, ges_age_s) + asv_tab <- asv_tab[rownames(meta), ] + + fit_data <- Maaslin2( + asv_tab, + meta, + output = here::here("data/maaslin/1"), + transform = "AST", + fixed_effects = c( + "condition", + "siblings", + "age_s", + "csection", + "sex", + "ges_age_s", + "birthweight_s", + "edlevel_s" + ), + random_effects = "skippy_id", + reference = "0", + normalization = "TSS", + standardize = FALSE, + min_prevalence = 0 # prev filterin already done + ) + + res_all <- fit_data$results + + + # model includes random intercepts and excludes 1 year samples + tse_infancy <- filter(tse_map, week != 52) + asv_tab <- t(assay(tse_infancy)) + meta <- colData(tse_infancy) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars), edlevel_s, birthweight_s, ges_age_s) + asv_tab <- asv_tab[rownames(meta), ] + + + fit_data <- Maaslin2( + asv_tab, + meta, + output = here::here("data/maaslin/1"), + transform = "AST", + fixed_effects = c( + "condition", + "siblings", + "age_s", + "csection", + "sex", + "ges_age_s", + "birthweight_s", + "edlevel_s" + ), + random_effects = "skippy_id", + reference = "0", + normalization = "TSS", + standardize = FALSE, + min_prevalence = 0 # prev filterin already done + ) + + res_inf <- fit_data$results + + + # model includes 1 years samples only + tse_year1 <- filter(tse_map, week == 52) + asv_tab <- t(assay(tse_year1)) + meta <- colData(tse_year1) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars), edlevel_s, birthweight_s, ges_age_s) + asv_tab <- asv_tab[rownames(meta), ] + + fit_data <- Maaslin2( + asv_tab, + meta, + output = here::here("data/maaslin/1"), + transform = "AST", + fixed_effects = c( + "condition", + "siblings", + "age_s", + "csection", + "sex", + "ges_age_s", + "birthweight_s", + "edlevel_s" + ), + #random_effects = "skippy_id", + reference = "0", + normalization = "TSS", + standardize = FALSE, + min_prevalence = 0 # prev filterin already done + ) + + res_year1 <- fit_data$results + + list(res_all, res_inf, res_year1) + }) + save(models_imp, file = here::here("data/maaslin2_pp_mi_bez.Rds")) + } else { + load(here::here("data/maaslin2_pp_mi_bez.Rds")) + } + + + + + +# switch the 1 to 2-5 to check other imputations +maaslin2_tables_itt_pp <- map(models_imp[[1]], function(x) { + x %>% filter(metadata == "condition") %>% + select(feature, coef, stderr, pval, qval) %>% + arrange(qval, desc(abs(coef))) %>% + mutate( + across(where(is.numeric), round, 3), + feature = str_replace(feature, "\\.\\.", "."), + feature = str_replace(feature, "\\.", ":") + ) +}) +maaslin2_tables_itt_pp +save(maaslin2_tables_itt_pp, file = here::here("data/maaslin2_tables_itt_pp.Rds")) \ No newline at end of file diff --git a/R/gbm.R b/R/gbm.R new file mode 100644 index 0000000..8e8329d --- /dev/null +++ b/R/gbm.R @@ -0,0 +1,86 @@ +library(tidyverse) +library(omixerRpm) + +# load the KEGG orthologue table +ko <- read.delim( + "picrust2/data/KO_metagenome_out/pred_metagenome_unstrat.tsv", + header = TRUE + ) + +# pick most recent database +listDB() +db <- loadDB(name = listDB()[1]) +head(ko) +# calculate GBM abundance and store in df +# threads = 16, java.mem = 16 +gbm <- rpm(x = ko, module.db = db, ) +gbm <- asDataFrame(gbm, "abundance") +head(gbm) + + +# swap ids back to the sample ids I use in the other datasets +mapfiles <- list.files(here::here("data/map_files"), full.names = TRUE) +id_swap <- map_dfr(mapfiles, ~readxl::read_excel(.x)) %>% + select(sample_id = Seq_ID, internal_sample_id) %>% + mutate( + sample_id = str_replace(sample_id, "SKIPPY_", ""), + sample_id = str_replace(sample_id, "_2", "_1"), + sample_id = str_replace(sample_id, "_52", "_3"), + sample_id = str_replace(sample_id, "_5", "_2"), + sample_id = str_replace(sample_id, "245_1_15-1-17", "245_1"), + sample_id = str_replace(sample_id, "245_1_22-1-17", "245_1_2"), + sample_id = str_replace(sample_id, "269_1_15_mei_2017", "269_1"), + sample_id = str_replace(sample_id, "269_1_18_mei_2017", "269_1_2") + ) + + +modules <- select(gbm, Module, Description) +gbm <- select(gbm, -contains("MOCK"), -Description) +colnames(gbm) <- str_remove(colnames(gbm), "X") +gbm <- column_to_rownames(gbm, "Module") %>% + t() %>% + as.data.frame() %>% + mutate_all(function(x) as.integer(x)) %>% + rownames_to_column("internal_sample_id") %>% + left_join(id_swap, by = "internal_sample_id") %>% + select(-internal_sample_id) +head(gbm) +dim(gbm) + + +# store the gbm in a tse to then perform DAA using similar script as before +load(here::here("data/data_imp.Rds")) +load(here::here("data/data.Rds")) + +fvars <- c("siblings", "condition") +# add metadata to tse +colData(tse) <- colData(tse) %>% + as.data.frame() %>% + select(-siblings) %>% + left_join(select(d, age, sample_id, siblings), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate(across(all_of(fvars), function(x) as.factor(x))) %>% + DataFrame() +colData(tse)$age <- colData(tse)$age + as.numeric(colData(tse)$week) * 7 +colData(tse)$age_s <- scale(colData(tse)$age)[, 1] +gbm <- gbm %>% + filter(sample_id %in% colnames(tse)) %>% + column_to_rownames("sample_id") %>% + t() + +colnames(gbm)[!colnames(gbm) %in% colnames(tse)] +colnames(tse)[!colnames(tse) %in% colnames(gbm)] +md <- colData(tse) %>% + as.data.frame() %>% + rownames_to_column("sample_id") %>% + arrange(factor(sample_id, levels = colnames(gbm))) %>% + mutate(sid = sample_id) %>% + column_to_rownames("sid") %>% + DataFrame() + +tse_pc <- TreeSummarizedExperiment(assays = gbm, colData = md) +assayNames(tse_pc) <- "counts" +save(tse_pc, modules, file = here::here("data/data_pc.Rds")) + + +gbm %>% dim() diff --git a/R/h_skippy_import_biom.R b/R/h_skippy_import_biom.R new file mode 100644 index 0000000..9c62ad7 --- /dev/null +++ b/R/h_skippy_import_biom.R @@ -0,0 +1,386 @@ +library(tidyverse) +library(mia) +library(glue) +library(tidySummarizedExperiment) +library(miaViz) +library(scater) +library(readxl) +library(lubridate) + +# extract skippy id and week from map files +raw_path <- here::here("data/map_files") +map_files <- list.files(raw_path, pattern = "2021_\\d{4}.*.xlsx$") +map_files %>% length() +lib_nums <- str_extract(map_files, "\\d\\d\\d\\d_00\\d\\d") +library_number_merged <- c(1:6) +map_file_merged <- map2_dfr( + map_files, +library_number_merged, +function(filename, lnm) { + # extract library number + lib_num <- str_extract(filename, "\\d\\d\\d\\d_00\\d\\d") + # read provided map file + xl <- read_excel(glue("{raw_path}/{filename}")) + map_file <- xl %>% + mutate( + LibraryNumber = lnm, + ProjectName = ifelse(is.na(ProjectName), "unspecified", ProjectName) + ) %>% + filter(!ProjectName == "Shime-uncooled_ileal_digesta") %>% + select( + "sample_id" = internal_sample_id, + id = Seq_ID, + ProjectName + ) %>% + filter(!is.na(`sample_id`)) + return(map_file) +}) +# make sample_ids unique as they are in the tse object later +map_file_merged <- map_file_merged %>% + mutate( + namealt = glue("{sample_id}_{1:dim(map_file_merged)[1]}"), + sample_id = ifelse(str_detect(sample_id, "MOCK"), namealt, sample_id)) + + +# the following provided duplicates and need extra labeling later +# when I create the new sample names (see further below) +duplicates <- c( + "245_2_22-1-17", + "240_52_11-2-18", + "269_2_18_mei_2017", + "276_5_26052017" +) + +# create a dataframe that has id from wur, our id, an indicator whether we have +# 2 samples from an id +sample_data <- str_match(map_file_merged$id, ".*(\\d\\d\\d)(_|-)(\\d+).*") %>% + as_tibble() %>% + select(id = V1, skippy_id = V2, week = V4) %>% + full_join(map_file_merged, by = "id") %>% + filter(!is.na(sample_id)) %>% + mutate(duplicated = ifelse(id %in% duplicates, TRUE, FALSE)) %>% + select(wur_id = id, skippy_id, sample_id, week, duplicated) +filter(sample_data, duplicated) + +# import the biome file to combine with above metadata + +# read_biom does not work with the output of NGT2 in its current version +# we must replace certain NaNs with 0 (not relevant for our analysis) +biom_file <- here::here("data/raw_data/skippy_non_concat_SILVA_138_rpl.biom") +if (!file.exists(biom_file)) { + biom_file_old <- here::here("data/raw_data/skippy_non_concat_SILVA_138.biom") + biom_lines <- read_lines(biom_file_old) + biom_lines <- str_replace_all(biom_lines, "NaN", "0.0") + write_lines( + biom_lines, + here::here("data/raw_data/skippy_non_concat_SILVA_138_rpl.biom") + ) +} + +skippy <- loadFromBiom(biom_file) +# add phylogenetic tree made with NGT2 (clustal omega) +tree_file <- here::here( + "data/raw_data/skippy_non_concat_tree_files/all_otus.tree" +) +tree <- ape::read.tree(tree_file) +# tree can only be added to tse but not se object +skippy <- as(skippy, "TreeSummarizedExperiment") +rowTree(skippy) <- tree +# use practical names for the ranks instead +colnames(rowData(skippy)) <- c( + "kingdom", "phylum", "class", "order", "family", "genus", "species" +) + +# add sample ids as column +colData(skippy)[["sample_id"]] <- rownames(colData(skippy)) +# now we can add our sample data we created above to the tse +colData(skippy) <- colData(skippy) %>% + as.data.frame() %>% + left_join(sample_data, by = "sample_id") %>% + mutate(neg_control = as.factor(ifelse(is.na(week), 1, 0))) %>% + column_to_rownames("sample_id") %>% + mutate(sample_id = rownames(colData(skippy))) %>% + DataFrame() + +# we will now store MOCK samples in a separate tse +mock_samples <- filter(skippy, str_detect(sample_id, "MOCK")) +mock_samples <- transformSamples(mock_samples, method = "clr", pseudocount = 1) +# check correlations between mock samples +assay(mock_samples, "clr") %>% cor() + + +# also check they cluster together closely when using ordination +skippy <- mutate(skippy, + mock = as.factor(ifelse(str_detect(sample_id, "MOCK"), 1, 0)) +) +colData(skippy)[["library_size"]] <- colSums(assay(skippy)) +# perform NMDS coordination method +skippy <- runNMDS( + skippy, + FUN = vegan::vegdist, + name = "NMDS" +) +# plot results of a 2-component NMDS on tse, +plotReducedDim(skippy, "NMDS", colour_by = "mock") +plotReducedDim(skippy, "NMDS", colour_by = "library_size") +# MOCK samples do cluster together but particularly 1 outlier is there +# now drop the MOCK samples +tse <- filter(skippy, !str_detect(sample_id, "MOCK")) +# fix names of taxa +rowData(tse) <- rowData(tse) %>% + as_tibble() %>% + mutate(across(everything(), function(x) str_remove(x, ".*[kpcofgs]__"))) %>% + DataFrame() + +# which are the negative control samples? +colData(tse) %>% as.data.frame() %>% filter(!week %in% c(2, 5, 52)) +# the low ls are negative controls as we can see in above table +colData(tse) %>% +as.data.frame() %>% + arrange(library_size) +# exclude negative controls and create unique sample ids indicative of week +tse <- filter(tse, neg_control == 0) %>% + mutate( + sample_ids_new = glue( + "{skippy_id}_{ifelse(week == 2, 1, + ifelse(week == 5, 2, ifelse(week == 52, 3, 'CHECK')))}{ + ifelse(duplicated, '_2', '')}")) + +# we retain 331 samples at this step +# lets change row and column names to better descriptions +colnames(tse) <- colData(tse)[, "sample_ids_new"] +save(tse, file = here::here("data/tse.Rds")) + + + +# store a file that lists all samples we have +colData(tse) %>% + as.data.frame() %>% + select(sample_ids_new, skippy_id, week, wur_id) %>% + arrange(sample_ids_new) %>% + write_excel_csv(here::here("data/processed_samples.csv")) + +# test if we need to rarefy or not: do the methods in ad_tjazi and mia +# produce different results? +tse <- estimateDiversity( + tse, + assay_name = "counts", + index = c("shannon", "faith", "inverse_simpson"), + name = c("shannon", "faith", "inverse_simpson") +) + +ad_tjazi <- Tjazi::get_asymptotic_alpha(assay(tse)) %>% + rownames_to_column("sample_id") + +colData(tse) %>% + as.data.frame() %>% + select( + library_size, + sample_id = sample_ids_new, + shannon, + inverse_simpson, + faith) %>% + full_join(ad_tjazi, by = "sample_id") %>% + select_if(is_numeric) %>% + cor() + + # colData(tse) %>% as.data.frame() %>% + # select(library_size, sample_id = sample_ids_new, shannon, inverse_simpson, faith) %>% + # full_join(ad_tjazi, by = "sample_id") %>% + # select_if(is_numeric) %>% + # select(-library_size, -Chao1) %>% + # pivot_longer(everything(), names_to = "index", values_to = "diversity") %>% + # ggplot(aes(index, diversity)) + + # geom_boxplot() + + # geom_jitter() + +# no correlation with library size + perfect correlation of shannon --> +# we can pick any method + + +# here I check if if end up with all the samples as per documented by Kelly: +samples <- tibble(sample_id = colnames(tse)) %>% + mutate( + week = str_match(sample_id, "_(\\d+)$")[, 2], + id = str_match(sample_id, "^(\\d\\d\\d)")[, 2] + ) %>% + arrange(id, week) +doc <- read_excel(here::here("data/StoolSamples_20162017_SKIPPY.xlsx")) +doc <- select(doc, + id = ID, + w2 = "Stoolsample week 2", + w5 = "Stoolsample week 5", + w52 = "Stoolsample week 52") %>% + pivot_longer(matches("w\\d"), names_to = "week", values_to = "planned") %>% + mutate( + week = str_extract(week, "\\d+"), + week = ifelse( + week == 52, 3, ifelse( + week == 5, 2, ifelse( + week == 2, 1, NA))) + ) +samples$id <- as.double(samples$id) +samples$week <- as.double(samples$week) +full_join(samples, doc, by = c("id", "week")) %>% + filter(is.na(sample_id), planned == 1) %>% + arrange(id, week) +missing_ids <- full_join(samples, doc, by = c("id", "week")) %>% + filter(is.na(sample_id), planned == 1) %>% + arrange(id, week) %>% + .$id +missing_ids %in% samples$id +filter(samples, id %in% missing_ids) %>% + arrange(id, week) + + +# there is a discrepancy in the samples that should be there and that +# are there. I will search through all map files once more but this time +# without filtering to make sure that I didnt filter them out before + +raw_path <- here::here("data/map_files") +map_files <- list.files(raw_path, pattern = "2021_\\d{4}.*.xlsx$") +map_files %>% length() +lib_nums <- str_extract(map_files, "\\d\\d\\d\\d_00\\d\\d") +library_number_merged <- c(1:6) +map_file_merged <- map2_dfr( + map_files, + library_number_merged, + function(filename, lnm) { + # extract library number + lib_num <- str_extract(filename, "\\d\\d\\d\\d_00\\d\\d") + # read provided map file + xl <- read_excel(glue("{raw_path}/{filename}")) + map_file <- xl %>% + mutate( + LibraryNumber = lnm, + ProjectName = ifelse(is.na(ProjectName), "unspecified", ProjectName) + ) %>% + select( + "sample_id" = internal_sample_id, + id = Seq_ID, + ProjectName + ) %>% + filter(!is.na(`sample_id`)) + + return(map_file)}) %>% + mutate(id2 = str_match(id, ".*(\\d\\d\\d).*")[, 2]) +filter(map_file_merged, id2 %in% missing_ids) + +# according to documentation all samples except 226 week 52 are correctly +# missing as they have never been sent to WUR + +n_total <- length(colnames(tse)) +duplicate_ids <- colnames(tse)[map_int(colnames(tse), ~str_length(.x)) > 5] +n_dupl <- length(colnames(tse)[map_int(colnames(tse), ~str_length(.x)) > 5]) +n_total - n_dupl +# how many unique samples per time point? +colData(tse) %>% + as.data.frame() %>% + filter(!sample_ids_new %in% duplicate_ids) %>% + count(week) + + +# after Kelly's latest prereg email I check now which of the ITT individuals +# are in the data + +library(foreign) +sav <- read.spss(here::here("data/kelly_documents/data_itt_pp_dr.sav"), to.data.frame = TRUE) %>% select(id = ID, itt = ITT) %>% + filter(itt != "Excl") %>% + mutate(id = as.character(id)) + +df <- colData(tse) %>% + as.data.frame() %>% + select(wur_id, id = skippy_id, week, duplicated) %>% + full_join(sav, by = "id") +ids <- filter(df, is.na(wur_id)) %>% + .$id +cat(glue("{ids},")) +filter(df, id %in% ids) + +df %>% filter(id %in% sav$id, !duplicated) %>% count(week) + + +# after all has been checked, I will now double check that only the correct +# ids remain in the tse + that I add any necessary variables for the project +load(file = here::here("data/tse.Rds")) + +colData(tse) %>% + as.data.frame() %>% + filter(skippy_id %in% sav$id, !duplicated) %>% + count(week) +tse <- filter(tse, skippy_id %in% sav$id, !duplicated) + +# apparently, there were several date files and this file has more complete data +# than the mdata below for the birthdate variable: +xl <- read_excel(here::here("data/Birthdates_SKIPPY.xlsx")) %>% + mutate(birthdate = dmy(GeboortedatumBaby)) %>% + select(id = ID, birthdate) %>% + distinct() +# fix wrong year entry for id 272 (typo) +xl[xl$id == 272, "birthdate"] <- ymd("2017-04-24") + + +# import metadata +mdata <- read_csv2(here::here("data/skippy_stool_data_cleaned.csv")) %>% + mutate_all(function(x) ifelse(x == 99999, NA, ifelse(x == 88888, NA, x))) %>% + filter(id %in% sav$id) %>% + left_join(xl, by = "id") %>% + # dates must be fixed in order to calculate age + mutate(across(contains("dat_"), function(x) { + x_new <- str_replace(x, "okt", "october") + x_new <- str_replace(x_new, "mrt", "march") + x_new <- str_replace(x_new, "mei", "may") + x_new <- str_replace(x_new, "15-jan-17/22-jan-17", "18/jan/17") + x_new <- dmy(x_new) + return(x_new) + })) + +# again a typo for dat_week2. Easier to fix manually in base R +# dat_week2 = ifelse(id == 300, dmy("25-06-2017"), dmy(dat_week2)), +mdata[mdata$id == 300, "dat_week2"] <- ymd("2017-06-25") + +mdata <- mutate(mdata, + datbirth_infant = dmy(datbirth_infant), + # now that dates are fine we can calculate age + checkold_week2 = as.numeric(dat_week2 - datbirth_infant), + checkold_week5 = as.numeric(dat_week5 - datbirth_infant), + checkold_week52 = as.numeric(dat_1year - datbirth_infant), + age_week2 = as.numeric(dat_week2 - birthdate), + age_week5 = as.numeric(dat_week5 - birthdate), + age_week52 = as.numeric(dat_1year - birthdate), + id = as.character(id)) %>% + select( + skippy_id = id, condition, csection, birthweight, + siblings, sex, bfexcl, contains("bf"), everything()) + +filter( + mdata, + age_week2 != checkold_week2 | + age_week5 != checkold_week5 | + age_week52 != checkold_week52) %>% + select(skippy_id, birthdate, datbirth_infant) + +# now we add all the meta variables to the tse +colData(tse) <- colData(tse) %>% + as.data.frame() %>% + rownames_to_column("sid") %>% + left_join(mdata, by = "skippy_id") %>% + select(-wur_id, -duplicated, -neg_control, -mock, -sample_id) %>% + select(everything(), sample_id = sample_ids_new) %>% + column_to_rownames("sid") %>% + DataFrame() + + +# this resulting file is our starting point for the analyses +# i will also save it with the mdata and sav as we need that for multiple +# imputation +save(tse, sav, mdata, file = here::here("data/data.Rds")) +select( + mdata, + birthdate, + datbirth_infant, + skippy_id, + age_week2, + age_week5, + age_week52 +) \ No newline at end of file diff --git a/R/helper_functions.R b/R/helper_functions.R new file mode 100644 index 0000000..9108a63 --- /dev/null +++ b/R/helper_functions.R @@ -0,0 +1,42 @@ +library(HDInterval) +library(tidyverse) + + +# calculate contrasts for brms models +calc_contrast <- function(tb, str_vs_str = c("con0", "con1"), ctname = str_vs_str[2]) { + group1 <- select(tb, contains(str_vs_str[1])) %>% + pivot_longer( + everything(), + names_to = "subgroups", + values_to = "value" + ) + group2 <- select(tb, contains(str_vs_str[2])) %>% + pivot_longer( + everything(), + names_to = "subgroups", + values_to = "value" + ) + contrast <- group2$value - group1$value + tb <- tibble( + name = ctname, + beta = median(contrast), + lower = hdi(contrast)[1], + upper = hdi(contrast)[2]) %>% + mutate(across(where(is.numeric), round, 2)) + return(tb) +} + + +summarise_posterior <- function(model, parameters, n_digits = 3) { + as_draws_df(model) %>% + select(all_of(parameters)) %>% + pivot_longer(everything(), names_to = "parameter", values_to = "value") %>% + group_by(parameter) %>% + summarise( + m = median(value), + sd = sd(value), + lower = hdi(value)[1], + upper = hdi(value)[2] + ) %>% + mutate(across(where(is.numeric), round, n_digits)) +} \ No newline at end of file diff --git a/R/mb_age.R b/R/mb_age.R new file mode 100644 index 0000000..f63234f --- /dev/null +++ b/R/mb_age.R @@ -0,0 +1,1293 @@ +library(glue) +library(mia) +library(readxl) +library(lubridate) +library(here) +library(ranger) +library(tidyverse) +library(brms) +library(ggbeeswarm) +library(ComplexHeatmap) +library(circlize) + +full_join <- dplyr::full_join + +# double check by using two approaches. +# APPROACH 1: combine to one TSE, then extract data: +for (file in list.files(here("data/"), pattern = "tse_\\w+.Rds")) { + load(here(glue("data/{file}"))) +} + +lnsk <- length(rownames(tse_skippy)) +lnbib <- length(rownames(tse_bibo)) +lnbin1 <- length(rownames(tse_bingo1)) +lnbin2 <- length(rownames(tse_bingo2)) +rnskippy <- glue("x{1:lnsk}") +rnbibo <- glue("x{(lnsk + 1): (lnsk +lnbib)}") +rnbingo1 <- glue("x{(lnsk + lnbib + 1): (lnsk +lnbib + lnbin1)}") +rnbingo2 <- glue("x{(lnsk + lnbib + lnbin1 + 1): (lnsk +lnbib + lnbin1 + + lnbin2)}") + +tses <- map2( + list(rnskippy, rnbibo, rnbingo1, rnbingo2), + list(tse_skippy, tse_bibo, tse_bingo1, tse_bingo2), + function(rn, tse) { + rownames(tse) <- rn + tse + } +) + +tses[[1]] %>% colnames() +skbib <- mergeSEs(tses[[1]], tses[[2]], missing_values = 0) +bin <- mergeSEs(tses[[3]], tses[[4]], missing_values = 0) +assay(bin) +tse <- mergeSEs(skbib, bin, missing_values = 0) +rowData(tse) %>% dim() +atest <- assay(tse) + +plot(density(colSums(atest))) +# now we need to replace "g__", "f__" etc. by NA in order to aggregate correctly +rowData(tse) <- rowData(tse) %>% + as_tibble() %>% + mutate(across(everything(), function(x) str_remove(x, ".*[kpcofgs]__"))) %>% + DataFrame() + +test <- agglomerateByRank(tse, rank = "Genus") +plot(density(colSums(assay(tse)))) +plot(density(assay(test)[1, ])) + +# APPROACH 2: extract data from each tse and then combine +tses <- map( + list(tse_skippy, tse_bibo, tse_bingo1, tse_bingo2), + function(tse) { + new_tse <- tse + rowData(new_tse) <- rowData(new_tse) %>% + as_tibble() %>% + mutate( + across(everything(), + function(x) str_remove(x, ".*[kpcofgs]__"))) %>% + DataFrame() + new_tse <- agglomerateByRank(new_tse, rank = "genus") + new_tse <- transformSamples(new_tse, method = "relabundance") + as.data.frame(assay(new_tse, "relabundance")) %>% rownames_to_column("taxon") + } +) + +# we can see that there are different numbers of genera in each df +map(tses, ~dim(.x)) +df <- full_join(tses[[1]], tses[[2]], by = "taxon") %>% + full_join(tses[[3]], by = "taxon") %>% + full_join(tses[[4]], by = "taxon") %>% + mutate_all(function(x) ifelse(is.na(x), 0, x)) + + +# Plots are identical (checked them with the counts but now we need relab) +# the next step is to get the age at sample collection for each study + +# SKIPPY +load(file = here::here("data/data_imp.Rds")) +skage <- select(d, skippy_id, sample_id, week, age) %>% + mutate(age = age + as.numeric(as.character(week)) * 7) %>% + select(sample_id, age) +mapfiles <- list.files(here::here("data/map_files"), full.names = TRUE) +id_swap <- map_dfr(mapfiles, ~read_excel(.x)) %>% + select(sample_id = Seq_ID, internal_sample_id) %>% + mutate( + sample_id = str_replace(sample_id, "SKIPPY_", ""), + sample_id = str_replace(sample_id, "_2", "_1"), + sample_id = str_replace(sample_id, "_52", "_3"), + sample_id = str_replace(sample_id, "_5", "_2"), + sample_id = str_replace(sample_id, "245_1_15-1-17", "245_5"), + sample_id = str_replace(sample_id, "245_1_22-1-17", "245_5_2"), + sample_id = str_replace(sample_id, "269_1_15_mei_2017", "269_5"), + sample_id = str_replace(sample_id, "269_1_18_mei_2017", "269_5_2") + ) + +id_swap %>% filter(str_detect(sample_id, "269_")) +skage <- full_join(skage, id_swap, by = "sample_id") %>% + select(sample_id = internal_sample_id, age) %>% + na.omit() +mean(skage$sample_id %in% id_swap$sample_id) + + +# BINGO +load(here::here("data/tse_bingo1.Rds")) +bin_age0 <- colData(tse_bingo1) %>% + as.data.frame() %>% + mutate( + # id = str_extract(id, "\\d\\d\\d"), + time = ifelse( + str_detect(sample_id, "k1"), "week2", ifelse( + str_detect(sample_id, "k2"), "week6", ifelse( + str_detect(sample_id, "k3"), "week12", NA + ) + ) + ) + ) %>% + select(sample_id, age) + + +bin_age1 <- read_excel(here("data/BINGO1y_ActualAge.xlsx")) %>% + select(id = FamilyID, age = "1y-actual age_corrected in days") %>% + filter(!is.na(age)) + +# we need to add the correct sample ids to the file +map_files <- c( + "2019_11_06_Sequencing sample collection_2019_0084.xlsx", + "2019_11_06_Sequencing sample collection_2019_0085.xlsx", + "2020_01_14_Sequencing sample collection_2020_0005.xlsx" +) +bingo_map <- map_dfr(map_files, function(mapfile) { + xl <- read_excel(glue(here("data/bingo_mapfiles/{mapfile}"))) %>% + select(internal_sample_id, ProjectName, Seq_ID) %>% + filter(ProjectName == "BINGO") +}) + +bingo_map_1 <- filter(bingo_map, str_detect(Seq_ID, "1_")) %>% + mutate(id = str_pad(str_remove(Seq_ID, "1_"), 3, side = "left", "0")) %>% + select(sample_id = internal_sample_id, id) +bin_age1 <- full_join(bin_age1, bingo_map_1, by = "id") %>% + select(-id) + +bin_age2 <- read_csv(here("data/bingo_age_1_3_years.csv")) %>% + select( + ID, + birthdate = "Geboortedatum baby", + coldate1 = "Poop collection tube arrives", + coldate2 = "Poop collection date" + ) %>% + filter(!is.na(ID), ID != "Test") %>% + mutate( + coldate2 = str_remove(coldate2, "^\\w+,\\s"), + coldate1 = mdy(coldate1), + coldate2 = mdy(coldate2), + birthdate = dmy(birthdate), + age = ifelse( + is.na(coldate2), + coldate1 - birthdate, + coldate2 - birthdate) + ) %>% + select(id = ID, age) + +bingo_map_2 <- filter(bingo_map, str_detect(Seq_ID, "3_")) %>% + mutate(id = str_pad(str_remove(Seq_ID, "3_"), 3, side = "left", "0")) %>% + select(sample_id = internal_sample_id, id) +bin_age2 <- full_join(bin_age2, bingo_map_2, by = "id") %>% + select(-id) + +bin_age <- bind_rows(bin_age0, bin_age1, bin_age2) %>% + na.omit() +bin_age + + + +# BIBO +# first add proper sample ids +raw_path <- here::here("data/bibo_mapfiles/") +map_files <- list.files(raw_path, pattern = ".xlsx$") +lib_nums <- str_extract(map_files, "\\d\\d\\d\\d_00\\d\\d") +library_number_merged <- seq_along(map_files) +map_file_merged <- map2_dfr( + map_files, + library_number_merged, + function(filename, lnm) { + # extract library number + lib_num <- str_extract(filename, "\\d\\d\\d\\d_00\\d\\d") + + # read provided map file + xl <- read_excel(glue("{raw_path}/{filename}")) + + # create new map file for NGtax2.0 + map_file <- xl %>% + filter( + !grepl("L\\d\\d_NC_\\d", `Seq_ID`), + Seq_ID != "empty", + ProjectName == "BIBO" + ) %>% + select(sample_id = internal_sample_id, Seq_ID) + + return(map_file) +}) + +#---code book: a, 1m; b, 3m; c, 4m; d, 6y; e, 10y. +bibo_map <- map_file_merged %>% + mutate( + id = str_extract(Seq_ID, "\\d+"), + time = str_extract(Seq_ID, "\\w{1}"), + time = ifelse(time == "a", 28, + ifelse(time == "b", 75, ifelse(time == "c", 105, NA))), + id = as.numeric(id), + time = as.numeric(time) + ) %>% + filter(time %in% c(28, 75, 105)) +bibo_age_path <- "data/bibo_age_per_sample_infancy.xlsx" +bibo_age <- read_excel(bibo_age_path) %>% + select( + id = ID, + day28 = "28 days", + day75 = "CC -2 days", + day105 = "CC+28 days" + ) %>% + mutate(id = as.integer(id)) %>% + filter(!is.na(id)) %>% + pivot_longer(contains("day"), names_to = "time", values_to = "age") %>% + mutate( + id = as.numeric(id), + time = as.numeric(str_extract(time, "\\d+")) + ) + +bibo_age <- dplyr::full_join(bibo_age, bibo_map, by = c("id", "time")) %>% + select(sample_id, age) %>% + filter(!is.na(sample_id)) + + +bibo_age + + +# now we got all ages. We need to create a df that has sample ids as rownames +# and genera as colnames + age. + +df <- full_join(tses[[1]], tses[[2]], by = "taxon") %>% + full_join(tses[[3]], by = "taxon") %>% + full_join(tses[[4]], by = "taxon") %>% + mutate_all(function(x) ifelse(is.na(x), 0, x)) %>% + column_to_rownames("taxon") +colnames(df) +# first we do prevalence filtering. +prev <- 0.1 +# for that level we will loose % of all genera --> +mean(rowSums(df[, -1]) <= 0.1) +df <- df[c(rowSums(df[, -1]) > 0.1), ] +df <- as.data.frame(t(df)) %>% + rownames_to_column("sample_id") + + + + + + + +sample_ages <- bind_rows(bin_age, bibo_age, skage) %>% na.omit() +filter(sample_ages) %>% + ggplot(aes(age)) + + geom_histogram(bins = 25) + + +d <- full_join(df, sample_ages, by = "sample_id") %>% + na.omit() + + +# now we can train the model +colnames(d) <- make.names(colnames(d)) +# in the paper they use ntree = 10k and mtry = p/3 +p <- ((ncol(d) - 2) / 3) %>% round() +# i exclude the skippy samples for training +traindata <- filter(d, !sample_id %in% skage$sample_id) +# also leave out samples that are far above the age we work with here. +traindata <- filter(traindata, age <= 1000) +testdata <- filter(d, sample_id %in% skage$sample_id) +model <- ranger( + formula = age ~ ., + data = select(traindata, -sample_id), + num.trees = 1e4, + mtry = p, + importance = "permutation" +) +model + +# calculate correlation of predictions and actual values +pred <- predict(model, data = select(testdata, -sample_id)) +testdata$pred <- pred$predictions +r <- cor.test(testdata$pred, testdata$age)$estimate +r +source(here::here("R/ml_helper.R")) + +if (!file.exists(here::here("data/rvalues.Rds"))) { + rvalues <- rf_null( + y = "age", + features = select(traindata, -sample_id, -age) %>% colnames(), + train = traindata, + test = testdata, + n_perm = 500, + ntree = 500 + ) + save(rvalues, file = here::here("data/rvalues.Rds")) + } else { + load(here::here("data/rvalues.Rds")) +} + +# pvalue +mean(rvalues > r) + + +out <- select(testdata, internal_sample_id = sample_id, pred) %>% + full_join(id_swap, by = "internal_sample_id") %>% + filter(!is.na(pred)) %>% + select(sample_id, pred) +out + + + +# now we have the data to calculate the microbiota age and MAZ + +# calculate microbiota for age z score +# first we need the median microbiota age of healthy children that are in the +# same age (same month). I.E. month 1 (2 weeks), month 2 (5 weeks) and month 12 +# (52 weeks). + +traindata$mbage <- predict(model, data = select(traindata, -sample_id))$predictions +median_mbage <- select(traindata, sample_id, age, mbage) %>% + mutate(month = as.integer(age / (365 / 12))) %>% + group_by(month) %>% + summarise(md = median(mbage), sd = sd(mbage), n = n()) +median_mbage +age_range <- ggplot(traindata, aes(age)) + + geom_histogram(bins = 50, color = "white", fill = "#606060") + + theme_bw(base_size = 25) + + scale_x_continuous(breaks = seq(0, 600, 100)) + + scale_y_continuous(breaks = seq(0, 110, 10)) + + xlab("Age") + ylab("n") +age_range + + +# the formula is: microbiota age - median microbiota age of healthy children +# of same echronological age / sd of the healthy childrens mage +maz <- select(testdata, internal_sample_id = sample_id, age, mbage = pred) %>% + left_join(id_swap, by = "internal_sample_id") %>% + select(-internal_sample_id) %>% + mutate( + month = as.integer(age / (365 / 12)) + ) + + +maz$maz <- NA + +for (i in seq_along(maz$mbage)) { + maz$maz[i] <- as.numeric((maz$mbage[i] - + median_mbage[median_mbage$month == maz$month[i], "md"]) / + median_mbage[median_mbage$month == maz$month[i], "sd"]) +} + +# for analyses it seems better to use microbiota age rather than MAZ because to calculate +# the sd, more coverage in certain months would have been needed. However, including age in +# the regression model is comparable in meaning and works fine. + + + + + + + +############################################################################### +######################### 1. ITT ############################## +############################################################################### + + + +####################### 1.1 Complete Case Analysis ############################ + +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) + +# for analyses we apply prevalence fitlering and analyze at genus level +fvars <- c("siblings", "condition") +# add metadata to tse +d <- colData(tse) %>% + as.data.frame() %>% + select(-siblings) %>% + left_join(select(d, age, sample_id, siblings), by = "sample_id") %>% + mutate(across(all_of(fvars), function(x) as.factor(x))) +d$age <- d$age + as.numeric(d$week) * 7 +d$age <- scale(d$age)[, 1] + +if(!file.exists(here::here("data/mbagedataa.Rds"))) { + d_cc <- full_join(select(d, -age), maz, by = "sample_id") %>% + mutate( + condition_label = ifelse(condition == 0, "CAU", ifelse( + condition == 1, "SSC", NA)), + week_label = glue::glue("Week {week}") + ) %>% + group_by(week) %>% + mutate( + maz_s = scale(maz)[, 1], + maz_c = maz - median(maz, na.rm = TRUE) + ) %>% + ungroup() + save(d_cc, file = here::here("data/mbagedataa.Rds")) + } else { + load(here::here("data/mbagedataa.Rds")) +} + +# create plot +mba_plot <- ggplot(d_cc, aes(condition_label, mbage, fill = condition_label)) + + geom_boxplot(outlier.alpha = 0) + + #geom_beeswarm(alpha = 0.4) + + geom_jitter(width = 0.1, size = 2) + + #stat_summary(fun.y=mean, geom="point", shape=20, size=14, color="red", fill="red") + + facet_wrap(~week_label, strip.position = "bottom") + + scale_fill_manual(values = c("#ffffff", "#c0c1c2")) + + theme_bw(base_size = 25) + + theme( + legend.position = "none", + strip.placement = "outside", + strip.background = element_blank(), + strip.text.x = element_text(size = 20) + ) + + xlab("") + ylab("Microbiota Age") +mba_plot +save(mba_plot, age_range, file = here::here("data/mba_out.Rds")) + +# how is bfexcl distruted +d_cc$bfexcl +dim(d_cc) +select(d_cc, skippy_id, bfexcl) %>% arrange(skippy_id) %>% + distinct() %>% + ggplot(aes(bfexcl)) + + geom_histogram() + + + +# check optimal model structure and then fit the models +coefs <- c( + "csection", + "birthweight_s", + "siblings", + "sex", + "apgar_5_s", + "ges_age_s", + "edlevel_s" +) + + +loo_comp <- map_dfr(coefs, function(coef) { + map_dfr(1:5, function(i) { + map_dfr(list(c(2, 5), 52), function(n_week) { + if (52 %in% n_week) { + # base model + f1 <- bf(mbage ~ age * condition) + m1 <- brm( + data = filter(implist[[i]], week == 52), + formula = f1, + file = here::here(glue("data/m1_age_imp{i}_{n_week[1]}_mbage.Rds")) + ) + loo_m1 <- add_criterion( + m1, + "loo", + file = here::here(glue("data/loo_m1_age_imp{i}_{n_week[1]}_mbage")), + moment_match = FALSE + ) + + f2 <- bf(glue("mbage ~ age * condition + {coef}")) + m2 <- brm( + data = filter(implist[[i]], week == 52), + formula = f2, + file = here::here(glue("data/m2_age_imp{i}_{coef}_{n_week[1]}_mbage.Rds")) + ) + + loo_m2 <- add_criterion( + m2, + "loo", + file = here::here(glue("data/loo_m2_age_imp{i}_{coef}_{n_week[1]}_mbage")), + moment_match = FALSE + ) + lcomp <- loo_compare(loo_m2, loo_m1) + score <- ifelse(rownames(lcomp)[1] == "loo_m1", 0, 1) + } else { + # base model + f1 <- bf(mbage ~ age * condition + (1|skippy_id)) + m1 <- brm( + data = filter(implist[[i]], week != 52), + formula = f1, + file = here::here(glue("data/m1_age_imp{i}_{n_week[1]}_mbage.Rds")) + ) + loo_m1 <- add_criterion( + m1, + "loo", + file = here::here(glue("data/loo_m1_age_imp{i}_{n_week[1]}_mbage")), + moment_match = FALSE + ) + + f2 <- bf(glue("mbage ~ age * condition + {coef} + (1|skippy_id)")) + m2 <- brm( + data = filter(implist[[i]], week != 52), + formula = f2, + file = here::here(glue("data/m2_age_imp{i}_{coef}_{n_week[1]}_mbage.Rds")) + ) + + loo_m2 <- add_criterion( + m2, + "loo", + file = here::here(glue("data/loo_m2_age_imp{i}_{coef}_{n_week[1]}_mbage")), + moment_match = FALSE + ) + lcomp <- loo_compare(loo_m2, loo_m1) + score <- ifelse(rownames(lcomp)[1] == "loo_m1", 0, 1) + } + tibble( + model = ifelse(n_week == 52, "1year", "2 and 5 weeks"), + coef = coef, + imp = i, + score = score + ) + }) + }) +}) + +group_by(loo_comp, coef, model) %>% + summarise(ss = sum(score)) + + + + + + + + + + +formula <- bf( + mbage ~ condition * age + siblings + sex + csection + (1 | skippy_id) +) +model_infancy_cc <- brm( + family = student(), + formula = formula, + data = filter(d_cc, week != 52), + file = here::here("data/mbage_infancy_cc_cov") +) + +formula <- bf(mbage ~ age + condition + siblings + sex + csection) +model_year1_cc <- brm( + family = student(), + formula = formula, + data = filter(d_cc, week == 52), + file = here::here("data/mbage_year1_cc_cov") +) + +formula <- bf(mbage ~ condition * age + siblings + sex + csection) +model_all_cc <- brm( + family = student(), + formula = formula, + data = d_cc, + file = here::here("data/mbage_all_cc_cov") +) + +summary(model_infancy_cc) +summary(model_year1_cc) +summary(model_all_cc) + +# sensitivity analyses (no covs) +formula <- bf(mbage ~ age + condition) +model_year1_cc_bez <- brm( + family = student(), + formula = formula, + data = filter(d_cc, week == 52), + file = here::here("data/mbage_year1_cc_bezcov") +) +summary(model_year1_cc_bez) + + + + + + +######################## 1.2 Multiple imputation ############################# + + +implist <- map(implist, function(imp) { + imp_new <- imp %>% + left_join( + select(d_cc, sample_id, mbage), + by = "sample_id") + mice::complete(mice::mice(imp_new)) +}) + + +# fit model +formula <- bf( + mbage ~ age * condition + siblings + sex + csection + (1 | skippy_id) +) +model_infancy <- brm_multiple( + family = student(), + formula = formula, + data = map(implist, ~filter(.x, week != 52)), + file = here::here("data/mbage_infancy_cov") +) + +formula <- bf( + mbage ~ age * condition + siblings + sex + csection + bfexcl + (1 | skippy_id) +) +model_infancy_bf <- brm_multiple( + family = student(), + formula = formula, + data = map(implist, ~filter(.x, week != 52)), + file = here::here("data/mbage_infancy_cov_bf") +) + +formula <- bf(mbage ~ age + condition + siblings + sex + csection) +model_year1 <- brm_multiple( + family = student(), + formula = formula, + data = map(implist, ~filter(.x, week == 52)), + file = here::here("data/mbage_year1_cov") +) + +post <- posterior_samples(model_year1) +HDInterval::hdi(post$b_condition1, prob = 0.95) +mean(post$b_condition1 < 0) +summary(model_year1) + +# sensitivity analyses without covs +formula <- bf(mbage ~ age + condition) +model_year1_bez <- brm_multiple( + family = student(), + formula = formula, + data = map(implist, ~filter(.x, week == 52)), + file = here::here("data/mbage_year1_bez") +) + +post <- posterior_samples(model_year1_bez) +HDInterval::hdi(post$b_condition1, prob = 0.95) +mean(post$b_condition1 < 0) + + +formula <- bf(mbage ~ age * condition + siblings + sex + csection + (1 | skippy_id)) +model_all <- brm_multiple( + family = student(), + formula = formula, + data = implist, + file = here::here("data/mbage_all_cov") +) + +summary(model_infancy) +summary(model_year1) +summary(model_all) + +formula <- bf(mbage ~ age + condition + siblings + sex + csection + bfexcl) +model_year1_bf <- brm_multiple( + family = student(), + formula = formula, + data = map(implist, ~filter(.x, week == 52)), + file = here::here("data/mbage_year1_cov_bf") +) +summary(model_year1_bf) +post <- posterior_samples(model_year1_bf) +HDInterval::hdi(post$b_condition1, prob = 0.95) +mean(post$b_condition1 < 0) +HDInterval::hdi(post$b_bfexcl, prob = 0.95) +mean(post$b_bfexcl < 0) +implist[[1]]$bfexcl + + +formula <- bf(mbage ~ age + condition + siblings + sex + csection + weaning) +model_year1_weaning <- brm_multiple( + family = student(), + formula = formula, + data = map(implist, ~filter(.x, week == 52)), + file = here::here("data/mbage_year1_cov_w") +) +summary(model_year1_weaning) + + +implist_w <- map(implist, function(d) { + d %>% mutate(months_bf_weaning = weaning - bfexcl) +}) +formula <- bf(mbage ~ age + condition + siblings + sex + csection + bfexcl + months_bf_weaning) +model_year1_weaning <- brm_multiple( + family = student(), + formula = formula, + data = map(implist_w, ~filter(.x, week == 52)), + file = here::here("data/mbage_year1_cov_months_bf_weaning") +) +summary(model_year1_weaning) + + +save(model_infancy, model_infancy_bf, model_year1, model_all, model_year1_bf, file = here::here("data/mbage_models_itt.Rds")) + + + + + + +############################################################################### +######################### 2. PP ############################## +############################################################################### + +fvars <- c("siblings", "condition", "csection", "sex") +# obtain ids that were selected for PP analyses +pp_indicator <- foreign::read.spss(here::here("data/raw_data/kelly141022/Data_ITT_PP_ExploratoryDRselections.sav"), to.data.frame = TRUE) +pp_indicator <- select(pp_indicator, skippy_id = ID, pp = PP) %>% + mutate(skippy_id = as.character(skippy_id)) +# add pp info to existing data +if (!"pp" %in% colnames(d_cc)) { + d_cc <- left_join(d_cc, pp_indicator, by = "skippy_id") +} +d_pp <- filter(d_cc, pp == 1) %>% + mutate( + across(all_of(fvars), function(x) as.factor(x)), + ges_age_s = scale(ges_age)[, 1], + birthweight_s = scale(birthweight)[, 1], + edlevel_s = scale(edlevel)[, 1] + ) +pp_indicator <- mutate(pp_indicator, skippy_id = as.integer(skippy_id)) +implist_pp <- map(implist, function(dimp) { + dimp_new <- left_join(dimp, pp_indicator, by = "skippy_id") %>% + filter(pp == 1) + dimp_new +}) + + + + +####################### 2.1 Complete Case Analysis ############################ + + +formula <- bf( + mbage ~ age * condition + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex + (1 | skippy_id) +) +model_infancy_cc_pp <- brm( + family = student(), + formula = formula, + data = filter(d_pp, week != 52), + file = here::here("data/mbage_infancy_cc_pp") +) + +formula <- bf(mbage ~ age + condition + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex) +model_year1_cc_pp <- brm( + family = student(), + formula = formula, + data = filter(d_pp, week == 52), + file = here::here("data/mbage_year1_cc_pp") +) + +summary(model_infancy_cc_pp) +summary(model_year1_cc_pp) + + + + + + + +######################## 2.2 Multiple imputation ############################# + + + + +# fit model +formula <- bf( + mbage ~ age * condition + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex + (1 | skippy_id) +) +model_infancy <- brm_multiple( + family = student(), + formula = formula, + data = map(implist_pp, ~filter(.x, week != 52)), + file = here::here("data/mbage_infancy_pp_cov") +) + +formula <- bf( + mbage ~ age * condition + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex + bfexcl + (1 | skippy_id) +) +model_infancy_bf <- brm_multiple( + family = student(), + formula = formula, + data = map(implist_pp, ~filter(.x, week != 52)), + file = here::here("data/mbage_infancy_pp_cov_bf") +) + +formula <- bf(mbage ~ age + condition + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex) +model_year1 <- brm_multiple( + family = student(), + formula = formula, + data = map(implist_pp, ~filter(.x, week == 52)), + file = here::here("data/mbage_year1_pp_cov") +) + +summary(model_infancy) +summary(model_year1) + +# same results here. In conclusion, ssc might have effect on microbiota age +# such that infants microbiota is longer infant like, potentially because of +# longer breastfeeding but this does not explain effect entirely. + + +formula <- bf(mbage ~ age + condition + siblings + bfexcl + birthweight_s + ges_age_s + edlevel_s + csection + sex) +model_year1_bf <- brm_multiple( + family = student(), + formula = formula, + data = map(implist_pp, ~filter(.x, week == 52)), + file = here::here("data/mbage_year1_bf_pp_cov") +) +summary(model_year1_bf) + +save(model_infancy, model_infancy_bf, model_year1, model_year1_bf, file = here::here("data/mbage_models_pp.Rds")) + + + +# we can argue that there is evidence for an effect of SSC on MBA +# therefore we will look at the DR analysis as well here: + +dr <- foreign::read.spss( + here::here("data/kelly_documents/data_itt_pp_dr.sav"), + to.data.frame = TRUE + ) %>% + select(skippy_id = ID, ITT, SSC = TotalSSCwk1wk5) %>% + mutate(SSC_s = scale(SSC)[, 1]) +head(dr) +ggplot(dr, aes(SSC)) + + geom_histogram() + + +implist <- map(implist, function(imp) { + imp_new <- imp %>% + dplyr::left_join( + select(dr, skippy_id, SSC_s), + by = "skippy_id") + mice::complete(mice::mice(imp_new)) +}) + + +dr$skippy_id <- as.character(dr$skippy_id) +df <- left_join(d_cc, select(dr, skippy_id, SSC_s, SSC), by = "skippy_id") +implist[[1]] %>% colnames() + +# fit model +formula <- bf( + mbage ~ SSC_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex + age + (1 | skippy_id) +) +model_infancy <- brm_multiple( + family = student(), + formula = formula, + data = map(implist, ~filter(.x, week != 52)), + file = here::here("data/mbage_infancy_dr_cov") +) + +formula <- bf(mbage ~ age + SSC_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex) +model_year1 <- brm_multiple( + family = student(), + formula = formula, + data = map(implist, ~filter(.x, week == 52)), + file = here::here("data/mbage_year1_dr_cov") +) + + + +summary(model_infancy) +summary(model_year1) + + +formula <- bf(mbage ~ age + SSC_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex + bfexcl) +model_year1_bf <- brm_multiple( + family = student(), + formula = formula, + data = map(implist, ~filter(.x, week == 52)), + file = here::here("data/maz_year1_bf_dr_cov") +) +summary(model_year1_bf) + + +# only within SSC + +formula <- bf(mbage ~ age + SSC_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex) +model_year1_within <- brm_multiple( + family = student(), + formula = formula, + data = map(implist, ~filter(.x, week == 52, condition == 1)), + file = here::here("data/mbage_year1_dr_within_cov") +) + + + +summary(model_year1_within) + + + + +formula <- bf(mbage ~ age + SSC_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex) +model_year1_out <- brm_multiple( + family = student(), + formula = formula, + data = map(implist, ~filter(.x, week == 52, condition == 0)), + file = here::here("data/mbage_year1_dr_out_cov") +) + +summary(model_year1_out) + +save(model_year1, model_year1_within, model_year1_bf, file = here::here("data/mbage_models_dr.Rds")) + + +filter(d_cc, is.na(condition)) +ssc_plot1 <- full_join(d_cc, dr, by = "skippy_id") %>% + select(skippy_id, condition, SSC) %>% + mutate( + condition_label = ifelse(condition == 0, "CAU", ifelse( + condition == 1, "SSC", NA)), + SSC_h = SSC/60) %>% + distinct(.keep_all = TRUE) %>% + filter(skippy_id %in% d_cc$skippy_id) %>% + ggplot(aes(condition_label, SSC_h, fill = condition_label)) + + geom_boxplot(outlier.alpha = 0) + + #geom_beeswarm() + + geom_jitter(width = 0.1, size = 2) + + #scale_fill_manual(values = c("#fc8d62", "#8da0cb")) + + scale_fill_manual(values = c("#ffffff", "#c0c1c2")) + + theme_bw(base_size = 25) + + theme( + legend.position = "none", + strip.placement = "outside", + strip.background = element_blank()) + +xlab("") + ylab("SSC Total Hours") + + +ssc_plot2 <- full_join(d_cc, dr, by = "skippy_id") %>% + select(skippy_id, condition, SSC, mbage, week) %>% + mutate( + condition_label = ifelse(condition == 0, "CAU", ifelse( + condition == 1, "SSC", NA)), + SSC_h = SSC/60) %>% + filter(skippy_id %in% d_cc$skippy_id, week == 52) %>% + ggplot(aes(SSC_h, mbage)) + + geom_point(size = 4) + + geom_smooth(method = "lm", se = TRUE) + + geom_vline(xintercept = 1455.561/60, linetype = "dashed") + + scale_fill_manual(values = c("#ffffff", "#c0c1c2")) + + #scale_fill_manual(values = c("#fc8d62", "#8da0cb")) + + #scale_color_manual(values = c("#fc8d62", "#8da0cb")) + + theme_bw(base_size = 25) + + theme( + legend.position = "none", + strip.placement = "outside", + strip.background = element_blank()) + +xlab("SSC Total Hours") + ylab("Microbiota Age") +ssc_plot1 +ssc_plot2 + +save(ssc_plot1, ssc_plot2, file = here::here("data/sscplot.Rds")) + + + + + +implist <- map(implist, ~mutate(.x, upper = ifelse(SSC_s <= -0.5, "no", "yes"))) + + +formula <- bf(mbage ~ age + upper + siblings + sex + csection) +model_year1_cat <- brm_multiple( + family = student(), + formula = formula, + data = map(implist, ~filter(.x, week == 52)), + file = here::here("data/mbage_year1_dr_cat_cov") +) + +model_year1_cat +implist[[1]] %>% colnames() +test <- implist[[1]] +test$rs <- resid(lm(mbage ~ age, data = implist[[1]])) + +ggplot(test, aes(SSC_s, rs)) + + geom_point() + + geom_smooth() + + + + + + + + + + +################################################################################ +############################## make heatmap from DAA and RF #################### +################################################################################ + +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) +source(here::here("R/ml_helper.R")) + +files <- c( + here::here("data/maaslin2_tables_itt.Rds"), + # here::here("data/ancom_tables_itt.Rds"), + here::here("data/tables_linda_itt.Rds") +) +for (file in files) load(file) + +linda_identified +maaslin2_identified + + +plot_importance(model, top_n = 20) + +rf_feats <- as.character(arrange(extract_importance(model, n = 20), desc(importance))$features) +plot_importance(model, top_n = 25) + theme_bw(base_size = 25) +# plot relabundance of the features but only for those who are nonzero in SKIPPY +rf_feats <- map_chr(rf_feats, function(feat) { + out <- NA + if(sum(testdata[[feat]]) > 0) { + out <- feat + } + out}) %>% + na.omit() %>% + as.character() +rf_feats + + +msl <- make.names(maaslin2_identified$feature) %>% str_replace("genus.Eubacterium._hallii_group", "genus..Eubacterium._hallii_group") +lnd <- make.names(linda_identified$taxon) %>% str_replace("genus.Eubacterium._hallii_group", "genus..Eubacterium._hallii_group") +msl +feats <- union(union(rf_feats, lnd), msl) +feats + +# now extract clr values for the heatmap +tse <- agglomerateByRank(tse, rank = "genus") +tse <- transformSamples(tse, method = "relabundance") +tse <- transformSamples(tse, abund_values = "relabundance", method = "clr", pseudocount = 1) +hmd <- as.data.frame(assay(tse, "clr")) %>% rownames_to_column("taxon") +hmd$taxon <- make.names(hmd$taxon) +hmd <- filter(hmd, taxon %in% feats) %>% + mutate( + RF = ifelse(taxon %in% rf_feats, 1, 0), + Maaslin2 = ifelse(taxon %in% msl, 1, 0), + LinDA = ifelse(taxon %in% lnd, 1, 0), +) +side_vars <- select(hmd, taxon, RF, Maaslin2, LinDA) %>% + mutate(score = RF + Maaslin2 + LinDA) %>% + arrange(desc(score)) + # the genus must be non-zero and identifiable in SKIPPY. + +side_vars +hmd <- select(hmd, -RF, -Maaslin2, -LinDA) %>% + column_to_rownames("taxon") %>% + t() %>% + as.data.frame() %>% + mutate(across(all_of(feats), function(x) scale(x)[, 1])) %>% + rownames_to_column("sample_id") +temp <- colData(tse) %>% + as.data.frame() %>% + mutate(Infancy = ifelse(week == 52, "Late Infancy", "Early Infancy")) %>% + select(sample_id, condition, Infancy) +hmd <- left_join(hmd, temp, by = "sample_id") + + +side_vars +hmd <- arrange( + hmd, + Infancy, + condition, + genus.Faecalibacterium, + genus.Megasphaera, + genus.Bacteroides, + genus.Flavonifractor, + genus.Rothia, + genus..Eubacterium._hallii_group, + genus.Bifidobacterium, + genus.Parabacteroides, + genus.Enterococcus, + genus.Erysipelatoclostridium, + genus.Blautia, + family.Lachnospiraceae, + genus.Butyricicoccus, + genus.Actinomyces, + genus.Staphylococcus, + genus.Anaerostipes, + genus.Dialister, + genus.Lacticaseibacillus, + genus.Lachnospira, + genus.Lachnospiraceae_UCG.004, + genus.Monoglobus +) + +sample_id <- hmd$sample_id +clu <- hmd$condition +age <- hmd$Infancy + +method_label <- mutate( + side_vars, + p1 = ifelse(RF, "R, ", ""), + p2 = ifelse(Maaslin2, "M, ", ""), + p3 = ifelse(LinDA, "L, ", ""), + label = glue("{p1}{p2}{p3}"), + label = str_replace(label, ",\\s+$", ""), + rownames = glue('"{taxon}"^"{label}",')) + +n <- length(method_label$label) +split <- method_label$label[n:1] +split +method_label +df_fin <- hmd[, side_vars$taxon] +df_fin <- t(df_fin) +# sort only based on small matrix +colnames(df_fin) <- clu + +mat <- df_fin +# Legend color +nvec <- seq(-1, 1, length.out = 8) +col_fun <- colorRamp2(nvec, + c( + "#2166ac", + "#4393c3", + "#92c5de", + "#d1e5f0", + "#fddbc7", + "#f4a582", + "#d6604d", + "#b2182b" + )) + + +# Annotation +ha <- HeatmapAnnotation( + SSC = clu, + col = list( + SSC = c("0"="#B0B0B0", "1"="#000000") + ), + annotation_name_side = "left" + ) + + +# Heatmap +png(here::here("fig/rf_feats.png"), + width = 50, + height = 40, + units = "cm", + res= 900) + + Heatmap(mat, + width = unit(32, "cm"), + height = unit(32, "cm"), + top_annotation = ha, + name = "RA", + col = col_fun, + column_title = "", + column_title_gp = gpar(fontsize = 18, fontface = "bold"), + row_title = "", + row_title_side = "right", + row_title_gp = gpar(fontsize = 18, fontface = "bold"), + row_names_gp = gpar(fontsize = 15, fontface = "italic"), + row_names_side = "left", + show_column_names = F, + border = T, + # rect_gp = gpar(col = "white", lwd = 0), + cluster_rows = F, + cluster_columns = F, + show_heatmap_legend = TRUE, + heatmap_legend_param = list(ncol =1), + # row_split = split, + # row_title_rot = 0, + # row_order = side_vars$taxon, + row_labels = expression( + "Faecalibacterium"^"R, M, L", + "Megasphaera"^"M, L", + "Bacteroides"^"M, L", + "Flavonifractor"^"M, L", + "Rothia"^"M, L", + "Eubacterium hallii group"^"R, M", + "Bifidobacterium"^"R", + "Parabacteroides"^"L", + "Enterococcus"^"L", + "Erysipelatoclostridium"^"L", + "Blautia"^"R", + "family:Lachnospiraceae"^"R", + "Butyricicoccus"^"R", + "Actinomyces"^"R", + "Staphylococcus"^"R", + "Anaerostipes"^"R", + "Dialister"^"R", + "Lacticaseibacillus"^"M", + "Lachnospira"^"R", + "Lachnospiraceae UCG 004"^"R", + "Monoglobus"^"R" + ), + column_split = age, + ) + dev.off() + +pdf(here::here("fig/rf_feats.pdf"), width = 25, height = 20) + Heatmap(mat, + width = unit(32, "cm"), + height = unit(32, "cm"), + top_annotation = ha, + name = "RA", + col = col_fun, + column_title = "", + column_title_gp = gpar(fontsize = 18, fontface = "bold"), + row_title = "", + row_title_side = "right", + row_title_gp = gpar(fontsize = 18, fontface = "bold"), + row_names_gp = gpar(fontsize = 15, fontface = "italic"), + row_names_side = "left", + show_column_names = F, + border = T, + # rect_gp = gpar(col = "white", lwd = 0), + cluster_rows = F, + cluster_columns = F, + show_heatmap_legend = TRUE, + heatmap_legend_param = list(ncol =1), + # row_split = split, + # row_title_rot = 0, + # row_order = side_vars$taxon, + row_labels = expression( + "Faecalibacterium"^"R, M, L", + "Megasphaera"^"M, L", + "Bacteroides"^"M, L", + "Flavonifractor"^"M, L", + "Rothia"^"M, L", + "Eubacterium hallii group"^"R, M", + "Bifidobacterium"^"R", + "Parabacteroides"^"L", + "Enterococcus"^"L", + "Erysipelatoclostridium"^"L", + "Blautia"^"R", + "family:Lachnospiraceae"^"R", + "Butyricicoccus"^"R", + "Actinomyces"^"R", + "Staphylococcus"^"R", + "Anaerostipes"^"R", + "Dialister"^"R", + "Lacticaseibacillus"^"M", + "Lachnospira"^"R", + "Lachnospiraceae UCG 004"^"R", + "Monoglobus"^"R" + ), + column_split = age, + ) + dev.off() + +side_vars + + +# further data exploration for discussion +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) + +tse <- agglomerateByRank(tse, rank = "genus") +tse2 <- tidySummarizedExperiment::filter(tse, week == 2) +tse5 <- tidySummarizedExperiment::filter(tse, week == 5) +tse52 <- tidySummarizedExperiment::filter(tse, week == 52) +prevs <- map(list(tse2, tse5, tse52), function(tse) { + prev <- getPrevalence( + tse, + detection = 1, + sort = TRUE, + assay.type = "counts", + as_relative = FALSE + ) + prev[names(prev) == "genus:Faecalibacterium"] +}) +prevs + +cd <- colData(tse) %>% as.data.frame() %>% + select(skippy_id, sample_id, week, condition, bfexcl, siblings) +a <- as.data.frame(t(assay(tse, "counts"))) %>% + rownames_to_column("sample_id") %>% + select(sample_id, "genus:Faecalibacterium") + +df <- full_join(cd, a, by = "sample_id") %>% + rename(faecalibacterium = `genus:Faecalibacterium`) %>% + mutate( + present = ifelse(faecalibacterium > 0, 1, 0), + l_faecalibacterium = log((faecalibacterium + 1)) + ) +group_by(df, week, condition) %>% + summarise(m = mean(present)) + +summary(lm(l_faecalibacterium ~ siblings + bfexcl, data = df)) diff --git a/R/ml_helper.R b/R/ml_helper.R new file mode 100644 index 0000000..9ab353c --- /dev/null +++ b/R/ml_helper.R @@ -0,0 +1,881 @@ +library(tidyverse) +library(furrr) +library(mlr) +library(tuneRanger) +library(ranger) +library(patchwork) + + +######################### +### ML General ### -------------------------------------- +######################### + +# get very rough estimate of a good ntree/computation trade off +plot_ntree <- function(x, y, ntree = 1e4) { + ntreeplot <- randomForest::randomForest( + x = x, + y = y, + ntree = ntree + ) + plot(ntreeplot) +} + +# returns df of our eval metrics (logloss and F1) +model_eval <- function( + model, + testdata, + features, + y, + model_type = "ranger", + classification = TRUE, + null_test = FALSE, + null_dist = if(null_test) null_dist else NULL + ) { + + if (classification) { + + # we need the groups as 0/1 values + y_true <- as.numeric(testdata[[y]]) -1 + + # obtain predictions (this is little different for each algorithm) + # randomForest + if (model_type == "randomForest") { + y_pred_resp <- predict(model, newdata = testdata, type = "response") + y_pred_resp <- as.numeric(y_pred_resp) -1 + y_pred_prob <- predict(model, newdata = testdata, type = "prob")[, 2] + # ranger + } else if (model_type == "ranger") { + y_pred_resp <- predict(model, data = testdata)$predictions + y_pred_resp <- as.numeric(y_pred_resp) -1 + # to obtain prob you need to fit a prob tree to begin with + y_pred_prob <- predict(model, data = testdata)$predictions + } + # for xgb models we need a xgb.DMatrix + } else if (model_type == "XGBoost") { + testdata_xgb <- select(testdata, all_of(features)) %>% as.matrix() + testdata_xgb <- xgb.DMatrix(data = testdata_xgb, label = y_true) + y_pred_prob <- predict(model, testdata_xgb) + y_pred_resp <- ifelse(y_pred_prob == 0.5, + rbinom(n = 1, size = 1, p = 0.5), ifelse(y_pred_prob > 0.5, + 1, 0)) + + # logloss + log_l <- MLmetrics::LogLoss(y_pred_prob, y_true) + + # F1 scores + f_one <- MLmetrics::F1_Score( + factor(y_true, levels = c("0", "1")), + factor(y_pred_resp, levels = c("0", "1")) + ) + metric <- tibble(logloss = log_l, F1 = f_one) + return(metric) + } else { + if (model_type == "randomForest") { + y_pred <- predict(model, newdata = testdata, type = "response") + # ranger + } else if (model_type == "ranger") { + y_pred <- predict(model, data = testdata)$predictions + } + + p <- cor.test(testdata[[y]], y_pred) + if (null_test) { + p_value <- mean(null_dist > p[4]$estimate) + } + p <- round(p[4]$estimate, 3) + rsq <- mean(model$rsq) %>% round(3) + + metric <- tibble( + r = p, + rsq = rsq, + p_value = p_value) + return(metric) + } + + + +} + + +# returns null distribution or pearson cor for given test data +rf_null <- function( + y, + features, + train = train, + test = test, + n_perm = 500, + ntree = 500 + ) { + + p_null <- future_map_dbl(c(1:n_perm), function(iter) { + # permute outcome + train_perm = train + test_perm <- test + train_perm$y_perm <- sample( + train[[y]], + replace = FALSE, + size = dim(train)[1]) + train_perm <- select(train_perm, -all_of(y)) + + test_perm$y_perm <- sample( + test[[y]], + replace = FALSE, + size = dim(test)[1]) + test_perm <- select(test_perm, -all_of(y)) + + # fit null model ranger + null_model <- ranger::ranger( + y = train_perm$y_perm, + x = select(train, all_of(features)), + num.trees = ntree, + importance = "none", + probability = FALSE, + mtry = 54 # adapted for this specific manuscript, see mbage script + ) + + # obtain pearson + y_pred <- predict(null_model, data = test_perm)$predictions + pearson_r <- cor.test(test_perm$y_perm, y_pred) + return(pearson_r$estimate[[1]]) + }) + return(p_null) +} + + +# # returns a list of lists where each list has a fitted model and the +# # corresponding testdata as items +# fit_cv <- function( +# data, +# features, +# y, +# method = "cv", +# p = ifelse(method == "resample", 0.8, NULL), +# k = 10, +# model_type = "ranger", +# null_test = FALSE, +# n_perm = if (null_test) 500 else NULL, +# ... +# ) { +# +# dots <- list(...) +# +# # cv/resample +# if (method == "cv") { +# train_indeces <- caret::createFolds( +# data[[y]], +# k = k, +# returnTrain = TRUE) +# +# } else if (method == "resample") { +# train_indeces <- caret::createDataPartition( +# data[[y]], +# p = p, +# times = k) +# } +# +# +# # this will return a list of lists that each contain a fitted model and +# # the corresponding test dataset +# models_and_testdata <- map(train_indeces, function(ind) { +# train <- data[ind, ] +# test <- data[-ind, ] +# +# +# +# # fit randomForest +# if (model_type == "randomForest") { +# model <- randomForest::randomForest( +# y = train[[y]], +# x = select(train, all_of(features)), +# ntree = dots$ntree, +# importance = "permutation" +# ) +# } else if (model_type == "ranger") { +# model <- ranger::ranger( +# y = train[[y]], +# x = select(train, all_of(features)), +# ntree = dots$ntree, +# importance = "permutation", +# probability = probability +# ) +# } else if (model_type == "XGBoost") { +# # prepare xgb data matrix object +# labels_train <- train[[y]] %>% as.numeric() -1 # one-hot-coding +# labels_test <- test[[y]] %>% as.numeric() -1 +# train_xgb <- select(train, all_of(features)) %>% as.matrix() +# test_xgb <- select(test, all_of(features)) %>% as.matrix() +# train_xgb <- xgb.DMatrix(data = train_xgb, label = labels_train) +# test_xgb <- xgb.DMatrix(data = test_xgb, label = labels_test) +# +# # set model parameters (this should be put in ... at some point) +# params <- list( +# booster = "gbtree", +# objective = "binary:logistic", +# eta = 0.3, +# gamma = 0, +# max_depth = 6, +# min_child_weight = 1, +# subsample = 1, +# colsample_bytree = 1 +# ) +# +# # fit model +# model <- xgb.train( +# params = params, +# data = train_xgb, +# nrounds = 10, +# watchlist = list(val = test_xgb, train = train_xgb), +# print_every_n = 10, +# early_stop_round = 10, +# maximize = FALSE, +# eval_metric = "logloss", +# verbose = 0 +# ) +# } +# +# if (null_test) { +# null_dist <- rf_null( +# y, +# features, +# train, +# test, +# ntree = dots$ntree, +# n_perm = n_perm +# ) +# +# return(list(model, test, null_dist)) +# } else { +# # return fitted model and corresponding test data set +# list(model, test) +# } +# +# }) +# return(models_and_testdata) +# } + + + + + + + +# summarises eval metrics +summarize_metrics <- function( + models_and_data, + y, + model_type = "ranger", + features = features, + classification = TRUE + ) { + map_dfr(models_and_data, function(model_and_data) { + model <- model_and_data[[1]] + testdata <- model_and_data[[2]] + model_eval(model, testdata, features = features, y = y, model_type = model_type, classification = classification) + }) %>% + gather(metric, value) %>% + group_by(metric) %>% + summarise(mean = mean(value), sd = sd(value)) %>% + mutate_if(is.numeric, round, 2) +} + +plot_importance <- function(model, top_n = NULL) { + var_imp <- importance(model, type = 1) + var_imp <- var_imp %>% as.data.frame() + imp_name <- colnames(var_imp)[1] + + var_imp <- var_imp %>% + rownames_to_column("features") %>% + select(features, importance = all_of(imp_name)) %>% + arrange(importance) %>% + mutate(features = factor(features, level = features)) + if (!is.null(top_n)) { + var_imp <- tail(var_imp, top_n) + } + ggplot(var_imp, aes(features, importance)) + + geom_col() + + coord_flip() + +} + +extract_importance <- function(model, n = 10) { + var_imp <- importance(model, type = 1) + var_imp <- var_imp %>% as.data.frame() + imp_name <- colnames(var_imp)[1] + var_imp <- var_imp %>% + rownames_to_column("features") %>% + select(features, importance = all_of(imp_name)) %>% + arrange(importance) %>% + mutate(features = factor(features, level = features)) %>% + tail(n) + return(var_imp) +} + + + + +######################### +### Random Forests ### -------------------------------------- +######################### + + +# Feature selection based on RF importance scores. +# models_and_data is a list of list where each list contains a model object [1] +# and the corresponding testdata [2] According to workflow in this script +select_features <- function( + models_and_data, + id_name = "id", + n_features = 50) { + top_predictors <- map(models_and_data, function(model_and_data) { + model <- model_and_data[[1]] + + + top_predictors <- importance(model, type = 1) %>% + as.data.frame() + colnames(top_predictors) <- "importance" + #imp_name <- colnames(top_predictors)[1] + top_predictors <- top_predictors %>% + rownames_to_column(id_name) %>% + arrange(desc(importance)) %>% + select(all_of(id_name)) %>% + head(n_features) + } + ) + + # only intersection of all k model is used + selected_features <- Reduce(intersect, top_predictors) + return(selected_features) +} + + + +tune_rf <- function( + data, + features, + y, + regression = TRUE, + measure = NULL, + iters = 70, + iters.warmup = 30, + time.budget = NULL, + num.threads = NULL, + ntree = 1000, + parameters = list( + replace = FALSE, + respect.unordered.factors = "order" + ), + tune.parameters = c( + "mtry", + "min.node.size", + "sample.fraction" + ), + save.file.path = NULL, + build.final.model = FALSE, + show.info = getOption("mlrMBO.show.info", TRUE) + ) { + # names must be compatible with mlr + d <- select(data, all_of(features), all_of(y)) + colnames(d) <- make.names(colnames(d), unique = T) + if (regression) { + tune_task <- makeRegrTask( + data = d, + target = y + ) + } else { + tune_task <- makeClassifTask( + data = d, + target = y + ) + } + + #estimateTimeTuneRanger(tune_task) + res <- tuneRanger( + tune_task, + measure = measure, + iters = iters, + iters.warmup = iters.warmup, + time.budget = time.budget, + num.threads = num.threads, + num.trees = ntree, + parameters = parameters, + tune.parameters = tune.parameters, + save.file.path = save.file.path, + build.final.model = build.final.model, + show.info = show.info + ) + res +} + + + + +rf_cv <- function( + data, + features, + y, + method = "cv", + p = ifelse(method == "resample", 0.8, NULL), + k = 10, + ntree = 500, + null_test = FALSE, + n_perm = if (null_test) 500 else NULL, + regression = TRUE, + probability = ifelse(regression, FALSE, TRUE), + ... + ) { + + # additional arguments for ranger + dots <- list(...) + ranger_params <- list() + if ("mtry" %in% names(dots)) { + ranger_params[["mtry"]] <- dots[["mtry"]] + } else { + ranger_params[["mtry"]] <- NULL + } + if ("min.node.size" %in% names(dots)) { + ranger_params[["min.node.size"]] <- dots[["min.node.size"]] + } else { + ranger_params[["min.node.size"]] <- NULL + } + if ("replace" %in% names(dots)) { + ranger_params[["replace"]] <- dots[["replace"]] + } else { + ranger_params[["replace"]] <- FALSE + } + if ("sample.fraction" %in% names(dots)) { + ranger_params[["sample.fraction"]] <- dots[["sample.fraction"]] + } else { + ranger_params[["sample.fraction"]] <- ifelse(ranger_params[["replace"]], 1, 0.632) + } + if ("splitrule" %in% names(dots)) { + ranger_params[["splitrule"]] <- dots[["splitrule"]] + } else { + ranger_params[["splitrule"]] <- NULL + } + if ("num.random.splits" %in% names(dots)) { + ranger_params[["num.random.splits"]] <- dots[["num.random.splits"]] + } else { + ranger_params[["num.random.splits"]] <- 1 + } + if ("scale.permutation.importance" %in% names(dots)) { + ranger_params[["scale.permutation.importance"]] <- dots[["scale.permutation.importance"]] + } else { + ranger_params[["scale.permutation.importance"]] <- FALSE + } + if ("importance" %in% names(dots)) { + ranger_params[["importance"]] <- dots[["importance"]] + } else { + ranger_params[["importance"]] <- "permutation" + } + + + + # cv/resample + if (method == "cv") { + train_indeces <- caret::createFolds( + data[[y]], + k = k, + returnTrain = TRUE) + + } else if (method == "resample") { + train_indeces <- caret::createDataPartition( + data[[y]], + p = p, + times = k) + } + + + map(train_indeces, function(ind) { + train <- data[ind, ] + test <- data[-ind, ] + model <- ranger::ranger( + y = train[[y]], + x = select(train, all_of(features)), + num.trees = ntree, + importance = ranger_params[["importance"]], + probability = probability, + mtry = ranger_params[["mtry"]], + min.node.size = ranger_params[["min.node.size"]], + replace = ranger_params[["replace"]], + sample.fraction = ranger_params[["sample.fraction"]], + splitrule = ranger_params[["splitrule"]], + num.random.splits = ranger_params[["num.random.splits"]], + scale.permutation.importance = ranger_params[["scale.permutation.importance"]] + ) + + if (null_test) { + null_dist <- rf_null( + y, + features, + train, + test, + ntree = ntree, + n_perm = n_perm + ) + + return(list(model, test, null_dist)) + } else { + # return fitted model and corresponding test data set + list(model, test) + } + }) + } + + +# repeated cv/resampling +rf_rcv <- function( + data, + features, + y, + method = "cv", + p = ifelse(method == "resample", 0.8, NULL), + k = 10, + ntree = 500, + null_test = FALSE, + n_perm = if (null_test) 500 else NULL, + regression = TRUE, + probability = ifelse(regression, FALSE, TRUE), + repeated = 10) { + + all_model_and_data <- map(c(1:repeated), function(rep) { + model_and_data <- rf_cv( + data, + features, + y, + method = method, + p = p, + k = k, + ntree = ntree, + null_test = null_test, + n_perm = n_perm, + regression = regression, + probability = probability + ) + }) + flatten(all_model_and_data) + } + +rf_model_fit <- function( + models_and_data, + y, + regression = TRUE, + null_test = FALSE + ) { + p <- map(models_and_data, function(model_and_data) { + + model <- model_and_data[[1]] + test <- model_and_data[[2]] + if (null_test) { + null_dist <- model_and_data[[3]] + } + if (regression) { + y_pred <- predict(model, data = test)$predictions + p <- cor.test(test[[y]], y_pred) + rsq <- model$r.squared %>% round(3) + if (null_test) { + p_value <- mean(null_dist > p[4]$estimate) + list(round(p[4]$estimate, 3), rsq, p_value) + } else { + list(round(p[4]$estimate, 3), rsq) + } + + } else { + y_true <- as.numeric(test[[y]]) -1 + # only works if you specified ranger as probability tree + y_pred_prob <- predict(model, data = test)$predictions + log_l <- MLmetrics::LogLoss(y_pred_prob[, 2], y_true) + oob <- model$prediction.error + metric <- tibble(oob = oob, log_l = log_l) + list(metric) + } + }) + p +} + +rf_summary <- function( + data, + features, + y, + p = 0.8, + k = 10, + ntree = 500, + regression = TRUE, + null_test = FALSE, + probability = ifelse(regression, FALSE, TRUE) + ) { + model_and_data <- rf_cv( + data, + features, + y, + p = p, + k = k, + ntree = ntree, + null_test = null_test + ) + metric <- rf_model_fit( + model_and_data, + y = y, + regression = regression, + null_test = null_test + ) + if (regression) { + p <- map_dfr(metric, function(list) { + list[[1]] + }) %>% gather(sample, value) %>% + summarise(mean = mean(value), median = median(value), sd = sd(value)) + + rsq <- map_dfr(metric, function(list) { + list[[2]] + }) %>% gather(sample, value) %>% + summarise(mean = mean(value), median = median(value), sd = sd(value)) + + if (null_test) { + df1 <- map_dfr(metric, function(list) { + list[[1]] + }) %>% gather(sample, r) + df2 <- map_dfr(metric, function(list) { + list[[3]] + }) %>% gather(sample, p_value) + p_value <- bind_cols(df1, df2) %>% filter(r == median(r)) %>% + .$p_value + p_value <- p_value[1] + } + + if (null_test) { + list("p" = p, "rsq" = rsq, "p_value" = p_value) + } else { + list("p" = p, "rsq" = rsq) + } + + + } else { + map_dfr(metric, ~bind_rows(.x)) %>% + gather(statistic, value) %>% + group_by(statistic) %>% + summarise( + median = median(value), + sd = sd(value), + lower = quantile(value, 0.025), + upper = quantile(value, 0.975) + ) %>% + mutate_if(is.numeric, round, 2) + } + } + + + + +get_oob <- function(model_and_data, summarise = TRUE) { + metric <- map_dfc(model_and_data, function(md) { + md[[1]]$prediction.error + }) %>% pivot_longer(everything(), names_to = "fold", values_to = "oob") + + if (summarise) { + metric <- metric %>% + summarise( + median = median(oob), + mean = mean(oob), + sd = sd(oob), + lower = quantile(oob, 0.025), + upper = quantile(oob, 0.975) + ) + } + metric +} + +# thanks to Artem Sokolov: https://stackoverflow.com/questions/45676745/how-to-calculate-the-auc-value-for-a-ranger-rf-model +auc_roc <- function(scores, labels){ + stopifnot( length(scores) == length(labels) ) + jp <- which( labels > 0 ); np <- length( jp ) + jn <- which( labels <= 0); nn <- length( jn ) + s0 <- sum( rank(scores)[jp] ) + (s0 - np*(np+1) / 2) / (np*nn) +} + + +get_auc <- function(model_and_data, y, summarise = TRUE) { + metric <- map_dfr(model_and_data, function(md) { + fit <- md[[1]] + test <- md[[2]] + y_pred <- predict(fit, data = test)$predictions[, 2] + y_true <- as.numeric(test[[y]]) - 1 + list(auc = auc_roc(y_pred, y_true)) + }) %>% pivot_longer(everything(), names_to = "fold", values_to = "auc") + + if (summarise) { + metric <- metric %>% + summarise( + median = median(auc), + mean = mean(auc), + sd = sd(auc), + lower = ifelse(berryFunctions::is.error(quantile(auc, 0.025)), NA, quantile(auc, 0.025)), + upper = ifelse(berryFunctions::is.error(quantile(auc, 0.975)), NA, quantile(auc, 0.975)) + ) + } + metric +} + +get_pearson <- function(model_and_data, y, summarise = TRUE) { + metric <- map_dfr(model_and_data, function(md) { + fit <- md[[1]] + test <- md[[2]] + y_pred <- predict(fit, data = test)$predictions + y_true <- test[[y]] + p <- cor.test(y_true, y_pred) + list(pearson = round(p[4]$estimate, 3)) + }) %>% pivot_longer(everything(), names_to = "fold", values_to = "pearson") + + if (summarise) { + metric <- metric %>% + summarise( + median = median(pearson), + mean = mean(pearson), + sd = sd(pearson), + lower = ifelse(berryFunctions::is.error(quantile(pearson, 0.025)), NA, quantile(pearson, 0.025)), + upper = ifelse(berryFunctions::is.error(quantile(pearson, 0.975)), NA, quantile(pearson, 0.975)) + ) + } + metric +} + + +get_rsq <- function(model_and_data, y, summarise = TRUE) { + metric <- map_dfr(model_and_data, function(md) { + fit <- md[[1]] + list(rsq = fit$r.squared) + }) %>% pivot_longer(everything(), names_to = "fold", values_to = "rsq") + + if (summarise) { + metric <- metric %>% + summarise( + median = median(rsq), + mean = mean(rsq), + sd = sd(rsq), + lower = quantile(rsq, 0.025), + upper = quantile(rsq, 0.975) + ) + } + metric +} + + + + + +######################### +### Regression ### -------------------------------------- +######################### + +# to plot simple regression or counterfactual plots +# model is brms model (might work with other lm models too) +# specify x2 for counterfactual plots +plot_regression <- function( + model, x, y, + points = TRUE, + counterfactual = FALSE, + x2 = NULL) { + + + n <- length(model$data[[x2]]) + if (counterfactual) { + newdata <- tibble( + x_rep = seq( + from = min(model$data[[x]]), + to = max(model$data[[x]]), + length.out = n), + x2_rep = mean(model$data[[x2]]) + ) + colnames(newdata) <- c(x, x2) + } else { + newdata <- tibble( + x_rep = seq( + from = min(model$data[[x]]), + to = max(model$data[[x]]), + length.out = n) + ) + colnames(newdata) <- c(x) + } + + df <- fitted(model, newdata = newdata) %>% + as_tibble() %>% + rename( + f_ll = Q2.5, + f_ul = Q97.5 + ) + y_pred <- predict(model, newdata = newdata) %>% + as_tibble() %>% + transmute(p_ll = Q2.5, p_ul = Q97.5) + df <- bind_cols(newdata, y_pred, df) + + if(!counterfactual) { + p <- ggplot(df, aes_string(x, "Estimate")) + + geom_smooth(aes(ymin = f_ll, ymax = f_ul), stat = "identity") + + } else if(counterfactual) { + + p <- ggplot(df, aes_string(x = x, y = "Estimate")) + + geom_ribbon(aes(ymin = p_ll, ymax = p_ul), alpha = 1/5) + + geom_smooth(aes(ymin = f_ll, ymax = f_ul), stat = "identity") + + coord_cartesian(xlim = range(model$data[[x]])) + } + + # add real data points + if(points) { + p <- p + geom_point(data = model$data, aes_string(x, y)) + } + + return(p) +} + + + +# diagnostic plots for frequentist regression (lm or lme4) +lm_diag <- function(model, data, Y, id = "id", brms = TRUE) { + # need some helper function defined elsewhere + source("https://raw.githubusercontent.com/HenrikEckermann/in_use/master/reporting.R") + diag_df <- data %>% + mutate( + sresid = if (brms) scale(residuals(model, re_formula = NA)[, 1])[, 1] else resid(model), + fitted = if (brms) fitted(model, re_formula = NA)[, 1] else fitted(model) + ) %>% + mutate(sresid = scale(sresid)[, 1]) + + + # distribution of the scaled residuals + p_resid <- ggplot(diag_df, aes(sresid)) + + geom_density() + + ylab('Density') + xlab('Standardized Redsiduals') + + theme_minimal() + + ## qq plot (source code for gg_qq in script) + qq <- + gg_qq(diag_df$sresid)+ + theme_minimal() + + xlab('Theoretical') + ylab('Sample') + + # fitted vs sresid + fit_resid <- + ggplot(diag_df, aes(fitted, sresid)) + + geom_point(alpha = 0.6) + + geom_smooth(method = "loess", se = F, color = "#f94c39") + + geom_point( + data = filter(diag_df, abs(sresid) > 3.5), + aes(fitted, sresid), color='red' + ) + + ggrepel::geom_text_repel( + data = filter(diag_df, abs(sresid) > 3.5), + aes_string("fitted", y = "sresid", label = id), size = 3 + ) + + ylab('Standardized Residuals') + xlab('Fitted Values') + + scale_y_continuous(breaks=c(-4, -3, -2, -1, 0, 1, 2, 3, 4))+ + theme_minimal() + + # Fitted vs observed + fit_obs <- + ggplot(diag_df, aes_string("fitted", glue("{Y}"))) + + geom_point(alpha = 0.6) + + geom_smooth(method = "loess", se = F, color = '#f94c39') + + ylab(glue("Observed {Y}")) + xlab('Fitted Values') + + theme_minimal() + + (p_resid + qq) / + (fit_resid + fit_obs) +} diff --git a/R/pc_daa_ancombc_bez.R b/R/pc_daa_ancombc_bez.R new file mode 100644 index 0000000..0a0cc00 --- /dev/null +++ b/R/pc_daa_ancombc_bez.R @@ -0,0 +1,1431 @@ +set.seed(1) +library(mia) +library(ANCOMBC) +library(tidyverse) +library(tidySummarizedExperiment) +library(glue) + + + +############################################################################### +######################### 1. ITT ############################## +############################################################################### + +# import of biomfile and meta data can be found in the import script +load(here::here("data/data_pc.Rds")) +load(file = here::here("data/data_imp.Rds")) + +####################### 1.1 Complete Case Analysis ############################ + + +# model includes random intercepts and all samples +if (!file.exists(here::here("data/daa_cc_mlm_pc_bez.Rds"))) { + output <- ancombc2( + data = tse_pc, + assay_name = "counts", + tax_level = NULL, + fix_formula = "condition * age_s + siblings", + rand_formula = "(1 | skippy_id)", + p_adj_method = "holm", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.4, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl(), + mdfdr_control = list(fwer_ctrl_method = "holm", B = 100), + trend_control = list( + contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), + node = list(2), + solver = "ECOS",B = 100) + ) + save(output, file = here::here("data/daa_cc_mlm_pc_bez.Rds")) + } else { + load(here::here("data/daa_cc_mlm_pc_bez.Rds")) +} + +# structural zeros +tab_zero = output$zero_ind +sum(tab_zero[, 2]) + sum(tab_zero[, 3]) +# sensitivity scores +tab_sens = output$pseudo_sens_tab +head(tab_sens) +pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + +res_prim = output$res +colnames(res_prim) +effects <- c("age_s", "condition1", "condition1:age_s", "siblings1") +map(effects, function(effect) { + select(res_prim, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) +}) + + +# model excludes random intercepts and includes all samples +if (!file.exists(here::here("data/daa_cc_pc_bez.Rds"))) { + output <- ancombc2( + data = tse_pc, + assay_name = "counts", + tax_level = NULL, + fix_formula = "condition * age_s + siblings", + #rand_formula = "(1 | skippy_id)", + p_adj_method = "holm", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.4, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl(), + mdfdr_control = list(fwer_ctrl_method = "holm", B = 100), + trend_control = list( + contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), + node = list(2), + solver = "ECOS",B = 100) + ) + save(output, file = here::here("data/daa_cc_pc_bez.Rds")) + } else { + load(here::here("data/daa_cc_pc_bez.Rds")) + } + + # structural zeros + tab_zero = output$zero_ind + sum(tab_zero[, 2]) + sum(tab_zero[, 3]) + # sensitivity scores + tab_sens = output$pseudo_sens_tab + head(tab_sens) + pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + + res_prim = output$res + colnames(res_prim) + effects <- c("age_s", "condition1", "condition1:age_s", "siblings1") + map(effects, function(effect) { + select(res_prim, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) + }) + +# model includes random intercepts and excludes 1 year samples +if (!file.exists(here::here("data/daa_cc_mlm_infancy_pc_bez.Rds"))) { + output <- ancombc2( + data = filter(tse_pc, week != 52), + assay_name = "counts", + tax_level = NULL, + fix_formula = "condition * age_s + siblings", + rand_formula = "(1 | skippy_id)", + p_adj_method = "holm", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.4, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl(), + mdfdr_control = list(fwer_ctrl_method = "holm", B = 100), + trend_control = list( + contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), + node = list(2), + solver = "ECOS",B = 100) + ) + save(output, file = here::here("data/daa_cc_mlm_infancy_pc_bez.Rds")) + } else { + load(here::here("data/daa_cc_mlm_infancy_pc_bez.Rds")) +} + +# structural zeros +tab_zero = output$zero_ind +sum(tab_zero[, 2]) + sum(tab_zero[, 3]) +# sensitivity scores +tab_sens = output$pseudo_sens_tab +head(tab_sens) +pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + +res_prim = output$res +colnames(res_prim) +effects <- c("age_s", "condition1", "condition1:age_s", "siblings1") +map(effects, function(effect) { + select(res_prim, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) +}) + + +# model includes 1 years samples only +if (!file.exists(here::here("data/daa_cc_year1_pc_bez.Rds"))) { + output <- ancombc2( + data = filter(tse_pc, week == 52), + assay_name = "counts", + tax_level = NULL, + fix_formula = "condition * age_s + siblings", + # rand_formula = "(1 | skippy_id)", + p_adj_method = "holm", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.4, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl(), + mdfdr_control = list(fwer_ctrl_method = "holm", B = 100), + trend_control = list( + contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), + node = list(2), + solver = "ECOS",B = 100) + ) + save(output, file = here::here("data/daa_cc_year1_pc_bez.Rds")) + } else { + load(here::here("data/daa_cc_year1_pc_bez.Rds")) + } + +# structural zeros +tab_zero = output$zero_ind +sum(tab_zero[, 2]) + sum(tab_zero[, 3]) + # sensitivity scores +tab_sens = output$pseudo_sens_tab +head(tab_sens) +pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + +res_prim = output$res +colnames(res_prim) +effects <- c("age_s", "condition1", "condition1:age_s", "siblings1") +map(effects, function(effect) { + select(res_prim, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) +}) + +# for the samples at 1 year we find an effect on two metabolites: +# 1 MGB016 35.18912 10.45095 TRUE condition1 +# 2 MGB010 35.02913 10.42313 TRUE condition1 + + + + +######################## 1.2 Multiple imputation ############################# + + + +load(here::here("data/data_pc.Rds")) +load(file = here::here("data/data_imp.Rds")) +# for analyses we apply prevalence fitlering +tse_pc <- subsetByPrevalentTaxa(tse_pc, detection = 0, prevalence = 0.1) + +models_imp <- map2(implist, 1:length(implist), function(dimp, imp) { + tse_pc_map <- tse_pc + + fvars <- c("constipation", "siblings", "diarrhea", "condition") + # add metadata to tse + colData(tse_pc_map) <- colData(tse_pc_map) %>% + as.data.frame() %>% + select(-all_of(fvars), -contains("age")) %>% + left_join(select(dimp, age, sample_id, constipation, diarrhea, siblings, condition), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate(across(all_of(fvars), function(x) as.factor(x))) %>% + DataFrame() + colData(tse_pc_map)$age <- colData(tse_pc_map)$age + as.numeric(colData(tse_pc_map)$week) * 7 + colData(tse_pc_map)$age_s <- scale(colData(tse_pc_map)$age)[, 1] + + # model includes random intercepts and excludes 1 year samples + if (!file.exists(here::here(glue("data/daa_cc_mlm_all_imp{imp}_pc_bez.Rds")))) { + output <- ancombc2( + data = tse_pc_map, + assay_name = "counts", + tax_level = NULL, + fix_formula = "condition * age_s + siblings", + rand_formula = "(1 | skippy_id)", + p_adj_method = "holm", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.4, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl(), + mdfdr_control = list(fwer_ctrl_method = "holm", B = 100), + trend_control = list( + contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), + node = list(2), + solver = "ECOS",B = 100) + ) + save(output, file = here::here(glue("data/daa_cc_mlm_all_imp{imp}_pc_bez.Rds"))) + } else { + load(here::here(glue("data/daa_cc_mlm_all_imp{imp}_pc_bez.Rds"))) + } + + # structural zeros + tab_zero_all = output$zero_ind + # sensitivity scores + tab_sens_all = output$pseudo_sens_tab + sens_scores_all <- pivot_longer(tab_sens_all, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + res_prim_all = output$res + effects_all <- map(effects, function(effect) { + select(res_prim_all, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) + }) + + + + # model includes random intercepts and excludes 1 year samples + if (!file.exists(here::here(glue("data/daa_cc_mlm_infancy_imp{imp}_pc_bez.Rds")))) { + output <- ancombc2( + data = filter(tse_pc_map, week != 52), + assay_name = "counts", + tax_level = NULL, + fix_formula = "condition * age_s + siblings", + rand_formula = "(1 | skippy_id)", + p_adj_method = "holm", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.4, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl(), + mdfdr_control = list(fwer_ctrl_method = "holm", B = 100), + trend_control = list( + contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), + node = list(2), + solver = "ECOS",B = 100) + ) + save(output, file = here::here(glue("data/daa_cc_mlm_infancy_imp{imp}_pc_bez.Rds"))) + } else { + load(here::here(glue("data/daa_cc_mlm_infancy_imp{imp}_pc_bez.Rds"))) + } + + # structural zeros + tab_zero_infancy = output$zero_ind + # sensitivity scores + tab_sens_infancy = output$pseudo_sens_tab + sens_scores_infancy <- pivot_longer(tab_sens_infancy, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + res_prim_infancy = output$res + effects_infancy <- map(effects, function(effect) { + select(res_prim_infancy, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) + }) + + + + + # model includes 1 years samples only + if (!file.exists(here::here(glue("data/daa_cc_year1_imp{imp}_pc_bez.Rds")))) { + output <- ancombc2( + data = filter(tse_pc_map, week == 52), + assay_name = "counts", + tax_level = NULL, + fix_formula = "condition * age_s + siblings", + # rand_formula = "(1 | skippy_id)", + p_adj_method = "holm", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.4, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl(), + mdfdr_control = list(fwer_ctrl_method = "holm", B = 100), + trend_control = list( + contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), + node = list(2), + solver = "ECOS",B = 100) + ) + save(output, file = here::here(glue("data/daa_cc_year1_imp{imp}_pc_bez.Rds"))) + } else { + load(here::here(glue("data/daa_cc_year1_imp{imp}_pc_bez.Rds"))) + } + + # structural zeros + tab_zero_year1 = output$zero_ind + # sensitivity scores + tab_sens_year1 = output$pseudo_sens_tab + sens_scores_year1 <- pivot_longer(tab_sens_year1, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + res_prim_year1 = output$res + effects_year1 <- map(effects, function(effect) { + select(res_prim_year1, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) + }) + + list( + all = list( + tab_zero_all, + sens_scores_all, + res_prim_all, + effects_all + ), + infancy = list( + tab_zero_infancy, + sens_scores_infancy, + res_prim_infancy, + effects_infancy + ), + year1 = list( + tab_zero_year1, + sens_scores_year1, + res_prim_year1, + effects_year1 + ) + ) + +}) + +ancom_tables_itt <- models_imp[[1]]$all[[3]] %>% + select(taxon, lfc_condition1, se_condition1, p_condition1, q_condition1) %>% + arrange(q_condition1, desc(abs(lfc_condition1))) %>% + mutate(across(where(is.numeric), round, 3)) +colnames(ancom_tables_itt) <- str_remove(colnames(ancom_tables_itt), "_condition1") + +ancom_tables_itt2 <- models_imp[[1]]$all[[3]] %>% + select(taxon, "lfc_condition1:age_s", "se_condition1:age_s", "p_condition1:age_s", "q_condition1:age_s") %>% + arrange(`q_condition1:age_s`, desc(abs(`lfc_condition1:age_s`))) %>% + mutate(across(where(is.numeric), round, 3)) + +colnames(ancom_tables_itt2) <- str_remove(colnames(ancom_tables_itt2), "_condition1:age_s") + +save(ancom_tables_itt, file = here::here("data/ancom_tables_itt_pc.Rds")) + +ancom_tables_itt +ancom_tables_itt2 + + + + + + + + + + +t1 <- select( + models_imp[[1]]$all[[3]], + taxon, + lfc_age_s, + lfc_siblings1, + +) %>% + pivot_longer( + contains("lfc"), + names_to = "variable", + values_to = "lfc", + names_prefix = "lfc_") + + +t2 <- select( + models_imp[[1]]$all[[3]], + taxon, + p_age_s, + p_siblings1, + + + ) %>% + pivot_longer( + contains("p_"), + names_to = "variable", + values_to = "p", + names_prefix = "p_") +t2 +t3 <- select( + models_imp[[1]]$all[[3]], + taxon, + q_age_s, + q_siblings1, + + + ) %>% + pivot_longer( + contains("q_"), + names_to = "variable", + values_to = "q", + names_prefix = "q_") +t3 + +ancombc_remaining <- full_join(t1, t2, by = c("taxon", "variable")) %>% + full_join(t3, by = c("taxon", "variable")) %>% + filter(p <= 0.05) %>% + mutate( + across(where(is.numeric), round, 3), + taxon = str_remove(taxon, "genus:") + ) %>% + arrange(variable, q) +save(ancombc_remaining, file = here::here("data/ancombc_remaining_pc.Rds")) + + + + +# same for early infancy samples: +ancom_tables_itt_infancy <- models_imp[[1]]$infancy[[3]] %>% + select(taxon, lfc_condition1, se_condition1, p_condition1, q_condition1) %>% + arrange(q_condition1, desc(abs(lfc_condition1))) %>% + mutate(across(where(is.numeric), round, 3)) +colnames(ancom_tables_itt_infancy) <- str_remove(colnames(ancom_tables_itt), "_condition1") + +ancom_tables_itt2_infancy <- models_imp[[1]]$infancy[[3]] %>% + select(taxon, "lfc_condition1:age_s", "se_condition1:age_s", "p_condition1:age_s", "q_condition1:age_s") %>% + arrange(`q_condition1:age_s`, desc(abs(`lfc_condition1:age_s`))) %>% + mutate(across(where(is.numeric), round, 3)) + +colnames(ancom_tables_itt2_infancy) <- str_remove(colnames(ancom_tables_itt2), "_condition1:age_s") + +save(ancom_tables_itt_infancy, file = here::here("data/ancom_tables_itt_infancy_pc.Rds")) + +ancom_tables_itt_infancy +ancom_tables_itt2_infancy + + + + +t1_infancy <- select( + models_imp[[1]]$infancy[[3]], + taxon, + lfc_age_s, + lfc_siblings1, + +) %>% + pivot_longer( + contains("lfc"), + names_to = "variable", + values_to = "lfc", + names_prefix = "lfc_") + + +t2_infancy <- select( + models_imp[[1]]$infancy[[3]], + taxon, + p_age_s, + p_siblings1, + + +) %>% + pivot_longer( + contains("p_"), + names_to = "variable", + values_to = "p", + names_prefix = "p_") +t2 +t3_infancy <- select( + models_imp[[1]]$infancy[[3]], + taxon, + q_age_s, + q_siblings1, + + +) %>% + pivot_longer( + contains("q_"), + names_to = "variable", + values_to = "q", + names_prefix = "q_") +t3 + +ancombc_remaining_infancy <- full_join(t1_infancy, t2_infancy, by = c("taxon", "variable")) %>% + full_join(t3_infancy, by = c("taxon", "variable")) %>% + filter(p <= 0.05) %>% + mutate( + across(where(is.numeric), round, 3), + taxon = str_remove(taxon, "genus:") + ) %>% + arrange(variable, q) +save(ancombc_remaining_infancy, file = here::here("data/ancombc_remaining_infancy_pc.Rds")) + + + +# same for year1 samples: +ancom_tables_itt_year1 <- models_imp[[1]]$year1[[3]] %>% + select(taxon, lfc_condition1, se_condition1, p_condition1, q_condition1) %>% + arrange(q_condition1, desc(abs(lfc_condition1))) %>% + mutate(across(where(is.numeric), round, 3)) +colnames(ancom_tables_itt_year1) <- str_remove(colnames(ancom_tables_itt), "_condition1") + +ancom_tables_itt2_year1 <- models_imp[[1]]$year1[[3]] %>% + select(taxon, "lfc_condition1:age_s", "se_condition1:age_s", "p_condition1:age_s", "q_condition1:age_s") %>% + arrange(`q_condition1:age_s`, desc(abs(`lfc_condition1:age_s`))) %>% + mutate(across(where(is.numeric), round, 3)) + +colnames(ancom_tables_itt2_year1) <- str_remove(colnames(ancom_tables_itt2), "_condition1:age_s") + +save(ancom_tables_itt_year1, file = here::here("data/ancom_tables_itt_year1_pc.Rds")) + +ancom_tables_itt_year1 +ancom_tables_itt2_year1 + + + + + +t1_year1 <- select( + models_imp[[1]]$year1[[3]], + taxon, + lfc_age_s, + lfc_siblings1, + +) %>% + pivot_longer( + contains("lfc"), + names_to = "variable", + values_to = "lfc", + names_prefix = "lfc_") + + +t2_year1 <- select( + models_imp[[1]]$year1[[3]], + taxon, + p_age_s, + p_siblings1, + + +) %>% + pivot_longer( + contains("p_"), + names_to = "variable", + values_to = "p", + names_prefix = "p_") +t2 +t3_year1 <- select( + models_imp[[1]]$year1[[3]], + taxon, + q_age_s, + q_siblings1, + + +) %>% + pivot_longer( + contains("q_"), + names_to = "variable", + values_to = "q", + names_prefix = "q_") +t3 + +ancombc_remaining_year1 <- full_join(t1_year1, t2_year1, by = c("taxon", "variable")) %>% + full_join(t3_year1, by = c("taxon", "variable")) %>% + filter(p <= 0.05) %>% + mutate( + across(where(is.numeric), round, 3), + taxon = str_remove(taxon, "genus:") + ) %>% + arrange(variable, q) +save(ancombc_remaining_year1, file = here::here("data/ancombc_remaining_year1_pc.Rds")) + + + + + + + + + + + + + + + + + + + +############################################################################### +######################### 2. PP ############################## +############################################################################### + +# import of biomfile and meta data can be found in the import script +load(here::here("data/data_pc.Rds")) +load(file = here::here("data/data_imp.Rds")) + +# obtain ids that were selected for PP analyses +pp_indicator <- foreign::read.spss(here::here("data/raw_data/kelly141022/Data_ITT_PP_ExploratoryDRselections.sav"), to.data.frame = TRUE) +pp_indicator <- select(pp_indicator, skippy_id = ID, pp = PP) +# add pp info to existing data +if (!"pp" %in% colnames(d)) { + d <- left_join(d, pp_indicator, by = "skippy_id") +} +# 60 that are in PP and condition 0; 18 that are condition 1 and pp. Fits... +d_pp <- filter(d, pp == 1) +implist_pp <- map(implist, function(dimp) { + dimp_new <- left_join(dimp, pp_indicator, by = "skippy_id") %>% + filter(pp == 1) + dimp_new +}) + + +# for analyses we apply prevalence fitlering +tse_pc <- subsetByPrevalentTaxa(tse_pc, detection = 0, prevalence = 0.1) +colData(tse_pc) <- colData(tse_pc) %>% + as.data.frame() %>% + #rownames_to_column("sample_id") %>% + left_join(select(d_pp, sample_id, pp), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + DataFrame() +tse_pc <- filter(tse_pc, pp == 1) + + +####################### 2.1 Complete Case Analysis ############################ + + + + + +# model includes random intercepts and all samples +if (!file.exists(here::here("data/daa_pp_mlm_pc_bez.Rds"))) { + output <- ancombc2( + data = tse_pc, + assay_name = "counts", + tax_level = NULL, + fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex", + rand_formula = "(1 | skippy_id)", + p_adj_method = "holm", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.4, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl(), + mdfdr_control = list(fwer_ctrl_method = "holm", B = 100), + trend_control = list( + contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), + node = list(2), + solver = "ECOS",B = 100) + ) + save(output, file = here::here("data/daa_pp_mlm_pc_bez.Rds")) + } else { + load(here::here("data/daa_pp_mlm_pc_bez.Rds")) + } + +# structural zeros +tab_zero = output$zero_ind +sum(tab_zero[, 2]) + sum(tab_zero[, 3]) +# sensitivity scores +tab_sens = output$pseudo_sens_tab +head(tab_sens) +pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + +res_prim = output$res +colnames(res_prim) +effects <- c("age_s", "condition1", "condition1:age_s", "siblings1") +map(effects, function(effect) { + select(res_prim, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) +}) + + + +# model includes random intercepts and excludes 1 year samples +if (!file.exists(here::here("data/daa_pp_mlm_infancy_pc_bez.Rds"))) { + output <- ancombc2( + data = filter(tse_pc, week != 52), + assay_name = "counts", + tax_level = NULL, + fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex", + rand_formula = "(1 | skippy_id)", + p_adj_method = "holm", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.4, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl(), + mdfdr_control = list(fwer_ctrl_method = "holm", B = 100), + trend_control = list( + contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), + node = list(2), + solver = "ECOS",B = 100) + ) + save(output, file = here::here("data/daa_pp_mlm_infancy_pc_bez.Rds")) + } else { + load(here::here("data/daa_pp_mlm_infancy_pc_bez.Rds")) +} + +# structural zeros +tab_zero = output$zero_ind +sum(tab_zero[, 2]) + sum(tab_zero[, 3]) +# sensitivity scores +tab_sens = output$pseudo_sens_tab +head(tab_sens) +pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + +res_prim = output$res +colnames(res_prim) +effects <- c("age_s", "condition1", "condition1:age_s", "siblings1") +map(effects, function(effect) { + select(res_prim, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) +}) + + +# model includes 1 years samples only +if (!file.exists(here::here("data/daa_pp_year1_pc_bez.Rds"))) { + output <- ancombc2( + data = filter(tse_pc, week == 52), + assay_name = "counts", + tax_level = NULL, + fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex", + # rand_formula = "(1 | skippy_id)", + p_adj_method = "holm", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.4, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl(), + mdfdr_control = list(fwer_ctrl_method = "holm", B = 100), + trend_control = list( + contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), + node = list(2), + solver = "ECOS",B = 100) + ) + save(output, file = here::here("data/daa_pp_year1_pc_bez.Rds")) + } else { + load(here::here("data/daa_pp_year1_pc_bez.Rds")) + } + +# structural zeros +tab_zero = output$zero_ind +sum(tab_zero[, 2]) + sum(tab_zero[, 3]) + # sensitivity scores +tab_sens = output$pseudo_sens_tab +head(tab_sens) +pivot_longer(tab_sens, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + +res_prim = output$res +colnames(res_prim) +effects <- c("age_s", "condition1", "condition1:age_s", "siblings1") +map(effects, function(effect) { + select(res_prim, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) +}) + + + + + +######################## 2.2 Multiple imputation ############################# + + + +load(here::here("data/data_pc.Rds")) +load(file = here::here("data/data_imp.Rds")) +# for analyses we apply prevalence fitlering +tse <- subsetByPrevalentTaxa(tse_pc, detection = 0, prevalence = 0.1) + +models_imp <- map2(implist_pp, 1:length(implist), function(dimp, imp) { + tse_map <- tse + fvars <- c("constipation", "siblings", "diarrhea", "condition", "csection", "sex") + # add metadata to tse + colData(tse_map) <- colData(tse_map) %>% + as.data.frame() %>% + select(-age_s, -age, -siblings, -ges_age, -birthweight, -edlevel, -csection, -sex, -constipation, -diarrhea) %>% + left_join(select(dimp, age, sample_id, constipation, diarrhea, siblings, pp, csection, sex, ges_age_s, edlevel_s, birthweight_s), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate(across(all_of(fvars), function(x) as.factor(x))) %>% + DataFrame() + colData(tse_map)$age <- colData(tse_map)$age + as.numeric(colData(tse_map)$week) * 7 + colData(tse_map)$age_s <- scale(colData(tse_map)$age)[, 1] + + tse_map <- filter(tse_map, pp == 1) + + # model includes random intercepts and excludes 1 year samples + if (!file.exists(here::here(glue("data/daa_pp_mlm_imp{imp}_pc_bez.Rds")))) { + output <- ancombc2( + data = tse_map, + assay_name = "counts", + #tax_level = "genus", + fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex", + rand_formula = "(1 | skippy_id)", + p_adj_method = "fdr", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.4, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl(), + mdfdr_control = list(fwer_ctrl_method = "fdr", B = 100), + trend_control = list( + contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), + node = list(2), + solver = "ECOS",B = 100) + ) + save(output, file = here::here(glue("data/daa_pp_mlm_imp{imp}_pc_bez.Rds"))) + } else { + load(here::here(glue("data/daa_pp_mlm_imp{imp}_pc_bez.Rds"))) + } + + # structural zeros + tab_zero_all = output$zero_ind + # sensitivity scores + tab_sens_all = output$pseudo_sens_tab + sens_scores_all <- pivot_longer(tab_sens_all, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + res_prim_all <- output$res + effects_all <- map(effects, function(effect) { + select(res_prim_all, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) + }) + + + + # model includes random intercepts and excludes 1 year samples + if (!file.exists(here::here(glue("data/daa_pp_mlm_infancy_imp{imp}_pc_bez.Rds")))) { + output <- ancombc2( + data = filter(tse_map, week != 52), + assay_name = "counts", + #tax_level = "genus", + fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex", + rand_formula = "(1 | skippy_id)", + p_adj_method = "fdr", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.4, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl(), + mdfdr_control = list(fwer_ctrl_method = "fdr", B = 100), + trend_control = list( + contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), + node = list(2), + solver = "ECOS",B = 100) + ) + save(output, file = here::here(glue("data/daa_pp_mlm_infancy_imp{imp}_pc_bez.Rds"))) + } else { + load(here::here(glue("data/daa_pp_mlm_infancy_imp{imp}_pc_bez.Rds"))) + } + + # structural zeros + tab_zero_infancy = output$zero_ind + # sensitivity scores + tab_sens_infancy = output$pseudo_sens_tab + sens_scores_infancy <- pivot_longer(tab_sens_infancy, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + res_prim_infancy <- output$res + effects_infancy <- map(effects, function(effect) { + select(res_prim_infancy, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) + }) + + + # model includes 1 years samples only + if (!file.exists(here::here(glue("data/daa_pp_year1_imp{imp}_pc_bez.Rds")))) { + output <- ancombc2( + data = filter(tse_map, week == 52), + assay_name = "counts", + #tax_level = "genus", + fix_formula = "condition * age_s + siblings + birthweight_s + ges_age_s + edlevel_s + csection + sex", + # rand_formula = "(1 | skippy_id)", + p_adj_method = "fdr", + pseudo = 0, + pseudo_sens = TRUE, + prv_cut = 0.10, + lib_cut = 1000, + s0_perc = 0.05, + group = "condition", + struc_zero = TRUE, + neg_lb = TRUE, + alpha = 0.4, + n_cl = 2, + verbose = TRUE, + global = FALSE, + pairwise = FALSE, + dunnet = FALSE, + trend = FALSE, + iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE), + em_control = list(tol = 1e-5, max_iter = 100), + lme_control = lme4::lmerControl(), + mdfdr_control = list(fwer_ctrl_method = "fdr", B = 100), + trend_control = list( + contrast = list(matrix(c(1, 0, -1, 1), nrow = 2, byrow = TRUE)), + node = list(2), + solver = "ECOS",B = 100) + ) + save(output, file = here::here(glue("data/daa_pp_year1_imp{imp}_pc_bez.Rds"))) + } else { + load(here::here(glue("data/daa_pp_year1_imp{imp}_pc_bez.Rds"))) + } + + # structural zeros + tab_zero_year1 = output$zero_ind + # sensitivity scores + tab_sens_year1 = output$pseudo_sens_tab + sens_scores_year1 <- pivot_longer(tab_sens_year1, -taxon, names_to = "contrast", values_to = "value") %>% + filter(value >5) + res_prim_year1 <- output$res + effects_year1 <- map(effects, function(effect) { + select(res_prim_year1, + taxon, + lfc = glue::glue("lfc_{effect}"), + se = glue::glue("se_{effect}"), + indicator = glue::glue("diff_{effect}")) %>% + filter(indicator) %>% + mutate(effect = effect) + }) + + list( + all = list( + tab_zero_all, + sens_scores_all, + res_prim_all, + effects_all + ), + infancy = list( + tab_zero_infancy, + sens_scores_infancy, + res_prim_infancy, + effects_infancy + ), + year1 = list( + tab_zero_year1, + sens_scores_year1, + res_prim_year1, + effects_year1 + ) + ) + +}) + + + + + + +ancom_tables_pp <- models_imp[[1]]$all[[3]] %>% + select(taxon, lfc_condition1, se_condition1, p_condition1, q_condition1) %>% + arrange(q_condition1, desc(abs(lfc_condition1))) %>% + mutate(across(where(is.numeric), round, 3)) +colnames(ancom_tables_pp) <- str_remove(colnames(ancom_tables_pp), "_condition1") + +ancom_tables_pp2 <- models_imp[[1]]$all[[3]] %>% + select(taxon, "lfc_condition1:age_s", "se_condition1:age_s", "p_condition1:age_s", "q_condition1:age_s") %>% + arrange(`q_condition1:age_s`, desc(abs(`lfc_condition1:age_s`))) %>% + mutate(across(where(is.numeric), round, 3)) + +colnames(ancom_tables_pp2) <- str_remove(colnames(ancom_tables_pp2), "_condition1:age_s") + +save(ancom_tables_pp, file = here::here("data/ancom_tables_pp_pc.Rds")) + +ancom_tables_pp +ancom_tables_pp2 + + + + + + + + + +t1 <- select( + models_imp[[1]]$all[[3]], + taxon, + lfc_age_s, + lfc_siblings1, + +) %>% + pivot_longer( + contains("lfc"), + names_to = "variable", + values_to = "lfc", + names_prefix = "lfc_") + + +t2 <- select( + models_imp[[1]]$all[[3]], + taxon, + p_age_s, + p_siblings1, + + +) %>% + pivot_longer( + contains("p_"), + names_to = "variable", + values_to = "p", + names_prefix = "p_") +t2 +t3 <- select( + models_imp[[1]]$all[[3]], + taxon, + q_age_s, + q_siblings1, + + +) %>% + pivot_longer( + contains("q_"), + names_to = "variable", + values_to = "q", + names_prefix = "q_") +t3 + +ancombc_remaining_pp <- full_join(t1, t2, by = c("taxon", "variable")) %>% + full_join(t3, by = c("taxon", "variable")) %>% + filter(p <= 0.05) %>% + mutate( + across(where(is.numeric), round, 3), + taxon = str_remove(taxon, "genus:") + ) %>% + arrange(variable, q) +save(ancombc_remaining_pp, file = here::here("data/ancombc_remaining_pp_pc.Rds")) + + + + +# same for early infancy samples: +ancom_tables_pp_infancy <- models_imp[[1]]$infancy[[3]] %>% + select(taxon, lfc_condition1, se_condition1, p_condition1, q_condition1) %>% + arrange(q_condition1, desc(abs(lfc_condition1))) %>% + mutate(across(where(is.numeric), round, 3)) +colnames(ancom_tables_pp_infancy) <- str_remove(colnames(ancom_tables_pp), "_condition1") + +ancom_tables_pp2_infancy <- models_imp[[1]]$infancy[[3]] %>% + select(taxon, "lfc_condition1:age_s", "se_condition1:age_s", "p_condition1:age_s", "q_condition1:age_s") %>% + arrange(`q_condition1:age_s`, desc(abs(`lfc_condition1:age_s`))) %>% + mutate(across(where(is.numeric), round, 3)) + +colnames(ancom_tables_pp2_infancy) <- str_remove(colnames(ancom_tables_pp2), "_condition1:age_s") + +save(ancom_tables_pp_infancy, file = here::here("data/ancom_tables_pp_infancy_pc.Rds")) + +ancom_tables_pp_infancy + + + + +t1_infancy <- select( + models_imp[[1]]$infancy[[3]], + taxon, + lfc_age_s, + lfc_siblings1, + +) %>% + pivot_longer( + contains("lfc"), + names_to = "variable", + values_to = "lfc", + names_prefix = "lfc_") + + +t2_infancy <- select( + models_imp[[1]]$infancy[[3]], + taxon, + p_age_s, + p_siblings1, + + +) %>% + pivot_longer( + contains("p_"), + names_to = "variable", + values_to = "p", + names_prefix = "p_") +t2 +t3_infancy <- select( + models_imp[[1]]$infancy[[3]], + taxon, + q_age_s, + q_siblings1, + + +) %>% + pivot_longer( + contains("q_"), + names_to = "variable", + values_to = "q", + names_prefix = "q_") +t3 + +ancombc_remaining_infancy_pp <- full_join(t1_infancy, t2_infancy, by = c("taxon", "variable")) %>% + full_join(t3_infancy, by = c("taxon", "variable")) %>% + filter(p <= 0.05) %>% + mutate( + across(where(is.numeric), round, 3), + taxon = str_remove(taxon, "genus:") + ) %>% + arrange(variable, q) +save(ancombc_remaining_infancy_pp, file = here::here("data/ancombc_remaining_infancy_pp_pc.Rds")) + + + +# same for year1 samples: +ancom_tables_pp_year1 <- models_imp[[1]]$year1[[3]] %>% + select(taxon, lfc_condition1, se_condition1, p_condition1, q_condition1) %>% + arrange(q_condition1, desc(abs(lfc_condition1))) %>% + mutate(across(where(is.numeric), round, 3)) +colnames(ancom_tables_pp_year1) <- str_remove(colnames(ancom_tables_pp), "_condition1") + +ancom_tables_pp2_year1 <- models_imp[[1]]$year1[[3]] %>% + select(taxon, "lfc_condition1:age_s", "se_condition1:age_s", "p_condition1:age_s", "q_condition1:age_s") %>% + arrange(`q_condition1:age_s`, desc(abs(`lfc_condition1:age_s`))) %>% + mutate(across(where(is.numeric), round, 3)) + +colnames(ancom_tables_pp2_year1) <- str_remove(colnames(ancom_tables_pp2), "_condition1:age_s") + +save(ancom_tables_pp_year1, file = here::here("data/ancom_tables_pp_year1_pc.Rds")) + +ancom_tables_pp_year1 +ancom_tables_pp2_year1 + + + + + + +t1_year1 <- select( + models_imp[[1]]$year1[[3]], + taxon, + lfc_age_s, + lfc_siblings1, + +) %>% + pivot_longer( + contains("lfc"), + names_to = "variable", + values_to = "lfc", + names_prefix = "lfc_") + + +t2_year1 <- select( + models_imp[[1]]$year1[[3]], + taxon, + p_age_s, + p_siblings1, + + +) %>% + pivot_longer( + contains("p_"), + names_to = "variable", + values_to = "p", + names_prefix = "p_") +t2 +t3_year1 <- select( + models_imp[[1]]$year1[[3]], + taxon, + q_age_s, + q_siblings1, + + +) %>% + pivot_longer( + contains("q_"), + names_to = "variable", + values_to = "q", + names_prefix = "q_") +t3 + +ancombc_remaining_year1_pp <- full_join(t1_year1, t2_year1, by = c("taxon", "variable")) %>% + full_join(t3_year, by = c("taxon", "variable")) %>% + filter(p <= 0.05) %>% + mutate( + across(where(is.numeric), round, 3), + taxon = str_remove(taxon, "genus:") + ) %>% + arrange(variable, q) +save(ancombc_remaining_year1_pp, file = here::here("data/ancombc_remaining_year1_pp_pc.Rds")) + + + + + + + + diff --git a/R/pc_daa_maaslin2_bez.R b/R/pc_daa_maaslin2_bez.R new file mode 100644 index 0000000..e183f73 --- /dev/null +++ b/R/pc_daa_maaslin2_bez.R @@ -0,0 +1,568 @@ +set.seed(1) +library(mia) +library(Maaslin2) +library(tidyverse) +library(tidySummarizedExperiment) +library(glue) + + + + +############################################################################### +######################### 1. ITT ############################## +############################################################################### + +# import of biomfile and meta data can be found in the import script +load(here::here("data/data_pc.Rds")) +load(file = here::here("data/data_imp.Rds")) + +# for analyses we apply prevalence fitlering +tse <- subsetByPrevalentTaxa(tse_pc, detection = 0, prevalence = 0.1) +fvars <- c("siblings", "condition") +# add metadata to tse +colData(tse) <- colData(tse) %>% + as.data.frame() %>% + select(-all_of(fvars), -age, -age_s) %>% + left_join(select(d, age, sample_id, constipation, diarrhea, siblings, condition), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate(across(all_of(fvars), function(x) as.factor(x))) %>% + DataFrame() +colData(tse)$age <- colData(tse)$age + as.numeric(colData(tse)$week) * 7 +colData(tse)$age_s <- scale(colData(tse)$age)[, 1] + +####################### 1.1 Complete Case Analysis ############################ + + + +asv_tab <- t(assay(tse)) +meta <- colData(tse) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars)) +asv_tab <- asv_tab[rownames(meta),] + +# you can specifiy different GLMs/normalizations/transforms. We used similar +# settings as in Nearing et al. (2021) here: +fit_data <- Maaslin2( + asv_tab, + meta, + output = here::here("data/maaslin/1"), + transform = "AST", + fixed_effects = c("condition", "siblings", "age_s"), + random_effects = "skippy_id", + reference = "0", + normalization = "TSS", + standardize = FALSE, + min_prevalence = 0 # prev filterin already done +) +filter(fit_data$results, qval <= 0.5, metadata == "condition") + +# model includes random intercepts and excludes 1 year samples +tse_infancy <- filter(tse, week != 52) +asv_tab <- t(assay(tse)) +meta <- colData(tse_infancy) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars)) +asv_tab <- asv_tab[rownames(meta), ] + +fit_data <- Maaslin2( + asv_tab, + meta, + output = here::here("data/maaslin/1"), + transform = "AST", + fixed_effects = c("condition", "siblings", "age_s"), + random_effects = "skippy_id", + reference = "0", + normalization = "TSS", + standardize = FALSE, + min_prevalence = 0 # prev filterin already done +) +filter(fit_data$results, qval <= 0.4, metadata == "condition") + + + + +# model includes 1 years samples only +tse_year1 <- filter(tse, week == 52) +asv_tab <- t(assay(tse)) +meta <- colData(tse_year1) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars)) +asv_tab <- asv_tab[rownames(meta), ] + + +fit_data <- Maaslin2( + asv_tab, + meta, + output = here::here("data/maaslin/1"), + transform = "AST", + fixed_effects = c("condition", "siblings", "age_s"), + #random_effects = "skippy_id", + reference = "0", + normalization = "TSS", + standardize = FALSE, + min_prevalence = 0 # prev filterin already done +) +filter(fit_data$results, qval <= 0.4, metadata == "condition") + + + + +######################## 1. 2Multiple imputation ############################# + + +load(here::here("data/data_pc.Rds")) +load(file = here::here("data/data_imp.Rds")) +# for analyses we apply prevalence fitlering +tse <- subsetByPrevalentTaxa(tse_pc, detection = 0, prevalence = 0.1) + + + +if (!file.exists(here::here("data/maaslin2_itt_mi_pc_bez.Rds"))) { + models_imp <- map2(implist, 1:length(implist), function(dimp, imp) { + tse_map <- tse + fvars <- c("siblings", "condition") + # add metadata to tse + colData(tse_map) <- colData(tse_map) %>% + as.data.frame() %>% + select(-all_of(fvars), -age,-age_s) %>% + left_join(select(dimp, age, sample_id, constipation, diarrhea, siblings, condition), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate(across(all_of(fvars), function(x) as.factor(x))) %>% + DataFrame() + colData(tse_map)$age <- colData(tse_map)$age + as.numeric(colData(tse_map)$week) * 7 + colData(tse_map)$age_s <- scale(colData(tse_map)$age)[, 1] + + # all samples + asv_tab <- t(assay(tse_map)) + meta <- colData(tse_map) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars)) + asv_tab <- asv_tab[rownames(meta), ] + + fit_data <- Maaslin2( + asv_tab, + meta, + output = here::here("data/maaslin/1"), + transform = "AST", + fixed_effects = c("condition", "siblings", "age_s"), + random_effects = "skippy_id", + reference = "0", + normalization = "TSS", + standardize = FALSE, + min_prevalence = 0 # prev filterin already done + ) + + res_all <- fit_data$results + + + + + # model includes random intercepts and excludes 1 year samples + tse_infancy <- filter(tse_map, week != 52) + asv_tab <- t(assay(tse_infancy)) + meta <- colData(tse_infancy) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars)) + asv_tab <- asv_tab[rownames(meta), ] + + + fit_data <- Maaslin2( + asv_tab, + meta, + output = here::here("data/maaslin/1"), + transform = "AST", + fixed_effects = c("condition", "siblings", "age_s"), + random_effects = "skippy_id", + reference = "0", + normalization = "TSS", + standardize = FALSE, + min_prevalence = 0 # prev filterin already done + ) + + res_inf <- fit_data$results + + + # model includes 1 years samples only + tse_year1 <- filter(tse_map, week == 52) + asv_tab <- t(assay(tse_year1)) + meta <- colData(tse_year1) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars)) + asv_tab <- asv_tab[rownames(meta), ] + + fit_data <- Maaslin2( + asv_tab, + meta, + output = here::here("data/maaslin/1"), + transform = "AST", + fixed_effects = c("condition", "siblings", "age_s"), + #random_effects = "skippy_id", + reference = "0", + normalization = "TSS", + standardize = FALSE, + min_prevalence = 0 # prev filterin already done + ) + + res_year1 <- fit_data$results + + list(res_all, res_inf, res_year1) + }) + save(models_imp, file = here::here("data/maaslin2_itt_mi_pc_bez.Rds")) + } else { + load(here::here("data/maaslin2_itt_mi_pc_bez.Rds")) + } + + +# change [[1]] to 2-5 to inspect the other imputations +maaslin2_tables_itt <- map(models_imp[[1]], function(x) { + x %>% filter(metadata == "condition") %>% + select(feature, coef, stderr, pval, qval) %>% + arrange(qval, desc(abs(coef))) %>% + mutate( + across(where(is.numeric), round, 3), + feature = str_replace(feature, "\\.\\.", "."), + feature = str_replace(feature, "\\.", ":") + ) +}) +modules +colnames(modules) <- c("gbm", "description") + +# since MaAsLin does not support interactions I need to evaluate early and late infancy separately +maaslin2_identified <- map2_dfr(maaslin2_tables_itt, c("all", "early", "late"), function(table, term) { + filter(table, qval <= 0.2) %>% + mutate(term = term) %>% + arrange(coef, qval) + }) %>% + rename(gbm = feature) %>% + left_join(modules, by = "gbm") %>% + select(-gbm) %>% + select(gbm = description, everything()) + +maaslin2_identified +save(maaslin2_tables_itt, maaslin2_identified, file = here::here("data/maaslin2_tables_itt_pc.Rds")) + +map2_dfr(maaslin2_tables_itt, c("all", "early", "late"), function(table, term) { + filter(table, qval <= 0.2) %>% + mutate(term = term) %>% + arrange(coef, qval) + }) %>% + rename(gbm = feature) %>% + left_join(modules, by = "gbm") + + + +# plot significant features +# import of biomfile and meta data can be found in the import script +load(here::here("data/data_pc.Rds")) +load(file = here::here("data/data_imp.Rds")) + +# for analyses we apply prevalence fitlering +tse <- subsetByPrevalentTaxa(tse_pc, detection = 0, prevalence = 0.1) +fvars <- c("siblings", "condition") +# add metadata to tse +colData(tse) <- colData(tse) %>% + as.data.frame() %>% + select(-all_of(fvars), -age, -age_s) %>% + left_join(select(d, age, sample_id, constipation, diarrhea, siblings, condition), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate(across(all_of(fvars), function(x) as.factor(x))) %>% + DataFrame() +colData(tse)$age <- colData(tse)$age + as.numeric(colData(tse)$week) * 7 +colData(tse)$age_s <- scale(colData(tse)$age)[, 1] + +tse <- transformSamples(tse, method = "relabundance") +tse <- transformSamples(tse, abund_values = "relabundance", method = "log10", pseudocount = 1) +gbms_identified <- as.data.frame(t(assay(tse, "log10"))) %>% + rownames_to_column("sample_id") %>% + select( + sample_id, + MGB027, + MGB052, + MGB045 +) +# colnames(gbms_identified) <- c( +# "sample_id", +# "Nitric oxide degradation I (NO dioxygenase)", +# "Butyrate synthesis I", +# "Acetate synthesis III" +# ) +gbms_meta <- as.data.frame(colData(tse)) %>% + rownames_to_column("sample_id") %>% + mutate(condition_label = ifelse(condition == 0, "CAU", ifelse( + condition == 1, "SSC", NA)), + week_label = glue::glue("Week {week}") + ) %>% + select(sample_id, condition_label, week_label) + +gbms_data <- full_join(gbms_identified, gbms_meta, by = "sample_id") +plots <- map2(list( + "MGB027", + "MGB052", + "MGB045"), + list( + "Nitric oxide degradation I", + "Butyrate synthesis I", + "Acetate synthesis III" + ), + function(gbm_name, description) { + #pivot_longer(-c(sample_id, condition_label, week_label), names_to = "GBM", values_to = "values") %>% + ggplot(gbms_data, aes_string(x = "condition_label", y = gbm_name, fill = "condition_label")) + + geom_boxplot(outlier.alpha = 0) + + # ggbeeswarm::geom_beeswarm(alpha = 0.4) + + geom_jitter(width = 0.1, size = 2) + + #stat_summary(fun.y=mean, geom="point", shape=20, size=14, color="red", fill="red") + + facet_wrap(~week_label) + + #scale_fill_manual(values = c("#fc8d62", "#8da0cb")) + + scale_fill_manual(values = c("#ffffff", "#c0c1c2")) + + theme_bw(base_size = 25) + + theme( + legend.position = "none", + strip.placement = "outside", + strip.background = element_blank()) + + xlab("") + + ylab(description) + } +) +plots[[1]] + +save(plots, file = here::here("data/gbm_plots.Rds")) + + + + + + + + + + + + + + + + + +############################################################################### +######################### 2. PP ############################## +############################################################################### + +# import of biomfile and meta data can be found in the import script +load(here::here("data/data_pc.Rds")) +load(file = here::here("data/data_imp.Rds")) + +tse <- subsetByPrevalentTaxa(tse_pc, detection = 0, prevalence = 0.1) + + +# obtain ids that were selected for PP analyses +pp_indicator <- foreign::read.spss(here::here("data/raw_data/kelly141022/Data_ITT_PP_ExploratoryDRselections.sav"), to.data.frame = TRUE) +pp_indicator <- select(pp_indicator, skippy_id = ID, pp = PP) +# add pp info to existing data +if (!"pp" %in% colnames(d)) { + d <- left_join(d, pp_indicator, by = "skippy_id") +} +# 60 that are in PP and condition 0; 18 that are condition 1 and pp. Fits... +d_pp <- filter(d, pp == 1) +implist_pp <- map(implist, function(dimp) { + dimp_new <- left_join(dimp, pp_indicator, by = "skippy_id") %>% + filter(pp == 1) + dimp_new +}) + + +fvars <- c("siblings", "condition", "csection", "sex") +# add metadata to tse +colData(tse) <- colData(tse) %>% + as.data.frame() %>% + select(-siblings, -ges_age, -birthweight, -edlevel, -csection, -sex, -age) %>% + left_join(select(d_pp, age, sample_id, constipation, diarrhea, siblings, pp, csection, sex, ges_age, edlevel, birthweight), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate( + across(all_of(fvars), function(x) as.factor(x)), + ges_age_s = scale(ges_age)[, 1], + birthweight_s = scale(birthweight)[, 1], + edlevel_s = scale(edlevel)[, 1] + ) %>% + DataFrame() +colData(tse)$age <- colData(tse)$age + as.numeric(colData(tse)$week) * 7 +colData(tse)$age_s <- scale(colData(tse)$age)[, 1] +tse <- filter(tse, pp == 1) + +####################### 2.1 Complete Case Analysis ############################ + + +# model includes random intercepts and all samples +asv_tab <- t(assay(tse)) +meta <- colData(tse) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars)) +asv_tab <- asv_tab[rownames(meta), ] + +fit_data <- Maaslin2( + asv_tab, + meta, + output = here::here("data/maaslin/1"), + transform = "AST", + fixed_effects = c( + "condition", + "siblings", + "age_s", + "csection", + "sex", + "ges_age_s", + "birthweight_s", + "edlevel_s" + ), + random_effects = "skippy_id", + reference = "0", + normalization = "TSS", + standardize = FALSE, + min_prevalence = 0 # prev filterin already done +) +filter(fit_data$results, qval <= 0.1, metadata == "condition") + + + + + + + +######################## 2.2 Multiple imputation ############################# + +load(here::here("data/data_pc.Rds")) +load(file = here::here("data/data_imp.Rds")) +# for analyses we apply prevalence fitlering +tse <- subsetByPrevalentTaxa(tse_pc, detection = 0, prevalence = 0.1) + + +if (!file.exists(here::here("data/maaslin2_pp_mi_pc_bez.Rds"))) { + models_imp <- map2(implist_pp, 1:length(implist), function(dimp, imp) { + tse_map <- tse + fvars <- c("siblings", "condition", "csection", "sex") + # add metadata to tse + colData(tse_map) <- colData(tse_map) %>% + as.data.frame() %>% + select(-ges_age, -birthweight, -edlevel, -all_of(fvars), -age, -age_s) %>% + left_join(select(dimp, age, sample_id, constipation, diarrhea, siblings, condition, pp, csection, sex, ges_age_s, edlevel_s, birthweight_s), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate(across(all_of(fvars), function(x) as.factor(x))) %>% + DataFrame() + colData(tse_map)$age <- colData(tse_map)$age + as.numeric(colData(tse_map)$week) * 7 + colData(tse_map)$age_s <- scale(colData(tse_map)$age)[, 1] + tse_map <- filter(tse_map, pp == 1) + + # all samples + asv_tab <- t(assay(tse_map)) + meta <- colData(tse_map) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars), edlevel_s, birthweight_s, ges_age_s) + asv_tab <- asv_tab[rownames(meta), ] + + fit_data <- Maaslin2( + asv_tab, + meta, + output = here::here("data/maaslin/1"), + transform = "AST", + fixed_effects = c( + "condition", + "siblings", + "age_s", + "csection", + "sex", + "ges_age_s", + "birthweight_s", + "edlevel_s" + ), + random_effects = "skippy_id", + reference = "0", + normalization = "TSS", + standardize = FALSE, + min_prevalence = 0 # prev filterin already done + ) + + res_all <- fit_data$results + + + # model includes random intercepts and excludes 1 year samples + tse_infancy <- filter(tse_map, week != 52) + asv_tab <- t(assay(tse_infancy)) + meta <- colData(tse_infancy) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars), edlevel_s, birthweight_s, ges_age_s) + asv_tab <- asv_tab[rownames(meta), ] + + + fit_data <- Maaslin2( + asv_tab, + meta, + output = here::here("data/maaslin/1"), + transform = "AST", + fixed_effects = c( + "condition", + "siblings", + "age_s", + "csection", + "sex", + "ges_age_s", + "birthweight_s", + "edlevel_s" + ), + random_effects = "skippy_id", + reference = "0", + normalization = "TSS", + standardize = FALSE, + min_prevalence = 0 # prev filterin already done + ) + + res_inf <- fit_data$results + + + # model includes 1 years samples only + tse_year1 <- filter(tse_map, week == 52) + asv_tab <- t(assay(tse_year1)) + meta <- colData(tse_year1) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars), edlevel_s, birthweight_s, ges_age_s) + asv_tab <- asv_tab[rownames(meta), ] + + fit_data <- Maaslin2( + asv_tab, + meta, + output = here::here("data/maaslin/1"), + transform = "AST", + fixed_effects = c( + "condition", + "siblings", + "age_s", + "csection", + "sex", + "ges_age_s", + "birthweight_s", + "edlevel_s" + ), + #random_effects = "skippy_id", + reference = "0", + normalization = "TSS", + standardize = FALSE, + min_prevalence = 0 # prev filterin already done + ) + + res_year1 <- fit_data$results + + list(res_all, res_inf, res_year1) + }) + save(models_imp, file = here::here("data/maaslin2_pp_mi_pc_bez.Rds")) + } else { + load(here::here("data/maaslin2_pp_mi_pc_bez.Rds")) + } + + + + + +# switch the 1 to 2-5 to check other imputations +maaslin2_tables_pp <- map(models_imp[[1]], function(x) { + x %>% filter(metadata == "condition") %>% + select(feature, coef, stderr, pval, qval) %>% + arrange(qval, desc(abs(coef))) %>% + mutate( + across(where(is.numeric), round, 3), + feature = str_replace(feature, "\\.\\.", "."), + feature = str_replace(feature, "\\.", ":") + ) +}) + +save(maaslin2_tables_pp, file = here::here("data/maaslin2_tables_pp_pc.Rds")) + + + diff --git a/R/pc_linda_bez.R b/R/pc_linda_bez.R new file mode 100644 index 0000000..bb080ce --- /dev/null +++ b/R/pc_linda_bez.R @@ -0,0 +1,525 @@ +set.seed(1) +library(mia) +library(LinDA) +library(tidyverse) +library(tidySummarizedExperiment) +library(glue) + + +############################################################################### +######################### 1. ITT ############################## +############################################################################### + +# import of biomfile and meta data can be found in the import script +load(here::here("data/data_pc.Rds")) +load(file = here::here("data/data_imp.Rds")) + +# for analyses we apply prevalence fitlering +tse_pc <- subsetByPrevalentTaxa(tse_pc, detection = 0, prevalence = 0.1) + +####################### 1.1 Complete Case Analysis ############################ + + +# model includes random intercepts and all samples +asv_tab <- as.data.frame(assay(tse_pc, "counts")) +vars <- c("age_s", "condition", "siblings", "constipation", "diarrhea", "skippy_id") +meta <- colData(tse_pc) %>% as.data.frame() %>% + select(all_of(vars)) +linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + (1|skippy_id)', alpha = 0.1, + prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97) + + +filter(linda_obj$output$condition1, reject) +filter(linda_obj$output$siblings1, reject) + + +# model includes random intercepts and excludes 1 year samples +tse_pc_infancy <- filter(tse_pc, week != 52) +asv_tab <- as.data.frame(assay(tse_pc_infancy, "counts")) +meta <- colData(tse_pc_infancy) %>% as.data.frame() %>% + select(all_of(vars)) +dim(na.omit(meta)) +linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + (1|skippy_id)', alpha = 0.1, + prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97) +# linda.plot(linda_obj, c('condition'), +# titles = c('Condition: n v.s. y'), alpha = 0.1, lfc.cut = 1, +# legend = TRUE, directory = NULL, width = 11, height = 8) + +filter(linda_obj$output$condition1, reject) +filter(linda_obj$output$siblings1, reject) + + + +# model includes 1 years samples only +tse_pc_year1 <- filter(tse_pc, week == 52) +asv_tab <- as.data.frame(assay(tse_pc_year1, "counts")) +meta <- colData(tse_pc_year1) %>% as.data.frame() %>% + select(all_of(vars)) +linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings', alpha = 0.1, + prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97) +res <- filter(linda_obj$output$condition1, reject) %>% + rownames_to_column("taxid") +rd <- rowData(tse_pc_year1) %>% + as.data.frame() %>% + rownames_to_column("taxid") %>% + filter(taxid %in% res$taxid) +res <- left_join(res, rd, by = "taxid") +filter(linda_obj$output$condition1, reject) +filter(linda_obj$output$siblings1, reject) + + + +######################## 1. 2Multiple imputation ############################# + + +load(here::here("data/data_pc.Rds")) +load(file = here::here("data/data_imp.Rds")) +# for analyses we apply prevalence fitlering +tse_pc <- subsetByPrevalentTaxa(tse_pc, detection = 0, prevalence = 0.1) + +models_imp <- map2(implist, 1:length(implist), function(dimp, imp) { + tse_pc_map <- tse_pc + fvars <- c("constipation", "siblings", "diarrhea", "condition") + # add metadata to tse_pc + colData(tse_pc_map) <- colData(tse_pc_map) %>% + as.data.frame() %>% + select(-all_of(fvars), -contains("age")) %>% + left_join(select(dimp, age, sample_id, constipation, diarrhea, siblings, condition), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate(across(all_of(fvars), function(x) as.factor(x))) %>% + DataFrame() + colData(tse_pc_map)$age <- colData(tse_pc_map)$age + as.numeric(colData(tse_pc_map)$week) * 7 + colData(tse_pc_map)$age_s <- scale(colData(tse_pc_map)$age)[, 1] + + # all samples + asv_tab <- as.data.frame(assay(tse_pc_map, "counts")) + meta <- colData(tse_pc_map) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars)) + asv_tab <- asv_tab[, rownames(meta)] + + linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + (1|skippy_id)', alpha = 0.4, + prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97) + + res_all <- linda_obj$output$condition1 + res_all2 <- linda_obj$output[["condition1:age_s"]] + + # model includes random intercepts and excludes 1 year samples + tse_pc_infancy <- filter(tse_pc_map, week != 52) + asv_tab <- as.data.frame(assay(tse_pc_infancy, "counts")) + meta <- colData(tse_pc_infancy) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars)) + linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + (1|skippy_id)', alpha = 0.1, + prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97) + + + res_infancy <- linda_obj$output$condition1 + res_infancy2 <- linda_obj$output[["condition1:age_s"]] + + # model includes 1 years samples only + tse_pc_year1 <- filter(tse_pc_map, week == 52) + asv_tab <- as.data.frame(assay(tse_pc_year1, "counts")) + meta <- colData(tse_pc_year1) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars)) + linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings', alpha = 0.1, + prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97) + + + res_year1 <- linda_obj$output$condition1 + res_year12 <- linda_obj$output[["condition1:age_s"]] + + list(res_all, res_infancy, res_year1, res_all2, res_infancy2, res_year12) +}) + +tables_linda_itt <- map(models_imp, function(x) { + x[[1]] %>% + rownames_to_column("taxon") %>% + select(taxon, log2FoldChange, lfcSE, pvalue, padj) %>% + arrange(padj, desc(abs(log2FoldChange))) %>% + mutate(across(where(is.numeric), function(x) round(x, 3))) +}) +tables_linda_itt + +tables_linda_itt2 <- map(models_imp, function(x) { + x[[4]] %>% + rownames_to_column("taxon") %>% + select(taxon, log2FoldChange, lfcSE, pvalue, padj) %>% + arrange(padj, desc(abs(log2FoldChange))) %>% + mutate(across(where(is.numeric), function(x) round(x, 3))) +}) +tables_linda_itt2 + + + + + +tables_linda_itt_infancy <- map(models_imp, function(x) { + x[[2]] %>% + rownames_to_column("taxon") %>% + select(taxon, log2FoldChange, lfcSE, pvalue, padj) %>% + arrange(padj, desc(abs(log2FoldChange))) %>% + mutate(across(where(is.numeric), function(x) round(x, 3))) +}) + + +tables_linda_itt_year1 <- map(models_imp, function(x) { + x[[3]] %>% + rownames_to_column("taxon") %>% + select(taxon, log2FoldChange, lfcSE, pvalue, padj) %>% + arrange(padj, desc(abs(log2FoldChange))) %>% + mutate(across(where(is.numeric), function(x) round(x, 3))) +}) + +tables_linda_itt +tables_linda_itt2 +tables_linda_itt_infancy +tables_linda_itt_year1 + +linda_identified <- map2_dfr( + list( + tables_linda_itt, + tables_linda_itt2, + tables_linda_itt_infancy, + tables_linda_itt_year1 + ), + c( + "main effect", + "interaction term", + "early", + "late" + ), + function(table, term) { + filter(table[[1]], padj <= 0.2) %>% + mutate(term = term) + }) %>% + arrange(desc(abs(log2FoldChange)), padj) + +# # delete duplicate row +# linda_identified <- linda_identified[-4, ] +linda_identified + + +save( + tables_linda_itt, + tables_linda_itt2, + linda_identified, + file = here::here("data/tables_linda_itt_pc.Rds") +) + +save( + tables_linda_itt_infancy, + file = here::here("data/tables_linda_itt_infancy_pc.Rds") +) + +save( + tables_linda_itt_year1, + file = here::here("data/tables_linda_itt_year1_pc.Rds") +) + + + + + + + + +models_imp <- map2(implist, 1:length(implist), function(dimp, imp) { + tse_pc_map <- tse_pc + fvars <- c("constipation", "siblings", "diarrhea", "condition") + # add metadata to tse_pc + colData(tse_pc_map) <- colData(tse_pc_map) %>% + as.data.frame() %>% + select(-all_of(fvars), -contains("age"), -bfexcl) %>% + left_join(select(dimp, age, sample_id, constipation, diarrhea, siblings, bfexcl, condition), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate(across(all_of(fvars), function(x) as.factor(x))) %>% + DataFrame() + colData(tse_pc_map)$age <- colData(tse_pc_map)$age + as.numeric(colData(tse_pc_map)$week) * 7 + colData(tse_pc_map)$age_s <- scale(colData(tse_pc_map)$age)[, 1] + + + # model includes 1 years samples only + tse_pc_year1 <- filter(tse_pc_map, week == 52) + asv_tab <- as.data.frame(assay(tse_pc_year1, "counts")) + meta <- colData(tse_pc_year1) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars), bfexcl) + linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + bfexcl', alpha = 0.1, + prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97) + res <- filter(linda_obj$output$condition1, reject) %>% + rownames_to_column("taxid") + if (dim(res)[1] >= 1) { + rd <- rowData(tse_pc_year1) %>% + as.data.frame() %>% + rownames_to_column("taxid") %>% + filter(taxid %in% res$taxid) + res_year1 <- left_join(res, rd, by = "taxid") %>% + mutate(time = "year1") + } + + out <- list() + if(exists("res_year1")) { + out[[1]] <- res_year1 + } + out +}) + + +models_imp + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +############################################################################### +######################### 2. PP ############################## +############################################################################### + +# import of biomfile and meta data can be found in the import script +load(here::here("data/data_pc.Rds")) +load(file = here::here("data/data_imp.Rds")) + +tse <- subsetByPrevalentTaxa(tse_pc, detection = 0, prevalence = 0.1) + + +# obtain ids that were selected for PP analyses +pp_indicator <- foreign::read.spss(here::here("data/raw_data/kelly141022/Data_ITT_PP_ExploratoryDRselections.sav"), to.data.frame = TRUE) +pp_indicator <- select(pp_indicator, skippy_id = ID, pp = PP) +# add pp info to existing data +if (!"pp" %in% colnames(d)) { + d <- left_join(d, pp_indicator, by = "skippy_id") +} +# 60 that are in PP and condition 0; 18 that are condition 1 and pp. Fits... +d_pp <- filter(d, pp == 1) +implist_pp <- map(implist, function(dimp) { + dimp_new <- left_join(dimp, pp_indicator, by = "skippy_id") %>% + filter(pp == 1) + dimp_new +}) + + +fvars <- c("constipation", "siblings", "diarrhea", "condition", "csection", "sex") +# add metadata to tse +colData(tse) <- colData(tse) %>% + as.data.frame() %>% + select(-ges_age, -birthweight, -edlevel, -csection, -all_of(fvars), -age, -age_s) %>% + left_join(select(d_pp, age, sample_id, constipation, condition, diarrhea, siblings, pp, csection, sex, ges_age, edlevel, birthweight), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate( + across(all_of(fvars), function(x) as.factor(x)), + ges_age_s = scale(ges_age)[, 1], + birthweight_s = scale(birthweight)[, 1], + edlevel_s = scale(edlevel)[, 1] + ) %>% + DataFrame() +colData(tse)$age <- colData(tse)$age + as.numeric(colData(tse)$week) * 7 +colData(tse)$age_s <- scale(colData(tse)$age)[, 1] +tse <- filter(tse, pp == 1) + +####################### 2.1 Complete Case Analysis ############################ + + +# model includes random intercepts and all samples +asv_tab <- as.data.frame(assay(tse, "counts")) +meta <- colData(tse) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars), ges_age_s, birthweight_s, edlevel_s) +asv_tab <- asv_tab[, rownames(meta)] +linda_obj <- linda( + asv_tab, + meta, + formula = '~condition * age_s + siblings + csection + sex + ges_age_s + birthweight_s + edlevel_s + (1|skippy_id)', alpha = 0.4, + prev.cut = 0.1, + lib.cut = 1000, + winsor.quan = 0.97 +) + + +filter(linda_obj$output$condition1, reject) +filter(linda_obj$output$siblings1, reject) + + +# model includes random intercepts and excludes 1 year samples +tse_infancy <- filter(tse, week != 52) +asv_tab <- as.data.frame(assay(tse_infancy, "counts")) +meta <- colData(tse_infancy) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars), ges_age_s, edlevel_s, birthweight_s) +dim(na.omit(meta)) +asv_tab <- asv_tab[, rownames(meta)] + +linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + edlevel_s + ges_age_s + birthweight_s + sex + csection + (1|skippy_id)', alpha = 0.4, + prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97) +# linda.plot(linda_obj, c('condition'), +# titles = c('Condition: n v.s. y'), alpha = 0.4, lfc.cut = 1, +# legend = TRUE, directory = NULL, width = 11, height = 8) + +filter(linda_obj$output$condition1, reject) +filter(linda_obj$output$siblings1, reject) + + + +# model includes 1 years samples only +tse_year1 <- filter(tse, week == 52) +asv_tab <- as.data.frame(assay(tse_year1, "counts")) +meta <- colData(tse_year1) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars), ges_age_s, birthweight_s, edlevel_s) +asv_tab <- asv_tab[, rownames(meta)] +linda_obj <- linda( + asv_tab, + meta, + formula = '~condition * age_s + siblings + csection + sex + ges_age_s + birthweight_s + edlevel_s', alpha = 0.4, + prev.cut = 0.1, + lib.cut = 1000, + winsor.quan = 0.97 +) +res <- filter(linda_obj$output$condition1, reject) %>% + rownames_to_column("taxid") +rd <- rowData(tse_year1) %>% + as.data.frame() %>% + rownames_to_column("taxid") %>% + filter(taxid %in% res$taxid) +res <- left_join(res, rd, by = "taxid") +filter(linda_obj$output$siblings1, reject) +filter(linda_obj$output$condition1, reject) + + + + + +######################## 2.2 Multiple imputation ############################# + +load(here::here("data/data_pc.Rds")) +load(file = here::here("data/data_imp.Rds")) +# for analyses we apply prevalence fitlering +tse <- subsetByPrevalentTaxa(tse_pc, detection = 0, prevalence = 0.1) + + +models_imp <- map2(implist_pp, 1:length(implist), function(dimp, imp) { + tse_map <- tse + fvars <- c("constipation", "siblings", "diarrhea", "condition", "csection", "sex") + # add metadata to tse + colData(tse_map) <- colData(tse_map) %>% + as.data.frame() %>% + select(-all_of(fvars), -age, -age_s, -ges_age, -birthweight, -edlevel, -csection) %>% + left_join(select(dimp, age, sample_id, condition, constipation, diarrhea, siblings, pp, csection, sex, ges_age_s, edlevel_s, birthweight_s), by = "sample_id") %>% + column_to_rownames("sample_id") %>% + mutate(across(all_of(fvars), function(x) as.factor(x))) %>% + DataFrame() + colData(tse_map)$age <- colData(tse_map)$age + as.numeric(colData(tse_map)$week) * 7 + colData(tse_map)$age_s <- scale(colData(tse_map)$age)[, 1] + tse_map <- filter(tse_map, pp == 1) + + # all samples + asv_tab <- as.data.frame(assay(tse_map, "counts")) + meta <- colData(tse_map) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars), edlevel_s, birthweight_s, ges_age_s) + asv_tab <- asv_tab[, rownames(meta)] + linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + csection + sex + ges_age_s + birthweight_s + edlevel_s + (1|skippy_id)', alpha = 0.4, + prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97) + + + res_all <- linda_obj$output$condition1 + res_all2 <- linda_obj$output[["condition1:age_s"]] + + # model includes random intercepts and excludes 1 year samples + tse_infancy <- filter(tse_map, week != 52) + asv_tab <- as.data.frame(assay(tse_infancy, "counts")) + meta <- colData(tse_infancy) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars), edlevel_s, birthweight_s, ges_age_s) + asv_tab <- asv_tab[, rownames(meta)] + linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + csection + sex + ges_age_s + birthweight_s + edlevel_s + (1|skippy_id)', alpha = 0.4, + prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97) + + res_infancy <- linda_obj$output$condition1 + res_infancy2 <- linda_obj$output[["condition1:age_s"]] + + + + # model includes 1 years samples only + tse_year1 <- filter(tse_map, week == 52) + asv_tab <- as.data.frame(assay(tse_year1, "counts")) + meta <- colData(tse_year1) %>% as.data.frame() %>% + select(skippy_id, age_s, all_of(fvars), edlevel_s, birthweight_s, ges_age_s) + asv_tab <- asv_tab[, rownames(meta)] + linda_obj <- linda(asv_tab, meta, formula = '~condition * age_s + siblings + csection + sex + ges_age_s + birthweight_s + edlevel_s', alpha = 0.4, + prev.cut = 0.1, lib.cut = 1000, winsor.quan = 0.97) + + res_year1 <- linda_obj$output$condition1 + res_year12 <- linda_obj$output[["condition1:age_s"]] + + list(res_all, res_infancy, res_year1, res_all2, res_infancy2, res_year12) +}) + +tables_linda_pp <- map(models_imp, function(x) { + x[[1]] %>% + rownames_to_column("taxon") %>% + select(taxon, log2FoldChange, lfcSE, pvalue, padj) %>% + arrange(padj, desc(abs(log2FoldChange))) %>% + mutate(across(where(is.numeric), function(x) round(x, 3))) +}) +tables_linda_pp + +save(tables_linda_pp, file = here::here("data/tables_linda_pp_pc.Rds")) + + + +tables_linda_pp_infancy <- map(models_imp, function(x) { + x[[2]] %>% + rownames_to_column("taxon") %>% + select(taxon, log2FoldChange, lfcSE, pvalue, padj) %>% + arrange(padj, desc(abs(log2FoldChange))) %>% + mutate(across(where(is.numeric), function(x) round(x, 3))) +}) +tables_linda_pp_infancy + +save(tables_linda_pp_infancy, file = here::here("data/tables_linda_pp_infancy_pc.Rds")) + + + +tables_linda_pp_year1 <- map(models_imp, function(x) { + x[[3]] %>% + rownames_to_column("taxon") %>% + select(taxon, log2FoldChange, lfcSE, pvalue, padj) %>% + arrange(padj, desc(abs(log2FoldChange))) %>% + mutate(across(where(is.numeric), function(x) round(x, 3))) +}) +tables_linda_pp_year1 + +save(tables_linda_pp_year1, file = here::here("data/tables_linda_pp_year1_pc.Rds")) + diff --git a/R/skippy_import_biom_mba.R b/R/skippy_import_biom_mba.R new file mode 100644 index 0000000..43d9a12 --- /dev/null +++ b/R/skippy_import_biom_mba.R @@ -0,0 +1,135 @@ +# tse_bingo, tse_bibo and tse_skippy must go through same steps in terms +# of preprocessing +library(tidyverse) +library(mia) +library(glue) +library(tidySummarizedExperiment) +library(scater) +library(readxl) + + +# biom_file <- here::here("data/raw_data/skippy_non_concat_SILVA_138.biom") +# file.exists(biom_file) +# # because of some issue in NGT2 we need to replace NaN in the biom file with 0 +# biom_lines <- read_lines(biom_file) +# sum(str_count(biom_lines, "NaN")) +# biom_lines <- str_replace_all(biom_lines, "NaN", "0.0") +# sum(str_count(biom_lines, "NaN")) +# write_lines( +# biom_lines, +# here::here("data/raw_data/skippy_non_concat_SILVA_138_rpl.biom") +# ) + +# now we should be able to use read_biom +biom_file <- here::here("data/raw_data/skippy_non_concat_SILVA_138_rpl.biom") + + +# extract skippy id and week from map files +raw_path <- here::here("data/map_files") +map_files <- list.files(raw_path, pattern = ".xlsx$") +map_files %>% length() +lib_nums <- str_extract(map_files, "\\d\\d\\d\\d_00\\d\\d") +LibraryNumber_merged <- c(1:6) +map_file_merged <- map2_dfr(map_files[1:6], LibraryNumber_merged, function(filename, lnm) { + # extract library number + lib_num <- str_extract(filename, "\\d\\d\\d\\d_00\\d\\d") + # read provided map file + xl <- read_excel(glue("{raw_path}/{filename}")) + map_file <- xl %>% + mutate( + LibraryNumber = lnm, + ProjectName = ifelse(is.na(ProjectName), "unspecified", ProjectName) + ) %>% + filter(!ProjectName == "Shime-uncooled_ileal_digesta") %>% + select( + "sample_id" = internal_sample_id, + id = Seq_ID, + ProjectName + ) %>% + filter(!is.na(`sample_id`)) + + return(map_file) +}) +# make sample_ids unique as they are in the tse object later +map_file_merged <- map_file_merged %>% + mutate( + namealt = glue("{sample_id}_{1:dim(map_file_merged)[1]}"), + sample_id = ifelse(str_detect(sample_id, "MOCK"), namealt, sample_id)) + +# extract skippy id and week +# the following provided duplicates and need extra labeling later +# when I create the new sample names (see further below) +duplicates <- c( + "245_2_22-1-17", + "240_52_11-2-18", + "269_2_18_mei_2017", + "276_5_26052017" +) + +sample_data <- str_match(map_file_merged$id, ".*(\\d\\d\\d)(_|-)(\\d+).*") %>% + as_tibble() %>% + select(id = V1, skippy_id = V2, week = V4) %>% + full_join(map_file_merged, by = "id") %>% + filter(!is.na(sample_id)) %>% + mutate(duplicated = ifelse(id %in% duplicates, TRUE, FALSE)) %>% + select(wur_id = id, skippy_id, sample_id, week, duplicated) +filter(sample_data, duplicated) + + +skippy <- loadFromBiom(biom_file) +colnames(rowData(skippy)) <- c( + "kingdom", "phylum", "class", "order", "family", "genus", "species" +) + +# add sample ids as column +colData(skippy)[["sample_id"]] <- rownames(colData(skippy)) +rownames(colData(skippy)) +colData(skippy)[["sample_id"]] + + + +colData(skippy) <- colData(skippy) %>% as.data.frame() %>% + left_join(sample_data, by = "sample_id") %>% + mutate(neg_control = as.factor(ifelse(is.na(week), 1, 0))) %>% + column_to_rownames("sample_id") %>% + mutate(sample_id = rownames(colData(skippy))) %>% + DataFrame() + +# we will now store MOCK samples in a separate tse +mock_samples <- filter(skippy, str_detect(sample_id, "MOCK")) + +# also check they cluster together closely when using ordination +colData(skippy) %>% dim() +skippy <- mutate(skippy, + mock = as.factor(ifelse(str_detect(sample_id, "MOCK"), 1, 0)) +) +colData(skippy)[["library_size"]] <- colSums(assay(skippy)) +# perform NMDS coordination method + +# MOCK samples do cluster together but particularly 1 outlier is there +tse_skippy <- filter(skippy, !str_detect(sample_id, "MOCK")) + + +# library sizes (we need to first get orignal IDs in order to filter out +# negative control) +colData(tse_skippy) %>% as.data.frame() %>% filter(!week %in% c(2, 5, 52)) + +# the low ls are negative controls as we can see in above table +colData(tse_skippy) %>% as.data.frame() %>% + arrange(library_size) +# exclude negative controls +tse_skippy <- filter(tse_skippy, neg_control == 0) %>% + mutate(sample_ids_new = glue("{skippy_id}_{ifelse(week == 2, 1, ifelse(week == 5, 2, ifelse(week == 52, 3, 'CHECK')))}{ifelse(duplicated, '_2', '')}")) + + + +# we retain 331 samples at this step +# lets change row and column names to better descriptions +colnames(tse_skippy) <- colData(tse_skippy)[, "sample_ids_new"] +colData(tse_skippy)[, "sample_ids_new"] +tse_skippy <- filter(skippy, !str_detect(sample_id, "MOCK")) +rns <- str_c(rowData(tse_skippy)$family, rowData(tse_skippy)$genus, sep = "_") +rns[duplicated(rns)] +rownames(tse_skippy) <- rns +save(tse_skippy, file = here::here("data/tse_skippy.Rds")) + diff --git a/R/tables.R b/R/tables.R new file mode 100644 index 0000000..f12c400 --- /dev/null +++ b/R/tables.R @@ -0,0 +1,109 @@ +# this code is from Jennifer but I (Henrik) checked it + +library(table1) +library(dplyr) +library(tidyverse) +library(lubridate) + +load(here::here("data/data_imp.Rds")) +load(here::here("data/data.Rds")) + +bf <- readxl::read_excel(f, sheet = "Feeding", na = "\\") %>% + select(skippy_id = ID, bfexcl = ExclusiveBF) %>% + mutate(skippy_id = as.character(skippy_id)) + +itt_var <- read_csv2(here::here("data/skippy_stool_data_cleaned.csv")) %>% + mutate_all(function(x) ifelse(x == 99999, NA, ifelse(x == 88888, NA, x))) %>% + filter(id %in% sav$id) %>% + mutate(across(contains("dat_"), function(x) { + x_new <- str_replace(x, "okt", "october") + x_new <- str_replace(x_new, "mrt", "march") + x_new <- str_replace(x_new, "mei", "may") + x_new <- str_replace(x_new, "15-jan-17/22-jan-17", "18/jan/17") + x_new <- dmy(x_new) + return(x_new) + }), + datbirth_infant = dmy(datbirth_infant), + datbirth_mom = dmy(datbirth_mom), + age_week2 = as.numeric(dat_week2 - datbirth_infant), + age_week5 = as.numeric(dat_week5 - datbirth_infant), + age_week52 = as.numeric(dat_1year - datbirth_infant), + age_mom = as.numeric((dat_week2 - datbirth_mom)/365), + stdat_week2 = as.numeric(dat_week2 - dmy('01-01-2016')), + stdat_week5 = as.numeric(dat_week5 - dmy('01-01-2016')), + stdat_week52 = as.numeric(dat_1year - dmy('01-01-2016')), + id = as.character(id)) %>% + select( + skippy_id = id, matches("age_week\\d+"), condition, csection, birthweight, + siblings, sex, antibiotic_week2, + antibiotic_week5, antibiotic_1year, apgar_5, ges_age, age_mom, smoking, + drinking, weaning, parity, stdat_week2, stdat_week5, stdat_week52) %>% + left_join(bf, by = "skippy_id") + +# there is a typo in the maternal date of birth of ID 259, so her age will be removed +itt_var$age_mom[52] <- NA + +itt_var$condition <- factor(itt_var$condition, + levels = c(0, 1), + labels = c("CAU", "SSC")) +itt_var$sex <- factor(itt_var$sex, + levels = c(0, 1), + labels = c("Male", "Female")) +itt_var$csection <- factor(itt_var$csection, + levels = c("0", "1"), + labels = c("No", "Yes")) + +itt_var$siblings <- as.integer(itt_var$siblings) +itt_var$parity <- as.integer(itt_var$parity) + + + + +itt_var$antibiotic_week2 <- factor(itt_var$antibiotic_week2, + levels = c("0", "1"), + labels = c("No", "Yes")) +itt_var$antibiotic_week5 <- factor(itt_var$antibiotic_week5, + levels = c("0", "1"), + labels = c("No", "Yes")) +itt_var$antibiotic_1year <- factor(itt_var$antibiotic_1year, + levels = c("0", "1"), + labels = c("No", "Yes")) + +#change labels. Of course there is a better way, but due to time... +label(itt_var$sex) <- "Sex" +label(itt_var$ges_age) <- "Gestational age" +label(itt_var$csection) <- "C-section" +label(itt_var$siblings)<- "Siblings" +label(itt_var$parity) <- "Parity" +label(itt_var$birthweight) <- "Birth weight" +label(itt_var$condition) <- "Condition" +label(itt_var$bfexcl) <- "Exclusive breastfeeding duration" + + + +label(itt_var$age_week2) <- "Age week 2" +label(itt_var$age_week5) <- "Age week 5" +label(itt_var$age_week52) <- "Age week 52" + +label(itt_var$antibiotic_week2) <- "Antibiotics week 2" +label(itt_var$antibiotic_week5) <- "Antibiotics week 5" +label(itt_var$antibiotic_1year) <- "Antibiotics week 52" + +units(itt_var$ges_age) <- "weeks" +units(itt_var$birthweight) <- "grams" +units(itt_var$bfexcl) <- "months" +units(itt_var$age_week2) <- "days" +units(itt_var$age_week5) <- "days" +units(itt_var$age_week52) <- "days" + + +# produce table +tbl1 <- table1(~ sex + csection + ges_age + birthweight + siblings + bfexcl + + age_week2 + age_week5 + age_week52 + antibiotic_week2 + + antibiotic_week5 + antibiotic_1year | condition, data = itt_var, + caption = "Demographics Table", + ) + +save(tbl1, file = here::here("data/tbl1.Rds")) + +tbl1 diff --git a/R/volatility.R b/R/volatility.R new file mode 100644 index 0000000..3526743 --- /dev/null +++ b/R/volatility.R @@ -0,0 +1,586 @@ +library(tidyverse) +library(mia) +library(glue) +library(vegan) +library(brms) +library(bayesplot) +library(posterior) +library(tidybayes) + +# import of biomfile and meta data can be found in the import script +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) + +tse <- transformSamples(tse, method = "clr", name = "clr", pseudocount = 1) +asv <- t(assay(tse, "clr")) +ait <- vegdist(asv, method = "euclidean") +ait <- as.matrix(ait) +ids <- sort(unique(d$skippy_id)) +vol <- map_dfr(ids, function(id) { + + # each subject has maximal 3 samples: + s1 <- glue("{id}_1") + s2 <- glue("{id}_2") + s3 <- glue("{id}_3") + + # we can only obtain vol if there is no missing sample in a pair + vol1 <- ifelse(s1 %in% rownames(ait) & s2 %in% rownames(ait), ait[s1, s2], NA) + vol2 <- ifelse(s2 %in% rownames(ait) & s3 %in% rownames(ait), ait[s2, s3], NA) + + tibble( + skippy_id = id, + time = c("2-5", "5-52"), + vol = c(vol1, vol2) + ) +}) + + +vol_by_comp <- group_by(vol, time) %>% nest() +# visualize distributions of volatility per time point pair +voldist <- map(vol_by_comp[[2]], function(df) { + ggplot(df, aes(vol)) + + geom_density() +}) +voldist[[2]] +vol1 <- vol_by_comp[[2]][[1]] +vol2 <- vol_by_comp[[2]][[2]] +colnames(vol1) <- c("skippy_id", "vol1") +colnames(vol2) <- c("skippy_id", "vol2") + + + + +############################################################################### +######################### 1. ITT ############################## +############################################################################### + + + + +####################### 1.1 Complete Case Analysis ############################ + +d <- dplyr::left_join(d, vol1, by = "skippy_id") %>% + dplyr::left_join(vol2, by = "skippy_id") %>% + filter(week == 2) %>% + mutate(across(contains("vol"), function(x) scale(x)[, 1])) +fvars <- c("siblings", "condition") +d <- mutate( + d, + across(all_of(fvars), function(x) as.factor(x)), + age = week * 7 + age, + ges_age_s = scale(ges_age)[, 1], + edlevel_s = scale(edlevel)[, 1], + birthweight_s = scale(birthweight)[, 1], + condition_label = ifelse(condition == 0, "CAU", ifelse( + condition == 1, "SSC", NA)) +) + +implist_vol <- map(implist, function(dimp) { + df <- dplyr::left_join(dimp, vol1, by = "skippy_id") %>% + dplyr::left_join(vol2, by = "skippy_id") %>% + filter(week == 2) %>% + mutate(across(contains("vol"), function(x) scale(x)[, 1])) + # the vol columns will be imputed + imp <- mice::mice(df, m = 1) + complete(imp) +}) +dim(implist_vol[[1]]) + +# make a plot +d %>% + pivot_longer(contains("vol"), names_to = "Time", values_to = "Volatility") %>% + mutate(Time = ifelse(Time == "vol1", "2-5 weeks", "5-52 weeks")) %>% + ggplot(aes(condition_label, Volatility, fill = condition_label)) + + geom_boxplot(outlier.alpha = 0) + + # ggbeeswarm::geom_beeswarm(alpha = 0.4) + + geom_jitter(width = 0.1) + + #stat_summary(fun.y=mean, geom="point", shape=20, size=14, color="red", fill="red") + + scale_fill_manual(values = c("#ffffff", "#c0c1c2")) + + #scale_fill_manual(values = c("#fc8d62", "#8da0cb")) + + theme_bw(base_size = 25) + + theme( + legend.position = "none", + strip.placement = "outside", + strip.background = element_blank()) + + xlab("") + + +vol_plot <- d %>% + pivot_longer(contains("vol"), names_to = "Time", values_to = "Volatility") %>% + mutate(Time = ifelse(Time == "vol1", "2-5 weeks", "5-52 weeks")) %>% + ggplot(aes(condition_label, Volatility, fill = condition_label)) + + geom_boxplot(outlier.alpha = 0) + + # ggbeeswarm::geom_beeswarm(alpha = 0.4) + + geom_jitter(width = 0.1, size = 2) + + #stat_summary(fun.y=mean, geom="point", shape=20, size=14, color="red", fill="red") + + facet_wrap(~Time, strip.position = "bottom") + + scale_fill_manual(values = c("#ffffff", "#c0c1c2")) + + #scale_fill_manual(values = c("#fc8d62", "#8da0cb")) + + theme_bw(base_size = 25) + + theme( + legend.position = "none", + strip.placement = "outside", + strip.background = element_blank()) + + xlab("") +vol_plot + +save(vol_plot, file = here::here("data/vol_out.Rds")) + + + + + + +# determine model structure + + +# I consider the following variables for selection +coefs <- c( + "csection", + "birthweight_s", + "birthweight_s + ges_age_s", + "siblings", + "sex", + "apgar_5_s", + "ges_age_s", + "edlevel_s", + "age_s" +) + + +# I will use the following algorithm: +# For each dataset in implist: + # Calculate LOO for base model + # Then for each var in coefs: + # calculate LOO for base model + coef + # If LOO indicates the new model is a better fit + # keep that coef in list + +loo_comp <- map_dfr(coefs, function(coef) { + map_dfr(1:5, function(i) { + # base model + f1 <- bf(Volatility ~ condition + (1|skippy_id)) + m1 <- brm( + data = pivot_longer(implist_vol[[i]], contains("vol"), names_to = "Time", values_to = "Volatility"), + formula = f1, + file = here::here(glue("data/m1_imp{i}_vol.Rds")) + ) + loo_m1 <- add_criterion( + m1, + "loo", + file = here::here(glue("data/loo_m1_imp{i}_vol")), + moment_match = FALSE + ) + + f2 <- bf(glue("Volatility ~ condition + {coef} + (1|skippy_id)")) + m2 <- brm( + data = pivot_longer(implist_vol[[i]], contains("vol"), names_to = "Time", values_to = "Volatility"), + formula = f2, + file = here::here(glue("data/m2_imp{i}_{coef}_vol.Rds")) + ) + + loo_m2 <- add_criterion( + m2, + "loo", + file = here::here(glue("data/loo_m2_imp{i}_{coef}_vol")), + moment_match = FALSE + ) + lcomp <- loo_compare(loo_m2, loo_m1) + score <- ifelse(rownames(lcomp)[1] == "loo_m1", 0, 1) + + tibble( + coef = coef, + imp = i, + score = score + ) + }) +}) + +group_by(loo_comp, coef) %>% + summarise(ss = sum(score)) + + + +model1 <- brm( + family = student(), + formula = vol1 ~ condition + ges_age_s + birthweight_s + edlevel_s + csection, + data = d, + file = here::here("data/vol1_cc.Rds") +) +summary(model1) +hypothesis(model1, "condition1 < 0") + + + + +model1_bez <- brm( + family = student(), + formula = vol1 ~ condition, + data = d, + file = here::here("data/vol1_cc_bez.Rds") +) +summary(model1_bez) +hypothesis(model1_bez, "condition1 < 0") + + + +model2 <- brm( + family = student(), + formula = vol2 ~ condition + ges_age_s + birthweight_s + edlevel_s + csection, + data = d, + file = here::here("data/vol2_cc.Rds") +) + +summary(model2) +hypothesis(model2, "condition1 < 0") + + + +mlm <- brm( + family = student(), + formula = Volatility ~ condition + ges_age_s + birthweight_s + edlevel_s + csection + (1 | skippy_id), + data = pivot_longer(d, contains("vol"), names_to = "Time", values_to = "Volatility"), + file = here::here("data/vol1_cc_mlm.Rds") +) +summary(mlm) +hypothesis(mlm, "condition1 < Intercept") + + + +################### 1.2 Multiple Imputation Analysis ########################## + + + +model1 <- brm_multiple( + family = student(), + formula = vol1 ~ condition + ges_age_s + birthweight_s + edlevel_s + csection, + data = implist_vol, + file = here::here("data/vol1_imp.Rds") +) +summary(model1) +hypothesis(model1, "condition1 < 0") + +hypothesis(model1, "ges_age_s < 0") +post <- posterior_samples(model1) +mean(post$b_ges_age_s<0) + + +model1_bez <- brm_multiple( + family = student(), + formula = vol1 ~ condition, + data = implist_vol, + file = here::here("data/vol1_imp_bez.Rds") +) +summary(model1_bez) +hypothesis(model1_bez, "condition1 < 0") + + +post <- posterior_samples(model1) +mean(post$b_condition1<0) + +model1_bf <- brm_multiple( + family = student(), + formula = vol1 ~ condition + ges_age_s + birthweight_s + edlevel_s + csection + bfexcl, + data = implist_vol, + file = here::here("data/vol1_imp_bf.Rds") +) + +summary(model1_bf) +hypothesis(model1_bf, "condition1 < 0") + +model2 <- brm_multiple( + family = student(), + formula = vol2 ~ condition + ges_age_s + birthweight_s + edlevel_s + csection, + data = implist_vol, + file = here::here("data/vol2_imp.Rds") +) + +summary(model2) +hypothesis(model2, "condition1 < 0") + +model2_bf <- brm_multiple( + family = student(), + formula = vol2 ~ condition + ges_age_s + birthweight_s + edlevel_s + csection + bfexcl, + data = implist_vol, + file = here::here("data/vol2_imp_bf.Rds") +) + +summary(model2_bf) +hypothesis(model2_bf, "bfexcl < 0") +post <- posterior_samples(model2_bf) +mean(post$b_bfexcl<0) + + +mlm <- brm_multiple( + family = student(), + formula = Volatility ~ condition + ges_age_s + birthweight_s + csection + edlevel_s + (1 | skippy_id), + data = map(implist_vol, ~pivot_longer(.x, contains("vol"), names_to = "Time", values_to = "Volatility")), + file = here::here("data/vol1_imp_mlm.Rds") +) + +summary(mlm) +hypothesis(mlm, "condition1 < 0") +# file.remove(here::here("data/vol1_imp_mlm.Rds")) + +mlm2 <- brm_multiple( + family = student(), + formula = Volatility ~ condition + (1 | skippy_id), + data = map(implist_vol, ~pivot_longer(.x, contains("vol"), names_to = "Time", values_to = "Volatility")), + file = here::here("data/vol1_imp_mlm2.Rds") +) + +summary(mlm2) +hypothesis(mlm2, "condition1 < 0") + + + +save(model1, model1_bez, model1_bf, model2, model2_bf, file = here::here("data/volmodels_itt.Rds")) + + + + + + +############################################################################### +######################### 2. PP ############################## +############################################################################### + +# import of biomfile and meta data can be found in the import script +load(here::here("data/data.Rds")) +load(file = here::here("data/data_imp.Rds")) + +# obtain ids that were selected for PP analyses +pp_indicator <- foreign::read.spss(here::here("data/raw_data/kelly141022/Data_ITT_PP_ExploratoryDRselections.sav"), to.data.frame = TRUE) +pp_indicator <- select(pp_indicator, skippy_id = ID, pp = PP) +# add pp info to existing data +if (!"pp" %in% colnames(d)) { + d <- dplyr::left_join(d, pp_indicator, by = "skippy_id") +} + + + +####################### 2.1 Complete Case Analysis ############################ + +d <- dplyr::left_join(d, vol1, by = "skippy_id") %>% + dplyr::left_join(vol2, by = "skippy_id") %>% + filter(week == 2) %>% + mutate(across(contains("vol"), function(x) scale(x)[, 1])) +fvars <- c("siblings", "condition", "csection", "sex") +d <- mutate( + d, + across(all_of(fvars), function(x) as.factor(x)), + age = week * 7 + age, + ges_age_s = scale(ges_age)[, 1], + edlevel_s = scale(edlevel)[, 1], + birthweight_s = scale(birthweight)[, 1]) %>% + filter(pp == 1) + +model1 <- brm( + family = student(), + formula = vol1 ~ condition + ges_age_s + birthweight_s + edlevel_s + csection, + data = d, + file = here::here("data/vol1_cc_pp.Rds") +) +summary(model1) +hypothesis(model1, "condition1 < 0") + + + +model2 <- brm( + family = student(), + formula = vol2 ~ condition + ges_age_s + birthweight_s + edlevel_s + csection, + data = d, + file = here::here("data/vol2_cc_pp.Rds") +) + +summary(model2) +hypothesis(model2, "condition1 < 0") + + + +mlm <- brm( + family = student(), + formula = Volatility ~ condition + ges_age_s + birthweight_s + edlevel_s + csection + (1 | skippy_id), + data = pivot_longer(d, contains("vol"), names_to = "Time", values_to = "Volatility"), + file = here::here("data/vol1_cc_mlm_pp.Rds") +) +summary(mlm) +hypothesis(mlm, "condition1 < Intercept") + + +################### 2.2 Multiple Imputation Analysis ########################## + +implist_pp <- map(implist, function(dimp) { + dimp_new <- dplyr::left_join(dimp, pp_indicator, by = "skippy_id") %>% + filter(pp == 1) + dimp_new +}) + +implist_vol <- map(implist_pp, function(dimp) { + df <- dplyr::left_join(dimp, vol1, by = "skippy_id") %>% + dplyr::left_join(vol2, by = "skippy_id") %>% + filter(week == 2, pp == 1) %>% + mutate(across(contains("vol"), function(x) scale(x)[, 1])) + # the vol columns will be imputed + imp <- mice::mice(df, m = 1) + complete(imp) +}) + +model1 <- brm_multiple( + family = student(), + formula = vol1 ~ condition + ges_age_s + birthweight_s + edlevel_s + csection, + data = implist_vol, + file = here::here("data/vol1_imp_pp.Rds") +) + + +dim(implist_vol[[1]]) +summary(model1) +hypothesis(model1, "condition1 < 0") + + +model1_bf <- brm_multiple( + family = student(), + formula = vol1 ~ condition + ges_age_s + birthweight_s + edlevel_s + csection + bfexcl, + data = implist_vol, + file = here::here("data/vol1_imp_bf_pp.Rds") +) + +summary(model1_bf) +hypothesis(model1_bf, "condition1 < 0") + +model2 <- brm_multiple( + family = student(), + formula = vol2 ~ condition + ges_age_s + birthweight_s + edlevel_s + csection, + data = implist_vol, + file = here::here("data/vol2_imp_pp.Rds") +) + +summary(model2) +hypothesis(model2, "condition1 < 0") + +model2_bf <- brm_multiple( + family = student(), + formula = vol2 ~ condition + ges_age_s + birthweight_s + edlevel_s + csection + bfexcl, + data = implist_vol, + file = here::here("data/vol2_imp_bf_pp.Rds") +) + +summary(model2_bf) + + +mlm <- brm_multiple( + family = student(), + formula = Volatility ~ condition + ges_age_s + birthweight_s + edlevel_s + csection + (1 | skippy_id), + data = map(implist_vol, ~pivot_longer(.x, contains("vol"), names_to = "Time", values_to = "Volatility")), + file = here::here("data/vol1_imp_mlm_pp.Rds") +) + +summary(mlm) +hypothesis(mlm, "condition1 < 0") + +save(model1, model1_bf, model2, model2_bf, file = here::here("data/volmodels_pp.Rds")) + +mlm2 <- brm_multiple( + family = student(), + formula = Volatility ~ condition + (1 | skippy_id), + data = map(implist_vol, ~pivot_longer(.x, contains("vol"), names_to = "Time", values_to = "Volatility")), + file = here::here("data/vol1_imp_mlm2_pp.Rds") +) + +summary(mlm2) +hypothesis(mlm2, "condition1 < 0") + + + + + + + + +# we can argue that there is evidence for an effect of SSC on MBA +# therefore we will look at the DR analysis as well here: + +dr <- foreign::read.spss( + here::here("data/kelly_documents/data_itt_pp_dr.sav"), + to.data.frame = TRUE + ) %>% + select(skippy_id = ID, ITT, SSC = TotalSSCwk1wk5) %>% + mutate(SSC_s = scale(SSC)[, 1]) + + +implist_vol <- map(implist, function(dimp) { + df <- dplyr::left_join(dimp, vol1, by = "skippy_id") %>% + dplyr::left_join(vol2, by = "skippy_id") %>% + filter(week == 2) %>% + mutate(across(contains("vol"), function(x) scale(x)[, 1])) %>% + dplyr::left_join(select(dr, skippy_id, SSC_s), by = "skippy_id") + # the vol columns will be imputed + imp <- mice::mice(df, m = 1) + complete(imp) +}) +colnames(implist_vol[[1]]) + +model1 <- brm_multiple( + family = student(), + formula = vol1 ~ SSC_s + ges_age_s + birthweight_s + edlevel_s + csection, + data = implist_vol, + file = here::here("data/vol1_imp_dr.Rds") +) + +summary(model1) +hypothesis(model1, "SSC_s < 0") + + +model2 <- brm_multiple( + family = student(), + formula = vol2 ~ SSC_s + ges_age_s + birthweight_s + edlevel_s + csection, + data = implist_vol, + file = here::here("data/vol2_imp_dr.Rds") +) + + +summary(model2) +dim(model2$data) + + +# only within SSC + +formula <- bf(vol1 ~ SSC_s + ges_age_s + birthweight_s + edlevel_s + csection) +model_vol1_within <- brm_multiple( + family = student(), + formula = formula, + data = map(implist_vol, ~filter(.x, condition == 1)), + file = here::here("data/vol1_dr_within_cov") +) + +save(model_vol1_within, file = here::here("data/volmodels_dr.Rds")) + +summary(model_vol1_within) + +formula <- bf(vol1 ~ SSC_s + ges_age_s + birthweight_s + edlevel_s + csection) +model_vol1_out <- brm_multiple( + family = student(), + formula = formula, + data = map(implist_vol, ~filter(.x, condition == 0)), + file = here::here("data/vol1_dr_out_cov") +) + +summary(model_vol1_out) + + +implist_vol <- map(implist_vol, ~mutate(.x, upper = ifelse(SSC_s <= -0.5, "no", "yes"))) + +colnames(implist_vol[[1]]) +formula <- formula <- bf(vol1 ~ upper + ges_age_s + birthweight_s + edlevel_s + csection) +model_year1_cat <- brm_multiple( + family = student(), + formula = formula, + data = implist_vol, + file = here::here("data/vol1_dr_cat_cov") +) + +summary(model_year1_cat) +hypothesis(model_year1_cat, "upperyes < 0") +dim(implist_vol[[1]]) + diff --git a/README.md b/README.md new file mode 100644 index 0000000..adef5df --- /dev/null +++ b/README.md @@ -0,0 +1,5 @@ +# Daily Skin-to-Skin Contact Alters Microbiota Development in Healthy Full-Term Infants + + + +This repository contains R code for the statistical analysis of a research project that investigated the effect of a skin-to-skin intervention on microbiota development.