From 0ef1c3c7db6fdfb4e85b3bf802e1a5b6549579f7 Mon Sep 17 00:00:00 2001
From: Will Nickols <willnickols@college.harvard.edu>
Date: Sat, 27 Jul 2024 17:15:35 -0400
Subject: [PATCH] Added strata option

---
 DESCRIPTION                     |   2 +-
 R/fit.R                         | 195 +++++++++++++++++++++++++-------
 R/maaslin3.R                    |  36 ++++--
 R/viz.R                         |   6 +-
 man/maaslin3.Rd                 |   4 +-
 man/maaslin_check_arguments.Rd  |   3 +-
 man/maaslin_compute_formula.Rd  |   4 +-
 man/maaslin_log_arguments.Rd    |   3 +-
 man/maaslin_parse_param_list.Rd |   3 +-
 man/maaslin_read_data.Rd        |   3 +-
 vignettes/maaslin3_manual.Rmd   |   7 +-
 11 files changed, 204 insertions(+), 62 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 49dd1e3..9f9ce74 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -12,7 +12,7 @@ Description: MaAsLin 3 refines and extends generalized multivariate linear model
 License: MIT + file LICENSE
 Imports: dplyr, pbapply, lmerTest, parallel, lme4, optparse, logging,
         data.table, multcomp, ggplot2, RColorBrewer, patchwork, scales,
-        rlang, hash, matrixStats, tibble, ggnewscale, kableExtra
+        rlang, hash, matrixStats, tibble, ggnewscale, kableExtra, survival
 Suggests: knitr, testthat (>= 2.1.0), rmarkdown, markdown
 VignetteBuilder: knitr
 Collate: fit.R utility_scripts.R viz.R maaslin3.R
diff --git a/R/fit.R b/R/fit.R
index cbf937c..b2e5a60 100644
--- a/R/fit.R
+++ b/R/fit.R
@@ -57,6 +57,7 @@ get_fixed_effects <- function(formula, random_effects_formula, dat_sub, groups,
   } else { # Random effects
     patterns <- paste0("(", unlist(lme4::findbars(formula(gsub("^expr ", "", safe_deparse(formula))))), ")")
     
+    # Remove all the random effects from the formula
     for (pattern in patterns) {
       fixed_effects_only <- gsub(pattern, "", 
                        paste0(trimws(safe_deparse(formula(gsub("^expr ", "", safe_deparse(formula))))), collapse = " "), 
@@ -193,6 +194,24 @@ fit.model <- function(
   formula <- extract_special_predictor_out[[1]]
   ordereds <- extract_special_predictor_out[[2]]
   
+  # Extract strata
+  extract_special_predictor_out <- extract_special_predictor(formula, 'strata')
+  formula <- extract_special_predictor_out[[1]]
+  strata <- extract_special_predictor_out[[2]]
+  
+  if (length(strata) > 1) {
+    stop("Only one strata allowed. Please change the formula.")
+  }
+  
+  if (length(strata) > 0 & !is.null(random_effects_formula)) {
+    stop("Strata and random effects cannot be combined. Please only use random effects if you have multiple grouping categories.")
+  }
+  
+  if (length(strata) > 0 & model == 'LM') {
+    formula <- formula(paste0(safe_deparse(formula), ' + (1 | ', strata, ')'))
+    random_effects_formula <- formula
+  }
+  
   #############################################################
   # Determine the function and summary for the model selected #
   #############################################################
@@ -278,43 +297,132 @@ fit.model <- function(
   ##################
 
   if (model == "logistic") {
-    if (is.null(random_effects_formula)) { # Fixed effects only
-      if (augment) {
-        model_function <- function(formula, mm, weight_scheme, na.action) {
-          assign("weight_sch_current", weight_scheme, envir = environment(formula))
+    if (is.null(random_effects_formula)) {
+      if (length(strata) > 0) {
+        if (augment) {
+          model_function <-
+            function(formula, mm, weight_scheme, na.action) {
+              formula <- formula(paste0(safe_deparse(formula), ' + strata(', strata, ')'))
+              
+              assign("weight_sch_current", weight_scheme, envir = environment(formula))
+              
+              clogit_out <- tryCatch({
+                fit1 <- survival::clogit(
+                  formula(formula), 
+                  data = mm, 
+                  method = "breslow", 
+                  control = coxph.control(iter.max = 1000),
+                  na.action = na.action,
+                  weights = weight_sch_current,
+                  robust = F) # Robust SE seem to be worse with weighting...
+              }, warning = function(w) {
+                'warning'
+              }, error = function(e) {
+                'error'
+              })
+              
+              if (is.character(clogit_out)) {
+                fit1 <- survival::clogit(
+                  formula(formula), 
+                  data = mm, 
+                  method = "breslow", 
+                  control = coxph.control(iter.max = 1000),
+                  na.action = na.action,
+                  weights = weight_sch_current,
+                  robust = F) # Robust SE seem to be worse with weighting...
+                return(fit1)
+              } else  {
+                return(clogit_out)
+              }
+            }
+        } else {
+          model_function <-
+            function(formula, data, na.action) {
+              clogit_out <- tryCatch({
+                fit1 <- survival::clogit(
+                  formula(formula), 
+                  data = data, 
+                  method = "breslow", 
+                  control = coxph.control(iter.max = 1000),
+                  na.action = na.action,
+                  robust = F) # Robust SE seem to be worse with weighting...)
+              }, warning = function(w) {
+                'warning'
+              }, error = function(e) {
+                'error'
+              })
+              
+              if (is.character(clogit_out)) {
+                fit1 <- survival::clogit(
+                  formula(formula), 
+                  data = data, 
+                  method = "breslow", 
+                  control = coxph.control(iter.max = 1000),
+                  na.action = na.action,
+                  robust = F) # Robust SE seem to be worse with weighting...)
+                return(fit1)
+              } else  {
+                return(clogit_out)
+              }
+            }
+        }
+        summary_function <- function(fit, names_to_include) {
+          lm_summary <- coef(summary(fit))
+          
+          store_names <- gsub('`', '', rownames(lm_summary))
+          if (!all(names_to_include %in% store_names)) { # If deficient rank, make sure all rownames are included
+            rows_to_add = names_to_include[!(names_to_include %in% store_names)]
+            lm_summary <- rbind(lm_summary, matrix(rep(NaN, ncol(lm_summary) * length(rows_to_add)), nrow=length(rows_to_add)))
+            rownames(lm_summary) <- c(store_names, rows_to_add)
+          }
           
-          glm_out <- glm(
-            formula = formula(formula),
-            family = 'binomial',
-            data = mm,
-            weights = weight_sch_current,
-            na.action = na.action,
-          )
+          if ('robust se' %in% colnames(lm_summary)) {
+            para <- as.data.frame(lm_summary)[,-c(2,4,5)] # Don't actually use robust SE
+          } else {
+            para <- as.data.frame(lm_summary)[,-c(2,4)]
+          }
           
-          return(glm_out)
+          para$name <- rownames(lm_summary)
+          return(para)
         }
-      } else {
-        model_function <-
-          function(formula, data, na.action) {
-            return(glm(
-              formula(formula),
-              data = data,
+      } else { # Fixed effects only
+        if (augment) {
+          model_function <- function(formula, mm, weight_scheme, na.action) {
+            assign("weight_sch_current", weight_scheme, envir = environment(formula))
+            
+            glm_out <- glm(
+              formula = formula(formula),
               family = 'binomial',
+              data = mm,
+              weights = weight_sch_current,
               na.action = na.action,
-            ))
+            )
+            
+            return(glm_out)
           }
-      }
-      summary_function <- function(fit, names_to_include) {
-        lm_summary <- summary(fit)$coefficients
-        store_names <- gsub('`', '', rownames(lm_summary))
-        if (!all(names_to_include %in% store_names)) { # If deficient rank, make sure all rownames are included
-          rows_to_add = names_to_include[!(names_to_include %in% store_names)]
-          lm_summary <- rbind(lm_summary, matrix(rep(NaN, 4 * length(rows_to_add)), nrow=length(rows_to_add)))
-          rownames(lm_summary) <- c(store_names, rows_to_add)
+        } else {
+          model_function <-
+            function(formula, data, na.action) {
+              return(glm(
+                formula(formula),
+                data = data,
+                family = 'binomial',
+                na.action = na.action,
+              ))
+            }
+        }
+        summary_function <- function(fit, names_to_include) {
+          lm_summary <- summary(fit)$coefficients
+          store_names <- gsub('`', '', rownames(lm_summary))
+          if (!all(names_to_include %in% store_names)) { # If deficient rank, make sure all rownames are included
+            rows_to_add = names_to_include[!(names_to_include %in% store_names)]
+            lm_summary <- rbind(lm_summary, matrix(rep(NaN, 4 * length(rows_to_add)), nrow=length(rows_to_add)))
+            rownames(lm_summary) <- c(store_names, rows_to_add)
+          }
+          para <- as.data.frame(lm_summary)[-1, -3]
+          para$name <- rownames(lm_summary)[-1]
+          return(para)
         }
-        para <- as.data.frame(lm_summary)[-1, -3]
-        para$name <- rownames(lm_summary)[-1]
-        return(para)
       }
     } else { # Random effects
       ranef_function <- lme4::ranef
@@ -322,7 +430,7 @@ fit.model <- function(
         model_function <-
           function(formula, mm, weight_scheme, na.action) {
             assign("weight_sch_current", weight_scheme, envir = environment(formula))
-
+            
             index <- 1
             
             while (index < length(optimizers)) {
@@ -335,7 +443,7 @@ fit.model <- function(
                     na.action = na.action,
                     weights = weight_sch_current,
                     control = lme4::glmerControl(optimizer = optimizers[index],
-                                           optCtrl = optCtrlList[[index]]))
+                                                 optCtrl = optCtrlList[[index]]))
                 }, warning=function(w) {
                   if (w$message == "non-integer #successes in a binomial glm!") {
                     # Still worked
@@ -358,14 +466,14 @@ fit.model <- function(
             
             if (is.character(glm_out)) {
               withCallingHandlers({ # Catch non-integer # successes first
-                  fit1 <- lme4::glmer(
-                    formula(formula), 
-                    data = mm, 
-                    family = 'binomial',
-                    na.action = na.action,
-                    weights = weight_sch_current,
-                    control = lme4::glmerControl(optimizer = optimizers[index],
-                                           optCtrl = optCtrlList[[index]]))
+                fit1 <- lme4::glmer(
+                  formula(formula), 
+                  data = mm, 
+                  family = 'binomial',
+                  na.action = na.action,
+                  weights = weight_sch_current,
+                  control = lme4::glmerControl(optimizer = optimizers[index],
+                                               optCtrl = optCtrlList[[index]]))
               }, warning=function(w) {
                 if (w$message == "non-integer #successes in a binomial glm!") {
                   # Still worked
@@ -390,7 +498,7 @@ fit.model <- function(
                   family = 'binomial',
                   na.action = na.action,
                   control = lme4::glmerControl(optimizer = optimizers[index],
-                                         optCtrl = optCtrlList[[index]]))
+                                               optCtrl = optCtrlList[[index]]))
               }, warning = function(w) {
                 'warning'
               }, error = function(e) {
@@ -412,7 +520,7 @@ fit.model <- function(
                 family = 'binomial',
                 na.action = na.action,
                 control = lme4::glmerControl(optimizer = optimizers[index],
-                                       optCtrl = optCtrlList[[index]])))
+                                             optCtrl = optCtrlList[[index]])))
             } else  {
               return(glm_out)
             }
@@ -457,7 +565,8 @@ fit.model <- function(
       'lmerTest',
       'parallel',
       'lme4',
-      'multcomp'
+      'multcomp',
+      'survival'
     )) {
       suppressPackageStartupMessages(require(lib, character.only = TRUE))
     }
diff --git a/R/maaslin3.R b/R/maaslin3.R
index 4009a8d..4ddf2ff 100755
--- a/R/maaslin3.R
+++ b/R/maaslin3.R
@@ -82,6 +82,7 @@ args$formula <- NULL
 args$random_effects <- NULL
 args$group_effects <- NULL
 args$ordered_effects <- NULL
+args$strata_effects <- NULL
 args$fixed_effects <- NULL
 args$feature_specific_covariate <- NULL
 args$feature_specific_covariate_name <- NULL
@@ -188,6 +189,18 @@ options <-
                  "[ Default: none ]"
     )
   )
+options <-
+  optparse::add_option(
+    options,
+    c("--strata_effects"),
+    type = "character",
+    dest = "strata_effects",
+    default = args$ordered_effects,
+    help = paste("The strata effects for the model.",
+                 "Only one is allowed.",
+                 "[ Default: none ]"
+    )
+  )
 options <-
   optparse::add_option(
     options,
@@ -510,6 +523,7 @@ maaslin_parse_param_list <- function(param_list) {
                         fixed_effects = args$fixed_effects,
                         group_effects = args$group_effects,
                         ordered_effects = args$ordered_effects,
+                        strata_effects = args$strata_effects,
                         feature_specific_covariate = args$feature_specific_covariate,
                         feature_specific_covariate_name = args$feature_specific_covariate_name,
                         feature_specific_covariate_record = args$feature_specific_covariate_record,
@@ -684,6 +698,7 @@ maaslin_log_arguments <- function(param_list) {
   logging::logdebug("Fixed effects: %s", param_list[["fixed_effects"]])
   logging::logdebug("Group effects: %s", param_list[["group_effects"]])
   logging::logdebug("Ordered effects: %s", param_list[["ordered_effects"]])
+  logging::logdebug("Strata effects: %s", param_list[["strata_effects"]])
   logging::logdebug("Formula: %s", param_list[["formula"]])
   logging::logdebug("Correction method: %s", param_list[["correction"]])
   logging::logdebug("Standardize: %s", param_list[["standardize"]])
@@ -1079,6 +1094,7 @@ maaslin_compute_formula <- function(params_and_data) {
   fixed_effects <- param_list[["fixed_effects"]]
   group_effects <- param_list[["group_effects"]]
   ordered_effects <- param_list[["ordered_effects"]]
+  strata_effects <- param_list[["strata_effects"]]
   random_effects <- param_list[["random_effects"]]
   feature_specific_covariate_name <- param_list[['feature_specific_covariate_name']]
   
@@ -1086,10 +1102,11 @@ maaslin_compute_formula <- function(params_and_data) {
     if (!is.null(param_list[["fixed_effects"]]) | 
         !is.null(param_list[["random_effects"]]) |
         !is.null(param_list[["group_effects"]]) | 
-        !is.null(param_list[["ordered_effects"]])) {
+        !is.null(param_list[["ordered_effects"]]) |
+        !is.null(param_list[["strata_effects"]])) {
       logging::logwarn(
-        paste("fixed_effects, random_effects, group_effects, or ordered_effects provided in addition to formula,", 
-              "using just the fixed_effects, random_effects, group_effects, and ordered_effects"))
+        paste("fixed_effects, random_effects, group_effects, ordered_effects, or strata_effects provided in addition to formula,", 
+              "using just the fixed_effects, random_effects, group_effects, ordered_effects, and strata_effects"))
     } else {
       logging::logwarn(
         paste("maaslin_compute_formula called even though a formula is provided",
@@ -1202,7 +1219,7 @@ maaslin_compute_formula <- function(params_and_data) {
   }
   
   # reduce metadata to only include fixed/group/random effects in formula
-  effects_names <- unique(c(fixed_effects, random_effects, group_effects, ordered_effects))
+  effects_names <- unique(c(fixed_effects, random_effects, group_effects, ordered_effects, strata_effects))
   metadata <- metadata[, effects_names, drop = FALSE]
   
   # create the fixed effects formula text
@@ -1213,6 +1230,9 @@ maaslin_compute_formula <- function(params_and_data) {
   if (length(ordered_effects) > 0) {
     formula_effects <- union(formula_effects, paste0("ordered(", ordered_effects, ")"))
   }
+  if (length(strata_effects) > 0) {
+    formula_effects <- union(formula_effects, paste0("strata(", strata_effects, ")"))
+  }
   if (!is.null(feature_specific_covariate_name)) {
     formula_effects <- union(formula_effects, feature_specific_covariate_name)
   }
@@ -1274,9 +1294,10 @@ maaslin_check_formula <- function(params_and_data) {
   if (!is.null(param_list[["fixed_effects"]]) | 
       !is.null(param_list[["random_effects"]]) | 
       !is.null(param_list[["group_effects"]]) | 
-      !is.null(param_list[["ordered_effects"]])) {
+      !is.null(param_list[["ordered_effects"]]) | 
+      !is.null(param_list[["strata_effects"]])) {
     logging::logwarn(
-      paste("fixed_effects, random_effects, group_effects, or ordered_effects provided in addition to formula,", 
+      paste("fixed_effects, random_effects, group_effects, ordered_effects, or strata_effects provided in addition to formula,", 
             "using only formula"))
   }
   
@@ -1319,7 +1340,7 @@ maaslin_check_formula <- function(params_and_data) {
   
   term_labels <- attr(terms(formula), "term.labels")
   
-  if (sum(!grepl("\\|", term_labels)) == 0 & is.null(feature_specific_covariate_name)) {
+  if (sum(!grepl("strata\\(|\\|", term_labels)) == 0 & is.null(feature_specific_covariate_name)) {
     logging::logerror("No fixed, group, or ordered effects included in formula.")
     stop()
   }
@@ -2056,6 +2077,7 @@ if (identical(environment(), globalenv()) &&
       fixed_effects = current_args$fixed_effects,
       group_effects = current_args$group_effects,
       ordered_effects = current_args$ordered_effects,
+      strata_effects = current_args$strata_effects,
       feature_specific_covariate = current_args$feature_specific_covariate,
       feature_specific_covariate_name = current_args$feature_specific_covariate_name,
       feature_specific_covariate_record = current_args$feature_specific_covariate_record,
diff --git a/R/viz.R b/R/viz.R
index d4e32d9..caa348f 100644
--- a/R/viz.R
+++ b/R/viz.R
@@ -85,6 +85,7 @@ maaslin3_summary_plot <-
     merged_results <- merged_results[is.na(merged_results$error) & 
                                        !is.na(merged_results$qval_individual) &
                                        !is.na(merged_results$coef),]
+    
     if (nrow(merged_results) == 0) {
       logging::loginfo(
         paste("No associtions were without errors. No summary plot generated."))
@@ -273,6 +274,9 @@ maaslin3_summary_plot <-
 
     # Bin coefficients into categories
     coefficient_thresh <- round(max(abs(quantile(merged_results_sig$coef, c(0.1, 0.9)))) / 10, 1) * 5
+    if (coefficient_thresh == 0) {
+      coefficient_thresh <- 0.5
+    }
     coef_breaks <- c(-coefficient_thresh, -coefficient_thresh / 2, 0, coefficient_thresh / 2, coefficient_thresh, Inf)
     threshold_set <- c(paste0("(-Inf,", -1 * coefficient_thresh, "]"),
                        paste0("(", -1 * coefficient_thresh, ",", -1/2 * coefficient_thresh,"]"),
@@ -560,7 +564,7 @@ maaslin3_association_plots <-
               ggplot2::scale_y_continuous(expand = ggplot2::expansion(mult = c(0, 0.2)))
             
             temp_plot <- temp_plot + 
-              nature_theme(.data$metadata, 
+              nature_theme(as.character(joined_features_metadata_abun$metadata), 
                            paste0(feature_name, '\n(Normalization: ', normalization, ', Transformation: ', transformation, ')')) +
               ggplot2::theme(
                 panel.grid.major = ggplot2::element_blank(),
diff --git a/man/maaslin3.Rd b/man/maaslin3.Rd
index 55beace..be5024c 100644
--- a/man/maaslin3.Rd
+++ b/man/maaslin3.Rd
@@ -25,12 +25,14 @@ maaslin3(param_list)
     }
     A model should also be specified using one of the options below. If no formula is specified, at least one of  \code{fixed_effects}, \code{group_effects}, or \code{ordered_effects} should be specified.
     \itemize{
-      \item \code{formula}: A formula in \code{lme4} format. Random effects, interactions, and functions of the metadata can be included (note that these functions will be applied after standardization if \code{standardize=TRUE}). Group and ordered variables can be specified as: \code{group(grouping_variable)} and \code{ordered(ordered_variable)}. The other variable options below will not be considered if a formula is set.
+      \item \code{formula}: A formula in \code{lme4} format. Random effects, interactions, and functions of the metadata can be included (note that these functions will be applied after standardization if \code{standardize=TRUE}). Group, ordered, and strata variables can be specified as: \code{group(grouping_variable)}, \code{ordered(ordered_variable)} and \code{strata(strata_variable)}. The other variable options below will not be considered if a formula is set.
       \item \code{fixed_effects}: A vector of variable names to be included as fixed effects.
       \item \code{reference}: For a variable with more than two levels supplied with \code{fixed_effects}, the factor to use as a reference provided as a string of 'variable,reference' semi-colon delimited for multiple variables.
       \item \code{random_effects}: A vector of variable names to be included as random intercepts.
       \item \code{group_effects}: A factored categorical variable to be included for group testing. An ANOVA-style test will be performed to assess whether any of the variable's levels are significant, and no coefficients or individual p-values will be returned.
       \item \code{ordered_effects}: A factored categorical variable to be included. Consecutive levels will be tested for significance against each other, and the resulting associations will correspond to effect sizes, standard errors, and significances of each level versus the previous.
+      \item \code{strata_effects}: A vector with one variable name to be included as the strata variable in case-control studies. Strata cannot be combined with random effects.
+
     }
     Particularly for use in metatranscriptomics workflows, a table of feature-specific covariates can be included. A feature's covariates will be included like metadata when fitting the model for that feature.
     \itemize{
diff --git a/man/maaslin_check_arguments.Rd b/man/maaslin_check_arguments.Rd
index 3529bbf..5c42d3b 100644
--- a/man/maaslin_check_arguments.Rd
+++ b/man/maaslin_check_arguments.Rd
@@ -19,12 +19,13 @@ maaslin_check_arguments(param_list)
     }
     A model should also be specified using one of the options below. If no formula is specified, at least one of  \code{fixed_effects}, \code{group_effects}, or \code{ordered_effects} should be specified.
     \itemize{
-      \item \code{formula}: A formula in \code{lme4} format. Random effects, interactions, and functions of the metadata can be included (note that these functions will be applied after standardization if \code{standardize=TRUE}). Group and ordered variables can be specified as: \code{group(grouping_variable)} and \code{ordered(ordered_variable)}. The other variable options below will not be considered if a formula is set.
+      \item \code{formula}: A formula in \code{lme4} format. Random effects, interactions, and functions of the metadata can be included (note that these functions will be applied after standardization if \code{standardize=TRUE}). Group, ordered, and strata variables can be specified as: \code{group(grouping_variable)}, \code{ordered(ordered_variable)} and \code{strata(strata_variable)}. The other variable options below will not be considered if a formula is set.
       \item \code{fixed_effects}: A vector of variable names to be included as fixed effects.
       \item \code{reference}: For a variable with more than two levels supplied with \code{fixed_effects}, the factor to use as a reference provided as a string of 'variable,reference' semi-colon delimited for multiple variables.
       \item \code{random_effects}: A vector of variable names to be included as random intercepts.
       \item \code{group_effects}: A factored categorical variable to be included for group testing. An ANOVA-style test will be performed to assess whether any of the variable's levels are significant, and no coefficients or individual p-values will be returned.
       \item \code{ordered_effects}: A factored categorical variable to be included. Consecutive levels will be tested for significance against each other, and the resulting associations will correspond to effect sizes, standard errors, and significances of each level versus the previous.
+      \item \code{strata_effects}: A vector with one variable name to be included as the strata variable in case-control studies. Strata cannot be combined with random effects.
     }
     Particularly for use in metatranscriptomics workflows, a table of feature-specific covariates can be included. A feature's covariates will be included like metadata when fitting the model for that feature.
     \itemize{
diff --git a/man/maaslin_compute_formula.Rd b/man/maaslin_compute_formula.Rd
index 9ec31bc..1afcc1e 100644
--- a/man/maaslin_compute_formula.Rd
+++ b/man/maaslin_compute_formula.Rd
@@ -4,7 +4,7 @@
 Compute a formula for a MaAsLin 3 run based on the specified effects.
 }
 \description{
-Compute a formula using variables provided through \code{fixed_effects}, \code{random_effects}, \code{group_effects}, and \code{ordered_effects}. Only one of \code{maaslin_compute_formula} or \code{maaslin_check_formula} should be used.
+Compute a formula using variables provided through \code{fixed_effects}, \code{random_effects}, \code{group_effects}, \code{ordered_effects}, and \code{strata_effects}. Only one of \code{maaslin_compute_formula} or \code{maaslin_check_formula} should be used.
 }
 \usage{
 maaslin_compute_formula(params_and_data)
@@ -13,7 +13,7 @@ maaslin_compute_formula(params_and_data)
     \item{params_and_data}{
     The results of \code{maaslin_reorder_data} or a list containing the following named items:
     \describe{
-      \item{(1)}{\code{param_list}: The parameter list with \code{unscaled_abundance} trimmed of samples with missing abundances or metadata and \code{fixed_effects}, \code{random_effects}, \code{group_effects}, or \code{ordered_effects} specified.}
+      \item{(1)}{\code{param_list}: The parameter list with \code{unscaled_abundance} trimmed of samples with missing abundances or metadata and \code{fixed_effects}, \code{random_effects}, \code{group_effects}, \code{ordered_effects}, or \code{strata_effects} specified.}
       \item{(2)}{\code{data}: A data frame of feature abundances trimmed of samples with missing abundances or metadata.}
       \item{(3)}{\code{metadata}: A data frame of metadata trimmed of samples with missing abundances or metadata.}
     }
diff --git a/man/maaslin_log_arguments.Rd b/man/maaslin_log_arguments.Rd
index 62a3190..d3b397c 100644
--- a/man/maaslin_log_arguments.Rd
+++ b/man/maaslin_log_arguments.Rd
@@ -19,12 +19,13 @@ maaslin_log_arguments(param_list)
     }
     A model should also be specified using one of the options below. If no formula is specified, at least one of  \code{fixed_effects}, \code{group_effects}, or \code{ordered_effects} should be specified.
     \itemize{
-      \item \code{formula}: A formula in \code{lme4} format. Random effects, interactions, and functions of the metadata can be included (note that these functions will be applied after standardization if \code{standardize=TRUE}). Group and ordered variables can be specified as: \code{group(grouping_variable)} and \code{ordered(ordered_variable)}. The other variable options below will not be considered if a formula is set.
+      \item \code{formula}: A formula in \code{lme4} format. Random effects, interactions, and functions of the metadata can be included (note that these functions will be applied after standardization if \code{standardize=TRUE}). Group, ordered, and strata variables can be specified as: \code{group(grouping_variable)}, \code{ordered(ordered_variable)} and \code{strata(strata_variable)}. The other variable options below will not be considered if a formula is set.
       \item \code{fixed_effects}: A vector of variable names to be included as fixed effects.
       \item \code{reference}: For a variable with more than two levels supplied with \code{fixed_effects}, the factor to use as a reference provided as a string of 'variable,reference' semi-colon delimited for multiple variables.
       \item \code{random_effects}: A vector of variable names to be included as random intercepts.
       \item \code{group_effects}: A factored categorical variable to be included for group testing. An ANOVA-style test will be performed to assess whether any of the variable's levels are significant, and no coefficients or individual p-values will be returned.
       \item \code{ordered_effects}: A factored categorical variable to be included. Consecutive levels will be tested for significance against each other, and the resulting associations will correspond to effect sizes, standard errors, and significances of each level versus the previous.
+      \item \code{strata_effects}: A vector with one variable name to be included as the strata variable in case-control studies. Strata cannot be combined with random effects.
     }
     Particularly for use in metatranscriptomics workflows, a table of feature-specific covariates can be included. A feature's covariates will be included like metadata when fitting the model for that feature.
     \itemize{
diff --git a/man/maaslin_parse_param_list.Rd b/man/maaslin_parse_param_list.Rd
index 6528112..42e5052 100644
--- a/man/maaslin_parse_param_list.Rd
+++ b/man/maaslin_parse_param_list.Rd
@@ -19,12 +19,13 @@ maaslin_parse_param_list(param_list)
     }
     A model should also be specified using one of the options below. If no formula is specified, at least one of  \code{fixed_effects}, \code{group_effects}, or \code{ordered_effects} should be specified.
     \itemize{
-      \item \code{formula}: A formula in \code{lme4} format. Random effects, interactions, and functions of the metadata can be included (note that these functions will be applied after standardization if \code{standardize=TRUE}). Group and ordered variables can be specified as: \code{group(grouping_variable)} and \code{ordered(ordered_variable)}. The other variable options below will not be considered if a formula is set.
+      \item \code{formula}: A formula in \code{lme4} format. Random effects, interactions, and functions of the metadata can be included (note that these functions will be applied after standardization if \code{standardize=TRUE}). Group, ordered, and strata variables can be specified as: \code{group(grouping_variable)}, \code{ordered(ordered_variable)} and \code{strata(strata_variable)}. The other variable options below will not be considered if a formula is set.
       \item \code{fixed_effects}: A vector of variable names to be included as fixed effects.
       \item \code{reference}: For a variable with more than two levels supplied with \code{fixed_effects}, the factor to use as a reference provided as a string of 'variable,reference' semi-colon delimited for multiple variables.
       \item \code{random_effects}: A vector of variable names to be included as random intercepts.
       \item \code{group_effects}: A factored categorical variable to be included for group testing. An ANOVA-style test will be performed to assess whether any of the variable's levels are significant, and no coefficients or individual p-values will be returned.
       \item \code{ordered_effects}: A factored categorical variable to be included. Consecutive levels will be tested for significance against each other, and the resulting associations will correspond to effect sizes, standard errors, and significances of each level versus the previous.
+      \item \code{strata_effects}: A vector with one variable name to be included as the strata variable in case-control studies. Strata cannot be combined with random effects.
     }
     Particularly for use in metatranscriptomics workflows, a table of feature-specific covariates can be included. A feature's covariates will be included like metadata when fitting the model for that feature.
     \itemize{
diff --git a/man/maaslin_read_data.Rd b/man/maaslin_read_data.Rd
index 6148a68..d2a9b86 100644
--- a/man/maaslin_read_data.Rd
+++ b/man/maaslin_read_data.Rd
@@ -19,12 +19,13 @@ maaslin_read_data(param_list)
     }
     A model should also be specified using one of the options below. If no formula is specified, at least one of  \code{fixed_effects}, \code{group_effects}, or \code{ordered_effects} should be specified.
     \itemize{
-      \item \code{formula}: A formula in \code{lme4} format. Random effects, interactions, and functions of the metadata can be included (note that these functions will be applied after standardization if \code{standardize=TRUE}). Group and ordered variables can be specified as: \code{group(grouping_variable)} and \code{ordered(ordered_variable)}. The other variable options below will not be considered if a formula is set.
+      \item \code{formula}: A formula in \code{lme4} format. Random effects, interactions, and functions of the metadata can be included (note that these functions will be applied after standardization if \code{standardize=TRUE}). Group, ordered, and strata variables can be specified as: \code{group(grouping_variable)}, \code{ordered(ordered_variable)} and \code{strata(strata_variable)}. The other variable options below will not be considered if a formula is set.
       \item \code{fixed_effects}: A vector of variable names to be included as fixed effects.
       \item \code{reference}: For a variable with more than two levels supplied with \code{fixed_effects}, the factor to use as a reference provided as a string of 'variable,reference' semi-colon delimited for multiple variables.
       \item \code{random_effects}: A vector of variable names to be included as random intercepts.
       \item \code{group_effects}: A factored categorical variable to be included for group testing. An ANOVA-style test will be performed to assess whether any of the variable's levels are significant, and no coefficients or individual p-values will be returned.
       \item \code{ordered_effects}: A factored categorical variable to be included. Consecutive levels will be tested for significance against each other, and the resulting associations will correspond to effect sizes, standard errors, and significances of each level versus the previous.
+      \item \code{strata_effects}: A vector with one variable name to be included as the strata variable in case-control studies. Strata cannot be combined with random effects.
     }
     Particularly for use in metatranscriptomics workflows, a table of feature-specific covariates can be included. A feature's covariates will be included like metadata when fitting the model for that feature.
     \itemize{
diff --git a/vignettes/maaslin3_manual.Rmd b/vignettes/maaslin3_manual.Rmd
index 7f5febe..1f9896a 100644
--- a/vignettes/maaslin3_manual.Rmd
+++ b/vignettes/maaslin3_manual.Rmd
@@ -103,7 +103,7 @@ The data file can contain samples not included in the metadata file (along with
 To run MaAsLin 3, it is also necessary to specify a model. The model can come from a formula or vectors of terms. In either case, variable names should not have spaces or unusual characters.
 
 * *Formula*: The `formula` parameter should be set to any formula that satisfies the `lme4` specifications: fixed effects, random effects, interaction terms, polynomial terms, and more can all be included. If categorical variables are included as fixed effects, each level will be tested against the first factor level. In addition, ordered predictors and group predictors can be included by including `group(variable_name)` and `ordered(variable_name)` in the formula. Ordered and group predictors should stand alone in the formula (i.e., no group predictors in random effects).
-* *Vectors*: Alternatively, a vector of variable names can be supplied to the parameters `fixed_effects`, `random_effects`, `group_effects`, and `ordered_effects`. Categorical variables should either be ordered as factors beforehand, or `reference` should be provided as a string of 'variable,reference' semi-colon delimited for multiple variables (e.g., `variable_1,reference_1;variable_2,reference_2`). NOTE: adding a space between the variable and level might result in the wrong reference level being used.
+* *Vectors*: Alternatively, a vector of variable names can be supplied to the parameters `fixed_effects`, `random_effects`, `group_effects`, `ordered_effects`, and `strata_effects`. Categorical variables should either be ordered as factors beforehand, or `reference` should be provided as a string of 'variable,reference' semi-colon delimited for multiple variables (e.g., `variable_1,reference_1;variable_2,reference_2`). NOTE: adding a space between the variable and level might result in the wrong reference level being used.
 
 **Because MaAsLin 3 identifies prevalence (presence/absence) associations, sample read depth (number of reads) should be included as a covariate if available. Deeper sequencing will likely increase feature detection in a way that could spuriously correlate with metadata of interest when read depth is not included in the model.**
 
@@ -337,16 +337,17 @@ When running MaAsLin 3 in R, the manual page for each function (e.g., `?maaslin3
 
 #### Model formula ####
 
-* `formula`: A formula in `lme4` format. Random effects, interactions, and functions of the metadata can be included (note that these functions will be applied after standardization if `standardize = TRUE`). Group and ordered variables can be specified as: `group(grouping_variable)` and `ordered(ordered_variable)`. The other variable options below will not be considered if a formula is set.
+* `formula`: A formula in `lme4` format. Random effects, interactions, and functions of the metadata can be included (note that these functions will be applied after standardization if `standardize = TRUE`). Group, ordered, and strata variables can be specified as: `group(grouping_variable)`, `ordered(ordered_variable)`, and `strata(strata_variable)`. The other variable options below will not be considered if a formula is set.
 * `fixed_effects`: A vector of variable names to be included as fixed effects.
   * Fixed effects models are fit with `lm` (linear) or `glm` (logistic).
 * `reference`: For a variable with more than two levels supplied with `fixed_effects`, the factor to use as a reference provided as a string of 'variable,reference' semi-colon delimited for multiple variables.
-* `random_effects`: A vector of variable names to be included as random intercepts. **Random intercept models may produce poor model fits when there are fewer than 5 observations per group.** In these scenarios, per-group fixed effects should be used and subsequently filtered out.
+* `random_effects`: A vector of variable names to be included as random intercepts. **Random intercept models may produce poor model fits when there are fewer than 5 observations per group.** In these scenarios, per-group fixed effects should be used and subsequently filtered out. See `strata_effects` as well.
   * Random effects models are fit with `lmer` (linear) and `glmer` (logistic), and the significance tests come from `lmerTest` and `glmer` respectively.
 * `group_effects`: A factored categorical variable to be included for group testing. An ANOVA-style test will be performed to assess whether any of the variable's levels are significant, and no coefficients or individual p-values will be returned.
   * Tests are performed with the `anova` function's `LRT` option (logistic fixed and mixed effects), the `anova` function's F test (linear fixed effects), or `lmerTest::contest` (linear mixed effects).
 * `ordered_effects`: A factored categorical variable to be included. Consecutive levels will be tested for significance against each other with contrast tests, and the resulting associations will correspond to effect sizes, standard errors, and significances of each level versus the previous.
   * Contrast tests are performed with `multcomp::glht` (fixed effects and logistic mixed effects) and `lmerTest::contest` (linear mixed effects).
+* `strata_effects`: A single grouping variable to be included in matched case-control studies. If a strata variable is included, no random effects can be included. When a strata variable is included, a conditional logistic regression will be run to account for the strata. The abundance model will be run with a random intercept in place of the strata. Strata can include more than two observations per group. Only variables that differ within the groups can be tested.
 
 #### Feature specific covariates ####
 Particularly for use in metatranscriptomics workflows, a table of feature-specific covariates can be included. A feature's covariates will be included like a fixed effect metadatum when fitting the model for that feature. The covariate's name does not need to be included in the formula.