From 4c15ad248dd65ceb88239d76f6849ce6ceee119c Mon Sep 17 00:00:00 2001 From: ljmciver Date: Thu, 29 Jul 2021 17:18:36 -0600 Subject: [PATCH 01/54] Update actions main to include dependency to help with failing builds --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c73dff8..c481a74 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -26,7 +26,7 @@ jobs: run: R -q -e "BiocManager::install('BiocCheck')" - name: Install build and test dependencies from CRAN - run: R -q -e "install.packages(c('knitr','glmmTMB'), repos='http://cran.r-project.org')" + run: R -q -e "install.packages(c('knitr','glmmTMB','rmarkdown'), repos='http://cran.r-project.org')" - name: Build run: cd $GITHUB_WORKSPACE && R CMD build . From 069fcd628610618b7395f124f20ee86f37a192fd Mon Sep 17 00:00:00 2001 From: ljmciver Date: Thu, 29 Jul 2021 17:29:42 -0600 Subject: [PATCH 02/54] Add another dependency to main actions for build and test --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c481a74..e9edfb0 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -26,7 +26,7 @@ jobs: run: R -q -e "BiocManager::install('BiocCheck')" - name: Install build and test dependencies from CRAN - run: R -q -e "install.packages(c('knitr','glmmTMB','rmarkdown'), repos='http://cran.r-project.org')" + run: R -q -e "install.packages(c('knitr','glmmTMB','rmarkdown','markdown'), repos='http://cran.r-project.org')" - name: Build run: cd $GITHUB_WORKSPACE && R CMD build . From 01bfc2c0c2aa0d00485e2028b9cc35542ecd4076 Mon Sep 17 00:00:00 2001 From: ljmciver Date: Thu, 29 Jul 2021 18:38:25 -0600 Subject: [PATCH 03/54] Add markdown to suggests for build --- DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3726d7b..37c40c2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,7 +15,8 @@ Imports: robustbase, biglm, pcaPP, edgeR, metagenomeSeq, lpsymphony, pbapply, ca Suggests: knitr, testthat (>= 2.1.0), - rmarkdown + rmarkdown, + markdown VignetteBuilder: knitr Collate: fit.R utility_scripts.R viz.R Maaslin2.R URL: http://huttenhower.sph.harvard.edu/maaslin2 From c084db459071e8af3b37a29d01909371b6e0202b Mon Sep 17 00:00:00 2001 From: ljmciver Date: Fri, 30 Jul 2021 08:15:24 -0600 Subject: [PATCH 04/54] Add import to NAMESPACE for bioc checks --- NAMESPACE | 1 + 1 file changed, 1 insertion(+) diff --git a/NAMESPACE b/NAMESPACE index 6947bac..92b2f55 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,3 +11,4 @@ importFrom("stats", "as.formula", "coef", "glm", "lm", "na.exclude", importFrom("utils", "capture.output", "read.table", "write.table") importFrom("dplyr", "%>%") importFrom("stats", "sd", "na.omit") +importFrom("utils","type.convert") From af223b49900c5e6a70331ba14e2bd8399705e40b Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Wed, 15 Sep 2021 11:57:11 -0400 Subject: [PATCH 05/54] feat: add optparse argument for max number of pngs Previously max number of pngs could only be set by calling plotting function as a standalone. --- R/Maaslin2.R | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/R/Maaslin2.R b/R/Maaslin2.R index f81553a..7903297 100755 --- a/R/Maaslin2.R +++ b/R/Maaslin2.R @@ -273,6 +273,17 @@ options <- " associations [ Default: %default ]" ) ) +options <- + optparse::add_option( + options, + c("-g", "--max_pngs"), + type = "double", + dest = "max_pngs", + default = args$max_pngs, + help = paste("The maximum number of scatterplots for signficant", + " associations to save as png files [ Default: %default ]" + ) + ) options <- optparse::add_option( options, From 687219e6340e021e6f147d312f33772db4a46641 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Wed, 15 Sep 2021 13:15:52 -0400 Subject: [PATCH 06/54] feat: add argument for max pngs to save to maaslin2 call Maximum number of png files to save is now passed into maaslin2 main function from R or command line. - Default is 10 as before - Value is passed to plotting function which already had this parameter --- R/Maaslin2.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/R/Maaslin2.R b/R/Maaslin2.R index 7903297..77ff5c0 100755 --- a/R/Maaslin2.R +++ b/R/Maaslin2.R @@ -93,6 +93,7 @@ args$standardize <- TRUE args$plot_heatmap <- TRUE args$heatmap_first_n <- 50 args$plot_scatter <- TRUE +args$max_pngs <- 10 args$cores <- 1 args$reference <- NULL @@ -338,6 +339,7 @@ Maaslin2 <- cores = 1, plot_heatmap = TRUE, plot_scatter = TRUE, + max_pngs = 10, heatmap_first_n = 50, reference = NULL) { @@ -1025,7 +1027,8 @@ Maaslin2 <- filtered_data, significant_results_file, output, - figures_folder) + figures_folder, + max_pngs) } return(fit_data) @@ -1073,6 +1076,7 @@ if (identical(environment(), globalenv()) && current_args$cores, current_args$plot_heatmap, current_args$plot_scatter, + current_args$max_pngs, current_args$heatmap_first_n, current_args$reference ) From bee13f4ce3e1592d964a8622c6f522d54821fd74 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Wed, 15 Sep 2021 13:36:36 -0400 Subject: [PATCH 07/54] docs: update README to include max_pngs option --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index b64f982..1bef594 100644 --- a/README.md +++ b/README.md @@ -253,6 +253,10 @@ Options: -o PLOT_SCATTER, --plot_scatter=PLOT_SCATTER Generate scatter plots for the significant associations [ Default: TRUE ] + + -g MAX_PNGS, --max_pngs=MAX_PNGS + The maximum number of scatterplots for signficant associations + to save as png files [ Default: 10 ] -e CORES, --cores=CORES The number of R processes to run in parallel From a1569c4aa1c0fced4d78199f5dbc89dd50f78a69 Mon Sep 17 00:00:00 2001 From: ljmciver Date: Wed, 15 Sep 2021 16:26:09 -0600 Subject: [PATCH 08/54] Update action workflow to include pandoc to try to resolve error with version change from texlive --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index e9edfb0..a6a21fd 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -20,7 +20,7 @@ jobs: - uses: actions/checkout@v2 - name: Install build and test dependencies from dist - run: apt-get update -y && apt-get install -y libssl-dev libxml2-dev libcurl4-openssl-dev texlive + run: apt-get update -y && apt-get install -y libssl-dev libxml2-dev libcurl4-openssl-dev texlive pandoc - name: install build and test dependencies from bioconductor run: R -q -e "BiocManager::install('BiocCheck')" From b81c13a8ad5f42a1d09f97c4a6fa0a7d8ce62102 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Fri, 17 Sep 2021 09:29:53 -0400 Subject: [PATCH 09/54] docs: add max_pngs info to vignetteand man page Add new feature to set max_pngs output to documentation. Fix extra space in README text. --- README.md | 2 +- man/Maaslin2.Rd | 4 ++++ vignettes/maaslin2.Rmd | 4 ++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1bef594..1b0611b 100644 --- a/README.md +++ b/README.md @@ -255,7 +255,7 @@ Options: associations [ Default: TRUE ] -g MAX_PNGS, --max_pngs=MAX_PNGS - The maximum number of scatterplots for signficant associations + The maximum number of scatterplots for signficant associations to save as png files [ Default: 10 ] -e CORES, --cores=CORES diff --git a/man/Maaslin2.Rd b/man/Maaslin2.Rd index ee50bd9..b6f7e8a 100644 --- a/man/Maaslin2.Rd +++ b/man/Maaslin2.Rd @@ -87,6 +87,10 @@ Maaslin2( } \item{plot_scatter}{ Generate scatter plots for the significant associations. +} + \item{max_pngs}{ + Set the maximum number of scatterplots for signficant associations + to save as png files. } \item{cores}{ The number of R processes to run in parallel. diff --git a/vignettes/maaslin2.Rmd b/vignettes/maaslin2.Rmd index 80f0b7c..d05b1fa 100644 --- a/vignettes/maaslin2.Rmd +++ b/vignettes/maaslin2.Rmd @@ -258,6 +258,10 @@ Options: -o PLOT_SCATTER, --plot_scatter=PLOT_SCATTER Generate scatter plots for the significant associations [ Default: TRUE ] + + -g MAX_PNGS, --max_pngs=MAX_PNGS + The maximum number of scatterplots for signficant associations + to save as png files [ Default: 10 ] -e CORES, --cores=CORES The number of R processes to run in parallel From f505cfb87d472e2ab3608655ce7ec989b27723f0 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Fri, 17 Sep 2021 11:03:00 -0400 Subject: [PATCH 10/54] Add argument for maximum number of pngs to save (#6) * feat: add optparse argument for max number of pngs Previously max number of pngs could only be set by calling plotting function as a standalone. * feat: add argument for max pngs to save to maaslin2 call Maximum number of png files to save is now passed into maaslin2 main function from R or command line. - Default is 10 as before - Value is passed to plotting function which already had this parameter * docs: update README to include max_pngs option * docs: add max_pngs info to vignetteand man page Add new feature to set max_pngs output to documentation. Fix extra space in README text. --- R/Maaslin2.R | 17 ++++++++++++++++- README.md | 4 ++++ man/Maaslin2.Rd | 4 ++++ vignettes/maaslin2.Rmd | 4 ++++ 4 files changed, 28 insertions(+), 1 deletion(-) diff --git a/R/Maaslin2.R b/R/Maaslin2.R index f81553a..77ff5c0 100755 --- a/R/Maaslin2.R +++ b/R/Maaslin2.R @@ -93,6 +93,7 @@ args$standardize <- TRUE args$plot_heatmap <- TRUE args$heatmap_first_n <- 50 args$plot_scatter <- TRUE +args$max_pngs <- 10 args$cores <- 1 args$reference <- NULL @@ -273,6 +274,17 @@ options <- " associations [ Default: %default ]" ) ) +options <- + optparse::add_option( + options, + c("-g", "--max_pngs"), + type = "double", + dest = "max_pngs", + default = args$max_pngs, + help = paste("The maximum number of scatterplots for signficant", + " associations to save as png files [ Default: %default ]" + ) + ) options <- optparse::add_option( options, @@ -327,6 +339,7 @@ Maaslin2 <- cores = 1, plot_heatmap = TRUE, plot_scatter = TRUE, + max_pngs = 10, heatmap_first_n = 50, reference = NULL) { @@ -1014,7 +1027,8 @@ Maaslin2 <- filtered_data, significant_results_file, output, - figures_folder) + figures_folder, + max_pngs) } return(fit_data) @@ -1062,6 +1076,7 @@ if (identical(environment(), globalenv()) && current_args$cores, current_args$plot_heatmap, current_args$plot_scatter, + current_args$max_pngs, current_args$heatmap_first_n, current_args$reference ) diff --git a/README.md b/README.md index b64f982..1b0611b 100644 --- a/README.md +++ b/README.md @@ -253,6 +253,10 @@ Options: -o PLOT_SCATTER, --plot_scatter=PLOT_SCATTER Generate scatter plots for the significant associations [ Default: TRUE ] + + -g MAX_PNGS, --max_pngs=MAX_PNGS + The maximum number of scatterplots for signficant associations + to save as png files [ Default: 10 ] -e CORES, --cores=CORES The number of R processes to run in parallel diff --git a/man/Maaslin2.Rd b/man/Maaslin2.Rd index ee50bd9..b6f7e8a 100644 --- a/man/Maaslin2.Rd +++ b/man/Maaslin2.Rd @@ -87,6 +87,10 @@ Maaslin2( } \item{plot_scatter}{ Generate scatter plots for the significant associations. +} + \item{max_pngs}{ + Set the maximum number of scatterplots for signficant associations + to save as png files. } \item{cores}{ The number of R processes to run in parallel. diff --git a/vignettes/maaslin2.Rmd b/vignettes/maaslin2.Rmd index 80f0b7c..d05b1fa 100644 --- a/vignettes/maaslin2.Rmd +++ b/vignettes/maaslin2.Rmd @@ -258,6 +258,10 @@ Options: -o PLOT_SCATTER, --plot_scatter=PLOT_SCATTER Generate scatter plots for the significant associations [ Default: TRUE ] + + -g MAX_PNGS, --max_pngs=MAX_PNGS + The maximum number of scatterplots for signficant associations + to save as png files [ Default: 10 ] -e CORES, --cores=CORES The number of R processes to run in parallel From 8105443c39f9b4d6bec13113e470b3b899d70087 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Mon, 4 Oct 2021 14:15:53 -0400 Subject: [PATCH 11/54] fix: fix how reference level setting is handled Reference levels now: - Will respect factor ordering if passed in as a factor without reference level set (for use in R) - Can be set for factors with 2 levels to change ordering (doesn't change stats, but does change plots) - Will skip numeric data by checking type instead of grep for numbers (created conflicting behavior with strings that happened to have numbers in them) - Won't use undocumented "UNK" strings - Will check if metadata only has one non-NA value and give good error message --- R/Maaslin2.R | 58 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 21 deletions(-) diff --git a/R/Maaslin2.R b/R/Maaslin2.R index 77ff5c0..9b8bc70 100755 --- a/R/Maaslin2.R +++ b/R/Maaslin2.R @@ -736,32 +736,48 @@ Maaslin2 <- # Filter data based on min abundance and min prevalence # ######################################################### - # use ordered factor for variables with more than two levels - # find variables with more than two levels if (is.null(reference)) { reference <- "," } - - for ( i in colnames(metadata) ) { - mlevels <- unique(na.omit(metadata[,i])) - numeric_levels <- grep('^-?[0-9.]+[eE+-]?', mlevels, value = T) - if ( ( length(mlevels[! (mlevels %in% c("UNK"))]) > 2 ) && - ( i %in% fixed_effects ) && - ( length(numeric_levels) == 0)) { - split_reference <- unlist(strsplit(reference, "[,;]")) - if (! i %in% split_reference ) { - stop( - paste("Please provide the reference for the variable '", - i, "' which includes more than 2 levels: ", - paste(as.character(mlevels), collapse=", "), ".", sep="") - ) - } else { - ref <- split_reference[match(i,split_reference)+1] - other_levels <- as.character(mlevels)[! as.character(mlevels) == ref] - metadata[,i] = factor(metadata[,i], levels=c(ref, other_levels)) + split_reference <- unlist(strsplit(reference, "[,;]")) + + # for each fixed effect, check that a reference level has been set if necessary: number of levels > 2 and metadata isn't already an ordered factor + for (i in fixed_effects) { + # don't check for or require reference levels for numeric metadata + if (is.numeric(metadata[,i])) { + next + } + # respect ordering if a factor is explicitly passed in with no reference set + if (is.factor(metadata[,i]) && !(i %in% split_reference)) { + logging::loginfo(paste("Factor detected for categorial metadata '", + i, "'. Provide a reference argument or manually set factor ordering to change reference level.", sep="")) + next + } + + # set metadata as a factor (ordered alphabetically) + metadata[,i] <- as.factor(metadata[,i]) + mlevels <- levels(metadata[,i]) + + # get reference level for variable being considered, returns NA if not found + ref <- split_reference[match(i, split_reference)+1] + + # if metadata has 2 levels, allow but don't require setting reference level, otherwise require it + if ((length(mlevels) == 2)) { + if(!is.na(ref)) { + metadata[,i] = relevel(metadata[,i], ref = ref) } + } else if (length(mlevels) > 2) { + if (!is.na(ref)) { + metadata[,i] = relevel(metadata[,i], ref = ref) + } else { + stop(paste("Please provide the reference for the variable '", + i, "' which includes more than 2 levels: ", + paste(as.character(mlevels), collapse=", "), ".", sep="")) + } + } else { + stop("Error determining reference level for metadata. Please check that provided category has 2 or more non-NA values.") } - } + } unfiltered_data <- data unfiltered_metadata <- metadata From a68062481fff7b39e0ef35af5d10badaca0a894f Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Mon, 4 Oct 2021 14:26:48 -0400 Subject: [PATCH 12/54] docs: update for reference level argument changes --- README.md | 11 +++++------ vignettes/maaslin2.Rmd | 9 +++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 1b0611b..f1d3ff8 100644 --- a/README.md +++ b/README.md @@ -263,12 +263,11 @@ Options: [ Default: 1 ] -d REFERENCE, --reference=REFERENCE - The factor to use as a reference for a variable - with more than two levels provided as a string - of 'variable,reference' semi-colon delimited - for multiple variables [ Default: NA ] - NOTE: A space between the variable and reference - will not error but will cause an inaccurate result. + The factor to use as a reference level for a categorical variable + provided as a string of 'variable,reference', semi-colon delimited for + multiple variables. Not required if metadata is passed as a factor or + for variables with less than two levels but can be set regardless. + [ Default: NA ] ## Troubleshooting ## diff --git a/vignettes/maaslin2.Rmd b/vignettes/maaslin2.Rmd index d05b1fa..689ac41 100644 --- a/vignettes/maaslin2.Rmd +++ b/vignettes/maaslin2.Rmd @@ -268,10 +268,11 @@ Options: [ Default: 1 ] -d REFERENCE, --reference=REFERENCE - The factor to use as a reference for a variable - with more than two levels provided as a string - of 'variable,reference' semi-colon delimited - for multiple variables [ Default: NA ] + The factor to use as a reference level for a categorical variable + provided as a string of 'variable,reference', semi-colon delimited for + multiple variables. Not required if metadata is passed as a factor or + for variables with less than two levels but can be set regardless. + [ Default: NA ] ## Troubleshooting ## From eca2f2c43454cd4eee1f5e4e9a3d5ed3c8ad1280 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Tue, 12 Oct 2021 13:16:40 -0400 Subject: [PATCH 13/54] fix: import relevel from stats --- NAMESPACE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 92b2f55..5ac36da 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,7 +7,7 @@ import("lpsymphony") import("optparse") importFrom("grDevices", "colorRampPalette", "dev.off", "pdf", "jpeg","png") importFrom("stats", "as.formula", "coef", "glm", "lm", "na.exclude", - "p.adjust", "update") + "p.adjust", "update", "relevel") importFrom("utils", "capture.output", "read.table", "write.table") importFrom("dplyr", "%>%") importFrom("stats", "sd", "na.omit") From 3083f1ff8aa76989b2a1203d17d8b8fb1351199d Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Mon, 18 Oct 2021 11:59:05 -0400 Subject: [PATCH 14/54] fix: better state error message in ref level Starting an error with "Error" gives awkward "Error: Error..." text. --- R/Maaslin2.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/Maaslin2.R b/R/Maaslin2.R index 9b8bc70..618429f 100755 --- a/R/Maaslin2.R +++ b/R/Maaslin2.R @@ -775,7 +775,7 @@ Maaslin2 <- paste(as.character(mlevels), collapse=", "), ".", sep="")) } } else { - stop("Error determining reference level for metadata. Please check that provided category has 2 or more non-NA values.") + stop("Provided categorical metadata has fewer than 2 unique, non-NA values.") } } From f3044c5dd2be7abdeb11ebb809ed4d65114d2195 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Mon, 18 Oct 2021 13:36:54 -0400 Subject: [PATCH 15/54] Update reference level handling (#10) * feat: add optparse argument for max number of pngs Previously max number of pngs could only be set by calling plotting function as a standalone. * feat: add argument for max pngs to save to maaslin2 call Maximum number of png files to save is now passed into maaslin2 main function from R or command line. - Default is 10 as before - Value is passed to plotting function which already had this parameter * docs: update README to include max_pngs option * docs: add max_pngs info to vignetteand man page Add new feature to set max_pngs output to documentation. Fix extra space in README text. * fix: fix how reference level setting is handled Reference levels now: - Will respect factor ordering if passed in as a factor without reference level set (for use in R) - Can be set for factors with 2 levels to change ordering (doesn't change stats, but does change plots) - Will skip numeric data by checking type instead of grep for numbers (created conflicting behavior with strings that happened to have numbers in them) - Won't use undocumented "UNK" strings - Will check if metadata only has one non-NA value and give good error message * docs: update for reference level argument changes * fix: import relevel from stats * fix: better state error message in ref level Starting an error with "Error" gives awkward "Error: Error..." text. --- NAMESPACE | 2 +- R/Maaslin2.R | 58 +++++++++++++++++++++++++++--------------- README.md | 11 ++++---- vignettes/maaslin2.Rmd | 9 ++++--- 4 files changed, 48 insertions(+), 32 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 92b2f55..5ac36da 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,7 +7,7 @@ import("lpsymphony") import("optparse") importFrom("grDevices", "colorRampPalette", "dev.off", "pdf", "jpeg","png") importFrom("stats", "as.formula", "coef", "glm", "lm", "na.exclude", - "p.adjust", "update") + "p.adjust", "update", "relevel") importFrom("utils", "capture.output", "read.table", "write.table") importFrom("dplyr", "%>%") importFrom("stats", "sd", "na.omit") diff --git a/R/Maaslin2.R b/R/Maaslin2.R index 77ff5c0..618429f 100755 --- a/R/Maaslin2.R +++ b/R/Maaslin2.R @@ -736,32 +736,48 @@ Maaslin2 <- # Filter data based on min abundance and min prevalence # ######################################################### - # use ordered factor for variables with more than two levels - # find variables with more than two levels if (is.null(reference)) { reference <- "," } - - for ( i in colnames(metadata) ) { - mlevels <- unique(na.omit(metadata[,i])) - numeric_levels <- grep('^-?[0-9.]+[eE+-]?', mlevels, value = T) - if ( ( length(mlevels[! (mlevels %in% c("UNK"))]) > 2 ) && - ( i %in% fixed_effects ) && - ( length(numeric_levels) == 0)) { - split_reference <- unlist(strsplit(reference, "[,;]")) - if (! i %in% split_reference ) { - stop( - paste("Please provide the reference for the variable '", - i, "' which includes more than 2 levels: ", - paste(as.character(mlevels), collapse=", "), ".", sep="") - ) - } else { - ref <- split_reference[match(i,split_reference)+1] - other_levels <- as.character(mlevels)[! as.character(mlevels) == ref] - metadata[,i] = factor(metadata[,i], levels=c(ref, other_levels)) + split_reference <- unlist(strsplit(reference, "[,;]")) + + # for each fixed effect, check that a reference level has been set if necessary: number of levels > 2 and metadata isn't already an ordered factor + for (i in fixed_effects) { + # don't check for or require reference levels for numeric metadata + if (is.numeric(metadata[,i])) { + next + } + # respect ordering if a factor is explicitly passed in with no reference set + if (is.factor(metadata[,i]) && !(i %in% split_reference)) { + logging::loginfo(paste("Factor detected for categorial metadata '", + i, "'. Provide a reference argument or manually set factor ordering to change reference level.", sep="")) + next + } + + # set metadata as a factor (ordered alphabetically) + metadata[,i] <- as.factor(metadata[,i]) + mlevels <- levels(metadata[,i]) + + # get reference level for variable being considered, returns NA if not found + ref <- split_reference[match(i, split_reference)+1] + + # if metadata has 2 levels, allow but don't require setting reference level, otherwise require it + if ((length(mlevels) == 2)) { + if(!is.na(ref)) { + metadata[,i] = relevel(metadata[,i], ref = ref) } + } else if (length(mlevels) > 2) { + if (!is.na(ref)) { + metadata[,i] = relevel(metadata[,i], ref = ref) + } else { + stop(paste("Please provide the reference for the variable '", + i, "' which includes more than 2 levels: ", + paste(as.character(mlevels), collapse=", "), ".", sep="")) + } + } else { + stop("Provided categorical metadata has fewer than 2 unique, non-NA values.") } - } + } unfiltered_data <- data unfiltered_metadata <- metadata diff --git a/README.md b/README.md index 1b0611b..f1d3ff8 100644 --- a/README.md +++ b/README.md @@ -263,12 +263,11 @@ Options: [ Default: 1 ] -d REFERENCE, --reference=REFERENCE - The factor to use as a reference for a variable - with more than two levels provided as a string - of 'variable,reference' semi-colon delimited - for multiple variables [ Default: NA ] - NOTE: A space between the variable and reference - will not error but will cause an inaccurate result. + The factor to use as a reference level for a categorical variable + provided as a string of 'variable,reference', semi-colon delimited for + multiple variables. Not required if metadata is passed as a factor or + for variables with less than two levels but can be set regardless. + [ Default: NA ] ## Troubleshooting ## diff --git a/vignettes/maaslin2.Rmd b/vignettes/maaslin2.Rmd index d05b1fa..689ac41 100644 --- a/vignettes/maaslin2.Rmd +++ b/vignettes/maaslin2.Rmd @@ -268,10 +268,11 @@ Options: [ Default: 1 ] -d REFERENCE, --reference=REFERENCE - The factor to use as a reference for a variable - with more than two levels provided as a string - of 'variable,reference' semi-colon delimited - for multiple variables [ Default: NA ] + The factor to use as a reference level for a categorical variable + provided as a string of 'variable,reference', semi-colon delimited for + multiple variables. Not required if metadata is passed as a factor or + for variables with less than two levels but can be set regardless. + [ Default: NA ] ## Troubleshooting ## From ef1d3fafe571e6980f655558530c70dbfe0dacd6 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Wed, 27 Oct 2021 15:25:08 -0400 Subject: [PATCH 16/54] feat: output filtered feature table Output the filtered input data table so users can make their own feature plots without having to redo these steps. Do not apply transformations since those are not plotted. --- R/Maaslin2.R | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/R/Maaslin2.R b/R/Maaslin2.R index 618429f..22d5420 100755 --- a/R/Maaslin2.R +++ b/R/Maaslin2.R @@ -891,6 +891,20 @@ Maaslin2 <- # Write out the results # ######################### + ############################### + # write filtered data to file # + ############################### + + filtered_file = file.path(output, "filtered_data.tsv") + logging::loginfo("Writing filtered data to file %s", filtered_file) + write.table( + data.frame("feature" = rownames(filtered_data), filtered_data), + file = filtered_file, + sep = "\t", + quote = FALSE, + row.names = FALSE + ) + ########################### # write residuals to file # ########################### From d8c11675fba8aac6c2b1daa0e85570642396429d Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Wed, 27 Oct 2021 17:07:33 -0400 Subject: [PATCH 17/54] feat: return and store list of model fits The results of fitting models are currently only parsed and summarized as the final output. This saves the full fits as an RDS file for easier access. --- R/Maaslin2.R | 11 +++++++++++ R/fit.R | 14 +++++++++++--- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/R/Maaslin2.R b/R/Maaslin2.R index 22d5420..2ca2ae7 100755 --- a/R/Maaslin2.R +++ b/R/Maaslin2.R @@ -891,6 +891,17 @@ Maaslin2 <- # Write out the results # ######################### + fits_file = file.path(output, "fits.rds") + # remove residuals file if already exists (since residuals append) + if (file.exists(fits_file)) { + logging::logwarn( + "Deleting existing residuals file: %s", fits_file) + unlink(fits_file) + } + logging::loginfo("Writing residuals to file %s", fits_file) + saveRDS(fit_data$fits, file = fits_file) + + ############################### # write filtered data to file # ############################### diff --git a/R/fit.R b/R/fit.R index c1081ae..77caac8 100644 --- a/R/fit.R +++ b/R/fit.R @@ -253,8 +253,9 @@ fit.data <- d<-as.vector(unlist(l)) names(d)<-unlist(lapply(l, row.names)) output$ranef<-d - } } + output$fit <- fit + } else { logging::logwarn(paste( @@ -268,6 +269,7 @@ fit.data <- output$residuals <- NA output$fitted <- NA if (!(is.null(random_effects_formula))) output$ranef <- NA + output$fit <- NA } colnames(output$para) <- c('coef', 'stderr' , 'pval', 'name') output$para$feature <- colnames(features)[x] @@ -295,6 +297,12 @@ fit.data <- })) row.names(fitted) <- colnames(features) + fits <- + lapply(outputs, function(x) { + return(x$fit) + }) + names(fits) <- colnames(features) + if (!(is.null(random_effects_formula))) { ranef <- do.call(rbind, lapply(outputs, function(x) { @@ -349,9 +357,9 @@ fit.data <- rownames(paras)<-NULL if (!(is.null(random_effects_formula))) { - return(list("results" = paras, "residuals" = residuals, "fitted" = fitted, "ranef" = ranef)) + return(list("results" = paras, "residuals" = residuals, "fitted" = fitted, "fits" = fits, "ranef" = ranef)) } else { - return(list("results" = paras, "residuals" = residuals, "fitted" = fitted)) + return(list("results" = paras, "residuals" = residuals, "fitted" = fitted, "fits" = fits)) } } From cd0d6551e71536b9f337173d47bcc60de19bb3e8 Mon Sep 17 00:00:00 2001 From: YancongZhang Date: Mon, 1 Nov 2021 16:42:27 -0400 Subject: [PATCH 18/54] Update sourcing R scripts --- NEWS | 1 + R/Maaslin2.R | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/NEWS b/NEWS index be1f167..49a3b95 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,7 @@ Changes in version 1.7.1 (07-27-2021): * Update tutorial data files * Update knitr dependency for bioconductor tests +* Update sourcing R scripts Changes in version 1.5.1 (12-02-2020): * Update log from base 10 to base 2. diff --git a/R/Maaslin2.R b/R/Maaslin2.R index 618429f..7d0563e 100755 --- a/R/Maaslin2.R +++ b/R/Maaslin2.R @@ -45,7 +45,7 @@ if (identical(environment(), globalenv()) && script_dir <- dirname(script_path) script_name <- basename(script_path) - for (R_file in dir(script_dir, pattern = "*.R")) + for (R_file in dir(script_dir, pattern = "*.R$")) { if (!(R_file == script_name)) source(file.path(script_dir, R_file)) From 0f55d18b9e93355347849c82b77ceffc999d0ab8 Mon Sep 17 00:00:00 2001 From: ljmciver Date: Tue, 2 Nov 2021 08:20:50 -0600 Subject: [PATCH 19/54] Add new version --- DESCRIPTION | 2 +- NEWS | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 37c40c2..0021157 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: Maaslin2 Title: "Multivariable Association Discovery in Population-scale Meta-omics Studies" Year: 2021 -Version: 1.7.2 +Version: 1.7.3 Authors@R: c( person("Himel", "Mallick", email = "himel.stat.iitk@gmail.com", role = "aut"), person("Ali", "Rahnavard", email = "gholamali.rahnavard@gmail.com", role = "aut"), diff --git a/NEWS b/NEWS index 49a3b95..6e76f2e 100644 --- a/NEWS +++ b/NEWS @@ -1,8 +1,14 @@ +Changes in version 1.7.3 (11-02-2021): +* Update sourcing R scripts +* Update reference level handling +* Update example files +* Fix saved plots to allow for runs with less then N plots per metadata +* Convert strings to numeric when needed for metadata +* Fix actions to build with latest version of dependencies Changes in version 1.7.1 (07-27-2021): * Update tutorial data files * Update knitr dependency for bioconductor tests -* Update sourcing R scripts Changes in version 1.5.1 (12-02-2020): * Update log from base 10 to base 2. From f4875772a90f7d7fdf508f09347da653f11402e4 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Mon, 22 Nov 2021 09:49:05 -0500 Subject: [PATCH 20/54] feat: write additional output tables Write the filtered, filtered/normalized, and filtered/normalized/transformed tables to tsv files --- R/Maaslin2.R | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/R/Maaslin2.R b/R/Maaslin2.R index 2ca2ae7..0457311 100755 --- a/R/Maaslin2.R +++ b/R/Maaslin2.R @@ -902,9 +902,9 @@ Maaslin2 <- saveRDS(fit_data$fits, file = fits_file) - ############################### - # write filtered data to file # - ############################### + ########################################## + # write processed feature tables to file # + ########################################## filtered_file = file.path(output, "filtered_data.tsv") logging::loginfo("Writing filtered data to file %s", filtered_file) @@ -916,6 +916,26 @@ Maaslin2 <- row.names = FALSE ) + filtered_data_norm_file = file.path(output, "filtered_data_norm.tsv") + logging::loginfo("Writing filtered, normalized data to file %s", filtered_data_norm_file) + write.table( + data.frame("feature" = rownames(filtered_data_norm), filtered_data_norm), + file = filtered_data_norm_file, + sep = "\t", + quote = FALSE, + row.names = FALSE + ) + + filtered_data_norm_transformed_file = file.path(output, "filtered_data_norm_transformed.tsv") + logging::loginfo("Writing filtered, normalized, transformed data to file %s", filtered_data_norm_transformed_file) + write.table( + data.frame("feature" = rownames(filtered_data_norm_transformed), filtered_data_norm_transformed), + file = filtered_data_norm_transformed_file, + sep = "\t", + quote = FALSE, + row.names = FALSE + ) + ########################### # write residuals to file # ########################### From 12cc1cffca7852faf12453f04b9523f1a9c3c755 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Mon, 22 Nov 2021 13:41:26 -0500 Subject: [PATCH 21/54] feat: save filtered tables to dedicated folder --- R/Maaslin2.R | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/R/Maaslin2.R b/R/Maaslin2.R index 0457311..1c5563a 100755 --- a/R/Maaslin2.R +++ b/R/Maaslin2.R @@ -385,14 +385,21 @@ Maaslin2 <- print("Creating output folder") dir.create(output) } - - if (plot_heatmap || plot_scatter){ - figures_folder <- file.path(output,"figures") - if (!file.exists(figures_folder)) { - print("Creating output figures folder") - dir.create(figures_folder) + + features_folder <- file.path(output, "features") + if (!file.exists(features_folder)) { + print("Creating output feature tables folder") + dir.create(features_folder) } + + if (plot_heatmap || plot_scatter) { + figures_folder <- file.path(output, "figures") + if (!file.exists(figures_folder)) { + print("Creating output figures folder") + dir.create(figures_folder) + } } + # create log file (write info to stdout and debug level to log file) # set level to finest so all log levels are reviewed @@ -906,7 +913,7 @@ Maaslin2 <- # write processed feature tables to file # ########################################## - filtered_file = file.path(output, "filtered_data.tsv") + filtered_file = file.path(features_folder, "filtered_data.tsv") logging::loginfo("Writing filtered data to file %s", filtered_file) write.table( data.frame("feature" = rownames(filtered_data), filtered_data), @@ -916,7 +923,7 @@ Maaslin2 <- row.names = FALSE ) - filtered_data_norm_file = file.path(output, "filtered_data_norm.tsv") + filtered_data_norm_file = file.path(features_folder, "filtered_data_norm.tsv") logging::loginfo("Writing filtered, normalized data to file %s", filtered_data_norm_file) write.table( data.frame("feature" = rownames(filtered_data_norm), filtered_data_norm), @@ -926,7 +933,7 @@ Maaslin2 <- row.names = FALSE ) - filtered_data_norm_transformed_file = file.path(output, "filtered_data_norm_transformed.tsv") + filtered_data_norm_transformed_file = file.path(features_folder, "filtered_data_norm_transformed.tsv") logging::loginfo("Writing filtered, normalized, transformed data to file %s", filtered_data_norm_transformed_file) write.table( data.frame("feature" = rownames(filtered_data_norm_transformed), filtered_data_norm_transformed), From 46200b685fb559fae610316d3f7c83a3589c160e Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Mon, 22 Nov 2021 14:01:23 -0500 Subject: [PATCH 22/54] feat: write rds files to dedicated folder Previously these were dumped to the main output folder which created some clutter and confusion. --- R/Maaslin2.R | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/R/Maaslin2.R b/R/Maaslin2.R index 1c5563a..a96ac22 100755 --- a/R/Maaslin2.R +++ b/R/Maaslin2.R @@ -391,6 +391,12 @@ Maaslin2 <- print("Creating output feature tables folder") dir.create(features_folder) } + + fits_folder <- file.path(output, "fits") + if (!file.exists(fits_folder)) { + print("Creating output fits folder") + dir.create(fits_folder) + } if (plot_heatmap || plot_scatter) { figures_folder <- file.path(output, "figures") @@ -894,11 +900,11 @@ Maaslin2 <- length(which(filtered_data_norm[, x[1]] > 0)) ) - ######################### - # Write out the results # - ######################### + ################################ + # Write out the raw model fits # + ################################ - fits_file = file.path(output, "fits.rds") + fits_file = file.path(fits_folder, "fits.rds") # remove residuals file if already exists (since residuals append) if (file.exists(fits_file)) { logging::logwarn( @@ -910,7 +916,7 @@ Maaslin2 <- ########################################## - # write processed feature tables to file # + # Write processed feature tables to file # ########################################## filtered_file = file.path(features_folder, "filtered_data.tsv") @@ -944,10 +950,10 @@ Maaslin2 <- ) ########################### - # write residuals to file # + # Write residuals to file # ########################### - residuals_file = file.path(output, "residuals.rds") + residuals_file = file.path(fits_folder, "residuals.rds") # remove residuals file if already exists (since residuals append) if (file.exists(residuals_file)) { logging::logwarn( @@ -958,10 +964,10 @@ Maaslin2 <- saveRDS(fit_data$residuals, file = residuals_file) ############################### - # write fitted values to file # + # Write fitted values to file # ############################### - fitted_file = file.path(output, "fitted.rds") + fitted_file = file.path(fits_folder, "fitted.rds") # remove fitted file if already exists (since fitted append) if (file.exists(fitted_file)) { logging::logwarn( @@ -971,12 +977,12 @@ Maaslin2 <- logging::loginfo("Writing fitted values to file %s", fitted_file) saveRDS(fit_data$fitted, file = fitted_file) - ######################################################## - # write extracted random effects to file (if specified) # - ######################################################## + ######################################################### + # Write extracted random effects to file (if specified) # + ######################################################### if (!is.null(random_effects)) { - ranef_file = file.path(output, "ranef.rds") + ranef_file = file.path(fits_folder, "ranef.rds") # remove ranef file if already exists (since ranef append) if (file.exists(ranef_file)) { logging::logwarn( @@ -988,7 +994,7 @@ Maaslin2 <- } ############################# - # write all results to file # + # Write all results to file # ############################# results_file <- file.path(output, "all_results.tsv") @@ -1028,7 +1034,7 @@ Maaslin2 <- ) ########################################### - # write results passing threshold to file # + # Write results passing threshold to file # ########################################### significant_results <- From b6ba704ed98b96ce8798b1d821f6b8518de77b8d Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Mon, 22 Nov 2021 14:08:04 -0500 Subject: [PATCH 23/54] fix: change "residuals" to "fits" --- R/Maaslin2.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/Maaslin2.R b/R/Maaslin2.R index a96ac22..e35df77 100755 --- a/R/Maaslin2.R +++ b/R/Maaslin2.R @@ -905,13 +905,13 @@ Maaslin2 <- ################################ fits_file = file.path(fits_folder, "fits.rds") - # remove residuals file if already exists (since residuals append) + # remove fits file if already exists (since fits append) if (file.exists(fits_file)) { logging::logwarn( - "Deleting existing residuals file: %s", fits_file) + "Deleting existing fits file: %s", fits_file) unlink(fits_file) } - logging::loginfo("Writing residuals to file %s", fits_file) + logging::loginfo("Writing fits to file %s", fits_file) saveRDS(fit_data$fits, file = fits_file) From c23d1ec894cededa75c2c71d6eb2588a96323751 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Mon, 22 Nov 2021 14:28:42 -0500 Subject: [PATCH 24/54] docs: update readme to include new outputs --- README.md | 4 ++++ vignettes/maaslin2.Rmd | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/README.md b/README.md index f1d3ff8..6e81d7f 100644 --- a/README.md +++ b/README.md @@ -123,6 +123,10 @@ MaAsLin2 generates two types of output files: data and visualization. * ``significant_results.tsv`` * This file is a subset of the results in the first file. * It only includes associations with q-values <= to the threshold. + * ``features``` + * This folder includes the filtered, normalized, and transformed versions of the input feature table if applicable. + * ``fits.rds`` + * This file contains a list of lists with every model fit. * ``residuals.rds`` * This file contains a data frame with residuals for each feature. * ``fitted.rds`` diff --git a/vignettes/maaslin2.Rmd b/vignettes/maaslin2.Rmd index 689ac41..edc17f5 100644 --- a/vignettes/maaslin2.Rmd +++ b/vignettes/maaslin2.Rmd @@ -127,6 +127,10 @@ MaAsLin2 generates two types of output files: data and visualization. * ``significant_results.tsv`` * This file is a subset of the results in the first file. * It only includes associations with q-values <= to the threshold. + * ``features``` + * This folder includes the filtered, normalized, and transformed versions of the input feature table if applicable. + * ``fits.rds`` + * This file contains a list of lists with every model fit. * ``residuals.rds`` * This file contains a data frame with residuals for each feature. * ``fitted.rds`` From 8b61217128209397c88c452a24d4d73f971f3d90 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Mon, 22 Nov 2021 14:37:31 -0500 Subject: [PATCH 25/54] docs: update feature folder description Currently all three feature tables are output even if steps do nothing (e.g. transform = NONE). Make this clear. --- README.md | 4 +++- vignettes/maaslin2.Rmd | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6e81d7f..da80ef7 100644 --- a/README.md +++ b/README.md @@ -124,7 +124,9 @@ MaAsLin2 generates two types of output files: data and visualization. * This file is a subset of the results in the first file. * It only includes associations with q-values <= to the threshold. * ``features``` - * This folder includes the filtered, normalized, and transformed versions of the input feature table if applicable. + * This folder includes the filtered, normalized, and transformed versions of the input feature table. + * These steps are performed sequentially in the above order. + * If an option is set such that a step does not change the data, the resulting table will still be output. * ``fits.rds`` * This file contains a list of lists with every model fit. * ``residuals.rds`` diff --git a/vignettes/maaslin2.Rmd b/vignettes/maaslin2.Rmd index edc17f5..604c98e 100644 --- a/vignettes/maaslin2.Rmd +++ b/vignettes/maaslin2.Rmd @@ -128,7 +128,9 @@ MaAsLin2 generates two types of output files: data and visualization. * This file is a subset of the results in the first file. * It only includes associations with q-values <= to the threshold. * ``features``` - * This folder includes the filtered, normalized, and transformed versions of the input feature table if applicable. + * This folder includes the filtered, normalized, and transformed versions of the input feature table. + * These steps are performed sequentially in the above order. + * If an option is set such that a step does not change the data, the resulting table will still be output. * ``fits.rds`` * This file contains a list of lists with every model fit. * ``residuals.rds`` From 34c42b7d95be0d86375cca0aa4b74ce3e4c2fd69 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Mon, 22 Nov 2021 16:07:30 -0500 Subject: [PATCH 26/54] fix: stop extra plot from generating Previously if run from the command line, an extra copy of the heatmap would be saved in the working directory. --- R/viz.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/viz.R b/R/viz.R index d528bc8..789cfc0 100644 --- a/R/viz.R +++ b/R/viz.R @@ -235,7 +235,8 @@ maaslin2_heatmap <- treeheight_row = 0, treeheight_col = 0, display_numbers = matrix(ifelse( - a > 0.0, "+", ifelse(a < 0.0, "-", "")), nrow(a)) + a > 0.0, "+", ifelse(a < 0.0, "-", "")), nrow(a)), + silent = TRUE ) }, error = function(err) { logging::logerror("Unable to plot heatmap") From 11d0aa4a7baec70eaec66c2b16ce9ae606079e43 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Tue, 23 Nov 2021 11:07:03 -0500 Subject: [PATCH 27/54] feat: warn on identical fixed/random effect Previously automatically removed the repeated parameter, but this isn't always the desired behavior. However, it's not common, so warn the user that they are doing this to make sure it's not a mistake. --- R/Maaslin2.R | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/R/Maaslin2.R b/R/Maaslin2.R index ef4d8ef..3488ea5 100755 --- a/R/Maaslin2.R +++ b/R/Maaslin2.R @@ -685,7 +685,14 @@ Maaslin2 <- random_effects <- unlist(strsplit(random_effects, ",", fixed = TRUE)) # subtract random effects from fixed effects - fixed_effects <- setdiff(fixed_effects, random_effects) + common_variables <- intersect(fixed_effects, random_effects) + if (length(common_variables) > 0) { + logging::logwarn( + paste("Feature name included as fixed and random effect,", + "check that this is intended: %s"), + paste(common_variables, collapse = " , ") + ) + } # remove any random effects not found in metadata to_remove <- setdiff(random_effects, colnames(metadata)) if (length(to_remove) > 0) From f5f64e0a58af90e17362d1e4ace32578f581c3b8 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Wed, 24 Nov 2021 11:15:21 -0500 Subject: [PATCH 28/54] feat: add option to save model fits Previously this was always done, but this file can be somewhat large. --- R/Maaslin2.R | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/R/Maaslin2.R b/R/Maaslin2.R index ef4d8ef..be10004 100755 --- a/R/Maaslin2.R +++ b/R/Maaslin2.R @@ -296,7 +296,17 @@ options <- "run in parallel [ Default: %default ]" ) ) - +options <- + optparse::add_option( + options, + c("-j", "--save_models"), + type = "logical", + dest = "save_models", + default = args$save_models, + help = paste("Save the all full model objects", + " as an RData file [ Default: %default ]" + ) + ) options <- optparse::add_option( options, @@ -341,6 +351,7 @@ Maaslin2 <- plot_scatter = TRUE, max_pngs = 10, heatmap_first_n = 50, + save_models = FALSE, reference = NULL) { # Allow for lower case variables From 7ba97709325053e6ba6784dded79938bcbd2588a Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Wed, 24 Nov 2021 11:41:15 -0500 Subject: [PATCH 29/54] feat: only save models if indicated Also rename the output from "fits" to "model_objects" to make it more clear what this actually is. Don't change the name in fit.R so that it is consistent with the conventions in that file. --- R/Maaslin2.R | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/R/Maaslin2.R b/R/Maaslin2.R index be10004..fe397d0 100755 --- a/R/Maaslin2.R +++ b/R/Maaslin2.R @@ -915,16 +915,17 @@ Maaslin2 <- # Write out the raw model fits # ################################ - fits_file = file.path(fits_folder, "fits.rds") - # remove fits file if already exists (since fits append) - if (file.exists(fits_file)) { - logging::logwarn( - "Deleting existing fits file: %s", fits_file) - unlink(fits_file) + if (save_models) { + model_file = file.path(fits_folder, "models.rds") + # remove models file if already exists (since models append) + if (file.exists(model_file)) { + logging::logwarn( + "Deleting existing model objects file: %s", model_file) + unlink(model_file) + } + logging::loginfo("Writing model objects to file %s", model_file) + saveRDS(fit_data$fits, file = model_file) } - logging::loginfo("Writing fits to file %s", fits_file) - saveRDS(fit_data$fits, file = fits_file) - ########################################## # Write processed feature tables to file # From 0e5762cbf237dd53eb8172b5db81ba7961200c24 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Wed, 24 Nov 2021 12:09:32 -0500 Subject: [PATCH 30/54] feat: add command line option for save_models --- R/Maaslin2.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/Maaslin2.R b/R/Maaslin2.R index fe397d0..429638c 100755 --- a/R/Maaslin2.R +++ b/R/Maaslin2.R @@ -1164,6 +1164,7 @@ if (identical(environment(), globalenv()) && current_args$plot_scatter, current_args$max_pngs, current_args$heatmap_first_n, + current_args$save_models, current_args$reference ) } From 44733809e50a6ce64718814fbc9715492d080832 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Wed, 24 Nov 2021 15:24:21 -0500 Subject: [PATCH 31/54] docs: add save_models option to docs --- R/Maaslin2.R | 2 +- README.md | 11 +++++++---- vignettes/maaslin2.Rmd | 11 +++++++---- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/R/Maaslin2.R b/R/Maaslin2.R index 429638c..25d7218 100755 --- a/R/Maaslin2.R +++ b/R/Maaslin2.R @@ -303,7 +303,7 @@ options <- type = "logical", dest = "save_models", default = args$save_models, - help = paste("Save the all full model objects", + help = paste("Save all full model objects", " as an RData file [ Default: %default ]" ) ) diff --git a/README.md b/README.md index da80ef7..24142b3 100644 --- a/README.md +++ b/README.md @@ -127,8 +127,9 @@ MaAsLin2 generates two types of output files: data and visualization. * This folder includes the filtered, normalized, and transformed versions of the input feature table. * These steps are performed sequentially in the above order. * If an option is set such that a step does not change the data, the resulting table will still be output. - * ``fits.rds`` - * This file contains a list of lists with every model fit. + * ``models.rds`` + * This file contains a list with every model fit object. + * It will only be generated if save_models is set to TRUE. * ``residuals.rds`` * This file contains a data frame with residuals for each feature. * ``fitted.rds`` @@ -214,8 +215,7 @@ Options: is detected at minimum abundance [ Default: 0.1 ] -b MIN_VARIANCE, --min_variance=MIN_VARIANCE - Keep features with variance greater than - [ Default: 0.0 ] + Keep features with variance greater than [ Default: 0.0 ] -s MAX_SIGNIFICANCE, --max_significance=MAX_SIGNIFICANCE The q-value threshold for significance [ Default: 0.25 ] @@ -267,6 +267,9 @@ Options: -e CORES, --cores=CORES The number of R processes to run in parallel [ Default: 1 ] + + -j SAVE_MODELS --save_models=SAVE_MODELS + Save all full model objects as an RData file [ Default: FALSE ] -d REFERENCE, --reference=REFERENCE The factor to use as a reference level for a categorical variable diff --git a/vignettes/maaslin2.Rmd b/vignettes/maaslin2.Rmd index 604c98e..8475ad8 100644 --- a/vignettes/maaslin2.Rmd +++ b/vignettes/maaslin2.Rmd @@ -131,8 +131,9 @@ MaAsLin2 generates two types of output files: data and visualization. * This folder includes the filtered, normalized, and transformed versions of the input feature table. * These steps are performed sequentially in the above order. * If an option is set such that a step does not change the data, the resulting table will still be output. - * ``fits.rds`` - * This file contains a list of lists with every model fit. + * ``models.rds`` + * This file contains a list with every model fit object. + * It will only be generated if save_models is set to TRUE. * ``residuals.rds`` * This file contains a data frame with residuals for each feature. * ``fitted.rds`` @@ -219,8 +220,7 @@ Options: is detected at minimum abundance [ Default: 0.1 ] -b MIN_VARIANCE, --min_variance=MIN_VARIANCE - Keep features with variance greater than - [ Default: 0.0 ] + Keep features with variance greater than [ Default: 0.0 ] -s MAX_SIGNIFICANCE, --max_significance=MAX_SIGNIFICANCE The q-value threshold for significance [ Default: 0.25 ] @@ -272,6 +272,9 @@ Options: -e CORES, --cores=CORES The number of R processes to run in parallel [ Default: 1 ] + + -j SAVE_MODELS --save_models=SAVE_MODELS + Save all full model objects as an RData file [ Default: FALSE ] -d REFERENCE, --reference=REFERENCE The factor to use as a reference level for a categorical variable From 079149e8b386c172330a8ef625a8e0fb9e5271da Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Wed, 24 Nov 2021 15:25:22 -0500 Subject: [PATCH 32/54] fix: add max_pngs to man page --- man/Maaslin2.Rd | 1 + 1 file changed, 1 insertion(+) diff --git a/man/Maaslin2.Rd b/man/Maaslin2.Rd index b6f7e8a..70f0498 100644 --- a/man/Maaslin2.Rd +++ b/man/Maaslin2.Rd @@ -31,6 +31,7 @@ Maaslin2( cores = 1, plot_heatmap = TRUE, plot_scatter = TRUE, + max_pngs = 10, heatmap_first_n = 50, reference = NULL ) From 421ef8a7996fe6fba3f66c2f92338b1679c53b75 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Wed, 24 Nov 2021 15:27:36 -0500 Subject: [PATCH 33/54] doc: add save_models to man --- man/Maaslin2.Rd | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/man/Maaslin2.Rd b/man/Maaslin2.Rd index 70f0498..1856d0b 100644 --- a/man/Maaslin2.Rd +++ b/man/Maaslin2.Rd @@ -33,6 +33,7 @@ Maaslin2( plot_scatter = TRUE, max_pngs = 10, heatmap_first_n = 50, + save_models = FALSE, reference = NULL ) } @@ -95,6 +96,9 @@ Maaslin2( } \item{cores}{ The number of R processes to run in parallel. +} + \item{save_models}{ + Save the full model outputs to an RData file. } \item{reference}{ The factor to use as a reference for a variable with more than two levels From 52e0f7ec52140b35eccb01edb8ab8b803acbfbf8 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Tue, 23 Nov 2021 11:07:03 -0500 Subject: [PATCH 34/54] feat: warn on identical fixed/random effect Previously automatically removed the repeated parameter, but this isn't always the desired behavior. However, it's not common, so warn the user that they are doing this to make sure it's not a mistake. --- R/Maaslin2.R | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/R/Maaslin2.R b/R/Maaslin2.R index 25d7218..fc8fc4e 100755 --- a/R/Maaslin2.R +++ b/R/Maaslin2.R @@ -696,7 +696,14 @@ Maaslin2 <- random_effects <- unlist(strsplit(random_effects, ",", fixed = TRUE)) # subtract random effects from fixed effects - fixed_effects <- setdiff(fixed_effects, random_effects) + common_variables <- intersect(fixed_effects, random_effects) + if (length(common_variables) > 0) { + logging::logwarn( + paste("Feature name included as fixed and random effect,", + "check that this is intended: %s"), + paste(common_variables, collapse = " , ") + ) + } # remove any random effects not found in metadata to_remove <- setdiff(random_effects, colnames(metadata)) if (length(to_remove) > 0) From 9ac27802bbb95548980bba922affbe6a9532cecf Mon Sep 17 00:00:00 2001 From: Himel Mallick Date: Mon, 6 Dec 2021 15:40:11 -0500 Subject: [PATCH 35/54] Update CITATION --- inst/CITATION | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/inst/CITATION b/inst/CITATION index e7b60e1..ab74dbf 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -1,14 +1,15 @@ bibentry(bibtype = "Article", title = "Multivariable Association in Population-scale Meta-omics Studies.", author = "Himel Mallick, Ali Rahnavard, Lauren J. McIver, Siyuan Ma, Yangcong Zhang, Long H. Nguyen, Timothy L. Tickle, Geroge Weingart, Boyu Ren, Emma H. Schwager, Suvo Chatterjee, Kelsey N. Thompson, Jeremy E. Wilkinson, Ayshwarya Subramanian, Yiren Lu, Levi Waldron, Joseph N. Paulson, Eric A. Franzosa, Hector Corrada Bravo, Curtis Huttenhower", - year = 2020, - journal = "Nature Methods", - volume = "In Submission", - pages = "", - doi = "", + year = 2021, + journal = "PLOS Computational Biology", + volume = "17", + issue = "11", + pages = "e1009442", + doi = "https://doi.org/10.1371/journal.pcbi.1009442", url = "http://huttenhower.sph.harvard.edu/maaslin2", header = "To cite MaAsLin 2 in publications, please use:", - textVersion = "Mallick H et al. (2020). Multivariable Association in Population-scale Meta-omics Studies, http://huttenhower.sph.harvard.edu/maaslin2." + textVersion = "Mallick H et al. (2021). Multivariable Association in Population-scale Meta-omics Studies, http://huttenhower.sph.harvard.edu/maaslin2." ) bibentry(bibtype = "Manual", From a1ed56e43d8b221b558247934f144bdde88744b5 Mon Sep 17 00:00:00 2001 From: Himel Mallick Date: Mon, 6 Dec 2021 15:41:35 -0500 Subject: [PATCH 36/54] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f1d3ff8..b33db07 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ MaAsLin2 is the next generation of MaAsLin (Microbiome Multivariable Association If you use the MaAsLin2 software, please cite our manuscript: -Mallick H, Rahnavard A, McIver LJ, Ma S, Zhang Y, Nguyen LH, Tickle TL, Weingart G, Ren B, Schwager EH, Chatterjee S, Thompson KN, Wilkinson JE, Subramanian A, Lu Y, Waldron L, Paulson JN, Franzosa EA, Bravo HC, Huttenhower C (2021). [Multivariable Association Discovery in Population-scale Meta-omics Studies](https://www.biorxiv.org/content/10.1101/2021.01.20.427420v1). bioRxiv, https://doi.org/10.1101/2021.01.20.427420. +Mallick H, Rahnavard A, McIver LJ, Ma S, Zhang Y, Nguyen LH, Tickle TL, Weingart G, Ren B, Schwager EH, Chatterjee S, Thompson KN, Wilkinson JE, Subramanian A, Lu Y, Waldron L, Paulson JN, Franzosa EA, Bravo HC, Huttenhower C (2021). [Multivariable Association Discovery in Population-scale Meta-omics Studies](https://www.biorxiv.org/content/10.1101/2021.01.20.427420v1). PLoS Computational Biology, 17(11):e1009442. Check out the [MaAsLin 2 tutorial](https://github.com/biobakery/biobakery/wiki/maaslin2) for an overview of analysis options. From 11e5c5deb94d85abbd41295374d71804e275ccab Mon Sep 17 00:00:00 2001 From: Himel Mallick Date: Mon, 6 Dec 2021 15:42:36 -0500 Subject: [PATCH 37/54] Update citation --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b33db07..93c8394 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ MaAsLin2 is the next generation of MaAsLin (Microbiome Multivariable Association If you use the MaAsLin2 software, please cite our manuscript: -Mallick H, Rahnavard A, McIver LJ, Ma S, Zhang Y, Nguyen LH, Tickle TL, Weingart G, Ren B, Schwager EH, Chatterjee S, Thompson KN, Wilkinson JE, Subramanian A, Lu Y, Waldron L, Paulson JN, Franzosa EA, Bravo HC, Huttenhower C (2021). [Multivariable Association Discovery in Population-scale Meta-omics Studies](https://www.biorxiv.org/content/10.1101/2021.01.20.427420v1). PLoS Computational Biology, 17(11):e1009442. +Mallick H, Rahnavard A, McIver LJ, Ma S, Zhang Y, Nguyen LH, Tickle TL, Weingart G, Ren B, Schwager EH, Chatterjee S, Thompson KN, Wilkinson JE, Subramanian A, Lu Y, Waldron L, Paulson JN, Franzosa EA, Bravo HC, Huttenhower C (2021). [Multivariable Association Discovery in Population-scale Meta-omics Studies](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1009442). PLoS Computational Biology, 17(11):e1009442. Check out the [MaAsLin 2 tutorial](https://github.com/biobakery/biobakery/wiki/maaslin2) for an overview of analysis options. From 2f2662aea2b67afae1b4aa6d780bc76903b521e6 Mon Sep 17 00:00:00 2001 From: sagun Date: Fri, 11 Feb 2022 10:50:37 -0500 Subject: [PATCH 38/54] Added PR template --- .DS_Store | Bin 0 -> 6148 bytes .github/pull_request_template.md | 13 +++++++++++++ 2 files changed, 13 insertions(+) create mode 100644 .DS_Store create mode 100644 .github/pull_request_template.md diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..024c653d9f1c42a8a72ec2d57e30244b9326a4cb GIT binary patch literal 6148 zcmeHK%Wl&^6upzU#CCQoD+sMi7Fs z=L=xLFCadH4?uhiD>(C@vZuhZqG+x(bLMgGo$(oa#zRD`GaogHszhWXakN}yzcJp= z(f1e_H|m?OIf)6N)I)zTaK+)qe;R4^jEWqY2F^ z)YzZ?&JGvpnc5q^glE$z%^HnwtW>r(Zd7dBb={lp3xAR<{4B`kX(yOHRtDD4G5XIw}aE?Y}P@W$~@hDkzl6gEzMI2vt*tX+1N7e1+@XT1h+NS{x=>Yi?+S9z=XwFp*KG{8@G5$U zwaa@T!1XcpTHBk~XUb-#MggP16)C{?2M>v3Tj5%w*gBAjD*(_!voMs!uRqY^4Zya- zwM4YQm~;h7SEjBQOx+xqbVq+%@z)ZiJ25db;;50Cx}h+2@nD{y6KhK}xlzCp5N=>k<{6G>EZZU>meN@apSzTMB#!=9mk@;kK%14 bVd(RC0oYc!mWURZ`4JE@n9L|}r3(B6cwX>| literal 0 HcmV?d00001 diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..559a9fe --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,13 @@ +We welcome feedback and issue reporting for all bioBakery tools through our [Discourse site](https://forum.biobakery.org/c/pull-request/featurepull-request/). For users that would like to directly contribute to the [tools](https://github.com/biobakery/) we are happy to field PRs to address bug fixes. Please note the turn around time on our end might be a bit long to field these but that does not mean we don't value the contribution! We currently don't accept PRs to add new functionality to tools but we would be happy to receive your feedback on [Discourse](https://forum.biobakery.org/c/pull-request/featurepull-request/). + + + +## Description + + +## Related Issue + + + + +## Screenshots (if appropriate): \ No newline at end of file From 681e764b758d60193b2b939d3e4d6202f4b025c5 Mon Sep 17 00:00:00 2001 From: sagun Date: Fri, 11 Feb 2022 11:03:01 -0500 Subject: [PATCH 39/54] Removed DS store --- .DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 024c653d9f1c42a8a72ec2d57e30244b9326a4cb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%Wl&^6upzU#CCQoD+sMi7Fs z=L=xLFCadH4?uhiD>(C@vZuhZqG+x(bLMgGo$(oa#zRD`GaogHszhWXakN}yzcJp= z(f1e_H|m?OIf)6N)I)zTaK+)qe;R4^jEWqY2F^ z)YzZ?&JGvpnc5q^glE$z%^HnwtW>r(Zd7dBb={lp3xAR<{4B`kX(yOHRtDD4G5XIw}aE?Y}P@W$~@hDkzl6gEzMI2vt*tX+1N7e1+@XT1h+NS{x=>Yi?+S9z=XwFp*KG{8@G5$U zwaa@T!1XcpTHBk~XUb-#MggP16)C{?2M>v3Tj5%w*gBAjD*(_!voMs!uRqY^4Zya- zwM4YQm~;h7SEjBQOx+xqbVq+%@z)ZiJ25db;;50Cx}h+2@nD{y6KhK}xlzCp5N=>k<{6G>EZZU>meN@apSzTMB#!=9mk@;kK%14 bVd(RC0oYc!mWURZ`4JE@n9L|}r3(B6cwX>| From 2528b2d95544a190a155174e66d4eec2e32220da Mon Sep 17 00:00:00 2001 From: Vic_ks Date: Wed, 16 Feb 2022 10:56:32 -0500 Subject: [PATCH 40/54] Updated pull_req template --- .github/pull_request_template.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 559a9fe..743bf17 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,6 +1,7 @@ -We welcome feedback and issue reporting for all bioBakery tools through our [Discourse site](https://forum.biobakery.org/c/pull-request/featurepull-request/). For users that would like to directly contribute to the [tools](https://github.com/biobakery/) we are happy to field PRs to address bug fixes. Please note the turn around time on our end might be a bit long to field these but that does not mean we don't value the contribution! We currently don't accept PRs to add new functionality to tools but we would be happy to receive your feedback on [Discourse](https://forum.biobakery.org/c/pull-request/featurepull-request/). +We welcome feedback and issue reporting for all bioBakery tools through our [Discourse site](https://forum.biobakery.org/c/pull-request/featurepull-request/). For users that would like to directly contribute to the [tools](https://github.com/biobakery/) we are happy to field PRs to address **bug fixes**. Please note the turn around time on our end might be a bit long to field these but that does not mean we don't value the contribution! We currently **don't** accept PRs to add **new functionality** to tools but we would be happy to receive your feedback on [Discourse](https://forum.biobakery.org/c/pull-request/featurepull-request/). + +Also, we will make sure to attribute your contribution in our User’s manual(README.md) and in any associated paper Acknowledgements. - ## Description @@ -10,4 +11,4 @@ We welcome feedback and issue reporting for all bioBakery tools through our [Dis -## Screenshots (if appropriate): \ No newline at end of file +## Screenshots (if appropriate): From 930ccf4a7a05aeb9985cfeb014aa7314a9960942 Mon Sep 17 00:00:00 2001 From: Vic_ks Date: Wed, 16 Feb 2022 11:01:40 -0500 Subject: [PATCH 41/54] Added contribution section --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index db8754b..a25a70d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ + # MaAsLin2 User Manual # MaAsLin2 is the next generation of MaAsLin (Microbiome Multivariable Association with Linear Models). @@ -278,6 +279,10 @@ Options: for variables with less than two levels but can be set regardless. [ Default: NA ] +## Contributions ## +Thanks go to these wonderful people: +- Nick Waters + ## Troubleshooting ## 1. Question: When I run from the command line I see the error ``Maaslin2.R: command not found``. How do I fix this? From bbd554f3efeb2a148b00577b66f7568d029ca4f5 Mon Sep 17 00:00:00 2001 From: chuttenh Date: Wed, 16 Feb 2022 11:07:45 -0500 Subject: [PATCH 42/54] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index a25a70d..a6b6332 100644 --- a/README.md +++ b/README.md @@ -281,7 +281,9 @@ Options: ## Contributions ## Thanks go to these wonderful people: + - Nick Waters + * Design of the PR and attribution process ## Troubleshooting ## From c372d85e3638bbceccdadfe1b2f9c2b7282932ab Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Fri, 4 Mar 2022 16:23:05 -0500 Subject: [PATCH 43/54] fix: make correction method input case agnostic (#13) Unlike other method variables, p-value correction is case sensitive because of p.adjust function behavior, so we need to account for that instead of always making uppercase. --- R/Maaslin2.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/Maaslin2.R b/R/Maaslin2.R index fc8fc4e..d22e8cf 100755 --- a/R/Maaslin2.R +++ b/R/Maaslin2.R @@ -358,7 +358,9 @@ Maaslin2 <- normalization <- toupper(normalization) transform <- toupper(transform) analysis_method <- toupper(analysis_method) - correction <- toupper(correction) + + # Match variable ignoring case then set correctly as required for p.adjust + correction <- correction_choices[match(toupper(correction), toupper(correction_choices))] ################################################################# # Read in the data and metadata, create output folder, init log # From e51736186513af0d540da8728fb25fc9a5356891 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Thu, 10 Mar 2022 17:27:04 -0500 Subject: [PATCH 44/54] Check for normalization when using AST (#14) * fix: make correction method input case agnostic Unlike other method variables, p-value correction is case sensitive because of p.adjust function behavior, so we need to account for that instead of always making uppercase. * fix: error for invalid values for AST transform AST is only defined between -1 and 1, but since "NONE" is a valid normalization choice, the user could supply invalid values. This previously generated NaNs and continued running. --- R/utility_scripts.R | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/R/utility_scripts.R b/R/utility_scripts.R index 4fe6239..f8a0bb9 100644 --- a/R/utility_scripts.R +++ b/R/utility_scripts.R @@ -186,7 +186,15 @@ TMMnorm = function(features) { # Arc Sine Square Root Transformation AST <- function(x) { - return(sign(x) * asin(sqrt(abs(x)))) + y <- sign(x) * asin(sqrt(abs(x))) + if(any(is.na(y))) { + logging::logerror( + paste0("AST transform is only valid for values between -1 and 1. ", + "Please select an appropriate normalization option or ", + "normalize your data prior to running.")) + stop() + } + return(y) } ######################## From 8d090e4c4b228f4d372ee4fc13e76b5c825cdc85 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Mon, 21 Mar 2022 17:24:17 -0400 Subject: [PATCH 45/54] Minor docs changes (#15) * fix: update scatter plot description Documentation indicates scatter plots are of normalized and transformed data, but they are not. * doc: update citation in vignette * docs: capitalize "maaslin2" in vignette Vignette titles and Index Entries should exactly match the name of the package so that vignette("packagename") works. * docs: replace AST with default parameter Documents previously used AST transform to align with iHMP. However, this is no longer needed, and having this set may confuse users. --- README.md | 7 +++---- vignettes/maaslin2.Rmd | 25 ++++++++++++++----------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index a6b6332..c43d915 100644 --- a/README.md +++ b/README.md @@ -147,7 +147,7 @@ MaAsLin2 generates two types of output files: data and visualization. * A plot is generated for each significant association. * Scatter plots are used for continuous metadata. * Box plots are for categorical data. - * Data points plotted are after normalization, filtering, and transform. + * Data points plotted are after filtering but prior to normalization and transform. ### Run a Demo ### @@ -162,7 +162,7 @@ the HMP2 data which can be downloaded from https://ibdmdb.org/ . #### Command line #### -``$ Maaslin2.R --transform=AST --fixed_effects="diagnosis,dysbiosisnonIBD,dysbiosisUC,dysbiosisCD,antibiotics,age" --random_effects="site,subject" --normalization=NONE --standardize=FALSE inst/extdata/HMP2_taxonomy.tsv inst/extdata/HMP2_metadata.tsv demo_output`` +``$ Maaslin2.R --fixed_effects="diagnosis,dysbiosisnonIBD,dysbiosisUC,dysbiosisCD,antibiotics,age" --random_effects="site,subject" --standardize=FALSE inst/extdata/HMP2_taxonomy.tsv inst/extdata/HMP2_metadata.tsv demo_output`` * Make sure to provide the full path to the MaAsLin2 executable (ie ./R/Maaslin2.R). * In the demo command: @@ -180,10 +180,9 @@ input_data <- system.file( input_metadata <-system.file( 'extdata','HMP2_metadata.tsv', package="Maaslin2") fit_data <- Maaslin2( - input_data, input_metadata, 'demo_output', transform = "AST", + input_data, input_metadata, 'demo_output' fixed_effects = c('diagnosis', 'dysbiosisnonIBD','dysbiosisUC','dysbiosisCD', 'antibiotics', 'age'), random_effects = c('site', 'subject'), - normalization = 'NONE', standardize = FALSE) ``` diff --git a/vignettes/maaslin2.Rmd b/vignettes/maaslin2.Rmd index 8475ad8..039b9d9 100644 --- a/vignettes/maaslin2.Rmd +++ b/vignettes/maaslin2.Rmd @@ -1,5 +1,5 @@ --- -title: "maaslin2" +title: "Maaslin2" date: "Friday, May 10, 2019" author: - name: Himel Mallick @@ -10,22 +10,26 @@ author: email: lauren.j.mciver@gmail.com output: html_document vignette: > - %\VignetteIndexEntry{MaAsLin2} + %\VignetteIndexEntry{Maaslin2} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- # MaAsLin2 User Manual # -MaAsLin2 is the next generation of MaAsLin. +MaAsLin2 is the next generation of MaAsLin (Microbiome Multivariable Association with Linear Models). -[MaAsLin2](http://huttenhower.sph.harvard.edu/maaslin2) is comprehensive R package for finding multivariable association between clinical metadata and microbial meta-omics features. MaAsLin2 relies on general linear models to accommodate most modern epidemiological study designs, including multiple analysis methods (with support for multiple covariates and repeated measures), filtering, normalization, and transform options to customize analysis for your specific study. +[MaAsLin2](http://huttenhower.sph.harvard.edu/maaslin2) is comprehensive R package for efficiently determining multivariable association between clinical metadata and microbial meta-omics features. MaAsLin2 relies on general linear models to accommodate most modern epidemiological study designs, including cross-sectional and longitudinal, along with a variety of filtering, normalization, and transform methods. If you use the MaAsLin2 software, please cite our manuscript: -Mallick et al. (2020+). "Multivariable Association in Population-scale Meta-omics Studies" (In Preparation). -If you have questions, please email the google group -[MaAsLin Users](https://groups.google.com/forum/#!forum/maaslin-users). +Mallick H, Rahnavard A, McIver LJ, Ma S, Zhang Y, Nguyen LH, Tickle TL, Weingart G, Ren B, Schwager EH, Chatterjee S, Thompson KN, Wilkinson JE, Subramanian A, Lu Y, Waldron L, Paulson JN, Franzosa EA, Bravo HC, Huttenhower C (2021). [Multivariable Association Discovery in Population-scale Meta-omics Studies](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1009442). PLoS Computational Biology, 17(11):e1009442. + +Check out the [MaAsLin 2 tutorial](https://github.com/biobakery/biobakery/wiki/maaslin2) for an overview of analysis options. + +If you have questions, please direct it to : +[MaAsLin2 Forum](https://forum.biobakery.org/c/Downstream-analysis-and-statistics/MaAsLin2) +[Google Groups](https://groups.google.com/forum/#!forum/maaslin-users) (Read only) -------------------------------------------- @@ -150,7 +154,7 @@ MaAsLin2 generates two types of output files: data and visualization. * A plot is generated for each significant association. * Scatter plots are used for continuous metadata. * Box plots are for categorical data. - * Data points plotted are after normalization, filtering, and transform. + * Data points plotted are after filtering but prior to normalization and transform. ### Run a Demo ### @@ -165,7 +169,7 @@ the HMP2 data which can be downloaded from https://ibdmdb.org/ . #### Command line #### -``$ Maaslin2.R --transform=AST --fixed_effects="diagnosis,dysbiosisnonIBD,dysbiosisUC,dysbiosisCD,antibiotics,age" --random_effects="site,subject" --normalization=NONE --standardize=FALSE inst/extdata/HMP2_taxonomy.tsv inst/extdata/HMP2_metadata.tsv demo_output`` +``$ Maaslin2.R --fixed_effects="diagnosis,dysbiosisnonIBD,dysbiosisUC,dysbiosisCD,antibiotics,age" --random_effects="site,subject" --standardize=FALSE inst/extdata/HMP2_taxonomy.tsv inst/extdata/HMP2_metadata.tsv demo_output`` * Make sure to provide the full path to the MaAsLin2 executable (ie ./R/Maaslin2.R). * In the demo command: @@ -183,11 +187,10 @@ input_data <- system.file( input_metadata <-system.file( 'extdata','HMP2_metadata.tsv', package="Maaslin2") fit_data <- Maaslin2( - input_data, input_metadata, 'demo_output', transform = "AST", + input_data, input_metadata, 'demo_output', fixed_effects = c('diagnosis', 'dysbiosisnonIBD','dysbiosisUC','dysbiosisCD', 'antibiotics', 'age'), random_effects = c('site', 'subject'), reference = "diagnosis,nonIBD", - normalization = 'NONE', standardize = FALSE) ``` From 983b475afb0de4c2825447c26ee8b06912212475 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Thu, 31 Mar 2022 12:14:06 -0400 Subject: [PATCH 46/54] Feat/save plots (#16) * fix: don't keep all model objects if they aren't saved to file Previously, a list of all model objects was kept in memory and returned as ouput even if the save_models argument was false, which takes up a lot of memory for a default behavior. * feat: return ggplot objects for scatter plots if desired from maaslin2_association_plots Null is returned if plots are not to be saved. Note that some plots were always stored as part of png generation to avoid issues with graphics devices, but only to a maximum number of max_pngs at a time. * feat: allow user to save scatterplots Save all scatterplot ggplot objects in an RData file in the figures folder. * fix: return randef as NULL instead of empty Previously, returned list either had or didn't have a ranef entry depending on if random effects were used. It's better to set this as NULL instead so the returned list is always the same style. * doc: document option to save scatterplots * fix: sort arguments correctly --- R/Maaslin2.R | 73 ++++++++++++++++++++++++++++++------------ R/fit.R | 18 ++++++++--- R/viz.R | 33 +++++++++++++++---- README.md | 9 ++++-- man/Maaslin2.Rd | 13 +++++--- vignettes/maaslin2.Rmd | 9 ++++-- 6 files changed, 116 insertions(+), 39 deletions(-) diff --git a/R/Maaslin2.R b/R/Maaslin2.R index d22e8cf..4c60e2c 100755 --- a/R/Maaslin2.R +++ b/R/Maaslin2.R @@ -94,7 +94,9 @@ args$plot_heatmap <- TRUE args$heatmap_first_n <- 50 args$plot_scatter <- TRUE args$max_pngs <- 10 +args$save_scatter <- FALSE args$cores <- 1 +args$save_models <- FALSE args$reference <- NULL ############################## @@ -117,7 +119,7 @@ options <- dest = "min_abundance", default = args$min_abundance, help = paste0("The minimum abundance for each feature", - " [ Default: %default ]" + "[ Default: %default ]" ) ) options <- @@ -129,7 +131,7 @@ options <- default = args$min_prevalence, help = paste0("The minimum percent of samples for which", "a feature is detected at minimum abundance", - " [ Default: %default ]" + "[ Default: %default ]" ) ) options <- @@ -141,7 +143,7 @@ options <- default = args$min_variance, help = paste0("Keep features with variances", "greater than value", - " [ Default: %default ]" + "[ Default: %default ]" ) ) options <- @@ -152,7 +154,7 @@ options <- dest = "max_significance", default = args$max_significance, help = paste0("The q-value threshold for significance", - " [ Default: %default ]" + "[ Default: %default ]" ) ) options <- @@ -164,7 +166,7 @@ options <- default = args$normalization, help = paste( "The normalization method to apply", - " [ Default: %default ] [ Choices:", + "[ Default: %default ] [ Choices:", toString(normalization_choices), "]" ) @@ -204,7 +206,7 @@ options <- default = args$random_effects, help = paste("The random effects for the model, ", "comma-delimited for multiple effects", - " [ Default: none ]" + "[ Default: none ]" ) ) options <- @@ -215,8 +217,8 @@ options <- dest = "fixed_effects", default = args$fixed_effects, help = paste("The fixed effects for the model,", - " comma-delimited for multiple effects", - " [ Default: all ]" + "comma-delimited for multiple effects", + "[ Default: all ]" ) ) options <- @@ -227,7 +229,7 @@ options <- dest = "correction", default = args$correction, help = paste("The correction method for computing", - " the q-value [ Default: %default ]" + "the q-value [ Default: %default ]" ) ) options <- @@ -238,7 +240,7 @@ options <- dest = "standardize", default = args$standardize, help = paste("Apply z-score so continuous metadata are on", - " the same scale [ Default: %default ]" + "the same scale [ Default: %default ]" ) ) options <- @@ -248,7 +250,7 @@ options <- type = "logical", dest = "plot_heatmap", default = args$plot_heatmap, - help = paste("Generate a heatmap for the significant ", + help = paste("Generate a heatmap for the significant", "associations [ Default: %default ]" ) ) @@ -259,7 +261,7 @@ options <- type = "double", dest = "heatmap_first_n", default = args$heatmap_first_n, - help = paste("In heatmap, plot top N features with significant ", + help = paste("In heatmap, plot top N features with significant", "associations [ Default: %default ]" ) ) @@ -271,7 +273,7 @@ options <- dest = "plot_scatter", default = args$plot_scatter, help = paste("Generate scatter plots for the significant", - " associations [ Default: %default ]" + "associations [ Default: %default ]" ) ) options <- @@ -282,7 +284,18 @@ options <- dest = "max_pngs", default = args$max_pngs, help = paste("The maximum number of scatterplots for signficant", - " associations to save as png files [ Default: %default ]" + "associations to save as png files [ Default: %default ]" + ) + ) +options <- + optparse::add_option( + options, + c("-O", "--save_scatter"), + type = "logical", + dest = "save_scatter", + default = args$save_scatter, + help = paste("Save all scatter plot ggplot objects", + "to an RData file [ Default: %default ]" ) ) options <- @@ -303,8 +316,8 @@ options <- type = "logical", dest = "save_models", default = args$save_models, - help = paste("Save all full model objects", - " as an RData file [ Default: %default ]" + help = paste("Return the full model outputs ", + "and save to an RData file [ Default: %default ]" ) ) options <- @@ -348,9 +361,10 @@ Maaslin2 <- standardize = TRUE, cores = 1, plot_heatmap = TRUE, + heatmap_first_n = 50, plot_scatter = TRUE, max_pngs = 10, - heatmap_first_n = 50, + save_scatter = FALSE, save_models = FALSE, reference = NULL) { @@ -545,6 +559,11 @@ Maaslin2 <- ) } } + # check that plots are generated if to be saved + if (!plot_scatter && save_scatter) { + logging::logerror("Scatter plots cannot be saved if they are not plotted") + stop("Option not valid", call. = FALSE) + } ############################################################### # Determine orientation of data in input and reorder to match # @@ -897,6 +916,7 @@ Maaslin2 <- formula = formula, random_effects_formula = random_effects_formula, correction = correction, + save_models = save_models, cores = cores ) @@ -1117,13 +1137,25 @@ Maaslin2 <- "to output folder: %s"), output ) - maaslin2_association_plots( + saved_plots <- maaslin2_association_plots( unfiltered_metadata, filtered_data, significant_results_file, output, figures_folder, - max_pngs) + max_pngs, + save_scatter) + if (save_scatter) { + scatter_file <- file.path(figures_folder, "scatter_plots.rds") + # remove plots file if already exists + if (file.exists(scatter_file)) { + logging::logwarn( + "Deleting existing scatter plot objects file: %s", scatter_file) + unlink(scatter_file) + } + logging::loginfo("Writing scatter plot objects to file %s", scatter_file) + saveRDS(saved_plots, file = scatter_file) + } } return(fit_data) @@ -1170,9 +1202,10 @@ if (identical(environment(), globalenv()) && current_args$standardize, current_args$cores, current_args$plot_heatmap, + current_args$heatmap_first_n, current_args$plot_scatter, current_args$max_pngs, - current_args$heatmap_first_n, + current_args$save_scatter, current_args$save_models, current_args$reference ) diff --git a/R/fit.R b/R/fit.R index 77caac8..4b83044 100644 --- a/R/fit.R +++ b/R/fit.R @@ -23,6 +23,7 @@ fit.data <- formula = NULL, random_effects_formula = NULL, correction = "BH", + save_models = FALSE, cores = 1) { # set the formula default to all fixed effects if not provided @@ -254,7 +255,11 @@ fit.data <- names(d)<-unlist(lapply(l, row.names)) output$ranef<-d } - output$fit <- fit + if (save_models) { + output$fit <- fit + } else { + output$fit <- NA + } } else { @@ -301,7 +306,12 @@ fit.data <- lapply(outputs, function(x) { return(x$fit) }) - names(fits) <- colnames(features) + names(fits) <- colnames(features) + + # Return NULL rather than empty object if fits aren't saved + if (all(is.na(fits))) { + fits <- NULL + } if (!(is.null(random_effects_formula))) { ranef <- @@ -357,9 +367,9 @@ fit.data <- rownames(paras)<-NULL if (!(is.null(random_effects_formula))) { - return(list("results" = paras, "residuals" = residuals, "fitted" = fitted, "fits" = fits, "ranef" = ranef)) + return(list("results" = paras, "residuals" = residuals, "fitted" = fitted, "ranef" = ranef, "fits" = fits)) } else { - return(list("results" = paras, "residuals" = residuals, "fitted" = fitted, "fits" = fits)) + return(list("results" = paras, "residuals" = residuals, "fitted" = fitted, "ranef" = NULL, "fits" = fits)) } } diff --git a/R/viz.R b/R/viz.R index 789cfc0..210675b 100644 --- a/R/viz.R +++ b/R/viz.R @@ -290,7 +290,8 @@ maaslin2_association_plots <- output_results, write_to = './', figures_folder = './figures/', - max_pngs = 10) + max_pngs = 10, + save_scatter = FALSE) { #MaAslin2 scatter plot function and theme @@ -352,9 +353,10 @@ maaslin2_association_plots <- metadata_labels <- unlist(metadata_types[!duplicated(metadata_types)]) metadata_number <- 1 + saved_plots <- list() for (label in metadata_labels) { - saved_plots <- vector('list', max_pngs) + saved_plots[[label]] <- list() # for file name replace any non alphanumeric with underscore plot_file <- paste( @@ -442,7 +444,7 @@ maaslin2_association_plots <- size = 2, fontface = "italic" ) - } else{ + } else { # if Metadata is categorical generate a boxplot ### check if the variable is categorical @@ -520,21 +522,38 @@ maaslin2_association_plots <- stdout <- capture.output(print(temp_plot), type = "message") if (length(stdout) > 0) logging::logdebug(stdout) - if (count < max_pngs + 1) - saved_plots[[count]] <- temp_plot + + # keep all plots if desired + # or only keep plots to be printed to png + if (save_scatter) { + saved_plots[[label]][[count]] <- temp_plot + } + else if (count <= max_pngs) { + saved_plots[[label]][[count]] <- temp_plot + } count <- count + 1 } dev.off() + # print the saved figures - for (plot_number in seq(1,max_pngs)) { + # this is done separately from pdf generation + # because nested graphics devices cause problems in rmarkdown output + for (plot_number in seq(1, max_pngs)) { png_file <- file.path(figures_folder, paste0( substr(basename(plot_file),1,nchar(basename(plot_file))-4), "_",plot_number,".png")) png(png_file, res = 300, width = 960, height = 960) - stdout <- capture.output(print(saved_plots[[plot_number]])) + stdout <- capture.output(print(saved_plots[[label]][[plot_number]])) dev.off() } + # give plots informative names + if (save_scatter) { + names(saved_plots[[label]]) <- make.names(output_df_all[data_index, 'feature'], unique = TRUE) + } else { + saved_plots[[label]] <- NULL # instead remove plots if only saved for png generation + } metadata_number <- metadata_number + 1 } + return(saved_plots) } diff --git a/README.md b/README.md index c43d915..d414618 100644 --- a/README.md +++ b/README.md @@ -261,15 +261,20 @@ Options: associations [ Default: TRUE ] -g MAX_PNGS, --max_pngs=MAX_PNGS - The maximum number of scatterplots for signficant associations + The maximum number of scatter plots for signficant associations to save as png files [ Default: 10 ] + + -O SAVE_SCATTER, --save_scatter=SAVE_SCATTER + Save all scatter plot ggplot objects + to an RData file [ Default: FALSE ] -e CORES, --cores=CORES The number of R processes to run in parallel [ Default: 1 ] -j SAVE_MODELS --save_models=SAVE_MODELS - Save all full model objects as an RData file [ Default: FALSE ] + Return the full model outputs and save to an RData file + [ Default: FALSE ] -d REFERENCE, --reference=REFERENCE The factor to use as a reference level for a categorical variable diff --git a/man/Maaslin2.Rd b/man/Maaslin2.Rd index 1856d0b..c0bbadc 100644 --- a/man/Maaslin2.Rd +++ b/man/Maaslin2.Rd @@ -30,9 +30,10 @@ Maaslin2( standardize = TRUE, cores = 1, plot_heatmap = TRUE, + heatmap_first_n = 50, plot_scatter = TRUE, max_pngs = 10, - heatmap_first_n = 50, + save_scatter = FALSE, save_models = FALSE, reference = NULL ) @@ -91,14 +92,18 @@ Maaslin2( Generate scatter plots for the significant associations. } \item{max_pngs}{ - Set the maximum number of scatterplots for signficant associations + Set the maximum number of scatter plots for signficant associations to save as png files. +} + \item{save_scatter}{ + Save all scatter plot ggplot objects + to an RData file. } \item{cores}{ The number of R processes to run in parallel. } \item{save_models}{ - Save the full model outputs to an RData file. + Return the full model outputs and save to an RData file. } \item{reference}{ The factor to use as a reference for a variable with more than two levels @@ -107,7 +112,7 @@ Maaslin2( } } \value{ - Data.frame containing the results from applying the model. + List containing the results from applying the model. } \author{ Himel Mallick,\cr diff --git a/vignettes/maaslin2.Rmd b/vignettes/maaslin2.Rmd index 039b9d9..a5d0347 100644 --- a/vignettes/maaslin2.Rmd +++ b/vignettes/maaslin2.Rmd @@ -269,15 +269,20 @@ Options: associations [ Default: TRUE ] -g MAX_PNGS, --max_pngs=MAX_PNGS - The maximum number of scatterplots for signficant associations + The maximum number of scatter plots for signficant associations to save as png files [ Default: 10 ] + + -O SAVE_SCATTER, --save_scatter=SAVE_SCATTER + Save all scatter plot ggplot objects + to an RData file [ Default: FALSE ] -e CORES, --cores=CORES The number of R processes to run in parallel [ Default: 1 ] -j SAVE_MODELS --save_models=SAVE_MODELS - Save all full model objects as an RData file [ Default: FALSE ] + Return the full model outputs and save to an RData file + [ Default: FALSE ] -d REFERENCE, --reference=REFERENCE The factor to use as a reference level for a categorical variable From 13dab5b6e679e43fd9aa8cfc879f7d17439072b1 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Thu, 31 Mar 2022 16:22:47 -0400 Subject: [PATCH 47/54] fix: fix png plot saving (#17) Had an index error if number of plots to be saved was less than max. --- R/viz.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R/viz.R b/R/viz.R index 210675b..79c4ed3 100644 --- a/R/viz.R +++ b/R/viz.R @@ -538,14 +538,14 @@ maaslin2_association_plots <- # print the saved figures # this is done separately from pdf generation # because nested graphics devices cause problems in rmarkdown output - for (plot_number in seq(1, max_pngs)) { - png_file <- file.path(figures_folder, + for (plot_number in seq(1, min((count-1), max_pngs))) { + png_file <- file.path(figures_folder, paste0( substr(basename(plot_file),1,nchar(basename(plot_file))-4), "_",plot_number,".png")) - png(png_file, res = 300, width = 960, height = 960) - stdout <- capture.output(print(saved_plots[[label]][[plot_number]])) - dev.off() + png(png_file, res = 300, width = 960, height = 960) + stdout <- capture.output(print(saved_plots[[label]][[plot_number]])) + dev.off() } # give plots informative names if (save_scatter) { From f64458ad0e2b983537e8f599b56139f8e788179f Mon Sep 17 00:00:00 2001 From: Andrew Ghazi Date: Wed, 8 Jun 2022 17:37:07 -0400 Subject: [PATCH 48/54] fix xtfrm warning message (#19) --- R/viz.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/viz.R b/R/viz.R index 79c4ed3..624b794 100644 --- a/R/viz.R +++ b/R/viz.R @@ -95,9 +95,9 @@ maaslin2_heatmap <- title_additional <- "" if (!is.na(first_n) & first_n > 0 & first_n < dim(df)[1]) { if (cell_value == 'coef') { - df <- df[order(-abs(df[cell_value])) , ] + df <- df[order(-abs(df[[cell_value]])) , ] } else{ - df <- df[order(df[cell_value]), ] + df <- df[order(df[[cell_value]]), ] } # get the top n features with significant associations df_sub <- df[1:first_n,] From 8e613b4b48d946fba6da8d0fc231d243e824db94 Mon Sep 17 00:00:00 2001 From: Andrew Ghazi Date: Wed, 22 Feb 2023 12:44:05 -0500 Subject: [PATCH 49/54] Handle input tibbles (#27) * fix xtfrm warning message * check if input files exist, check that input dfs have rownames, pass input dfs through as.data.frame() (in case it's a tibble) --- R/Maaslin2.R | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/R/Maaslin2.R b/R/Maaslin2.R index 4c60e2c..50f3c1d 100755 --- a/R/Maaslin2.R +++ b/R/Maaslin2.R @@ -381,7 +381,7 @@ Maaslin2 <- ################################################################# # if a character string then this is a file name, else it # is a data frame - if (is.character(input_data)) { + if (is.character(input_data) && file.exists(input_data)) { data <- data.frame(data.table::fread( input_data, header = TRUE, sep = "\t"), @@ -390,10 +390,16 @@ Maaslin2 <- # read again to get row name data <- read.table(input_data, header = TRUE, row.names = 1) } + } else if (is.data.frame(input_data)) { + if (!tibble::has_rownames(input_data)) { + stop("If supplying input_data as a data frame, it must have appropriate rownames!") + } + data <- as.data.frame(input_data) # in case it's a tibble or something } else { - data <- input_data + stop("input_data is neither a file nor a data frame!") } - if (is.character(input_metadata)) { + + if (is.character(input_metadata) && file.exists(input_metadata)) { metadata <- data.frame(data.table::fread( input_metadata, header = TRUE, sep = "\t"), @@ -403,10 +409,14 @@ Maaslin2 <- header = TRUE, row.names = 1) } + } else if (is.data.frame(input_metadata)) { + if (!tibble::has_rownames(input_metadata)) { + stop("If supplying input_metadata as a data frame, it must have appropriate rownames!") + } + metadata <- as.data.frame(input_metadata) # in case it's a tibble or something } else { - metadata <- input_metadata - } - + stop("input_metadata is neither a file nor a data frame!") + } # create an output folder and figures folder if it does not exist if (!file.exists(output)) { print("Creating output folder") From 2a27a748d34acb5e93f02929703037e5f15d27fb Mon Sep 17 00:00:00 2001 From: Andrew Ghazi Date: Tue, 4 Apr 2023 12:08:10 -0400 Subject: [PATCH 50/54] allow matrix input (#30) --- R/Maaslin2.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/R/Maaslin2.R b/R/Maaslin2.R index 50f3c1d..8e7298e 100755 --- a/R/Maaslin2.R +++ b/R/Maaslin2.R @@ -395,8 +395,11 @@ Maaslin2 <- stop("If supplying input_data as a data frame, it must have appropriate rownames!") } data <- as.data.frame(input_data) # in case it's a tibble or something + } else if (is.matrix(input_data)) { + logging::logwarn("Input is a matrix, passing through as.data.frame() .") + data <- as.data.frame(input_data) } else { - stop("input_data is neither a file nor a data frame!") + stop("input_data is neither a file nor a data frame!") } if (is.character(input_metadata) && file.exists(input_metadata)) { From 4bdd87d8c654ab8cf79b82e5d9e0dfff06ddfbd4 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Thu, 11 May 2023 17:19:36 -0400 Subject: [PATCH 51/54] Add feature name to model fit warnings (#31) Concatenate the name of the feature to the warning and add a line in the log file. --- R/fit.R | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/R/fit.R b/R/fit.R index 4b83044..f692f7d 100644 --- a/R/fit.R +++ b/R/fit.R @@ -232,7 +232,14 @@ fit.data <- formula, data = dat_sub, na.action = na.exclude) - }, error = function(err) { + }, warning = function(w) { + message(paste("Feature", colnames(features)[x], ":", w)) + logging::logwarn(paste( + "Fitting problem for feature", + x, + "a warning was issued")) + return(fit1) + }, error = function(err) { fit1 <- try({ model_function( From 4300163739b0827581dd50f6483cfad6cd038976 Mon Sep 17 00:00:00 2001 From: Kevin Bonham Date: Thu, 11 May 2023 17:37:29 -0400 Subject: [PATCH 52/54] Fix readme CL install instructions (#22) - File pointed to in the link is `master.zip`, rather than `Maaslin2-master.zip` (though the inflated directory looks like the later). - Since it's a `zip` file rather than `.tar.gz`, `tar` doesn't work (at least not on Ubuntu). I got: ``` tar xvzf master.zip gzip: stdin has more than one entry--rest ignored tar: Child returned status 2 tar: Error is not recoverable: exiting now ``` But `unzip` worked just fine. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d414618..9f7df26 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ If only running from the command line, you do not need to install the MaAsLin2 p 1. Download the source: [MaAsLin2.master.zip](https://github.com/biobakery/Maaslin2/archive/master.zip) 2. Decompress the download: - * ``$ tar xzvf Maaslin2-master.zip`` + * ``$ unzip master.zip`` 3. Install the Bioconductor dependencies edgeR and metagenomeSeq. 4. Install the CRAN dependencies: * ``$ R -q -e "install.packages(c('lmerTest','pbapply','car','dplyr','vegan','chemometrics','ggplot2','pheatmap','hash','logging','data.table','MuMIn','glmmTMB','MASS','cplm','pscl'), repos='http://cran.r-project.org')"`` From de59004ccb6d277a6d8eebb6d2d35fda0004f3f0 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Tue, 30 May 2023 17:00:45 -0400 Subject: [PATCH 53/54] Update tests (#36) * Update main.yml Replace file with official bioc test suite, generated via biocthis::use_bioc_github_action() * Remove unused code from main.yml covr, pkgdown, etc. were not run because of environment variables, but remove these code chunks anyways to make it easier to read. * Add tibble to DESCRIPTION * Remove github token requirement from workflow * Remove unused and uninstallable packages lpsymphony fails to install on OSX and isn't used. MuMIn is no longer avaliable and isn't used. * Update README.md * Update main.yml Remove explicit version numbering for bioc and R This requires switching to the devel build of bioc. * Update main.yml Change from devel to latest. * Update main.yml --- .github/workflows/main.yml | 195 ++++++++++++++++++++++++++++++++----- DESCRIPTION | 2 +- NAMESPACE | 1 - R/fit.R | 1 - README.md | 2 +- vignettes/maaslin2.Rmd | 2 +- 6 files changed, 171 insertions(+), 32 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a6a21fd..e3dd8a2 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,42 +1,183 @@ +## Read more about GitHub actions the features of this GitHub Actions workflow +## at https://lcolladotor.github.io/biocthis/articles/biocthis.html#use_bioc_github_action +## +## For more details, check the biocthis developer notes vignette at +## https://lcolladotor.github.io/biocthis/articles/biocthis_dev_notes.html +## +## You can add this workflow to other packages using: +## > biocthis::use_bioc_github_action() +## +## Using GitHub Actions exposes you to many details about how R packages are +## compiled and installed in several operating system.s +### If you need help, please follow the steps listed at +## https://github.com/r-lib/actions#where-to-find-help +## +## If you found an issue specific to biocthis's GHA workflow, please report it +## with the information that will make it easier for others to help you. +## Thank you! -name: build and test +## Acronyms: +## * GHA: GitHub Action +## * OS: operating system -# Controls when the action will run. Triggers the workflow on push or pull request -# events but only for the master branch on: push: - branches: [ master ] pull_request: - branches: [ master ] + +name: build and test + +env: + has_testthat: 'true' jobs: - build: - runs-on: ubuntu-latest - container: - image: biobakery/maaslin2:latest + build-check: + runs-on: ${{ matrix.config.os }} + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + container: ${{ matrix.config.cont }} + ## Environment variables unique to this job. + + strategy: + fail-fast: false + matrix: + config: + - { os: ubuntu-latest, cont: "bioconductor/bioconductor_docker", rspm: "https://packagemanager.rstudio.com/cran/__linux__/jammy/latest" } + - { os: macOS-latest } + - { os: windows-latest } + ## Check https://github.com/r-lib/actions/tree/master/examples + ## for examples using the http-user-agent + env: + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + RSPM: ${{ matrix.config.rspm }} + NOT_CRAN: true + TZ: UTC steps: - # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - - uses: actions/checkout@v2 + ## Most of these steps are the same as the ones in + ## https://github.com/r-lib/actions/blob/master/examples/check-standard.yaml + ## If they update their steps, we will also need to update ours. + - name: Checkout Repository + uses: actions/checkout@v3 + + ## R is already included in the Bioconductor docker images + - name: Setup R from r-lib + if: runner.os != 'Linux' + uses: r-lib/actions/setup-r@v2 + + ## pandoc is already included in the Bioconductor docker images + - name: Setup pandoc from r-lib + if: runner.os != 'Linux' + uses: r-lib/actions/setup-pandoc@v2 + + - name: Query dependencies + run: | + install.packages('remotes') + saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) + shell: Rscript {0} + + - name: Install Linux system dependencies + if: runner.os == 'Linux' + run: | + sysreqs=$(Rscript -e 'cat("apt-get update -y && apt-get install -y", paste(gsub("apt-get install -y ", "", remotes::system_requirements("ubuntu", "20.04")), collapse = " "))') + echo $sysreqs + sudo -s eval "$sysreqs" + + - name: Install macOS system dependencies + if: matrix.config.os == 'macOS-latest' + run: | + ## Enable installing XML from source if needed + brew install libxml2 + echo "XML_CONFIG=/usr/local/opt/libxml2/bin/xml2-config" >> $GITHUB_ENV + + ## Required to install magick as noted at + ## https://github.com/r-lib/usethis/commit/f1f1e0d10c1ebc75fd4c18fa7e2de4551fd9978f#diff-9bfee71065492f63457918efcd912cf2 + brew install imagemagick@6 + + ## For textshaping, required by ragg, and required by pkgdown + brew install harfbuzz fribidi + + ## For installing usethis's dependency gert + brew install libgit2 + + ## Required for tcltk + brew install xquartz --cask + + - name: Install Windows system dependencies + if: runner.os == 'Windows' + run: | + ## Edit below if you have any Windows system dependencies + shell: Rscript {0} + + - name: Install BiocManager + run: | + message(paste('****', Sys.time(), 'installing BiocManager ****')) + remotes::install_cran("BiocManager") + shell: Rscript {0} + + - name: Set BiocVersion + run: | + BiocManager::install(force = TRUE, ask = FALSE) + shell: Rscript {0} + + - name: Install dependencies pass 1 + run: | + ## Try installing the package dependencies in steps. First the local + ## dependencies, then any remaining dependencies to avoid the + ## issues described at + ## https://stat.ethz.ch/pipermail/bioc-devel/2020-April/016675.html + ## https://github.com/r-lib/remotes/issues/296 + ## Ideally, all dependencies should get installed in the first pass. + + ## For running the checks + message(paste('****', Sys.time(), 'installing rcmdcheck and BiocCheck ****')) + install.packages(c("rcmdcheck", "BiocCheck"), repos = BiocManager::repositories()) - - name: Install build and test dependencies from dist - run: apt-get update -y && apt-get install -y libssl-dev libxml2-dev libcurl4-openssl-dev texlive pandoc + ## Pass #1 at installing dependencies + message(paste('****', Sys.time(), 'pass number 1 at installing dependencies: local dependencies ****')) + remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = FALSE, upgrade = TRUE) + continue-on-error: true + shell: Rscript {0} - - name: install build and test dependencies from bioconductor - run: R -q -e "BiocManager::install('BiocCheck')" + - name: Install dependencies pass 2 + run: | + ## Pass #2 at installing dependencies + message(paste('****', Sys.time(), 'pass number 2 at installing dependencies: any remaining dependencies ****')) + remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = TRUE, upgrade = TRUE, force = TRUE) + shell: Rscript {0} - - name: Install build and test dependencies from CRAN - run: R -q -e "install.packages(c('knitr','glmmTMB','rmarkdown','markdown'), repos='http://cran.r-project.org')" + - name: Session info + run: | + options(width = 100) + pkgs <- installed.packages()[, "Package"] + sessioninfo::session_info(pkgs, include_base = TRUE) + shell: Rscript {0} - - name: Build - run: cd $GITHUB_WORKSPACE && R CMD build . + - name: Run CMD check + env: + _R_CHECK_CRAN_INCOMING_: false + DISPLAY: 99.0 + run: | + options(crayon.enabled = TRUE) + rcmdcheck::rcmdcheck( + args = c("--no-manual", "--no-vignettes", "--timings"), + build_args = c("--no-manual", "--keep-empty-dirs", "--no-resave-data"), + error_on = "warning", + check_dir = "check" + ) + shell: Rscript {0} - - name: Install maaslin2 - run: VERSION=$(grep Version DESCRIPTION | cut -d" " -f2) && R CMD INSTALL "Maaslin2_${VERSION}.tar.gz" + ## Might need an to add this to the if: && runner.os == 'Linux' + - name: Reveal testthat details + if: env.has_testthat == 'true' + run: find . -name testthat.Rout -exec cat '{}' ';' - - name: R test - run: VERSION=$(grep Version DESCRIPTION | cut -d" " -f2) && R CMD check "Maaslin2_${VERSION}.tar.gz" - - - name: Bioc test - run: VERSION=$(grep Version DESCRIPTION | cut -d" " -f2) && R CMD BiocCheck "Maaslin2_${VERSION}.tar.gz" - + - name: Run BiocCheck + env: + DISPLAY: 99.0 + run: | + BiocCheck::BiocCheck( + dir('check', 'tar.gz$', full.names = TRUE), + `quit-with-status` = TRUE, + `no-check-R-ver` = TRUE, + `no-check-bioc-help` = TRUE + ) + shell: Rscript {0} diff --git a/DESCRIPTION b/DESCRIPTION index 0021157..b78d27d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -11,7 +11,7 @@ Depends: R (>= 3.6) Description: MaAsLin2 is comprehensive R package for efficiently determining multivariable association between clinical metadata and microbial meta'omic features. MaAsLin2 relies on general linear models to accommodate most modern epidemiological study designs, including cross-sectional and longitudinal, and offers a variety of data exploration, normalization, and transformation methods. MaAsLin2 is the next generation of MaAsLin. License: MIT + file LICENSE LazyData: false -Imports: robustbase, biglm, pcaPP, edgeR, metagenomeSeq, lpsymphony, pbapply, car, dplyr, vegan, chemometrics, ggplot2, pheatmap, logging, data.table, lmerTest, hash, optparse, grDevices, stats, utils, glmmTMB, MASS, cplm, pscl, lme4 +Imports: robustbase, biglm, pcaPP, edgeR, metagenomeSeq, pbapply, car, dplyr, vegan, chemometrics, ggplot2, pheatmap, logging, data.table, lmerTest, hash, optparse, grDevices, stats, utils, glmmTMB, MASS, cplm, pscl, lme4, tibble Suggests: knitr, testthat (>= 2.1.0), diff --git a/NAMESPACE b/NAMESPACE index 5ac36da..60ec1ae 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,7 +3,6 @@ export(Maaslin2) import("robustbase") import("biglm") import("pcaPP") -import("lpsymphony") import("optparse") importFrom("grDevices", "colorRampPalette", "dev.off", "pdf", "jpeg","png") importFrom("stats", "as.formula", "coef", "glm", "lm", "na.exclude", diff --git a/R/fit.R b/R/fit.R index f692f7d..a17c235 100644 --- a/R/fit.R +++ b/R/fit.R @@ -5,7 +5,6 @@ for (lib in c( 'lmerTest', 'car', 'parallel', - 'MuMIn', 'glmmTMB', 'MASS', 'cplm', diff --git a/README.md b/README.md index 9f7df26..d151821 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ If only running from the command line, you do not need to install the MaAsLin2 p * ``$ unzip master.zip`` 3. Install the Bioconductor dependencies edgeR and metagenomeSeq. 4. Install the CRAN dependencies: - * ``$ R -q -e "install.packages(c('lmerTest','pbapply','car','dplyr','vegan','chemometrics','ggplot2','pheatmap','hash','logging','data.table','MuMIn','glmmTMB','MASS','cplm','pscl'), repos='http://cran.r-project.org')"`` + * ``$ R -q -e "install.packages(c('lmerTest','pbapply','car','dplyr','vegan','chemometrics','ggplot2','pheatmap','hash','logging','data.table','glmmTMB','MASS','cplm','pscl'), repos='http://cran.r-project.org')"`` 5. Install the MaAsLin2 package (only r,equired if running as an R function): * ``$ R CMD INSTALL maaslin2`` diff --git a/vignettes/maaslin2.Rmd b/vignettes/maaslin2.Rmd index a5d0347..768bb5b 100644 --- a/vignettes/maaslin2.Rmd +++ b/vignettes/maaslin2.Rmd @@ -70,7 +70,7 @@ the MaAsLin2 dependencies. * ``$ tar xzvf maaslin2.tar.gz`` 3. Install the Bioconductor dependencies edgeR and metagenomeSeq. 4. Install the CRAN dependencies: - * ``$ R -q -e "install.packages(c('lmerTest','pbapply','car','dplyr','vegan','chemometrics','ggplot2','pheatmap','hash','logging','data.table','MuMIn','glmmTMB','MASS','cplm','pscl'), repos='http://cran.r-project.org')"`` + * ``$ R -q -e "install.packages(c('lmerTest','pbapply','car','dplyr','vegan','chemometrics','ggplot2','pheatmap','hash','logging','data.table','glmmTMB','MASS','cplm','pscl'), repos='http://cran.r-project.org')"`` 5. Install the MaAsLin2 package (only r,equired if running as an R function): * ``$ R CMD INSTALL maaslin2`` From 9899df4e94c8cba0c06af8cacb2076ef08fb67a8 Mon Sep 17 00:00:00 2001 From: Thomas Kuntz Date: Mon, 5 Jun 2023 16:26:05 -0400 Subject: [PATCH 54/54] Remove RSPM from linux build test --- .github/workflows/main.yml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index e3dd8a2..8f4b5e8 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -40,14 +40,13 @@ jobs: fail-fast: false matrix: config: - - { os: ubuntu-latest, cont: "bioconductor/bioconductor_docker", rspm: "https://packagemanager.rstudio.com/cran/__linux__/jammy/latest" } + - { os: ubuntu-latest, cont: "bioconductor/bioconductor_docker" } - { os: macOS-latest } - { os: windows-latest } ## Check https://github.com/r-lib/actions/tree/master/examples ## for examples using the http-user-agent env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - RSPM: ${{ matrix.config.rspm }} NOT_CRAN: true TZ: UTC @@ -77,9 +76,7 @@ jobs: - name: Install Linux system dependencies if: runner.os == 'Linux' run: | - sysreqs=$(Rscript -e 'cat("apt-get update -y && apt-get install -y", paste(gsub("apt-get install -y ", "", remotes::system_requirements("ubuntu", "20.04")), collapse = " "))') - echo $sysreqs - sudo -s eval "$sysreqs" + apt-get update -y && apt-get install -y make cmake libicu-dev zlib1g-dev pandoc - name: Install macOS system dependencies if: matrix.config.os == 'macOS-latest'