diff --git a/.gitignore b/.gitignore index 60e12a82..0722530e 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ tests/testthat/Rplots.pdf inst/develop_file* inst/doc +Rplots.pdf diff --git a/DESCRIPTION b/DESCRIPTION index 977eb369..d1376efa 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: cevomod Title: Cancer Evolution Models -Version: 2.2.0 +Version: 2.3.0 Authors@R: person("Paweł", "Kuś", , "kpawel2210@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-4367-9821")) @@ -31,7 +31,8 @@ Suggests: tidyverse, vdiffr, readthis, - rsample + rsample, + processx Config/testthat/edition: 3 VignetteBuilder: knitr Imports: diff --git a/NAMESPACE b/NAMESPACE index fcf2e33c..2284841c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -91,6 +91,7 @@ export(add_to_cevodata) export(annotate_mutation_contexts) export(annotate_normal_cn) export(as_cevo_snvs) +export(build_clip_container) export(calc_Mf_1f) export(calc_SFS) export(calc_cumulative_tails) @@ -115,6 +116,7 @@ export(fit_powerlaw_tail_fixed) export(fit_powerlaw_tail_optim) export(fit_subclones) export(fit_subclones_bmix) +export(fit_subclones_clip) export(fit_subclones_mclust) export(fix_powerlaw_N_mutations) export(generate_neutral_snvs) @@ -124,6 +126,7 @@ export(get_Mf_1f) export(get_SFS) export(get_SNVs_wider) export(get_cevomod_verbosity) +export(get_containers_dir) export(get_frequency_measure_name) export(get_model_names) export(get_models) @@ -174,11 +177,13 @@ export(scale_fill_cevomod) export(scale_fill_pnw) export(set_cancer_type) export(set_cevomod_verbosity) +export(set_containers_dir) export(shuffle) export(split_by) export(stat_cumulative_tail) export(theme_ellie) export(to_clip) +export(unite_mutation_id) export(use_purity) export(variant_classification_filter) import(dplyr) diff --git a/NEWS.md b/NEWS.md index ae57c0f6..dbdd3e8b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,7 @@ +## cevomod 2.3.0 +* subclones can be fitted using [CliP](https://github.com/wwylab/CliP) [(Jiang et al., 2021)](https://www.biorxiv.org/content/10.1101/2021.03.31.437383v1) (`fit_subclones(method = "CliP")`). This option requires that the Apptainer is installed. CliP container image can be build with `build_clip_container()` + ## cevomod 2.2.0 * fit_powerlaw_tail_fixed() has a bootstrap option diff --git a/R/cevo_snvs.R b/R/cevo_snvs.R index b16cb98c..2d27556e 100644 --- a/R/cevo_snvs.R +++ b/R/cevo_snvs.R @@ -103,8 +103,13 @@ count_mutation_types <- function(snvs) { } -unite_mutation_id <- function(snvs) { - unite(snvs, "mutation_id", "chrom":"alt", sep = "-") +#' Unite many columns to create mutation_id column +#' @param snvs SNVs +#' @param sep Separator +#' @param remove Remove united columns? +#' @export +unite_mutation_id <- function(snvs, sep = "-", remove = TRUE) { + unite(snvs, "mutation_id", "chrom":"alt", sep = sep, remove = remove) } diff --git a/R/cevodata-export.R b/R/cevodata-export.R index 53ad0937..76901aed 100644 --- a/R/cevodata-export.R +++ b/R/cevodata-export.R @@ -120,17 +120,24 @@ save_clip_files <- function(clip_data, out_dir) { if (!dir.exists(out_dir)) { dir.create(out_dir) } - iwalk( + + files <- imap( clip_data, function(x, sample_id) { - write_tsv(x$snvs, file.path(out_dir, str_c(sample_id, ".snv.tsv"))) - write_tsv(x$cnvs, file.path(out_dir, str_c(sample_id, ".cnv.tsv"))) - write_file( - as.character(x$purities), - file.path(out_dir, str_c(sample_id, ".purity.tsv")) - ) + # sample_id2 <- str_replace(sample_id, " ", "_") + snvs_file <- file.path(out_dir, str_c(sample_id, ".snv.tsv")) + cnvs_file <- file.path(out_dir, str_c(sample_id, ".cnv.tsv")) + purity_file <- file.path(out_dir, str_c(sample_id, ".purity.tsv")) + + write_tsv(x$snvs, snvs_file) + write_tsv(x$cnvs, cnvs_file) + write_file(as.character(x$purities), purity_file) + + tibble(sample_id, snvs_file, cnvs_file, purity_file) } ) + + invisible(bind_rows(files)) } diff --git a/R/cevomod-persistent.R b/R/cevomod-persistent.R new file mode 100644 index 00000000..7db4d0ed --- /dev/null +++ b/R/cevomod-persistent.R @@ -0,0 +1,37 @@ + +settings_dir <- tools::R_user_dir("cevomod", which = "config") +settings_file <- file.path(settings_dir, "settings.rds") + +default_settings <- list( + containers_dir = NULL +) + + +get_settings <- function() { + if (file.exists(settings_file)) { + readRDS(settings_file) + } else { + default_settings + } +} + + +#' Get/Set the containers directory +#' @param dir Path for containers +#' @export +set_containers_dir <- function(dir) { + settings <- get_settings() + settings$containers_dir <- dir + + if(!dir.exists(settings_dir)) { + dir.create(settings_dir) + } + write_rds(settings, settings_file) +} + + +#' @rdname set_containers_dir +#' @export +get_containers_dir <- function() { + get_settings()$containers_dir +} diff --git a/R/containers.R b/R/containers.R new file mode 100644 index 00000000..db5d815e --- /dev/null +++ b/R/containers.R @@ -0,0 +1,43 @@ +#' Build the CliP Apptainer container +#' +#' CliP.sif is saved to +#' - out_dir, if provided +#' - if out_dir is NULL but the containers_dir was set using the [set_containers_dir()], +#' image will be saved to the set containers_dir. +#' - if out_dir is NULL and the containers_dir was not set, image will +#' be saved to the current working dir. +#' @param out_dir Path +#' @param force Force build in image exists +#' @export +build_clip_container <- function(out_dir = NULL, force = FALSE) { + if (!is_apptainer_installed()) { + stop("Apptainer needs to be installed to build the containers") + } + + if (is.null(out_dir) & !is.null(get_containers_dir())) { + out_dir <- get_containers_dir() + } else { + out_dir <- "." + } + + sif_file <- file.path(out_dir, "CliP.sif") |> + str_replace("//", "/") + + if (!file.exists(sif_file) | force) { + def_file <- system.file("CliP.def", package = "cevomod") + command <- str_c("apptainer build", sif_file, def_file, sep = " ") + system(command) + } else { + msg(sif_file, " exists. Set force = TRUE to overwrite it") + } +} + + +is_apptainer_installed <- function() { + rlang::check_installed("processx", reason = "to interact with system") + x <- processx::run("apptainer", args = "--version") + x$status == 0 +} + + + diff --git a/R/fit_powerlaw_tail_fixed.R b/R/fit_powerlaw_tail_fixed.R index 8c261ea7..13761aa9 100644 --- a/R/fit_powerlaw_tail_fixed.R +++ b/R/fit_powerlaw_tail_fixed.R @@ -22,6 +22,7 @@ #' #' @param object SNVs tibble object #' @param rsq_treshold R-squared tresholds to keep model as neutral +#' @param lm_length length of the linear model fits #' @param name name in the models' slot #' @param verbose verbose? #' @param ... other arguments @@ -56,6 +57,7 @@ fit_powerlaw_tail_fixed <- function(object, ...) { #' @export fit_powerlaw_tail_fixed.cevodata <- function(object, rsq_treshold = 0.98, + lm_length = 0.05, name = "powerlaw_fixed", pct_left = 0.05, pct_right = 0.95, verbose = get_cevomod_verbosity(), @@ -78,7 +80,7 @@ fit_powerlaw_tail_fixed.cevodata <- function(object, reframe( model = "powerlaw_fixed", component = "Neutral tail", - fit_optimal_lm(.data$data, rsq_treshold, pb) + fit_optimal_lm(.data$data, rsq_treshold, lm_length = lm_length, pb) ) class(models) <- c("cevo_powerlaw_models", class(models)) @@ -89,7 +91,7 @@ fit_powerlaw_tail_fixed.cevodata <- function(object, } -fit_optimal_lm <- function(data, rsq_treshold = 0.98, pb = NULL) { +fit_optimal_lm <- function(data, rsq_treshold = 0.98, lm_length = 0.05, pb = NULL) { min_val <- min(data$f) max_val <- max(data$f) grid <- expand_grid( @@ -98,7 +100,7 @@ fit_optimal_lm <- function(data, rsq_treshold = 0.98, pb = NULL) { ) |> mutate(length = .data$to - .data$from) - desired_length <- if ((max_val - min_val) >= 0.05) 0.05 else max(grid$length) + desired_length <- if ((max_val - min_val) >= lm_length) lm_length else max(grid$length) grid <- grid |> filter(near(.data$length, desired_length)) diff --git a/R/fit_powerlaw_tail_optim.R b/R/fit_powerlaw_tail_optim.R index 9731fa64..098ec1e4 100644 --- a/R/fit_powerlaw_tail_optim.R +++ b/R/fit_powerlaw_tail_optim.R @@ -203,6 +203,8 @@ fit_powerlaw_tail_optim.cevo_SFS_tbl <- function(object, data <- sfs |> left_join(bounds, by = "sample_id") |> filter(.data$f > .data$lower_bound, .data$f < .data$higher_bound) |> + mutate(length = n(), .by = "sample_id") |> + filter(length >= 3) |> select("sample_id", "f", "y") |> mutate( y = .data$y |> diff --git a/R/fit_subclones.R b/R/fit_subclones.R index 6b30583a..348871f2 100644 --- a/R/fit_subclones.R +++ b/R/fit_subclones.R @@ -3,29 +3,58 @@ #' Fit clonal and subclonal components of the model to the residuals of the #' power-law model #' -#' @param object object -#' @param N numbers of clones to for models -#' @param powerlaw_model_name residual of which powerlaw model to use? +#' @param object cevodata object +#' @param N Vector of numbers of clones to for models +#' @param powerlaw_model_name Residual of which powerlaw model to use? #' powerlaw_fixed/powerlaw_optim -#' @param snvs_name which snvs to to use? -#' @param method clustering method to use: BMix and mclust are currently supported. -#' While mclust is a 3-4 times faster method, the BMix method is more accurate -#' and usually fast enough. +#' @param snvs_name Which snvs to to use? +#' @param cnvs_name Which cnvs to to use? +#' @param method Clustering method to use. Currently supported methods: +#' - mclust - the fastest method, approximately 3-4 times faster than BMix, +#' but uses a gaussian mixture modelling +#' - BMix - is more accurate, considers subclones as binomial clusters, +#' slightly slower +#' - CliP - Clonal structure identification through penalizing pairwise +#' differences #' @param upper_f_limit ignore variants with f higher than -#' @param verbose verbose? +#' @param verbose Verbose? #' @name fit_subclones NULL -#' @rdname fit_subclones +#' @describeIn fit_subclones Provides a common interface for all other methods, +#' runs the selected method and passes all the required arguments down. +#' @examples +#' \dontrun{ +#' # Using BMix +#' fit_subclones(test_data_fitted) +#' # or +#' fit_subclones_bmix(test_data_fitted) +#' +#' # Using mclust +#' fit_subclones(test_data_fitted, method = "mclust") +#' # or +#' fit_subclones_mclust(test_data_fitted) +#' +#' # Using CliP +#' set_containers_dir(selected_dir) +#' build_clip_container() +#' fit_subclones(test_data_fitted, method = "CliP") +#' # or +#' fit_subclones_clip(test_data_fitted) +#' } #' @export fit_subclones <- function(object, N = 1:3, powerlaw_model_name = active_models(object), snvs_name = default_SNVs(object), + cnvs_name = default_CNVs(object), method = "BMix", upper_f_limit = 0.75, + clip_sif = NULL, + clip_input = file.path(tempdir(), "clip_input"), + clip_output = file.path(tempdir(), "clip_output"), verbose = get_cevomod_verbosity()) { if (method == "BMix") { object <- object |> @@ -33,11 +62,13 @@ fit_subclones <- function(object, } else if (method == "mclust") { object <- object |> fit_subclones_mclust(N, powerlaw_model_name, snvs_name, upper_f_limit, verbose) + } else if (method == "CliP") { + object <- object |> + fit_subclones_clip(powerlaw_model_name, snvs_name, cnvs_name, upper_f_limit, verbose = verbose) } else { - stop("Currently supported methods are: BMix and mclust") + stop("Currently supported methods are: BMix, CliP, and mclust") } - object } diff --git a/R/fit_subclones_binomial_CliP.R b/R/fit_subclones_binomial_CliP.R new file mode 100644 index 00000000..197c0e04 --- /dev/null +++ b/R/fit_subclones_binomial_CliP.R @@ -0,0 +1,152 @@ + +#' @describeIn fit_subclones Fit subclonal distributions to neutral model +#' residuals using CliP - Clonal structure identification through penalizing +#' pairwise differences [(github)](https://github.com/wwylab/CliP). Requires +#' that the Apptainer installed and +#' - the path to the CliP.sif container image is provided, +#' - or CliP.sif exists in the containers_dir ([set_containers_dir()]) +#' - or CliP.sif exists in the current working directory. +#' The container should be build with [build_clip_container()]. +#' @param clip_sif Apptainer image file with CliP. If NULL, cevomod seeks for +#' the CliP.sif file in the containers_dir (if set before) and in the current +#' working directory. The container should be build with [build_clip_container()]. +#' @param clip_input Path to store the CliP input files +#' @param clip_output Path to store the CliP output files +#' @export +fit_subclones_clip <- function(object, + powerlaw_model_name = active_models(object), + snvs_name = default_SNVs(object), + cnvs_name = default_CNVs(object), + upper_f_limit = 0.75, + clip_sif = NULL, + clip_input = file.path(tempdir(), "clip_input"), + clip_output = file.path(tempdir(), "clip_output"), + verbose = get_cevomod_verbosity()) { + msg("Fitting binomial models using CllP", verbose = verbose) + rlang::check_installed("readthis", reason = "to read CliP results") + if (!is_apptainer_installed()) { + stop("Apptainer needs to be installed to run fit subclones using CliP") + } + + if (is.null(clip_sif)) { + clip_sif <- find_clip_container() + } + + powerlaw_models <- get_powerlaw_models(object, powerlaw_model_name) + residuals <- get_residuals(object, models_name = powerlaw_model_name) |> + filter(.data$f >= 0) + sequencing_depth <- SNVs(object, which = snvs_name) |> + get_local_sequencing_depths() |> + transmute(.data$sample_id, .data$f, sequencing_DP = .data$median_DP) + + non_neutral_tail_mut_counts <- residuals |> + mutate(n = if_else(.data$f > upper_f_limit, 0, round(.data$powerlaw_resid_clones))) |> + select("sample_id", "f_interval", "n") + snvs_to_cluster <- SNVs(object, which = snvs_name) |> + nest_by(.data$sample_id, .data$f_interval) |> + inner_join(non_neutral_tail_mut_counts, by = c("sample_id", "f_interval")) |> + reframe( + slice_sample(.data$data, n = .data$n) + ) + + clip_files <- object |> + add_SNV_data(snvs_to_cluster, "snvs_to_cluster") |> + to_clip(out_dir = clip_input, snvs_name = "snvs_to_cluster", cnvs_name = cnvs_name) + + pmap( + clip_files, + run_clip, + clip_sif = clip_sif, + out_dir = clip_output, + verbose = verbose > 0, + .progress = verbose > 0 + ) + + clip_res <- readthis::read_clip_best_lambda(clip_output) + + models <- clip_res$subclonal_structure |> + transmute( + .data$sample_id, + model = "subclones CliP", + component = if_else(.data$cluster_index == 0, "Clone", str_c("Subclone ", .data$cluster_index)), + N_mutations = .data$num_SNV, + cellularity = .data$cellular_prevalence/2, + f = round(.data$cellularity, digits = 2), + .data$lambda, + best = .data$best_lambda + ) |> + left_join(sequencing_depth, by = c("sample_id", "f")) |> + select(-"f") + + best_models <- models |> + filter(.data$best) |> + nest_by(.data$sample_id, .key = "clones") + + clonal_predictions <- residuals |> + select("sample_id", "f_interval", "f") |> + nest_by(.data$sample_id, .key = "intervals") |> + inner_join(best_models, by = "sample_id") |> + reframe(get_binomial_predictions(.data$clones, .data$intervals)) |> + select(-"f") + + residuals <- residuals |> + select(-"model_resid") |> + left_join(clonal_predictions, by = c("sample_id", "f_interval")) |> + mutate( + model_pred = .data$powerlaw_pred + .data$binom_pred, + model_resid = .data$SFS - .data$model_pred + ) + + models <- powerlaw_models |> + bind_rows(models) |> + arrange(.data$sample_id, .data$best, .data$model) + + models_name <- paste0(powerlaw_model_name, "_subclones") + resid_name <- paste0("residuals_", models_name) + object$models[[models_name]] <- models + object$misc[[resid_name]] <- residuals + object$active_models <- models_name + object +} + + + +find_clip_container <- function() { + containers_dir <- get_containers_dir() + clip_sif <- if (!is.null(containers_dir)) { + file.path(containers_dir, "CliP.sif") |> + str_replace("//", "/") + } + if (file.exists(clip_sif)) { + return(clip_sif) + } + + clip_sif <- "CliP.sif" + if (file.exists(clip_sif)) { + return(clip_sif) + } + + stop("Provide clip_sif path or use build_clip_container() to build it.") +} + + + +run_clip <- function(sample_id, snvs_file, cnvs_file, purity_file, out_dir, + clip_sif = NULL, + clip_args = c("-O", out_dir, "--sample_id", sample_id), + verbose = TRUE) { + rlang::check_installed("processx", reason = "to run CliP") + if (!is_apptainer_installed()) { + stop("Apptainer needs to be installed to run fit subclones using CliP") + } + + args <- c( + "exec", clip_sif, + "python", "/opt/CliP/run_clip_main.py", + snvs_file, cnvs_file, purity_file, clip_args + ) + out <- processx::run("apptainer", args, echo = verbose, echo_cmd = verbose) + + invisible(out) +} + diff --git a/R/plot_CNV.R b/R/plot_CNV.R index 2a7af07a..0e1ce7b7 100644 --- a/R/plot_CNV.R +++ b/R/plot_CNV.R @@ -101,11 +101,11 @@ heatmap_granges <- function(granges, meta_field, select("score", "cum") %>% deframe() - if (verbose) { - print("Proportion of ranges present in N GRanges:") + msg("Kept sites present in ", keep_sites_present_in, "/", length(granges), " GRanges", verbose = verbose) + msg("Kept ", range_fractions[as.character(keep_sites_present_in)], " of all sites", verbose = verbose) + if (verbose_down(verbose)) { + msg("Proportion of ranges present in N GRanges:", verbose = TRUE) print(range_fractions) - print(sprintf("Kept sites present in %d/%d GRanges", keep_sites_present_in, length(granges))) - print(sprintf("Kept %f of all sites", range_fractions[as.character(keep_sites_present_in)])) } common_ranges <- ranges_cov %>% diff --git a/R/plot_mutations.R b/R/plot_mutations.R index d9f374ed..3a361cff 100644 --- a/R/plot_mutations.R +++ b/R/plot_mutations.R @@ -108,20 +108,23 @@ filter_SNVs <- function(dt, genes = NULL, drivers = NULL) { #' @describeIn mutation_plots Adds mutations to SFS plots +#' @param mapping aes() #' @param color color #' @param size size #' @param show_labels `lgl` use ggrepel to label the mutations? #' @param ... other arguments passed to geom_point() #' @export layer_mutations <- function(object, + mapping = NULL, genes = NULL, drivers = NULL, show_labels = TRUE, color = "black", size = 3, shape = "impact", filter_fun = guess_filter_fun(shape), ...) { dt <- SNVs(object) + default_mapping <- aes(shape = .data[[shape]]) list( scale_shape_manual(values = c(2, 4, 3, 5)), geom_point( - aes(x = .data$VAF, shape = .data[[shape]]), + mapping = mapping, data = dt %>% filter_SNVs(genes, drivers) %>% filter_fun(), @@ -133,8 +136,9 @@ layer_mutations <- function(object, ), if (show_labels) { rlang::check_installed("ggrepel", reason = "to label driver mutations on the plot") + default_mapping <- aes(shape = .data[[shape]], label = .data$gene_symbol) ggrepel::geom_label_repel( - aes(x = .data$VAF, shape = .data[[shape]], label = .data$gene_symbol), + mapping = join_aes(default_mapping, mapping), data = dt %>% filter_SNVs(genes, drivers) %>% filter_fun(), diff --git a/R/stat_SFS.R b/R/stat_SFS.R index 0769b89b..53ea6980 100644 --- a/R/stat_SFS.R +++ b/R/stat_SFS.R @@ -17,7 +17,7 @@ #' test_data |> #' calc_SFS() |> #' plot_SFS() + -#' layer_mutations(test_data, drivers = "BRCA") +#' layer_mutations(test_data, mapping = ggplot2::aes(x = VAF), drivers = "BRCA") #' @name sfs NULL diff --git a/README.md b/README.md index ef3fe93b..0b076475 100644 --- a/README.md +++ b/README.md @@ -10,11 +10,23 @@ *cevomod* is a package that implements methods for analyzing cancer evolution from the Next Generation Sequencing data. -The modeling approach implemented in *cevomod* was inspired by the [MOBSTER](https://caravagnalab.github.io/mobster/index.html) package, which models the distribution of Variant Allele Frequencies in the sample with a mixture of power-law-shaped and binomial distributions. However, MOBSTER fails to recognize the power-law component (the so-called neutral tail) in Whole Exome Sequencing data or data with insufficient sequencing coverage. cevomod implements methods that can fit the model to the data with significant loss of neutral tail variants. - +The modeling approach implemented in *cevomod* was inspired by the [MOBSTER](https://caravagnalab.github.io/mobster/index.html) package ([Caravagna et al., 2020](https://www.nature.com/articles/s41588-020-0675-5)), which models the distribution of Variant Allele Frequencies in the sample with a mixture of power-law-shaped and binomial distributions. However, MOBSTER fails to recognize the power-law component (the so-called neutral tail) in Whole Exome Sequencing data or data with insufficient sequencing coverage. cevomod implements methods that can fit the model to the data with significant loss of neutral tail variants. ![](man/figures/introduction_figure.svg) +In *cevomod*, the power-law and binomial components of the model are fitted sequentially. The power-law components are fitted first, and the binomial components are fitted to the residuals of the power-law components. There are two types of the power-law models: + +- with the power-law exponents equal to 2, as expected in the exponentially growing populations according to ([Durrett, 2013](https://pubmed.ncbi.nlm.nih.gov/23471293/)) and ([Williams et al., 2016](https://www.nature.com/articles/ng.3489)), +- and with the power-law exponent that best fits the data. + +There are also several methods for fitting the clonal and subclonal components: + +- using Gaussian mixtures ([mclust](https://mclust-org.github.io/mclust/)) ([Scrucca et al., 2016](https://journal.r-project.org/archive/2016/RJ-2016-021/index.html)), +- using binomial distributions ([BMix](https://github.com/caravagnalab/BMix)) ([Caravagna et al., 2020](https://www.nature.com/articles/s41588-020-0675-5)), +- and by penalizing pairwise differences ([CliP](https://github.com/wwylab/CliP)) ([Jiang et al., 2021](https://www.biorxiv.org/content/10.1101/2021.03.31.437383v1)). + +See the model fitting details in the [Get Started](https://pawelqs.github.io/cevomod/articles/get_started.html) and [Fitting models](https://pawelqs.github.io/cevomod/articles/fitting_models.html) vignettes. + ## Installation @@ -27,6 +39,7 @@ devtools::install_github("pawelqs/cevomod") ## Last changes +* **v2.3.0** - subclones can be fitted using [CliP](https://github.com/wwylab/CliP) ([Jiang et al., 2021](https://www.biorxiv.org/content/10.1101/2021.03.31.437383v1)) (`fit_subclones(method = "CliP")`). This option requires that the Apptainer is installed. CliP container image can be build with `build_clip_container()` * **v2.2.0** - fit_powerlaw_tail_fixed() has a bootstrap option. See the details in [Fitting models #Bootstrapping](https://pawelqs.github.io/cevomod/articles/fitting_models.html#bootstrapping) vignette * **v2.1.0** - cevomod is integrated with a helper [readthis](https://pawelqs.github.io/readthis/index.html) package, designed for bulk reading of variant files from algorithms such as Mutect2, Strelka, ASCAT, or FACETS, in the cevomod-friendly data format. Objects returned by `readthis::read_*()` functions can be added to the cevodata object using a general `add_data()` function. * **v2.0.0** - Starting with version 2.0.0, cevomod can use either VAF or CCF (Cancer Cell Fraction) as a measure of mutation frequency. CCF is a measure of mutation frequency corrected for tumor purity and copy number alterations. CCF can be calculated prior to mutation frequency intervalization using the `calc_mutation_frequencies()` function and requires information on total copy number in tumor and normal tissue and sample purity (tumor cell content). See the Vignettes for more examples. diff --git a/_pkgdown.yml b/_pkgdown.yml index 6bf5b672..3b177155 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -31,6 +31,7 @@ reference: - title: Other transformations - contents: - annotate_mutation_contexts + - unite_mutation_id - cut_f_intervals - filter_SNVs_by_regions - shuffle @@ -72,12 +73,14 @@ reference: - test_data_fitted - title: Other functions - contents: - - set_cevomod_verbosity - - generate_neutral_snvs - - "`%not in%`" + - build_clip_container - fill_na - fix_powerlaw_N_mutations + - generate_neutral_snvs + - set_cevomod_verbosity + - set_containers_dir - to_clip + - "`%not in%`" - title: internal contents: - colors diff --git a/inst/CliP.def b/inst/CliP.def new file mode 100644 index 00000000..9e636129 --- /dev/null +++ b/inst/CliP.def @@ -0,0 +1,42 @@ +BootStrap: docker +From: rocker/r-ver:4.0.0 + +%post + apt -y update + + # Install python + apt install software-properties-common -y + add-apt-repository ppa:deadsnakes/ppa + apt -y update + apt install python3.8 python3-pip python3.8-distutils git -y + ln `which python3.8` /usr/bin/python + # python -m ensurepip --upgrade + + # Install python dependencies + pip install numpy scipy pandas + + # Install CliP + cd /opt + git clone https://github.com/pawelqs/CliP.git + cd CliP + git checkout add_out_dir_arg + python setup.py build + +%environment + export LC_ALL=C + export PATH=/usr/games:$PATH + +%runscript + date | R --version + +%test + R --version + python --version + +%labels + Author Pawel Kus + Version v1.0.0 + +%help + This container contains everything that is needed to run clonality analysis + using CliP package \ No newline at end of file diff --git a/man/build_clip_container.Rd b/man/build_clip_container.Rd new file mode 100644 index 00000000..3144cc0a --- /dev/null +++ b/man/build_clip_container.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/containers.R +\name{build_clip_container} +\alias{build_clip_container} +\title{Build the CliP Apptainer container} +\usage{ +build_clip_container(out_dir = NULL, force = FALSE) +} +\arguments{ +\item{out_dir}{Path} + +\item{force}{Force build in image exists} +} +\description{ +CliP.sif is saved to +\itemize{ +\item out_dir, if provided +\item if out_dir is NULL but the containers_dir was set using the \code{\link[=set_containers_dir]{set_containers_dir()}}, +image will be saved to the set containers_dir. +\item if out_dir is NULL and the containers_dir was not set, image will +be saved to the current working dir. +} +} diff --git a/man/fit_subclones.Rd b/man/fit_subclones.Rd index 3bfd727e..3c1f60b6 100644 --- a/man/fit_subclones.Rd +++ b/man/fit_subclones.Rd @@ -1,9 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/fit_subclones.R, -% R/fit_subclones_binomial_BMix.R, R/fit_subclones_binomial_mclust.R +% R/fit_subclones_binomial_BMix.R, R/fit_subclones_binomial_CliP.R, +% R/fit_subclones_binomial_mclust.R \name{fit_subclones} \alias{fit_subclones} \alias{fit_subclones_bmix} +\alias{fit_subclones_clip} \alias{fit_subclones_mclust} \title{Fit clonal and subclonal components of the model to the residuals of the power-law model} @@ -13,8 +15,12 @@ fit_subclones( N = 1:3, powerlaw_model_name = active_models(object), snvs_name = default_SNVs(object), + cnvs_name = default_CNVs(object), method = "BMix", upper_f_limit = 0.75, + clip_sif = NULL, + clip_input = file.path(tempdir(), "clip_input"), + clip_output = file.path(tempdir(), "clip_output"), verbose = get_cevomod_verbosity() ) @@ -27,6 +33,18 @@ fit_subclones_bmix( verbose = get_cevomod_verbosity() ) +fit_subclones_clip( + object, + powerlaw_model_name = active_models(object), + snvs_name = default_SNVs(object), + cnvs_name = default_CNVs(object), + upper_f_limit = 0.75, + clip_sif = NULL, + clip_input = file.path(tempdir(), "clip_input"), + clip_output = file.path(tempdir(), "clip_output"), + verbose = get_cevomod_verbosity() +) + fit_subclones_mclust( object, N = 1:3, @@ -37,22 +55,38 @@ fit_subclones_mclust( ) } \arguments{ -\item{object}{object} +\item{object}{cevodata object} -\item{N}{numbers of clones to for models} +\item{N}{Vector of numbers of clones to for models} -\item{powerlaw_model_name}{residual of which powerlaw model to use? +\item{powerlaw_model_name}{Residual of which powerlaw model to use? powerlaw_fixed/powerlaw_optim} -\item{snvs_name}{which snvs to to use?} +\item{snvs_name}{Which snvs to to use?} -\item{method}{clustering method to use: BMix and mclust are currently supported. -While mclust is a 3-4 times faster method, the BMix method is more accurate -and usually fast enough.} +\item{cnvs_name}{Which cnvs to to use?} + +\item{method}{Clustering method to use. Currently supported methods: +\itemize{ +\item mclust - the fastest method, approximately 3-4 times faster than BMix, +but uses a gaussian mixture modelling +\item BMix - is more accurate, considers subclones as binomial clusters, +slightly slower +\item CliP - Clonal structure identification through penalizing pairwise +differences +}} \item{upper_f_limit}{ignore variants with f higher than} -\item{verbose}{verbose?} +\item{clip_sif}{Apptainer image file with CliP. If NULL, cevomod seeks for +the CliP.sif file in the containers_dir (if set before) and in the current +working directory. The container should be build with \code{\link[=build_clip_container]{build_clip_container()}}.} + +\item{clip_input}{Path to store the CliP input files} + +\item{clip_output}{Path to store the CliP output files} + +\item{verbose}{Verbose?} } \description{ Fit clonal and subclonal components of the model to the residuals of the @@ -60,8 +94,42 @@ power-law model } \section{Functions}{ \itemize{ +\item \code{fit_subclones()}: Provides a common interface for all other methods, +runs the selected method and passes all the required arguments down. + \item \code{fit_subclones_bmix()}: Fit subclonal distributions to neutral model residuals using BMix +\item \code{fit_subclones_clip()}: Fit subclonal distributions to neutral model +residuals using CliP - Clonal structure identification through penalizing +pairwise differences \href{https://github.com/wwylab/CliP}{(github)}. Requires +that the Apptainer installed and +\itemize{ +\item the path to the CliP.sif container image is provided, +\item or CliP.sif exists in the containers_dir (\code{\link[=set_containers_dir]{set_containers_dir()}}) +\item or CliP.sif exists in the current working directory. +The container should be build with \code{\link[=build_clip_container]{build_clip_container()}}. +} + \item \code{fit_subclones_mclust()}: Fit subclonal distributions to neutral model residuals using mclust }} +\examples{ +\dontrun{ +# Using BMix +fit_subclones(test_data_fitted) +# or +fit_subclones_bmix(test_data_fitted) + +# Using mclust +fit_subclones(test_data_fitted, method = "mclust") +# or +fit_subclones_mclust(test_data_fitted) + +# Using CliP +set_containers_dir(selected_dir) +build_clip_container() +fit_subclones(test_data_fitted, method = "CliP") +# or +fit_subclones_clip(test_data_fitted) +} +} diff --git a/man/mutation_plots.Rd b/man/mutation_plots.Rd index 62fc2a00..8120ec5c 100644 --- a/man/mutation_plots.Rd +++ b/man/mutation_plots.Rd @@ -34,6 +34,7 @@ plot_mutations(object, ...) layer_mutations( object, + mapping = NULL, genes = NULL, drivers = NULL, show_labels = TRUE, @@ -63,6 +64,8 @@ be taken from \code{driver_genes}} \item{filter_fun}{Function for filtering mutations: variant_classification_filter() or impact_filter()} +\item{mapping}{aes()} + \item{show_labels}{\code{lgl} use ggrepel to label the mutations?} \item{color}{color} diff --git a/man/powerlaw_fixed_model.Rd b/man/powerlaw_fixed_model.Rd index f96bf615..00e7718a 100644 --- a/man/powerlaw_fixed_model.Rd +++ b/man/powerlaw_fixed_model.Rd @@ -12,6 +12,7 @@ fit_powerlaw_tail_fixed(object, ...) \method{fit_powerlaw_tail_fixed}{cevodata}( object, rsq_treshold = 0.98, + lm_length = 0.05, name = "powerlaw_fixed", pct_left = 0.05, pct_right = 0.95, @@ -28,6 +29,8 @@ layer_lm_fits(cd, model_name = "powerlaw_fixed", ...) \item{rsq_treshold}{R-squared tresholds to keep model as neutral} +\item{lm_length}{length of the linear model fits} + \item{name}{name in the models' slot} \item{pct_left}{drop pct of the lowerst frequency variants to improve fit} diff --git a/man/set_containers_dir.Rd b/man/set_containers_dir.Rd new file mode 100644 index 00000000..70c148b5 --- /dev/null +++ b/man/set_containers_dir.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cevomod-persistent.R +\name{set_containers_dir} +\alias{set_containers_dir} +\alias{get_containers_dir} +\title{Get/Set the containers directory} +\usage{ +set_containers_dir(dir) + +get_containers_dir() +} +\arguments{ +\item{dir}{Path for containers} +} +\description{ +Get/Set the containers directory +} diff --git a/man/sfs.Rd b/man/sfs.Rd index c0f6522c..8ce62f6a 100644 --- a/man/sfs.Rd +++ b/man/sfs.Rd @@ -86,5 +86,5 @@ data("test_data") test_data |> calc_SFS() |> plot_SFS() + - layer_mutations(test_data, drivers = "BRCA") + layer_mutations(test_data, mapping = ggplot2::aes(x = VAF), drivers = "BRCA") } diff --git a/man/unite_mutation_id.Rd b/man/unite_mutation_id.Rd new file mode 100644 index 00000000..420d3c3a --- /dev/null +++ b/man/unite_mutation_id.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cevo_snvs.R +\name{unite_mutation_id} +\alias{unite_mutation_id} +\title{Unite many columns to create mutation_id column} +\usage{ +unite_mutation_id(snvs, sep = "-", remove = TRUE) +} +\arguments{ +\item{snvs}{SNVs} + +\item{sep}{Separator} + +\item{remove}{Remove united columns?} +} +\description{ +Unite many columns to create mutation_id column +} diff --git a/tests/testthat/test-fit_subclones.R b/tests/testthat/test-fit_subclones.R index 1696fbf3..88cad619 100644 --- a/tests/testthat/test-fit_subclones.R +++ b/tests/testthat/test-fit_subclones.R @@ -9,6 +9,13 @@ powerlaw_model_name <- "powerlaw_optim" upper_f_limit <- 0.75 snvs_name <- "snvs" +# For CliP interactive tests +cnvs_name <- default_CNVs(object) +clip_input <- file.path(tempdir(), "clip_input") +clip_output <- file.path(tempdir(), "clip_output") +clip_sif <- NULL +verbose <- get_cevomod_verbosity() + test_that("fit_subclones() mclust works", { res <- object |> @@ -32,6 +39,19 @@ test_that("fit_subclones() bmix works", { }) +### Do not push this test to github, since it requires that apptainer +### is installed +# test_that("fit_subclones() clip works", { +# res <- object |> +# fit_subclones(method = "CliP", verbose = TRUE) +# +# expect_false( +# is.null(res$models$powerlaw_optim_subclones) +# ) +# }) + + + fit_binomial_models_cols <- c("N", "component", "cellularity", "N_mutations", "BIC") diff --git a/vignettes/fitting_models.Rmd b/vignettes/fitting_models.Rmd index f0ebf17f..79590ebb 100644 --- a/vignettes/fitting_models.Rmd +++ b/vignettes/fitting_models.Rmd @@ -75,11 +75,10 @@ In Sample 2, the power-law exponent equals 4, much higher than in 3 other sample # Binomial components -In cevomod, the binomial components for clonal and subclonal variants are fitted to the positive part of the power-law model residuals. By default, they are fitted using the [BMix](https://github.com/caravagnalab/BMix) package, although an alternative method using mclust is also available. In the default method, we randomly subsample the SNVs and Indels in each spectrum bin to the number given by the power-law component residual. Then, we employ the BMix to fit the VAF distribution of these variants with a mixture of 1 to 3 binomial distributions (clone plus subclones), accounting for the variant's sequencing depth. The best model is selected based on the Bayesian Information Criterium (BIC). - +In cevomod, the binomial components for clonal and subclonal variants are fitted to the positive part of the power-law model residuals. By default, they are fitted using the [BMix](https://github.com/caravagnalab/BMix) package ([Caravagna et al., 2020](https://www.nature.com/articles/s41588-020-0675-5)), although an alternative methods are available. In the default method, we randomly subsample the SNVs and Indels in each spectrum bin to the number given by the power-law component residual. Then, we employ the BMix to fit the VAF distribution of these variants with a mixture of 1 to 3 binomial distributions (clone plus subclones), accounting for the variant's sequencing depth. The best model is selected based on the Bayesian Information Criterium (BIC). ```{r} -cd <- fit_subclones_bmix(cd) +cd <- fit_subclones(cd) plot_models(cd) ``` @@ -93,7 +92,7 @@ For example, one can get filter Sample 2 out from the cevodata object: ```{r} cd <- cd |> filter(sample_id != "Sample 2") |> - fit_subclones_bmix(powerlaw_model_name = "powerlaw_fixed") + fit_subclones(powerlaw_model_name = "powerlaw_fixed") cd |> get_models() |> @@ -104,6 +103,31 @@ cd |> get_selection_coefficients() ``` +## Alternative methods for fitting clones and subclones + +There are 2 alternative methods for fitting clonal and subclonal components of the model. + +- CliP (using `fit_subclones(method = "CliP")` or `fit_subclomes_clip()`) - uses the [CliP](https://github.com/wwylab/CliP) method published by ([Jiang et al., 2021](https://www.biorxiv.org/content/10.1101/2021.03.31.437383v1)). Running CliP requires that **[Apptainer](https://apptainer.org/) is installed** and **does not require installation of CliP and depenencies**. *cevomod* prepares the CliP input files, runs the container and reads the CliP output files back to *cevomod*. The container needs to be build *a priori* with `build_clip_container()`, which uses the image definition file [CliP.def](https://github.com/pawelqs/cevomod/inst/CliP.def). All of this can be done with a few lines of code: + +```{r eval=FALSE} +set_containers_dir("~/containers/") +# get_containers_dir() to see the current containers dir +build_clip_container() +fit_subclones(cd, method = "CliP") + +## OR the image can be built in the current working directory +build_clip_container() +fit_subclones(cd, method = "CliP") + +## OR in any custom directory +build_clip_container("/custom/path/") +fit_subclones(cd, method = "CliP", clip_sif = "/custom/path/CliP.sif") +``` + +See the `fit_subclones()` help page for more details. + +- mclust (using `fit_subclones(method = "mclust")` or `fit_subclomes_mclust()`) - fits the power-law component residuals with a Gaussian mixtures using the ([mclust](https://mclust-org.github.io/mclust/)) package ([Scrucca et al., 2016](https://journal.r-project.org/archive/2016/RJ-2016-021/index.html)). This is a faster but approximate method for recognition of clones and subclones. + # Bootstrapping diff --git a/vignettes/get_started.Rmd b/vignettes/get_started.Rmd index 2ed9d17c..b157da2f 100644 --- a/vignettes/get_started.Rmd +++ b/vignettes/get_started.Rmd @@ -148,7 +148,7 @@ Now we are ready to fit the cevomod models. Once the SNVs are prepared, the models can be fitted using the `fit_*()` functions. Full models consist of the power-law component and one or more binomial components, which are fitted sequentially. -In this example, we first fit the power-law model with an exponent of 2, and then the mixture of binomial models. +In this example, we first fit the power-law model with an exponent of 2, and then the mixture of binomial models. By default, clonal and subclonal components are fitted using [BMix](https://github.com/caravagnalab/BMix) package ([Caravagna et al., 2020](https://www.nature.com/articles/s41588-020-0675-5)). See other methods for fitting these components in [Fitting models/Binomial components](https://pawelqs.github.io/cevomod/articles/fitting_models.html#binomial-components) vignette. ```{r} cd <- cd |> diff --git a/vignettes/visualizations.Rmd b/vignettes/visualizations.Rmd index dff046d3..8fb98280 100644 --- a/vignettes/visualizations.Rmd +++ b/vignettes/visualizations.Rmd @@ -86,14 +86,14 @@ Drier mutations can be easily annotated on the VAF plots using the layer_mutatio ```{r, fig.width=10, fig.height=6} plot_SFS(cd) + - layer_mutations(cd, drivers = "BRCA") + layer_mutations(cd, mapping = aes(x = VAF), drivers = "BRCA") ``` Also, custom list of genes can be provided with the `genes` argument: ```{r, fig.width=10, fig.height=6} plot_SFS(cd) + - layer_mutations(cd, genes = c("TP53", "BRCA1")) + layer_mutations(cd, mapping = aes(x = VAF), genes = c("TP53", "BRCA1")) ``` ## Mutation plots