pawelqs · pawelqs · Aug 4, 2023 · Aug 2, 2023 · Aug 2, 2023 · Aug 3, 2023
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: cevomod
 Title: Cancer Evolution Models
-Version: 2.1.0
+Version: 2.2.0
 Authors@R: 
     person("Paweł", "Kuś", , "[email protected]", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0002-4367-9821"))
@@ -30,7 +30,8 @@ Suggests:
     testthat (>= 3.0.0),
     tidyverse,
     vdiffr,
-    readthis
+    readthis,
+    rsample
 Config/testthat/edition: 3
 VignetteBuilder: knitr
 Imports: 

diff --git a/NAMESPACE b/NAMESPACE
@@ -26,6 +26,8 @@ S3method(evaluate_MC_runs,matrix)
 S3method(filter,cevodata)
 S3method(fit_mobster,cevodata)
 S3method(fit_powerlaw_tail_fixed,cevodata)
+S3method(fit_powerlaw_tail_optim,cevo_SFS_bootstraps)
+S3method(fit_powerlaw_tail_optim,cevo_SFS_tbl)
 S3method(fit_powerlaw_tail_optim,cevodata)
 S3method(get_CNVs_var_names,cevodata)
 S3method(get_frequency_measure_name,cevo_snvs)
@@ -42,6 +44,8 @@ S3method(get_selection_coefficients,cevodata)
 S3method(get_selection_coefficients,tbl_df)
 S3method(identify_non_neutral_tail_mutations,cevodata)
 S3method(identify_non_neutral_tail_mutations,singlepatient_cevodata)
+S3method(intervalize_mutation_frequencies,cevo_snvs)
+S3method(intervalize_mutation_frequencies,cevodata)
 S3method(merge,cevodata)
 S3method(plot,cevo_ITH_tbl)
 S3method(plot,cevo_MC_solutions_eval)

diff --git a/NEWS.md b/NEWS.md
@@ -1,4 +1,7 @@
 
+## cevomod 2.2.0
+* fit_powerlaw_tail_fixed() has a bootstrap option
+
 ## cevomod 2.1.0
 * cevomod is integrated with a helper [readthis](https://pawelqs.github.io/readthis/index.html) package, designed for bulk reading of variant files from algorithms such as Mutect2, Strelka, ASCAT, or FACETS, in the cevomod-friendly data format. Objects returned by `readthis::read_*()` functions can be added to the cevodata object using a general `add_data()` function.
 

diff --git a/R/fit_powerlaw_tail_optim.R b/R/fit_powerlaw_tail_optim.R
@@ -1,5 +1,5 @@
 
-#' Fitting Tung Durrett models
+#' Fitting power-law tails with aptimum exponent value
 #'
 #' `fit_powerlaw_tail_optim()` uses `stats::optim` to find optimal A and alpha
 #' whch maximizes SFS area under the powerlaw curve (*sampled* region of SFS and
@@ -35,12 +35,15 @@ fit_powerlaw_tail_optim <- function(object, ...) {
 
 #' @rdname powerlaw_optim
 #' @inheritParams get_non_zero_SFS_range
-#' @param peak_detection_upper_limit upper f value up to which the main peak is searched
-#' @param reward_upper_limit mutations under the curve up to this limit will be rewarded
+#' @param peak_detection_upper_limit Upper f value up to which the main peak is searched
+#' @param reward_upper_limit Mutations under the curve up to this limit will be rewarded
+#' @param bootstraps Number of bootstrap samples, or FALSE to make no resampling.
+#'   **This option significantly extendis the model fitting time!!**
 #' @export
 fit_powerlaw_tail_optim.cevodata <- function(object,
                                              name = "powerlaw_optim",
                                              # pct_left = 0, pct_right = 0.98,
+                                             bootstraps = FALSE,
                                              allowed_zero_bins = 2,
                                              y_treshold = 1,
                                              y_threshold_pct = 0.01,
@@ -50,10 +53,143 @@ fit_powerlaw_tail_optim.cevodata <- function(object,
                                              control = list(maxit = 1000, ndeps = c(0.1, 0.01)),
                                              verbose = get_cevomod_verbosity(),
                                              ...) {
+  if (!bootstraps) {
+    sfs <- get_SFS(object, name = "SFS")
+    models <- fit_powerlaw_tail_optim(
+      sfs,
+      name = name,
+      allowed_zero_bins = allowed_zero_bins,
+      y_treshold = y_treshold,
+      y_threshold_pct = y_threshold_pct,
+      av_filter = av_filter,
+      peak_detection_upper_limit = peak_detection_upper_limit,
+      reward_upper_limit = reward_upper_limit,
+      control = control,
+      verbose = verbose
+    )
+    object$models[[name]] <- models
+    object <- calc_powerlaw_model_residuals(object, name)
+    object$active_models <- name
+    object
+  } else {
+    rlang::check_installed("rsample", reason = "to perform bootstrap sampling of SNVs")
+    sfs_resamples <- calc_SFS_resamples(object, times = bootstraps, verbose = verbose)
+
+    models <- sfs_resamples |>
+      map(
+        fit_powerlaw_tail_optim,
+        name = name,
+        allowed_zero_bins = allowed_zero_bins,
+        y_treshold = y_treshold,
+        y_threshold_pct = y_threshold_pct,
+        av_filter = av_filter,
+        peak_detection_upper_limit = peak_detection_upper_limit,
+        reward_upper_limit = reward_upper_limit,
+        control = control,
+        verbose = verbose
+    )
+
+    bootstrap_models <- models |>
+      map("bootstrap_models") |>
+      bind_rows()
+
+    models <- models |>
+      map("models") |>
+      bind_rows(.id = "sample_id")
+
+    bootstrap_name <- str_c(name, "_bootstraps")
+    object$models[[bootstrap_name]] <- bootstrap_models
+    object <- calc_powerlaw_model_residuals(object, bootstrap_name)
+    object$models[[name]] <- models
+    object <- calc_powerlaw_model_residuals(object, name)
+    object$active_models <- name
+    object
+  }
+}
+
+
+#' @rdname powerlaw_optim
+#' @export
+fit_powerlaw_tail_optim.cevo_SFS_bootstraps <- function(object,
+                                                        name = "powerlaw_optim",
+                                                        allowed_zero_bins = 2,
+                                                        y_treshold = 1,
+                                                        y_threshold_pct = 0.01,
+                                                        av_filter = c(1/3, 1/3, 1/3),
+                                                        peak_detection_upper_limit = 0.3,
+                                                        reward_upper_limit = 0.4,
+                                                        control = list(maxit = 1000, ndeps = c(0.1, 0.01)),
+                                                        verbose = get_cevomod_verbosity(),
+                                                        ...) {
+  rlang::check_installed("rsample", reason = "to perform bootstrap sampling of SNVs")
+  msg("Fitting models to ", unique(object$sfs[[1]]$sample_id), " resamples", verbose = verbose)
+  pb <- if (verbose) progress_bar$new(total = nrow(object)) else NULL
+
+  object$models <- object$sfs |>
+    map(function(sfs) {
+      if (!is.null(pb)) pb$tick()
+      fit_powerlaw_tail_optim(
+        sfs,
+        name = name,
+        allowed_zero_bins = allowed_zero_bins,
+        y_treshold = y_treshold,
+        y_threshold_pct = y_threshold_pct,
+        av_filter = av_filter,
+        peak_detection_upper_limit = peak_detection_upper_limit,
+        reward_upper_limit = reward_upper_limit,
+        control = control,
+        verbose = verbose_down(verbose)
+      )
+    })
+
+  object$tidy_models <- object$models |>
+    map(~pivot_longer(.x, all_of(c("A", "alpha")), names_to = "term", values_to = "estimate"))
+
+  conf_intervals <- rsample::int_pctl(object, .data$tidy_models)
+
+  bootstrap_models <- object$models |>
+    set_names(object$id) |>
+    bind_rows(.id = "resample_id")
+  class(bootstrap_models) <- c("cevo_bootstrap_powerlaw_models", class(bootstrap_models))
+
+  models <- conf_intervals |>
+    pivot_wider(
+      names_from = "term",
+      values_from = ".lower":".upper",
+      names_glue = "{term}{.value}",
+      names_vary = "slowest"
+    ) |>
+    transmute(
+      model = name,
+      component = "powerlaw tail",
+      A = .data$A.estimate,
+      .data$A.lower, .data$A.upper,
+      alpha = .data$alpha.estimate,
+      .data$alpha.lower, .data$alpha.upper
+    )
+  class(models) <- c("cevo_powerlaw_models", class(models))
+
+  lst(bootstrap_models, models)
+}
+
+
+#' @rdname powerlaw_optim
+#' @export
+fit_powerlaw_tail_optim.cevo_SFS_tbl <- function(object,
+                                                 name = "powerlaw_optim",
+                                                 allowed_zero_bins = 2,
+                                                 y_treshold = 1,
+                                                 y_threshold_pct = 0.01,
+                                                 av_filter = c(1/3, 1/3, 1/3),
+                                                 peak_detection_upper_limit = 0.3,
+                                                 reward_upper_limit = 0.4,
+                                                 control = list(maxit = 1000, ndeps = c(0.1, 0.01)),
+                                                 verbose = get_cevomod_verbosity(),
+                                                 ...) {
   msg("Fitting optimized power-law models...", verbose = verbose)
   start_time <- Sys.time()
 
-  sfs <- get_SFS(object, name = "SFS")
+  sfs <- object
   bounds <- sfs |>
     get_non_zero_SFS_range(
       allowed_zero_bins = allowed_zero_bins,
@@ -109,14 +245,11 @@ fit_powerlaw_tail_optim.cevodata <- function(object,
   class(models) <- c("cevo_powerlaw_models", class(models))
 
   msg("Models fitted in ", Sys.time() - start_time, " seconds", verbose = verbose)
-
-  object$models[[name]] <- models
-  object <- calc_powerlaw_model_residuals(object, name)
-  object$active_models <- name
-  object
+  models
 }
 
 
+
 td_optim <- function(init_A, init_alpha, data,
                      peak_detection_upper_limit = 0.3,
                      reward_upper_limit = 0.4,