provided new versions of adjust_time

RECETOX · Jul 23, 2024 · aa37e31 · aa37e31
1 parent 7889ed5
commit aa37e31
Show file tree

Hide file tree

Showing 2 changed files with 96 additions and 59 deletions.
diff --git a/R/adjust.time.R b/R/adjust.time.R
@@ -34,29 +34,60 @@ compute_template_adjusted_rt <- function(combined, sel, j) {
   return(all_features)
 }
 
+compute_corrected_features_v2 <- function(features, template_rt, delta_rt) {
+  features <- features |> dplyr::arrange_at(c("rt", "mz"))
+  idx <- dplyr::between(features$rt, min(template_rt), max(template_rt))
+  to_correct <- (features |> dplyr::filter(idx))$rt
+
+  this.smooth <- ksmooth(
+    template_rt,
+    delta_rt,
+    kernel = "normal",
+    bandwidth = (max(delta_rt) - min(delta_rt)) / 5,
+    x.points = to_correct
+  )
+
+  lower_bound_adjustment <- mean(this.smooth$y[this.smooth$x == min(this.smooth$x)])
+  upper_bound_adjustment <- mean(this.smooth$y[this.smooth$x == max(this.smooth$x)])
+
+  features <- features |>
+    dplyr::mutate(rt = dplyr::case_when(
+      rt < min(template_rt) ~ rt + lower_bound_adjustment,
+      rt > max(template_rt) ~ rt + upper_bound_adjustment
+    ))
+  features[idx, "rt"] <- to_correct + this.smooth$y
+  return(features |> dplyr::arrange_at(c("mz", "rt")))
+}
+
 #' @export
 compute_corrected_features <- function(features, delta_rt, avg_time) {
-  features <- features[order(features$rt, features$mz), ]
+  features <- features |> dplyr::arrange_at(c("rt", "mz"))
 
   corrected <- features$rt
   original <- features$rt
-  to_correct <- original[original >= min(delta_rt) &
-    original <= max(delta_rt)]
 
-  this.smooth <- ksmooth(delta_rt, avg_time,
+  idx <- dplyr::between(original, min(delta_rt), max(delta_rt))
+  to_correct <- original[idx]
+  this.smooth <- ksmooth(
+    delta_rt,
+    avg_time,
     kernel = "normal",
     bandwidth = (max(delta_rt) - min(delta_rt)) / 5,
     x.points = to_correct
   )
 
-  corrected[dplyr::between(original, min(delta_rt), max(delta_rt))] <-
-    this.smooth$y + to_correct
-  corrected[original < min(delta_rt)] <- corrected[original < min(delta_rt)] +
-    mean(this.smooth$y[this.smooth$x == min(this.smooth$x)])
-  corrected[original > max(delta_rt)] <- corrected[original > max(delta_rt)] +
-    mean(this.smooth$y[this.smooth$x == max(this.smooth$x)])
+  corrected[idx] <- this.smooth$y + to_correct
+  lower_bound_adjustment <- mean(this.smooth$y[this.smooth$x == min(this.smooth$x)])
+  upper_bound_adjustment <- mean(this.smooth$y[this.smooth$x == max(this.smooth$x)])
+
+  idx_lower <- original < min(delta_rt)
+  idx_upper <- original > max(delta_rt)
+
+  corrected[idx_lower] <- corrected[idx_lower] + lower_bound_adjustment
+  corrected[idx_upper] <- corrected[idx_upper] + upper_bound_adjustment
   features$rt <- corrected
   features <- features[order(features$mz, features$rt), ]
+
   return(features)
 }
 
@@ -73,18 +104,6 @@ fill_missing_values <- function(orig.feature, this.feature) {
   return(this.feature)
 }
 
-#' @export
-compute_template <- function(extracted_features) {
-  num.ftrs <- sapply(extracted_features, nrow)
-  template_id <- which.max(num.ftrs)
-  template <- extracted_features[[template_id]]$sample_id[1]
-  message(paste("the template is sample", template))
-
-  candi <- tibble::as_tibble(extracted_features[[template_id]]) |> dplyr::select(c(mz, rt, cluster))
-  template_features <- dplyr::bind_cols(candi, sample_id = rep(template, nrow(candi)))
-  return(tibble::as_tibble(template_features))
-}
-
 #' @export
 correct_time <- function(this.feature, template_features) {
     orig.features <- this.feature
@@ -99,11 +118,12 @@ correct_time <- function(this.feature, template_features) {
         stop("too few, aborted")
       } else {
         all.ftr.table <- compute_template_adjusted_rt(this.comb, sel, j)
-        # the to be adjusted time
-        this.diff <- all.ftr.table[, 2]
-        # the difference between the true time and the to-be-adjusted time
-        avg_time <- all.ftr.table[, 1] - this.diff
-        this.feature <- compute_corrected_features(this.feature, this.diff, avg_time)
+
+        this.feature <- compute_corrected_features(
+          this.feature,
+          all.ftr.table[, 2],  # the to be adjusted time
+          all.ftr.table[, 1] - all.ftr.table[, 2]  # the difference between the true time and the to-be-adjusted time
+        )
       }
     }
 
@@ -117,6 +137,48 @@ correct_time <- function(this.feature, template_features) {
   return(tibble::as_tibble(this.feature, column_name = c("mz", "rt", "sd1", "sd2", "area", "sample_id", "cluster")))
 }
 
+#' @export
+compute_template <- function(extracted_features) {
+  num.ftrs <- sapply(extracted_features, nrow)
+  template_id <- which.max(num.ftrs)
+  template <- extracted_features[[template_id]]$sample_id[1]
+  message(paste("the template is sample", template))
+
+  candi <- tibble::as_tibble(extracted_features[[template_id]]) |> dplyr::select(c(mz, rt, cluster))
+  template_features <- dplyr::bind_cols(candi, sample_id = rep(template, nrow(candi)))
+  return(tibble::as_tibble(template_features))
+}
+
+correct_time_v2 <- function(features, template) {
+  if (unique(features$sample_id) == unique(template$sample_id))
+    return(tibble::as_tibble(features))
+
+  subsets <- template |>
+    dplyr::bind_rows(
+      features |> dplyr::select(c(mz, rt, cluster, sample_id))
+    ) |>
+    dplyr::arrange_at(c("cluster", "mz")) |>
+    dplyr::group_by(cluster) |>
+    dplyr::mutate(count = dplyr::n_distinct(sample_id)) |>
+    filter(count == 2) |>
+    dplyr::add_count() |>
+    filter(n == 2) |>
+    dplyr::ungroup() |>
+    dplyr::group_by(sample_id) |>
+    dplyr::group_split()
+
+  all_features_new <- cbind(subsets[[1]]$rt, subsets[[2]]$rt)
+  all_features_new_order <- order(all_features_new[, 2])
+  all_features_new_arranged <- all_features_new[all_features_new_order,]
+
+  corrected <- compute_corrected_features_v2(
+    features,
+    all_features_new_arranged[, 2],
+    all_features_new_arranged[, 1] - all_features_new_arranged[, 2]
+  )
+  return(tibble::as_tibble(corrected))
+}
+
 #' Adjust retention time across spectra.
 #'
 #' This function adjusts the retention time in each LC/MS profile to achieve better between-profile agreement.

diff --git a/tests/testthat/test-adjust-time.R b/tests/testthat/test-adjust-time.R
@@ -42,44 +42,19 @@ patrick::with_parameters_test_that(
   )
 )
 
-test_that("compute_sel", {
-  testdata <- file.path("..", "testdata")
-  template_features <- file.path(testdata, "template", "RCX_shortened.parquet")
-  template_features <- arrow::read_parquet(template_features)
+test_that("correct_time_v2 is close to correct time", {
+  template_features <- arrow::read_parquet(
+    file.path("..", "testdata", "template", "RCX_shortened.parquet")
+  )
 
   clustered_table <- read_parquet_files(
     c("RCX_06_shortened"),
     "clusters",
     "_extracted_clusters.parquet"
   )[[1]]
 
-  all_features <- compute_template_adjusted_rt(
-    compute_comb(template_features, clustered_table),
-    compute_sel(combined),
-    "RCX_07_shortened"
-  )
-
-  subsets <- template_features |>
-    dplyr::bind_rows(clustered_table |> dplyr::select(c(mz, rt, cluster, sample_id))) |>
-    dplyr::arrange_at(c("cluster","mz")) |>
-    dplyr::group_by(cluster) |>
-    mutate(count = n_distinct(sample_id)) |>
-    filter(count == 2) |>
-    add_count() |>
-    filter(n == 2) |>
-    ungroup() |>
-    group_by(sample_id) |>
-    group_split()
-
-  all_features_new <- cbind(subsets[[1]]$rt, subsets[[2]]$rt)
-  all_features_new_order <- order(all_features_new[,2])
-  all_features_new_arranged <- all_features_new[all_features_new_order,]
-
-  this.feature <- compute_corrected_features(
-    clustered_table,
-    all_features_new_arranged[, 2],
-    all_features_new_arranged[, 1] - all_features_new_arranged[, 2]
-  )
+  expected <- correct_time(clustered_table, template_features)
+  actual <- correct_time_v2(clustered_table, template_features)
 
-  browser()
+  expect_equal(actual, expected, tolerance = 0.02)
 })