diff --git a/R/adjust.time.R b/R/adjust.time.R index 11d0a74..fe896d1 100644 --- a/R/adjust.time.R +++ b/R/adjust.time.R @@ -34,29 +34,60 @@ compute_template_adjusted_rt <- function(combined, sel, j) { return(all_features) } +compute_corrected_features_v2 <- function(features, template_rt, delta_rt) { + features <- features |> dplyr::arrange_at(c("rt", "mz")) + idx <- dplyr::between(features$rt, min(template_rt), max(template_rt)) + to_correct <- (features |> dplyr::filter(idx))$rt + + this.smooth <- ksmooth( + template_rt, + delta_rt, + kernel = "normal", + bandwidth = (max(delta_rt) - min(delta_rt)) / 5, + x.points = to_correct + ) + + lower_bound_adjustment <- mean(this.smooth$y[this.smooth$x == min(this.smooth$x)]) + upper_bound_adjustment <- mean(this.smooth$y[this.smooth$x == max(this.smooth$x)]) + + features <- features |> + dplyr::mutate(rt = dplyr::case_when( + rt < min(template_rt) ~ rt + lower_bound_adjustment, + rt > max(template_rt) ~ rt + upper_bound_adjustment + )) + features[idx, "rt"] <- to_correct + this.smooth$y + return(features |> dplyr::arrange_at(c("mz", "rt"))) +} + #' @export compute_corrected_features <- function(features, delta_rt, avg_time) { - features <- features[order(features$rt, features$mz), ] + features <- features |> dplyr::arrange_at(c("rt", "mz")) corrected <- features$rt original <- features$rt - to_correct <- original[original >= min(delta_rt) & - original <= max(delta_rt)] - this.smooth <- ksmooth(delta_rt, avg_time, + idx <- dplyr::between(original, min(delta_rt), max(delta_rt)) + to_correct <- original[idx] + this.smooth <- ksmooth( + delta_rt, + avg_time, kernel = "normal", bandwidth = (max(delta_rt) - min(delta_rt)) / 5, x.points = to_correct ) - corrected[dplyr::between(original, min(delta_rt), max(delta_rt))] <- - this.smooth$y + to_correct - corrected[original < min(delta_rt)] <- corrected[original < min(delta_rt)] + - mean(this.smooth$y[this.smooth$x == min(this.smooth$x)]) - corrected[original > max(delta_rt)] <- corrected[original > max(delta_rt)] + - mean(this.smooth$y[this.smooth$x == max(this.smooth$x)]) + corrected[idx] <- this.smooth$y + to_correct + lower_bound_adjustment <- mean(this.smooth$y[this.smooth$x == min(this.smooth$x)]) + upper_bound_adjustment <- mean(this.smooth$y[this.smooth$x == max(this.smooth$x)]) + + idx_lower <- original < min(delta_rt) + idx_upper <- original > max(delta_rt) + + corrected[idx_lower] <- corrected[idx_lower] + lower_bound_adjustment + corrected[idx_upper] <- corrected[idx_upper] + upper_bound_adjustment features$rt <- corrected features <- features[order(features$mz, features$rt), ] + return(features) } @@ -73,18 +104,6 @@ fill_missing_values <- function(orig.feature, this.feature) { return(this.feature) } -#' @export -compute_template <- function(extracted_features) { - num.ftrs <- sapply(extracted_features, nrow) - template_id <- which.max(num.ftrs) - template <- extracted_features[[template_id]]$sample_id[1] - message(paste("the template is sample", template)) - - candi <- tibble::as_tibble(extracted_features[[template_id]]) |> dplyr::select(c(mz, rt, cluster)) - template_features <- dplyr::bind_cols(candi, sample_id = rep(template, nrow(candi))) - return(tibble::as_tibble(template_features)) -} - #' @export correct_time <- function(this.feature, template_features) { orig.features <- this.feature @@ -99,11 +118,12 @@ correct_time <- function(this.feature, template_features) { stop("too few, aborted") } else { all.ftr.table <- compute_template_adjusted_rt(this.comb, sel, j) - # the to be adjusted time - this.diff <- all.ftr.table[, 2] - # the difference between the true time and the to-be-adjusted time - avg_time <- all.ftr.table[, 1] - this.diff - this.feature <- compute_corrected_features(this.feature, this.diff, avg_time) + + this.feature <- compute_corrected_features( + this.feature, + all.ftr.table[, 2], # the to be adjusted time + all.ftr.table[, 1] - all.ftr.table[, 2] # the difference between the true time and the to-be-adjusted time + ) } } @@ -117,6 +137,48 @@ correct_time <- function(this.feature, template_features) { return(tibble::as_tibble(this.feature, column_name = c("mz", "rt", "sd1", "sd2", "area", "sample_id", "cluster"))) } +#' @export +compute_template <- function(extracted_features) { + num.ftrs <- sapply(extracted_features, nrow) + template_id <- which.max(num.ftrs) + template <- extracted_features[[template_id]]$sample_id[1] + message(paste("the template is sample", template)) + + candi <- tibble::as_tibble(extracted_features[[template_id]]) |> dplyr::select(c(mz, rt, cluster)) + template_features <- dplyr::bind_cols(candi, sample_id = rep(template, nrow(candi))) + return(tibble::as_tibble(template_features)) +} + +correct_time_v2 <- function(features, template) { + if (unique(features$sample_id) == unique(template$sample_id)) + return(tibble::as_tibble(features)) + + subsets <- template |> + dplyr::bind_rows( + features |> dplyr::select(c(mz, rt, cluster, sample_id)) + ) |> + dplyr::arrange_at(c("cluster", "mz")) |> + dplyr::group_by(cluster) |> + dplyr::mutate(count = dplyr::n_distinct(sample_id)) |> + filter(count == 2) |> + dplyr::add_count() |> + filter(n == 2) |> + dplyr::ungroup() |> + dplyr::group_by(sample_id) |> + dplyr::group_split() + + all_features_new <- cbind(subsets[[1]]$rt, subsets[[2]]$rt) + all_features_new_order <- order(all_features_new[, 2]) + all_features_new_arranged <- all_features_new[all_features_new_order,] + + corrected <- compute_corrected_features_v2( + features, + all_features_new_arranged[, 2], + all_features_new_arranged[, 1] - all_features_new_arranged[, 2] + ) + return(tibble::as_tibble(corrected)) +} + #' Adjust retention time across spectra. #' #' This function adjusts the retention time in each LC/MS profile to achieve better between-profile agreement. diff --git a/tests/testthat/test-adjust-time.R b/tests/testthat/test-adjust-time.R index fdd65ab..ec9b3e8 100644 --- a/tests/testthat/test-adjust-time.R +++ b/tests/testthat/test-adjust-time.R @@ -42,10 +42,10 @@ patrick::with_parameters_test_that( ) ) -test_that("compute_sel", { - testdata <- file.path("..", "testdata") - template_features <- file.path(testdata, "template", "RCX_shortened.parquet") - template_features <- arrow::read_parquet(template_features) +test_that("correct_time_v2 is close to correct time", { + template_features <- arrow::read_parquet( + file.path("..", "testdata", "template", "RCX_shortened.parquet") + ) clustered_table <- read_parquet_files( c("RCX_06_shortened"), @@ -53,33 +53,8 @@ test_that("compute_sel", { "_extracted_clusters.parquet" )[[1]] - all_features <- compute_template_adjusted_rt( - compute_comb(template_features, clustered_table), - compute_sel(combined), - "RCX_07_shortened" - ) - - subsets <- template_features |> - dplyr::bind_rows(clustered_table |> dplyr::select(c(mz, rt, cluster, sample_id))) |> - dplyr::arrange_at(c("cluster","mz")) |> - dplyr::group_by(cluster) |> - mutate(count = n_distinct(sample_id)) |> - filter(count == 2) |> - add_count() |> - filter(n == 2) |> - ungroup() |> - group_by(sample_id) |> - group_split() - - all_features_new <- cbind(subsets[[1]]$rt, subsets[[2]]$rt) - all_features_new_order <- order(all_features_new[,2]) - all_features_new_arranged <- all_features_new[all_features_new_order,] - - this.feature <- compute_corrected_features( - clustered_table, - all_features_new_arranged[, 2], - all_features_new_arranged[, 1] - all_features_new_arranged[, 2] - ) + expected <- correct_time(clustered_table, template_features) + actual <- correct_time_v2(clustered_table, template_features) - browser() + expect_equal(actual, expected, tolerance = 0.02) }) \ No newline at end of file