Updated correct time to use precomputed clusters #220

Merged (4 commits, Jul 23, 2024)

DESCRIPTION: 2 changes (1 addition, 1 deletion)

@@ -17,4 +17,4 @@ NeedsCompilation: no
 Suggests:
     dataCompareR, testthat (>= 3.0.0), microbenchmark
 Config/testthat/edition: 3
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.2

R/adjust.time.R: 143 changes (99 additions, 44 deletions)

@@ -6,19 +6,16 @@ NULL
 compute_comb <- function(template_features, features) {
   combined <- dplyr::bind_rows(
     template_features,
-    dplyr::bind_cols(features |> dplyr::select(c(mz, rt)), sample_id = features$sample_id)
-  )
-  combined <- combined |> dplyr::arrange_at("mz")
+    features |> dplyr::select(c(mz, rt, cluster, sample_id))
+  ) |> dplyr::arrange_at(c("cluster", "mz"))
   return(combined)
 }

 #' @export
-compute_sel <- function(combined, mz_tol_relative, rt_tol_relative) {
+compute_sel <- function(combined) {
   l <- nrow(combined)
-  sel <- which(combined$mz[2:l] - combined$mz[1:(l - 1)] <
-    mz_tol_relative * combined$mz[1:(l - 1)] * 2 &
-    abs(combined$rt[2:l] - combined$rt[1:(l - 1)]) <
-    rt_tol_relative & combined$sample_id[2:l] != combined$sample_id[1:(l - 1)])
+  sel <- which(combined$cluster[1:(l - 1)] == combined$cluster[2:l] &
+    combined$sample_id[1:(l - 1)] != combined$sample_id[2:l])
   return(sel)
 }
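
Note: the selection no longer needs m/z or RT tolerances. After sorting by (cluster, mz), compute_sel pairs adjacent rows that share a precomputed cluster but come from different samples. A minimal sketch of that behaviour on invented toy data:

# Toy input, already sorted by (cluster, mz); values are made up for illustration.
combined <- tibble::tibble(
  mz        = c(100.001, 100.002, 200.001, 200.003),
  rt        = c(12.1, 12.4, 30.2, 30.9),
  cluster   = c(1, 1, 2, 2),
  sample_id = c("template", "sample_A", "template", "template")
)

l <- nrow(combined)
# Adjacent rows sharing a cluster but differing in sample:
which(combined$cluster[1:(l - 1)] == combined$cluster[2:l] &
  combined$sample_id[1:(l - 1)] != combined$sample_id[2:l])
#> [1] 1   (only cluster 1 pairs a template feature with a sample feature)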

@@ -32,33 +29,65 @@ compute_template_adjusted_rt <- function(combined, sel, j) {

   # now the first column is the template retention time.
   # the second column is the to-be-adjusted retention time
+
   all_features <- all_features[order(all_features[, 2]), ]
   return(all_features)
 }

+compute_corrected_features_v2 <- function(features, template_rt, delta_rt) {
+  features <- features |> dplyr::arrange_at(c("rt", "mz"))
+  idx <- dplyr::between(features$rt, min(template_rt), max(template_rt))
+  to_correct <- (features |> dplyr::filter(idx))$rt
+
+  this.smooth <- ksmooth(
+    template_rt,
+    delta_rt,
+    kernel = "normal",
+    bandwidth = (max(delta_rt) - min(delta_rt)) / 5,
+    x.points = to_correct
+  )
+
+  lower_bound_adjustment <- mean(this.smooth$y[this.smooth$x == min(this.smooth$x)])
+  upper_bound_adjustment <- mean(this.smooth$y[this.smooth$x == max(this.smooth$x)])
+
+  features <- features |>
+    dplyr::mutate(rt = dplyr::case_when(
+      rt < min(template_rt) ~ rt + lower_bound_adjustment,
+      rt > max(template_rt) ~ rt + upper_bound_adjustment
+    ))
+  features[idx, "rt"] <- to_correct + this.smooth$y
+  return(features |> dplyr::arrange_at(c("mz", "rt")))
+}
+
 #' @export
 compute_corrected_features <- function(features, delta_rt, avg_time) {
-  features <- features[order(features$rt, features$mz), ]
+  features <- features |> dplyr::arrange_at(c("rt", "mz"))

   corrected <- features$rt
   original <- features$rt
-  to_correct <- original[original >= min(delta_rt) &
-    original <= max(delta_rt)]
-
-  this.smooth <- ksmooth(delta_rt, avg_time,
+  idx <- dplyr::between(original, min(delta_rt), max(delta_rt))
+  to_correct <- original[idx]
+  this.smooth <- ksmooth(
+    delta_rt,
+    avg_time,
     kernel = "normal",
     bandwidth = (max(delta_rt) - min(delta_rt)) / 5,
     x.points = to_correct
   )

-  corrected[dplyr::between(original, min(delta_rt), max(delta_rt))] <-
-    this.smooth$y + to_correct
-  corrected[original < min(delta_rt)] <- corrected[original < min(delta_rt)] +
-    mean(this.smooth$y[this.smooth$x == min(this.smooth$x)])
-  corrected[original > max(delta_rt)] <- corrected[original > max(delta_rt)] +
-    mean(this.smooth$y[this.smooth$x == max(this.smooth$x)])
+  corrected[idx] <- this.smooth$y + to_correct
+  lower_bound_adjustment <- mean(this.smooth$y[this.smooth$x == min(this.smooth$x)])
+  upper_bound_adjustment <- mean(this.smooth$y[this.smooth$x == max(this.smooth$x)])
+
+  idx_lower <- original < min(delta_rt)
+  idx_upper <- original > max(delta_rt)
+
+  corrected[idx_lower] <- corrected[idx_lower] + lower_bound_adjustment
+  corrected[idx_upper] <- corrected[idx_upper] + upper_bound_adjustment
   features$rt <- corrected
   features <- features[order(features$mz, features$rt), ]

   return(features)
 }
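
Both compute_corrected_features and the new _v2 variant share the same smoothing idea: fit a Gaussian-kernel regression (ksmooth) of the template-minus-sample time differences against retention time, then shift each feature by its locally smoothed difference, with features outside the smoothed range receiving the boundary adjustment. A self-contained sketch of that core step on synthetic data (the +0.5 drift and the bandwidth are invented for this illustration):

# Simulate a sample whose retention times lag the template by ~0.5 units.
set.seed(1)
template_rt <- sort(runif(200, 0, 100))     # matched template retention times
delta_rt    <- 0.5 + rnorm(200, sd = 0.05)  # template-minus-sample differences
to_correct  <- seq(5, 95, by = 5)           # sample RTs inside the template range

smoothed <- ksmooth(
  template_rt,
  delta_rt,
  kernel = "normal",
  bandwidth = 10,  # picked for this sketch; the package derives it from the data range
  x.points = to_correct
)

corrected <- to_correct + smoothed$y  # shift each RT by its local smoothed difference
summary(corrected - to_correct)       # ~0.5 throughout, recovering the simulated drift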

@@ -76,36 +105,25 @@ fill_missing_values <- function(orig.feature, this.feature) {
 }

 #' @export
-compute_template <- function(extracted_features) {
-  num.ftrs <- sapply(extracted_features, nrow)
-  template_id <- which.max(num.ftrs)
-  template <- extracted_features[[template_id]]$sample_id[1]
-  message(paste("the template is sample", template))
-
-  candi <- tibble::as_tibble(extracted_features[[template_id]]) |> dplyr::select(c(mz, rt))
-  template_features <- dplyr::bind_cols(candi, sample_id = rep(template, nrow(candi)))
-  return(tibble::as_tibble(template_features))
-}
-
-#' @export
-correct_time <- function(this.feature, template_features, mz_tol_relative, rt_tol_relative) {
+correct_time <- function(this.feature, template_features) {
   orig.features <- this.feature
   template <- unique(template_features$sample_id)[1]
   j <- unique(this.feature$sample_id)[1]

   if (j != template) {
     this.comb <- compute_comb(template_features, this.feature)
-    sel <- compute_sel(this.comb, mz_tol_relative, rt_tol_relative)
+    sel <- compute_sel(this.comb)

     if (length(sel) < 20) {
       stop("too few, aborted")
     } else {
       all.ftr.table <- compute_template_adjusted_rt(this.comb, sel, j)
-      # the to be adjusted time
-      this.diff <- all.ftr.table[, 2]
-      # the difference between the true time and the to-be-adjusted time
-      avg_time <- all.ftr.table[, 1] - this.diff
-      this.feature <- compute_corrected_features(this.feature, this.diff, avg_time)
+
+      this.feature <- compute_corrected_features(
+        this.feature,
+        all.ftr.table[, 2], # the to be adjusted time
+        all.ftr.table[, 1] - all.ftr.table[, 2] # the difference between the true time and the to-be-adjusted time
+      )
     }
   }

@@ -119,6 +137,48 @@ correct_time <- function(this.feature, template_features) {
   return(tibble::as_tibble(this.feature, column_name = c("mz", "rt", "sd1", "sd2", "area", "sample_id", "cluster")))
 }

+#' @export
+compute_template <- function(extracted_features) {
+  num.ftrs <- sapply(extracted_features, nrow)
+  template_id <- which.max(num.ftrs)
+  template <- extracted_features[[template_id]]$sample_id[1]
+  message(paste("the template is sample", template))
+
+  candi <- tibble::as_tibble(extracted_features[[template_id]]) |> dplyr::select(c(mz, rt, cluster))
+  template_features <- dplyr::bind_cols(candi, sample_id = rep(template, nrow(candi)))
+  return(tibble::as_tibble(template_features))
+}
+
+correct_time_v2 <- function(features, template) {
+  if (unique(features$sample_id) == unique(template$sample_id)) {
+    return(tibble::as_tibble(features))
+  }
+
+  subsets <- template |>
+    dplyr::bind_rows(
+      features |> dplyr::select(c(mz, rt, cluster, sample_id))
+    ) |>
+    dplyr::arrange_at(c("cluster", "mz")) |>
+    dplyr::group_by(cluster) |>
+    dplyr::mutate(count = dplyr::n_distinct(sample_id)) |>
+    dplyr::filter(count == 2) |>
+    dplyr::add_count() |>
+    dplyr::filter(n == 2) |>
+    dplyr::ungroup() |>
+    dplyr::group_by(sample_id) |>
+    dplyr::group_split()
+
+  all_features_new <- cbind(subsets[[1]]$rt, subsets[[2]]$rt)
+  all_features_new_order <- order(all_features_new[, 2])
+  all_features_new_arranged <- all_features_new[all_features_new_order, ]
+
+  corrected <- compute_corrected_features_v2(
+    features,
+    all_features_new_arranged[, 2],
+    all_features_new_arranged[, 1] - all_features_new_arranged[, 2]
+  )
+  return(tibble::as_tibble(corrected))
+}
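
The two filters are what make the subsequent cbind safe: filter(count == 2) keeps only clusters represented in both the template and the sample, and filter(n == 2) additionally drops clusters with more than one feature per sample, so the two group_split() subsets align row for row. A toy illustration of that filtering on invented data:

library(dplyr)

combined <- tibble::tibble(
  cluster   = c(1, 1, 2, 2, 2, 3),
  sample_id = c("template", "sample", "template", "sample", "sample", "template")
)

combined |>
  group_by(cluster) |>
  mutate(count = n_distinct(sample_id)) |>
  filter(count == 2) |> # cluster 3 dropped: only one sample represented
  add_count() |>
  filter(n == 2) |>     # cluster 2 dropped: more than one feature per sample
  ungroup()
# Only cluster 1 survives, with exactly one row per sample.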

 #' Adjust retention time across spectra.
 #'
 #' This function adjusts the retention time in each LC/MS profile to achieve better between-profile agreement.

@@ -135,8 +195,6 @@ correct_time <- function(this.feature, template_features, mz_tol_relative, rt_tol_relative) {
 #' column in each of the matrices is changed to new adjusted values.
 #' @export
 adjust.time <- function(extracted_features,
-                        mz_tol_relative,
-                        rt_tol_relative,
                         colors = NA,
                         do.plot = TRUE) {
   number_of_samples <- length(extracted_features)
@@ -154,17 +212,14 @@ adjust.time <- function(extracted_features,

   corrected_features <- foreach::foreach(features = extracted_features) %do% correct_time(
     features,
-    template_features,
-    mz_tol_relative,
-    rt_tol_relative
+    template_features
   )

   if (do.plot) {
     draw_rt_correction_plot(
       colors,
       extracted_features,
-      corrected_features,
-      rt_tol_relative
+      corrected_features
     )
   }
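
With the tolerance arguments gone, the exported entry point is simpler to call. A hedged usage sketch (the input list name is hypothetical; the feature tables are assumed to already carry the precomputed cluster column, e.g. from compute_clusters()):

# clustered_feature_tables: a list of per-sample feature tibbles with a
# `cluster` column (hypothetical name for this sketch).
corrected <- adjust.time(
  extracted_features = clustered_feature_tables,
  do.plot = FALSE
)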

R/hybrid.R: 8 changes (2 additions, 6 deletions)

@@ -392,9 +392,7 @@ hybrid <- function(
   message("**** time correction ****")
   corrected <- foreach::foreach(this.feature = extracted_clusters$feature_tables) %dopar% correct_time(
     this.feature,
-    template_features,
-    extracted_clusters$mz_tol_relative,
-    extracted_clusters$rt_tol_relative
+    template_features
   )

   message("**** computing clusters ****")

@@ -472,9 +470,7 @@ hybrid <- function(
   message("**** second time correction ****")
   corrected <- foreach::foreach(this.feature = recovered_clusters$feature_tables) %dopar% correct_time(
     this.feature,
-    template_features,
-    recovered_clusters$mz_tol_relative,
-    recovered_clusters$rt_tol_relative
+    template_features
   )

   message("**** fourth computing clusters ****")
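
These call sites run correct_time under foreach's %dopar%, which only parallelizes when a backend is registered; without one it falls back to sequential execution with a warning. A generic setup sketch (not necessarily how this package wires it up; the package has its own get_num_workers() helper):

library(doParallel)

cl <- parallel::makeCluster(2)      # worker count chosen arbitrarily for the sketch
doParallel::registerDoParallel(cl)
# ... run the pipeline ...
parallel::stopCluster(cl)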

R/unsupervised.R: 4 changes (1 addition, 3 deletions)

@@ -182,9 +182,7 @@ unsupervised <- function(
   message("**** time correction ****")
   corrected <- foreach::foreach(this.feature = extracted_clusters$feature_tables) %dopar% correct_time(
     this.feature,
-    template_features,
-    extracted_clusters$mz_tol_relative,
-    extracted_clusters$rt_tol_relative
+    template_features
   )

   message("**** computing clusters ****")

R/utils.R: 10 changes (10 additions, 0 deletions)

@@ -154,3 +154,13 @@ get_num_workers <- function() {
   }
   return(num_workers)
 }
+
+read_parquet_files <- function(filename, folder, pattern) {
+  testdata <- file.path("..", "testdata")
+
+  input <- lapply(filename, function(x) {
+    tibble::as_tibble(arrow::read_parquet(file.path(testdata, folder, paste0(x, pattern))))
+  })
+
+  return(input)
+}
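
The new test helper simply maps arrow::read_parquet over a set of base filenames under ../testdata and returns the tables as tibbles. An illustrative call, using filenames that appear in the tests below:

# Reads ../testdata/clusters/RCX_06_shortened_extracted_clusters.parquet, etc.,
# returning a list of tibbles in the same order as the input names.
extracted <- read_parquet_files(
  c("RCX_06_shortened", "RCX_07_shortened"),
  "clusters",
  "_extracted_clusters.parquet"
)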

tests/testthat/test-adjust-time.R: 52 changes (26 additions, 26 deletions)

@@ -3,11 +3,7 @@ patrick::with_parameters_test_that(
   {
     testdata <- file.path("..", "testdata")

-    filenames <- lapply(files, function(x) {
-      file.path(testdata, "clusters", paste0(x, "_extracted_clusters.parquet"))
-    })
-
-    extracted <- lapply(filenames, arrow::read_parquet)
+    extracted <- read_parquet_files(files, "clusters", "_extracted_clusters.parquet")
     template_features <- compute_template(extracted)

     expected <- file.path(testdata, "template", "RCX_shortened.parquet")

@@ -30,31 +26,35 @@ patrick::with_parameters_test_that(
     template_features <- file.path(testdata, "template", "RCX_shortened.parquet")
     template_features <- arrow::read_parquet(template_features)

-    extracted <- file.path(testdata, "clusters", paste0(.test_name, "_extracted_clusters.parquet"))
-    extracted <- arrow::read_parquet(extracted)
+    extracted <- read_parquet_files(files, "clusters", "_extracted_clusters.parquet")

-    corrected <- correct_time(
-      this.feature = extracted,
-      template_features = template_features,
-      mz_tol_relative = mz_tol_relative,
-      rt_tol_relative = rt_tol_relative
-    )
+    corrected <- lapply(extracted, function(x) {
+      correct_time(x, template_features)
+    })

-    expected <- tibble::as_tibble(arrow::read_parquet(file.path(testdata, "adjusted", paste0(.test_name, ".parquet"))))
-    expect_equal(corrected, expected)
+    expected <- read_parquet_files(files, "adjusted", ".parquet")
+    expect_equal(corrected, expected, tolerance = 0.01)
   },
   patrick::cases(
-    RCX_06_shortened = list(
-      mz_tol_relative = 6.856763e-06,
-      rt_tol_relative = 3.618581
-    ),
-    RCX_07_shortened = list(
-      mz_tol_relative = 6.856763e-06,
-      rt_tol_relative = 3.618581
-    ),
-    RCX_08_shortened = list(
-      mz_tol_relative = 6.856763e-06,
-      rt_tol_relative = 3.618581
+    RCX_shortened = list(
+      files = c("RCX_06_shortened", "RCX_07_shortened", "RCX_08_shortened")
     )
   )
 )

+test_that("correct_time_v2 is close to correct time", {
+  template_features <- arrow::read_parquet(
+    file.path("..", "testdata", "template", "RCX_shortened.parquet")
+  )
+
+  clustered_table <- read_parquet_files(
+    c("RCX_06_shortened"),
+    "clusters",
+    "_extracted_clusters.parquet"
+  )[[1]]
+
+  expected <- correct_time(clustered_table, template_features)
+  actual <- correct_time_v2(clustered_table, template_features)
+
+  expect_equal(actual, expected, tolerance = 0.02)
+})
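
Note on the looser assertions: under testthat's third edition, expect_equal()'s tolerance is interpreted as a relative numeric difference (waldo semantics), so tolerance = 0.01 allows roughly 1% drift between the two correction paths. A standalone illustration:

testthat::expect_equal(1.000, 1.005, tolerance = 0.01)  # passes: ~0.5% relative difference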

tests/testthat/test-compute_clusters.R: 21 changes (5 additions, 16 deletions)

@@ -3,10 +3,7 @@ patrick::with_parameters_test_that(
   {
     testdata <- file.path("..", "testdata")

-    extracted <- lapply(files, function(x) {
-      xx <- file.path(testdata, input, paste0(x, ".parquet"))
-      tibble::as_tibble(arrow::read_parquet(xx))
-    })
+    extracted <- read_parquet_files(files, input, ".parquet")

     actual <- compute_clusters(
       feature_tables = extracted,

@@ -18,10 +15,7 @@ patrick::with_parameters_test_that(
       sample_names = files
     )

-    expected <- lapply(files, function(x) {
-      filepath <- file.path(testdata, "clusters", paste0(x, "_", input, "_clusters.parquet"))
-      tibble::as_tibble(arrow::read_parquet(filepath))
-    })
+    expected <- read_parquet_files(files, "clusters", paste0("_", input, "_clusters.parquet"))

     for(i in seq_along(files)) {

@@ -56,10 +50,8 @@ test_that("compute clusters simple", {
   testdata <- file.path("..", "testdata")
   files <- c("8_qc_no_dil_milliq", "21_qc_no_dil_milliq", "29_qc_no_dil_milliq")

-  extracted <- lapply(files, function(x) {
-    xx <- file.path(testdata, "extracted", paste0(x, ".mzml.parquet"))
-    tibble::as_tibble(arrow::read_parquet(xx))
-  })
+  extracted <- read_parquet_files(files, "extracted", ".mzml.parquet")
+
   actual <- compute_clusters_simple(
     feature_tables = extracted,
     sample_names = files,

@@ -69,10 +61,7 @@ test_that("compute clusters simple", {
   actual <- actual[order(sapply(actual, function(x) x$sample_id[1]))]

-  expected <- lapply(files, function(x) {
-    file <- file.path(testdata, "clusters", paste0(x, ".parquet"))
-    tibble::as_tibble(arrow::read_parquet(file))
-  })
+  expected <- read_parquet_files(files, "clusters", ".parquet")

   expected <- expected[order(sapply(expected, function(x) x$sample_id[1]))]