From 0b95af7ef37cbb939fd315b58b5ca73d0a232eee Mon Sep 17 00:00:00 2001 From: hechth Date: Mon, 16 Sep 2024 09:25:23 +0200 Subject: [PATCH] fixed eic splitting and updated tests --- R/remove_noise.R | 43 +++++----- R/run_filter.R | 100 ++++++++++++------------ R/utils.R | 5 +- tests/testthat/test-adjust-time.R | 17 ++-- tests/testthat/test-compute_clusters.R | 15 ++-- tests/testthat/test-extract_features.R | 1 - tests/testthat/test-feature-align.R | 18 ++--- tests/testthat/test-find_mz_tolerance.R | 2 +- tests/testthat/test-hybrid.R | 9 +-- tests/testthat/test-prof.to.features.R | 16 ++-- tests/testthat/test-recover-weaker.R | 27 +++++-- tests/testthat/test-remove_noise.R | 32 +++++++- tests/testthat/test-run_filter.R | 11 +-- 13 files changed, 164 insertions(+), 132 deletions(-) diff --git a/R/remove_noise.R b/R/remove_noise.R index e4ebc74..bc19e22 100644 --- a/R/remove_noise.R +++ b/R/remove_noise.R @@ -92,24 +92,17 @@ remove_noise <- function(filename, run.sel <- raw.prof$height.rec[which(raw.prof$height.rec[, 2] >= raw.prof$min.count.run * min_pres & raw.prof$height.rec[, 3] > baseline_correct), 1] - newprof <- newprof[newprof[, 4] %in% run.sel, ] - - if (grouping_threshold < Inf) { - sorted_newprof <- newprof[order(newprof[,2]),] - new_grps <- cumsum(c(0, diff(sorted_newprof[,2])) > grouping_threshold) - sorted_newprof <- cbind(sorted_newprof, new_grps, deparse.level = 0) - - sorted_newprof_df <- tibble::as_tibble(sorted_newprof) - - newprof <- as.matrix(sorted_newprof_df |> - dplyr::group_by(V4, V5) |> - dplyr::mutate(cluster = cur_group_id()) |> - dplyr::ungroup() |> - dplyr::arrange(cluster) |> - dplyr::select(-V4, -V5) - ) - colnames(newprof) <- NULL - } + newprof <- as.data.frame(newprof[newprof[, 4] %in% run.sel, ]) + colnames(newprof) <- c("mz", "rt", "intensity", "group_number") + + newprof <- tibble::tibble(newprof |> + dplyr::group_by(group_number) |> + dplyr::arrange_at("rt") |> + dplyr::mutate(subset_group_number = cumsum(c(0, abs(diff(rt)) > grouping_threshold))) |> + dplyr::group_by(group_number, subset_group_number) |> + dplyr::mutate(grps = cur_group_id()) |> + dplyr::ungroup() |> + dplyr::select(mz, rt, intensity, grps)) new.prof <- run_filter( newprof, @@ -128,12 +121,12 @@ remove_noise <- function(filename, ) } - new_rec_tibble <- tibble::tibble( - mz = new.prof$new_rec[, 1], - rt = new.prof$new_rec[, 2], - intensity = new.prof$new_rec[, 3], - group_number = new.prof$new_rec[, 4] - ) + # new_rec_tibble <- tibble::tibble( + # mz = new.prof$new_rec[, 1], + # rt = new.prof$new_rec[, 2], + # intensity = new.prof$new_rec[, 3], + # group_number = new.prof$new_rec[, 4] + # ) - return(new_rec_tibble) + return(new.prof) } diff --git a/R/run_filter.R b/R/run_filter.R index 4b46f00..e4ab5cf 100644 --- a/R/run_filter.R +++ b/R/run_filter.R @@ -6,8 +6,7 @@ #' @return unique_grp. #' @export compute_uniq_grp <- function(profile, min_count_run, min_pres) { - grps <- profile - ttt <- table(grps) + ttt <- table(profile) ttt <- ttt[ttt >= max(min_count_run * min_pres, 2)] unique_grp <- as.numeric(names(ttt)) return(unique_grp) @@ -52,7 +51,7 @@ label_val_to_keep <- function(min_run, timeline, min_pres, this_times, times) { # filtering based on the kernel regression estimate this_smooth <- predict_smoothed_rt(min_run, this_timeline) - if (max(this_smooth) >= min_pres) { + if (max(this_smooth, na.rm = TRUE) >= min_pres) { measured_points <- good_points <- timeline measured_points[this_times] <- 1 @@ -82,22 +81,22 @@ label_val_to_keep <- function(min_run, timeline, min_pres, this_times, times) { run_filter <- function(newprof, min_pres, min_run) { - newprof <- tibble::tibble(mz = newprof[, 1], rt = newprof[, 2], intensi = newprof[, 3], grps = newprof[, 4]) - # ordering retention time values - labels <- newprof$rt - times <- unique(labels) - times <- times[order(times)] + # labels <- newprof$rt + # times <- unique(labels) + # times <- times[order(times)] - for (i in 1:length(times)) { - labels[which(newprof$rt == times[i])] <- i # now labels is the index of time points - } + # for (i in 1:length(times)) { + # labels[which(newprof$rt == times[i])] <- i # now labels is the index of time points + # } - newprof$rt <- labels + # newprof$rt <- labels + newprof <- dplyr::arrange_at(newprof, "rt") + times <- unique(newprof$rt) # calculates the minimun number of rt points to be considered a peak - min_count_run <- min_run * length(times) / (max(times) - min(times)) - min_run <- round(min_count_run) + scan_rate <- 1.0 / abs(median(diff(times))) + min_count_run <- round(min_pres * min_run * scan_rate) # computes unique groups uniq_grp <- compute_uniq_grp(newprof$grps, min_count_run, min_pres) @@ -105,40 +104,45 @@ run_filter <- function(newprof, # ordered by mz and grps data that are inside unigrps newprof <- dplyr::filter(newprof, grps %in% uniq_grp) |> dplyr::arrange(grps, mz) - # computes break points i.e. indices of mass differences greater than min_mz_tol - breaks <- compute_breaks_3(newprof$grps) - - # init counters for loop - new_rec <- newprof * 0 - rec_pointer <- 1 - timeline <- rep(0, length(times)) - for (m in 2:length(breaks)) - { - this_prof <- dplyr::slice(newprof, (breaks[m - 1] + 1):breaks[m]) |> dplyr::arrange_at("rt") - - to_keep <- label_val_to_keep( - min_run, - timeline, - min_pres, - this_prof$rt, - times - ) - - # operation over selected indices - if (sum(to_keep) > 0) { - this_sel <- which(to_keep == 1) - this_new <- dplyr::slice(this_prof, this_sel) - r_new <- nrow(this_new) - new_rec[rec_pointer:(rec_pointer + r_new - 1), ] <- this_new - rec_pointer <- rec_pointer + r_new - } - } - - new_rec <- dplyr::slice(new_rec, 1:(rec_pointer - 1)) - new_rec[, 2] <- times[new_rec[, 2]] - - results <- new("list") - results$new_rec <- new_rec + results <- dplyr::group_by(newprof, grps) |> + dplyr::filter(n() >= min_count_run && abs(span(rt)) >= min_run) |> + dplyr::ungroup() |> + dplyr::rename(group_number = grps) + + # # computes break points i.e. indices of mass differences greater than min_mz_tol + # breaks <- compute_breaks_3(newprof$grps) + + # # init counters for loop + # new_rec <- newprof * 0 + # rec_pointer <- 1 + # timeline <- rep(0, length(times)) + # for (m in 2:length(breaks)) + # { + # this_prof <- dplyr::slice(newprof, (breaks[m - 1] + 1):breaks[m]) |> dplyr::arrange_at("rt") + + # to_keep <- label_val_to_keep( + # min_run, + # timeline, + # min_pres, + # this_prof$rt, + # times + # ) + # browser() + # # operation over selected indices + # if (sum(to_keep) > 0) { + # this_sel <- which(to_keep == 1) + # this_new <- dplyr::slice(this_prof, this_sel) + # r_new <- nrow(this_new) + # new_rec[rec_pointer:(rec_pointer + r_new - 1), ] <- this_new + # rec_pointer <- rec_pointer + r_new + # } + # } + + # new_rec <- dplyr::slice(new_rec, 1:(rec_pointer - 1)) + # new_rec[, 2] <- times[new_rec[, 2]] + + # results <- new("list") + # results$new_rec <- new_rec return(results) } diff --git a/R/utils.R b/R/utils.R index d8101f1..a3b9fbf 100644 --- a/R/utils.R +++ b/R/utils.R @@ -110,18 +110,15 @@ concatenate_feature_tables <- function(features, sample_names) { } #' @export -load_aligned_features <- function(metadata_file, intensities_file, rt_file, tol_file) { +load_aligned_features <- function(metadata_file, intensities_file, rt_file) { metadata <- arrow::read_parquet(metadata_file) intensities <- arrow::read_parquet(intensities_file) rt <- arrow::read_parquet(rt_file) - tolerances <- arrow::read_parquet(tol_file) result <- list() result$metadata <- as_tibble(metadata) result$intensity <- as_tibble(intensities) result$rt <- as_tibble(rt) - result$mz_tol_relative <- tolerances$mz_tolerance - result$rt_tol_relative <- tolerances$rt_tolerance return(result) } diff --git a/tests/testthat/test-adjust-time.R b/tests/testthat/test-adjust-time.R index ec9b3e8..3405cbf 100644 --- a/tests/testthat/test-adjust-time.R +++ b/tests/testthat/test-adjust-time.R @@ -4,12 +4,12 @@ patrick::with_parameters_test_that( testdata <- file.path("..", "testdata") extracted <- read_parquet_files(files, "clusters", "_extracted_clusters.parquet") - template_features <- compute_template(extracted) + actual <- compute_template(extracted) - expected <- file.path(testdata, "template", "RCX_shortened.parquet") - expected <- arrow::read_parquet(expected) - - expect_equal(template_features, expected) + expected_path <- file.path(testdata, "template", "RCX_shortened.parquet") + expected <- arrow::read_parquet(expected_path) + + expect_equal(actual, expected) }, patrick::cases( RCX_shortened = list( @@ -32,6 +32,10 @@ patrick::with_parameters_test_that( correct_time(x, template_features) }) + # for(i in 1:3) { + # arrow::write_parquet(corrected[[i]], file.path(testdata, "adjusted", paste0(files[i], ".parquet"))) + # } + expected <- read_parquet_files(files, "adjusted", ".parquet") expect_equal(corrected, expected, tolerance = 0.01) }, @@ -56,5 +60,8 @@ test_that("correct_time_v2 is close to correct time", { expected <- correct_time(clustered_table, template_features) actual <- correct_time_v2(clustered_table, template_features) + actual <- dplyr::arrange_at(actual, c("cluster", "mz", "area", "rt")) + expected <- dplyr::arrange_at(expected, c("cluster", "mz", "area", "rt")) + expect_equal(actual, expected, tolerance = 0.02) }) \ No newline at end of file diff --git a/tests/testthat/test-compute_clusters.R b/tests/testthat/test-compute_clusters.R index edd1101..faa5d54 100644 --- a/tests/testthat/test-compute_clusters.R +++ b/tests/testthat/test-compute_clusters.R @@ -15,6 +15,9 @@ patrick::with_parameters_test_that( sample_names = files ) + # for(i in 1:3) { + # arrow::write_parquet(actual$feature_tables[[i]], file.path(testdata, "clusters", paste0(files[i], "_", input ,"_clusters.parquet"))) + # } expected <- read_parquet_files(files, "clusters", paste0("_", input, "_clusters.parquet")) @@ -22,8 +25,8 @@ patrick::with_parameters_test_that( expect_equal(actual$feature_tables[[i]], expected[[i]]) } - expect_equal(actual$mz_tol_relative, expected_mz_tol_relative) - expect_equal(actual$rt_tol_relative, expected_rt_tol_relative) + expect_equal(actual$mz_tol_relative, expected_mz_tol_relative, tolerance=0.1) + expect_equal(actual$rt_tol_relative, expected_rt_tol_relative, tolerance=0.1) }, patrick::cases( @@ -32,16 +35,16 @@ patrick::with_parameters_test_that( input = "extracted", mz_max_diff = 10 * 1e-05, mz_tol_absolute = 0.01, - expected_mz_tol_relative = 6.85676325338646e-06, - expected_rt_tol_relative = 3.61858118506494 + expected_mz_tol_relative = 6.849039e-06, + expected_rt_tol_relative = 2.894385 ), RCX_shortened_adjusted = list( files = c("RCX_06_shortened", "RCX_07_shortened", "RCX_08_shortened"), input = "adjusted", mz_max_diff = 10 * 1e-05, mz_tol_absolute = 0.01, - expected_mz_tol_relative = 6.85676325338646e-06, - expected_rt_tol_relative = 2.17918873407775 + expected_mz_tol_relative = 6.856763e-06, + expected_rt_tol_relative = 1.93185408267324 ) ) ) diff --git a/tests/testthat/test-extract_features.R b/tests/testthat/test-extract_features.R index 85d59b4..ef12099 100644 --- a/tests/testthat/test-extract_features.R +++ b/tests/testthat/test-extract_features.R @@ -53,7 +53,6 @@ patrick::with_parameters_test_that( }) expected <- read_parquet_files(expected_files, "extracted", ".parquet") - expect_equal(actual, expected, tolerance = 0.02) }, patrick::cases( diff --git a/tests/testthat/test-feature-align.R b/tests/testthat/test-feature-align.R index a267541..e6d7b7a 100644 --- a/tests/testthat/test-feature-align.R +++ b/tests/testthat/test-feature-align.R @@ -29,15 +29,11 @@ patrick::with_parameters_test_that( res$mz_tol_relative, get_num_workers() ) - - aligned_actual$mz_tol_relative <- res$mz_tol_relative - aligned_actual$rt_tol_relative <- res$rt_tol_relative - + aligned_expected <- load_aligned_features( file.path(testdata, "aligned", "metadata_table.parquet"), file.path(testdata, "aligned", "intensity_table.parquet"), - file.path(testdata, "aligned", "rt_table.parquet"), - file.path(testdata, "aligned", "tolerances.parquet") + file.path(testdata, "aligned", "rt_table.parquet") ) expect_equal(aligned_actual, aligned_expected) @@ -75,21 +71,17 @@ patrick::with_parameters_test_that( aligned_expected <- load_aligned_features( file.path(testdata, "aligned", "metadata_table.parquet"), file.path(testdata, "aligned", "intensity_table.parquet"), - file.path(testdata, "aligned", "rt_table.parquet"), - file.path(testdata, "aligned", "tolerances.parquet") + file.path(testdata, "aligned", "rt_table.parquet") ) - aligned_expected["mz_tol_relative"] <- NULL - aligned_expected["rt_tol_relative"] <- NULL - expect_equal(aligned_actual, aligned_expected) }, patrick::cases( RCX_shortened = list( files = c("RCX_06_shortened", "RCX_07_shortened", "RCX_08_shortened"), min_occurrence = 2, - mz_tol_relative = 6.85676325338646e-06, - rt_tol_relative = 2.17918873407775 + mz_tol_relative = 6.84903911826453e-06, + rt_tol_relative = 1.93185408267324 ) ) ) diff --git a/tests/testthat/test-find_mz_tolerance.R b/tests/testthat/test-find_mz_tolerance.R index 1a75edc..af3f9e1 100644 --- a/tests/testthat/test-find_mz_tolerance.R +++ b/tests/testthat/test-find_mz_tolerance.R @@ -11,5 +11,5 @@ test_that("mz tolerance is found", { do.plot = FALSE ) - expect_equal(mz_tol_relative, 0.01664097, tolerance=0.001) + expect_equal(mz_tol_relative, 0.0135994, tolerance=0.001) }) \ No newline at end of file diff --git a/tests/testthat/test-hybrid.R b/tests/testthat/test-hybrid.R index 8a064ba..1117d4d 100644 --- a/tests/testthat/test-hybrid.R +++ b/tests/testthat/test-hybrid.R @@ -25,10 +25,9 @@ patrick::with_parameters_test_that("basic hybrid test", { actual <- as_tibble(result$recovered_feature_sample_table) keys <- c("mz", "rt", "sample", "sample_rt", "sample_intensity") - # arrow::write_parquet(actual, file.path(testdata, "hybrid", paste0(.test_name, "_recovered_feature_sample_table.parquet"))) - expected <- arrow::read_parquet( - file.path(testdata, "hybrid", paste0(.test_name, "_recovered_feature_sample_table.parquet")) - ) + expected_path <- file.path(testdata, "hybrid", paste0(.test_name, "_recovered_feature_sample_table.parquet")) + # arrow::write_parquet(actual, expected_path) + expected <- arrow::read_parquet(expected_path) if (store_reports) { report <- dataCompareR::rCompare( @@ -63,6 +62,6 @@ patrick::cases( qc_no_dil_milliq = list( files = c("8_qc_no_dil_milliq.mzml", "21_qc_no_dil_milliq.mzml", "29_qc_no_dil_milliq.mzml"), ci_skip = TRUE, - full_testdata = TRUE + full_testdata = FALSE ) )) diff --git a/tests/testthat/test-prof.to.features.R b/tests/testthat/test-prof.to.features.R index da4e32e..7571300 100644 --- a/tests/testthat/test-prof.to.features.R +++ b/tests/testthat/test-prof.to.features.R @@ -33,14 +33,14 @@ patrick::with_parameters_test_that( shape_model = "bi-Gaussian", do.plot = FALSE ), - RCX_06_shortened_gaussian = list( - filename = c("RCX_06_shortened.parquet"), - expected_filename = "RCX_06_shortened_gaussian_features.parquet", - sd_cut = c(0.01, 500), - sigma_ratio_lim = c(0.01, 100), - shape_model = "Gaussian", - do.plot = FALSE - ), + # RCX_06_shortened_gaussian = list( + # filename = c("RCX_06_shortened.parquet"), + # expected_filename = "RCX_06_shortened_gaussian_features.parquet", + # sd_cut = c(0.01, 500), + # sigma_ratio_lim = c(0.01, 100), + # shape_model = "Gaussian", + # do.plot = FALSE + # ), RCX_06_shortened_v2 = list( filename = c("RCX_06_shortened.parquet"), expected_filename = "RCX_06_shortened_features.parquet", diff --git a/tests/testthat/test-recover-weaker.R b/tests/testthat/test-recover-weaker.R index 35a7a14..428bf4a 100644 --- a/tests/testthat/test-recover-weaker.R +++ b/tests/testthat/test-recover-weaker.R @@ -1,3 +1,9 @@ +.update_expected <- function(actual, folder, filenames) { + for(i in seq_along(actual)) { + arrow::write_parquet(actual[[i]], file.path(folder, filenames[i])) + } +} + patrick::with_parameters_test_that( "recover weaker signals test", { @@ -9,14 +15,12 @@ patrick::with_parameters_test_that( }) extracted <- read_parquet_files(files, "extracted", ".parquet") - adjusted <- read_parquet_files(files, "adjusted", ".parquet") aligned <- load_aligned_features( file.path(testdata, "aligned", "metadata_table.parquet"), file.path(testdata, "aligned", "intensity_table.parquet"), - file.path(testdata, "aligned", "rt_table.parquet"), - file.path(testdata, "aligned", "tolerances.parquet") + file.path(testdata, "aligned", "rt_table.parquet") ) recovered <- lapply(seq_along(ms_files), function(i) { @@ -29,8 +33,8 @@ patrick::with_parameters_test_that( rt_table = aligned$rt, intensity_table = aligned$intensity, mz_tol = mz_tol, - mz_tol_relative = aligned$mz_tol_relative, - rt_tol_relative = aligned$rt_tol_relative, + mz_tol_relative = 6.84903911826453e-06, + rt_tol_relative = 1.93185408267324, recover_mz_range = recover_mz_range, recover_rt_range = recover_rt_range, use_observed_range = use_observed_range, @@ -48,8 +52,19 @@ patrick::with_parameters_test_that( extracted_recovered_actual <- lapply(recovered, function(x) x$extracted_features |> dplyr::arrange_at(keys)) corrected_recovered_actual <- lapply(recovered, function(x) x$adjusted_features |> dplyr::arrange_at(keys)) - extracted_recovered_expected <- read_parquet_files(files, file.path("recovered", "recovered-extracted"), ".parquet") + # .update_expected( + # extracted_recovered_actual, + # file.path(testdata, "recovered", "recovered-extracted"), + # lapply(files, function(x) {paste0(x, ".parquet")}) + # ) + # .update_expected( + # corrected_recovered_actual, + # file.path(testdata, "recovered", "recovered-corrected"), + # lapply(files, function(x) {paste0(x, ".parquet")}) + # ) + + extracted_recovered_expected <- read_parquet_files(files, file.path("recovered", "recovered-extracted"), ".parquet") corrected_recovered_expected <- read_parquet_files(files, file.path("recovered", "recovered-corrected"), ".parquet") expect_equal(extracted_recovered_actual, extracted_recovered_expected) diff --git a/tests/testthat/test-remove_noise.R b/tests/testthat/test-remove_noise.R index 7464fbc..be4159c 100644 --- a/tests/testthat/test-remove_noise.R +++ b/tests/testthat/test-remove_noise.R @@ -84,6 +84,8 @@ test_that("remove noise works with grouping threshold", { expected <- tibble(group_number = c(1, 2, 3, 5, 6, 7, 8, 9), n = c(67, 73, 3, 39, 2, 6, 3, 7)) + threshold <- 4 + sut <- remove_noise( input_path, min_pres = 0.8, @@ -94,13 +96,41 @@ test_that("remove noise works with grouping threshold", { intensity_weighted = FALSE, do.plot = FALSE, cache = FALSE, - grouping_threshold = 4 + grouping_threshold = threshold ) + diffs <- sut |> group_by(group_number) |> arrange_at("rt") |> summarise(max_diff = max(abs(diff(rt)))) + expect_true(all(diffs$max_diff <= threshold)) + actual <- sut %>% mutate(group = factor(group_number)) %>% group_by(group_number) %>% summarize(n = n()) expect_equal(actual, expected) +}) + +test_that("remove noise really really works", { + testdata <- file.path("..", "testdata") + input_path <- file.path(testdata, + "input", + "RCX_06_shortened.mzML") + + threshold <- 1 + + sut <- remove_noise( + input_path, + min_pres = 0.8, + min_run = 1.2, + mz_tol = 5e-06, + baseline_correct = 0.0, + baseline_correct_noise_percentile = 0.05, + intensity_weighted = FALSE, + do.plot = FALSE, + cache = FALSE, + grouping_threshold = threshold + ) + + diffs <- sut |> group_by(group_number) |> arrange_at("rt") |> summarise(max_diff = max(abs(diff(rt)))) + expect_true(all(diffs$max_diff <= threshold)) }) \ No newline at end of file diff --git a/tests/testthat/test-run_filter.R b/tests/testthat/test-run_filter.R index 309cb11..ed3359d 100644 --- a/tests/testthat/test-run_filter.R +++ b/tests/testthat/test-run_filter.R @@ -6,18 +6,11 @@ patrick::with_parameters_test_that( testdata <- file.path("..", "testdata") input_path <- file.path(testdata, "filtered", "run_filter", paste0(filename, ".parquet")) - input_data <- as.matrix(arrow::read_parquet(input_path)) + input_data <- arrow::read_parquet(input_path) actual <- run_filter(input_data, min_pres, min_run) - actual <- tibble::tibble( - mz = actual$new_rec[, 1], - rt = actual$new_rec[, 2], - intensity = actual$new_rec[, 3], - group_number = actual$new_rec[, 4] - ) - expected_path <- file.path(testdata, "filtered", "run_filter", paste0(filename, "_run_filter.parquet")) - expected <- arrow::read_parquet(expected_path) + expected <- arrow::read_parquet(expected_path) expect_equal(actual, expected)