diff --git a/R/remove_noise.R b/R/remove_noise.R index 07ca42d..e4ebc74 100644 --- a/R/remove_noise.R +++ b/R/remove_noise.R @@ -60,6 +60,7 @@ load_data <- function(filename, #' @param intensity_weighted Whether to use intensity to weight mass density estimation. #' @param do.plot Indicates whether plot should be drawn. #' @param cache Whether to use cache +#' @param grouping_threshold The maximum difference between two scans to be considered the same EIC. Default is Inf. #' @return A matrix with four columns: m/z value, retention time, intensity, and group number. #' @export remove_noise <- function(filename, @@ -70,7 +71,8 @@ remove_noise <- function(filename, baseline_correct_noise_percentile, intensity_weighted, do.plot, - cache) { + cache, + grouping_threshold = Inf) { raw.data <- load_file(filename) raw.prof <- adaptive.bin( @@ -87,10 +89,28 @@ remove_noise <- function(filename, raw.prof$features$intensities, raw.prof$features$grps ) - + run.sel <- raw.prof$height.rec[which(raw.prof$height.rec[, 2] >= raw.prof$min.count.run * min_pres & raw.prof$height.rec[, 3] > baseline_correct), 1] newprof <- newprof[newprof[, 4] %in% run.sel, ] + + if (grouping_threshold < Inf) { + sorted_newprof <- newprof[order(newprof[,2]),] + new_grps <- cumsum(c(0, diff(sorted_newprof[,2])) > grouping_threshold) + sorted_newprof <- cbind(sorted_newprof, new_grps, deparse.level = 0) + + sorted_newprof_df <- tibble::as_tibble(sorted_newprof) + + newprof <- as.matrix(sorted_newprof_df |> + dplyr::group_by(V4, V5) |> + dplyr::mutate(cluster = cur_group_id()) |> + dplyr::ungroup() |> + dplyr::arrange(cluster) |> + dplyr::select(-V4, -V5) + ) + colnames(newprof) <- NULL + } + new.prof <- run_filter( newprof, min_pres = min_pres, diff --git a/conda/environment-dev.yaml b/conda/environment-dev.yaml index ba6f375..c903498 100644 --- a/conda/environment-dev.yaml +++ b/conda/environment-dev.yaml @@ -5,16 +5,16 @@ channels: - defaults dependencies: - r-base - - icu <=70.1 + - icu - r-mass - r-rgl - - bioconductor-mzR ==2.28.0 + - bioconductor-mzR ==2.36.0 - r-splines2 - r-doparallel - r-foreach - r-snow - r-rcpp - - r-arrow >=7.0.0,<10.0.0 + - r-arrow - r-dplyr - r-tidyr - r-stringr diff --git a/tests/remote-files/input.txt b/tests/remote-files/input.txt index 4afb4bf..80f98a8 100644 --- a/tests/remote-files/input.txt +++ b/tests/remote-files/input.txt @@ -6,4 +6,5 @@ https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/ https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/input/RCX_08_shortened.mzML https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/input/single_eic.mzml https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/input/alg3.mzdata -https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/input/test_file.mzXML \ No newline at end of file +https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/input/test_file.mzXML +https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/input/Tribrid_201106_009-QC1_1_NEG_FISABIO_single_eic.raw.mzML \ No newline at end of file diff --git a/tests/testthat/test-load.lcms.R b/tests/testthat/test-load.lcms.R index dd55f6e..d9c56cc 100644 --- a/tests/testthat/test-load.lcms.R +++ b/tests/testthat/test-load.lcms.R @@ -10,6 +10,10 @@ create_test_case <- function(filename, mz_length, rt_length, intensities_length) patrick::with_parameters_test_that( "test load.lcms reads different file types", { + if(packageVersion("mzR") >= "2.29.0" && tools::file_ext(filename) == "mzdata") { + print("mzR >= 2.29.0 no longer supports mzdata.") + skip() + } # Arrange: Set up test inputs testdata <- file.path("..", "testdata") input_path <- file.path(testdata, "input", filename) diff --git a/tests/testthat/test-remove_noise.R b/tests/testthat/test-remove_noise.R index 2691fec..7464fbc 100644 --- a/tests/testthat/test-remove_noise.R +++ b/tests/testthat/test-remove_noise.R @@ -74,3 +74,33 @@ patrick::with_parameters_test_that( ) ) ) + +test_that("remove noise works with grouping threshold", { + testdata <- file.path("..", "testdata") + input_path <- file.path(testdata, + "input", + "Tribrid_201106_009-QC1_1_NEG_FISABIO_single_eic.raw.mzML") + + expected <- tibble(group_number = c(1, 2, 3, 5, 6, 7, 8, 9), + n = c(67, 73, 3, 39, 2, 6, 3, 7)) + + sut <- remove_noise( + input_path, + min_pres = 0.8, + min_run = 0.2, + mz_tol = 5e-05, + baseline_correct = 0.0, + baseline_correct_noise_percentile = 0.05, + intensity_weighted = FALSE, + do.plot = FALSE, + cache = FALSE, + grouping_threshold = 4 + ) + + actual <- sut %>% + mutate(group = factor(group_number)) %>% + group_by(group_number) %>% + summarize(n = n()) + + expect_equal(actual, expected) +}) \ No newline at end of file