Merge pull request #44 from KristinaGomoryova/findRecalSeries

function to find top 10 recalibrant series
RECETOX · Sep 5, 2024 · e08ee5f · e08ee5f
2 parents 79e56fc + 0380c83
commit e08ee5f
Show file tree

Hide file tree

Showing 16 changed files with 301 additions and 1 deletion.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -21,7 +21,8 @@ Depends: R (>= 3.4.1)
 Imports: dplyr (>= 0.7.6),
          tidyr (>= 0.8.1),
          ggplot2 (>= 3.0.0),
-	 	 colorRamps (>= 2.3)
+	 	 colorRamps (>= 2.3),
+		 gtools (>= 3.9.5)
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.3.1
 Suggests: knitr,

diff --git a/R/FindRecalSeries.R b/R/FindRecalSeries.R
@@ -0,0 +1,223 @@
+#' Filters the input dataframe
+#' This function filters the input dataframe based on abundance score threshold and peak distance threshold; and
+#' computes the length of the series.
+#' @param df DataFrame An output from RecalList, containing recalibrant CH2 series.
+#' @param abundance_score_threshold Float A threshold for filtering abundance score parameter. The series with higher values #' are better. Default value is 100.
+#' @param peak_distance_threshold Float A threshold for the peak distance parameter. The closer this value is to 1, the
+#' better.
+#' @return DataFrame A filtered dataframe.
+filter_recal_series <- function(df, abundance_score_threshold, peak_distance_threshold) {
+  df <- df %>%
+    dplyr::filter(Abundance.Score > abundance_score_threshold) %>%
+    dplyr::filter(Peak.Distance < peak_distance_threshold) %>%
+    tidyr::separate(col = Mass.Range, into = c("Min.Mass.Range", "Max.Mass.Range"), sep = "-") %>%
+    dplyr::mutate(
+      Min.Mass.Range = as.numeric(Min.Mass.Range),
+      Max.Mass.Range = as.numeric(Max.Mass.Range)
+    ) %>%
+    dplyr::mutate(Series.Length = Max.Mass.Range - Min.Mass.Range)
+
+  return(df)
+}
+
+#' Computes all possible combinations of series
+#' This function computes all possible combinations of series. The number of combinations is specified by user.
+#' @param df DataFrame A pre-filtered dataframe containing series.
+#' @param n Integer Number of combinations to compute.
+#'
+#' @return DataFrame A dataframe of row-index based combinations.
+compute_combinations <- function(df, n) {
+  combs <- gtools::combinations(nrow(df), n, v = seq_len(nrow(df)))
+  return(combs)
+}
+
+#' Computes subsets based on combinations.
+#' This function selects subsets of the input data based on the combinations of series.
+#' @param df DataFrame A pre-filtered dataframe containing series.
+#' @param n Integer Number of combinations to compute.
+#'
+#' @return A list of subsets based on series combinations.
+compute_subsets <- function(df, n) {
+  subsets <- apply(compute_combinations(df, n), 1, function(x) {
+    df[x, ]
+  })
+  return(subsets)
+}
+
+#' Computes coverage of a subset.
+#' This function computes m/z range coverage of a particular subset. The higher coverage, the better.
+#' @param subset DataFrame A subset of pre-filtered input data, has a size of combination number.
+#' @param global_min Float A lower bound of the instrument m/z range.
+#' @param global_max Float A higher bound of the instrument m/z range.
+#'
+#' @return Coverage (in %) of a particular subset.
+compute_coverage <- function(subset, global_max, global_min) {
+  subset <- subset %>%
+    dplyr::arrange(Min.Mass.Range)
+
+  #subset <- subset[order(subset$Min.Mass.Range), ]
+
+  # Initialize the coverage and the end of the last segment
+  total_coverage <- 0
+  last_end <- -Inf
+
+  # Iterate through the intervals
+  for (i in seq_len(nrow(subset))) {
+    current_start <- subset$Min.Mass.Range[i]
+    current_end <- subset$Max.Mass.Range[i]
+
+    if (current_start > last_end) {
+      # Non-overlapping segment, add the full length to coverage
+      total_coverage <- total_coverage + (current_end - current_start)
+    } else {
+      # Overlapping segment, add only the non-overlapping portion
+      total_coverage <- total_coverage + max(0, current_end - last_end)
+    }
+
+    # Update the last_end to the current segment's end
+    last_end <- max(last_end, current_end)
+  }
+  coverage_percent <- total_coverage / (global_max - global_min) * 100
+  return(coverage_percent)
+}
+
+#' Filters subsets based on the coverage.
+#' This function filters the subsets based on the coverage threshold.
+#' @param subsets DataFrame A subset of pre-filtered input data, has a size of combination number.
+#' @param coverage_threshold Integer How many % of the m/z range should be covered. Default is 90 %.
+#' @param global_min Float A lower bound of the instrument m/z range.
+#' @param global_max Float A higher bound of the instrument m/z range.
+#'
+#' @return List A list of subsets which pass the coverage threshold (their coverage is higher than the threshold)
+filter_subsets_based_on_coverage <- function(subsets, coverage_threshold, global_max, global_min) {
+  filtered_subsets <- Filter(function(x) compute_coverage(x, global_max, global_min) > coverage_threshold, subsets)
+  return(filtered_subsets)
+}
+
+#' Computes the scores for individual subsets.
+#' This function computes individual scores for the particular combination of series.
+#' @param combination DataFrame A subset of pre-filtered input data, has a size of combination number.
+#'
+#' @return List List of scores for individual parameters.
+compute_scores <- function(combination) {
+  series <- paste0(combination$Series)
+  total_abundance <- sum(combination$Abundance.Score)
+  total_series_length <- sum(combination$Series.Length)
+  peak_score <- sum(1 / (combination$Peak.Score))
+  peak_distance_proximity <- sum(1 / (combination$Peak.Distance - 1))
+  series_id <- paste(combination$Series, collapse = " ")
+
+  return(list(
+    series = series,
+    total_abundance = total_abundance,
+    total_series_length = total_series_length,
+    peak_proximity = peak_score,
+    peak_distance_proximity = peak_distance_proximity,
+    series_id = series_id
+  ))
+}
+
+#' Compute a sum score for the whole combination.
+#' This function computes a score summing all parameter scores and arranges them in descending manner.
+#' @param scores_df DataFrame A dataframe of scores.
+#'
+#' @return DataFrame A sorted dataframe containing the final score.
+compute_final_score <- function(scores_df) {
+  final_score <- scores_df %>%
+    dplyr::rowwise() %>%
+    dplyr::mutate(sum_score = sum(total_abundance, total_series_length, peak_proximity, peak_distance_proximity)) %>%
+    dplyr::arrange(desc(sum_score)) %>%
+    dplyr::ungroup() 
+
+  return(final_score)
+}
+
+#' Selects final series.
+#' This function selects final best-scoring series. An user can choose, whether only a number of series corresponding
+#' to the size of the combination will be returned, or 10 best-scoring 10 series will be returned.
+#' @param scores_df DataFrame A dataframe of scores.
+#' @param number_of_combinations Integer Number of combinations to compute.
+#' @param fill_series Boolean If TRUE, top 10 unique series will be returned, otherwise only the series from the best
+#' combination will be returned.
+#'
+#' @return DataFrame A dataframe of best-scoring series.
+find_final_series <- function(scores_df, number_of_combinations, fill_series) {
+  final_series <- compute_final_score(scores_df)
+
+  if (fill_series == FALSE) {
+    final_series <- final_series %>%
+      dplyr::slice_head(n = number_of_combinations)
+    saveRDS(final_series, "test-data/final_seriesFALSE.rds")
+  } else {
+    final_series <- final_series %>%
+      dplyr::distinct(series, .keep_all = TRUE) %>%
+      dplyr::slice_head(n = 10)
+    saveRDS(final_series, "test-data/final_seriesTRUE.rds")
+  }
+
+  return(final_series)
+}
+
+#' Attempts to find most suitable series for recalibration.
+#'
+#' This function takes on input the CH2 homologous recalibration series, which are provided by the RecalList function #' and tries to find the most suitable series combination for recalibration based on the following criteria:
+#' 1) Series should cover the full mass spectral range,
+#' 2) Series should be optimally long and combined have a “Tall Peak” at least every 100 m/z,
+#' 3) Abundance score: the higher, the better,
+#' 4) Peak score: the closer to 0, the better,
+#' 5) Peak Distance: the closer to 1, the better,
+#' 6) Series Score: the closer to this value, the better.
+#'
+#' The recal function can take up to 10 series - due to the size of the search space when looking for combinations of 10
+#' elements, a pre-filtering is done: only the series which have Abundance score > 100 are considered and the one #'
+#' having Peak Distance < 2.
+#' Combinations of 5 series are assembled, scores are computed for other metrics (in case of Peak proximity and Peak
+#' distance, an inverted score is computed) and these are summed. Finally, top 10 unique series having the highest
+#' score are outputted.
+#'
+#' Warning: this step is in general computationally demanding, for ~30 series it took around 30 min.
+#'
+#' @param df DataFrame An output from RecalList, containing recalibrant CH2 series.
+#' @param global_min Float A lower bound of the instrument m/z range.
+#' @param global_max Float A higher bound of the instrument m/z range.
+#' @param number_of_combinations Integer Combinations of how many series should be computed. Default is 5, Recal function can
+#' take up to 10 series, but the more combinations, the longer computing time is expected (growing exponentially)
+#' @param abundance_score_threshold Float A threshold for filtering abundance score parameter. The series with higher values #' are better. Default value is 100.
+#' @param peak_distance_threshold Float A threshold for the peak distance parameter. The closer this value is to 1, the
+#' better.
+#' @param coverage_threshold Integer How many % of the m/z range should be covered. Default is 90 %.
+#' @param fill_series Boolean If TRUE, top 10 unique series will be returned, otherwise only the series from the best
+#' combination will be returned.
+#'
+#' @return A dataframe of n-10 best-scoring series.
+find_series <- function(df,
+                        global_min,
+                        global_max,
+                        number_of_combinations,
+                        abundance_score_threshold,
+                        peak_distance_threshold,
+                        coverage_threshold,
+                        fill_series) {
+  # Pre-filter and arrange the data
+  df <- filter_recal_series(df, abundance_score_threshold, peak_distance_threshold)
+
+  # Create all combinations of ions
+  subsets <- compute_subsets(df, number_of_combinations)
+
+  # Filter the subsets based on coverage threshold
+  subsets_filtered <- filter_subsets_based_on_coverage(subsets, coverage_threshold, global_max, global_min)
+
+  # Compute the scores
+  scores <- lapply(subsets_filtered, function(x) {
+    compute_scores(x)
+  })
+
+  # Append all scored combinations into a dataframe
+  scores_df <- do.call(rbind, lapply(scores, as.data.frame))
+
+  # Filter for final series
+  final_series <- find_final_series(scores_df, number_of_combinations, fill_series)
+
+  # Return the top scoring series
+  return(final_series)
+}
diff --git a/conda/environment-dev.yaml b/conda/environment-dev.yaml
@@ -14,3 +14,4 @@ dependencies:
 - radian
 - r-covr
 - r-patrick
+- r-gtools
diff --git a/tests/testthat/test-data/combination_subsets.rds b/tests/testthat/test-data/combination_subsets.rds
diff --git a/tests/testthat/test-data/combinations_simple.rds b/tests/testthat/test-data/combinations_simple.rds
diff --git a/tests/testthat/test-data/computed_scores.rds b/tests/testthat/test-data/computed_scores.rds
diff --git a/tests/testthat/test-data/filtered_recallist.rds b/tests/testthat/test-data/filtered_recallist.rds
diff --git a/tests/testthat/test-data/filtered_subsets.rds b/tests/testthat/test-data/filtered_subsets.rds
diff --git a/tests/testthat/test-data/final_scores.rds b/tests/testthat/test-data/final_scores.rds
diff --git a/tests/testthat/test-data/final_seriesFALSE.rds b/tests/testthat/test-data/final_seriesFALSE.rds
diff --git a/tests/testthat/test-data/final_seriesTRUE.rds b/tests/testthat/test-data/final_seriesTRUE.rds
diff --git a/tests/testthat/test-data/final_series_combinations.rds b/tests/testthat/test-data/final_series_combinations.rds
diff --git a/tests/testthat/test-data/pos_recalSeries.rds b/tests/testthat/test-data/pos_recalSeries.rds
diff --git a/tests/testthat/test-data/scores_df.rds b/tests/testthat/test-data/scores_df.rds
diff --git a/tests/testthat/test-data/scores_df_full.rds b/tests/testthat/test-data/scores_df_full.rds
diff --git a/tests/testthat/test-findRecalSeries.R b/tests/testthat/test-findRecalSeries.R
@@ -0,0 +1,75 @@
+test_that("Filtering input dataframe works", {
+  pos_recalList <- head(readRDS(file.path("test-data", "pos_recallist.rds")), 10)
+  colnames(pos_recalList) <- gsub(" ", ".", colnames(pos_recalList))
+  expected <- readRDS("test-data/filtered_recallist.rds")
+  actual <- filter_recal_series(pos_recalList, abundance_score_threshold = 0, peak_distance_threshold = 2)
+  expect_equal(actual, expected)
+})
+
+test_that("Compute combinations works", {
+  df <- data.frame(series = c(1:5))
+  expected <- readRDS("test-data/combinations_simple.rds")
+  actual <- compute_combinations(df, 3)
+  expect_equal(actual, expected)
+})
+
+test_that("Compute subsets from combinations works", {
+  df <- head(readRDS(file.path("test-data", "pos_recalSeries.rds")), 5)
+  expected <- readRDS("test-data/combination_subsets.rds")
+  actual <- compute_subsets(df, 3)
+  expect_equal(actual, expected)
+})
+
+test_that("Compute coverage works", {
+  pos_recalList <- readRDS(file.path("test-data", "pos_recalSeries.rds"))
+  actual <- compute_coverage(pos_recalList, 499.326, 111.044)
+  expected <- 23.21534
+  expect_equal(actual, expected, tolerance = 0.0005)
+})
+
+test_that("Filtering of the subsets based on coverage works", {
+  df <- readRDS("test-data/combination_subsets.rds")
+  expected <- readRDS("test-data/filtered_subsets.rds")
+  actual <- filter_subsets_based_on_coverage(df, 80, 206, 117)
+  expect_equal(actual, expected)
+})
+
+test_that("Computing final scores work", {
+  df <- head(readRDS("test-data/scores_df.rds"), 6)
+  expected <- readRDS("test-data/final_scores.rds")
+  actual <- compute_final_score(df)
+  expect_equal(actual, expected)
+})
+
+test_that("Computing scores works", {
+  df <- readRDS("test-data/filtered_subsets.rds")[[1]]
+  expected <- readRDS("test-data/computed_scores.rds")
+  actual <- compute_scores(df)
+  expect_equal(actual, expected)
+})
+
+test_that("Positive final series work", {
+  df <- readRDS("test-data/scores_df_full.rds")
+  expected <- readRDS("test-data/final_seriesTRUE.rds")
+  actual <- find_final_series(df, 3, TRUE)
+  expect_equal(actual, expected)
+})
+
+#patrick::with_parameters_test_that("Selection of the final series works",
+#  {
+#    df <- readRDS("test-data/scores_df_full.rds")
+#    expected <- readRDS(file.path("test-data", paste0("final_series", mode, ".rds")))
+#    n <- 3
+
+#    actual <- find_final_series(df, n, mode)
+  #  if (mode == TRUE) {
+  #    expect_equal(nrow(actual), 10)
+  # } else {
+  #    expect_true(nrow(actual), n)
+  #
+  #  }
+  #  expect_equal(actual, expected)
+#  expect_equal(actual,expected)
+#  },
+#  mode = c(TRUE, FALSE)
+#)
-Original file line number
+Diff line change
@@ Expand Up / @@ -14,3 +14,4 @@ dependencies: @@
     - radian
     - r-covr
     - r-patrick
+    - r-gtools