Skip to content

Commit

Permalink
Merge pull request #44 from KristinaGomoryova/findRecalSeries
Browse files Browse the repository at this point in the history
function to find top 10 recalibrant series
  • Loading branch information
hechth authored Sep 5, 2024
2 parents 79e56fc + 0380c83 commit e08ee5f
Show file tree
Hide file tree
Showing 16 changed files with 301 additions and 1 deletion.
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ Depends: R (>= 3.4.1)
Imports: dplyr (>= 0.7.6),
tidyr (>= 0.8.1),
ggplot2 (>= 3.0.0),
colorRamps (>= 2.3)
colorRamps (>= 2.3),
gtools (>= 3.9.5)
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.1
Suggests: knitr,
Expand Down
223 changes: 223 additions & 0 deletions R/FindRecalSeries.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
#' Filters the input dataframe
#' This function filters the input dataframe based on abundance score threshold and peak distance threshold; and
#' computes the length of the series.
#' @param df DataFrame An output from RecalList, containing recalibrant CH2 series.
#' @param abundance_score_threshold Float A threshold for filtering abundance score parameter. The series with higher values #' are better. Default value is 100.
#' @param peak_distance_threshold Float A threshold for the peak distance parameter. The closer this value is to 1, the
#' better.
#' @return DataFrame A filtered dataframe.
filter_recal_series <- function(df, abundance_score_threshold, peak_distance_threshold) {
df <- df %>%
dplyr::filter(Abundance.Score > abundance_score_threshold) %>%
dplyr::filter(Peak.Distance < peak_distance_threshold) %>%
tidyr::separate(col = Mass.Range, into = c("Min.Mass.Range", "Max.Mass.Range"), sep = "-") %>%
dplyr::mutate(
Min.Mass.Range = as.numeric(Min.Mass.Range),
Max.Mass.Range = as.numeric(Max.Mass.Range)
) %>%
dplyr::mutate(Series.Length = Max.Mass.Range - Min.Mass.Range)

return(df)
}

#' Computes all possible combinations of series
#' This function computes all possible combinations of series. The number of combinations is specified by user.
#' @param df DataFrame A pre-filtered dataframe containing series.
#' @param n Integer Number of combinations to compute.
#'
#' @return DataFrame A dataframe of row-index based combinations.
compute_combinations <- function(df, n) {
combs <- gtools::combinations(nrow(df), n, v = seq_len(nrow(df)))
return(combs)
}

#' Computes subsets based on combinations.
#' This function selects subsets of the input data based on the combinations of series.
#' @param df DataFrame A pre-filtered dataframe containing series.
#' @param n Integer Number of combinations to compute.
#'
#' @return A list of subsets based on series combinations.
compute_subsets <- function(df, n) {
subsets <- apply(compute_combinations(df, n), 1, function(x) {
df[x, ]
})
return(subsets)
}

#' Computes coverage of a subset.
#' This function computes m/z range coverage of a particular subset. The higher coverage, the better.
#' @param subset DataFrame A subset of pre-filtered input data, has a size of combination number.
#' @param global_min Float A lower bound of the instrument m/z range.
#' @param global_max Float A higher bound of the instrument m/z range.
#'
#' @return Coverage (in %) of a particular subset.
compute_coverage <- function(subset, global_max, global_min) {
subset <- subset %>%
dplyr::arrange(Min.Mass.Range)

#subset <- subset[order(subset$Min.Mass.Range), ]

# Initialize the coverage and the end of the last segment
total_coverage <- 0
last_end <- -Inf

# Iterate through the intervals
for (i in seq_len(nrow(subset))) {
current_start <- subset$Min.Mass.Range[i]
current_end <- subset$Max.Mass.Range[i]

if (current_start > last_end) {
# Non-overlapping segment, add the full length to coverage
total_coverage <- total_coverage + (current_end - current_start)
} else {
# Overlapping segment, add only the non-overlapping portion
total_coverage <- total_coverage + max(0, current_end - last_end)
}

# Update the last_end to the current segment's end
last_end <- max(last_end, current_end)
}
coverage_percent <- total_coverage / (global_max - global_min) * 100
return(coverage_percent)
}

#' Filters subsets based on the coverage.
#' This function filters the subsets based on the coverage threshold.
#' @param subsets DataFrame A subset of pre-filtered input data, has a size of combination number.
#' @param coverage_threshold Integer How many % of the m/z range should be covered. Default is 90 %.
#' @param global_min Float A lower bound of the instrument m/z range.
#' @param global_max Float A higher bound of the instrument m/z range.
#'
#' @return List A list of subsets which pass the coverage threshold (their coverage is higher than the threshold)
filter_subsets_based_on_coverage <- function(subsets, coverage_threshold, global_max, global_min) {
filtered_subsets <- Filter(function(x) compute_coverage(x, global_max, global_min) > coverage_threshold, subsets)
return(filtered_subsets)
}

#' Computes the scores for individual subsets.
#' This function computes individual scores for the particular combination of series.
#' @param combination DataFrame A subset of pre-filtered input data, has a size of combination number.
#'
#' @return List List of scores for individual parameters.
compute_scores <- function(combination) {
series <- paste0(combination$Series)
total_abundance <- sum(combination$Abundance.Score)
total_series_length <- sum(combination$Series.Length)
peak_score <- sum(1 / (combination$Peak.Score))
peak_distance_proximity <- sum(1 / (combination$Peak.Distance - 1))
series_id <- paste(combination$Series, collapse = " ")

return(list(
series = series,
total_abundance = total_abundance,
total_series_length = total_series_length,
peak_proximity = peak_score,
peak_distance_proximity = peak_distance_proximity,
series_id = series_id
))
}

#' Compute a sum score for the whole combination.
#' This function computes a score summing all parameter scores and arranges them in descending manner.
#' @param scores_df DataFrame A dataframe of scores.
#'
#' @return DataFrame A sorted dataframe containing the final score.
compute_final_score <- function(scores_df) {
final_score <- scores_df %>%
dplyr::rowwise() %>%
dplyr::mutate(sum_score = sum(total_abundance, total_series_length, peak_proximity, peak_distance_proximity)) %>%
dplyr::arrange(desc(sum_score)) %>%
dplyr::ungroup()

return(final_score)
}

#' Selects final series.
#' This function selects final best-scoring series. An user can choose, whether only a number of series corresponding
#' to the size of the combination will be returned, or 10 best-scoring 10 series will be returned.
#' @param scores_df DataFrame A dataframe of scores.
#' @param number_of_combinations Integer Number of combinations to compute.
#' @param fill_series Boolean If TRUE, top 10 unique series will be returned, otherwise only the series from the best
#' combination will be returned.
#'
#' @return DataFrame A dataframe of best-scoring series.
find_final_series <- function(scores_df, number_of_combinations, fill_series) {
final_series <- compute_final_score(scores_df)

if (fill_series == FALSE) {
final_series <- final_series %>%
dplyr::slice_head(n = number_of_combinations)
saveRDS(final_series, "test-data/final_seriesFALSE.rds")
} else {
final_series <- final_series %>%
dplyr::distinct(series, .keep_all = TRUE) %>%
dplyr::slice_head(n = 10)
saveRDS(final_series, "test-data/final_seriesTRUE.rds")
}

return(final_series)
}

#' Attempts to find most suitable series for recalibration.
#'
#' This function takes on input the CH2 homologous recalibration series, which are provided by the RecalList function #' and tries to find the most suitable series combination for recalibration based on the following criteria:
#' 1) Series should cover the full mass spectral range,
#' 2) Series should be optimally long and combined have a “Tall Peak” at least every 100 m/z,
#' 3) Abundance score: the higher, the better,
#' 4) Peak score: the closer to 0, the better,
#' 5) Peak Distance: the closer to 1, the better,
#' 6) Series Score: the closer to this value, the better.
#'
#' The recal function can take up to 10 series - due to the size of the search space when looking for combinations of 10
#' elements, a pre-filtering is done: only the series which have Abundance score > 100 are considered and the one #'
#' having Peak Distance < 2.
#' Combinations of 5 series are assembled, scores are computed for other metrics (in case of Peak proximity and Peak
#' distance, an inverted score is computed) and these are summed. Finally, top 10 unique series having the highest
#' score are outputted.
#'
#' Warning: this step is in general computationally demanding, for ~30 series it took around 30 min.
#'
#' @param df DataFrame An output from RecalList, containing recalibrant CH2 series.
#' @param global_min Float A lower bound of the instrument m/z range.
#' @param global_max Float A higher bound of the instrument m/z range.
#' @param number_of_combinations Integer Combinations of how many series should be computed. Default is 5, Recal function can
#' take up to 10 series, but the more combinations, the longer computing time is expected (growing exponentially)
#' @param abundance_score_threshold Float A threshold for filtering abundance score parameter. The series with higher values #' are better. Default value is 100.
#' @param peak_distance_threshold Float A threshold for the peak distance parameter. The closer this value is to 1, the
#' better.
#' @param coverage_threshold Integer How many % of the m/z range should be covered. Default is 90 %.
#' @param fill_series Boolean If TRUE, top 10 unique series will be returned, otherwise only the series from the best
#' combination will be returned.
#'
#' @return A dataframe of n-10 best-scoring series.
find_series <- function(df,
global_min,
global_max,
number_of_combinations,
abundance_score_threshold,
peak_distance_threshold,
coverage_threshold,
fill_series) {
# Pre-filter and arrange the data
df <- filter_recal_series(df, abundance_score_threshold, peak_distance_threshold)

# Create all combinations of ions
subsets <- compute_subsets(df, number_of_combinations)

# Filter the subsets based on coverage threshold
subsets_filtered <- filter_subsets_based_on_coverage(subsets, coverage_threshold, global_max, global_min)

# Compute the scores
scores <- lapply(subsets_filtered, function(x) {
compute_scores(x)
})

# Append all scored combinations into a dataframe
scores_df <- do.call(rbind, lapply(scores, as.data.frame))

# Filter for final series
final_series <- find_final_series(scores_df, number_of_combinations, fill_series)

# Return the top scoring series
return(final_series)
}
1 change: 1 addition & 0 deletions conda/environment-dev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ dependencies:
- radian
- r-covr
- r-patrick
- r-gtools
Binary file added tests/testthat/test-data/combination_subsets.rds
Binary file not shown.
Binary file added tests/testthat/test-data/combinations_simple.rds
Binary file not shown.
Binary file added tests/testthat/test-data/computed_scores.rds
Binary file not shown.
Binary file added tests/testthat/test-data/filtered_recallist.rds
Binary file not shown.
Binary file added tests/testthat/test-data/filtered_subsets.rds
Binary file not shown.
Binary file added tests/testthat/test-data/final_scores.rds
Binary file not shown.
Binary file added tests/testthat/test-data/final_seriesFALSE.rds
Binary file not shown.
Binary file added tests/testthat/test-data/final_seriesTRUE.rds
Binary file not shown.
Binary file not shown.
Binary file added tests/testthat/test-data/pos_recalSeries.rds
Binary file not shown.
Binary file added tests/testthat/test-data/scores_df.rds
Binary file not shown.
Binary file added tests/testthat/test-data/scores_df_full.rds
Binary file not shown.
75 changes: 75 additions & 0 deletions tests/testthat/test-findRecalSeries.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
test_that("Filtering input dataframe works", {
pos_recalList <- head(readRDS(file.path("test-data", "pos_recallist.rds")), 10)
colnames(pos_recalList) <- gsub(" ", ".", colnames(pos_recalList))
expected <- readRDS("test-data/filtered_recallist.rds")
actual <- filter_recal_series(pos_recalList, abundance_score_threshold = 0, peak_distance_threshold = 2)
expect_equal(actual, expected)
})

test_that("Compute combinations works", {
df <- data.frame(series = c(1:5))
expected <- readRDS("test-data/combinations_simple.rds")
actual <- compute_combinations(df, 3)
expect_equal(actual, expected)
})

test_that("Compute subsets from combinations works", {
df <- head(readRDS(file.path("test-data", "pos_recalSeries.rds")), 5)
expected <- readRDS("test-data/combination_subsets.rds")
actual <- compute_subsets(df, 3)
expect_equal(actual, expected)
})

test_that("Compute coverage works", {
pos_recalList <- readRDS(file.path("test-data", "pos_recalSeries.rds"))
actual <- compute_coverage(pos_recalList, 499.326, 111.044)
expected <- 23.21534
expect_equal(actual, expected, tolerance = 0.0005)
})

test_that("Filtering of the subsets based on coverage works", {
df <- readRDS("test-data/combination_subsets.rds")
expected <- readRDS("test-data/filtered_subsets.rds")
actual <- filter_subsets_based_on_coverage(df, 80, 206, 117)
expect_equal(actual, expected)
})

test_that("Computing final scores work", {
df <- head(readRDS("test-data/scores_df.rds"), 6)
expected <- readRDS("test-data/final_scores.rds")
actual <- compute_final_score(df)
expect_equal(actual, expected)
})

test_that("Computing scores works", {
df <- readRDS("test-data/filtered_subsets.rds")[[1]]
expected <- readRDS("test-data/computed_scores.rds")
actual <- compute_scores(df)
expect_equal(actual, expected)
})

test_that("Positive final series work", {
df <- readRDS("test-data/scores_df_full.rds")
expected <- readRDS("test-data/final_seriesTRUE.rds")
actual <- find_final_series(df, 3, TRUE)
expect_equal(actual, expected)
})

#patrick::with_parameters_test_that("Selection of the final series works",
# {
# df <- readRDS("test-data/scores_df_full.rds")
# expected <- readRDS(file.path("test-data", paste0("final_series", mode, ".rds")))
# n <- 3

# actual <- find_final_series(df, n, mode)
# if (mode == TRUE) {
# expect_equal(nrow(actual), 10)
# } else {
# expect_true(nrow(actual), n)
#
# }
# expect_equal(actual, expected)
# expect_equal(actual,expected)
# },
# mode = c(TRUE, FALSE)
#)

0 comments on commit e08ee5f

Please sign in to comment.