diff --git a/CHANGELOG.md b/CHANGELOG.md index 61036f3f..18847c4b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - refactored `feature.align.R` [#63](https://github.com/RECETOX/recetox-aplcms/pull/63)[#88](https://github.com/RECETOX/recetox-aplcms/pull/88) - refactored `adjust.time.R` [#64](https://github.com/RECETOX/recetox-aplcms/pull/64) +- refactored `find.tol.time.R` [#91](https://github.com/RECETOX/recetox-aplcms/pull/91) +- refactored `find.turn.point.R` [#91](https://github.com/RECETOX/recetox-aplcms/pull/91) ### Removed ## [0.9.4] - 2022-05-10 diff --git a/DESCRIPTION b/DESCRIPTION index 014516b5..56db4ee0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -8,13 +8,13 @@ Authors@R: c( Description: This is a customized fork of the original work from Tianwei Yu. It takes the adaptive processing of LC/MS metabolomics data further with focus on high resolution MS for both LC and GC applications. -Depends: R (>= 3.50), MASS, mzR, splines, doParallel, foreach, snow, dplyr, - tidyr, stringr, tibble, tools, arrow +Depends: R (>= 3.50), MASS, mzR, splines, doParallel, foreach, + snow, dplyr, tidyr, stringr, tibble, tools, arrow biocViews: Technology, MassSpectrometry License: GPL-2 LazyLoad: yes NeedsCompilation: no Suggests: - arrow, dataCompareR, testthat (>= 3.0.0) + dataCompareR, testthat (>= 3.0.0) Config/testthat/edition: 3 -RoxygenNote: 7.2.0 +RoxygenNote: 7.2.1 diff --git a/NAMESPACE b/NAMESPACE index 483e2708..8302b4e5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +S3method(solve,a) S3method(solve,sigma) export(adaptive.bin) export(adjust.time) @@ -8,25 +9,36 @@ export(bigauss.esti) export(bigauss.esti.EM) export(bigauss.mix) export(combine.seq.3) -export(compute_all_times) -export(compute_base_curve) export(compute_boundaries) export(compute_bounds) export(compute_breaks) -export(compute_breaks_2) +export(compute_chromatographic_profile) +export(compute_delta_rt) export(compute_densities) +export(compute_dx) +export(compute_e_step) +export(compute_end_bound) +export(compute_gaussian_peak_shape) +export(compute_initiation_params) export(compute_mass_density) export(compute_mass_values) -export(compute_target_time) +export(compute_mu_sc_std) +export(compute_mz_sd) +export(compute_scale) +export(compute_start_bound) +export(compute_target_times) export(cont.index) +export(draw_chr_normal_peaks) export(duplicate.row.remove) +export(extract_features) export(extract_pattern_colnames) export(feature.align) export(find.tol) export(find.turn.point) +export(find_local_maxima) export(get_custom_chr_tol) export(get_mzrange_bound_indices) -export(get_raw_features_in_mzrange) +export(get_rt_region_indices) export(get_times_to_use) export(hybrid) export(increment_counter) @@ -35,10 +47,18 @@ export(load.lcms) export(load_data) export(load_file) export(make.known.table) +export(mass.match) +export(msExtrema) export(normix) export(normix.bic) +export(plot_chr_profile) +export(plot_normix_bic) +export(plot_peak_summary) export(plot_raw_profile_histogram) +export(predict_mz_break_indices) export(prep.uv) +export(preprocess_bandwidth) +export(preprocess_profile) export(proc.cdf) export(prof.to.features) export(recover.weaker) @@ -48,6 +68,7 @@ export(semi.sup) export(sort_samples_by_acquisition_number) export(two.step.hybrid) export(unsupervised) +export(validate_inputs) import(MASS) import(arrow) import(doParallel) @@ -61,3 +82,5 @@ 
import(stringr) import(tibble) import(tidyr) import(tools) +importFrom(dplyr,arrange) +importFrom(dplyr,filter) diff --git a/R/adaptive.bin.R b/R/adaptive.bin.R index 7b33e5f6..99751313 100644 --- a/R/adaptive.bin.R +++ b/R/adaptive.bin.R @@ -32,13 +32,6 @@ compute_breaks <- function(tol, masses, intensi, weighted) { return(breaks) } -#' @export -compute_boundaries <- function(mass.vlys, mass.pks, j){ - mass.lower <- max(mass.vlys[mass.vlys < mass.pks[j]]) - mass.upper <- min(mass.vlys[mass.vlys > mass.pks[j]]) - - return(list(lower = mass.lower, upper = mass.upper)) -} #' @export increment_counter <- function(pointers, that.n){ @@ -134,7 +127,7 @@ adaptive.bin <- function(x, for (j in 1:length(mass.pks)) { # compute boundaries - boundaries <- compute_boundaries(mass.vlys, mass.pks, j) + boundaries <- compute_boundaries(mass.vlys, mass.pks[j]) if (length(mass.pks) == 1){ boundaries$lower <- boundaries$lower - 1 @@ -144,12 +137,7 @@ adaptive.bin <- function(x, that <- this_table |> dplyr::filter(mz > boundaries$lower & mz <= boundaries$upper) if (nrow(that) > 0) { - that <- combine.seq.3(that) - - if (nrow(that) != 1) { - that <- that[order(that[, 1]), ] - } - + that <- combine.seq.3(that) |> dplyr::arrange_at("mz") that.range <- diff(range(that$labels)) if (that.range > 0.5 * time_range & length(that$labels) > that.range * min.pres & length(that$labels) / (that.range / aver.time.range) > min.pres) { diff --git a/R/adjust.time.R b/R/adjust.time.R index cd0e4478..7e799b6a 100644 --- a/R/adjust.time.R +++ b/R/adjust.time.R @@ -2,99 +2,104 @@ NULL #> NULL -compute_comb <- function(candi, template, this.feature, j){ - this.comb <- dplyr::bind_rows(dplyr::bind_cols(candi, label = rep(template, nrow(candi))), - dplyr::bind_cols(this.feature[, 1:2], label = rep(j, nrow(this.feature)))) - this.comb <- dplyr::arrange(this.comb, this.comb[, 1]) - return(this.comb) +compute_comb <- function(candi, template, this.feature, j) { + this.comb <- dplyr::bind_rows( + dplyr::bind_cols(candi, label = rep(template, nrow(candi))), + dplyr::bind_cols(this.feature[, 1:2], label = rep(j, nrow(this.feature))) + ) + this.comb <- dplyr::arrange(this.comb, this.comb[, 1]) + return(this.comb) } -compute_sel <- function(this.comb, mz_tol_relative, rt_tol_relative){ - l <- nrow(this.comb) - sel <- which(this.comb[2:l, 1] - this.comb[1:(l-1), 1] < - mz_tol_relative * this.comb[1:(l-1), 1] * 2 & - abs(this.comb[2:l, 2] - this.comb[1:(l-1), 2]) < - rt_tol_relative & this.comb[2:l, 3] != this.comb[1:(l-1), 3]) - return(sel) +compute_sel <- function(this.comb, mz_tol_relative, rt_tol_relative) { + l <- nrow(this.comb) + sel <- which(this.comb[2:l, 1] - this.comb[1:(l - 1), 1] < + mz_tol_relative * this.comb[1:(l - 1), 1] * 2 & + abs(this.comb[2:l, 2] - this.comb[1:(l - 1), 2]) < + rt_tol_relative & this.comb[2:l, 3] != this.comb[1:(l - 1), 3]) + return(sel) } -compute_template_adjusted_rt <- function(this.comb, sel, j){ - all.ftr.table <- cbind(this.comb[sel, 2], this.comb[sel+1, 2]) - to.flip <- which(this.comb[sel, 3] == j) - temp <- all.ftr.table[to.flip, 2] - all.ftr.table[to.flip, 2] <- all.ftr.table[to.flip, 1] - all.ftr.table[to.flip, 1] <- temp - - # now the first column is the template retention time. 
- # the second column is the to-be-adjusted retention time - - cat(c("sample", j, "using", nrow(all.ftr.table), ", ")) - if(j %% 3 == 0) - cat("\n") - - all.ftr.table <- all.ftr.table[order(all.ftr.table[, 2]), ] - return(all.ftr.table) +compute_template_adjusted_rt <- function(this.comb, sel, j) { + all.ftr.table <- cbind(this.comb[sel, 2], this.comb[sel + 1, 2]) + to.flip <- which(this.comb[sel, 3] == j) + temp <- all.ftr.table[to.flip, 2] + all.ftr.table[to.flip, 2] <- all.ftr.table[to.flip, 1] + all.ftr.table[to.flip, 1] <- temp + + # now the first column is the template retention time. + # the second column is the to-be-adjusted retention time + + cat(c("sample", j, "using", nrow(all.ftr.table), ", ")) + if (j %% 3 == 0) { + cat("\n") + } + + all.ftr.table <- all.ftr.table[order(all.ftr.table[, 2]), ] + return(all.ftr.table) } -compute_corrected_features <- function(this.feature, this.diff, avg_time){ - this.feature <- this.feature[order(this.feature[, 2], this.feature[, 1]), ] - this.corrected <- this.old <- this.feature[, 2] - to.correct <- this.old[this.old >= min(this.diff) & - this.old <= max(this.diff)] - - this.smooth <- ksmooth(this.diff, avg_time, kernel="normal", - bandwidth = (max(this.diff) - min(this.diff)) / 5, - x.points = to.correct) - - this.corrected[this.old >= min(this.diff) & this.old <= max(this.diff)] <- - this.smooth$y + to.correct - this.corrected[this.old < min(this.diff)] <- this.corrected[this.old < min(this.diff)] + - mean(this.smooth$y[this.smooth$x == min(this.smooth$x)]) - this.corrected[this.old > max(this.diff)] <- this.corrected[this.old > max(this.diff)] + - mean(this.smooth$y[this.smooth$x == max(this.smooth$x)]) - this.feature[, 2] <- this.corrected - this.feature <- this.feature[order(this.feature[, 1], this.feature[, 2]), ] - return(this.feature) +compute_corrected_features <- function(this.feature, this.diff, avg_time) { + this.feature <- this.feature[order(this.feature[, 2], this.feature[, 1]), ] + this.corrected <- this.old <- this.feature[, 2] + to.correct <- this.old[this.old >= min(this.diff) & + this.old <= max(this.diff)] + + this.smooth <- ksmooth(this.diff, avg_time, + kernel = "normal", + bandwidth = (max(this.diff) - min(this.diff)) / 5, + x.points = to.correct + ) + + this.corrected[this.old >= min(this.diff) & this.old <= max(this.diff)] <- + this.smooth$y + to.correct + this.corrected[this.old < min(this.diff)] <- this.corrected[this.old < min(this.diff)] + + mean(this.smooth$y[this.smooth$x == min(this.smooth$x)]) + this.corrected[this.old > max(this.diff)] <- this.corrected[this.old > max(this.diff)] + + mean(this.smooth$y[this.smooth$x == max(this.smooth$x)]) + this.feature[, 2] <- this.corrected + this.feature <- this.feature[order(this.feature[, 1], this.feature[, 2]), ] + return(this.feature) } fill_missing_values <- function(orig.feature, this.feature) { - missing_values <- which(is.na(this.feature[, 2])) - for(i in missing_values) { - this.d <- abs(orig.feature[i, 2] - orig.feature[, 2]) - this.d[missing_values] <- Inf - this.s <- which(this.d == min(this.d))[1] - this.feature[i, 2] <- orig.feature[i, 2] + this.feature[this.s, 2] - - orig.feature[this.s, 2] - } - return(this.feature) + missing_values <- which(is.na(this.feature[, 2])) + for (i in missing_values) { + this.d <- abs(orig.feature[i, 2] - orig.feature[, 2]) + this.d[missing_values] <- Inf + this.s <- which(this.d == min(this.d))[1] + this.feature[i, 2] <- orig.feature[i, 2] + this.feature[this.s, 2] - + orig.feature[this.s, 2] + } + return(this.feature) } #' 
Adjust retention time across spectra. -#' +#' #' This function adjusts the retention time in each LC/MS profile to achieve better between-profile agreement. -#' -#' @param features A list object. Each component is a matrix which is the output from proc.to.feature() +#' +#' @param extracted_features A list object. Each component is a matrix which is the output from proc.to.feature() #' @param mz_tol_relative The m/z tolerance level for peak alignment. The default is NA, which allows the -#' program to search for the tolerance level based on the data. This value is expressed as the +#' program to search for the tolerance level based on the data. This value is expressed as the #' percentage of the m/z value. This value, multiplied by the m/z value, becomes the cutoff level. -#' @param rt_tol_relative The retention time tolerance level for peak alignment. The default is NA, which +#' @param rt_tol_relative The retention time tolerance level for peak alignment. The default is NA, which #' allows the program to search for the tolerance level based on the data. -#' @param colors The vector of colors to be used for the line plots of time adjustments. The default is NA, +#' @param colors The vector of colors to be used for the line plots of time adjustments. The default is NA, #' in which case the program uses a set of default color set. -#' @param mz_max_diff Argument passed to find.tol(). Consider only m/z diffs smaller than this value. +#' @param mz_max_diff Argument passed to find.tol(). Consider only m/z diffs smaller than this value. #' This is only used when the mz_tol_relative is NA. -#' @param mz_tol_absolute As the m/z tolerance is expressed in relative terms (ppm), it may not be suitable -#' when the m/z range is wide. This parameter limits the tolerance in absolute terms. It mostly +#' @param mz_tol_absolute As the m/z tolerance is expressed in relative terms (ppm), it may not be suitable +#' when the m/z range is wide. This parameter limits the tolerance in absolute terms. It mostly #' influences feature matching in higher m/z range. #' @param do.plot Indicates whether plot should be drawn. #' @param rt_colname contains the retention time information -#' @return A list object with the exact same structure as the input object features, i.e. one matrix per profile -#' being processed. The only difference this output object has with the input object is that the retention time +#' @return A list object with the exact same structure as the input object features, i.e. one matrix per profile +#' being processed. The only difference this output object has with the input object is that the retention time #' column in each of the matrices is changed to new adjusted values. 
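As a rough illustration of what the refactored compute_corrected_features helper above does (it is an internal helper, so this assumes the package source is loaded, e.g. via devtools::load_all()), with made-up retention times rather than real profile data:

# toy feature table: column 1 = m/z, column 2 = retention time
this.feature <- cbind(mz = c(100.1, 100.2, 150.3), rt = c(10, 20, 30))
# matched RTs of the sample being adjusted (this.diff) and the
# template-minus-sample deviations (avg_time), both invented here
this.diff <- c(9, 19, 31)
avg_time <- c(1, 1, -1)
corrected <- compute_corrected_features(this.feature, this.diff, avg_time)
corrected[, 2]  # ksmooth-interpolated deviations added to the original RTs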
#' @export #' @examples #' adjust.time(extracted_features, mz_max_diff = 10 * 1e-05, do.plot = FALSE) -adjust.time <- function(features, +adjust.time <- function(extracted_features, mz_tol_relative = NA, rt_tol_relative = NA, colors = NA, @@ -102,116 +107,132 @@ adjust.time <- function(features, mz_tol_absolute = 0.01, do.plot = TRUE, rt_colname = "pos") { - - number_of_samples <- nrow(summary(features)) - if(number_of_samples > 1) { - if (do.plot) { - par(mfrow = c(2,2)) - draw_plot(label = "Retention time \n adjustment", - cex = 2) - } - - values <- get_feature_values(features, rt_colname) - mz <- values$mz - chr <- values$rt - lab <- values$sample_id - - if(is.na(mz_tol_relative)) { - mz_tol_relative <- find.tol(mz, mz_max_diff = mz_max_diff, do.plot = do.plot) - } else if(do.plot) { - draw_plot(main = "m/z tolerance level given", - label = mz_tol_relative) - } - - if(!is.na(rt_tol_relative) && do.plot) { - draw_plot(main = "retention time \n tolerance level given", - label = rt_tol_relative) - } - - all.ft <- find.tol.time(mz, - chr, - lab, - number_of_samples = number_of_samples, - mz_tol_relative = mz_tol_relative, - rt_tol_relative = rt_tol_relative, - mz_tol_absolute = mz_tol_absolute, - do.plot = do.plot) - rt_tol_relative <- all.ft$rt.tol - - message("**** performing time correction ****") - message(paste("m/z tolerance level: ", mz_tol_relative)) - message(paste("time tolerance level:", rt_tol_relative)) - - for(i in 1:number_of_samples) { - this <- features[[i]] - sel <- which(all.ft$lab == i) - that <- cbind(all.ft$mz[sel], all.ft$rt[sel], all.ft$grps[sel]) - this <- this[order(this[, 1], this[, 2]), ] - that <- that[order(that[, 1], that[, 2]), ] - this <- cbind(this, V6=rep(i, nrow(this)), V7=that[, 3]) - features[[i]] <- this - } - - num.ftrs <- as.vector(table(all.ft$lab)) - template <- which(num.ftrs == max(num.ftrs))[1] - message(paste("the template is sample", template)) - - candi <- features[[template]][, 1:2] - - corrected_features <- foreach::foreach(j = 1:number_of_samples,.export = c("compute_corrected_features", - "compute_template_adjusted_rt", "compute_comb", "compute_sel")) %dopar% { - this.feature <- features[[j]] - if(j != template) { - this.comb <- compute_comb(candi, template, this.feature, j) - - sel <- compute_sel(this.comb, mz_tol_relative, rt_tol_relative) - if(length(sel) < 20) { - cat("too few, aborted") - } else { - all.ftr.table <- compute_template_adjusted_rt(this.comb, sel, j) - - # the to be adjusted time - this.diff <- all.ftr.table[, 2] - - # the difference between the true time and the to-be-adjusted time - avg_time <- all.ftr.table[, 1] - this.diff - - this.feature <- compute_corrected_features(this.feature, this.diff, avg_time) - } - } - - if(sum(is.na(this.feature[, 2])) > 0) { - this.feature <- fill_missing_values( - features[[j]], - this.feature - ) - } - this.feature - } - } else { - message("Only one sample. No need to correct for time.") + number_of_samples <- nrow(summary(extracted_features)) + + if (number_of_samples <= 1) { + message("Only one sample. 
No need to correct for time.") + } + + if (do.plot) { + par(mfrow = c(2, 2)) + draw_plot(label = "Retention time \n adjustment", cex = 2) + } + + extracted_features <- lapply(extracted_features, function(x) tibble::as_tibble(x) |> dplyr::rename(rt = pos)) + + values <- concatenate_feature_tables(extracted_features, rt_colname) + mz <- values$mz + chr <- values$rt + lab <- values$sample_id + + if (is.na(mz_tol_relative)) { + mz_tol_relative <- find.tol(mz, mz_max_diff = mz_max_diff, do.plot = do.plot) + } else if (do.plot) { + draw_plot( + main = "m/z tolerance level given", + label = mz_tol_relative + ) + } + + if (!is.na(rt_tol_relative) && do.plot) { + draw_plot( + main = "retention time \n tolerance level given", + label = rt_tol_relative + ) + } + + all.ft <- find.tol.time(mz, + chr, + lab, + number_of_samples = number_of_samples, + mz_tol_relative = mz_tol_relative, + rt_tol_relative = rt_tol_relative, + mz_tol_absolute = mz_tol_absolute, + do.plot = do.plot + ) + rt_tol_relative <- all.ft$rt.tol + + message("**** performing time correction ****") + message(paste("m/z tolerance level: ", mz_tol_relative)) + message(paste("time tolerance level:", rt_tol_relative)) + + for (i in 1:number_of_samples) { + features <- extracted_features[[i]] + sel <- which(all.ft$lab == i) + + feature_subset <- cbind(all.ft$mz[sel], all.ft$rt[sel], all.ft$grps[sel]) + features <- features[order(features[, 1], features[, 2]), ] + feature_subset <- feature_subset[order(feature_subset[, 1], feature_subset[, 2]), ] + features <- cbind(features, sample_id = rep(i, nrow(features)), rt_cluster = feature_subset[, 3]) + extracted_features[[i]] <- features + } + + num.ftrs <- as.vector(table(all.ft$lab)) + template <- which(num.ftrs == max(num.ftrs))[1] + message(paste("the template is sample", template)) + + candi <- extracted_features[[template]][, 1:2] + + corrected_features <- foreach::foreach(j = 1:number_of_samples, .export = c( + "compute_corrected_features", + "compute_template_adjusted_rt", "compute_comb", "compute_sel" + )) %dopar% { + this.feature <- extracted_features[[j]] + if (j != template) { + this.comb <- compute_comb(candi, template, this.feature, j) + + sel <- compute_sel(this.comb, mz_tol_relative, rt_tol_relative) + if (length(sel) < 20) { + cat("too few, aborted") + } else { + all.ftr.table <- compute_template_adjusted_rt(this.comb, sel, j) + + # the to be adjusted time + this.diff <- all.ftr.table[, 2] + + # the difference between the true time and the to-be-adjusted time + avg_time <- all.ftr.table[, 1] - this.diff + + this.feature <- compute_corrected_features(this.feature, this.diff, avg_time) + } } - if (do.plot) { - if(is.na(colors[1])) - colors <-c ("red", "blue", "dark blue", "orange", "green", "yellow", - "cyan", "pink", "violet", "bisque", "azure", "brown", - "chocolate", rep("grey", number_of_samples)) - - draw_plot(x = range(features[[1]][, 2]), - y = c(-rt_tol_relative, rt_tol_relative), - xlab = "Original Retention time", - ylab = "Retention time deviation", - axes = TRUE) - - for(i in 1:number_of_samples) { - features[[i]] <- features[[i]][order(features[[i]][, 1], features[[i]][, 2]), ] - points(features[[i]][, 2], corrected_features[[i]][, 2] - features[[i]][, 2], - col=colors[i], cex=.2) - } + if (sum(is.na(this.feature[, 2])) > 0) { + this.feature <- fill_missing_values( + extracted_features[[j]], + this.feature + ) } + this.feature + } - if(exists("corrected_features")){ - return(corrected_features) + + if (do.plot) { + if (is.na(colors[1])) { + colors <- c( + "red", "blue", 
"dark blue", "orange", "green", "yellow", + "cyan", "pink", "violet", "bisque", "azure", "brown", + "chocolate", rep("grey", number_of_samples) + ) } + + draw_plot( + x = range(extracted_features[[1]][, 2]), + y = c(-rt_tol_relative, rt_tol_relative), + xlab = "Original Retention time", + ylab = "Retention time deviation", + axes = TRUE + ) + + for (i in 1:number_of_samples) { + extracted_features[[i]] <- extracted_features[[i]][order(extracted_features[[i]][, 1], extracted_features[[i]][, 2]), ] + points(extracted_features[[i]][, 2], corrected_features[[i]][, 2] - extracted_features[[i]][, 2], + col = colors[i], cex = .2 + ) + } + } + + if (exists("corrected_features")) { + return(corrected_features) + } } diff --git a/R/combine.seq.3.R b/R/combine.seq.3.R index 5b941430..51d8cdf9 100644 --- a/R/combine.seq.3.R +++ b/R/combine.seq.3.R @@ -1,8 +1,8 @@ #' An internal function. -#' +#' #' This is a internal function. -#' -#' @param table dataframe of retention time, m/z ratio, signal strength. +#' +#' @param features dataframe of retention time, m/z ratio, signal strength. #' @return returns #' \itemize{ #' \item masses - m/z ratio @@ -15,11 +15,16 @@ combine.seq.3 <- function(features) { l <- nrow(features) breaks <- c(0, which(features$labels[1:(l - 1)] != features$labels[2:l]), l) - new_table <- data.frame(mz = rep(0, length(breaks) - 1), labels = unique(features$labels), intensities = rep(0, length(breaks) - 1)) - + new_table <- tibble::tibble( + mz = rep(0, length(breaks) - 1), + labels = unique(features$labels), + intensities = rep(0, length(breaks) - 1) + ) + for (i in 1:(length(breaks) - 1)) { start <- breaks[i] + 1 end <- breaks[i + 1] + mz <- features$mz[start:end] ints <- features$intensities[start:end] @@ -29,20 +34,3 @@ combine.seq.3 <- function(features) { return(new_table) } - -combine.seq.3_old <- - function(a, mz, inte) ### the input need to be pre-ordered by a - { - l <- length(a) - breaks <- c(0, which(a[1:(l - 1)] != a[2:l]), l) - new.int <- new.mz <- rep(0, length(breaks) - 1) - - for (i in 1:(length(breaks) - 1)) { - this.int <- inte[(breaks[i] + 1):breaks[i + 1]] - this.mz <- mz[(breaks[i] + 1):breaks[i + 1]] - new.int[i] <- sum(this.int) - new.mz[i] <- median(this.mz[which(this.int == max(this.int))]) - } - new.a <- unique(a) - return(cbind(new.mz, new.a, new.int)) - } diff --git a/R/extract_features.R b/R/extract_features.R index ef02533a..e7433bcc 100644 --- a/R/extract_features.R +++ b/R/extract_features.R @@ -4,7 +4,7 @@ NULL #' feature extraction #' -#' extract feature +#' extract features #' #' @param cluster The number of CPU cores to be used #' @param filenames The CDF file names. @@ -35,8 +35,7 @@ NULL #' model in an EIC. #' @param BIC_factor The factor that is multiplied on the number of parameters to modify the BIC criterion. #' If larger than 1, models with more peaks are penalized more. 
-#' @examples -#' extract_features(cluster, filenames, min_pres, min_run, mz_tol, 0, 0.05, intensity_weighted, NA, NA, sd_cut, sigma_ratio_lim, "bi-Gaussian", "moment", 0.01, 1, 2.0) +#' @export extract_features <- function( cluster, filenames, @@ -62,6 +61,8 @@ extract_features <- function( 'load.lcms', 'adaptive.bin', 'find.turn.point', + 'msExtrema', + 'find_local_maxima', 'combine.seq.3', 'cont.index', 'interpol.area', @@ -74,13 +75,26 @@ extract_features <- function( 'compute_boundaries', 'increment_counter', 'rm.ridge', - 'compute_base_curve', - 'compute_all_times', + 'compute_delta_rt', 'bigauss.mix', 'bigauss.esti', 'rev_cum_sum', - 'compute_bounds' + 'compute_bounds', + 'validate_inputs', + 'preprocess_bandwidth', + 'preprocess_profile', + 'compute_gaussian_peak_shape', + 'compute_chromatographic_profile', + 'compute_dx', + 'compute_initiation_params', + 'compute_e_step', + 'compute_start_bound', + 'compute_end_bound', + 'compute_bounds', + 'compute_scale' )) + snow::clusterEvalQ(cluster, library("dplyr")) + snow::parLapply(cluster, filenames, function(filename) { profile <- proc.cdf( @@ -95,7 +109,7 @@ extract_features <- function( cache = FALSE ) features <- prof.to.features( - a = profile, + profile = profile, min.bw = min_bandwidth, max.bw = max_bandwidth, sd.cut = sd_cut, diff --git a/R/feature.align.R b/R/feature.align.R index 9646562d..030ba6d4 100644 --- a/R/feature.align.R +++ b/R/feature.align.R @@ -11,25 +11,29 @@ to_attach <- function(pick, number_of_samples, use = "sum") { for (i in seq_along(strengths)) { # select all areas/RTs from the same sample values <- pick[pick[, 6] == i, 5] - if (use == "sum") + if (use == "sum") { strengths[i] <- sum(values) - if (use == "median") + } + if (use == "median") { strengths[i] <- median(values) + } # can be NA if pick does not contain any data from a sample } # average of m/z, average of rt, min of m/z, max of m/z, sum/median of areas/RTs - return(c(mean(pick[, 1]), mean(pick[, 2]), min(pick[, 1]), - max(pick[, 1]), strengths)) + return(c( + mean(pick[, 1]), mean(pick[, 2]), min(pick[, 1]), + max(pick[, 1]), strengths + )) } } create_output <- function(sample_grouped, number_of_samples, deviation) { - return(c(to_attach(sample_grouped, number_of_samples, use = "sum"), - to_attach(sample_grouped[, c(1, 2, 3, 4, 2, 6)], number_of_samples, use = "median"), - deviation - ) - ) + return(c( + to_attach(sample_grouped, number_of_samples, use = "sum"), + to_attach(sample_grouped[, c(1, 2, 3, 4, 2, 6)], number_of_samples, use = "median"), + deviation + )) } @@ -81,27 +85,63 @@ select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence, for (i in seq_along(turns$peaks)) { sample_grouped <- filter_based_on_density(sample, turns, 1, i) if (validate_contents(sample_grouped, min_occurrence)) { - return(select_rt(sample_grouped, rt_tol_relative, min_occurrence, number_of_samples)) + return(select_rt(sample_grouped, rt_tol_relative, min_occurrence, number_of_samples)) } } } +create_rows <- function(i, + grps, + sel.labels, + mz_values, + rt, + area, + sample_id, + mz_tol_relative, + rt_tol_relative, + min_occurrence, + number_of_samples) { + if (i %% 100 == 0) { + gc() + } # call Garbage Collection for performance improvement? 
+ # select a group + group_ids <- which(grps == sel.labels[i]) + if (length(group_ids) > 1) { + # select data from the group + sample <- cbind( + mz_values[group_ids], rt[group_ids], rt[group_ids], + rt[group_ids], area[group_ids], sample_id[group_ids] + ) + # continue if data is from at least 'min_occurrence' samples + if (validate_contents(sample, min_occurrence)) { + return(select_mz(sample, mz_tol_relative, rt_tol_relative, min_occurrence, number_of_samples)) + } + } else if (min_occurrence == 1) { + sample_grouped <- c( + mz_values[group_ids], rt[group_ids], rt[group_ids], + rt[group_ids], area[group_ids], sample_id[group_ids] + ) + return(create_output(sample_grouped, number_of_samples, NA)) + } + return(NULL) +} + #' Align peaks from spectra into a feature table. -#' +#' #' Identifies which of the peaks from the profiles correspond to the same feature. -#' +#' #' @param features A list object. Each component is a matrix which is the output from proc.to.feature(). #' @param min_occurrence A feature has to show up in at least this number of profiles to be included in the final result. #' @param mz_tol_relative The m/z tolerance level for peak alignment. The default is NA, which allows the -#' program to search for the tolerance level based on the data. This value is expressed as the +#' program to search for the tolerance level based on the data. This value is expressed as the #' percentage of the m/z value. This value, multiplied by the m/z value, becomes the cutoff level. -#' @param rt_tol_relative The retention time tolerance level for peak alignment. The default is NA, which +#' @param rt_tol_relative The retention time tolerance level for peak alignment. The default is NA, which #' allows the program to search for the tolerance level based on the data. -#' @param mz_max_diff Argument passed to find.tol(). Consider only m/z diffs smaller than this value. +#' @param mz_max_diff Argument passed to find.tol(). Consider only m/z diffs smaller than this value. #' This is only used when the mz_tol_relative is NA. -#' @param mz_tol_absolute As the m/z tolerance is expressed in relative terms (ppm), it may not be suitable -#' when the m/z range is wide. This parameter limits the tolerance in absolute terms. It mostly +#' @param mz_tol_absolute As the m/z tolerance is expressed in relative terms (ppm), it may not be suitable +#' when the m/z range is wide. This parameter limits the tolerance in absolute terms. It mostly #' influences feature matching in higher m/z range. #' @param do.plot Indicates whether plot should be drawn. #' @param rt_colname Name of the column containing the retention time information. 
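A hedged usage sketch for the feature.align interface documented above; `extracted` stands for the list of per-sample peak tables produced upstream (e.g. by extract_features), and the tolerance values are illustrative, not recommendations:

# assumes `extracted` already exists, one peak table per sample
aligned <- feature.align(
  features = extracted,
  min_occurrence = 2,
  mz_tol_relative = 1e-5,  # 10 ppm, illustrative
  rt_tol_relative = 2,     # illustrative retention-time tolerance
  do.plot = FALSE
)
aligned$aligned.ftrs  # feature-by-sample intensity table
aligned$pk.times      # feature-by-sample retention times
aligned$mz.tol        # m/z tolerance actually used
aligned$chr.tol       # retention-time tolerance actually used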
@@ -128,141 +168,175 @@ feature.align <- function(features, draw_plot(label = "Feature alignment", cex = 2) draw_plot() } - - number_of_samples <- nrow(summary(features)) + + features <- lapply(features, function(x) { + x <- tibble::as_tibble(x) + if("pos" %in% colnames(x)) { + x <- x |> dplyr::rename(rt = pos) + } + return(x) + }) + + number_of_samples <- length(features) if (number_of_samples > 1) { - values <- get_feature_values(features, rt_colname) + values <- concatenate_feature_tables(features, rt_colname) |> dplyr::arrange_at(c("mz", "rt")) + mz_values <- values$mz rt <- values$rt sample_id <- values$sample_id - + # sort all values by m/z, if equal by rt - ordering <- order(mz_values, rt) - mz_values <- mz_values[ordering] - rt <- rt[ordering] - sample_id <- sample_id[ordering] - + # ordering <- order(mz_values, rt) + # mz_values <- mz_values[ordering] + # rt <- rt[ordering] + # sample_id <- sample_id[ordering] + # find relative m/z tolerance level if (is.na(mz_tol_relative)) { mz_tol_relative <- find.tol(mz_values, mz_max_diff = mz_max_diff, do.plot = do.plot) if (length(mz_tol_relative) == 0) { mz_tol_relative <- 1e-5 - warning("Automatic tolerance finding failed, 10 ppm was assigned. + warning("Automatic tolerance finding failed, 10 ppm was assigned. May need to manually assign alignment mz tolerance level.") } } else if (do.plot) { - draw_plot(main = "alignment m/z tolerance level given", - label = mz_tol_relative, cex = 1.2) + draw_plot( + main = "alignment m/z tolerance level given", + label = mz_tol_relative, cex = 1.2 + ) } - + if (!is.na(rt_tol_relative) && do.plot) { - draw_plot(main = "retention time \n tolerance level given", - label = rt_tol_relative, cex = 1.2) + draw_plot( + main = "retention time \n tolerance level given", + label = rt_tol_relative, cex = 1.2 + ) } - + # find relative retention time tolerance level # also does some preprocessing grouping steps all_features <- find.tol.time(mz_values, - rt, - sample_id, - number_of_samples = number_of_samples, - mz_tol_relative = mz_tol_relative, - rt_tol_relative = rt_tol_relative, - mz_tol_absolute = mz_tol_absolute, - do.plot = do.plot) + rt, + sample_id, + number_of_samples = number_of_samples, + mz_tol_relative = mz_tol_relative, + rt_tol_relative = rt_tol_relative, + mz_tol_absolute = mz_tol_absolute, + do.plot = do.plot + ) rt_tol_relative <- all_features$rt.tol - + message("**** performing feature alignment ****") message(paste("m/z tolerance level: ", mz_tol_relative)) message(paste("time tolerance level:", rt_tol_relative)) - + # create zero vectors of length number_of_samples + 4 ? 
- aligned_features <- pk.times <- rep(0, 4 + number_of_samples) + aligned_features <- pk.times <- NULL mz_sd <- 0 - + labels <- unique(all_features$grps) area <- grps <- mz_values - + # grouping the features based on their m/z values (assuming the tolerance level) sizes <- c(0, cumsum(sapply(features, nrow))) for (i in 1:number_of_samples) { sample <- features[[i]] # order by m/z then by rt - sample <- sample[order(sample[, 1], sample[, 2]),] - + # sample <- sample[order(sample[, 1], sample[, 2]),] + sample <- features[[i]] |> dplyr::arrange_at(c("mz", "rt")) + # select preprocessed features belonging to current sample group_ids <- which(all_features$lab == i) # select m/z, rt and their group ID - sample_grouped <- cbind(all_features$mz[group_ids], all_features$rt[group_ids], all_features$grps[group_ids]) - sample_grouped <- sample_grouped[order(sample_grouped[, 1], sample_grouped[, 2]),] - + # sample_grouped <- cbind(all_features$mz[group_ids], all_features$rt[group_ids], all_features$grps[group_ids]) + sample_grouped <- tibble::tibble(mz = all_features$mz[group_ids], rt = all_features$rt[group_ids], grps = all_features$grps[group_ids]) + + # sample_grouped <- sample_grouped[order(sample_grouped[, 1], sample_grouped[, 2]),] + sample_grouped <- sample_grouped |> dplyr::arrange_at(c("mz", "rt")) + + # update m/z, rt, area values with ordered ones - mz_values[(sizes[i] + 1):sizes[i + 1]] <- sample[, 1] - rt[(sizes[i] + 1):sizes[i + 1]] <- sample[, 2] - area[(sizes[i] + 1):sizes[i + 1]] <- sample[, 5] + mz_values[(sizes[i] + 1):sizes[i + 1]] <- sample$mz + rt[(sizes[i] + 1):sizes[i + 1]] <- sample$rt + area[(sizes[i] + 1):sizes[i + 1]] <- sample$area # assign row identifier - grps[(sizes[i] + 1):sizes[i + 1]] <- sample_grouped[, 3] + grps[(sizes[i] + 1):sizes[i + 1]] <- sample_grouped$grps # assign batch identifier sample_id[(sizes[i] + 1):sizes[i + 1]] <- i } - + # table with number of values per group groups_cardinality <- table(all_features$grps) - # count those with minimal occurrence + # count those with minimal occurrence # (times 3 ? shouldn't be number of samples) !!! curr.row <- sum(groups_cardinality >= min_occurrence) * 3 mz_sd <- rep(0, curr.row) - + sel.labels <- as.numeric(names(groups_cardinality)[groups_cardinality >= min_occurrence]) - + # retention time alignment - aligned_features <- - foreach::foreach(i = seq_along(sel.labels), .combine = rbind) %do% { - if (i %% 100 == 0) - gc() # call Garbage Collection for performance improvement? 
- # select a group - group_ids <- which(grps == sel.labels[i]) - if (length(group_ids) > 1) { - # select data from the group - sample <- cbind(mz_values[group_ids], rt[group_ids], rt[group_ids], - rt[group_ids], area[group_ids], sample_id[group_ids]) - # continue if data is from at least 'min_occurrence' samples - if (validate_contents(sample, min_occurrence)) { - return(select_mz(sample, mz_tol_relative, rt_tol_relative, min_occurrence, number_of_samples)) - } - } else if (min_occurrence == 1) { - sample_grouped <- c(mz_values[group_ids], rt[group_ids], rt[group_ids], - rt[group_ids], area[group_ids], sample_id[group_ids]) - return(create_output(sample_grouped, number_of_samples, NA)) - } - return(NULL) - } - + # aligned_features <- + # foreach::foreach(i = seq_along(sel.labels), .combine = rbind) %do% + # create_rows( + # i, + # grps, + # sel.labels, + # mz_values, + # rt, + # area, + # sample_id, + # mz_tol_relative, + # rt_tol_relative, + # min_occurrence, + # number_of_samples + # ) + for(i in seq_along(sel.labels)) { + rows <- create_rows( + i, + grps, + sel.labels, + mz_values, + rt, + area, + sample_id, + mz_tol_relative, + rt_tol_relative, + min_occurrence, + number_of_samples + ) + + aligned_features <- rbind(aligned_features, rows) + } + #aligned_features <- aligned_features[-1, ] + # select columns: average of m/z, average of rt, min of m/z, max of m/z, median of rt per sample (the second to_attach call) pk.times <- aligned_features[, (5 + number_of_samples):(2 * (4 + number_of_samples))] mz_sd <- aligned_features[, ncol(aligned_features)] # select columns: average of m/z, average of rt, min of m/z, max of m/z, sum of areas per sample (the first to_attach call) aligned_features <- aligned_features[, 1:(4 + number_of_samples)] - + # rename columns on both tables, samples are called "exp_i" colnames(aligned_features) <- colnames(pk.times) <- c("mz", "time", "mz.min", "mz.max", paste("exp", 1:number_of_samples)) - + # return both tables and both computed tolerances rec <- new("list") rec$aligned.ftrs <- aligned_features rec$pk.times <- pk.times rec$mz.tol <- mz_tol_relative rec$chr.tol <- rt_tol_relative - + if (do.plot) { - hist(mz_sd, xlab = "m/z SD", ylab = "Frequency", - main = "m/z SD distribution") - hist(apply(pk.times[, -1:-4], 1, sd, na.rm = TRUE), - xlab = "Retention time SD", ylab = "Frequency", - main = "Retention time SD distribution") + hist(mz_sd, + xlab = "m/z SD", ylab = "Frequency", + main = "m/z SD distribution" + ) + hist(apply(pk.times[, -1:-4], 1, sd, na.rm = TRUE), + xlab = "Retention time SD", ylab = "Frequency", + main = "Retention time SD distribution" + ) } - + return(rec) } else { message("There is but one experiment. What are you trying to align?") diff --git a/R/find.tol.time.R b/R/find.tol.time.R index 96dddfd0..6c976218 100644 --- a/R/find.tol.time.R +++ b/R/find.tol.time.R @@ -1,24 +1,171 @@ -#' An internal function that find elution time tolerance level. -#' +#' Compute minimum mz tolerance to use. +#' @description +#' Compute the minimum mz tolerance based on the relative +#' tolerance and the mz values and the absolute tolerance. +#' Uses midpoints between mz values for the weighting. +#' @param mz vector Mz values to use. +#' @param mz_tol_relative float Relative mz tolerance to use with the mz values. +#' This forms a sort of weighted tolerance. +#' @param mz_tol_absolute float Absolute tolerance to use independent from the mz values. +#' @return float Minimum tolerance values to use. 
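A quick numerical check of the helper documented above and defined immediately below (it is internal, so assume the package source is loaded; the masses are arbitrary):

mz <- c(100.000, 100.005, 200.000)
# midpoints are 100.0025 and 150.0025, so the relative terms are about 0.0010 and 0.0015
compute_min_mz_tolerance(mz, mz_tol_relative = 1e-5, mz_tol_absolute = 0.01)
# ~0.0010: the smallest of the absolute cap and the per-gap relative tolerances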
+compute_min_mz_tolerance <- function(mz, mz_tol_relative, mz_tol_absolute) { + l <- length(mz) + mz_midpoints <- ((mz[2:l] + mz[1:(l - 1)]) / 2) + mz_ftr_relative_tolerances <- mz_tol_relative * mz_midpoints + min_mz_tol <- min(mz_tol_absolute, mz_ftr_relative_tolerances) + return(min_mz_tol) +} + +#' @description +#' Compute indices of mass differences greater than min_mz_tol. +#' @param mz mz values of all peaks in all profiles in the study. +#' @param min_mz_tol float Minimum tolerance value. +#' @return breaks Integer indices of mass differences to use. +compute_breaks_3 <- function(mz, min_mz_tol) { + l <- length(mz) + mass_differences <- diff(mz) + indices <- which(mass_differences > min_mz_tol) + breaks <- c(0, indices, l) + return(breaks) +} + +#' Compute rt relative tolerance to use. +#' @description +#' Compute the elution time tolerance based on the kernel density estimation. +#' It plots the fitting function if set to TRUE. +#' @param max.num.segments the maximum number of segments. +#' @param aver.bin.size The average bin size to determine the number of equally spaced points in the kernel density estimation. +#' @param number_of_samples The number of spectra in this analysis. +#' @param chr retention time of all peaks in all profiles in the study. +#' @param min.bins the minimum number of bins to use in the kernel density estimation. It overrides aver.bin.size when too few observations are present. +#' @param max.bins the maximum number of bins to use in the kernel density estimation. It overrides aver.bin.size when too many observations are present. +#' @param do.plot Indicates whether plot should be drawn. +#' @return rt_tol_relative the elution time tolerance. +compute_rt_tol_relative <- function(breaks, + max.num.segments, + aver.bin.size, + number_of_samples, + chr, + min.bins, + max.bins, + do.plot = FALSE) { + distances <- 0 + #' This conditional makes sure that length(s) is <= max.num.segments + #' If False, length(s) = max.num.segments, and s[i] is the largest + #' integer no greater than the corresponding element. Otherwise + #' length(s) = length(breaks) - 1 + if (length(breaks) > max.num.segments) { + s <- floor(seq(2, length(breaks), length.out = max.num.segments)) + } else { + s <- 2:length(breaks) + } + + #' This loop creates a vector with distances between rt peaks. Distances + #' are stored in a triangular matrix and converted to a vector subsequently. + #' Vector length should be < 100, otherwise, vector is + #' constructed extracting only 100 samples. 
+ for (i in s) { + subset_idx <- (breaks[i - 1] + 1):breaks[i] # create subset of indices + if (length(subset_idx) <= 3 * number_of_samples) { + rt_distances <- as.vector(dist(chr[subset_idx])) + if (length(rt_distances) > 100) { + rt_distances <- sample(rt_distances, 100) + } + distances <- c(distances, rt_distances) + } + } + + # #' Calculation of kernel density estimation to estimate the rt_tol_relative + # da <- da[!is.na(da)] + # uppermost <- max(da) + # n <- min(max.bins, max(min.bins, round(length(da) / aver.bin.size))) + # des <- density(da, + # kernel = "gaussian", n = n, + # bw = uppermost / n * 2, from = 0 + # ) + # y <- des$y[des$x > 0] + # x <- des$x[des$x > 0] + + # this.l <- lm(y[x > uppermost / 4] ~ x[x > uppermost / 4]) + # exp.y <- this.l$coef[1] + this.l$coef[2] * x + # y2 <- y[1:(length(y) - 1)] + # y3 <- y[2:(length(y))] + # y2[which(y2 < y3)] <- y3[which(y2 < y3)] + # y[1:(length(y) - 1)] <- y2 + + # yy <- cumsum(y > 1.5 * exp.y) + # yi <- seq_along(yy) + # sel <- min(which(yy < yi)) - 1 + # rt_tol_relative <- x[sel] + + # a long vector of distances between rt values (with no particular order) + distances <- distances[!is.na(distances)] + max_distance <- max(distances) # maximal distance + # number of equally spaced points at which the density is to be estimated + n <- min(max.bins, max(min.bins, round(length(distances) / aver.bin.size))) + # estimate probability density function of distances + des <- density(distances, + kernel = "gaussian", n = n, + bw = max_distance / n * 2, from = 0 + ) + # the n (-1?) coordinates of the points where the density is estimated + points <- des$x[des$x > 0] + # the estimated density values + density_values <- des$y[des$x > 0] + + linear_regression_model <- lm(density_values[points > max_distance / 4] ~ points[points > max_distance / 4]) + + # compute probability density values (y) using the linear function + estimated_density_values <- linear_regression_model$coef[1] + linear_regression_model$coef[2] * points + + values_not_last <- density_values[1:(length(density_values) - 1)] # density values without the last one + values_not_first <- density_values[2:(length(density_values))] # density values without the first one + # pair-wise copy greater value to the left + values_not_last[which(values_not_last < values_not_first)] <- values_not_first[which(values_not_last < values_not_first)] + density_values[1:(length(density_values) - 1)] <- values_not_last + + # cumulative sum - where density value is greater than estimated density value + # cutoff is selected where the density of the empirical distribution is >1.5 times the density of the distribution + cumulative <- cumsum(density_values > 1.5 * estimated_density_values) + cumulative_indices <- seq_along(cumulative) + # find last index where density value is greater than estimated density value + selected <- min(which(cumulative < cumulative_indices)) - 1 + # corresponding coordinate is used as rt tolerance + rt_tol_relative <- points[selected] + + if (do.plot) { + tolerance_plot( + points, + density_values, + estimated_density_values, + selected, + main = "find retention time tolerance" + ) + } + return(rt_tol_relative) +} + +#' An internal function that find elution time tolerance level. +#' #' This function finds the time tolerance level. Also, it returns the grouping information given the time tolerance. -#' +#' #' @param mz mz values of all peaks in all profiles in the study. #' @param chr retention time of all peaks in all profiles in the study. 
#' @param lab label of all peaks in all profiles in the study. #' @param number_of_samples The number of spectra in this analysis. -#' @param mz_tol_relative m/z tolerance level for the grouping of signals into peaks. This value is expressed as the percentage of the m/z value. +#' @param mz_tol_relative m/z tolerance level for the grouping of signals into peaks. This value is expressed as the percentage of the m/z value. #' This value, multiplied by the m/z value, becomes the cutoff level. -#' @param rt_tol_relative the elution time tolerance. If NA, the function finds the tolerance level first. If a numerical value is given, +#' @param rt_tol_relative the elution time tolerance. If NA, the function finds the tolerance level first. If a numerical value is given, #' the function directly goes to the second step - grouping peaks based on the tolerance. #' @param aver.bin.size The average bin size to determine the number of equally spaced points in the kernel density estimation. #' @param min.bins the minimum number of bins to use in the kernel density estimation. It overrides aver.bin.size when too few observations are present. #' @param max.bins the maximum number of bins to use in the kernel density estimation. It overrides aver.bin.size when too many observations are present. -#' @param mz_tol_absolute As the m/z tolerance in alignment is expressed in relative terms (ppm), it may not be suitable when the m/z range is wide. +#' @param mz_tol_absolute As the m/z tolerance in alignment is expressed in relative terms (ppm), it may not be suitable when the m/z range is wide. #' This parameter limits the tolerance in absolute terms. It mostly influences feature matching in higher m/z range. #' @param max.num.segments the maximum number of segments. #' @param do.plot Indicates whether plot should be drawn. -#' @return A matrix with six columns. Every row corrsponds to a peak in one of the spectrum. The columns are: m/z, elution time, spread, signal strength, -#' spectrum label, and peak group label. The rows are ordered by the median m/z of each peak group, and with each peak group the rows are ordered +#' @return A matrix with six columns. Every row corresponds to a peak in one of the spectrum. The columns are: m/z, elution time, spread, signal strength, +#' spectrum label, and peak group label. The rows are ordered by the median m/z of each peak group, and with each peak group the rows are ordered #' by the elution time. 
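The grouping described here relies on compute_breaks_3 from earlier in this hunk; a small illustration with invented, pre-sorted masses (the helper is internal, so assume the package source is loaded):

mz <- c(100.0000, 100.0005, 100.2000, 100.2004, 250.0000)
compute_breaks_3(mz, min_mz_tol = 0.01)
# 0 2 4 5 -> the gaps after positions 2 and 4 exceed the tolerance,
#            so the m/z groups are {1,2}, {3,4} and {5}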
#' @examples #' find.tol.time(mz, chr, lab, number_of_samples = number_of_samples, mz_tol_relative = mz_tol_relative, mz_tol_absolute = mz_tol_absolute, do.plot = FALSE) @@ -34,119 +181,54 @@ find.tol.time <- function(mz, mz_tol_absolute = 0.01, max.num.segments = 10000, do.plot = TRUE) { - # sort m/z, rt, and sampel IDs based on m/z - mz_ordering <- order(mz) - mz <- mz[mz_ordering] - rt <- rt[mz_ordering] - sample_id <- sample_id[mz_ordering] - rm(mz_ordering) - - l <- length(mz) - - # indices of m/z where difference from its neighbor is greater than allowed - # tolerance; as result, it separates m/z to groups with similar values - breaks <- - c(0, which((mz[2:l] - mz[1:(l - 1)]) - > min(mz_tol_absolute, - mz_tol_relative * ((mz[2:l] + mz[1:(l - 1)]) / 2) - ) - ), - l) - - for (i in 2:length(breaks)) { - # sort rt inside each m/z group - rt_ordering <- order(rt[(breaks[i - 1] + 1):breaks[i]]) - rt_ordering <- rt_ordering + breaks[i - 1] - # reorder other m/z, rt and sample ID within group based on rt order - mz[(breaks[i - 1] + 1):breaks[i]] <- mz[rt_ordering] - rt[(breaks[i - 1] + 1):breaks[i]] <- rt[rt_ordering] - sample_id[(breaks[i - 1] + 1):breaks[i]] <- sample_id[rt_ordering] + features <- tibble::tibble(mz = mz, rt = rt, labels = sample_id) + features <- dplyr::arrange_at(features, "mz") + + min_mz_tol <- compute_min_mz_tolerance( + features$mz, + mz_tol_relative, + mz_tol_absolute + ) + + mz_breaks <- compute_breaks_3(features$mz, min_mz_tol) + features$mz_group <- 0 + + for (i in 2:length(mz_breaks)) { + subset_indices <- (mz_breaks[i - 1] + 1):mz_breaks[i] + features$mz_group[subset_indices] <- i } - - # remove fist and last index - breaks <- breaks[c(-1, -length(breaks))] - + + features <- features |> dplyr::arrange_at(c("mz_group", "rt")) + + mz_breaks <- mz_breaks[c(-1, -length(mz_breaks))] + if (is.na(rt_tol_relative)) { - distances <- 0 - # create indices for each groups - if (length(breaks) > max.num.segments) { - segments <- floor(seq(2, length(breaks), length.out = max.num.segments)) - } else { - segments <- 2:length(breaks) - } - - # compute distance matrix of each group using stats::dist - for (i in segments) { - segment <- (breaks[i - 1] + 1):breaks[i] - - if (length(segment) <= 3 * number_of_samples) { - distance_matrix <- as.vector(dist(rt[segment])) - if (length(distance_matrix) > 100) - distance_matrix <- sample(distance_matrix, 100) - distances <- c(distances, distance_matrix) - } - } - - # goal: statistically find the smallest rt which is still significant - - # a long vector of distances between rt values (with no particular order) - distances <- distances[!is.na(distances)] - max_distance <- max(distances) # maximal distance - # number of equally spaced points at which the density is to be estimated - n = min(max.bins, max(min.bins, round(length(distances) / aver.bin.size))) - # estimate probability density function of distances - des <- density(distances, kernel = "gaussian", n = n, - bw = max_distance / n * 2, from = 0) - # the n (-1?) 
coordinates of the points where the density is estimated - points <- des$x[des$x > 0] - # the estimated density values - density_values <- des$y[des$x > 0] - - linear_regression_model <- lm(density_values[points > max_distance / 4] ~ points[points > max_distance / 4]) - - # compute probability density values (y) using the linear function - estimated_density_values <- linear_regression_model$coef[1] + linear_regression_model$coef[2] * points - - values_not_last <- density_values[1:(length(density_values) - 1)] # density values without the last one - values_not_first <- density_values[2:(length(density_values))] # density values without the first one - # pair-wise copy greater value to the left - values_not_last[which(values_not_last < values_not_first)] <- values_not_first[which(values_not_last < values_not_first)] - density_values[1:(length(density_values) - 1)] <- values_not_last - - # cumulative sum - where density value is greater than estimated density value - # cutoff is selected where the density of the empirical distribution is >1.5 times the density of the distribution - cumulative <- cumsum(density_values > 1.5 * estimated_density_values) - cumulative_indices <- seq_along(cumulative) - # find last index where density value is greater than estimated density value - selected <- min(which(cumulative < cumulative_indices)) - 1 - # corresponding coordinate is used as rt tolerance - rt_tol_relative <- points[selected] - - if (do.plot) { - tolerance_plot(points, density_values, estimated_density_values, selected, main = "find retention time tolerance") - } + rt_tol_relative <- compute_rt_tol_relative( + mz_breaks, + max.num.segments, + aver.bin.size, + number_of_samples, + features$rt, + min.bins, + max.bins + ) } - - # pair-wise rt differences - distances <- rt[2:l] - rt[1:(l - 1)] - # find those respecting the computed tolerance - breaks_rt <- which(distances > rt_tol_relative) - # merge and sort both group delimiters - breaks_final <- c(0, unique(c(breaks, breaks_rt)), l) - breaks_final <- breaks_final[order(breaks_final)] - - # create list of indices corresponding to a representative from each group - # (always the first element) - groups <- seq_along(mz) - for (i in 2:length(breaks_final)) { - groups[(breaks_final[i - 1] + 1):breaks_final[i]] <- i + + rt_diffs <- diff(features$rt) + rt_breaks <- which(rt_diffs > rt_tol_relative) + all.breaks <- c(0, unique(c(mz_breaks, rt_breaks)), nrow(features)) + all.breaks <- all.breaks[order(all.breaks)] + + features$grps <- 0 + for (i in 2:length(all.breaks)) { + features$grps[(all.breaks[i - 1] + 1):all.breaks[i]] <- i } - + list( - mz = mz, - rt = rt, - lab = sample_id, - grps = groups, + mz = features$mz, + rt = features$rt, + lab = features$labels, + grps = features$grps, rt.tol = rt_tol_relative, mz.tol = mz_tol_relative ) diff --git a/R/find.turn.point.R b/R/find.turn.point.R index 1f731098..29c39420 100644 --- a/R/find.turn.point.R +++ b/R/find.turn.point.R @@ -1,7 +1,47 @@ +#' @description +#' Compute local maxima turn points. +#' @param y The y values of a curve in x-y plane. +#' @param ties.method specifies the method rank uses to break ties. +#' @return boolean row with local maxima turn point. 
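A toy run of the refactored turn-point code defined just below (find.turn.point builds on msExtrema and find_local_maxima); the input vector is arbitrary:

y <- c(1, 3, 2, 5, 4)
find.turn.point(y)
# $pks:  2 4          (indices of local maxima)
# $vlys: 1 1 3 5 5    (the interior valley plus both endpoints, which can repeat)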
+#' @export +find_local_maxima <- function(y, ties.method) { + padded_y <- rev(as.vector(c(-Inf, y, -Inf))) + + # each row is 3 consecutive values in descending order + # rows are sorted in ascending order + z <- embed(padded_y, dim = 3) + + # reverse the row ordering + # first column is equal y + z <- z[rev(seq(nrow(z))), ] + + # row where the max is in the middle is a turn point + v <- max.col(z, ties.method = ties.method) == 2 + + return(v) +} + +#' @description +#' Compute maxima and minima turn points. +#' @param y The y values of a curve in x-y plane. +#' @return boolean row with local maxima and minima turn points. +#' @export +msExtrema <- function(y) { + index1 <- find_local_maxima(y, ties.method = "first") + index2 <- find_local_maxima(-y, ties.method = "last") + + # this is some sort of safety mechanism to protect against numerical issues + index.max <- index1 & !index2 + index.min <- index2 & !index1 + + list(index.max = index.max, index.min = index.min) +} + #' Find peaks and valleys of a curve. -#' +#' +#' @description #' This is an internal function which finds the peaks and valleys of a smooth curve. -#' +#' #' @param y The y values of a curve in x-y plane. #' @return A list object: #' \itemize{ @@ -11,48 +51,34 @@ #' @export #' @examples #' find.turn.point(y) -find.turn.point <- function(values) { - - find_extrema <- function(values, ties.method) { - # this might not work correctly - z <- embed(rev(as.vector(c(-Inf, values,-Inf))), dim = 3) - z <- z[rev(seq(nrow(z))),] - v <- max.col(z, ties.method = ties.method) == 2 - return(v) - } - - analyse_extrema <- function(values) { - minima <- find_extrema(values, ties.method = "first") # find peaks - maxima <- find_extrema(-values, ties.method = "last") # find valleys - # remove overlaps - maxima.indices <- minima & !maxima - minima.indices <- maxima & !minima - return(list(minima.indices = minima.indices, maxima.indices = maxima.indices)) - } - - values <- values[!is.na(values)] - # flat curve => peak is middle and valleys are the first and the last - if (length(unique(values)) == 1) { - x <- new("list") - x$pks <- round(length(values) / 2) - x$vlys <- c(1, length(values)) - return(x) - } - - extrema <- analyse_extrema(values) - maxima <- which(extrema$maxima.indices) - minima <- which(extrema$minima.indices) - # if the first or the last index are not maxima, added them to minima - if (maxima[1] != 1) - minima <- c(1, minima) - if (maxima[length(maxima)] != length(values)) - minima <- c(minima, length(values)) - # if there is only one maximum, the minima are the first and the last index - # (probably because Gaussian was used) - if (length(maxima) == 1) - minima <- c(1, length(values)) - x <- new("list") - x$pks <- maxima - x$vlys <- minima - return(x) +find.turn.point <- function(y) { + y <- y[!is.na(y)] # filter NA values + + if (length(unique(y)) == 1) { # if exactly one distinct value + middle_index <- round(length(y) / 2) # get mid index + start_and_end <- c(1, length(y)) # get first and last index + return(list(pks = middle_index, vlys = start_and_end)) + } else { + b <- msExtrema(y) + + pks <- which(b$index.max) + vlys <- which(b$index.min) + + if (pks[1] != 1) { + vlys <- c(1, vlys) + } + + if (pks[length(pks)] != length(y)) { + vlys <- c(vlys, length(y)) + } + + if (length(pks) == 1) { + vlys <- c(1, length(y)) + } + + x <- new("list") + x$pks <- pks + x$vlys <- vlys + return(x) + } } diff --git a/R/hybrid.R b/R/hybrid.R index fd6de2fc..eaa5386a 100644 --- a/R/hybrid.R +++ b/R/hybrid.R @@ -7,11 +7,11 @@ NULL 
match_tol_ppm <- aligned$mz_tolerance * 1e+06 } - features <- as.matrix(aligned$int_crosstab) + features <- tibble::as_tibble(aligned$int_crosstab) known_mz <- known_table[, 6] known_rt <- known_table[, 11] - mass_d2 <- mass.match(features[, 1], known_mz, match_tol_ppm) + mass_d2 <- mass.match(features$mz, known_mz, match_tol_ppm) mass_matched_pos <- which(mass_d2 > 0) known_assigned <- rep(0, nrow(known_table)) @@ -23,12 +23,12 @@ NULL if (new_assigned[i] == 0) { # find all potentially related known/newly found peaks prev_sel_new <- i - threshold <- features[i, 1] * match_tol_ppm / 1e+06 + threshold <- features$mz[i] * match_tol_ppm / 1e+06 - sel_known <- which(abs(known_mz - features[i, 1]) < threshold) + sel_known <- which(abs(known_mz - features$mz[i]) < threshold) sel_new <- NULL for (m in seq_along(sel_known)) { - distance <- abs(features[, 1] - known_mz[sel_known[m]]) + distance <- abs(features$mz - known_mz[sel_known[m]]) sel_new <- c(sel_new, which(distance < threshold)) } sel_known <- unique(sel_known) @@ -39,13 +39,13 @@ NULL sel_known <- NULL for (m in seq_along(sel_new)) { - distance <- abs(known_mz - features[sel_new[m], 1]) + distance <- abs(known_mz - features$mz[sel_new[m]]) sel_known <- c(sel_known, which(distance < threshold)) } sel_new <- NULL for (m in seq_along(sel_known)) { - distance <- abs(features[, 1] - known_mz[sel_known[m]]) + distance <- abs(features$mz - known_mz[sel_known[m]]) sel_new <- c(sel_new, which(distance < threshold)) } @@ -57,8 +57,8 @@ NULL matrix(data = 0, nrow = length(sel_known), ncol = length(sel_new)) for (k in seq_along(sel_known)) { - time_matched[k, ] <- abs(features[sel_new, 2] - known_rt[sel_known[k]]) - mass_matched[k, ] <- abs(features[sel_new, 1] - known_mz[sel_known[k]]) + time_matched[k, ] <- abs(features$rt[sel_new] - known_rt[sel_known[k]]) + mass_matched[k, ] <- abs(features$mz[sel_new] - known_mz[sel_known[k]]) } mass_matched <- mass_matched/median(known_mz[sel_known]) time_matched[mass_matched <= match_tol_ppm * 1e-06] <- 1e+10 @@ -98,6 +98,7 @@ augment_with_known_features <- function(aligned, known_table, match_tol_ppm) { sel <- sel[-(pairing[, 2])] } + to_add_ftrs[, 1] <- to_add_times[, 1] <- known_table[sel, 6] to_add_ftrs[, 2] <- to_add_times[, 2] <- known_table[sel, 11] to_add_ftrs[, 3] <- to_add_times[, 3] <- known_table[sel, 9] @@ -257,7 +258,7 @@ hybrid <- function( message("**** time correction ****") corrected <- adjust.time( - features = extracted, + extracted_features = extracted, mz_tol_relative = align_mz_tol, rt_tol_relative = align_chr_tol, mz_max_diff = 10 * mz_tol, @@ -284,6 +285,7 @@ hybrid <- function( match_tol_ppm = match_tol_ppm ) + message("**** weaker signal recovery ****") recovered <- recover_weaker_signals( cluster = cluster, @@ -305,7 +307,7 @@ hybrid <- function( message("**** second round time correction ****") recovered_corrected <- adjust.time( - features = recovered$extracted_features, + extracted_features = recovered$extracted_features, mz_tol_relative = align_mz_tol, rt_tol_relative = align_chr_tol, mz_max_diff = 10 * mz_tol, @@ -347,7 +349,7 @@ hybrid <- function( corrected_features = recovered_corrected, aligned_feature_sample_table = aligned_feature_sample_table, recovered_feature_sample_table = recovered_feature_sample_table, - aligned_mz_toletance = as.numeric(recovered_aligned$mz_tolerance), + aligned_mz_tolerance = as.numeric(recovered_aligned$mz_tolerance), aligned_rt_tolerance = as.numeric(recovered_aligned$rt_tolerance), updated_known_table = as.data.frame(augmented$known_table), 
features_known_table_pairing = as.data.frame(augmented$pairing) diff --git a/R/mass.match.R b/R/mass.match.R index 6a3aa3f8..b3dfe949 100644 --- a/R/mass.match.R +++ b/R/mass.match.R @@ -2,17 +2,21 @@ #' @param x The mz array for which to compute the matching. #' @param known.mz The mz value with which to match. #' @param match.tol.ppm Matching tolerance in ppm. -#' @return Binary vector, 1 indicating a match, 0 a mismatch. +#' @return Binary vector, 1 indicating a match, 0 a mismatch. +#' @export #' @examples -#' mass.match(aligned.ftrs, known.table, match.tol.ppm) -mass.match <- -function(x, known.mz, match.tol.ppm=5) -{ - mass.matched.pos<-rep(0, length(x)) - for(i in 1:length(x)) - { - this.d<-abs((x[i]-known.mz)/x[i]) - if(min(this.d) < match.tol.ppm/1e6) mass.matched.pos[i]<-1 - } - return(mass.matched.pos) +#' mass.match( +#' x = c(10, 20, 21), +#' known.mz = 20 +#' ) +mass.match <- function(x, known.mz, match.tol.ppm = 5) { + mass.matched.pos <- rep(0, length(x)) + for (i in seq_along(x)) + { + this.d <- abs((x[i] - known.mz) / x[i]) + if (min(this.d) < match.tol.ppm / 1e6) { + mass.matched.pos[i] <- 1 + } + } + return(mass.matched.pos) } diff --git a/R/plot.R b/R/plot.R index f9e4038f..e3ec4915 100644 --- a/R/plot.R +++ b/R/plot.R @@ -14,6 +14,13 @@ tolerance_plot <- function(x, y, exp_y, selected, main) { abline(v = x[selected], col = "blue") } +#' @export +draw_chr_normal_peaks <- function(x, truth) { + true.y1 <- dnorm(x[x < truth[1]], mean = truth[1], sd = truth[2]) * truth[2] * truth[4] + true.y2 <- dnorm(x[x >= truth[1]], mean = truth[1], sd = truth[3]) * truth[3] * truth[4] + lines(x, c(true.y1, true.y2), col = "green") +} + #' @export plot_raw_profile_histogram <- function(raw.prof, min.pres, @@ -72,3 +79,39 @@ plot_raw_profile_histogram <- function(raw.prof, main = "Group % present signal distribution" ) } + +#' @export +plot_peak_summary <- function(feature_groups, processed_features) { + mz_sd <- compute_mz_sd(feature_groups) + + par(mfrow = c(2, 2)) + plot(c(-1, 1), c(-1, 1), type = "n", xlab = "", ylab = "", main = "", axes = FALSE) + text(x = 0, y = 0, "Estimate peak \n area/location", cex = 1.5) + hist(mz_sd, xlab = "m/z SD", ylab = "Frequency", main = "m/z SD distribution") + hist(c(processed_features[, "sd1"], processed_features[, "sd2"]), xlab = "Retention time SD", ylab = "Frequency", main = "Retention time SD distribution") + hist(log10(processed_features[, "area"]), xlab = "peak strength (log scale)", ylab = "Frequency", main = "Peak strength distribution") +} + +#' @export +plot_chr_profile <- function(chr_profile, bw, fit, m) { + plot(chr_profile[, "base_curve"], chr_profile[, "intensity"], cex = .1, main = paste("bw=", bw)) + sum.fit <- apply(fit, 1, sum) + lines(chr_profile[, "base_curve"], sum.fit) + abline(v = m) + cols <- c("red", "green", "blue", "cyan", "brown", "black", rep("grey", 100)) + for (i in 1:length(m)) + { + lines(chr_profile[, "base_curve"], fit[, i], col = cols[i]) + } +} + +#' @export +plot_normix_bic <- function(x, y, bw, aaa) { + plot(x, y, cex = .1, main = paste("bw=", bw)) + abline(v = aaa[, 1]) + cols <- c("red", "green", "blue", "cyan", "brown", "black", rep("grey", 100)) + for (i in 1:nrow(aaa)) + { + lines(x, dnorm(x, mean = aaa[i, 1], sd = aaa[i, 2]) * aaa[i, 3], col = cols[i]) + } +} diff --git a/R/prof.to.features.R b/R/prof.to.features.R index f33cf807..c9bdd579 100644 --- a/R/prof.to.features.R +++ b/R/prof.to.features.R @@ -1,622 +1,830 @@ -solve.a <- function(x, t, a, sigma.1, sigma.2) { - ## thif function solves the value 
of a using the x, t, a from the - ## previous step, and sigma.1, and sigma.2 +#' Validate that provided inputs match expected, exit execution otherwise. +#' @param shape.model The mathematical model for the shape of a peak. There are two choices - "bi-Gaussian" and "Gaussian". +#' When the peaks are asymmetric, the bi-Gaussian is better. The default is "bi-Gaussian". +#' @param estim.method The estimation method for the bi-Gaussian peak model. Two possible values: moment and EM. +#' @export +validate_inputs <- function(shape.model, estim.method) { + if (!shape.model %in% c("Gaussian", "bi-Gaussian")) { + stop("shape.model argument must be 'Gaussian' or 'bi-Gaussian'") + } + if (!estim.method %in% c("moment", "EM")) { + stop("estim.method argument must be 'moment' or 'EM'") + } +} - w <- x * (as.numeric(t < a) / sigma.1 + as.numeric(t >= a) / sigma.2) - return(sum(t * w) / sum(w)) +#' Initialize minimum and maximum bandwidth values if none given. Ensure that minimum bandwidth is lower that maximum, else set minimum to 1/4 of maximum value. +#' @param min.bw The minimum bandwidth to use in the kernel smoother. +#' @param max.bw The maximum bandwidth to use in the kernel smoother. +#' @param profile Profile table with shape number-of-features*4. The table contains following columns: +#' \itemize{ +#' \item mz - float - mass-to-charge ratio of feature +#' \item rt - float - retention time of features +#' \item intensity - float - intensity of features +#' \item group_number - integer - group number assigned to each feature based on their rt similarity +#' } +#' @return Returns a list object with the following objects in it: +#' \itemize{ +#' \item min.bw - float - Minimum bandwidth. +#' \item max.bw - float - Maximum bandwidth +#' @export +preprocess_bandwidth <- function(min.bw, max.bw, profile) { + if (is.na(min.bw)) { + min.bw <- diff(range(profile[, 2], na.rm = TRUE)) / 60 + } + if (is.na(max.bw)) { + max.bw <- diff(range(profile[, 2], na.rm = TRUE)) / 15 + } + if (min.bw >= max.bw) { + min.bw <- max.bw / 4 + } + + return(list("min.bw" = min.bw, "max.bw" = max.bw)) } +#' Convert input matrix to a dataframe with column names (see source code for the names). +#' @param profile Profile table with shape number-of-features*4. The table contains following columns: +#' \itemize{ +#' \item float - mass-to-charge ratio of feature +#' \item float - retention time of features +#' \item float - intensity of features +#' \item integer - group number assigned to each feature based on their rt similarity +#' } +#' @return Returns a dataframe with shape number-of-features*4. The columns are as follows: +#' \itemize{ +#' \item mz - float - mass-to-charge ratio of feature +#' \item rt - float - retention time of features +#' \item intensity - float - intensity of features +#' \item group_number - integer - group number assigned to each feature based on their rt similarity +#' } #' @export -prep.uv <- function(x, t, a) { +preprocess_profile <- function(profile) { + keys <- c("mz", "rt", "intensity", "group_number") + colnames(profile) <- keys - ## this function prepares the parameters required for latter - ## compuation. u, v, and sum of x. - - temp <- (t - a)^2 * x - u <- sum(temp * as.numeric(t < a)) - v <- sum(temp * as.numeric(t >= a)) - return(list( - u = u, - v = v, - x.sum = sum(x) - )) + return(data.frame(profile)) } +#' Compute parameters of chromatographic peak shape if peaks are considered to be gaussian +#' @param chr_profile A matrix with two columns: "base.curve" (rt) and "intensity". 
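# A minimal sketch of the bandwidth defaults chosen by preprocess_bandwidth() above
# (the profile values are made up). With retention times spanning 0-600 s, min.bw and
# max.bw default to 1/60 and 1/15 of that range.
profile <- cbind(mz = rep(100.05, 3), rt = c(0, 300, 600), intensity = c(1e5, 2e5, 1.5e5), group_number = 1)
preprocess_bandwidth(min.bw = NA, max.bw = NA, profile = profile)
# list(min.bw = 10, max.bw = 40)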
+#' @param power The power parameter for data transformation when fitting the bi-Gaussian or Gaussian mixture model in an EIC. +#' @param bw Bandwidth vector to use in the kernel smoother. +#' @param component.eliminate When a component accounts for a proportion of intensities less than this value, the component will be ignored. +#' @param BIC.factor The factor that is multiplied on the number of parameters to modify the BIC criterion. If larger than 1, +#' models with more peaks are penalized more. +#' @param aver_diff Average retention time difference across RTs of all features. +#' @return Returns a single-row vector or a table object with the following items/columns: +#' \itemize{ +#' \item miu - float - mean value of the gaussian curve +#' \item sigma - float - standard deviation of the gaussian curve +#' \item sigma - float - standard deviation of the gaussian curve +#' \item scale - float - estimated total signal strength (total area of the estimated normal curve) +#'} +#' @export +compute_gaussian_peak_shape <- function(chr_profile, power, bw, component.eliminate, BIC.factor, aver_diff) { + chr_peak_shape <- normix.bic(chr_profile[, "base.curve"], chr_profile[, 2], power = power, bw = bw, eliminate = component.eliminate, BIC.factor = BIC.factor, aver_diff = aver_diff)$param + if (nrow(chr_peak_shape) == 1) { + chr_peak_shape <- c(chr_peak_shape[1, 1:2], chr_peak_shape[1, 2], chr_peak_shape[1, 3]) + } else { + chr_peak_shape <- cbind(chr_peak_shape[, 1:2], chr_peak_shape[, 2], chr_peak_shape[, 3]) + } + return(chr_peak_shape) +} + +#' This function solves the value of a using the x, t, a from the previous step, and sigma.1, and sigma.2 (original authors' comment). +#' @export +solve.a <- function(x, t, a, sigma.1, sigma.2) { + # This function is a part of bigauss.esti.EM and is not covered by any of test-cases + w <- x * (as.numeric(t < a) / sigma.1 + as.numeric(t >= a) / sigma.2) + return(sum(t * w) / sum(w)) +} + +#' This function prepares the parameters required for latter compuation. u, v, and sum of x (original authors' comment). +#' @export +prep.uv <- function(x, t, a) { + # This function is a part of bigauss.esti.EM and is not covered by any of test-cases + temp <- (t - a)^2 * x + u <- sum(temp * as.numeric(t < a)) + v <- sum(temp * as.numeric(t >= a)) + return(list( + u = u, + v = v, + x.sum = sum(x) + )) +} + +#' This function takes the value intensity level x, retention time t and assumed breaking point a, calculates the square estimated of sigma.1 and sigma.2 (original authors' comment). #' @export solve.sigma <- function(x, t, a) { - ## this function takes the value intensity level x, retention time t - ## and assumed breaking point a, calculates the square estimated of - ## sigma.1 and sigma.2. - - - tt <- prep.uv(x, t, a) - sigma.1 <- tt$u / tt$x.sum * ((tt$v / tt$u)^(1 / 3) + 1) - sigma.2 <- tt$v / tt$x.sum * ((tt$u / tt$v)^(1 / 3) + 1) - return(list( - sigma.1 = sigma.1, - sigma.2 = sigma.2 - )) + # This function is a part of bigauss.esti.EM and is not covered by any of test-cases + tt <- prep.uv(x, t, a) + sigma.1 <- tt$u / tt$x.sum * ((tt$v / tt$u)^(1 / 3) + 1) + sigma.2 <- tt$v / tt$x.sum * ((tt$u / tt$v)^(1 / 3) + 1) + return(list( + sigma.1 = sigma.1, + sigma.2 = sigma.2 + )) } +#' @description +#' Function takes into x and t, and then computes the value of sigma.1, sigma.2 and a using iterative method. 
the returned values include estimated sigmas, +#' a and a boolean variable on whether the termination criteria is satified upon the end of the program (original authors' comment). #' @export bigauss.esti.EM <- function(t, x, max.iter = 50, epsilon = 0.005, power = 1, do.plot = FALSE, truth = NA, sigma.ratio.lim = c(0.3, 1)) { - ## function takes into x and t, and then computes the value of - ## sigma.1, sigma.2 and a using iterative method. the returned - ## values include estimated sigmas, a and a boolean variable on - ## whether the termination criteria is satified upon the end of the - ## program. - - sel <- which(x > 1e-10) - if (length(sel) == 0) { - return(c(median(t), 1, 1, 0)) - } - if (length(sel) == 1) { - return(c(t[sel], 1, 1, 0)) + # This function is not covered by any test case + sel <- which(x > 1e-10) + if (length(sel) == 0) { + return(c(median(t), 1, 1, 0)) + } + if (length(sel) == 1) { + return(c(t[sel], 1, 1, 0)) + } + t <- t[sel] + x <- x[sel] + + ## epsilon is the threshold for continuing the iteration. change in + ## a smaller than epsilon will terminate the iteration. + ## epsilon <- min(diff(sort(t)))/2 + + ## using the median value of t as the initial value of a. + a.old <- t[which(x == max(x))[1]] + a.new <- a.old + change <- 10 * epsilon + + ## n.iter is the number of iteration covered so far. + n.iter <- 0 + + while ((change > epsilon) & (n.iter < max.iter)) { + a.old <- a.new + n.iter <- n.iter + 1 + sigma <- solve.sigma(x, t, a.old) + if (n.iter == 1) { + sigma[is.na(sigma)] <- as.numeric(sigma[which(!is.na(sigma))])[1] / 10 } - t <- t[sel] - x <- x[sel] - - ## epsilon is the threshold for continuing the iteration. change in - ## a smaller than epsilon will terminate the iteration. - ## epsilon <- min(diff(sort(t)))/2 - - ## using the median value of t as the initial value of a. - a.old <- t[which(x == max(x))[1]] - a.new <- a.old - change <- 10 * epsilon - - ## n.iter is the number of iteration covered so far. - n.iter <- 0 - - while ((change > epsilon) & (n.iter < max.iter)) { - ## print(c(n.iter,change)) - a.old <- a.new - n.iter <- n.iter + 1 - sigma <- solve.sigma(x, t, a.old) - if (n.iter == 1) sigma[is.na(sigma)] <- as.numeric(sigma[which(!is.na(sigma))])[1] / 10 - a.new <- solve.a(x, t, a.old, sigma$sigma.1, sigma$sigma.2) - change <- abs(a.old - a.new) - } - # return(list(a=a.new, - # sigma.1=sigma$sigma.1, - # sigma.2=sigma$sigma.2, - # iter.end=(max.iter>n.iter) - # )) - d <- x - sigma$sigma.2 <- sqrt(sigma$sigma.2) - sigma$sigma.1 <- sqrt(sigma$sigma.1) - - d[t < a.new] <- dnorm(t[t < a.new], mean = a.new, sd = sigma$sigma.1) * sigma$sigma.1 - d[t >= a.new] <- dnorm(t[t >= a.new], mean = a.new, sd = sigma$sigma.2) * sigma$sigma.2 - scale <- exp(sum(d[d > 1e-3]^2 * log(x[d > 1e-3] / d[d > 1e-3])) / sum(d[d > 1e-3]^2)) - return(c(a.new, sigma$sigma.1, sigma$sigma.2, scale)) + a.new <- solve.a(x, t, a.old, sigma$sigma.1, sigma$sigma.2) + change <- abs(a.old - a.new) + } + d <- x + sigma$sigma.2 <- sqrt(sigma$sigma.2) + sigma$sigma.1 <- sqrt(sigma$sigma.1) + + d[t < a.new] <- dnorm(t[t < a.new], mean = a.new, sd = sigma$sigma.1) * sigma$sigma.1 + d[t >= a.new] <- dnorm(t[t >= a.new], mean = a.new, sd = sigma$sigma.2) * sigma$sigma.2 + scale <- exp(sum(d[d > 1e-3]^2 * log(x[d > 1e-3] / d[d > 1e-3])) / sum(d[d > 1e-3]^2)) + return(c(a.new, sigma$sigma.1, sigma$sigma.2, scale)) } +#' Computes vector of cumulative sums on reversed input. Returns cumulative sum vector going from the sum of all elements to one. 
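# A small worked example of the behaviour described above: each element of the result
# holds the sum of itself and everything to its right.
rev_cum_sum(c(1, 2, 3))
# 6 5 3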
+#' @param x float - vector of numerical values.
+#' @return Returns the reverse cumulative sum vector.
 #' @export
 rev_cum_sum <- function(x) {
-    l <- length(x)
-    return(cumsum((x)[l:1])[l:1])
+  x <- rev(x)
+  return(rev(cumsum(x)))
+}
+
+#' Compute the first index at which the cumulative signal reaches the fraction left_sigma_ratio_lim / (left_sigma_ratio_lim + 1) of its total (lower bound of the peak-centre search).
+#' @export
+compute_start_bound <- function(x, left_sigma_ratio_lim) {
+  start_bound <- 1
+
+  len_x <- length(x)
+  idx <- which(x >= left_sigma_ratio_lim / (left_sigma_ratio_lim + 1) * x[len_x])
+  if (length(idx) > 0) {
+    start_bound <- max(1, min(idx))
+  }
+  return (start_bound)
+}
+
+#' Compute the last index (at most length(x) - 1) at which the cumulative signal is still below the fraction right_sigma_ratio_lim / (right_sigma_ratio_lim + 1) of its total (upper bound of the peak-centre search).
+#' @export
+compute_end_bound <- function(x, right_sigma_ratio_lim) {
+  len_x <- length(x)
+  end_bound <- len_x - 1
+
+  idx <- which(x <= right_sigma_ratio_lim / (right_sigma_ratio_lim + 1) * x[len_x])
+  if (length(idx) > 0) {
+    end_bound <- min(len_x - 1, max(idx))
+  }
+  return (end_bound)
 }
 
+#' @param x Cumulative intensity values.
+#' @param sigma.ratio.lim A vector of two. It enforces the belief of the range of the ratio between the left-standard deviation
+#' and the right-standard deviation of the bi-Gaussian function used to fit the data.
+#' @return Returns a list of bounds with the following items:
+#' \itemize{
+#' \item start - start bound
+#' \item end - end bound
+#' }
 #' @export
 compute_bounds <- function(x, sigma.ratio.lim) {
-    l <- length(x)
-    sel <- which(x >= sigma.ratio.lim[1] / (sigma.ratio.lim[1] + 1) * x[l])
-    if (length(sel) > 0) {
-      start <- max(1, min(sel))
-    } else {
-      start <- 1
-    }
-    sel <- which(x <= sigma.ratio.lim[2] / (sigma.ratio.lim[2] + 1) * x[l])
-    if (length(sel) > 0) {
-      end <- min(l - 1, max(sel))
-    } else {
-      end <- l - 1
-    }
-    return(list(start = start, end = end))
+  start <- compute_start_bound(x, sigma.ratio.lim[1])
+  end <- compute_end_bound(x, sigma.ratio.lim[2])
+  return(list(start = start, end = end))
 }
 
+#' Compute the difference between neighbouring elements of a vector and optionally apply a mask such that the maximum difference is no higher than 4-fold the minimum difference.
+#' @param x - float - a vector of numerical values.
+#' @param apply_mask - boolean - whether to apply the threshold mask to the output vector.
+#' @return Returns a vector of numeric differences between neighbouring values.
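# A small worked example of the bound computation above (the cumulative intensities and
# sigma.ratio.lim values are made up). The returned index window restricts where
# bigauss.esti searches for the peak centre.
y.cum <- c(1, 3, 6, 10, 15)
compute_bounds(y.cum, sigma.ratio.lim = c(0.3, 3))
# list(start = 3, end = 4)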
#' @export -bigauss.esti <- function(x, y, power = 1, do.plot = FALSE, truth = NA, sigma.ratio.lim = c(0.3, 3)) { - sel <- which(y > 1e-10) - if (length(sel) < 2) { - to.return <- c(median(x), 1, 1, 0) - } else { - x <- x[sel] - y <- y[sel] - # sel<-order(x) - # y<-y[sel] - # x<-x[sel] - - y.0 <- y - if (do.plot) plot(x, y) - if (do.plot & !is.na(truth[1])) { - true.y1 <- dnorm(x[x < truth[1]], mean = truth[1], sd = truth[2]) * truth[2] * truth[4] - true.y2 <- dnorm(x[x >= truth[1]], mean = truth[1], sd = truth[3]) * truth[3] * truth[4] - lines(x, c(true.y1, true.y2), col = "green") - } - max.y.0 <- max(y.0, na.rm = TRUE) - y <- (y / max.y.0)^power - - l <- length(x) - min.d <- min(diff(x)) - dx <- c(x[2] - x[1], (x[3:l] - x[1:(l - 2)]) / 2, x[l] - x[l - 1]) - if (l == 2) dx <- rep(diff(x), 2) - dx[dx > 4 * min.d] <- 4 * min.d - - y.cum <- cumsum(y * dx) - x.y.cum <- cumsum(y * x * dx) - xsqr.y.cum <- cumsum(y * x^2 * dx) - - y.cum.rev <- rev_cum_sum(y * dx) - x.y.cum.rev <- rev_cum_sum(x * y * dx) - xsqr.y.cum.rev <- rev_cum_sum(y * x^2 * dx) - - bounds <- compute_bounds(y.cum, sigma.ratio.lim) - end <- bounds$end - start <- bounds$start - - if (end <= start) { - m <- min(mean(x[start:end]), x[max(which(y.cum.rev > 0))]) - } else { - m.candi <- x[start:end] + diff(x[start:(end + 1)]) / 2 - rec <- matrix(0, ncol = 3, nrow = end - start + 1) - - s1 <- sqrt((xsqr.y.cum[start:end] + m.candi^2 * y.cum[start:end] - 2 * m.candi * x.y.cum[start:end]) / y.cum[start:end]) - s2 <- sqrt((xsqr.y.cum.rev[start:end + 1] + m.candi^2 * y.cum.rev[start:end + 1] - 2 * m.candi * x.y.cum.rev[start:end + 1]) / y.cum.rev[start:end + 1]) - rec[, 1] <- s1 - rec[, 2] <- s2 - rec[, 3] <- y.cum[start:end] / y.cum.rev[start:end + 1] - - d <- log(rec[, 1] / rec[, 2]) - log(rec[, 3]) - if (min(d, na.rm = TRUE) * max(d, na.rm = TRUE) < 0) { - sel <- c(which(d == max(d[d < 0]))[1], which(d == min(d[d >= 0]))) - m <- (sum(abs(d[sel]) * m.candi[sel])) / (sum(abs(d[sel]))) - } else { - d <- abs(d) - m <- m.candi[which(d == min(d, na.rm = TRUE))[1]] - } - } +compute_dx <- function(x, apply_mask=TRUE) { + l <- length(x) + diff_x <- diff(x) + if (l == 2) { + dx <- rep(diff_x, 2) + } else { + dx <- c( + x[2] - x[1], + diff(x, lag = 2) / 2, + x[l] - x[l - 1] + ) + } + if (apply_mask) { + diff_threshold <- min(diff_x) * 4 + dx <- pmin(dx, diff_threshold) + } + return (dx) +} - if (do.plot) abline(v = m) +#' Find base.curve RTs that lay within RT range of the whole feature table and append the intensities to these RTs. +#' @param profile Profile table with shape number-of-features*4. The table contains following columns: +#' \itemize{ +#' \item mz - float - mass-to-charge ratio of feature +#' \item rt - float - retention time of features +#' \item intensity - float - intensity of features +#' \item group_number - integer - group number assigned to each feature based on their rt similarity +#' } +#' @param base.curve Matrix that contains rts of feature in the same rt cluster. 
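# A small worked example of compute_dx() as documented above (made-up retention times):
# the optional mask caps every step at four times the smallest gap.
compute_dx(c(1, 2, 3, 10))                      # 1 1 4 4
compute_dx(c(1, 2, 3, 10), apply_mask = FALSE)  # 1 1 4 7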
+#' @return dataframe with two columns +#' @export +compute_chromatographic_profile <- function(profile, base.curve) { + rt_range <- range(profile[, "rt"]) + chr_profile <- base.curve[between(base.curve[, "base.curve"], min(rt_range), max(rt_range)), ] + chr_profile[chr_profile[, "base.curve"] %in% profile[, "rt"], 2] <- profile[, "intensity"] + colnames(chr_profile)[2] <- "intensity" - sel1 <- which(x < m) - sel2 <- which(x >= m) - s1 <- sqrt(sum((x[sel1] - m)^2 * y[sel1] * dx[sel1]) / sum(y[sel1] * dx[sel1])) - s2 <- sqrt(sum((x[sel2] - m)^2 * y[sel2] * dx[sel2]) / sum(y[sel2] * dx[sel2])) + return (chr_profile) +} +#' Estimate total signal strength (total area of the estimated normal curve). +#' @param y - float - a vector of intensities. +#' @param d - float - a vector of \emph{y} values in a gaussian curve. +#' @param scale - float - a vector of scaled intensity values. +#' @export +compute_scale <- function(y, d) { + dy_ratio <- d^2 * log(y / d) + dy_ratio[is.na(dy_ratio)] <- 0 + dy_ratio[is.infinite(dy_ratio)] <- 0 - if (power != 1) { - s1 <- s1 * sqrt(power) - s2 <- s2 * sqrt(power) - } + scale <- exp(sum(dy_ratio) / sum(d^2)) + return (scale) +} + +#' Estimate the parameters of Bi-Gaussian curve. +#' @param x Vector of RTs that lay in the same RT cluster. +#' @param y Intensities that belong to x. +#' @param power The power parameter for data transformation when fitting the bi-Gaussian or Gaussian mixture model in an EIC. +#' @param sigma.ratio.lim A vector of two. It enforces the belief of the range of the ratio between the left-standard deviation +#' and the right-standard deviation of the bi-Gaussian function used to fit the data. +#' @return A vector with length 4. The items are as follows going from first to last: +#' \itemize{ +#' \item mean of gaussian curve +#' \item standard deviation at the left side of the gaussian curve +#' \item standard deviation at the right side of the gaussian curve +#' \item estimated total signal strength (total area of the estimated normal curve) +#'} +#' @export +bigauss.esti <- function(x, y, power = 1, do.plot = FALSE, sigma.ratio.lim = c(0.3, 3)) { + # even producing a dataframe with x and y as columns without actually using it causes the test to run forever + sel <- which(y > 1e-10) + if (length(sel) < 2) { + return (c(median(x), 1, 1, 0)) + } else { + x <- x[sel] + y <- y[sel] - d1 <- dnorm(x[sel1], sd = s1, mean = m) - d2 <- dnorm(x[sel2], sd = s2, mean = m) - d <- c(d1 * s1, d2 * s2) # notice this "density" doesnt integrate to 1. 
Rather it integrates to (s1+s2)/2 - y <- y.0 + y.0 <- y + if (do.plot) { + plot(x, y) + } + max.y.0 <- max(y.0, na.rm = TRUE) + y <- (y / max.y.0)^power - dy.ratio <- d^2 * log(y / d) - dy.ratio[is.na(dy.ratio)] <- 0 - dy.ratio[dy.ratio == -Inf] <- 0 - dy.ratio[dy.ratio == Inf] <- 0 + dx <- compute_dx(x) + y.cum <- cumsum(y * dx) + x.y.cum <- cumsum(y * x * dx) + xsqr.y.cum <- cumsum(y * x^2 * dx) - scale <- exp(sum(dy.ratio) / sum(d^2)) + y.cum.rev <- rev_cum_sum(y * dx) + x.y.cum.rev <- rev_cum_sum(x * y * dx) + xsqr.y.cum.rev <- rev_cum_sum(y * x^2 * dx) - if (do.plot) { - fit.1 <- d * scale - lines(x[y > 0], fit.1, col = "red") - } + bounds <- compute_bounds(y.cum, sigma.ratio.lim) + end <- bounds$end + start <- bounds$start - to.return <- c(m, s1, s2, scale) - if (sum(is.na(to.return)) > 0) { - m <- sum(x * y) / sum(y) - s1 <- s2 <- sum(y * (x - m)^2) / sum(y) - scale <- sum(y) / s1 - to.return <- c(m, s1, s2, scale) - } + if (end <= start) { + m <- min(mean(x[start:end]), x[max(which(y.cum.rev > 0))]) + } else { + m.candi <- x[start:end] + diff(x[start:(end + 1)]) / 2 + rec <- matrix(0, ncol = 3, nrow = end - start + 1) + + s1 <- sqrt((xsqr.y.cum[start:end] + m.candi^2 * y.cum[start:end] - 2 * m.candi * x.y.cum[start:end]) / y.cum[start:end]) + s2 <- sqrt((xsqr.y.cum.rev[start:end + 1] + m.candi^2 * y.cum.rev[start:end + 1] - 2 * m.candi * x.y.cum.rev[start:end + 1]) / y.cum.rev[start:end + 1]) + rec[, 1] <- s1 + rec[, 2] <- s2 + rec[, 3] <- y.cum[start:end] / y.cum.rev[start:end + 1] + + d <- log(rec[, 1] / rec[, 2]) - log(rec[, 3]) + if (min(d, na.rm = TRUE) * max(d, na.rm = TRUE) < 0) { + sel <- c(which(d == max(d[d < 0]))[1], which(d == min(d[d >= 0]))) + m <- (sum(abs(d[sel]) * m.candi[sel])) / (sum(abs(d[sel]))) + } else { + d <- abs(d) + m <- m.candi[which(d == min(d, na.rm = TRUE))[1]] + } } - return(to.return) -} -#' @export -bigauss.mix <- function(x, y, power = 1, do.plot = FALSE, sigma.ratio.lim = c(0.1, 10), bw = c(15, 30, 60), eliminate = .05, max.iter = 25, estim.method, BIC.factor = 2) { - all.bw <- bw[order(bw)] + if (do.plot) { + abline(v = m) + } - x.0 <- x - y.0 <- y + sel1 <- which(x < m) + sel2 <- which(x >= m) + s1 <- sqrt(sum((x[sel1] - m)^2 * y[sel1] * dx[sel1]) / sum(y[sel1] * dx[sel1])) + s2 <- sqrt(sum((x[sel2] - m)^2 * y[sel2] * dx[sel2]) / sum(y[sel2] * dx[sel2])) - sel <- y > 1e-5 - x <- x[sel] - y <- y[sel] - sel <- order(x) - y <- y[sel] - x <- x[sel] - results <- new("list") - smoother.pk.rec <- smoother.vly.rec <- new("list") - bic.rec <- all.bw + s1 <- s1 * sqrt(power) + s2 <- s2 * sqrt(power) + + d1 <- dnorm(x[sel1], sd = s1, mean = m) + d2 <- dnorm(x[sel2], sd = s2, mean = m) + d <- c(d1 * s1, d2 * s2) # notice this "density" doesnt integrate to 1. Rather it integrates to (s1+s2)/2 + y <- y.0 + + scale <- compute_scale(y, d) if (do.plot) { - par(mfrow = c(ceiling(length(all.bw) / 2), 2)) - par(mar = c(1, 1, 1, 1)) + lines(x[y > 0], d * scale, col = "red") } - last.num.pks <- Inf + to.return <- c(m, s1, s2, scale) + if (sum(is.na(to.return)) > 0) { + m <- sum(x * y) / sum(y) + s1 <- s2 <- sum(y * (x - m)^2) / sum(y) + scale <- sum(y) / s1 + to.return <- c(m, s1, s2, scale) + } + } + return(to.return) +} - for (bw.n in length(all.bw):1) +#' @param chr_profile A matrix with two columns: "base.curve" (rt) and "intensity". +#' @param vlys A vector of sorted RT-valley values at which the kernel estimate was computed. +#' @param dx Difference between neighbouring RT values with step 2. 
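# A small worked example of compute_scale() defined above (values are made up): if the
# observed intensities are exactly twice the unit-scale peak shape d, the estimated scale is 2.
d <- dnorm(c(9, 10, 11), mean = 10, sd = 1)
compute_scale(y = 2 * d, d = d)
# 2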
+#' @param pks A vector of sorted RT-peak values at which the kernel estimate was computed. +#' @export +compute_initiation_params <- function(chr_profile, vlys, dx, pks) { + m <- s1 <- s2 <- delta <- pks + for (i in 1:length(m)) + { + sel.1 <- which(chr_profile[, "base.curve"] >= max(vlys[vlys < m[i]]) & chr_profile[, "base.curve"] < m[i]) + s1[i] <- sqrt(sum((chr_profile[sel.1, "base.curve"] - m[i])^2 * chr_profile[sel.1, "intensity"] * dx[sel.1]) / sum(chr_profile[sel.1, "intensity"] * dx[sel.1])) + + sel.2 <- which(chr_profile[, "base.curve"] >= m[i] & chr_profile[, "base.curve"] < min(vlys[vlys > m[i]])) + s2[i] <- sqrt(sum((chr_profile[sel.2, "base.curve"] - m[i])^2 * chr_profile[sel.2, "intensity"] * dx[sel.2]) / sum(chr_profile[sel.2, "intensity"] * dx[sel.2])) + + delta[i] <- (sum(chr_profile[sel.1, "intensity"] * dx[sel.1]) + sum(chr_profile[sel.2, "intensity"] * dx[sel.2])) / ((sum(dnorm(chr_profile[sel.1, "base.curve"], mean = m[i], sd = s1[i])) * s1[i] / 2) + (sum(dnorm(chr_profile[sel.2, "base.curve"], mean = m[i], sd = s2[i])) * s2[i] / 2)) + } + return (list(s1 = s1, + s2 = s2, + delta = delta)) +} + +#' @param m A vector of sorted RT-peak values at which the kernel estimate was computed. +#' @param chr_profile A matrix with two columns: "base.curve" (rt) and "intensity". +#' @param delta Parameter computed by the initiation step. +#' @param s1 Parameter computed by the initiation step. +#' @param s2 Parameter computed by the initiation step. +#' @export +compute_e_step <- function(m, chr_profile, delta, s1, s2) { + fit <- matrix(0, ncol = length(m), nrow = length(chr_profile[, "base.curve"])) # this is the matrix of fitted values + cuts <- c(-Inf, m, Inf) + for (j in 2:length(cuts)) + { + sel <- which(chr_profile[, "base.curve"] >= cuts[j - 1] & chr_profile[, "base.curve"] < cuts[j]) + use.s1 <- which(1:length(m) >= (j - 1)) + s.to.use <- s2 + s.to.use[use.s1] <- s1[use.s1] + for (i in 1:ncol(fit)) { - bw <- all.bw[bw.n] - this.smooth <- ksmooth(x.0, y.0, kernel = "normal", bandwidth = bw) - turns <- find.turn.point(this.smooth$y) - pks <- this.smooth$x[turns$pks] - vlys <- c(-Inf, this.smooth$x[turns$vlys], Inf) - - smoother.pk.rec[[bw.n]] <- pks - smoother.vly.rec[[bw.n]] <- vlys - if (length(pks) != last.num.pks) { - last.num.pks <- length(pks) - l <- length(x) - dx <- c(x[2] - x[1], (x[3:l] - x[1:(l - 2)]) / 2, x[l] - x[l - 1]) - if (l == 2) dx <- rep(diff(x), 2) - - # initiation - m <- s1 <- s2 <- delta <- pks - for (i in 1:length(m)) - { - sel.1 <- which(x >= max(vlys[vlys < m[i]]) & x < m[i]) - s1[i] <- sqrt(sum((x[sel.1] - m[i])^2 * y[sel.1] * dx[sel.1]) / sum(y[sel.1] * dx[sel.1])) - - sel.2 <- which(x >= m[i] & x < min(vlys[vlys > m[i]])) - s2[i] <- sqrt(sum((x[sel.2] - m[i])^2 * y[sel.2] * dx[sel.2]) / sum(y[sel.2] * dx[sel.2])) - - delta[i] <- (sum(y[sel.1] * dx[sel.1]) + sum(y[sel.2] * dx[sel.2])) / ((sum(dnorm(x[sel.1], mean = m[i], sd = s1[i])) * s1[i] / 2) + (sum(dnorm(x[sel.2], mean = m[i], sd = s2[i])) * s2[i] / 2)) - } - delta[is.na(delta)] <- 1e-10 - s1[is.na(s1)] <- 1e-10 - s2[is.na(s2)] <- 1e-10 - - - fit <- matrix(0, ncol = length(m), nrow = length(x)) # this is the matrix of fitted values - - this.change <- Inf - counter <- 0 - - while (this.change > 0.1 & counter <= max.iter) { - counter <- counter + 1 - old.m <- m - - # E step - cuts <- c(-Inf, m, Inf) - for (j in 2:length(cuts)) - { - sel <- which(x >= cuts[j - 1] & x < cuts[j]) - use.s1 <- which(1:length(m) >= (j - 1)) - s.to.use <- s2 - s.to.use[use.s1] <- s1[use.s1] - for (i in 1:ncol(fit)) - 
{ - fit[sel, i] <- dnorm(x[sel], mean = m[i], sd = s.to.use[i]) * s.to.use[i] * delta[i] - } - } - fit[is.na(fit)] <- 0 - sum.fit <- apply(fit, 1, sum) - - # Elimination step - fit <- fit / sum.fit - fit2 <- fit * y - perc.explained <- apply(fit2, 2, sum) / sum(y) - max.erase <- max(1, round(length(perc.explained) / 5)) - - to.erase <- which(perc.explained <= min(eliminate, perc.explained[order(perc.explained, na.last = FALSE)[max.erase]])) - - - if (length(to.erase) > 0) { - m <- m[-to.erase] - s1 <- s1[-to.erase] - s2 <- s2[-to.erase] - delta <- delta[-to.erase] - fit <- fit[, -to.erase] - if (is.null(ncol(fit))) fit <- matrix(fit, ncol = 1) - sum.fit <- apply(fit, 1, sum) - fit <- fit / sum.fit - old.m <- old.m[-to.erase] - } - - # M step - for (i in 1:length(m)) - { - this.y <- y * fit[, i] - if (estim.method == "moment") { - this.fit <- bigauss.esti(x, this.y, power = power, do.plot = FALSE, sigma.ratio.lim = sigma.ratio.lim) - } else { - this.fit <- bigauss.esti.EM(x, this.y, power = power, do.plot = FALSE, sigma.ratio.lim = sigma.ratio.lim) - } - m[i] <- this.fit[1] - s1[i] <- this.fit[2] - s2[i] <- this.fit[3] - delta[i] <- this.fit[4] - } - delta[is.na(delta)] <- 0 - - # amount of change - this.change <- sum((old.m - m)^2) - } - cuts <- c(-Inf, m, Inf) - fit <- fit * 0 - for (j in 2:length(cuts)) - { - sel <- which(x >= cuts[j - 1] & x < cuts[j]) - use.s1 <- which(1:length(m) >= (j - 1)) - s.to.use <- s2 - s.to.use[use.s1] <- s1[use.s1] - for (i in 1:ncol(fit)) - { - if (s.to.use[i] != 0) { - fit[sel, i] <- dnorm(x[sel], mean = m[i], sd = s.to.use[i]) * s.to.use[i] * delta[i] - } - } - } - - if (do.plot) { - plot(x, y, cex = .1, main = paste("bw=", bw)) - sum.fit <- apply(fit, 1, sum) - lines(x, sum.fit) - abline(v = m) - cols <- c("red", "green", "blue", "cyan", "brown", "black", rep("grey", 100)) - for (i in 1:length(m)) - { - lines(x, fit[, i], col = cols[i]) - } - } - area <- delta * (s1 + s2) / 2 - rss <- sum((y - apply(fit, 1, sum))^2) - l <- length(x) - bic <- l * log(rss / l) + 4 * length(m) * log(l) * BIC.factor - results[[bw.n]] <- cbind(m, s1, s2, delta, area) - bic.rec[bw.n] <- bic - } else { - results[[bw.n]] <- NA - bic.rec[bw.n] <- Inf - results[[bw.n]] <- results[[bw.n + 1]] - } + fit[sel, i] <- dnorm(chr_profile[sel, "base.curve"], mean = m[i], sd = s.to.use[i]) * s.to.use[i] * delta[i] } - sel <- which(bic.rec == min(bic.rec, na.rm = TRUE)) - if (length(sel) > 1) sel <- sel[which(all.bw[sel] == max(all.bw[sel]))] - rec <- new("list") - rec$param <- results[[sel]] - rec$smoother.pks <- smoother.pk.rec - rec$smoother.vlys <- smoother.vly.rec - rec$all.param <- results - rec$bic <- bic.rec - return(rec) + } + fit[is.na(fit)] <- 0 + sum.fit <- apply(fit, 1, sum) + return(list(fit = fit, sum.fit = sum.fit)) } +#' @param chr_profile Dataframe that stores RTs and intensities of features. +#' @param power The power parameter for data transformation when fitting the bi-Gaussian or Gaussian mixture model in an EIC. +#' @param sigma.ratio.lim A vector of two. It enforces the belief of the range of the ratio between the left-standard deviation +#' and the right-standard deviation of the bi-Gaussian function used to fit the data. +#' @param bw Bandwidth vector to use in the kernel smoother. +#' @param eliminate When a component accounts for a proportion of intensities less than this value, the component will be ignored. +#' @param max.iter Maximum number of iterations when executing the E step. +#' @param estim.method The estimation method for the bi-Gaussian peak model. 
Two possible values: moment and EM. +#' @param BIC.factor The factor that is multiplied on the number of parameters to modify the BIC criterion. If larger than 1, +#' models with more peaks are penalized more. +#' @importFrom dplyr filter arrange #' @export -normix <- function(that.curve, pks, vlys, ignore = 0.1, max.iter = 50, prob.cut = 1e-2) { - x <- that.curve[, 1] - y <- that.curve[, 2] - - if (length(pks) == 1) { - miu <- sum(x * y) / sum(y) - sigma <- sqrt(sum(y * (x - miu)^2) / sum(y)) - fitted <- dnorm(x, mean = miu, sd = sigma) - this.sel <- y > 0 & fitted / dnorm(miu, mean = miu, sd = sigma) > prob.cut - sc <- exp(sum(fitted[this.sel]^2 * log(y[this.sel] / fitted[this.sel]) / sum(fitted[this.sel]^2))) - } else { - pks <- pks[order(pks)] - vlys <- vlys[order(vlys)] - l <- length(pks) - miu <- sigma <- sc <- pks - w <- matrix(0, nrow = l, ncol = length(x)) +bigauss.mix <- function(chr_profile, power = 1, do.plot = FALSE, sigma.ratio.lim = c(0.1, 10), bw = c(15, 30, 60), eliminate = .05, max.iter = 25, estim.method, BIC.factor = 2) { + all.bw <- sort(bw) + results <- new("list") + smoother.pk.rec <- smoother.vly.rec <- new("list") + bic.rec <- all.bw + + if (do.plot) { + par(mfrow = c(ceiling(length(all.bw) / 2), 2)) + par(mar = c(1, 1, 1, 1)) + } + + last.num.pks <- Inf + + chr_profile_unfiltered <- chr_profile + chr_profile <- data.frame(chr_profile) |> + filter(intensity > 1e-5) |> + arrange(base.curve) + + for (bw.n in length(all.bw):1) + { + bw <- all.bw[bw.n] + this.smooth <- ksmooth(chr_profile_unfiltered[, "base.curve"], chr_profile_unfiltered[, "intensity"], kernel = "normal", bandwidth = bw) + turns <- find.turn.point(this.smooth$y) + pks <- this.smooth$x[turns$pks] + vlys <- c(-Inf, this.smooth$x[turns$vlys], Inf) + + smoother.pk.rec[[bw.n]] <- pks + smoother.vly.rec[[bw.n]] <- vlys + if (length(pks) != last.num.pks) { + last.num.pks <- length(pks) + l <- length(chr_profile[, "base.curve"]) + dx <- compute_dx(chr_profile[, "base.curve"], apply_mask = FALSE) + + # initiation + initiation_params <- compute_initiation_params(chr_profile, vlys, dx, pks) + s1 <- initiation_params$s1 + s2 <- initiation_params$s2 + delta <- initiation_params$delta + + delta[is.na(delta)] <- 1e-10 + s1[is.na(s1)] <- 1e-10 + s2[is.na(s2)] <- 1e-10 + + this.change <- Inf + counter <- 0 + + m <- pks + while (this.change > 0.1 & counter <= max.iter) { + counter <- counter + 1 + old.m <- m + + # E step + fits <- compute_e_step(m, chr_profile, delta, s1, s2) + fit <- fits$fit + sum.fit <- fits$sum.fit + + # Elimination step + fit <- fit / sum.fit + fit2 <- fit * chr_profile[, "intensity"] + perc.explained <- apply(fit2, 2, sum) / sum(chr_profile[, "intensity"]) + max.erase <- max(1, round(length(perc.explained) / 5)) + + to.erase <- which(perc.explained <= min(eliminate, perc.explained[order(perc.explained, na.last = FALSE)[max.erase]])) - for (m in 1:l) - { - this.low <- max(vlys[vlys <= pks[m]]) - this.high <- min(vlys[vlys >= pks[m]]) - this.x <- x[x >= this.low & x <= this.high] - this.y <- y[x >= this.low & x <= this.high] - - miu[m] <- sum(this.x * this.y) / sum(this.y) - # if(sum(this.y>0) > 1) - # { - sigma[m] <- sqrt(sum(this.y * (this.x - miu[m])^2) / sum(this.y)) - # }else{ - # sigma[m]<-mean(diff(this.x))/2 - # } - fitted <- dnorm(this.x, mean = miu[m], sd = sigma[m]) - this.sel <- this.y > 0 & fitted / dnorm(miu[m], mean = miu[m], sd = sigma[m]) > prob.cut - sc[m] <- exp(sum(fitted[this.sel]^2 * log(this.y[this.sel] / fitted[this.sel]) / sum(fitted[this.sel]^2))) - # 
sc[m]<-lm(this.y[this.sel]~fitted[this.sel]+0)$coef - } - to.erase <- which(is.na(miu) | is.na(sigma) | sigma == 0 | is.na(sc)) if (length(to.erase) > 0) { - l <- l - length(to.erase) - miu <- miu[-to.erase] - sigma <- sigma[-to.erase] - sc <- sc[-to.erase] - w <- w[-to.erase, ] + m <- m[-to.erase] + s1 <- s1[-to.erase] + s2 <- s2[-to.erase] + delta <- delta[-to.erase] + fit <- fit[, -to.erase] + if (is.null(ncol(fit))) { + fit <- matrix(fit, ncol = 1) + } + sum.fit <- apply(fit, 1, sum) + fit <- fit / sum.fit + old.m <- old.m[-to.erase] } - direc <- 1 - diff <- 1000 - iter <- 0 - - while (diff > 0.05 & iter < max.iter) { - iter <- iter + 1 - if (l == 1) { - miu <- sum(x * y) / sum(y) - sigma <- sqrt(sum(y * (x - miu)^2) / sum(y)) - fitted <- dnorm(x, mean = miu, sd = sigma) - this.sel <- y > 0 & fitted / dnorm(miu, mean = miu, sd = sigma) > prob.cut - sc <- exp(sum(fitted[this.sel]^2 * log(y[this.sel] / fitted[this.sel]) / sum(fitted[this.sel]^2))) - # sc<-lm(y[this.sel]~fitted[this.sel]+0)$coef - break - } - miu.0 <- miu - sigma.0 <- sigma - sc.0 <- sc - - all.w <- y * 0 - for (m in 1:l) - { - all.w <- all.w + dnorm(x, mean = miu[m], sd = sigma[m]) * sc[m] - } - - for (m in 1:l) - { - w[m, ] <- dnorm(x, mean = miu[m], sd = sigma[m]) * sc[m] / all.w - } - - if (sum(is.na(w)) > 0) break - - for (m in 1:l) - { - this.y <- y * w[m, ] - miu[m] <- sum(x * this.y) / sum(this.y) - sigma[m] <- sqrt(sum(this.y * (x - miu[m])^2) / sum(this.y)) - fitted <- dnorm(x, mean = miu[m], sd = sigma[m]) - this.sel <- this.y > 0 & fitted / dnorm(miu[m], mean = miu[m], sd = sigma[m]) > prob.cut - sc[m] <- exp(sum(fitted[this.sel]^2 * log(this.y[this.sel] / fitted[this.sel]) / sum(fitted[this.sel]^2))) - # sc[m]<-lm(this.y[this.sel]~fitted[this.sel]+0)$coef - } - diff <- sum((miu.0 - miu)^2) - - www <- w - for (m in 1:l) - { - www[m, ] <- www[m, ] * y - } - www <- apply(www, 1, sum) - www[which(is.na(sc))] <- 0 - www <- www / sum(www) - max.erase <- max(1, round(l / 5)) - - to.erase <- which(www <= min(ignore, www[order(www, na.last = FALSE)[max.erase]])) - - if (length(to.erase) > 0) { - l <- l - length(to.erase) - miu <- miu[-to.erase] - sigma <- sigma[-to.erase] - sc <- sc[-to.erase] - w <- w[-to.erase, ] - diff <- 1000 - } + # M step + for (i in 1:length(m)) + { + this.y <- chr_profile[, "intensity"] * fit[, i] + if (estim.method == "moment") { + this.fit <- bigauss.esti(chr_profile[, "base.curve"], this.y, power = power, do.plot = FALSE, sigma.ratio.lim = sigma.ratio.lim) + } else { + this.fit <- bigauss.esti.EM(chr_profile[, "base.curve"], this.y, power = power, do.plot = FALSE, sigma.ratio.lim = sigma.ratio.lim) + } + m[i] <- this.fit[1] + s1[i] <- this.fit[2] + s2[i] <- this.fit[3] + delta[i] <- this.fit[4] } - } - l <- length(miu) - if (l == 1) { - rec <- matrix(c(miu, sigma, sc), nrow = 1) + delta[is.na(delta)] <- 0 + + # amount of change + this.change <- sum((old.m - m)^2) + } + cuts <- c(-Inf, m, Inf) + fit <- fit * 0 + for (j in 2:length(cuts)) + { + sel <- which(chr_profile[, "base.curve"] >= cuts[j - 1] & chr_profile[, "base.curve"] < cuts[j]) + use.s1 <- which(1:length(m) >= (j - 1)) + s.to.use <- s2 + s.to.use[use.s1] <- s1[use.s1] + for (i in 1:ncol(fit)) + { + if (s.to.use[i] != 0) { + fit[sel, i] <- dnorm(chr_profile[sel, "base.curve"], mean = m[i], sd = s.to.use[i]) * s.to.use[i] * delta[i] + } + } + } + + if (do.plot) { + plot_chr_profile(chr_profile, bw, fit, m) + } + area <- delta * (s1 + s2) / 2 + rss <- sum((chr_profile[, "intensity"] - apply(fit, 1, sum))^2) + l <- 
length(chr_profile[, "base.curve"]) + bic <- l * log(rss / l) + 4 * length(m) * log(l) * BIC.factor + results[[bw.n]] <- cbind(m, s1, s2, delta, area) + bic.rec[bw.n] <- bic } else { - rec <- cbind(miu, sigma, sc) + results[[bw.n]] <- NA + bic.rec[bw.n] <- Inf + results[[bw.n]] <- results[[bw.n + 1]] } - colnames(rec) <- c("miu", "sigma", "scale") - return(rec) + } + sel <- which(bic.rec == min(bic.rec, na.rm = TRUE)) + if (length(sel) > 1) { + sel <- sel[which(all.bw[sel] == max(all.bw[sel]))] + } + rec <- new("list") + rec$param <- results[[sel]] + rec$smoother.pks <- smoother.pk.rec + rec$smoother.vlys <- smoother.vly.rec + rec$all.param <- results + rec$bic <- bic.rec + return(rec) } +#' Reevaluate parameters of chromatographic gaussian curves. +#' @param that.curve Dataframe that stores RTs and intensities of features. +#' @param pks A vector of sorted RT-peak values at which the kernel estimate was computed. +#' @param vlys A vector of sorted RT-valley values at which the kernel estimate was computed. +#' @param ignore In fitting mixture of bi-Gaussian (or Gaussian) model of an EIC, when a component accounts for a +#' proportion of intensities less than this value, the component will be ignored. +#' @param max.iter Maximum number of iterations when reevaluating gaussian curves. +#' @param aver_diff Average retention time difference across RTs of all features. #' @export -normix.bic <- function(x, y, power = 2, do.plot = FALSE, bw = c(15, 30, 60), eliminate = .05, max.iter = 50, BIC.factor = 2) { - all.bw <- bw[order(bw)] - sel <- y > 1e-5 - x <- x[sel] - y <- y[sel] - sel <- order(x) - y <- y[sel] - x <- x[sel] - results <- new("list") - smoother.pk.rec <- smoother.vly.rec <- new("list") - bic.rec <- all.bw - - if (do.plot) { - par(mfrow = c(ceiling(length(all.bw) / 2), 2)) - par(mar = c(1, 1, 1, 1)) +normix <- function(that.curve, pks, vlys, ignore = 0.1, max.iter = 50, aver_diff) { + x <- that.curve[, 1] + y <- that.curve[, 2] + rt_int_list <- list(labels = x, intensities = y) + + if (length(pks) == 1) { + mu_sc_std <- compute_mu_sc_std(rt_int_list, aver_diff) + miu <- mu_sc_std$label + sc <- mu_sc_std$intensity + sigma <- mu_sc_std$sigma + } else { + pks <- sort(pks) + vlys <- sort(vlys) + l <- length(pks) + miu <- sigma <- sc <- pks + w <- matrix(0, nrow = l, ncol = length(x)) + + for (m in 1:l) + { + # this pattern occurs multiple times in other scripts + this.low <- max(vlys[vlys <= pks[m]]) + this.high <- min(vlys[vlys >= pks[m]]) + + this.x <- x[x >= this.low & x <= this.high] + this.y <- y[x >= this.low & x <= this.high] + + if (length(this.x) == 0 | length(this.y) == 0) { + miu[m] <- NaN + sigma[m] <- NaN + sc[m] <- 1 + } else { + rt_int_list_this <- list(labels = this.x, intensities = this.y) + mu_sc_std <- compute_mu_sc_std(rt_int_list_this, aver_diff) + miu[m] <- mu_sc_std$label + sc[m] <- mu_sc_std$intensity + sigma[m] <- mu_sc_std$sigma + } } - last.num.pks <- Inf + to.erase <- which(is.na(miu) | is.na(sigma) | sigma == 0 | is.na(sc)) + if (length(to.erase) > 0) { + l <- l - length(to.erase) + miu <- miu[-to.erase] + sigma <- sigma[-to.erase] + sc <- sc[-to.erase] + w <- w[-to.erase, ] + } - for (bw.n in length(all.bw):1) - { - bw <- all.bw[bw.n] - this.smooth <- ksmooth(x, y, kernel = "normal", bandwidth = bw) - turns <- find.turn.point(this.smooth$y) - pks <- this.smooth$x[turns$pks] - vlys <- c(-Inf, this.smooth$x[turns$vlys], Inf) - - smoother.pk.rec[[bw.n]] <- pks - smoother.vly.rec[[bw.n]] <- vlys - if (length(pks) != last.num.pks) { - last.num.pks <- length(pks) 
- aaa <- normix(cbind(x, y), pks = pks, vlys = vlys, ignore = eliminate, max.iter = max.iter) - - total.fit <- x * 0 - for (i in 1:nrow(aaa)) - { - total.fit <- total.fit + dnorm(x, mean = aaa[i, 1], sd = aaa[i, 2]) * aaa[i, 3] - } - - if (do.plot) { - plot(x, y, cex = .1, main = paste("bw=", bw)) - abline(v = aaa[, 1]) - cols <- c("red", "green", "blue", "cyan", "brown", "black", rep("grey", 100)) - for (i in 1:nrow(aaa)) - { - lines(x, dnorm(x, mean = aaa[i, 1], sd = aaa[i, 2]) * aaa[i, 3], col = cols[i]) - } - } - - rss <- sum((y - total.fit)^2) - l <- length(x) - bic <- l * log(rss / l) + 3 * nrow(aaa) * log(l) * BIC.factor - results[[bw.n]] <- aaa - bic.rec[bw.n] <- bic - } else { - bic.rec[bw.n] <- Inf - results[[bw.n]] <- results[[bw.n + 1]] + direc <- 1 + diff <- 1000 + iter <- 0 + + while (diff > 0.05 & iter < max.iter) { + iter <- iter + 1 + if (l == 1) { + mu_sc_std <- compute_mu_sc_std(rt_int_list, aver_diff) + miu <- mu_sc_std$label + sc <- mu_sc_std$intensity + sigma <- mu_sc_std$sigma + break + } + miu.0 <- miu + sigma.0 <- sigma + sc.0 <- sc + + all.w <- y * 0 + for (m in 1:l) + { + all.w <- all.w + dnorm(x, mean = miu[m], sd = sigma[m]) * sc[m] + } + + # when l is zero the iteration goes from 1 to 0 znd results in "index out of bound" error + for (m in 1:l) + { + w[m, ] <- dnorm(x, mean = miu[m], sd = sigma[m]) * sc[m] / all.w + } + + if (sum(is.na(w)) > 0) { + break + } + + for (m in 1:l) + { + this.y <- y * w[m, ] + rt_int_list_this <- list(labels = x, intensities = this.y) + mu_sc_std <- compute_mu_sc_std(rt_int_list_this, aver_diff) + miu[m] <- mu_sc_std$label + sc[m] <- mu_sc_std$intensity + sigma[m] <- mu_sc_std$sigma + + if (sigma[m] == 0) { + sc[m] <- NA } + } + diff <- sum((miu.0 - miu)^2) + + www <- w + for (m in 1:l) + { + www[m, ] <- www[m, ] * y + } + www <- apply(www, 1, sum) + www[which(is.na(sc))] <- 0 + www <- www / sum(www) + max.erase <- max(1, round(l / 5)) + + to.erase <- which(www <= min(ignore, www[order(www, na.last = FALSE)[max.erase]])) + + if (length(to.erase) > 0) { + l <- l - length(to.erase) + miu <- miu[-to.erase] + sigma <- sigma[-to.erase] + sc <- sc[-to.erase] + w <- w[-to.erase, ] + diff <- 1000 + } + } + } + l <- length(miu) + if (l == 1) { + rec <- matrix(c(miu, sigma, sc), nrow = 1) + } else { + rec <- cbind(miu, sigma, sc) + } + colnames(rec) <- c("miu", "sigma", "scale") + return(rec) +} + +#' @param x Vector of RTs that lay in the same RT cluster. +#' @param y Intensities that belong to x. +#' @param power The power parameter for data transformation when fitting the bi-Gaussian or Gaussian mixture model in an EIC. +#' @param bw Bandwidth vector to use in the kernel smoother. +#' @param eliminate When a component accounts for a proportion of intensities less than this value, the component will be ignored. +#' @param max.iter Maximum number of iterations when executing the E step. +#' @param BIC.factor The factor that is multiplied on the number of parameters to modify the BIC criterion. If larger than 1, +#' @param aver_diff Average retention time difference across RTs of all features. 
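# Sketch of the BIC used by bigauss.mix() / normix.bic() to pick a bandwidth and number of
# peaks (all numbers below are made up). The candidate with the smallest value is kept; a
# BIC.factor larger than 1 penalizes models with more peaks more heavily.
l <- 200          # number of points in the EIC
rss <- 1.5        # residual sum of squares of the mixture fit
n_peaks <- 2
BIC.factor <- 2
l * log(rss / l) + 4 * n_peaks * log(l) * BIC.factor  # bi-Gaussian: 4 parameters per peak
l * log(rss / l) + 3 * n_peaks * log(l) * BIC.factor  # Gaussian: 3 parameters per peak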
+#' @export +normix.bic <- function(x, y, power = 2, do.plot = FALSE, bw = c(15, 30, 60), eliminate = .05, max.iter = 50, BIC.factor = 2, aver_diff) { + all.bw <- bw[order(bw)] + sel <- y > 1e-5 + x <- x[sel] + y <- y[sel] + sel <- order(x) + y <- y[sel] + x <- x[sel] + results <- new("list") + smoother.pk.rec <- smoother.vly.rec <- new("list") + bic.rec <- all.bw + + if (do.plot) { + par(mfrow = c(ceiling(length(all.bw) / 2), 2)) + par(mar = c(1, 1, 1, 1)) + } + + last.num.pks <- Inf + + for (bw.n in length(all.bw):1) + { + bw <- all.bw[bw.n] + this.smooth <- ksmooth(x, y, kernel = "normal", bandwidth = bw) + turns <- find.turn.point(this.smooth$y) + pks <- this.smooth$x[turns$pks] + vlys <- c(-Inf, this.smooth$x[turns$vlys], Inf) + + smoother.pk.rec[[bw.n]] <- pks + smoother.vly.rec[[bw.n]] <- vlys + if (length(pks) != last.num.pks) { + last.num.pks <- length(pks) + aaa <- normix(cbind(x, y), pks = pks, vlys = vlys, ignore = eliminate, max.iter = max.iter, aver_diff = aver_diff) + + total.fit <- x * 0 + for (i in 1:nrow(aaa)) + { + total.fit <- total.fit + dnorm(x, mean = aaa[i, 1], sd = aaa[i, 2]) * aaa[i, 3] + } + + if (do.plot) { + plot_normix_bic(x, y, bw, aaa) + } + + rss <- sum((y - total.fit)^2) + l <- length(x) + bic <- l * log(rss / l) + 3 * nrow(aaa) * log(l) * BIC.factor + results[[bw.n]] <- aaa + bic.rec[bw.n] <- bic + } else { + bic.rec[bw.n] <- Inf + results[[bw.n]] <- results[[bw.n + 1]] } - sel <- which(bic.rec == min(bic.rec)) - if (length(sel) > 1) sel <- sel[which(all.bw[sel] == max(all.bw[sel]))] - rec <- new("list") - rec$param <- results[[sel]] - rec$smoother.pks <- smoother.pk.rec - rec$smoother.vlys <- smoother.vly.rec - rec$all.param <- results - rec$bic <- bic.rec - return(rec) + } + sel <- which(bic.rec == min(bic.rec)) + if (length(sel) > 1) { + sel <- sel[which(all.bw[sel] == max(all.bw[sel]))] + } + rec <- new("list") + rec$param <- results[[sel]] + rec$smoother.pks <- smoother.pk.rec + rec$smoother.vlys <- smoother.vly.rec + rec$all.param <- results + rec$bic <- bic.rec + return(rec) } -#' Generate feature table from noise-removed LC/MS profile -#' +#' Generate feature table from noise-removed LC/MS profile. +#' #' @description -#' Each LC/MS profile is first processed by the function proc.cdf() to remove noise and reduce data size. A matrix containing m/z -#' value, retention time, intensity, and group number is output from proc.cdf(). This matrix is then fed to the function +#' Each LC/MS profile is first processed by the function proc.cdf() to remove noise and reduce data size. A matrix containing m/z +#' value, retention time, intensity, and group number is output from proc.cdf(). This matrix is then fed to the function #' prof.to.features() to generate a feature list. Every detected feature is summarized into a single row in the output matrix from this function. -#' -#' @param a The matrix output from proc.cdf(). It contains columns of m/z value, retention time, intensity and group number. -#' @param bandwidth A value between zero and one. Multiplying this value to the length of the signal along the time axis helps +#' +#' @param profile The matrix output from proc.cdf(). It contains columns of m/z value, retention time, intensity and group number. +#' @param bandwidth A value between zero and one. Multiplying this value to the length of the signal along the time axis helps #' determine the bandwidth in the kernel smoother used for peak identification. #' @param min.bw The minimum bandwidth to use in the kernel smoother. 
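# Sketch of the smoothing step that both normix.bic() and bigauss.mix() run for every
# candidate bandwidth (the EIC below is a made-up toy curve): kernel smoothing followed
# by turn-point detection yields the initial peak and valley positions.
x <- seq(1, 100)
y <- dnorm(x, mean = 30, sd = 5) + 0.7 * dnorm(x, mean = 70, sd = 8)
sm <- ksmooth(x, y, kernel = "normal", bandwidth = 15)
turns <- find.turn.point(sm$y)
sm$x[turns$pks]   # candidate peak retention times
sm$x[turns$vlys]  # candidate valley retention times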
#' @param max.bw The maximum bandwidth to use in the kernel smoother. #' @param sd.cut A vector of two. Features with standard deviation outside the range defined by the two numbers are eliminated. -#' @param sigma.ratio.lim A vector of two. It enforces the belief of the range of the ratio between the left-standard deviation +#' @param sigma.ratio.lim A vector of two. It enforces the belief of the range of the ratio between the left-standard deviation #' and the right-standard deviation of the bi-Gaussian function used to fit the data. -#' @param shape.model The mathematical model for the shape of a peak. There are two choices - "bi-Gaussian" and "Gaussian". +#' @param shape.model The mathematical model for the shape of a peak. There are two choices - "bi-Gaussian" and "Gaussian". #' When the peaks are asymmetric, the bi-Gaussian is better. The default is "bi-Gaussian". #' @param estim.method The estimation method for the bi-Gaussian peak model. Two possible values: moment and EM. #' @param do.plot Whether to generate diagnostic plots. #' @param power The power parameter for data transformation when fitting the bi-Gaussian or Gaussian mixture model in an EIC. -#' @param component.eliminate In fitting mixture of bi-Gaussian (or Gaussian) model of an EIC, when a component accounts for a +#' @param component.eliminate In fitting mixture of bi-Gaussian (or Gaussian) model of an EIC, when a component accounts for a #' proportion of intensities less than this value, the component will be ignored. -#' @param BIC.factor the factor that is multiplied on the number of parameters to modify the BIC criterion. If larger than 1, +#' @param BIC.factor the factor that is multiplied on the number of parameters to modify the BIC criterion. If larger than 1, #' models with more peaks are penalized more. -#' @return A matrix is returned. The columns are: m/z value, retention time, spread (standard deviation of the estimated normal +#' @return A matrix is returned. The columns are: m/z value, retention time, spread (standard deviation of the estimated normal #' curve), and estimated total signal strength (total area of the estimated normal curve). 
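# Sketch (with made-up numbers) of how the kernel bandwidth grid is built further down in
# prof.to.features() for a single EIC: the bandwidth fraction is applied to the retention
# time span and clipped into [min.bw, max.bw] before a small grid is expanded around it.
min.bw <- 5; max.bw <- 40; rt_span <- 100; bandwidth <- 0.5
bw <- min(max(bandwidth * rt_span, min.bw), max.bw)            # 40
bw <- seq(bw, 2 * bw, length.out = 3)                          # 40 60 80
if (bw[1] > 1.5 * min.bw) bw <- c(max(min.bw, bw[1] / 2), bw)  # 20 40 60 80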
#' @export #' @examples #' prof.to.features(extracted_features, sd.cut = sd_cut, sigma.ratio.lim = sigma_ratio_lim, do.plot = FALSE) -prof.to.features <- function(a, +prof.to.features <- function(profile, bandwidth = 0.5, min.bw = NA, max.bw = NA, @@ -628,93 +836,73 @@ prof.to.features <- function(a, power = 1, component.eliminate = 0.01, BIC.factor = 2) { - if (sum(shape.model %in% c("bi-Gaussian", "Gaussian")) == 0) { - print("Error: peak shape model has to be Gaussian or bi-Gaussian") - return(0) + validate_inputs(shape.model, estim.method) + + profile <- preprocess_profile(profile) + + bws <- preprocess_bandwidth(min.bw, max.bw, profile) + min.bw <- bws[["min.bw"]] + max.bw <- bws[["max.bw"]] + + # base.curve <- compute_base_curve(profile[, "rt"]) + base.curve <- sort(unique(profile[, "rt"])) + base.curve <- cbind(base.curve, base.curve * 0) + all_rts <- compute_delta_rt(base.curve[, 1]) + aver_diff <- mean(diff(base.curve)) + + keys <- c("mz", "pos", "sd1", "sd2", "area") + peak_parameters <- matrix(0, nrow = 0, ncol = length(keys), dimnames = list(NULL, keys)) + + feature_groups <- split(profile, profile$group_number) + for (i in seq_along(feature_groups)) + { + feature_group <- feature_groups[[i]] + feature_group <- feature_group[order(feature_group[, "rt"]), ] + + num_features <- nrow(feature_group) + if (dplyr::between(num_features, 2, 10)) { + eic_area <- interpol.area(feature_group[, "rt"], feature_group[, "intensity"], base.curve[, "base.curve"], all_rts) + chr_peak_shape <- c(median(feature_group[, "mz"]), median(feature_group[, "rt"]), sd(feature_group[, "rt"]), sd(feature_group[, "rt"]), eic_area) + peak_parameters <- rbind(peak_parameters, chr_peak_shape) } - if (sum(estim.method %in% c("moment", "EM")) == 0) { - print("Error: peak model estimation method has to be moment or EM") - return(0) + if (num_features < 2) { + time_weights <- all_rts[which(base.curve[, "base.curve"] %in% feature_group[2])] + chr_peak_shape <- c(feature_group[1], feature_group[2], NA, NA, feature_group[3] * time_weights) + peak_parameters <- rbind(peak_parameters, chr_peak_shape) } - - if (is.na(min.bw)) min.bw <- diff(range(a[, 2], na.rm = TRUE)) / 60 - if (is.na(max.bw)) max.bw <- diff(range(a[, 2], na.rm = TRUE)) / 15 - if (min.bw >= max.bw) min.bw <- max.bw / 4 - - base.curve <- compute_base_curve(a[, 2]) - all.times <- compute_all_times(base.curve) - - this.features <- matrix(0, nrow = 1, ncol = 5) - colnames(this.features) <- c("mz", "pos", "sd1", "sd2", "area") - nrowa <- nrow(a) - - a.breaks <- c(0, which(a[1:(nrowa - 1), 4] != a[2:nrowa, 4]), nrowa) - mz.sd.rec <- NA - - for (nnn in 1:(length(a.breaks) - 1)) - { - this <- a[(a.breaks[nnn] + 1):a.breaks[nnn + 1], ] - if (is.null(nrow(this))) this <- matrix(this, nrow = 1) - this <- this[order(this[, 2]), ] - if (is.null(nrow(this))) this <- matrix(this, nrow = 1) - mz.sd.rec <- c(mz.sd.rec, sd(this[, 1])) - - this.count.1 <- nrow(this) - if (this.count.1 <= 10) { - if (this.count.1 > 1) { - this.inte <- interpol.area(this[, 2], this[, 3], base.curve[, 1], all.times) - xxx <- c(median(this[, 1]), median(this[, 2]), sd(this[, 2]), sd(this[, 2]), this.inte) - } else { - this.time.weights <- all.times[which(base.curve[, 1] %in% this[2])] - xxx <- c(this[1], this[2], NA, NA, this[3] * this.time.weights) - } - this.features <- rbind(this.features, xxx) - } else { - this.span <- range(this[, 2]) - this.curve <- base.curve[base.curve[, 1] >= this.span[1] & base.curve[, 1] <= this.span[2], ] - this.curve[this.curve[, 1] %in% this[, 2], 2] <- this[, 
3] - - bw <- min(max(bandwidth * (this.span[2] - this.span[1]), min.bw), max.bw) - bw <- seq(bw, 2 * bw, length.out = 3) - if (bw[1] > 1.5 * min.bw) bw <- c(max(min.bw, bw[1] / 2), bw) - - if (shape.model == "Gaussian") { - xxx <- normix.bic(this.curve[, 1], this.curve[, 2], power = power, bw = bw, eliminate = component.eliminate, BIC.factor = BIC.factor)$param - if (nrow(xxx) == 1) { - xxx <- c(xxx[1, 1:2], xxx[1, 2], xxx[1, 3]) - } else { - xxx <- cbind(xxx[, 1:2], xxx[, 2], xxx[, 3]) - } - } else { - xxx <- bigauss.mix(this.curve[, 1], this.curve[, 2], sigma.ratio.lim = sigma.ratio.lim, bw = bw, power = power, estim.method = estim.method, eliminate = component.eliminate, BIC.factor = BIC.factor)$param[, c(1, 2, 3, 5)] - } - - if (is.null(nrow(xxx))) { - this.features <- rbind(this.features, c(median(this[, 1]), xxx)) - } else { - for (m in 1:nrow(xxx)) - { - this.d <- abs(this[, 2] - xxx[m, 1]) - this.features <- rbind(this.features, c(mean(this[which(this.d == min(this.d)), 1]), xxx[m, ])) - } - } + if (num_features > 10) { + rt_range <- range(feature_group[, "rt"]) + bw <- min(max(bandwidth * (max(rt_range) - min(rt_range)), min.bw), max.bw) + bw <- seq(bw, 2 * bw, length.out = 3) + if (bw[1] > 1.5 * min.bw) { + bw <- c(max(min.bw, bw[1] / 2), bw) + } + + chr_profile <- compute_chromatographic_profile(feature_group, base.curve) + if (shape.model == "Gaussian") { + chr_peak_shape <- compute_gaussian_peak_shape(chr_profile, power, bw, component.eliminate, BIC.factor, aver_diff) + } else { + chr_peak_shape <- bigauss.mix(chr_profile, sigma.ratio.lim = sigma.ratio.lim, bw = bw, power = power, estim.method = estim.method, eliminate = component.eliminate, BIC.factor = BIC.factor)$param[, c(1, 2, 3, 5)] + } + + if (is.null(nrow(chr_peak_shape))) { + peak_parameters <- rbind(peak_parameters, c(median(feature_group[, "mz"]), chr_peak_shape)) + } else { + for (m in 1:nrow(chr_peak_shape)) + { + rt_diff <- abs(feature_group[, "rt"] - chr_peak_shape[m, 1]) + peak_parameters <- rbind(peak_parameters, c(mean(feature_group[which(rt_diff == min(rt_diff)), 1]), chr_peak_shape[m, ])) } - - # message(nnn) + } } - this.features <- this.features[-1, ] - this.features <- this.features[order(this.features[, 1], this.features[, 2]), ] - this.features <- this.features[which(apply(this.features[, 3:4], 1, min) > sd.cut[1] & apply(this.features[, 3:4], 1, max) < sd.cut[2]), ] - rownames(this.features) <- NULL + } + peak_parameters <- peak_parameters[order(peak_parameters[, "mz"], peak_parameters[, "pos"]), ] + peak_parameters <- peak_parameters[which(apply(peak_parameters[, c("sd1", "sd2")], 1, min) > sd.cut[1] & apply(peak_parameters[, c("sd1", "sd2")], 1, max) < sd.cut[2]), ] + rownames(peak_parameters) <- NULL - if (do.plot) { - par(mfrow = c(2, 2)) - plot(c(-1, 1), c(-1, 1), type = "n", xlab = "", ylab = "", main = "", axes = FALSE) - text(x = 0, y = 0, "Estimate peak \n area/location", cex = 1.5) - hist(mz.sd.rec, xlab = "m/z SD", ylab = "Frequency", main = "m/z SD distribution") - hist(c(this.features[, 3], this.features[, 4]), xlab = "Retention time SD", ylab = "Frequency", main = "Retention time SD distribution") - hist(log10(this.features[, 5]), xlab = "peak strength (log scale)", ylab = "Frequency", main = "Peak strength distribution") - } + if (do.plot) { + plot_peak_summary(feature_groups, peak_parameters) + } - return(this.features) + return(peak_parameters) } diff --git a/R/recover.weaker.R b/R/recover.weaker.R index c15692a5..c46905e8 100644 --- a/R/recover.weaker.R +++ b/R/recover.weaker.R @@ 
-2,156 +2,621 @@ NULL #> NULL + +#' Custom way of removing duplicate rows from a specifically formatted table. +#' +#' @description +#' Rows are considered duplicates if their `mz`, `rt` and `area` values are less than `tolerance` apart. +#' Only a single row in this `range` is kept from a group. +#' @param features The table from which the duplicate rows should be removed. +#' Needs numeric `mz`, `rt` and `area` columns. +#' @param tolerance Tolerance to use for numeric comparisons. +#' @return Returns the same table with duplicate rows removed. #' @export -duplicate.row.remove <- function(new.table) { - new.table <- new.table[order(new.table[, 1], new.table[, 2], new.table[, 5]), ] +duplicate.row.remove <- function(features, tolerance = 1e-10) { + new.table <- features |> dplyr::arrange_at(c("mz", "rt", "area")) n <- 1 - m <- 2 - to.remove <- rep(0, nrow(new.table)) - - while (m <= nrow(new.table)) { - if (abs(new.table[m, 1] - new.table[n, 1]) < 1e-10 & - abs(new.table[m, 2] - new.table[n, 2]) < 1e-10 & - abs(new.table[m, 5] - new.table[n, 5]) < 1e-10) { - to.remove[m] <- 1 - m <- m + 1 + to.remove <- c() + + for (m in 2:nrow(new.table)) { + if (abs(new.table$mz[m] - new.table$mz[n]) < tolerance & + abs(new.table$rt[m] - new.table$rt[n]) < tolerance & + abs(new.table$area[m] - new.table$area[n]) < tolerance) { + to.remove <- c(to.remove, m) } else { n <- m - m <- m + 1 } - # cat("*(", n, m, ")") } - if (sum(to.remove) > 0) new.table <- new.table[-which(to.remove == 1), ] + if(length(to.remove) > 0) { + new.table <- new.table[-to.remove, ] + } new.table } -#' Compute all time values from base curve. -#' @param base_curve Basis curve +#' Compute custom smoothed h-method derivative of function. +#' @description +#' The function adds an extrapolated element at the end and a 0 element at the front, +#' then computes the midpoints between neighbouring elements and then uses the `diff` +#' function to compute the changes in rt between points. +#' @param times Retention time values. +#' @return Differences between time values. #' @export -compute_all_times <- function(base_curve) { - all_times <- base_curve[, 1] - if (all_times[1] > 0) all_times <- c(0, all_times) - all_times <- c(all_times, 2 * all_times[length(all_times)] - all_times[length(all_times) - 1]) +compute_delta_rt <- function(times) { + # add element which is 2x the last element - the second to last - basically the extrapolated next element + all_times <- c(0, times, 2 * times[length(times)] - times[length(times) - 1]) + + # basically take the mean between consecutive values as the values - somewhat smoothed all_times <- (all_times[1:(length(all_times) - 1)] + all_times[2:length(all_times)]) / 2 - all_times <- all_times[2:length(all_times)] - all_times[1:(length(all_times) - 1)] - return(all_times) -} -#' @export -compute_base_curve <- function(x) { - base_curve <- unique(x) - base_curve <- base_curve[order(base_curve)] - base_curve <- cbind(base_curve, base_curve * 0) - return(base_curve) + # get the differences between the values + all_times <- diff(all_times) + return(all_times) } - #' Normalize vector so that sum(vec) = 1 +#' @description x / sum(x) +#' @param x Data to normalize. +#' @return Normalized data. l2normalize <- function(x) { x / sum(x) } + +#' Compute the density function of mz values. +#' @description +#' The function takes the mz values and uses \link[stats]{density} to +#' compute the local density, optionally using intensity based weighting.
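+#' @examples
+#' # A minimal, hypothetical sketch of the intended call; the tibble values are
+#' # illustrative only and not taken from the package's test data.
+#' \dontrun{
+#' features <- tibble::tibble(
+#'   mz = c(100.0010, 100.0012, 250.5001),
+#'   intensities = c(1e5, 2e5, 5e4)
+#' )
+#' dens <- compute_mass_density(features, bandwidth = 0.001, intensity_weighted = TRUE)
+#' plot(dens)
+#' }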
+#' @param features Tibble with `mz` and `intensities` columns over which to compute the density. +#' The intensities are only used if `intensity_weighted == TRUE`. +#' @param bandwidth Bandwidth to use to compute the kernel density. +#' @param intensity_weighted Whether to use intensity weighting or not. +#' @param n Number of equally spaced points at which to estimate the density (passed to \link[stats]{density}). +#' @return \link[stats]{density} object containing the densities. #' @export -compute_mass_density <- function(mz, - intensities, +compute_mass_density <- function(features, bandwidth, - intensity_weighted) { + intensity_weighted, + n = 512) { if (intensity_weighted) { - mass_density <- density( - mz, - weights = l2normalize(intensities), - bw = bandwidth - ) + weights <- l2normalize(features$intensities) } else { - mass_density <- density(mz, bw = bandwidth) + weights <- NULL } + mass_density <- density( + features$mz, + weights = weights, + bw = bandwidth, + n = n + ) return(mass_density) } +#' Compute custom chromatographic tolerance. +#' @description +#' Compute chromatographic tolerance for each feature. If `use_observed_range == TRUE`, +#' the whole range of retention times for all peaks is used to compute the tolerance, +#' otherwise `chr_range` is used for each feature. +#' @param use_observed_range bool Whether to use the observed chromatographic range for computation or not. +#' @param peak_rts data.frame Retention time cross table with all peak rts. +#' @param chr_range float Default chromatographic tolerance to use. +#' @param aligned_features data.frame Aligned feature table. +#' @return vector Custom chromatographic tolerances to use for each feature. #' @export -get_custom_chr_tol <- function(use.observed.range, - pk.times, - chr.range, - aligned.ftrs) { - custom.chr.tol <- rep(chr.range, nrow(aligned.ftrs)) +get_custom_chr_tol <- function(use_observed_range, + peak_rts, + chr_range, + aligned_features) { + custom_chr_tol <- rep(chr_range, nrow(aligned_features)) - if (use.observed.range) { + if (use_observed_range) { # check observed rt range across ALL SAMPLES - all_peak_rts <- pk.times[, 5:ncol(pk.times)] + all_peak_rts <- peak_rts[, 5:ncol(peak_rts)] observed.chr.range <- (apply(all_peak_rts, 1, max) - apply(all_peak_rts, 1, min)) / 2 sufficient_rts <- apply(!is.na(all_peak_rts), 1, sum) >= 5 - selection <- which(sufficient_rts & custom.chr.tol > observed.chr.range) - custom.chr.tol[selection] <- observed.chr.range[selection] + selection <- which(sufficient_rts & custom_chr_tol > observed.chr.range) + custom_chr_tol[selection] <- observed.chr.range[selection] } - return(custom.chr.tol) + return(custom_chr_tol) } +#' Compute target times for regions of interest for recovery. +#' @description +#' Compute the individual target times for the features to be recovered in the sample. +#' Spline interpolation using \link[splines]{interpSpline} is used to map from adjusted times +#' back into the original times. The function requires `x` to be distinct, hence the filtering +#' to only include rt values that occur only once in both lists. +#' @param aligned_rts vector Aligned retention time values. +#' @param original_sample_rts vector Original feature retention time values before correction. +#' @param adjusted_sample_rts vector Feature retention time values after time correction. +#' @param min_common_times int Minimum number of common retention time values required to fit the spline.
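+#' @examples
+#' # Hypothetical sketch mirroring the call made in recover.weaker(); the vectors
+#' # below are illustrative values, not real chromatographic data.
+#' \dontrun{
+#' original_rts <- seq(10, 60, by = 0.5)
+#' adjusted_rts <- original_rts + 0.2
+#' aligned_rts <- c(12.4, 30.1, NA, 55.9)
+#' compute_target_times(aligned_rts, original_rts, adjusted_rts)
+#' }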
#' @export -compute_target_time <- function(aligned_rts, orig.time, adjusted.time) { - to.use <- get_times_to_use(orig.time, adjusted.time) - orig.time <- orig.time[to.use] - adjusted.time <- adjusted.time[to.use] - - sel.non.na <- which(!is.na(aligned_rts)) - if (length(adjusted.time) >= 4) { - sp <- interpSpline(orig.time ~ adjusted.time, na.action = na.omit) - aligned_rts[sel.non.na] <- predict(sp, aligned_rts[sel.non.na])$y +compute_target_times <- function(aligned_rts, + original_sample_rts, + adjusted_sample_rts, + min_common_times = 4) { + to.use <- get_times_to_use(original_sample_rts, adjusted_sample_rts) + adjusted_subset <- adjusted_sample_rts[to.use] + + sel_non_na <- which(!is.na(aligned_rts)) + if (length(adjusted_subset) >= min_common_times) { + original_subset <- original_sample_rts[to.use] + sp <- splines::interpSpline( + original_subset ~ adjusted_subset, + na.action = na.omit + ) + aligned_rts[sel_non_na] <- predict(sp, aligned_rts[sel_non_na])$y } } +#' Get boolean mask for values that occur only once. +#' @description +#' Uses the \link[base]{table} function to compute the occurrences and then +#' checks which values only occur a single time. +#' @param values vector Values for which to compute the mask. +#' @return vector Boolean vector which is the mask of values occuring only once. +get_single_occurrence_mask <- function(values) { + ttt <- table(values) + mask <- values %in% as.numeric(names(ttt)[ttt == 1]) + return(mask) +} + +#' Get retention time values to use +#' @description +#' Obtain retention time values which occur only once in both the original and the adjusted times. +#' This is a custom version of the unique or intersection function with rounding etc. +#' @param original_sample_rts vector Original feature retention time values before correction. +#' @param adjusted_sample_rts vector Feature retention time values after time correction. +#' @param cap int Maximum number of time values to return. +#' @return Indices of retention time values to use. #' @export -get_times_to_use <- function(orig.time, adjusted.time) { - ttt.0 <- table(orig.time) - ttt <- table(adjusted.time) - to.use <- which(adjusted.time %in% as.numeric(names(ttt)[ttt == 1]) & orig.time %in% as.numeric(names(ttt.0)[ttt.0 == 1])) - if (length(to.use) > 2000) { - to.use <- sample(to.use, 2000, replace = FALSE) +get_times_to_use <- function(original_sample_rts, adjusted_sample_rts, cap = 2000) { + to.use <- which( + get_single_occurrence_mask(adjusted_sample_rts) & + get_single_occurrence_mask(original_sample_rts) + ) + + if (length(to.use) > cap) { + to.use <- sample(to.use, cap, replace = FALSE) } return(to.use) } +#' Predict the indices for the valley points with low mass density. +#' @description +#' The density of mz values in the feature table is computed based on the tolerance. +#' The valleys or breaks of clusters in mz values are detected and a function +#' is approximated to predict the indices for the mass values which are the closest to those +#' valley points. +#' @param features data.frame Data table with features for which to predict the indices. +#' @param mz_orig_tol float Mz tolerance to use as KDE bandwidth parameter. +#' @return vector Predicted indices for valley points. 
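+#' @examples
+#' # Hypothetical sketch: break indices for a small, mz-sorted feature table.
+#' # The two m/z clusters below are made up for illustration.
+#' \dontrun{
+#' features <- tibble::tibble(
+#'   mz = sort(c(rnorm(100, 100, 1e-4), rnorm(100, 200, 1e-4))),
+#'   intensities = runif(200, 1e4, 1e6)
+#' )
+#' predict_mz_break_indices(features, mz_orig_tol = 1e-5)
+#' }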
#' @export -compute_breaks_2 <- function(data_table, orig.tol) { - all.mass.den <- density( - data_table$mz, - weights = l2normalize(data_table$intensities), - bw = 0.5 * orig.tol * max(data_table$mz), - n = 2^min(15, floor(log2(length(data_table$mz))) - 2) +predict_mz_break_indices <- function(features, mz_orig_tol) { + mz_density <- compute_mass_density( + features, + TRUE, + bandwidth = 0.5 * mz_orig_tol * max(features$mz), + n = 2^min(15, floor(log2(length(features$mz))) - 2) ) - all.mass.turns <- find.turn.point(all.mass.den$y) - all.mass.vlys <- all.mass.den$x[all.mass.turns$vlys] - breaks <- c(0, unique(round(approx(data_table$mz, seq_along(data_table$mz), xout = all.mass.vlys, rule = 2, ties = "ordered")$y))[-1]) + turnpoints <- find.turn.point(mz_density$y) + mz_valleys <- mz_density$x[turnpoints$vlys] + indices <- seq_along(features$mz) + predictions <- approx( + x = features$mz, + y = indices, + xout = mz_valleys, + rule = 2, + ties = "ordered" + )$y + + predicted_indices_at_vlys <- unique(round(predictions))[-1] + breaks <- c(0, predicted_indices_at_vlys) return(breaks) } +#' Compute range of valley indices which are in mz_tol range around aligned_feature_mass. +#' @description +#' Finds the valley indices that bracket `aligned_feature_mass`: all valleys within `mz_tol` +#' plus the closest valley above and the closest valley below the mass. +#' @param aligned_feature_mass float Mz value of the aligned feature. +#' @param mz vector mz values of the features. +#' @param vlys_indices vector Indices of the valley points of mz clusters. +#' @param mz_tol float Tolerance to use to check if values are close. +#' @return pair Index range (start, end). +#' @export +get_mzrange_bound_indices <- function(aligned_feature_mass, + mz, + vlys_indices, + mz_tol) { + if (aligned_feature_mass <= mz[vlys_indices[2]]) { + all_indices <- c(1, 2) + } else { + # get all indices where mz is close to aligned_feature mass, + # the first one that is larger and the last one that is smaller. + upper_bound_idx <- min(which(mz[vlys_indices] > aligned_feature_mass)) + lower_bound_idx <- max(which(mz[vlys_indices] < aligned_feature_mass)) + valley_indices_within_tol <- which(abs(mz[vlys_indices] - aligned_feature_mass) < mz_tol) + all_indices <- c( + valley_indices_within_tol, + upper_bound_idx, + lower_bound_idx + ) + 1 + } + return(list(start = min(all_indices), end = max(all_indices))) +} + +#' Get indices where rt in `features` is within `chr_tol` of `target_time`. +#' @param target_time float Target retention time region. +#' @param features tibble Feature table including `labels` column. +#' @param chr_tol float Retention time tolerance. +#' @return vector Indices which are within `chr_tol` from `target_time`, or +#' all row indices if `target_time` is NULL or NA. +#' @export +get_rt_region_indices <- function(target_time, features, chr_tol) { + if (!is.null(target_time) && !is.na(target_time)) { + selection <- which(abs(features$labels - target_time) < chr_tol) + } else { + selection <- 1:nrow(features) + } + return(selection) +} + +#' Get peaks and valleys of smoothed rt values in range. +#' +#' @param features tibble Data table with `labels` and `intensities` columns. +#' @param times vector Raw retention time data from raw data file. +#' @param bw float Bandwidth to use for kernel smoothing. +#' @return Returns a list object with the following objects in it: +#' \itemize{ +#' \item pks - vector - The data points at which the density peaks. +#' \item vlys - vector - The points in the data where the density is low +#' (forming a valley in the function). +#' }
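+#' @examples
+#' # Hypothetical sketch: smooth a small EIC-like trace and return its peaks and
+#' # valleys; the retention times and intensities are invented for illustration.
+#' \dontrun{
+#' times <- seq(95, 115, by = 0.5)
+#' features <- tibble::tibble(
+#'   labels = seq(100, 110, by = 0.5),
+#'   intensities = dnorm(seq(100, 110, by = 0.5), mean = 105, sd = 1) * 1e6
+#' )
+#' get_features_in_rt_range(features, times, bw = 2)
+#' }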
+get_features_in_rt_range <- function(features, times, bw) { + time_curve <- times[times >= min(features$labels) & times <= max(features$labels)] + + this.curve <- cbind(time_curve, time_curve * 0) + this.curve[this.curve[, 1] %in% features$labels, 2] <- features$intensities + + this.smooth <- ksmooth( + this.curve[, 1], + this.curve[, 2], + kernel = "normal", + bandwidth = bw + ) + + return(compute_peaks_and_valleys(this.smooth)) +} + +#' Count the number of peaks in all valleys +#' @description +#' For each peak in ROI, count the peaks between the valley points. +#' @param roi list Named list with vectors `pks` and `vlys`. +#' @param times vector Retention time values. +#' @return vector Numbers of peaks within each region defined by a peak and the two valley points. +count_peaks <- function(roi, times) { + num_peaks <- rep(0, length(roi$pks)) + + for (m in seq_along(roi$pks)) { + boundaries <- compute_boundaries(roi$vlys, roi$pks[m]) + num_peaks[m] <- sum(times >= boundaries$lower & times <= boundaries$upper) + } + return(num_peaks) +} + +#' Compute peaks and valleys which have at least `recover_min_count` peaks. +#' +#' @param features tibble Features with `mz`, `labels` and `intensities`. +#' @param times vector Retention time values from the raw data file. +#' @param bandwidth float Bandwidth to use in smoothing. +#' @param target_rt float Retention time at which to recover the intensity. +#' @param recover_min_count int Minimum number of peaks required in the area to recover the signal. +#' @return Returns a list object with the following objects in it: +#' \itemize{ +#' \item pks - vector - The data points at which the density peaks with at least `recover_min_count` peaks between the valley points. +#' \item vlys - vector - The points in the data where the density is low +#' (forming a valley in the function). +#' } +compute_pks_vlys_rt <- function(features, times, bandwidth, target_rt, recover_min_count) { + roi <- get_features_in_rt_range( + features, + times, + bandwidth + ) + + pks <- roi$pks + vlys <- roi$vlys + + num_peaks <- count_peaks(roi, features$labels) + + if (!is.null(target_rt) && !is.na(target_rt)) { + pks.d <- abs(pks - target_rt) # distance from the target peak location + pks.d[num_peaks == 0] <- Inf + pks <- pks[which.min(pks.d)] + } else { + pks <- pks[num_peaks > recover_min_count] + } + + return(list(pks = pks, vlys = vlys)) +} + +#' Compute interpolated retention time, its standard deviation, and intensity values. +#' +#' @param features tibble Features with `labels` and `intensities` columns. +#' @param aver_diff float Average retention time difference. +#' @return Returns a list object with the following objects in it: +#' \itemize{ +#' \item intensity - float - Interpolated intensity value. +#' \item label - float - Interpolated retention time value.
+#' \item sigma - float - Standard deviation of retention times. +#' } #' @export -get_mzrange_bound_indices <- function(aligned.ftrs, masses, breaks, i, custom.mz.tol) { - if (aligned.ftrs[i, "mz"] <= masses[breaks[2]]) { - this.found <- c(1, 2) +compute_mu_sc_std <- function(features, aver_diff) { + x <- features$labels + y <- features$intensities + + sum_y <- sum(y) + miu <- sum(x * y) / sum_y # weighted retention time values + sigma <- sqrt(sum(y * (x - miu)^2) / sum_y) + if (sigma == 0) { + sc <- sum_y * aver_diff + miu <- miu } else { - this.found <- c(which(abs(masses[breaks] - aligned.ftrs[i, "mz"]) < custom.mz.tol[i]), min(which(masses[breaks] > aligned.ftrs[i, "mz"])), max(which(masses[breaks] < aligned.ftrs[i, "mz"]))) + 1 - this.found <- c(min(this.found), max(this.found)) + fitted <- dnorm(x, mean = miu, sd = sigma) + selection <- y > 0 & fitted / dnorm(miu, mean = miu, sd = sigma) > 1e-2 + sc <- exp(sum(fitted[selection]^2 * log(y[selection] / fitted[selection]) / sum(fitted[selection]^2))) } - return(this.found) + + return(list(intensity = sc, label = miu, sigma = sigma)) } +#' Compute the rectangle around recovered features given that enough peaks are present. +#' @description +#' Depending on the number of data points between the valley points, the intensity and retention time +#' are taken directly, estimated from a fitted normal curve, or computed from the interpolated area. +#' @param mz Mz value of the feature. +#' @param peak Peak around which to detect the new feature. +#' @param valleys Valley points to compute the boundary region. +#' @param features tibble Tibble with `labels` and `intensities` columns. +#' @param aver_diff float Average retention time difference. +#' @param times vector Raw retention time values from raw data file. +#' @param delta_rt vector Differences between consecutive retention time values (diff(times)). +#' @return list Triplet of mz, label and intensity for the feature. +compute_curr_rec_with_enough_peaks <- function(mz, + peak, + valleys, + features, + aver_diff, + times, + delta_rt) { + # same filtering of peaks as in compute_pks_vlys_rt and as above + boundaries <- compute_boundaries(valleys, peak) + + subset <- features |> + dplyr::filter(labels >= boundaries$lower & labels <= boundaries$upper) + + if (nrow(subset) == 1) { + intensity <- subset$intensities * aver_diff + label <- subset$labels + } else if (nrow(subset) >= 10) { + res <- compute_mu_sc_std(subset, aver_diff) + intensity <- res$intensity + label <- res$label + } else { + intensity <- interpol.area( + subset$labels, + subset$intensities, + times, + delta_rt + ) + label <- median(subset$labels) + } + + return(c(mz, label, intensity)) +} + +#' Compute bounds of area using given peak and mass valley points. +#' @description +#' The lower bound is the mass of the closest valley smaller than the peak, +#' and the upper bound is the mass of the closest valley greater than the peak. +#' @param valley_points vector Values of valley points defining clusters. +#' @param peak double Value of the peak for which to get the valley bounds. +#' @return Returns a list object with the following objects in it: +#' \itemize{ +#' \item lower - double - The value of the lower bound valley point. +#' \item upper - double - The value of the upper bound valley point.
+#' } #' @export -get_raw_features_in_mzrange <- function(data_table, aligned.ftrs, breaks, i, custom.mz.tol) { - this.found <- get_mzrange_bound_indices(aligned.ftrs, data_table$mz, breaks, i, custom.mz.tol) - this.sel <- (breaks[this.found[1]] + 1):breaks[this.found[2]] - features <- data_table |> dplyr::slice(this.sel) - return(features) +compute_boundaries <- function(valley_points, peak) { + lower <- max(valley_points[valley_points < peak]) + upper <- min(valley_points[valley_points > peak]) + return(list(lower = lower, upper = upper)) +} + +#' Compute peaks and valleys of density function. +#' @description +#' Given a density function, the turn points are computed and +#' the peaks and valleys in the original data (points with highest +#' and lowest density) are returned. +#' @param density stats::density Density object for which to compute peaks +#' and valleys. +#' @return Returns a list object with the following objects in it: +#' \itemize{ +#' \item pks - vector - The data points at which the density peaks. +#' \item vlys - vector - The points in the data where the density is low +#' (forming a valley in the function). +#' } +compute_peaks_and_valleys <- function(dens) { + turns <- find.turn.point(dens$y) + pks <- dens$x[turns$pks] # mz values with highest density + vlys <- dens$x[turns$vlys] + vlys <- c(-Inf, vlys, Inf) # masses with lowest densities values -> valley + return(list(pks = pks, vlys = vlys)) +} + +#' Compute rectangle around feature with `aligned_feature_mz` and `target_rt` for recovery. +#' +#' @param data_table tibble Feature table with `mz`, `labels` and `intensities` column. +#' @param aligned_feature_mz float Mz value of feature in aligned feature table. +#' @param breaks vector Integer boundaries of clusters in mz values. +#' @param custom_mz_tol float Custom mz tolerance for the feature. +#' @param orig_mz_tol float Flat original mz tolerance to use. +#' @param use_intensity_weighting bool Whether to use intensity weighting. +#' @param recover_min_count int Minimum number of peaks required in the area to recover the signal. +#' @param target_rt float Target retention time value. +#' @param custom_chr_tol float Custom chromatographic tolerance to use. +#' @param times vector Raw retention time values from raw data file. +#' @param delta_rt vector Differences between consecutive retention time values (diff(times)). +#' @param aver_diff float Average retention time difference. +#' @param bandwidth float Bandwidth to use in smoothing. +#' @param min.bw float Minimum bandwidth to use. +#' @param max.bw float Maximum bandwidth to use. +#' @return tibble Tibble with `mz`, `labels` and `intensities` columns. 
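+#' @examples
+#' # Hypothetical sketch mirroring how recover.weaker() calls this function; the
+#' # objects `data_table`, `breaks` and `times` are assumed to have been prepared
+#' # as in that function, and all numeric values are illustrative.
+#' \dontrun{
+#' rect <- compute_rectangle(
+#'   data_table,
+#'   aligned_feature_mz = 250.1234,
+#'   breaks = breaks,
+#'   custom_mz_tol = 0.01,
+#'   orig_mz_tol = 1e-5,
+#'   use_intensity_weighting = TRUE,
+#'   recover_min_count = 3,
+#'   target_rt = 320,
+#'   custom_chr_tol = 10,
+#'   times = times,
+#'   delta_rt = compute_delta_rt(times),
+#'   aver_diff = mean(diff(times)),
+#'   bandwidth = 0.5,
+#'   min.bw = diff(range(times)) / 60,
+#'   max.bw = diff(range(times)) / 15
+#' )
+#' }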
+compute_rectangle <- function(data_table, + aligned_feature_mz, + breaks, + custom_mz_tol, + orig_mz_tol, + use_intensity_weighting, + recover_min_count, + target_rt, + custom_chr_tol, + times, + delta_rt, + aver_diff, + bandwidth, + min.bw, + max.bw) { + bounds <- get_mzrange_bound_indices( + aligned_feature_mz, + data_table$mz, + breaks, + orig_mz_tol + ) + + features <- dplyr::slice( + data_table, + (breaks[bounds$start] + 1):breaks[bounds$end] + ) |> dplyr::arrange_at("labels") + + mass.den <- compute_mass_density( + features, + bandwidth = 0.5 * orig_mz_tol * aligned_feature_mz, + intensity_weighted = use_intensity_weighting + ) + + # find peaks in mz range in raw data + mass_range <- compute_peaks_and_valleys(mass.den) + pks_in_tol <- abs(mass_range$pks - aligned_feature_mz) < custom_mz_tol / 1.5 + mass_range$pks <- mass_range$pks[pks_in_tol] + + this.rec <- tibble::tibble(mz = Inf, labels = Inf, intensities = Inf) + for (peak in mass_range$pks) { + # get mass values of valleys the closest to the peak + mass <- compute_boundaries(mass_range$vlys, peak) + + that <- features |> + dplyr::filter(mz > mass$lower & mz <= mass$upper) + + # get values in RT region of interest? + if (nrow(that) > recover_min_count) { + that.prof <- combine.seq.3(that) + that.mass <- sum(that.prof$mz * that.prof$intensities) / sum(that.prof$intensities) + curr.rec <- c(that.mass, NA, NA) + + if (nrow(that.prof) < 10) { + thee.sel <- get_rt_region_indices( + target_rt, + that.prof, + custom_chr_tol + ) + + if (length(thee.sel) > recover_min_count) { + if (length(thee.sel) > 1) { + curr.rec[3] <- interpol.area( + that.prof$labels[thee.sel], + that.prof$intensities[thee.sel], + times, + delta_rt + ) + } else { + curr.rec[3] <- that.prof$intensities[thee.sel] * aver_diff + } + curr.rec[2] <- median(that.prof$labels[thee.sel]) + this.rec <- tibble::add_row( + this.rec, + tibble::tibble_row( + mz = curr.rec[1], + labels = curr.rec[2], + intensities = curr.rec[3] + ) + ) + } + } else { + labels_intensities <- dplyr::select( + that.prof, + c("labels", "intensities") + ) |> dplyr::arrange_at("labels") + bw <- min(max(bandwidth * (span(labels_intensities$labels)), min.bw), max.bw) + + all <- compute_pks_vlys_rt( + labels_intensities, + times, + bw, + target_rt, + recover_min_count + ) + + for (peak in all$pks) { + curr.rec <- compute_curr_rec_with_enough_peaks( + that.mass, + peak, + all$vlys, + labels_intensities, + aver_diff, + times, + delta_rt + ) + + this.rec <- tibble::add_row( + this.rec, + tibble::tibble_row( + mz = curr.rec[1], + labels = curr.rec[2], + intensities = curr.rec[3] + ) + ) + } + } + } + } + return(this.rec) +} + +#' Refine the selection based on mz and rt differences. +#' @param target_rt float Target retention time value. +#' @param rectangle tibble Features with columns `labels` and `mz`. +#' @param aligned_mz float Mz value in the aligned feature table of the +#' feature to be recovered. +#' @param chr_tol float Retention time tolerance. +#' @param mz_tol float Mz tolerance to use. +#' @return int Index of value in rectable closest to `target_rt` and `aligned_mz`. 
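+#' @examples
+#' # Hypothetical sketch: pick the recovered candidate closest to the target in
+#' # the combined (scaled) rt/mz distance; all numbers are illustrative.
+#' \dontrun{
+#' rectangle <- tibble::tibble(
+#'   mz = c(250.1230, 250.1301),
+#'   labels = c(318.2, 341.0),
+#'   intensities = c(2.1e5, 9.8e4)
+#' )
+#' refine_selection(
+#'   target_rt = 320, rectangle,
+#'   aligned_mz = 250.1234, chr_tol = 10, mz_tol = 0.01
+#' )
+#' }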
+refine_selection <- function(target_rt, rectangle, aligned_mz, chr_tol, mz_tol) { + if (!is.na(target_rt)) { + rt_term <- (rectangle$labels - target_rt)^2 / chr_tol^2 + mz_term <- (rectangle$mz - aligned_mz)^2 / mz_tol^2 + this.d <- rt_term + mz_term + } else { + this.d <- abs(rectangle$mz - aligned_mz) + } + this.sel <- which.min(this.d) + return(this.sel) } #' Recover weak signals in some profiles that is not identified as a peak, but corresponds to identified peaks in other spectra. -#' +#' #' @description -#' Given the aligned feature table, some features are identified in a subgroup of spectra. This doesn't mean they don't exist in the other spectra. -#' The signal could be too low to pass the run filter. Thus after obtaining the aligned feature table, this function re-analyzes each spectrum to +#' Given the aligned feature table, some features are identified in a subgroup of spectra. This doesn't mean they don't exist in the other spectra. +#' The signal could be too low to pass the run filter. Thus after obtaining the aligned feature table, this function re-analyzes each spectrum to #' try and fill in the holes in the aligned feature table. #' @param filename the cdf file name from which weaker signal is to be recovered. #' @param loc the location of the filename in the vector of filenames. @@ -159,15 +624,20 @@ get_raw_features_in_mzrange <- function(data_table, aligned.ftrs, breaks, i, cus #' @param pk.times matrix, with columns of m/z, median elution time, and elution times in each spectrum. #' @param align.mz.tol the m/z tolerance used in the alignment. #' @param align.chr.tol the elution time tolerance in the alignment. -#' @param this.f1 The matrix which is the output from proc.to.feature(). -#' @param this.f2 The matrix which is the output from proc.to.feature(). The retention time in this object have been adjusted by the function adjust.time(). -#' @param mz.range The m/z around the feature m/z to search for observations. The default value is NA, in which case 1.5 times the m/z tolerance in the aligned object will be used. -#' @param chr.range The retention time around the feature retention time to search for observations. The default value is NA, in which case 0.5 times the retention time tolerance in the aligned object will be used. -#' @param use.observed.range If the value is TRUE, the actual range of the observed locations of the feature in all the spectra will be used. +#' @param extracted_features The matrix which is the output from proc.to.feature(). +#' @param adjusted_features The matrix which is the output from proc.to.feature(). +#' The retention time in this object have been adjusted by the function adjust.time(). +#' @param mz.range The m/z around the feature m/z to search for observations. +#' The default value is NA, in which case 1.5 times the m/z tolerance in the aligned object will be used. +#' @param chr.range The retention time around the feature retention time to search for observations. +#' The default value is NA, in which case 0.5 times the retention time tolerance in the aligned object will be used. +#' @param use.observed.range If the value is TRUE, the actual range of the observed locations +#' of the feature in all the spectra will be used. #' @param orig.tol The mz.tol parameter provided to the proc.cdf() function. This helps retrieve the intermediate file. #' @param min.bw The minimum bandwidth to use in the kernel smoother. #' @param max.bw The maximum bandwidth to use in the kernel smoother. -#' @param bandwidth A value between zero and one. 
Multiplying this value to the length of the signal along the time axis helps determine the bandwidth in the kernel smoother used for peak identification. +#' @param bandwidth A value between zero and one. Multiplying this value to the length of the signal along the +#' time axis helps determine the bandwidth in the kernel smoother used for peak identification. #' @param recover.min.count minimum number of raw data points to support a recovery. #' @param intensity.weighted Whether to use intensity to weight mass density estimation. #' @return Returns a list object with the following objects in it: @@ -186,8 +656,8 @@ recover.weaker <- function(filename, pk.times, align.mz.tol, align.chr.tol, - this.f1, - this.f2, + extracted_features, + adjusted_features, mz.range = NA, chr.range = NA, use.observed.range = TRUE, @@ -201,24 +671,27 @@ recover.weaker <- function(filename, # load raw data this.raw <- load_file(filename) times <- this.raw$times - data_table <- tibble::tibble(mz = this.raw$masses, labels = this.raw$labels, intensities = this.raw$intensi) |> dplyr::arrange_at("mz") + data_table <- tibble::tibble( + mz = this.raw$masses, + labels = this.raw$labels, + intensities = this.raw$intensi + ) |> dplyr::arrange_at("mz") rm(this.raw) - masses <- data_table$mz # Initialize parameters with default values if (is.na(mz.range)) mz.range <- 1.5 * align.mz.tol if (is.na(chr.range)) chr.range <- align.chr.tol / 2 - if (is.na(min.bw)) min.bw <- diff(range(times, na.rm = TRUE)) / 60 - if (is.na(max.bw)) max.bw <- diff(range(times, na.rm = TRUE)) / 15 + if (is.na(min.bw)) min.bw <- span(times) / 60 + if (is.na(max.bw)) max.bw <- span(times) / 15 if (min.bw >= max.bw) min.bw <- max.bw / 4 - base.curve <- compute_base_curve(sort(times)) - aver.diff <- mean(diff(base.curve[, 1])) - all.times <- compute_all_times(base.curve) + times <- sort(unique(times)) + aver.diff <- mean(diff(times)) + vec_delta_rt <- compute_delta_rt(times) - this.ftrs <- aligned.ftrs[, sample_name] - this.times <- pk.times[, sample_name] + sample_intensities <- aligned.ftrs[, sample_name] + sample_times <- pk.times[, sample_name] custom.mz.tol <- mz.range * aligned.ftrs$mz custom.chr.tol <- get_custom_chr_tol( @@ -228,188 +701,104 @@ recover.weaker <- function(filename, aligned.ftrs ) - target.time <- compute_target_time( + # # rounding is used to create a histogram of retention time values + target_times <- compute_target_times( aligned.ftrs[, "rt"], - round(this.f1[, "pos"], 5), - round(this.f2[, "pos"], 5) + round(extracted_features$pos, 5), + round(adjusted_features$rt, 5) ) - breaks <- compute_breaks_2(data_table, orig.tol) + # IMPORTANT: THIS CODE SECTION COULD BE USED TO REPLACE COMPUTE_TARGET_TIMES FOR THE TEST CASES AND + # IS A MASSIVE SIMPLIFICATION. 
+ # sp <- splines::interpSpline( + # unique(extracted_features$pos) ~ unique(adjusted_features$rt), + # na.action = na.omit + # ) + # target_times <- predict(sp, aligned.ftrs[, "rt"])$y - this.mz <- rep(NA, length(this.ftrs)) - for (i in seq_along(this.ftrs)) - { - if (this.ftrs[i] == 0 && aligned.ftrs[i, "mz"] < max(masses)) { - features <- get_raw_features_in_mzrange(data_table, aligned.ftrs, breaks, i, custom.mz.tol) - - mass.den <- compute_mass_density( - mz = features$mz, - intensities = features$intensities, - bandwidth = 0.5 * orig.tol * aligned.ftrs[i, "mz"], - intensity_weighted = intensity.weighted - ) + breaks <- predict_mz_break_indices(data_table, orig.tol) - # find peaks in mz range in raw data - mass.turns <- find.turn.point(mass.den$y) - mass.pks <- mass.den$x[mass.turns$pks] # mz values with highest density - mass.vlys <- c(-Inf, mass.den$x[mass.turns$vlys], Inf) # masses with lowest densities values -> valley - mass.pks <- mass.pks[which(abs(mass.pks - aligned.ftrs[i, "mz"]) < custom.mz.tol[i] / 1.5)] - - if (length(mass.pks) > 0) { - this.rec <- matrix(c(Inf, Inf, Inf), nrow = 1) - for (peak in mass.pks) - { - # get mass values of valleys the closest to the peak - mass.lower <- max(mass.vlys[mass.vlys < peak]) - mass.upper <- min(mass.vlys[mass.vlys > peak]) - - that <- features |> - dplyr::filter(mz > mass.lower & mz <= mass.upper) |> - dplyr::arrange_at("labels") - - if (nrow(that) > recover.min.count) { - that.prof <- combine.seq.3_old(that$labels, that$mz, that$intensities) - that.mass <- sum(that.prof[, 1] * that.prof[, 3]) / sum(that.prof[, 3]) - curr.rec <- c(that.mass, NA, NA) - - if (nrow(that.prof) < 10) { - if (!is.na(target.time[i])) { - thee.sel <- which(abs(that.prof[, 2] - target.time[i]) < custom.chr.tol[i] * 2) - } else { - thee.sel <- 1:nrow(that.prof) - } - - if (length(thee.sel) > recover.min.count) { - if (length(thee.sel) > 1) { - that.inte <- interpol.area(that.prof[thee.sel, 2], that.prof[thee.sel, 3], base.curve[, 1], all.times) - } else { - that.inte <- that.prof[thee.sel, 3] * aver.diff - } - curr.rec[3] <- that.inte - curr.rec[2] <- median(that.prof[thee.sel, 2]) - this.rec <- rbind(this.rec, curr.rec) - } - } else { - this <- that.prof[, 2:3] - this <- this[order(this[, 1]), ] - this.span <- range(this[, 1]) - this.curve <- base.curve[base.curve[, 1] >= this.span[1] & base.curve[, 1] <= this.span[2], ] - this.curve[this.curve[, 1] %in% this[, 1], 2] <- this[, 2] - - bw <- min(max(bandwidth * (max(this[, 1]) - min(this[, 1])), min.bw), max.bw) - this.smooth <- ksmooth(this.curve[, 1], this.curve[, 2], kernel = "normal", bandwidth = bw) - smooth.y <- this.smooth$y - turns <- find.turn.point(smooth.y) - pks <- this.smooth$x[turns$pks] - vlys <- this.smooth$x[turns$vlys] - vlys <- c(-Inf, vlys, Inf) - - pks.n <- pks - for (m in 1:length(pks)) - { - this.vlys <- c(max(vlys[which(vlys < pks[m])]), min(vlys[which(vlys > pks[m])])) - pks.n[m] <- sum(this[, 1] >= this.vlys[1] & this[, 1] <= this.vlys[2]) - } - - if (!is.na(target.time[i])) { - pks.d <- abs(pks - target.time[i]) # distance from the target peak location - pks.d[pks.n == 0] <- Inf - pks <- pks[which(pks.d == min(pks.d))[1]] - } else { - pks <- pks[pks.n > recover.min.count] - } - - all.pks <- pks - all.vlys <- vlys - all.this <- this - - if (length(all.pks) > 0) { - for (pks.i in 1:length(all.pks)) - { - pks <- all.pks[pks.i] - vlys <- c(max(all.vlys[which(all.vlys < pks)]), min(all.vlys[which(all.vlys > pks)])) - - this <- all.this[which(all.this[, 1] >= vlys[1] & all.this[, 1] <= 
vlys[2]), ] - if (is.null(nrow(this))) { - curr.rec[3] <- this[2] * aver.diff - curr.rec[2] <- this[1] - } else { - x <- this[, 1] - y <- this[, 2] - - if (nrow(this) >= 10) { - miu <- sum(x * y) / sum(y) - sigma <- sqrt(sum(y * (x - miu)^2) / sum(y)) - if (sigma == 0) { - curr.rec[3] <- sum(y) * aver.diff - curr.rec[2] <- miu - } else { - fitted <- dnorm(x, mean = miu, sd = sigma) - this.sel <- y > 0 & fitted / dnorm(miu, mean = miu, sd = sigma) > 1e-2 - sc <- exp(sum(fitted[this.sel]^2 * log(y[this.sel] / fitted[this.sel]) / sum(fitted[this.sel]^2))) - } - } else { - sc <- interpol.area(x, y, base.curve[, 1], all.times) - miu <- median(x) - } - curr.rec[3] <- sc - curr.rec[2] <- miu - } - this.rec <- rbind(this.rec, curr.rec) - } - } - } - } - } + this.mz <- rep(NA, length(sample_intensities)) + max_mz <- max(data_table$mz) - if (!is.na(target.time[i])) { - this.sel <- which(abs(this.rec[, 2] - target.time[i]) < custom.chr.tol[i]) - } else { - this.sel <- 1:nrow(this.rec) - this.sel <- this.sel[this.sel != 1] - } + # THIS CONSTRUCT TO EXTRACT MISSING FEATURES COULD BE USED TO POSSIBLY SPEED UP + # THE COMPUTATION AS THE LOOP WILL ONLY GO OVER THE ROWS AND THE ADDITIONAL VARIABLES + # CAN BE ADDED USING THE MUTATE FUNCTION. + # t_aligned <- tibble::tibble(aligned.ftrs) + # missing_features <- dplyr::filter(t_aligned, !!rlang::sym(sample_name) == 0 & mz < max_mz) + # if(nrow(missing_features) > 0) { + # browser() + # } - if (length(this.sel) > 0) { - if (length(this.sel) > 1) { - if (!is.na(target.time[i])) { - this.d <- (this.rec[, 2] - target.time[i])^2 / custom.chr.tol[i]^2 + (this.rec[, 1] - aligned.ftrs[i, 1])^2 / custom.mz.tol[i]^2 - this.sel <- which(this.d == min(this.d))[1] - } else { - this.d <- abs(this.rec[, 1] - aligned.ftrs[i, 1]) - this.sel <- which(this.d == min(this.d))[1] - } - } - this.pos.diff <- abs(this.f1[, 2] - this.rec[this.sel, 2]) - this.pos.diff <- which(this.pos.diff == min(this.pos.diff))[1] - this.f1 <- rbind(this.f1, c(this.rec[this.sel, 1], this.rec[this.sel, 2], NA, NA, this.rec[this.sel, 3])) - this.time.adjust <- (-this.f1[this.pos.diff, 2] + this.f2[this.pos.diff, 2]) - this.f2 <- rbind( - this.f2, - c( - this.rec[this.sel, 1], - this.rec[this.sel, 2] + this.time.adjust, - NA, - NA, - this.rec[this.sel, 3], - grep(sample_name, colnames(aligned.ftrs)), - NA - ) + for (i in seq_along(sample_intensities)) + { + if (sample_intensities[i] == 0 && aligned.ftrs[i, "mz"] < max_mz) { + this.rec <- compute_rectangle( + data_table, + aligned.ftrs[i, "mz"], + breaks, + custom.mz.tol[i], + orig.tol, + intensity.weighted, + recover.min.count, + target_times[i], + custom.chr.tol[i] * 2, + times, + vec_delta_rt, + aver.diff, + bandwidth, + min.bw, + max.bw + ) + + this.sel <- get_rt_region_indices( + target_times[i], + this.rec, + custom.chr.tol[i] + ) + this.sel <- this.sel[this.sel != 1] + + if (length(this.sel) > 0) { + if (length(this.sel) > 1) { + this.sel <- refine_selection( + target_times[i], + this.rec, + aligned.ftrs[i, 1], + custom.chr.tol[i], + custom.mz.tol[i] ) - this.ftrs[i] <- this.rec[this.sel, 3] - this.times[i] <- this.rec[this.sel, 2] + this.time.adjust - this.mz[i] <- this.rec[this.sel, 1] } + + this.pos.diff <- which.min(abs(extracted_features$pos - this.rec$labels[this.sel])) + extracted_features <- extracted_features |> tibble::add_row( + mz = this.rec$mz[this.sel], + pos = this.rec$labels[this.sel], + area = this.rec$intensities[this.sel] + ) + + this.time.adjust <- (-extracted_features$pos[this.pos.diff] + adjusted_features$rt[this.pos.diff]) 
+ + adjusted_features <- adjusted_features |> tibble::add_row( + mz = this.rec$mz[this.sel], + rt = this.rec$labels[this.sel] + this.time.adjust, + area = this.rec$intensities[this.sel], + sample_id = grep(sample_name, colnames(aligned.ftrs)) - 4 # offset for other columns `mz`, `rt` etc + ) + + sample_intensities[i] <- this.rec$intensities[this.sel] + sample_times[i] <- this.rec$labels[this.sel] + this.time.adjust + this.mz[i] <- this.rec$mz[this.sel] } } } to.return <- new("list") to.return$this.mz <- this.mz - to.return$this.ftrs <- this.ftrs - to.return$this.times <- this.times - to.return$this.f1 <- duplicate.row.remove(this.f1) - to.return$this.f2 <- duplicate.row.remove(this.f2) + to.return$this.ftrs <- sample_intensities + to.return$this.times <- sample_times + to.return$this.f1 <- duplicate.row.remove(extracted_features |> dplyr::rename(rt = pos)) |> dplyr::rename(pos = rt) + to.return$this.f2 <- duplicate.row.remove(adjusted_features) return(to.return) } diff --git a/R/semi.sup.R b/R/semi.sup.R index cf2eed11..eaea1650 100644 --- a/R/semi.sup.R +++ b/R/semi.sup.R @@ -383,7 +383,7 @@ semi.sup <- function( for(j in this.subset) { this.name<-paste(strsplit(tolower(files[j]),"\\.")[[1]][1],suf,"semi_sup.recover",sep="_") - this.recovered<-recover.weaker(filename=files[j], sample_name = files[j], aligned.ftrs=aligned.ftrs, pk.times=pk.times, align.mz.tol=aligned$mz.tol, align.chr.tol=aligned$chr.tol, this.f1=features[[j]], this.f2=f2[[j]], mz.range=recover.mz.range, chr.range=recover.chr.range, use.observed.range=use.observed.range, orig.tol=mz.tol, min.bw=min.bw, max.bw=max.bw, bandwidth=.5, recover.min.count=recover.min.count) + this.recovered<-recover.weaker(filename=files[j], sample_name = files[j], aligned.ftrs=aligned.ftrs, pk.times=pk.times, align.mz.tol=aligned$mz.tol, align.chr.tol=aligned$chr.tol, extracted_features=features[[j]], adjusted_features=f2[[j]], mz.range=recover.mz.range, chr.range=recover.chr.range, use.observed.range=use.observed.range, orig.tol=mz.tol, min.bw=min.bw, max.bw=max.bw, bandwidth=.5, recover.min.count=recover.min.count) save(this.recovered, file=this.name) } } diff --git a/R/unsupervised.R b/R/unsupervised.R index b5ca531e..b88a3a6d 100644 --- a/R/unsupervised.R +++ b/R/unsupervised.R @@ -89,8 +89,8 @@ recover_weaker_signals <- function( recover.weaker( sample_name = get_sample_name(filenames[i]), filename = filenames[[i]], - this.f1 = extracted_features[[i]], - this.f2 = corrected_features[[i]], + extracted_features = as_tibble(extracted_features[[i]]), + adjusted_features = as_tibble(corrected_features[[i]]), pk.times = aligned_rt_crosstab, aligned.ftrs = aligned_int_crosstab, orig.tol = original_mz_tolerance, @@ -234,7 +234,7 @@ unsupervised <- function( message("**** time correction ****") corrected <- adjust.time( - features = extracted, + extracted_features = extracted, mz_tol_relative = align_mz_tol, rt_tol_relative = align_chr_tol, mz_max_diff = 10 * mz_tol, @@ -287,7 +287,7 @@ unsupervised <- function( corrected_features = recovered$corrected_features, aligned_feature_sample_table = aligned_feature_sample_table, recovered_feature_sample_table = recovered_feature_sample_table, - aligned_mz_toletance = as.numeric(aligned$mz_tolerance), + aligned_mz_tolerance = as.numeric(aligned$mz_tolerance), aligned_rt_tolerance = as.numeric(aligned$rt_tolerance) ) } diff --git a/R/utils.R b/R/utils.R index d508aa59..4c4b1c92 100644 --- a/R/utils.R +++ b/R/utils.R @@ -2,17 +2,19 @@ NULL #> NULL -get_feature_values <- function(features, rt_colname) { - mz 
<- c() - rt <- c() - sample_id <- c() - for (i in 1:length(features)) { - sample_features <- dplyr::as_tibble(features[[i]]) - mz <- c(mz, sample_features$mz) - rt <- c(rt, sample_features[[rt_colname]]) - sample_id <- c(sample_id, rep(i, nrow(sample_features))) - } - return(list(mz = mz, rt = rt, sample_id = sample_id)) +#' Concatenate multiple feature lists and add the sample id (origin of feature) as additional column. +#' +#' @param features list List of tibbles containing extracted feature tables. +#' @param rt_colname string Name of retention time information column, usually "pos". +concatenate_feature_tables <- function(features, rt_colname) { + for (i in seq_along(features)) { + if(!("sample_id" %in% colnames(features[[i]]))) { + features[[i]] <- tibble::add_column(features[[i]], sample_id = i) + } + } + + merged <- dplyr::bind_rows(features) + return(merged) } #' @export @@ -89,3 +91,23 @@ create_feature_sample_table <- function(features) { ) return(table) } + +span <- function(x) { + diff(range(x, na.rm = TRUE)) +} + +#' @description +#' Compute standard deviation of m/z values groupwise +#' @export +compute_mz_sd <- function(feature_groups) { + mz_sd <- c() + for (i in seq_along(feature_groups)) { + group <- feature_groups[[i]] + + if (nrow(group > 1)) { + group_sd <- sd(group[, "mz"]) + mz_sd <- append(mz_sd, group_sd) + } + } + return(mz_sd) +} \ No newline at end of file diff --git a/conda/environment-dev.yaml b/conda/environment-dev.yaml index b82d92a7..da174ea7 100644 --- a/conda/environment-dev.yaml +++ b/conda/environment-dev.yaml @@ -5,6 +5,7 @@ channels: - defaults dependencies: - r-base >= 3.5 + - r-pkgload <= 1.2.4 - r-devtools - r-biocmanager - r-mass @@ -21,7 +22,7 @@ dependencies: - r-rocr - r-rocs - r-rcpp - - r-arrow + - r-arrow <= 8.0.0 - r-dplyr - r-tidyr - r-stringr @@ -33,3 +34,4 @@ dependencies: - r-datacomparer - r-patrick - radian + - r-httpgd diff --git a/conda/meta.yaml b/conda/meta.yaml index 6a57bb75..6ce393b2 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -57,7 +57,7 @@ requirements: - r-rocr - r-rocs - r-rcpp - - r-arrow + - r-arrow <= 8.0.0 - r-dplyr - r-tidyr - r-stringr diff --git a/tests/remote-files/features.txt b/tests/remote-files/features.txt index 5d7849aa..e03130d6 100644 --- a/tests/remote-files/features.txt +++ b/tests/remote-files/features.txt @@ -1,4 +1,5 @@ https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/features/mbr_test0_features.Rds https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/features/RCX_06_shortened_features.Rds +https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/features/RCX_06_shortened_gaussian_features.Rds https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/features/RCX_07_shortened_features.Rds https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/features/RCX_08_shortened_features.Rds \ No newline at end of file diff --git a/tests/testthat/test-adjust-time.R b/tests/testthat/test-adjust-time.R index df7d74ec..78095244 100644 --- a/tests/testthat/test-adjust-time.R +++ b/tests/testthat/test-adjust-time.R @@ -29,7 +29,7 @@ patrick::with_parameters_test_that( doParallel::registerDoParallel(cluster) corrected <- adjust.time( - features = extracted, + extracted_features = extracted, mz_tol_relative = mz_tol, rt_tol_relative = chr_tol, mz_max_diff = find_tol_max_d, @@ -41,10 +41,11 @@ patrick::with_parameters_test_that( file.path(testdata, "adjusted", paste0(x, ".parquet")) 
}) - expected <- lapply(expected_filenames, arrow::read_parquet) - expected <- lapply(expected, as.data.frame) + expected <- lapply(expected_filenames, function(x) { + tibble::as_tibble(arrow::read_parquet(x)) |> dplyr::rename( rt = pos, sample_id = V6, rt_cluster = V7) + }) - corrected <- lapply(corrected, as.data.frame) + corrected <- lapply(corrected, tibble::as_tibble) expect_equal(corrected, expected) }, diff --git a/tests/testthat/test-hybrid.R b/tests/testthat/test-hybrid.R index d3049536..e68e93dd 100644 --- a/tests/testthat/test-hybrid.R +++ b/tests/testthat/test-hybrid.R @@ -1,11 +1,4 @@ -test_that("basic hybrid test", { - test_files <- c('../testdata/mbr_test0.mzml', - '../testdata/mbr_test1.mzml', - '../testdata/mbr_test2.mzml') - - expected <- arrow::read_parquet('../testdata/hybrid_recovered_feature_sample_table.parquet') - known_table <- arrow::read_parquet('../testdata/known_table.parquet') - +get_num_workers <- function() { # CRAN limits the number of cores available to packages to 2 # source https://stackoverflow.com/questions/50571325/r-cran-check-fail-when-using-parallel-functions#50571533 chk <- Sys.getenv("_R_CHECK_LIMIT_CORES_", "") @@ -17,8 +10,21 @@ test_that("basic hybrid test", { # use all cores in devtools::test() num_workers <- parallel::detectCores() } + return(num_workers) +} + +test_that("basic hybrid test", { + test_files <- c('../testdata/mbr_test0.mzml', + '../testdata/mbr_test1.mzml', + '../testdata/mbr_test2.mzml') + + expected <- arrow::read_parquet('../testdata/hybrid_recovered_feature_sample_table.parquet') + known_table <- arrow::read_parquet('../testdata/known_table.parquet') + + actual <- hybrid(test_files, known_table, cluster = get_num_workers()) - result <- hybrid(test_files, known_table, cluster = num_workers) + actual$recovered_feature_sample_table <- actual$recovered_feature_sample_table |> dplyr::arrange_all() + expected <- expected |> dplyr::arrange_all() - expect_equal(result$recovered_feature_sample_table, expected) + expect_equal(actual$recovered_feature_sample_table, expected) }) diff --git a/tests/testthat/test-prof.to.features.R b/tests/testthat/test-prof.to.features.R index 8fd53837..78baaba6 100644 --- a/tests/testthat/test-prof.to.features.R +++ b/tests/testthat/test-prof.to.features.R @@ -9,6 +9,7 @@ patrick::with_parameters_test_that( extracted_features, sd.cut = sd_cut, sigma.ratio.lim = sigma_ratio_lim, + shape.model = shape_model, do.plot = FALSE ) @@ -22,25 +23,36 @@ patrick::with_parameters_test_that( filename = c("mbr_test0_cdf.Rds"), expected_filename = "mbr_test0_features.Rds", sd_cut = c(0.1, 100), - sigma_ratio_lim = c(0.1, 10) + sigma_ratio_lim = c(0.1, 10), + shape_model = "bi-Gaussian" + ), + RCX_01_shortened_gaussian = list( + filename = c("RCX_06_shortened_cdf.Rds"), + expected_filename = "RCX_06_shortened_gaussian_features.Rds", + sd_cut = c(0.01, 500), + sigma_ratio_lim = c(0.01, 100), + shape_model = "Gaussian" ), RCX_01_shortened_v2 = list( filename = c("RCX_06_shortened_cdf.Rds"), expected_filename = "RCX_06_shortened_features.Rds", sd_cut = c(0.01, 500), - sigma_ratio_lim = c(0.01, 100) + sigma_ratio_lim = c(0.01, 100), + shape_model = "bi-Gaussian" ), RCX_09_shortened_v2 = list( filename = c("RCX_07_shortened_cdf.Rds"), expected_filename = "RCX_07_shortened_features.Rds", sd_cut = c(0.01, 500), - sigma_ratio_lim = c(0.01, 100) + sigma_ratio_lim = c(0.01, 100), + shape_model = "bi-Gaussian" ), RCX_16_shortened_v2 = list( filename = c("RCX_08_shortened_cdf.Rds"), expected_filename = 
"RCX_08_shortened_features.Rds", sd_cut = c(0.01, 500), - sigma_ratio_lim = c(0.01, 100) + sigma_ratio_lim = c(0.01, 100), + shape_model = "bi-Gaussian" ) ) ) diff --git a/tests/testthat/test-recover-weaker.R b/tests/testthat/test-recover-weaker.R index c5cec338..7558d7e7 100644 --- a/tests/testthat/test-recover-weaker.R +++ b/tests/testthat/test-recover-weaker.R @@ -19,8 +19,10 @@ patrick::with_parameters_test_that( file.path(testdata, "adjusted", paste0(x, ".parquet")) }) - adjusted <- lapply(filenames, arrow::read_parquet) - adjusted <- lapply(adjusted, as.data.frame) + adjusted <- lapply(filenames, function(x) { + arrow::read_parquet(x) |> dplyr::rename(rt = pos, sample_id = V6) + }) + #adjusted <- lapply(adjusted, as.data.frame) aligned <- load_aligned_features( file.path(testdata, "aligned", "rt_cross_table.parquet"), @@ -28,33 +30,12 @@ patrick::with_parameters_test_that( file.path(testdata, "aligned", "tolerances.parquet") ) - chk <- Sys.getenv("_R_CHECK_LIMIT_CORES_", "") - - if (nzchar(chk) && chk == "TRUE") { - # use 2 cores in CRAN/Travis/AppVeyor - cluster <- 2L - } else { - # use all cores in devtools::test() - cluster <- parallel::detectCores() - } - - if (!is(cluster, "cluster")) { - cluster <- parallel::makeCluster(cluster) - on.exit(parallel::stopCluster(cluster)) - } - - clusterExport(cluster, c( - "recover.weaker", "load.lcms", "find.turn.point", - "combine.seq.3", "interpol.area", "duplicate.row.remove", "compute_all_times", "load_file" - )) - clusterEvalQ(cluster, library("splines")) - recovered <- lapply(seq_along(ms_files), function(i) { recover.weaker( filename = ms_files[[i]], sample_name = get_sample_name(files[i]), - this.f1 = extracted[[i]], - this.f2 = adjusted[[i]], + extracted_features = extracted[[i]], + adjusted_features = adjusted[[i]], pk.times = aligned$rt_crosstab, aligned.ftrs = aligned$int_crosstab, orig.tol = mz_tol, @@ -108,23 +89,24 @@ patrick::with_parameters_test_that( file.path(testdata, "recovered", "recovered-corrected", paste0(x, ".parquet")) }) - corrected_recovered_expected <- lapply(filenames, arrow::read_parquet) - + corrected_recovered_expected <- lapply(filenames, function(x) { + arrow::read_parquet(x) |> dplyr::rename(rt = pos, sample_id = V6) + }) # preprocess dataframes - keys <- c("mz", "pos", "sd1", "sd2") + keys <- c("mz", "sd1", "sd2", "area") extracted_recovered_actual <- lapply(extracted_recovered_actual, function(x) { - as.data.frame(x) |> dplyr::arrange_at(keys) + x |> dplyr::arrange_at(c(keys, "pos")) }) corrected_recovered_actual <- lapply(corrected_recovered_actual, function(x) { - as.data.frame(x) |> dplyr::arrange_at(keys) + x |> dplyr::arrange_at(c(keys, "rt")) }) extracted_recovered_expected <- lapply(extracted_recovered_expected, function(x) { - as.data.frame(x) |> dplyr::arrange_at(keys) + x |> dplyr::arrange_at(c(keys, "pos")) }) corrected_recovered_expected <- lapply(corrected_recovered_expected, function(x) { - as.data.frame(x) |> dplyr::arrange_at(keys) + x |> dplyr::arrange_at(c(keys, "rt")) }) # compare files @@ -133,33 +115,19 @@ patrick::with_parameters_test_that( actual_extracted_i <- extracted_recovered_actual[[i]] expected_extracted_i <- extracted_recovered_expected[[i]] - report <- dataCompareR::rCompare(actual_extracted_i, expected_extracted_i, keys = keys, roundDigits = 4, mismatches = 100000) - dataCompareR::saveReport(report, reportName = files[[i]], showInViewer = FALSE, HTMLReport = FALSE, mismatchCount = 10000) - - expect_true(nrow(report$rowMatching$inboth) >= 0.9 * nrow(expected_extracted_i)) - 
- incommon <- as.numeric(rownames(report$rowMatching$inboth)) - - subset_actual <- actual_extracted_i %>% dplyr::slice(incommon) - subset_expected <- expected_extracted_i %>% dplyr::slice(incommon) + expect_equal(actual_extracted_i, expected_extracted_i) - expect_equal(subset_actual$area, subset_expected$area, tolerance = 0.01 * max(subset_expected$area)) + # report <- dataCompareR::rCompare(actual_extracted_i, expected_extracted_i, keys = keys, roundDigits = 4, mismatches = 100000) + # dataCompareR::saveReport(report, reportName = paste0(files[[i]],"_extracted"), showInViewer = FALSE, HTMLReport = FALSE, mismatchCount = 10000) # corrected recovered actual_corrected_i <- corrected_recovered_actual[[i]] expected_corrected_i <- corrected_recovered_expected[[i]] - report <- dataCompareR::rCompare(actual_corrected_i, expected_corrected_i, keys = keys, roundDigits = 4, mismatches = 100000) - dataCompareR::saveReport(report, reportName = files[[i]], showInViewer = FALSE, HTMLReport = FALSE, mismatchCount = 10000) - - expect_true(nrow(report$rowMatching$inboth) >= 0.9 * nrow(expected_corrected_i)) - - incommon <- as.numeric(rownames(report$rowMatching$inboth)) - - subset_actual <- actual_corrected_i %>% dplyr::slice(incommon) - subset_expected <- expected_corrected_i %>% dplyr::slice(incommon) + expect_equal(actual_corrected_i, expected_corrected_i) - expect_equal(subset_actual$area, subset_expected$area, tolerance = 0.01 * max(subset_expected$area)) + # report <- dataCompareR::rCompare(actual_corrected_i, expected_corrected_i, keys = keys, roundDigits = 4, mismatches = 100000) + # dataCompareR::saveReport(report, reportName = paste0(files[[i]],"_adjusted"), showInViewer = FALSE, HTMLReport = FALSE, mismatchCount = 10000) } }, patrick::cases( @@ -174,13 +142,4 @@ patrick::with_parameters_test_that( recover_min_count = 3 ) ) -) - -files = c("RCX_06_shortened", "RCX_07_shortened", "RCX_08_shortened") -mz_tol = 1e-05 -recover_mz_range = NA -recover_chr_range = NA -use_observed_range = TRUE -min_bandwidth = NA -max_bandwidth = NA -recover_min_count = 3 \ No newline at end of file +) \ No newline at end of file diff --git a/tests/testthat/test-two-step-hybrid.R b/tests/testthat/test-two-step-hybrid.R index 1258c2a1..e6ea6e33 100644 --- a/tests/testthat/test-two-step-hybrid.R +++ b/tests/testthat/test-two-step-hybrid.R @@ -1,4 +1,5 @@ test_that("basic two-step hybrid test", { + skip("Disabled") skip_on_ci() test_names <- c( "mbr_test0.mzml",