From b006c195a4b039a42c7d870166724beebfb49c96 Mon Sep 17 00:00:00 2001 From: wverastegui Date: Fri, 29 Jul 2022 11:18:27 +0200 Subject: [PATCH 01/53] setup remote --- filetest | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 filetest diff --git a/filetest b/filetest new file mode 100644 index 00000000..e69de29b From b099d9e39e1726e68991671e3e38bba8cb1af150 Mon Sep 17 00:00:00 2001 From: wverastegui Date: Fri, 29 Jul 2022 11:19:24 +0200 Subject: [PATCH 02/53] Update --- filetest | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 filetest diff --git a/filetest b/filetest deleted file mode 100644 index e69de29b..00000000 From ae0be7dd0ebc0622e22cdef4eb6cf987996fa53f Mon Sep 17 00:00:00 2001 From: hechth Date: Mon, 1 Aug 2022 13:58:09 +0200 Subject: [PATCH 03/53] extracted functions into top level --- R/recover.weaker.R | 381 ++++++++++++++++++++++++++------------------- 1 file changed, 224 insertions(+), 157 deletions(-) diff --git a/R/recover.weaker.R b/R/recover.weaker.R index 78e7accc..b3027bb5 100644 --- a/R/recover.weaker.R +++ b/R/recover.weaker.R @@ -143,11 +143,182 @@ get_raw_features_in_mzrange <- function(data_table, aligned.ftrs, breaks, i, cus return(features) } +#' @export +get_rt_region_indices <- function(retention_time, that.prof, chr_tol) { + if (!is.na(retention_time)) { + selection <- which(abs(that.prof[, 2] - retention_time) < chr_tol * 2) + } else { + selection <- seq_len(that.prof) + } + return(selection) +} + +compute_EIC_area <- function(thee.sel, that.prof, base.curve, all.times, aver.diff) { + if (length(thee.sel) > 1) { + that.inte <- interpol.area(that.prof[thee.sel, 2], that.prof[thee.sel, 3], base.curve[, 1], all.times) + } else { + that.inte <- that.prof[thee.sel, 3] * aver.diff + } + return(that.inte) +} + +get_features_in_rt_range <- function(this, base.curve, bw) { + this.span <- range(this[, 1]) + this.curve <- base.curve[base.curve[, 1] >= this.span[1] & base.curve[, 1] <= this.span[2], ] + this.curve[this.curve[, 1] %in% this[, 1], 2] <- this[, 2] + + this.smooth <- ksmooth(this.curve[, 1], this.curve[, 2], kernel = "normal", bandwidth = bw) + smooth.y <- this.smooth$y + turns <- find.turn.point(smooth.y) + pks <- this.smooth$x[turns$pks] + vlys <- this.smooth$x[turns$vlys] + vlys <- c(-Inf, vlys, Inf) + return(list(pks = pks, vlys = vlys)) +} + +compute_pks_vlys_rt <- function(that.prof, base.curve, bandwidth, min.bw, max.bw, target_rt, recover.min.count) { + # extract rt labels and intensities + this <- that.prof[, 2:3] + this <- this[order(this[, 1]), ] + + bw <- min(max(bandwidth * (max(this[, 1]) - min(this[, 1])), min.bw), max.bw) + + roi <- get_features_in_rt_range( + this, + base.curve, + bw + ) + pks <- roi$pks + vlys <- roi$vlys + + pks.n <- pks + for (m in 1:length(pks)) + { + this.vlys <- c(max(vlys[which(vlys < pks[m])]), min(vlys[which(vlys > pks[m])])) # same as upper part in function + pks.n[m] <- sum(this[, 1] >= this.vlys[1] & this[, 1] <= this.vlys[2]) + } + + if (!is.na(target_rt)) { + pks.d <- abs(pks - target_rt) # distance from the target peak location + pks.d[pks.n == 0] <- Inf + pks <- pks[which(pks.d == min(pks.d))[1]] + } else { + pks <- pks[pks.n > recover.min.count] + } + return(list(pks = pks, vlys = vlys, this = this)) +} + +compute_curr_rec_with_enough_peaks <- function(that.mass, pks, all, aver.diff, base.curve, all.times) { + curr.rec <- c(that.mass, NA, NA) + + # same filtering of peaks as in compute_pks_vlyws and as above + vlys <- c( + max(all$vlys[which(all$vlys < pks)]), + 
min(all$vlys[which(all$vlys > pks)]) + ) + + this <- all$this[which(all$this[, 1] >= vlys[1] & all$this[, 1] <= vlys[2]), ] + + if (is.null(nrow(this))) { + curr.rec[3] <- this[2] * aver.diff + curr.rec[2] <- this[1] + } else { + x <- this[, 1] + y <- this[, 2] + + if (nrow(this) >= 10) { + miu <- sum(x * y) / sum(y) + sigma <- sqrt(sum(y * (x - miu)^2) / sum(y)) + if (sigma == 0) { + curr.rec[3] <- sum(y) * aver.diff + curr.rec[2] <- miu + } else { + fitted <- dnorm(x, mean = miu, sd = sigma) + this.sel <- y > 0 & fitted / dnorm(miu, mean = miu, sd = sigma) > 1e-2 + sc <- exp(sum(fitted[this.sel]^2 * log(y[this.sel] / fitted[this.sel]) / sum(fitted[this.sel]^2))) + } + } else { + sc <- interpol.area(x, y, base.curve[, 1], all.times) + miu <- median(x) + } + curr.rec[3] <- sc + curr.rec[2] <- miu + } + return(curr.rec) +} + +compute_rectangle <- function(data_table, + aligned.ftrs, + breaks, + i, + custom.mz.tol, + orig.tol, intensity.weighted, + recover.min.count, + target.time, + custom.chr.tol, + base.curve, + all.times, + aver.diff, + bandwidth, + min.bw, + max.bw) { + features <- get_raw_features_in_mzrange(data_table, aligned.ftrs, breaks, i, custom.mz.tol) + + mass.den <- compute_mass_density( + mz = features$mz, + intensities = features$intensities, + bandwidth = 0.5 * orig.tol * aligned.ftrs[i, "mz"], + intensity_weighted = intensity.weighted + ) + + # find peaks in mz range in raw data + mass.turns <- find.turn.point(mass.den$y) + mass.pks <- mass.den$x[mass.turns$pks] # mz values with highest density + mass.vlys <- c(-Inf, mass.den$x[mass.turns$vlys], Inf) # masses with lowest densities values -> valley + mass.pks <- mass.pks[which(abs(mass.pks - aligned.ftrs[i, "mz"]) < custom.mz.tol[i] / 1.5)] + + this.rec <- matrix(c(Inf, Inf, Inf), nrow = 1) + for (peak in mass.pks) { + # get mass values of valleys the closest to the peak + mass.lower <- max(mass.vlys[mass.vlys < peak]) + mass.upper <- min(mass.vlys[mass.vlys > peak]) + + that <- features |> + dplyr::filter(mz > mass.lower & mz <= mass.upper) |> + dplyr::arrange_at("labels") + + # get values in RT region of interest? + if (nrow(that) > recover.min.count) { + that.prof <- combine.seq.3(that$labels, that$mz, that$intensities) + that.mass <- sum(that.prof[, 1] * that.prof[, 3]) / sum(that.prof[, 3]) + curr.rec <- c(that.mass, NA, NA) + + if (nrow(that.prof) < 10) { + thee.sel <- get_rt_region_indices(target.time[i], that.prof, custom.chr.tol[i]) + + if (length(thee.sel) > recover.min.count) { + curr.rec[3] <- compute_EIC_area(thee.sel, that.prof, base.curve, all.times, aver.diff) + curr.rec[2] <- median(that.prof[thee.sel, 2]) + this.rec <- rbind(this.rec, curr.rec) + } + } else { + all <- compute_pks_vlys_rt(that.prof, base.curve, bandwidth, min.bw, max.bw, target.time[i], recover.min.count) + + for (pks in all$pks) { + curr.rec <- compute_curr_rec_with_enough_peaks(that.mass, pks, all, aver.diff, base.curve, all.times) + this.rec <- rbind(this.rec, curr.rec) + } + } + } + } + return(this.rec) +} + #' Recover weak signals in some profiles that is not identified as a peak, but corresponds to identified peaks in other spectra. -#' +#' #' @description -#' Given the aligned feature table, some features are identified in a subgroup of spectra. This doesn't mean they don't exist in the other spectra. -#' The signal could be too low to pass the run filter. 
Thus after obtaining the aligned feature table, this function re-analyzes each spectrum to +#' Given the aligned feature table, some features are identified in a subgroup of spectra. This doesn't mean they don't exist in the other spectra. +#' The signal could be too low to pass the run filter. Thus after obtaining the aligned feature table, this function re-analyzes each spectrum to #' try and fill in the holes in the aligned feature table. #' @param filename the cdf file name from which weaker signal is to be recovered. #' @param loc the location of the filename in the vector of filenames. @@ -237,166 +408,62 @@ recover.weaker <- function(filename, for (i in seq_along(this.ftrs)) { if (this.ftrs[i] == 0 && aligned.ftrs[i, "mz"] < max(masses)) { - features <- get_raw_features_in_mzrange(data_table, aligned.ftrs, breaks, i, custom.mz.tol) - - mass.den <- compute_mass_density( - mz = features$mz, - intensities = features$intensities, - bandwidth = 0.5 * orig.tol * aligned.ftrs[i, "mz"], - intensity_weighted = intensity.weighted + this.rec <- compute_rectangle( + data_table, + aligned.ftrs, + breaks, + i, + custom.mz.tol, + orig.tol, intensity.weighted, + recover.min.count, + target.time, + custom.chr.tol, + base.curve, + all.times, + aver.diff, + bandwidth, + min.bw, + max.bw ) - # find peaks in mz range in raw data - mass.turns <- find.turn.point(mass.den$y) - mass.pks <- mass.den$x[mass.turns$pks] # mz values with highest density - mass.vlys <- c(-Inf, mass.den$x[mass.turns$vlys], Inf) # masses with lowest densities values -> valley - mass.pks <- mass.pks[which(abs(mass.pks - aligned.ftrs[i, "mz"]) < custom.mz.tol[i] / 1.5)] - - if (length(mass.pks) > 0) { - this.rec <- matrix(c(Inf, Inf, Inf), nrow = 1) - for (peak in mass.pks) - { - # get mass values of valleys the closest to the peak - mass.lower <- max(mass.vlys[mass.vlys < peak]) - mass.upper <- min(mass.vlys[mass.vlys > peak]) - - that <- features |> - dplyr::filter(mz > mass.lower & mz <= mass.upper) |> - dplyr::arrange_at("labels") - - if (nrow(that) > recover.min.count) { - that.prof <- combine.seq.3(that$labels, that$mz, that$intensities) - that.mass <- sum(that.prof[, 1] * that.prof[, 3]) / sum(that.prof[, 3]) - curr.rec <- c(that.mass, NA, NA) - - if (nrow(that.prof) < 10) { - if (!is.na(target.time[i])) { - thee.sel <- which(abs(that.prof[, 2] - target.time[i]) < custom.chr.tol[i] * 2) - } else { - thee.sel <- 1:nrow(that.prof) - } - - if (length(thee.sel) > recover.min.count) { - if (length(thee.sel) > 1) { - that.inte <- interpol.area(that.prof[thee.sel, 2], that.prof[thee.sel, 3], base.curve[, 1], all.times) - } else { - that.inte <- that.prof[thee.sel, 3] * aver.diff - } - curr.rec[3] <- that.inte - curr.rec[2] <- median(that.prof[thee.sel, 2]) - this.rec <- rbind(this.rec, curr.rec) - } - } else { - this <- that.prof[, 2:3] - this <- this[order(this[, 1]), ] - this.span <- range(this[, 1]) - this.curve <- base.curve[base.curve[, 1] >= this.span[1] & base.curve[, 1] <= this.span[2], ] - this.curve[this.curve[, 1] %in% this[, 1], 2] <- this[, 2] - - bw <- min(max(bandwidth * (max(this[, 1]) - min(this[, 1])), min.bw), max.bw) - this.smooth <- ksmooth(this.curve[, 1], this.curve[, 2], kernel = "normal", bandwidth = bw) - smooth.y <- this.smooth$y - turns <- find.turn.point(smooth.y) - pks <- this.smooth$x[turns$pks] - vlys <- this.smooth$x[turns$vlys] - vlys <- c(-Inf, vlys, Inf) - - pks.n <- pks - for (m in 1:length(pks)) - { - this.vlys <- c(max(vlys[which(vlys < pks[m])]), min(vlys[which(vlys > pks[m])])) - pks.n[m] <- 
sum(this[, 1] >= this.vlys[1] & this[, 1] <= this.vlys[2]) - } - - if (!is.na(target.time[i])) { - pks.d <- abs(pks - target.time[i]) # distance from the target peak location - pks.d[pks.n == 0] <- Inf - pks <- pks[which(pks.d == min(pks.d))[1]] - } else { - pks <- pks[pks.n > recover.min.count] - } - - all.pks <- pks - all.vlys <- vlys - all.this <- this - - if (length(all.pks) > 0) { - for (pks.i in 1:length(all.pks)) - { - pks <- all.pks[pks.i] - vlys <- c(max(all.vlys[which(all.vlys < pks)]), min(all.vlys[which(all.vlys > pks)])) - - this <- all.this[which(all.this[, 1] >= vlys[1] & all.this[, 1] <= vlys[2]), ] - if (is.null(nrow(this))) { - curr.rec[3] <- this[2] * aver.diff - curr.rec[2] <- this[1] - } else { - x <- this[, 1] - y <- this[, 2] - - if (nrow(this) >= 10) { - miu <- sum(x * y) / sum(y) - sigma <- sqrt(sum(y * (x - miu)^2) / sum(y)) - if (sigma == 0) { - curr.rec[3] <- sum(y) * aver.diff - curr.rec[2] <- miu - } else { - fitted <- dnorm(x, mean = miu, sd = sigma) - this.sel <- y > 0 & fitted / dnorm(miu, mean = miu, sd = sigma) > 1e-2 - sc <- exp(sum(fitted[this.sel]^2 * log(y[this.sel] / fitted[this.sel]) / sum(fitted[this.sel]^2))) - } - } else { - sc <- interpol.area(x, y, base.curve[, 1], all.times) - miu <- median(x) - } - curr.rec[3] <- sc - curr.rec[2] <- miu - } - this.rec <- rbind(this.rec, curr.rec) - } - } - } - } - } - - if (!is.na(target.time[i])) { - this.sel <- which(abs(this.rec[, 2] - target.time[i]) < custom.chr.tol[i]) - } else { - this.sel <- 1:nrow(this.rec) - this.sel <- this.sel[this.sel != 1] - } + if (!is.na(target.time[i])) { + this.sel <- which(abs(this.rec[, 2] - target.time[i]) < custom.chr.tol[i]) + } else { + this.sel <- seq_len(this.rec) + this.sel <- this.sel[this.sel != 1] + } - if (length(this.sel) > 0) { - if (length(this.sel) > 1) { - if (!is.na(target.time[i])) { - this.d <- (this.rec[, 2] - target.time[i])^2 / custom.chr.tol[i]^2 + (this.rec[, 1] - aligned.ftrs[i, 1])^2 / custom.mz.tol[i]^2 - this.sel <- which(this.d == min(this.d))[1] - } else { - this.d <- abs(this.rec[, 1] - aligned.ftrs[i, 1]) - this.sel <- which(this.d == min(this.d))[1] - } + if (length(this.sel) > 0) { + if (length(this.sel) > 1) { + if (!is.na(target.time[i])) { + this.d <- (this.rec[, 2] - target.time[i])^2 / custom.chr.tol[i]^2 + (this.rec[, 1] - aligned.ftrs[i, 1])^2 / custom.mz.tol[i]^2 + this.sel <- which(this.d == min(this.d))[1] + } else { + this.d <- abs(this.rec[, 1] - aligned.ftrs[i, 1]) + this.sel <- which(this.d == min(this.d))[1] } - this.pos.diff <- abs(this.f1[, 2] - this.rec[this.sel, 2]) - this.pos.diff <- which(this.pos.diff == min(this.pos.diff))[1] - this.f1 <- rbind(this.f1, c(this.rec[this.sel, 1], this.rec[this.sel, 2], NA, NA, this.rec[this.sel, 3])) - this.time.adjust <- (-this.f1[this.pos.diff, 2] + this.f2[this.pos.diff, 2]) - this.f2 <- rbind( - this.f2, - c( - this.rec[this.sel, 1], - this.rec[this.sel, 2] + this.time.adjust, - NA, - NA, - this.rec[this.sel, 3], - grep(sample_name, colnames(aligned.ftrs)), - NA - ) - ) - this.ftrs[i] <- this.rec[this.sel, 3] - this.times[i] <- this.rec[this.sel, 2] + this.time.adjust - this.mz[i] <- this.rec[this.sel, 1] } + + this.pos.diff <- abs(this.f1[, 2] - this.rec[this.sel, 2]) + this.pos.diff <- which(this.pos.diff == min(this.pos.diff))[1] + this.f1 <- rbind(this.f1, c(this.rec[this.sel, 1], this.rec[this.sel, 2], NA, NA, this.rec[this.sel, 3])) + this.time.adjust <- (-this.f1[this.pos.diff, 2] + this.f2[this.pos.diff, 2]) + this.f2 <- rbind( + this.f2, + c( + this.rec[this.sel, 1], + 
this.rec[this.sel, 2] + this.time.adjust, + NA, + NA, + this.rec[this.sel, 3], + grep(sample_name, colnames(aligned.ftrs)), + NA + ) + ) + this.ftrs[i] <- this.rec[this.sel, 3] + this.times[i] <- this.rec[this.sel, 2] + this.time.adjust + this.mz[i] <- this.rec[this.sel, 1] } } } From e573714058cd901f3fbae3021b96651f0323c4e7 Mon Sep 17 00:00:00 2001 From: wverastegui Date: Tue, 2 Aug 2022 11:44:31 +0200 Subject: [PATCH 04/53] Refactoring --- R/find.tol.time.R | 52 +++++++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/R/find.tol.time.R b/R/find.tol.time.R index 4f19c7f6..614c9f9f 100644 --- a/R/find.tol.time.R +++ b/R/find.tol.time.R @@ -17,7 +17,7 @@ #' This parameter limits the tolerance in absolute terms. It mostly influences feature matching in higher m/z range. #' @param max.num.segments the maximum number of segments. #' @param do.plot Indicates whether plot should be drawn. -#' @return A matrix with six columns. Every row corrsponds to a peak in one of the spectrum. The columns are: m/z, elution time, spread, signal strength, +#' @return A matrix with six columns. Every row corresponds to a peak in one of the spectrum. The columns are: m/z, elution time, spread, signal strength, #' spectrum label, and peak group label. The rows are ordered by the median m/z of each peak group, and with each peak group the rows are ordered #' by the elution time. #' @examples @@ -34,27 +34,44 @@ find.tol.time <- function(mz, mz_tol_absolute = 0.01, max.num.segments = 10000, do.plot = TRUE) { - o <- order(mz) - mz <- mz[o] - chr <- chr[o] - lab <- lab[o] - rm(o) + + features <- tibble::tibble(mz = mz, rt = chr, labels = lab) + features <- dplyr::arrange_at(features, "mz") + + # o <- order(mz) + # mz <- mz[o] + # chr <- chr[o] + # lab <- lab[o] + # rm(o) + + mz <- features$mz + chr <- features$rt + lab <- features$labels l <- length(mz) + + min_mz_tol <- min( + mz_tol_absolute, + mz_tol_relative * ((mz[2:l] + mz[1:(l - 1)]) / 2) + ) - breaks <- - c(0, which((mz[2:l] - mz[1:(l - 1)]) > min(mz_tol_absolute, mz_tol_relative * (( - mz[2:l] + mz[1:(l - 1)] - ) / 2))), l) - + indices <- which((mz[2:l] - mz[1:(l - 1)]) > min_mz_tol) + breaks <- c(0, indices, l) for (i in 2:length(breaks)) { - this.o <- order(chr[(breaks[i - 1] + 1):breaks[i]]) + this.iter <- (breaks[i - 1] + 1):breaks[i] + this.o <- order(chr[this.iter]) this.o <- this.o + breaks[i - 1] - mz[(breaks[i - 1] + 1):breaks[i]] <- mz[this.o] - chr[(breaks[i - 1] + 1):breaks[i]] <- chr[this.o] - lab[(breaks[i - 1] + 1):breaks[i]] <- lab[this.o] + mz[this.iter] <- mz[this.o] + chr[this.iter] <- chr[this.o] + lab[this.iter] <- lab[this.o] + # newfeatures <- tibble::tibble(mz = mz[this.iter], chr = chr[this.iter], lab = lab[this.iter]) } - + + # mz <- newfeatures$mz + # chr <- newfeatures$chr + # lab <- newfeatures$lab + + breaks <- breaks[c(-1, -length(breaks))] if (is.na(rt_tol_relative)) { da <- 0 @@ -98,7 +115,8 @@ find.tol.time <- function(mz, rt_tol_relative <- x[sel] if (do.plot) { - tolerance_plot(x, y, exp.y, sel, main = "find retention time tolerance") + tolerance_plot(x, y, exp.y, + sel, main = "find retention time tolerance") } } From 91746c298af0ff5ab8ca178afbbb30730eb8ca43 Mon Sep 17 00:00:00 2001 From: hechth Date: Tue, 2 Aug 2022 13:37:31 +0200 Subject: [PATCH 05/53] renamed and extracted variables --- R/recover.weaker.R | 173 ++++++++++++++++++++++++++++++--------------- 1 file changed, 116 insertions(+), 57 deletions(-) diff --git a/R/recover.weaker.R b/R/recover.weaker.R index 
b3027bb5..75d9721e 100644
--- a/R/recover.weaker.R
+++ b/R/recover.weaker.R
@@ -125,30 +125,39 @@ compute_breaks_2 <- function(data_table, orig.tol) {
 }
 
 #' @export
-get_mzrange_bound_indices <- function(aligned.ftrs, masses, breaks, i, custom.mz.tol) {
-  if (aligned.ftrs[i, "mz"] <= masses[breaks[2]]) {
+get_mzrange_bound_indices <- function(aligned_feature_mass, masses, breaks, mz_tol) {
+  if (aligned_feature_mass <= masses[breaks[2]]) {
     this.found <- c(1, 2)
   } else {
-    this.found <- c(which(abs(masses[breaks] - aligned.ftrs[i, "mz"]) < custom.mz.tol[i]), min(which(masses[breaks] > aligned.ftrs[i, "mz"])), max(which(masses[breaks] < aligned.ftrs[i, "mz"]))) + 1
+    this.found <- c(
+      which(abs(masses[breaks] - aligned_feature_mass) < mz_tol),
+      min(which(masses[breaks] > aligned_feature_mass)),
+      max(which(masses[breaks] < aligned_feature_mass))
+    ) + 1
     this.found <- c(min(this.found), max(this.found))
   }
   return(this.found)
}
 
 #' @export
-get_raw_features_in_mzrange <- function(data_table, aligned.ftrs, breaks, i, custom.mz.tol) {
-  this.found <- get_mzrange_bound_indices(aligned.ftrs, data_table$mz, breaks, i, custom.mz.tol)
+get_raw_features_in_mzrange <- function(data_table, aligned_feature_mass, breaks, mz_tol) {
+  this.found <- get_mzrange_bound_indices(
+    aligned_feature_mass,
+    data_table$mz,
+    breaks,
+    mz_tol
+  )
   this.sel <- (breaks[this.found[1]] + 1):breaks[this.found[2]]
   features <- data_table |> dplyr::slice(this.sel)
   return(features)
 }
 
 #' @export
-get_rt_region_indices <- function(retention_time, that.prof, chr_tol) {
+get_rt_region_indices <- function(retention_time, profile_data, chr_tol) {
   if (!is.na(retention_time)) {
-    selection <- which(abs(that.prof[, 2] - retention_time) < chr_tol * 2)
+    selection <- which(abs(profile_data[, 2] - retention_time) < chr_tol)
   } else {
-    selection <- seq_len(that.prof)
+    selection <- seq_len(nrow(profile_data))
   }
   return(selection)
 }
@@ -167,13 +176,14 @@ get_features_in_rt_range <- function(this, base.curve, bw) {
   this.curve <- base.curve[base.curve[, 1] >= this.span[1] & base.curve[, 1] <= this.span[2], ]
   this.curve[this.curve[, 1] %in% this[, 1], 2] <- this[, 2]
 
-  this.smooth <- ksmooth(this.curve[, 1], this.curve[, 2], kernel = "normal", bandwidth = bw)
-  smooth.y <- this.smooth$y
-  turns <- find.turn.point(smooth.y)
-  pks <- this.smooth$x[turns$pks]
-  vlys <- this.smooth$x[turns$vlys]
-  vlys <- c(-Inf, vlys, Inf)
-  return(list(pks = pks, vlys = vlys))
+  this.smooth <- ksmooth(
+    this.curve[, 1],
+    this.curve[, 2],
+    kernel = "normal",
+    bandwidth = bw
+  )
+
+  return(compute_peaks_and_valleys(this.smooth))
 }
 
 compute_pks_vlys_rt <- function(that.prof, base.curve, bandwidth, min.bw, max.bw, target_rt, recover.min.count) {
@@ -194,8 +204,8 @@ compute_pks_vlys_rt <- function(that.prof, base.curve, bandwidth, min.bw, max.bw
   pks.n <- pks
   for (m in 1:length(pks))
   {
-    this.vlys <- c(max(vlys[which(vlys < pks[m])]), min(vlys[which(vlys > pks[m])])) # same as upper part in function
-    pks.n[m] <- sum(this[, 1] >= this.vlys[1] & this[, 1] <= this.vlys[2])
+    boundaries <- compute_mass_boundaries(vlys, pks[m])
+    pks.n[m] <- sum(this[, 1] >= boundaries$lower & this[, 1] <= boundaries$upper)
   }
 
   if (!is.na(target_rt)) {
@@ -212,12 +222,8 @@ compute_curr_rec_with_enough_peaks <- function(that.mass, pks, all, aver.diff, b
   curr.rec <- c(that.mass, NA, NA)
 
   # same filtering of peaks as in compute_pks_vlyws and as above
-  vlys <- c(
-    max(all$vlys[which(all$vlys < pks)]),
-    min(all$vlys[which(all$vlys > pks)])
-  )
-
-  this <- all$this[which(all$this[, 1] >= vlys[1] &
all$this[, 1] <= vlys[2]), ] + boundaries <- compute_mass_boundaries(all$vlys, pks) + this <- all$this[which(all$this[, 1] >= boundaries$lower & all$this[, 1] <= boundaries$upper), ] if (is.null(nrow(this))) { curr.rec[3] <- this[2] * aver.diff @@ -247,12 +253,27 @@ compute_curr_rec_with_enough_peaks <- function(that.mass, pks, all, aver.diff, b return(curr.rec) } +compute_mass_boundaries <- function(mass.vlys, peak) { + lower <- max(mass.vlys[mass.vlys < peak]) + upper <- min(mass.vlys[mass.vlys > peak]) + return(list(lower = lower, upper = upper)) +} + +compute_peaks_and_valleys <- function(dens) { + turns <- find.turn.point(dens$y) + pks <- dens$x[turns$pks] # mz values with highest density + vlys <- dens$x[turns$vlys] + vlys <- c(-Inf, vlys, Inf) # masses with lowest densities values -> valley + return(list(pks = pks, vlys = vlys)) +} + compute_rectangle <- function(data_table, - aligned.ftrs, + aligned_feature_mass, breaks, i, custom.mz.tol, - orig.tol, intensity.weighted, + orig.tol, + intensity.weighted, recover.min.count, target.time, custom.chr.tol, @@ -262,29 +283,31 @@ compute_rectangle <- function(data_table, bandwidth, min.bw, max.bw) { - features <- get_raw_features_in_mzrange(data_table, aligned.ftrs, breaks, i, custom.mz.tol) + features <- get_raw_features_in_mzrange( + data_table, + aligned_feature_mass, + breaks, + custom.mz.tol[i] + ) mass.den <- compute_mass_density( mz = features$mz, intensities = features$intensities, - bandwidth = 0.5 * orig.tol * aligned.ftrs[i, "mz"], + bandwidth = 0.5 * orig.tol * aligned_feature_mass, intensity_weighted = intensity.weighted ) # find peaks in mz range in raw data - mass.turns <- find.turn.point(mass.den$y) - mass.pks <- mass.den$x[mass.turns$pks] # mz values with highest density - mass.vlys <- c(-Inf, mass.den$x[mass.turns$vlys], Inf) # masses with lowest densities values -> valley - mass.pks <- mass.pks[which(abs(mass.pks - aligned.ftrs[i, "mz"]) < custom.mz.tol[i] / 1.5)] + mass_range <- compute_peaks_and_valleys(mass.den) + mass_range$pks <- mass_range$pks[which(abs(mass_range$pks - aligned_feature_mass) < custom.mz.tol[i] / 1.5)] this.rec <- matrix(c(Inf, Inf, Inf), nrow = 1) - for (peak in mass.pks) { + for (peak in mass_range$pks) { # get mass values of valleys the closest to the peak - mass.lower <- max(mass.vlys[mass.vlys < peak]) - mass.upper <- min(mass.vlys[mass.vlys > peak]) + mass <- compute_mass_boundaries(mass_range$vlys, peak) that <- features |> - dplyr::filter(mz > mass.lower & mz <= mass.upper) |> + dplyr::filter(mz > mass$lower & mz <= mass$upper) |> dplyr::arrange_at("labels") # get values in RT region of interest? 
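The two helpers added above are meant to compose: compute_peaks_and_valleys() turns a density estimate into peak and valley positions, and compute_mass_boundaries() picks the pair of valleys enclosing a given peak. A minimal standalone sketch of that interplay, assuming the package (and therefore its internal find.turn.point()) is loaded; the mz values are simulated and not part of the patch:

set.seed(1)
mz <- c(rnorm(60, mean = 100.05, sd = 0.002), rnorm(40, mean = 100.12, sd = 0.002))
dens <- density(mz, bw = 0.002) # stats::density estimate over the mz values

ranges <- compute_peaks_and_valleys(dens) # list(pks = ..., vlys = ...)
peak <- ranges$pks[1] # pick one density peak

# the valleys nearest to the peak delimit its mz window; the -Inf/Inf guards
# appended by compute_peaks_and_valleys() ensure both bounds always exist
bounds <- compute_mass_boundaries(ranges$vlys, peak)
window <- mz[mz > bounds$lower & mz <= bounds$upper]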
@@ -294,18 +317,43 @@ compute_rectangle <- function(data_table, curr.rec <- c(that.mass, NA, NA) if (nrow(that.prof) < 10) { - thee.sel <- get_rt_region_indices(target.time[i], that.prof, custom.chr.tol[i]) + thee.sel <- get_rt_region_indices( + target.time[i], + that.prof, + custom.chr.tol[i] * 2 + ) if (length(thee.sel) > recover.min.count) { - curr.rec[3] <- compute_EIC_area(thee.sel, that.prof, base.curve, all.times, aver.diff) + curr.rec[3] <- compute_EIC_area( + thee.sel, + that.prof, + base.curve, + all.times, + aver.diff + ) curr.rec[2] <- median(that.prof[thee.sel, 2]) this.rec <- rbind(this.rec, curr.rec) } } else { - all <- compute_pks_vlys_rt(that.prof, base.curve, bandwidth, min.bw, max.bw, target.time[i], recover.min.count) + all <- compute_pks_vlys_rt( + that.prof, + base.curve, + bandwidth, + min.bw, + max.bw, + target.time[i], + recover.min.count + ) for (pks in all$pks) { - curr.rec <- compute_curr_rec_with_enough_peaks(that.mass, pks, all, aver.diff, base.curve, all.times) + curr.rec <- compute_curr_rec_with_enough_peaks( + that.mass, + pks, + all, + aver.diff, + base.curve, + all.times + ) this.rec <- rbind(this.rec, curr.rec) } } @@ -314,6 +362,19 @@ compute_rectangle <- function(data_table, return(this.rec) } +refine_selection <- function(this.sel, target_rt, rectangle, aligned_rt, chr_tol, mz_tol) { + if (length(this.sel) > 1) { + if (!is.na(target_rt)) { + this.d <- (rectangle[, 2] - target_rt)^2 / chr_tol^2 + (rectangle[, 1] - aligned_rt)^2 / mz_tol^2 + this.sel <- which(this.d == min(this.d))[1] + } else { + this.d <- abs(rectangle[, 1] - aligned_rt) + this.sel <- which(this.d == min(this.d))[1] + } + } + return(this.sel) +} + #' Recover weak signals in some profiles that is not identified as a peak, but corresponds to identified peaks in other spectra. 
#' #' @description @@ -410,7 +471,7 @@ recover.weaker <- function(filename, if (this.ftrs[i] == 0 && aligned.ftrs[i, "mz"] < max(masses)) { this.rec <- compute_rectangle( data_table, - aligned.ftrs, + aligned.ftrs[i, "mz"], breaks, i, custom.mz.tol, @@ -426,25 +487,23 @@ recover.weaker <- function(filename, max.bw ) - if (!is.na(target.time[i])) { - this.sel <- which(abs(this.rec[, 2] - target.time[i]) < custom.chr.tol[i]) - } else { - this.sel <- seq_len(this.rec) - this.sel <- this.sel[this.sel != 1] - } - + this.sel <- get_rt_region_indices( + target.time[i], + this.rec, + custom.chr.tol[i] + ) + this.sel <- this.sel[this.sel != 1] if (length(this.sel) > 0) { - if (length(this.sel) > 1) { - if (!is.na(target.time[i])) { - this.d <- (this.rec[, 2] - target.time[i])^2 / custom.chr.tol[i]^2 + (this.rec[, 1] - aligned.ftrs[i, 1])^2 / custom.mz.tol[i]^2 - this.sel <- which(this.d == min(this.d))[1] - } else { - this.d <- abs(this.rec[, 1] - aligned.ftrs[i, 1]) - this.sel <- which(this.d == min(this.d))[1] - } - } - + this.sel <- refine_selection( + this.sel, + target.time[i], + this.rec, + aligned.ftrs[i, 1], + custom.chr.tol[i], + custom.mz.tol[i] + ) + this.pos.diff <- abs(this.f1[, 2] - this.rec[this.sel, 2]) this.pos.diff <- which(this.pos.diff == min(this.pos.diff))[1] this.f1 <- rbind(this.f1, c(this.rec[this.sel, 1], this.rec[this.sel, 2], NA, NA, this.rec[this.sel, 3])) From 70152c64704ed24240782b643d38308d03bf0137 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 2 Aug 2022 14:05:41 +0200 Subject: [PATCH 06/53] Started adding documentation and reworked base.curve variable --- R/recover.weaker.R | 64 ++++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/R/recover.weaker.R b/R/recover.weaker.R index 75d9721e..d44fb0e6 100644 --- a/R/recover.weaker.R +++ b/R/recover.weaker.R @@ -1,24 +1,33 @@ +#' Custom way of removing duplicate rows from a specifically formatted table. +#' +#' @description +#' Rows are considered as duplicate if the 1st, 2nd and 5th column are less than 1e-10 (tolerance) apart. +#' Only a single row in this `range` is kept from a group. +#' @param new.table The table from which the duplicate rows should be removed. Needs at least 5 columns. +#' Columns 1, 2 and 5 have to be of numeric type. +#' @param tolerance Tolerance to use for numeric comparisons. +#' @return Returns the same table with duplicate rows removed. 
#' @export -duplicate.row.remove <- function(new.table) { +duplicate.row.remove <- function(new.table, tolerance = 1e-10) { new.table <- new.table[order(new.table[, 1], new.table[, 2], new.table[, 5]), ] n <- 1 m <- 2 to.remove <- rep(0, nrow(new.table)) while (m <= nrow(new.table)) { - if (abs(new.table[m, 1] - new.table[n, 1]) < 1e-10 & - abs(new.table[m, 2] - new.table[n, 2]) < 1e-10 & - abs(new.table[m, 5] - new.table[n, 5]) < 1e-10) { + if (abs(new.table[m, 1] - new.table[n, 1]) < tolerance & + abs(new.table[m, 2] - new.table[n, 2]) < tolerance & + abs(new.table[m, 5] - new.table[n, 5]) < tolerance) { to.remove[m] <- 1 m <- m + 1 } else { n <- m m <- m + 1 } - # cat("*(", n, m, ")") } - if (sum(to.remove) > 0) new.table <- new.table[-which(to.remove == 1), ] + if (sum(to.remove) > 0) + new.table <- new.table[-which(to.remove == 1), ] new.table } @@ -26,7 +35,7 @@ duplicate.row.remove <- function(new.table) { #' @param base_curve Basis curve #' @export compute_all_times <- function(base_curve) { - all_times <- base_curve[, 1] + all_times <- base_curve if (all_times[1] > 0) all_times <- c(0, all_times) all_times <- c(all_times, 2 * all_times[length(all_times)] - all_times[length(all_times) - 1]) all_times <- (all_times[1:(length(all_times) - 1)] + all_times[2:length(all_times)]) / 2 @@ -38,7 +47,7 @@ compute_all_times <- function(base_curve) { compute_base_curve <- function(x) { base_curve <- unique(x) base_curve <- base_curve[order(base_curve)] - base_curve <- cbind(base_curve, base_curve * 0) + #base_curve <- cbind(base_curve, base_curve * 0) return(base_curve) } @@ -164,7 +173,7 @@ get_rt_region_indices <- function(retention_time, profile_data, chr_tol) { compute_EIC_area <- function(thee.sel, that.prof, base.curve, all.times, aver.diff) { if (length(thee.sel) > 1) { - that.inte <- interpol.area(that.prof[thee.sel, 2], that.prof[thee.sel, 3], base.curve[, 1], all.times) + that.inte <- interpol.area(that.prof[thee.sel, 2], that.prof[thee.sel, 3], base.curve, all.times) } else { that.inte <- that.prof[thee.sel, 3] * aver.diff } @@ -173,7 +182,8 @@ compute_EIC_area <- function(thee.sel, that.prof, base.curve, all.times, aver.di get_features_in_rt_range <- function(this, base.curve, bw) { this.span <- range(this[, 1]) - this.curve <- base.curve[base.curve[, 1] >= this.span[1] & base.curve[, 1] <= this.span[2], ] + this.curve <- base.curve[base.curve >= this.span[1] & base.curve <= this.span[2]] + this.curve <- cbind(this.curve, this.curve * 0) this.curve[this.curve[, 1] %in% this[, 1], 2] <- this[, 2] this.smooth <- ksmooth( @@ -244,7 +254,7 @@ compute_curr_rec_with_enough_peaks <- function(that.mass, pks, all, aver.diff, b sc <- exp(sum(fitted[this.sel]^2 * log(y[this.sel] / fitted[this.sel]) / sum(fitted[this.sel]^2))) } } else { - sc <- interpol.area(x, y, base.curve[, 1], all.times) + sc <- interpol.area(x, y, base.curve, all.times) miu <- median(x) } curr.rec[3] <- sc @@ -270,13 +280,12 @@ compute_peaks_and_valleys <- function(dens) { compute_rectangle <- function(data_table, aligned_feature_mass, breaks, - i, - custom.mz.tol, + custom_mz_tol, orig.tol, intensity.weighted, recover.min.count, - target.time, - custom.chr.tol, + target_rt, + custom_chr_tol, base.curve, all.times, aver.diff, @@ -287,7 +296,7 @@ compute_rectangle <- function(data_table, data_table, aligned_feature_mass, breaks, - custom.mz.tol[i] + custom_mz_tol ) mass.den <- compute_mass_density( @@ -299,7 +308,7 @@ compute_rectangle <- function(data_table, # find peaks in mz range in raw data mass_range <- 
compute_peaks_and_valleys(mass.den) - mass_range$pks <- mass_range$pks[which(abs(mass_range$pks - aligned_feature_mass) < custom.mz.tol[i] / 1.5)] + mass_range$pks <- mass_range$pks[which(abs(mass_range$pks - aligned_feature_mass) < custom_mz_tol / 1.5)] this.rec <- matrix(c(Inf, Inf, Inf), nrow = 1) for (peak in mass_range$pks) { @@ -318,9 +327,9 @@ compute_rectangle <- function(data_table, if (nrow(that.prof) < 10) { thee.sel <- get_rt_region_indices( - target.time[i], + target_rt, that.prof, - custom.chr.tol[i] * 2 + custom_chr_tol ) if (length(thee.sel) > recover.min.count) { @@ -341,7 +350,7 @@ compute_rectangle <- function(data_table, bandwidth, min.bw, max.bw, - target.time[i], + target_rt, recover.min.count ) @@ -431,7 +440,6 @@ recover.weaker <- function(filename, times <- this.raw$times data_table <- tibble::tibble(mz = this.raw$masses, labels = this.raw$labels, intensities = this.raw$intensi) |> dplyr::arrange_at("mz") rm(this.raw) - masses <- data_table$mz # Initialize parameters with default values if (is.na(mz.range)) mz.range <- 1.5 * align.mz.tol @@ -442,7 +450,7 @@ recover.weaker <- function(filename, base.curve <- compute_base_curve(sort(times)) - aver.diff <- mean(diff(base.curve[, 1])) + aver.diff <- mean(diff(base.curve)) all.times <- compute_all_times(base.curve) this.ftrs <- aligned.ftrs[, sample_name] @@ -468,17 +476,17 @@ recover.weaker <- function(filename, for (i in seq_along(this.ftrs)) { - if (this.ftrs[i] == 0 && aligned.ftrs[i, "mz"] < max(masses)) { + if (this.ftrs[i] == 0 && aligned.ftrs[i, "mz"] < max(data_table$mz)) { this.rec <- compute_rectangle( data_table, aligned.ftrs[i, "mz"], breaks, - i, - custom.mz.tol, - orig.tol, intensity.weighted, + custom.mz.tol[i], + orig.tol, + intensity.weighted, recover.min.count, - target.time, - custom.chr.tol, + target.time[i], + custom.chr.tol[i] * 2, base.curve, all.times, aver.diff, From 60119d1b5731c94761ea6e0092ed2a8cf471f6b0 Mon Sep 17 00:00:00 2001 From: hechth Date: Tue, 2 Aug 2022 15:21:34 +0200 Subject: [PATCH 07/53] Removed compute base curve function --- R/recover.weaker.R | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/R/recover.weaker.R b/R/recover.weaker.R index d44fb0e6..0d4ee3a8 100644 --- a/R/recover.weaker.R +++ b/R/recover.weaker.R @@ -43,15 +43,6 @@ compute_all_times <- function(base_curve) { return(all_times) } -#' @export -compute_base_curve <- function(x) { - base_curve <- unique(x) - base_curve <- base_curve[order(base_curve)] - #base_curve <- cbind(base_curve, base_curve * 0) - return(base_curve) -} - - #' Normalize vector so that sum(vec) = 1 l2normalize <- function(x) { x / sum(x) @@ -449,7 +440,7 @@ recover.weaker <- function(filename, if (min.bw >= max.bw) min.bw <- max.bw / 4 - base.curve <- compute_base_curve(sort(times)) + base.curve <- sort(unique(times)) aver.diff <- mean(diff(base.curve)) all.times <- compute_all_times(base.curve) From 572ebaaff04f53e7fc48db6acf2de6d47f0ac8a4 Mon Sep 17 00:00:00 2001 From: hechth Date: Tue, 2 Aug 2022 15:52:49 +0200 Subject: [PATCH 08/53] Removed parallel code section from test --- tests/testthat/test-recover-weaker.R | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/tests/testthat/test-recover-weaker.R b/tests/testthat/test-recover-weaker.R index c5cec338..8400f108 100644 --- a/tests/testthat/test-recover-weaker.R +++ b/tests/testthat/test-recover-weaker.R @@ -28,27 +28,6 @@ patrick::with_parameters_test_that( file.path(testdata, "aligned", "tolerances.parquet") ) - chk <- 
Sys.getenv("_R_CHECK_LIMIT_CORES_", "") - - if (nzchar(chk) && chk == "TRUE") { - # use 2 cores in CRAN/Travis/AppVeyor - cluster <- 2L - } else { - # use all cores in devtools::test() - cluster <- parallel::detectCores() - } - - if (!is(cluster, "cluster")) { - cluster <- parallel::makeCluster(cluster) - on.exit(parallel::stopCluster(cluster)) - } - - clusterExport(cluster, c( - "recover.weaker", "load.lcms", "find.turn.point", - "combine.seq.3", "interpol.area", "duplicate.row.remove", "compute_all_times", "load_file" - )) - clusterEvalQ(cluster, library("splines")) - recovered <- lapply(seq_along(ms_files), function(i) { recover.weaker( filename = ms_files[[i]], From 145e31f495d579e4f46fd612df588ea1f33203b5 Mon Sep 17 00:00:00 2001 From: hechth Date: Tue, 2 Aug 2022 15:54:50 +0200 Subject: [PATCH 09/53] Renamed all.times to delta_rt which is the actual content of the variable and renamed matching function --- NAMESPACE | 6 ++--- R/prof.to.features.R | 2 +- R/recover.weaker.R | 55 +++++++++++++++++++++++--------------------- 3 files changed, 32 insertions(+), 31 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index bd71c1fd..58d30bdf 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,4 @@ # Generated by roxygen2: do not edit by hand - import(doParallel) import(e1071) import(foreach) @@ -14,7 +13,6 @@ import(ROCR) import(ROCS) import(snow) import(splines) - S3method(solve,sigma) export(adaptive.bin) export(adjust.time) @@ -23,11 +21,10 @@ export(bigauss.esti) export(bigauss.esti.EM) export(bigauss.mix) export(combine.seq.3) -export(compute_all_times) -export(compute_base_curve) export(compute_bounds) export(compute_breaks) export(compute_breaks_2) +export(compute_delta_rt) export(compute_densities) export(compute_mass_density) export(compute_mass_values) @@ -41,6 +38,7 @@ export(find.turn.point) export(get_custom_chr_tol) export(get_mzrange_bound_indices) export(get_raw_features_in_mzrange) +export(get_rt_region_indices) export(get_times_to_use) export(hybrid) export(interpol.area) diff --git a/R/prof.to.features.R b/R/prof.to.features.R index f33cf807..0ff4c5c2 100644 --- a/R/prof.to.features.R +++ b/R/prof.to.features.R @@ -642,7 +642,7 @@ prof.to.features <- function(a, if (min.bw >= max.bw) min.bw <- max.bw / 4 base.curve <- compute_base_curve(a[, 2]) - all.times <- compute_all_times(base.curve) + all.times <- compute_delta_rt(base.curve) this.features <- matrix(0, nrow = 1, ncol = 5) colnames(this.features) <- c("mz", "pos", "sd1", "sd2", "area") diff --git a/R/recover.weaker.R b/R/recover.weaker.R index 0d4ee3a8..ebd3fe70 100644 --- a/R/recover.weaker.R +++ b/R/recover.weaker.R @@ -32,14 +32,17 @@ duplicate.row.remove <- function(new.table, tolerance = 1e-10) { } #' Compute all time values from base curve. -#' @param base_curve Basis curve +#' @param times Retention time values. 
#' @export -compute_all_times <- function(base_curve) { - all_times <- base_curve - if (all_times[1] > 0) all_times <- c(0, all_times) - all_times <- c(all_times, 2 * all_times[length(all_times)] - all_times[length(all_times) - 1]) +compute_delta_rt <- function(times) { + # add element which is 2x the last element - the second to last - basically the extrapolated next element + all_times <- c(0, times, 2 * times[length(times)] - times[length(times) - 1]) + + # basically take the mean between consecutive values as the values - somewhat smoothed all_times <- (all_times[1:(length(all_times) - 1)] + all_times[2:length(all_times)]) / 2 - all_times <- all_times[2:length(all_times)] - all_times[1:(length(all_times) - 1)] + + # get the differences between the values + all_times <- diff(all_times) return(all_times) } @@ -162,18 +165,18 @@ get_rt_region_indices <- function(retention_time, profile_data, chr_tol) { return(selection) } -compute_EIC_area <- function(thee.sel, that.prof, base.curve, all.times, aver.diff) { +compute_EIC_area <- function(thee.sel, that.prof, times, delta_rt, aver.diff) { if (length(thee.sel) > 1) { - that.inte <- interpol.area(that.prof[thee.sel, 2], that.prof[thee.sel, 3], base.curve, all.times) + that.inte <- interpol.area(that.prof[thee.sel, 2], that.prof[thee.sel, 3], times, delta_rt) } else { that.inte <- that.prof[thee.sel, 3] * aver.diff } return(that.inte) } -get_features_in_rt_range <- function(this, base.curve, bw) { +get_features_in_rt_range <- function(this, times, bw) { this.span <- range(this[, 1]) - this.curve <- base.curve[base.curve >= this.span[1] & base.curve <= this.span[2]] + this.curve <- times[times >= this.span[1] & times <= this.span[2]] this.curve <- cbind(this.curve, this.curve * 0) this.curve[this.curve[, 1] %in% this[, 1], 2] <- this[, 2] @@ -187,7 +190,7 @@ get_features_in_rt_range <- function(this, base.curve, bw) { return(compute_peaks_and_valleys(this.smooth)) } -compute_pks_vlys_rt <- function(that.prof, base.curve, bandwidth, min.bw, max.bw, target_rt, recover.min.count) { +compute_pks_vlys_rt <- function(that.prof, times, bandwidth, min.bw, max.bw, target_rt, recover.min.count) { # extract rt labels and intensities this <- that.prof[, 2:3] this <- this[order(this[, 1]), ] @@ -196,7 +199,7 @@ compute_pks_vlys_rt <- function(that.prof, base.curve, bandwidth, min.bw, max.bw roi <- get_features_in_rt_range( this, - base.curve, + times, bw ) pks <- roi$pks @@ -219,7 +222,7 @@ compute_pks_vlys_rt <- function(that.prof, base.curve, bandwidth, min.bw, max.bw return(list(pks = pks, vlys = vlys, this = this)) } -compute_curr_rec_with_enough_peaks <- function(that.mass, pks, all, aver.diff, base.curve, all.times) { +compute_curr_rec_with_enough_peaks <- function(that.mass, pks, all, aver.diff, times, delta_rt) { curr.rec <- c(that.mass, NA, NA) # same filtering of peaks as in compute_pks_vlyws and as above @@ -245,7 +248,7 @@ compute_curr_rec_with_enough_peaks <- function(that.mass, pks, all, aver.diff, b sc <- exp(sum(fitted[this.sel]^2 * log(y[this.sel] / fitted[this.sel]) / sum(fitted[this.sel]^2))) } } else { - sc <- interpol.area(x, y, base.curve, all.times) + sc <- interpol.area(x, y, times, delta_rt) miu <- median(x) } curr.rec[3] <- sc @@ -277,8 +280,8 @@ compute_rectangle <- function(data_table, recover.min.count, target_rt, custom_chr_tol, - base.curve, - all.times, + times, + delta_rt, aver.diff, bandwidth, min.bw, @@ -327,8 +330,8 @@ compute_rectangle <- function(data_table, curr.rec[3] <- compute_EIC_area( thee.sel, that.prof, - 
base.curve, - all.times, + times, + delta_rt, aver.diff ) curr.rec[2] <- median(that.prof[thee.sel, 2]) @@ -337,7 +340,7 @@ compute_rectangle <- function(data_table, } else { all <- compute_pks_vlys_rt( that.prof, - base.curve, + times, bandwidth, min.bw, max.bw, @@ -351,8 +354,8 @@ compute_rectangle <- function(data_table, pks, all, aver.diff, - base.curve, - all.times + times, + delta_rt ) this.rec <- rbind(this.rec, curr.rec) } @@ -440,9 +443,9 @@ recover.weaker <- function(filename, if (min.bw >= max.bw) min.bw <- max.bw / 4 - base.curve <- sort(unique(times)) - aver.diff <- mean(diff(base.curve)) - all.times <- compute_all_times(base.curve) + times <- sort(unique(times)) + aver.diff <- mean(diff(times)) + vec_delta_rt <- compute_delta_rt(times) this.ftrs <- aligned.ftrs[, sample_name] this.times <- pk.times[, sample_name] @@ -478,8 +481,8 @@ recover.weaker <- function(filename, recover.min.count, target.time[i], custom.chr.tol[i] * 2, - base.curve, - all.times, + times, + vec_delta_rt, aver.diff, bandwidth, min.bw, From 4b4863c3181d6bd7d8e0d2aee2234a1c765476ee Mon Sep 17 00:00:00 2001 From: hechth Date: Wed, 3 Aug 2022 10:31:00 +0200 Subject: [PATCH 10/53] Added documentation for compute target times and renamed variable --- R/recover.weaker.R | 164 ++++++++++++++++++++------- R/unsupervised.R | 4 +- tests/testthat/test-recover-weaker.R | 4 +- 3 files changed, 125 insertions(+), 47 deletions(-) diff --git a/R/recover.weaker.R b/R/recover.weaker.R index ebd3fe70..c35e29d3 100644 --- a/R/recover.weaker.R +++ b/R/recover.weaker.R @@ -31,8 +31,13 @@ duplicate.row.remove <- function(new.table, tolerance = 1e-10) { new.table } -#' Compute all time values from base curve. +#' Compute custom smoothed h-method derivative of function. +#' @description +#' The function adds an extrapolated element in the end and a 0 element in the front, +#' then computes the midpoints between#' neighbouring elements and then uses the `diff` +#' function to compute the changes in rt between points. #' @param times Retention time values. +#' @return Differences between time values. #' @export compute_delta_rt <- function(times) { # add element which is 2x the last element - the second to last - basically the extrapolated next element @@ -47,10 +52,24 @@ compute_delta_rt <- function(times) { } #' Normalize vector so that sum(vec) = 1 +#' @description x / sum(x) +#' @param x Data to normalize. +#' @return Normalized data. l2normalize <- function(x) { x / sum(x) } + +#' Compute the density function of mz values. +#' @description +#' The function takes the mz values and uses \link[stats]{density} to +#' compute the local density, optionally using intensity based weighting. +#' @param mz Mass values to compute the density over. +#' @param intensities Intensities of the peaks at mz values. +#' Only used if intensity_weighted == TRUE. +#' @param bandwidth Bandwidth to use to compute the kernel density. +#' @param intensity_weighted Whether to use intensity weighting or not. +#' @return \link[stats]{density} object containing the densities. #' @export compute_mass_density <- function(mz, intensities, @@ -68,45 +87,92 @@ compute_mass_density <- function(mz, return(mass_density) } +#' Compute custom chromatographic tolerance. +#' @description +#' Compute chromatographic tolerance for each feature. If `use_observed_range == TRUE`, +#' the whole range of retention times for all peaks is used to compute the tolerance, +#' otherwise `chr_range` is used for each feature. 
+#' @param use_observed_range bool Whether to use the observed chromatographic range for computation or not.
+#' @param peak_rts data.frame Retention time cross table with all peak rts.
+#' @param chr_range float Default chromatographic tolerance to use.
+#' @param aligned_features data.frame Aligned feature table.
+#' @return vector Custom chromatographic tolerances to use for each feature.
 #' @export
-get_custom_chr_tol <- function(use.observed.range,
-                               pk.times,
-                               chr.range,
-                               aligned.ftrs) {
-  custom.chr.tol <- rep(chr.range, nrow(aligned.ftrs))
+get_custom_chr_tol <- function(use_observed_range,
+                               peak_rts,
+                               chr_range,
+                               aligned_features) {
+  custom_chr_tol <- rep(chr_range, nrow(aligned_features))
 
-  if (use.observed.range) {
+  if (use_observed_range) {
     # check observed rt range across ALL SAMPLES
-    all_peak_rts <- pk.times[, 5:ncol(pk.times)]
+    all_peak_rts <- peak_rts[, 5:ncol(peak_rts)]
     observed.chr.range <- (apply(all_peak_rts, 1, max) - apply(all_peak_rts, 1, min)) / 2
     sufficient_rts <- apply(!is.na(all_peak_rts), 1, sum) >= 5
 
-    selection <- which(sufficient_rts & custom.chr.tol > observed.chr.range)
-    custom.chr.tol[selection] <- observed.chr.range[selection]
+    selection <- which(sufficient_rts & custom_chr_tol > observed.chr.range)
+    custom_chr_tol[selection] <- observed.chr.range[selection]
   }
 
-  return(custom.chr.tol)
+  return(custom_chr_tol)
 }
 
+#' Compute target times for regions of interest for recovery.
+#' @description
+#' Compute the individual target times for the features to be recovered in the sample.
+#' Spline interpolation using \link[splines]{interpSpline} is used to map from adjusted times
+#' back into the original times. The function requires `x` to be distinct, hence the filtering
+#' to only include rt values that occur only once in both lists.
+#' @param aligned_rts vector Aligned retention time values.
+#' @param original_sample_rts vector Original feature retention time values before correction.
+#' @param adjusted_sample_rts vector Feature retention time values after time correction.
+#' @param min_common_times int Minimum number of distinct common time values required to fit the spline.
 #' @export
-compute_target_time <- function(aligned_rts, orig.time, adjusted.time) {
-  to.use <- get_times_to_use(orig.time, adjusted.time)
-  orig.time <- orig.time[to.use]
-  adjusted.time <- adjusted.time[to.use]
-
-  sel.non.na <- which(!is.na(aligned_rts))
-  if (length(adjusted.time) >= 4) {
-    sp <- interpSpline(orig.time ~ adjusted.time, na.action = na.omit)
-    aligned_rts[sel.non.na] <- predict(sp, aligned_rts[sel.non.na])$y
+compute_target_times <- function(aligned_rts,
+                                 original_sample_rts,
+                                 adjusted_sample_rts,
+                                 min_common_times = 4) {
+  to.use <- get_times_to_use(original_sample_rts, adjusted_sample_rts)
+  adjusted_subset <- adjusted_sample_rts[to.use]
+
+  sel_non_na <- which(!is.na(aligned_rts))
+  if (length(adjusted_subset) >= min_common_times) {
+    original_subset <- original_sample_rts[to.use]
+    sp <- splines::interpSpline(
+      original_subset ~ adjusted_subset,
+      na.action = na.omit
+    )
+    aligned_rts[sel_non_na] <- predict(sp, aligned_rts[sel_non_na])$y
   }
+  return(aligned_rts)
 }
 
+#' Get boolean mask for values that occur only once.
+#' @description
+#' Uses the \link[base]{table} function to compute the occurrences and then
+#' checks which values only occur a single time.
+#' @param values vector Values for which to compute the mask.
+#' @return vector Boolean vector which is the mask of values occurring only once.
+get_single_occurrence_mask <- function(values) { + ttt <- table(values) + mask <- values %in% as.numeric(names(ttt)[ttt == 1]) + return(mask) +} + +#' Get retention time values to use +#' @description +#' Obtain retention time values which occur only once in both the original and the adjusted times. +#' This is a custom version of the unique or intersection function with rounding etc. +#' @param original_sample_rts vector Original feature retention time values before correction. +#' @param adjusted_sample_rts vector Feature retention time values after time correction. +#' @param cap int Maximum number of time values to return. +#' @return Indices of retention time values to use. #' @export -get_times_to_use <- function(orig.time, adjusted.time) { - ttt.0 <- table(orig.time) - ttt <- table(adjusted.time) - to.use <- which(adjusted.time %in% as.numeric(names(ttt)[ttt == 1]) & orig.time %in% as.numeric(names(ttt.0)[ttt.0 == 1])) - if (length(to.use) > 2000) { - to.use <- sample(to.use, 2000, replace = FALSE) +get_times_to_use <- function(original_sample_rts, adjusted_sample_rts, cap = 2000) { + to.use <- which( + get_single_occurrence_mask(adjusted_sample_rts) & + get_single_occurrence_mask(original_sample_rts) + ) + + if (length(to.use) > cap) { + to.use <- sample(to.use, cap, replace = FALSE) } return(to.use) } @@ -390,8 +456,8 @@ refine_selection <- function(this.sel, target_rt, rectangle, aligned_rt, chr_tol #' @param pk.times matrix, with columns of m/z, median elution time, and elution times in each spectrum. #' @param align.mz.tol the m/z tolerance used in the alignment. #' @param align.chr.tol the elution time tolerance in the alignment. -#' @param this.f1 The matrix which is the output from proc.to.feature(). -#' @param this.f2 The matrix which is the output from proc.to.feature(). The retention time in this object have been adjusted by the function adjust.time(). +#' @param extracted_features The matrix which is the output from proc.to.feature(). +#' @param adjusted_features The matrix which is the output from proc.to.feature(). The retention time in this object have been adjusted by the function adjust.time(). #' @param mz.range The m/z around the feature m/z to search for observations. The default value is NA, in which case 1.5 times the m/z tolerance in the aligned object will be used. #' @param chr.range The retention time around the feature retention time to search for observations. The default value is NA, in which case 0.5 times the retention time tolerance in the aligned object will be used. #' @param use.observed.range If the value is TRUE, the actual range of the observed locations of the feature in all the spectra will be used. 
@@ -417,8 +483,8 @@ recover.weaker <- function(filename, pk.times, align.mz.tol, align.chr.tol, - this.f1, - this.f2, + extracted_features, + adjusted_features, mz.range = NA, chr.range = NA, use.observed.range = TRUE, @@ -432,7 +498,10 @@ recover.weaker <- function(filename, # load raw data this.raw <- load_file(filename) times <- this.raw$times - data_table <- tibble::tibble(mz = this.raw$masses, labels = this.raw$labels, intensities = this.raw$intensi) |> dplyr::arrange_at("mz") + data_table <- tibble::tibble( + mz = this.raw$masses, + labels = this.raw$labels, + intensities = this.raw$intensi) |> dplyr::arrange_at("mz") rm(this.raw) # Initialize parameters with default values @@ -458,12 +527,21 @@ recover.weaker <- function(filename, aligned.ftrs ) - target.time <- compute_target_time( + # rounding is used to create a histogram of retention time values + target_times <- compute_target_times( aligned.ftrs[, "rt"], - round(this.f1[, "pos"], 5), - round(this.f2[, "pos"], 5) + round(extracted_features[, "pos"], 5), + round(adjusted_features[, "pos"], 5) ) + # IMPORTANT: THIS CODE SECTION COULD BE USED TO REPLACE COMPUTE_TARGET_TIMES FOR THE TEST CASES AND + # IS A MASSIVE SIMPLIFICATION. + # sp <- splines::interpSpline( + # unique(extracted_features[, "pos"]) ~ unique(adjusted_features[, "pos"]), + # na.action = na.omit + # ) + # target_times <- predict(sp, aligned.ftrs[, "rt"])$y + breaks <- compute_breaks_2(data_table, orig.tol) this.mz <- rep(NA, length(this.ftrs)) @@ -479,7 +557,7 @@ recover.weaker <- function(filename, orig.tol, intensity.weighted, recover.min.count, - target.time[i], + target_times[i], custom.chr.tol[i] * 2, times, vec_delta_rt, @@ -490,7 +568,7 @@ recover.weaker <- function(filename, ) this.sel <- get_rt_region_indices( - target.time[i], + target_times[i], this.rec, custom.chr.tol[i] ) @@ -499,19 +577,19 @@ recover.weaker <- function(filename, if (length(this.sel) > 0) { this.sel <- refine_selection( this.sel, - target.time[i], + target_times[i], this.rec, aligned.ftrs[i, 1], custom.chr.tol[i], custom.mz.tol[i] ) - this.pos.diff <- abs(this.f1[, 2] - this.rec[this.sel, 2]) + this.pos.diff <- abs(extracted_features[, 2] - this.rec[this.sel, 2]) this.pos.diff <- which(this.pos.diff == min(this.pos.diff))[1] - this.f1 <- rbind(this.f1, c(this.rec[this.sel, 1], this.rec[this.sel, 2], NA, NA, this.rec[this.sel, 3])) - this.time.adjust <- (-this.f1[this.pos.diff, 2] + this.f2[this.pos.diff, 2]) + this.f1 <- rbind(extracted_features, c(this.rec[this.sel, 1], this.rec[this.sel, 2], NA, NA, this.rec[this.sel, 3])) + this.time.adjust <- (-this.f1[this.pos.diff, 2] + adjusted_features[this.pos.diff, 2]) this.f2 <- rbind( - this.f2, + adjusted_features, c( this.rec[this.sel, 1], this.rec[this.sel, 2] + this.time.adjust, @@ -532,8 +610,8 @@ recover.weaker <- function(filename, to.return$this.mz <- this.mz to.return$this.ftrs <- this.ftrs to.return$this.times <- this.times - to.return$this.f1 <- duplicate.row.remove(this.f1) - to.return$this.f2 <- duplicate.row.remove(this.f2) + to.return$this.f1 <- duplicate.row.remove(extracted_features) + to.return$this.f2 <- duplicate.row.remove(adjusted_features) return(to.return) } diff --git a/R/unsupervised.R b/R/unsupervised.R index ead8b6c9..154e3005 100644 --- a/R/unsupervised.R +++ b/R/unsupervised.R @@ -85,8 +85,8 @@ recover_weaker_signals <- function( recover.weaker( sample_name = get_sample_name(filenames[i]), filename = filenames[[i]], - this.f1 = extracted_features[[i]], - this.f2 = corrected_features[[i]], + extracted_features = 
extracted_features[[i]], + adjusted_features = corrected_features[[i]], pk.times = aligned_rt_crosstab, aligned.ftrs = aligned_int_crosstab, orig.tol = original_mz_tolerance, diff --git a/tests/testthat/test-recover-weaker.R b/tests/testthat/test-recover-weaker.R index 8400f108..8c557fa6 100644 --- a/tests/testthat/test-recover-weaker.R +++ b/tests/testthat/test-recover-weaker.R @@ -32,8 +32,8 @@ patrick::with_parameters_test_that( recover.weaker( filename = ms_files[[i]], sample_name = get_sample_name(files[i]), - this.f1 = extracted[[i]], - this.f2 = adjusted[[i]], + extracted_features = extracted[[i]], + adjusted_features = adjusted[[i]], pk.times = aligned$rt_crosstab, aligned.ftrs = aligned$int_crosstab, orig.tol = mz_tol, From c67e22d0f46a4f06318d626d8ca23c2894585768 Mon Sep 17 00:00:00 2001 From: hechth Date: Wed, 3 Aug 2022 10:31:08 +0200 Subject: [PATCH 11/53] adjusted namespace --- NAMESPACE | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 58d30bdf..c63100ff 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,4 +1,5 @@ # Generated by roxygen2: do not edit by hand + import(doParallel) import(e1071) import(foreach) @@ -28,7 +29,7 @@ export(compute_delta_rt) export(compute_densities) export(compute_mass_density) export(compute_mass_values) -export(compute_target_time) +export(compute_target_times) export(cont.index) export(duplicate.row.remove) export(extract_pattern_colnames) From ee6d1d2eb3e32469249a0309b22dfcc2bdea27b7 Mon Sep 17 00:00:00 2001 From: hechth Date: Wed, 3 Aug 2022 12:09:52 +0200 Subject: [PATCH 12/53] refactored find.tol.time function --- R/find.tol.time.R | 225 ++++++++++++++++++++++++++-------------------- 1 file changed, 129 insertions(+), 96 deletions(-) diff --git a/R/find.tol.time.R b/R/find.tol.time.R index 614c9f9f..25822cb5 100644 --- a/R/find.tol.time.R +++ b/R/find.tol.time.R @@ -1,24 +1,110 @@ -#' An internal function that find elution time tolerance level. -#' +#' Compute minimum mz tolerance to use. +#' @description +#' Compute the minimum mz tolerance based on the relative +#' tolerance and the mz values and the absolute tolerance. +#' Uses midpoints between mz values for the weighting. +#' @param mz vector Mz values to use. +#' @param mz_tol_relative float Relative mz tolerance to use with the mz values. +#' This forms a sort of weighted tolerance. +#' @param mz_tol_absolute float Absolute tolerance to use independent from the mz values. +#' @return float Minimum tolerance values to use. 
+compute_min_mz_tolerance <- function(mz, mz_tol_relative, mz_tol_absolute) { + l <- length(mz) + mz_midpoints <- ((mz[2:l] + mz[1:(l - 1)]) / 2) + mz_ftr_relative_tolerances <- mz_tol_relative * mz_midpoints + min_mz_tol <- min(mz_tol_absolute, mz_ftr_relative_tolerances) + return(min_mz_tol) +} + +compute_breaks_3 <- function(mz, min_mz_tol) { + l <- length(mz) + mass_differences <- diff(mz) + indices <- which(mass_differences > min_mz_tol) + breaks <- c(0, indices, l) + return(breaks) +} + +compute_rt_tol_relative <- function(breaks, + max.num.segments, + aver.bin.size, + number_of_samples, + chr, + min.bins, + max.bins, + do.plot = FALSE) { + da <- 0 + if (length(breaks) > max.num.segments) { + s <- floor(seq(2, length(breaks), length.out = max.num.segments)) + } else { + s <- 2:length(breaks) + } + + for (i in s) { + this.sel <- (breaks[i - 1] + 1):breaks[i] + + if (length(this.sel) <= 3 * number_of_samples) { + this.d <- as.vector(dist(chr[this.sel])) + if (length(this.d) > 100) { + this.d <- sample(this.d, 100) + } + da <- c(da, this.d) + } + } + + da <- da[!is.na(da)] + uppermost <- max(da) + n <- min(max.bins, max(min.bins, round(length(da) / aver.bin.size))) + des <- density(da, + kernel = "gaussian", n = n, + bw = uppermost / n * 2, from = 0 + ) + y <- des$y[des$x > 0] + x <- des$x[des$x > 0] + + this.l <- lm(y[x > uppermost / 4] ~ x[x > uppermost / 4]) + + exp.y <- this.l$coef[1] + this.l$coef[2] * x + + y2 <- y[1:(length(y) - 1)] + y3 <- y[2:(length(y))] + y2[which(y2 < y3)] <- y3[which(y2 < y3)] + y[1:(length(y) - 1)] <- y2 + + yy <- cumsum(y > 1.5 * exp.y) + yi <- seq_along(yy) + sel <- min(which(yy < yi)) - 1 + rt_tol_relative <- x[sel] + + if (do.plot) { + tolerance_plot(x, y, exp.y, + sel, + main = "find retention time tolerance" + ) + } + return(rt_tol_relative) +} + +#' An internal function that find elution time tolerance level. +#' #' This function finds the time tolerance level. Also, it returns the grouping information given the time tolerance. -#' +#' #' @param mz mz values of all peaks in all profiles in the study. #' @param chr retention time of all peaks in all profiles in the study. #' @param lab label of all peaks in all profiles in the study. #' @param number_of_samples The number of spectra in this analysis. -#' @param mz_tol_relative m/z tolerance level for the grouping of signals into peaks. This value is expressed as the percentage of the m/z value. +#' @param mz_tol_relative m/z tolerance level for the grouping of signals into peaks. This value is expressed as the percentage of the m/z value. #' This value, multiplied by the m/z value, becomes the cutoff level. -#' @param rt_tol_relative the elution time tolerance. If NA, the function finds the tolerance level first. If a numerical value is given, +#' @param rt_tol_relative the elution time tolerance. If NA, the function finds the tolerance level first. If a numerical value is given, #' the function directly goes to the second step - grouping peaks based on the tolerance. #' @param aver.bin.size The average bin size to determine the number of equally spaced points in the kernel density estimation. #' @param min.bins the minimum number of bins to use in the kernel density estimation. It overrides aver.bin.size when too few observations are present. #' @param max.bins the maximum number of bins to use in the kernel density estimation. It overrides aver.bin.size when too many observations are present. 
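# Aside (editor's, hypothetical values): compute_breaks_3, introduced above,
# marks cluster boundaries wherever the gap between sorted mz values
# exceeds the tolerance.
mz <- c(100.0000, 100.0005, 100.1000, 100.1002)
compute_breaks_3(mz, min_mz_tol = 0.01)
# diff(mz) = c(0.0005, 0.0995, 0.0002); only the middle gap exceeds 0.01,
# so the result is c(0, 2, 4): features 1-2 and 3-4 form two mz groups.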
-#' @param mz_tol_absolute As the m/z tolerance in alignment is expressed in relative terms (ppm), it may not be suitable when the m/z range is wide. +#' @param mz_tol_absolute As the m/z tolerance in alignment is expressed in relative terms (ppm), it may not be suitable when the m/z range is wide. #' This parameter limits the tolerance in absolute terms. It mostly influences feature matching in higher m/z range. #' @param max.num.segments the maximum number of segments. #' @param do.plot Indicates whether plot should be drawn. -#' @return A matrix with six columns. Every row corresponds to a peak in one of the spectrum. The columns are: m/z, elution time, spread, signal strength, -#' spectrum label, and peak group label. The rows are ordered by the median m/z of each peak group, and with each peak group the rows are ordered +#' @return A matrix with six columns. Every row corresponds to a peak in one of the spectrum. The columns are: m/z, elution time, spread, signal strength, +#' spectrum label, and peak group label. The rows are ordered by the median m/z of each peak group, and with each peak group the rows are ordered #' by the elution time. #' @examples #' find.tol.time(mz, chr, lab, number_of_samples = number_of_samples, mz_tol_relative = mz_tol_relative, mz_tol_absolute = mz_tol_absolute, do.plot = FALSE) @@ -34,107 +120,54 @@ find.tol.time <- function(mz, mz_tol_absolute = 0.01, max.num.segments = 10000, do.plot = TRUE) { - features <- tibble::tibble(mz = mz, rt = chr, labels = lab) - features <- dplyr::arrange_at(features, "mz") - - # o <- order(mz) - # mz <- mz[o] - # chr <- chr[o] - # lab <- lab[o] - # rm(o) - - mz <- features$mz - chr <- features$rt - lab <- features$labels - - l <- length(mz) + features <- dplyr::arrange_at(features, "mz") - min_mz_tol <- min( - mz_tol_absolute, - mz_tol_relative * ((mz[2:l] + mz[1:(l - 1)]) / 2) + min_mz_tol <- compute_min_mz_tolerance( + features$mz, + mz_tol_relative, + mz_tol_absolute ) - - indices <- which((mz[2:l] - mz[1:(l - 1)]) > min_mz_tol) - breaks <- c(0, indices, l) - for (i in 2:length(breaks)) { - this.iter <- (breaks[i - 1] + 1):breaks[i] - this.o <- order(chr[this.iter]) - this.o <- this.o + breaks[i - 1] - mz[this.iter] <- mz[this.o] - chr[this.iter] <- chr[this.o] - lab[this.iter] <- lab[this.o] - # newfeatures <- tibble::tibble(mz = mz[this.iter], chr = chr[this.iter], lab = lab[this.iter]) + + mz_breaks <- compute_breaks_3(features$mz, min_mz_tol) + features$mz_group <- 0 + + for (i in 2:length(mz_breaks)) { + subset_indices <- (mz_breaks[i - 1] + 1):mz_breaks[i] + features$mz_group[subset_indices] <- i } - # mz <- newfeatures$mz - # chr <- newfeatures$chr - # lab <- newfeatures$lab + features <- features |> dplyr::arrange_at(c("mz_group", "rt")) + mz_breaks <- mz_breaks[c(-1, -length(mz_breaks))] - breaks <- breaks[c(-1, -length(breaks))] if (is.na(rt_tol_relative)) { - da <- 0 - if (length(breaks) > max.num.segments) { - s <- floor(seq(2, length(breaks), length.out = max.num.segments)) - } else { - s <- 2:length(breaks) - } - - for (i in s) { - this.sel <- (breaks[i - 1] + 1):breaks[i] - - if (length(this.sel) <= 3 * number_of_samples) { - this.d <- as.vector(dist(chr[this.sel])) - if (length(this.d) > 100) - this.d <- sample(this.d, 100) - da <- c(da, this.d) - } - } - - da <- da[!is.na(da)] - uppermost <- max(da) - n = min(max.bins, max(min.bins, round(length(da) / aver.bin.size))) - des <- density(da, kernel = "gaussian", n = n, - bw = uppermost / n * 2, from = 0) - y <- des$y[des$x > 0] - x <- des$x[des$x > 0] - - 
this.l <- lm(y[x > uppermost / 4] ~ x[x > uppermost / 4]) - - exp.y <- this.l$coef[1] + this.l$coef[2] * x - - y2 <- y[1:(length(y) - 1)] - y3 <- y[2:(length(y))] - y2[which(y2 < y3)] <- y3[which(y2 < y3)] - y[1:(length(y) - 1)] <- y2 - - yy <- cumsum(y > 1.5 * exp.y) - yi <- seq_along(yy) - sel <- min(which(yy < yi)) - 1 - rt_tol_relative <- x[sel] - - if (do.plot) { - tolerance_plot(x, y, exp.y, - sel, main = "find retention time tolerance") - } + rt_tol_relative <- compute_rt_tol_relative( + mz_breaks, + max.num.segments, + aver.bin.size, + number_of_samples, + features$rt, + min.bins, + max.bins + ) } - - da <- chr[2:l] - chr[1:(l - 1)] - breaks.2 <- which(da > rt_tol_relative) - all.breaks <- c(0, unique(c(breaks, breaks.2)), l) + + rt_diffs <- diff(features$rt) + rt_breaks <- which(rt_diffs > rt_tol_relative) + all.breaks <- c(0, unique(c(mz_breaks, rt_breaks)), nrow(features)) all.breaks <- all.breaks[order(all.breaks)] - - grps <- seq_along(mz) + + features$grps <- 0 for (i in 2:length(all.breaks)) { - grps[(all.breaks[i - 1] + 1):all.breaks[i]] <- i + features$grps[(all.breaks[i - 1] + 1):all.breaks[i]] <- i } - + list( - mz = mz, - chr = chr, - lab = lab, - grps = grps, + mz = features$mz, + chr = features$rt, + lab = features$labels, + grps = features$grps, chr.tol = rt_tol_relative, mz.tol = mz_tol_relative ) From 8ced53c0a04cd8249b7537cfd398471dc2aeca78 Mon Sep 17 00:00:00 2001 From: hechth Date: Wed, 3 Aug 2022 16:19:33 +0200 Subject: [PATCH 13/53] Further improved documentation and removed unnecessary functions --- R/recover.weaker.R | 137 +++++++++++++++++++++++++++------------------ 1 file changed, 84 insertions(+), 53 deletions(-) diff --git a/R/recover.weaker.R b/R/recover.weaker.R index c35e29d3..819ff888 100644 --- a/R/recover.weaker.R +++ b/R/recover.weaker.R @@ -8,7 +8,7 @@ #' @param tolerance Tolerance to use for numeric comparisons. #' @return Returns the same table with duplicate rows removed. #' @export -duplicate.row.remove <- function(new.table, tolerance = 1e-10) { +duplicate.row.remove <- function(new.table, tolerance = 1e-10) { new.table <- new.table[order(new.table[, 1], new.table[, 2], new.table[, 5]), ] n <- 1 m <- 2 @@ -26,8 +26,9 @@ duplicate.row.remove <- function(new.table, tolerance = 1e-10) { } } - if (sum(to.remove) > 0) + if (sum(to.remove) > 0) { new.table <- new.table[-which(to.remove == 1), ] + } new.table } @@ -71,19 +72,21 @@ l2normalize <- function(x) { #' @param intensity_weighted Whether to use intensity weighting or not. #' @return \link[stats]{density} object containing the densities. 
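# Editor's sketch (toy data, not part of the patch): the refactored
# compute_mass_density defined below takes the whole feature table
# instead of separate mz/intensity vectors.
features <- tibble::tibble(
  mz = c(100.01, 100.02, 200.00),
  intensities = c(1e4, 5e3, 2e4)
)
dens <- compute_mass_density(features, bandwidth = 0.01,
                             intensity_weighted = TRUE)
# dens is a stats::density object: dens$x is the mz grid, dens$y the density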
#' @export -compute_mass_density <- function(mz, - intensities, +compute_mass_density <- function(features, bandwidth, - intensity_weighted) { + intensity_weighted, + n = 512) { if (intensity_weighted) { - mass_density <- density( - mz, - weights = l2normalize(intensities), - bw = bandwidth - ) + weights <- l2normalize(features$intensities) } else { - mass_density <- density(mz, bw = bandwidth) + weights <- NULL } + mass_density <- density( + features$mz, + weights = weights, + bw = bandwidth, + n = n + ) return(mass_density) } @@ -168,7 +171,7 @@ get_single_occurrence_mask <- function(values) { get_times_to_use <- function(original_sample_rts, adjusted_sample_rts, cap = 2000) { to.use <- which( get_single_occurrence_mask(adjusted_sample_rts) & - get_single_occurrence_mask(original_sample_rts) + get_single_occurrence_mask(original_sample_rts) ) if (length(to.use) > cap) { @@ -177,49 +180,71 @@ get_times_to_use <- function(original_sample_rts, adjusted_sample_rts, cap = 200 return(to.use) } +#' Predict the indices for the valley points with low mass density. +#' @description +#' The density of mz values in the feature table is computed based on the tolerance. +#' The valleys or breaks of clusters in mz values are detected and a function +#' is approximated to predict the indices for the mass values which are the closest to those +#' valley points. +#' @param features data.frame Data table with features for which to predict the indices. +#' @param mz_orig_tol float Mz tolerance to use as KDE bandwidth parameter. +#' @return vector Predicted indices for valley points. #' @export -compute_breaks_2 <- function(data_table, orig.tol) { - all.mass.den <- density( - data_table$mz, - weights = l2normalize(data_table$intensities), - bw = 0.5 * orig.tol * max(data_table$mz), - n = 2^min(15, floor(log2(length(data_table$mz))) - 2) +predict_mz_break_indices <- function(features, mz_orig_tol) { + mz_density <- compute_mass_density( + features, + TRUE, + bandwidth = 0.5 * mz_orig_tol * max(features$mz), + n = 2^min(15, floor(log2(length(features$mz))) - 2) ) - all.mass.turns <- find.turn.point(all.mass.den$y) - all.mass.vlys <- all.mass.den$x[all.mass.turns$vlys] - breaks <- c(0, unique(round(approx(data_table$mz, seq_along(data_table$mz), xout = all.mass.vlys, rule = 2, ties = "ordered")$y))[-1]) + turnpoints <- find.turn.point(mz_density$y) + mz_valleys <- mz_density$x[turnpoints$vlys] + indices <- seq_along(features$mz) + predictions <- approx( + x = features$mz, + y = indices, + xout = mz_valleys, + rule = 2, + ties = "ordered" + )$y + + predicted_indices_at_vlys <- unique(round(predictions))[-1] + breaks <- c(0, predicted_indices_at_vlys) return(breaks) } +#' Compute range of valley indices which are in mz_tol range around aligned_feature_mass. +#' @description +#' +#' @param aligned_feature_mass float Mz value of the aligned feature. +#' @param mz vector mz values of the features. +#' @param vlys_indices vector Indices of the valley points of mz clusters. +#' @param mz_tol float Tolerance to use to check if values are close. +#' @return pair Index range (start, end). 
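# Editor's example (invented values) for the refactored
# get_mzrange_bound_indices below; vlys_indices point into the sorted mz.
mz <- c(99.0, 100.0, 100.5, 101.0, 150.0)
vlys_indices <- c(1, 2, 4, 5)
get_mzrange_bound_indices(100.4, mz, vlys_indices, mz_tol = 0.2)
# no valley mass is within 0.2 of 100.4; the bracketing valley positions
# 2 and 3, shifted by one, give list(start = 3, end = 4)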
#' @export -get_mzrange_bound_indices <- function(aligned_feature_mass, masses, breaks, mz_tol) { - if (aligned_feature_mass <= masses[breaks[2]]) { - this.found <- c(1, 2) +get_mzrange_bound_indices <- function(aligned_feature_mass, + mz, + vlys_indices, + mz_tol) { + if (aligned_feature_mass <= mz[vlys_indices[2]]) { + all_indices <- c(1, 2) } else { - this.found <- c( - which(abs(masses[breaks] - aligned_feature_mass) < mz_tol), - min(which(masses[breaks] > aligned_feature_mass)), - max(which(masses[breaks] < aligned_feature_mass)) + # get all indices where mz is close to aligned_feature mass, + # the first one that is larger and the last one that is smaller. + upper_bound_idx <- min(which(mz[vlys_indices] > aligned_feature_mass)) + lower_bound_idx <- max(which(mz[vlys_indices] < aligned_feature_mass)) + valley_indices_within_tol <- which(abs(mz[vlys_indices] - aligned_feature_mass) < mz_tol) + all_indices <- c( + valley_indices_within_tol, + upper_bound_idx, + lower_bound_idx ) + 1 - this.found <- c(min(this.found), max(this.found)) } - return(this.found) + return(list(start = min(all_indices), end = max(all_indices))) } -#' @export -get_raw_features_in_mzrange <- function(data_table, aligned_feature_mass, breaks, mz_tol) { - this.found <- get_mzrange_bound_indices( - aligned_feature_mass, - data_table$mz, - breaks, - mz_tol - ) - this.sel <- (breaks[this.found[1]] + 1):breaks[this.found[2]] - features <- data_table |> dplyr::slice(this.sel) - return(features) -} #' @export get_rt_region_indices <- function(retention_time, profile_data, chr_tol) { @@ -324,9 +349,9 @@ compute_curr_rec_with_enough_peaks <- function(that.mass, pks, all, aver.diff, t } compute_mass_boundaries <- function(mass.vlys, peak) { - lower <- max(mass.vlys[mass.vlys < peak]) - upper <- min(mass.vlys[mass.vlys > peak]) - return(list(lower = lower, upper = upper)) + lower <- max(mass.vlys[mass.vlys < peak]) + upper <- min(mass.vlys[mass.vlys > peak]) + return(list(lower = lower, upper = upper)) } compute_peaks_and_valleys <- function(dens) { @@ -352,23 +377,28 @@ compute_rectangle <- function(data_table, bandwidth, min.bw, max.bw) { - features <- get_raw_features_in_mzrange( - data_table, + + bounds <- get_mzrange_bound_indices( aligned_feature_mass, + data_table$mz, breaks, - custom_mz_tol + orig.tol + ) + + features <- dplyr::slice( + data_table, + (breaks[bounds$start] + 1):breaks[bounds$end] ) mass.den <- compute_mass_density( - mz = features$mz, - intensities = features$intensities, + features, bandwidth = 0.5 * orig.tol * aligned_feature_mass, intensity_weighted = intensity.weighted ) # find peaks in mz range in raw data mass_range <- compute_peaks_and_valleys(mass.den) - mass_range$pks <- mass_range$pks[which(abs(mass_range$pks - aligned_feature_mass) < custom_mz_tol / 1.5)] + mass_range$pks <- mass_range$pks[abs(mass_range$pks - aligned_feature_mass) < custom_mz_tol / 1.5] this.rec <- matrix(c(Inf, Inf, Inf), nrow = 1) for (peak in mass_range$pks) { @@ -501,7 +531,8 @@ recover.weaker <- function(filename, data_table <- tibble::tibble( mz = this.raw$masses, labels = this.raw$labels, - intensities = this.raw$intensi) |> dplyr::arrange_at("mz") + intensities = this.raw$intensi + ) |> dplyr::arrange_at("mz") rm(this.raw) # Initialize parameters with default values @@ -542,7 +573,7 @@ recover.weaker <- function(filename, # ) # target_times <- predict(sp, aligned.ftrs[, "rt"])$y - breaks <- compute_breaks_2(data_table, orig.tol) + breaks <- predict_mz_break_indices(data_table, orig.tol) this.mz <- rep(NA, 
length(this.ftrs)) From d520cde513ed4c6b2d786b9433dd6f3a6a72e15f Mon Sep 17 00:00:00 2001 From: hechth Date: Thu, 4 Aug 2022 10:32:28 +0200 Subject: [PATCH 14/53] Added further documentation and renamed variables --- R/recover.weaker.R | 62 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 12 deletions(-) diff --git a/R/recover.weaker.R b/R/recover.weaker.R index 819ff888..c1e74020 100644 --- a/R/recover.weaker.R +++ b/R/recover.weaker.R @@ -256,6 +256,7 @@ get_rt_region_indices <- function(retention_time, profile_data, chr_tol) { return(selection) } + compute_EIC_area <- function(thee.sel, that.prof, times, delta_rt, aver.diff) { if (length(thee.sel) > 1) { that.inte <- interpol.area(that.prof[thee.sel, 2], that.prof[thee.sel, 3], times, delta_rt) @@ -348,12 +349,37 @@ compute_curr_rec_with_enough_peaks <- function(that.mass, pks, all, aver.diff, t return(curr.rec) } -compute_mass_boundaries <- function(mass.vlys, peak) { - lower <- max(mass.vlys[mass.vlys < peak]) - upper <- min(mass.vlys[mass.vlys > peak]) +#' Compute bounds of area using given peak and mass valley points. +#' @description +#' The lower bound is the mass of the valley the closest but smaller than peak +#' and the upper bound is the mass of the valley the closest but higher than +#' the peak. +#' @param mz_valley_points vector Mz values of valley points defining mz clusters. +#' @param peak_mz double Value of the peak mz for which to get the valley bounds. +#' @return Returns a list object with the following objects in it: +#' \itemize{ +#' \item lower - double - The mz value of the lower bound valley point. +#' \item upper - double - The mz value of the upper bound valley point. +#' } +compute_mass_boundaries <- function(mz_valley_points, peak_mz) { + lower <- max(mz_valley_points[mz_valley_points < peak_mz]) + upper <- min(mz_valley_points[mz_valley_points > peak_mz]) return(list(lower = lower, upper = upper)) } +#' Compute peaks and valleys of density function. +#' @description +#' Given a density function, the turn points are computed and +#' the peaks and valleys in the original data (points with highest +#' and lowest density) are returned. +#' @param density stats::density Density object for which to compute peaks +#' and valleys. +#' @return Returns a list object with the following objects in it: +#' \itemize{ +#' \item pks - vector - The data points at which the density peaks. +#' \item vlys - vector - The points in the data where the density is low +#' (forming a valley in the function). 
+#' } compute_peaks_and_valleys <- function(dens) { turns <- find.turn.point(dens$y) pks <- dens$x[turns$pks] # mz values with highest density @@ -362,6 +388,7 @@ compute_peaks_and_valleys <- function(dens) { return(list(pks = pks, vlys = vlys)) } + compute_rectangle <- function(data_table, aligned_feature_mass, breaks, @@ -547,8 +574,8 @@ recover.weaker <- function(filename, aver.diff <- mean(diff(times)) vec_delta_rt <- compute_delta_rt(times) - this.ftrs <- aligned.ftrs[, sample_name] - this.times <- pk.times[, sample_name] + sample_intensities <- aligned.ftrs[, sample_name] + sample_times <- pk.times[, sample_name] custom.mz.tol <- mz.range * aligned.ftrs$mz custom.chr.tol <- get_custom_chr_tol( @@ -575,11 +602,22 @@ recover.weaker <- function(filename, breaks <- predict_mz_break_indices(data_table, orig.tol) - this.mz <- rep(NA, length(this.ftrs)) + this.mz <- rep(NA, length(sample_intensities)) + max_mz <- max(data_table$mz) + + # THIS CONSTRUCT TO EXTRACT MISSING FEATURES COULD BE USED TO POSSIBLY SPEED UP + # THE COMPUTATION AS THE LOOP WILL ONLY GO OVER THE ROWS AND THE ADDITIONAL VARIABLES + # CAN BE ADDED USING THE MUTATE FUNCTION. + # t_aligned <- tibble::tibble(aligned.ftrs) + # missing_features <- dplyr::filter(t_aligned, !!rlang::sym(sample_name) == 0 & mz < max_mz) + + # if(nrow(missing_features) > 0) { + # browser() + # } - for (i in seq_along(this.ftrs)) + for (i in seq_along(sample_intensities)) { - if (this.ftrs[i] == 0 && aligned.ftrs[i, "mz"] < max(data_table$mz)) { + if (sample_intensities[i] == 0 && aligned.ftrs[i, "mz"] < max_mz) { this.rec <- compute_rectangle( data_table, aligned.ftrs[i, "mz"], @@ -631,16 +669,16 @@ recover.weaker <- function(filename, NA ) ) - this.ftrs[i] <- this.rec[this.sel, 3] - this.times[i] <- this.rec[this.sel, 2] + this.time.adjust + sample_intensities[i] <- this.rec[this.sel, 3] + sample_times[i] <- this.rec[this.sel, 2] + this.time.adjust this.mz[i] <- this.rec[this.sel, 1] } } } to.return <- new("list") to.return$this.mz <- this.mz - to.return$this.ftrs <- this.ftrs - to.return$this.times <- this.times + to.return$this.ftrs <- sample_intensities + to.return$this.times <- sample_times to.return$this.f1 <- duplicate.row.remove(extracted_features) to.return$this.f2 <- duplicate.row.remove(adjusted_features) From 9004c2222f696965c3a801adc54e85a263d78657 Mon Sep 17 00:00:00 2001 From: hechth Date: Thu, 4 Aug 2022 13:31:29 +0200 Subject: [PATCH 15/53] Fixing other test cases --- NAMESPACE | 3 +-- R/combine.seq.3.R | 43 +++++++++++++++++++++---------------------- R/extract_features.R | 4 ++-- R/prof.to.features.R | 2 +- R/recover.weaker.R | 10 +++++----- 5 files changed, 30 insertions(+), 32 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index c63100ff..f11194ac 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -24,7 +24,6 @@ export(bigauss.mix) export(combine.seq.3) export(compute_bounds) export(compute_breaks) -export(compute_breaks_2) export(compute_delta_rt) export(compute_densities) export(compute_mass_density) @@ -38,7 +37,6 @@ export(find.tol) export(find.turn.point) export(get_custom_chr_tol) export(get_mzrange_bound_indices) -export(get_raw_features_in_mzrange) export(get_rt_region_indices) export(get_times_to_use) export(hybrid) @@ -50,6 +48,7 @@ export(make.known.table) export(normix) export(normix.bic) export(plot_raw_profile_histogram) +export(predict_mz_break_indices) export(prep.uv) export(proc.cdf) export(prof.to.features) diff --git a/R/combine.seq.3.R b/R/combine.seq.3.R index a646c621..067c4e3c 100644 --- a/R/combine.seq.3.R +++ 
b/R/combine.seq.3.R @@ -1,32 +1,31 @@ #' An internal function. -#' +#' #' This is a internal function. -#' -#' @param a vector of retention time. -#' @param mz vector of m/z ratio. -#' @param inte vector of signal strength. +#' +#' @param table dataframe of retention time, m/z ratio, signal strength. #' @return returns #' \itemize{ -#' \item mz - m/z ratio -#' \item a - retention time -#' \item int - signal strength +#' \item masses - m/z ratio +#' \item labels - retention time +#' \item intensi - signal strength #' } #' @export #' @examples -#' combine.seq.3(retention_time_vector, masses, intensi) -combine.seq.3 <- -function(a, mz, inte) ### the input need to be pre-ordered by a -{ - l <- length(a) - breaks <- c(0, which(a[1:(l - 1)] != a[2:l]), l) - new.int <- new.mz <- rep(0, length(breaks)-1) - +#' combine.seq.3(table) +combine.seq.3 <- function(features) { + l <- nrow(features) + breaks <- c(0, which(features$labels[1:(l - 1)] != features$labels[2:l]), l) + new_table <- data.frame(mz = rep(0, length(breaks) - 1), labels = unique(features$labels), intensities = rep(0, length(breaks) - 1)) + for (i in 1:(length(breaks) - 1)) { - this.int<-inte[(breaks[i] + 1):breaks[i + 1]] - this.mz<-mz[(breaks[i] + 1):breaks[i + 1]] - new.int[i] <- sum(this.int) - new.mz[i] <- median(this.mz[which(this.int==max(this.int))]) + start <- breaks[i] + 1 + end <- breaks[i + 1] + mz <- features$mz[start:end] + ints <- features$intensities[start:end] + + new_table$intensities[i] <- sum(ints) + new_table$mz[i] <- median(mz[which(ints == max(ints))]) } - new.a <- unique(a) - return(cbind(new.mz, new.a, new.int)) + + return(new_table) } diff --git a/R/extract_features.R b/R/extract_features.R index 7f57265d..c4244380 100644 --- a/R/extract_features.R +++ b/R/extract_features.R @@ -68,8 +68,8 @@ extract_features <- function( 'compute_densities', 'compute_breaks', 'rm.ridge', - 'compute_base_curve', - 'compute_all_times', + #'compute_base_curve', + 'compute_delta_rt', 'bigauss.mix', 'bigauss.esti', 'rev_cum_sum', diff --git a/R/prof.to.features.R b/R/prof.to.features.R index 0ff4c5c2..3ebaac6c 100644 --- a/R/prof.to.features.R +++ b/R/prof.to.features.R @@ -641,7 +641,7 @@ prof.to.features <- function(a, if (is.na(max.bw)) max.bw <- diff(range(a[, 2], na.rm = TRUE)) / 15 if (min.bw >= max.bw) min.bw <- max.bw / 4 - base.curve <- compute_base_curve(a[, 2]) + base.curve <- sort(unique(a[, 2])) all.times <- compute_delta_rt(base.curve) this.features <- matrix(0, nrow = 1, ncol = 5) diff --git a/R/recover.weaker.R b/R/recover.weaker.R index c1e74020..a72fa684 100644 --- a/R/recover.weaker.R +++ b/R/recover.weaker.R @@ -259,9 +259,9 @@ get_rt_region_indices <- function(retention_time, profile_data, chr_tol) { compute_EIC_area <- function(thee.sel, that.prof, times, delta_rt, aver.diff) { if (length(thee.sel) > 1) { - that.inte <- interpol.area(that.prof[thee.sel, 2], that.prof[thee.sel, 3], times, delta_rt) + that.inte <- interpol.area(that.prof$labels[thee.sel], that.prof$intensities[thee.sel], times, delta_rt) } else { - that.inte <- that.prof[thee.sel, 3] * aver.diff + that.inte <- that.prof$intensities[thee.sel] * aver.diff } return(that.inte) } @@ -438,8 +438,8 @@ compute_rectangle <- function(data_table, # get values in RT region of interest? 
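# Editor's sketch of the refactored combine.seq.3 used just below
# (toy data, pre-ordered by labels): rows sharing a retention time label
# collapse into one row.
toy <- tibble::tibble(
  mz = c(100.01, 100.02, 200.05),
  labels = c(1.0, 1.0, 2.0),
  intensities = c(10, 30, 20)
)
combine.seq.3(toy)
# -> label 1.0: intensity 40, mz 100.02 (mz of the most intense point);
#    label 2.0 unchanged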
if (nrow(that) > recover.min.count) { - that.prof <- combine.seq.3(that$labels, that$mz, that$intensities) - that.mass <- sum(that.prof[, 1] * that.prof[, 3]) / sum(that.prof[, 3]) + that.prof <- combine.seq.3(that) + that.mass <- sum(that.prof$mz * that.prof$intensities) / sum(that.prof$intensities) curr.rec <- c(that.mass, NA, NA) if (nrow(that.prof) < 10) { @@ -457,7 +457,7 @@ compute_rectangle <- function(data_table, delta_rt, aver.diff ) - curr.rec[2] <- median(that.prof[thee.sel, 2]) + curr.rec[2] <- median(that.prof$labels[thee.sel]) this.rec <- rbind(this.rec, curr.rec) } } else { From a49291db0df03aa0947b6b55dcdf58090b1c7b8e Mon Sep 17 00:00:00 2001 From: wverastegui Date: Thu, 4 Aug 2022 13:37:53 +0200 Subject: [PATCH 16/53] added coments --- R/find.tol.time.R | 44 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/R/find.tol.time.R b/R/find.tol.time.R index 25822cb5..ba831f1b 100644 --- a/R/find.tol.time.R +++ b/R/find.tol.time.R @@ -8,6 +8,7 @@ #' This forms a sort of weighted tolerance. #' @param mz_tol_absolute float Absolute tolerance to use independent from the mz values. #' @return float Minimum tolerance values to use. + compute_min_mz_tolerance <- function(mz, mz_tol_relative, mz_tol_absolute) { l <- length(mz) mz_midpoints <- ((mz[2:l] + mz[1:(l - 1)]) / 2) @@ -16,6 +17,11 @@ compute_min_mz_tolerance <- function(mz, mz_tol_relative, mz_tol_absolute) { return(min_mz_tol) } +#' @description +#' Compute indices of mass differences greater than min_mz_tol. +#' @param mz mz values of all peaks in all profiles in the study. +#' @param min_mz_tol float Minimum tolerance value. +#' @return breaks Integer indices of mass differences to use. compute_breaks_3 <- function(mz, min_mz_tol) { l <- length(mz) mass_differences <- diff(mz) @@ -24,6 +30,18 @@ compute_breaks_3 <- function(mz, min_mz_tol) { return(breaks) } +#' Compute rt relative tolerance to use. +#' @description +#' Compute the elution time tolerance based on the kernel density estimation. +#' It plots the fitting function if set to TRUE. +#' @param max.num.segments the maximum number of segments. +#' @param aver.bin.size The average bin size to determine the number of equally spaced points in the kernel density estimation. +#' @param number_of_samples The number of spectra in this analysis. +#' @param chr retention time of all peaks in all profiles in the study. +#' @param min.bins the minimum number of bins to use in the kernel density estimation. It overrides aver.bin.size when too few observations are present. +#' @param max.bins the maximum number of bins to use in the kernel density estimation. It overrides aver.bin.size when too many observations are present. +#' @param do.plot Indicates whether plot should be drawn. +#' @return rt_tol_relative the elution time tolerance. compute_rt_tol_relative <- function(breaks, max.num.segments, aver.bin.size, @@ -33,24 +51,32 @@ compute_rt_tol_relative <- function(breaks, max.bins, do.plot = FALSE) { da <- 0 + #' This conditional makes sure that length(s) is <= max.num.segments + #' If False, length(s) = max.num.segments, and s[i] is the largest + #' integer no greater than the corresponding element. Otherwise + #' length(s) = length(breaks) - 1 if (length(breaks) > max.num.segments) { s <- floor(seq(2, length(breaks), length.out = max.num.segments)) } else { s <- 2:length(breaks) } + #' This loop creates a vector with distances between rt peaks. 
Distances + #' are stored in a triangular matrix and converted to a vector subsequently. + #' Vector length should be < 100, otherwise, vector is + #' constructed extracting only 100 samples. for (i in s) { - this.sel <- (breaks[i - 1] + 1):breaks[i] - - if (length(this.sel) <= 3 * number_of_samples) { - this.d <- as.vector(dist(chr[this.sel])) - if (length(this.d) > 100) { - this.d <- sample(this.d, 100) + subset_idx <- (breaks[i - 1] + 1):breaks[i]# create subset of indices + if (length(subset_idx) <= 3 * number_of_samples) { + rt_distances <- as.vector(dist(chr[subset_idx])) + if (length(rt_distances) > 100) { + rt_distances <- sample(rt_distances, 100) } - da <- c(da, this.d) + da <- c(da, rt_distances) } } - + + #' Calculation of kernel density estimation to estimate the rt_tol_relative da <- da[!is.na(da)] uppermost <- max(da) n <- min(max.bins, max(min.bins, round(length(da) / aver.bin.size))) @@ -62,9 +88,7 @@ compute_rt_tol_relative <- function(breaks, x <- des$x[des$x > 0] this.l <- lm(y[x > uppermost / 4] ~ x[x > uppermost / 4]) - exp.y <- this.l$coef[1] + this.l$coef[2] * x - y2 <- y[1:(length(y) - 1)] y3 <- y[2:(length(y))] y2[which(y2 < y3)] <- y3[which(y2 < y3)] From 133a1bce30468ef22528b5028402df3ebb600442 Mon Sep 17 00:00:00 2001 From: hechth Date: Thu, 4 Aug 2022 14:54:23 +0200 Subject: [PATCH 17/53] Finalized find.tol.time --- R/find.tol.time.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/find.tol.time.R b/R/find.tol.time.R index ba831f1b..43231d6f 100644 --- a/R/find.tol.time.R +++ b/R/find.tol.time.R @@ -8,7 +8,6 @@ #' This forms a sort of weighted tolerance. #' @param mz_tol_absolute float Absolute tolerance to use independent from the mz values. #' @return float Minimum tolerance values to use. - compute_min_mz_tolerance <- function(mz, mz_tol_relative, mz_tol_absolute) { l <- length(mz) mz_midpoints <- ((mz[2:l] + mz[1:(l - 1)]) / 2) From f7e54ab15f5524dd1e9ebf91c287590c1be4c868 Mon Sep 17 00:00:00 2001 From: hechth Date: Thu, 4 Aug 2022 14:54:46 +0200 Subject: [PATCH 18/53] started find.turn.point --- R/find.turn.point.R | 109 +++++++++++++++++++++++++++++--------------- 1 file changed, 71 insertions(+), 38 deletions(-) diff --git a/R/find.turn.point.R b/R/find.turn.point.R index 13f3d9b2..a3bc5471 100644 --- a/R/find.turn.point.R +++ b/R/find.turn.point.R @@ -1,7 +1,37 @@ +find_local_maxima <- function(y, ties.method) { + padded_y <- rev(as.vector(c(-Inf, y, -Inf))) + + # each row is 3 consecutive values in descending order + # rows are sorted in ascending order + z <- embed(padded_y, dim = 3) + + # reverse the row ordering + # first column is equal y + z <- z[rev(seq(nrow(z))), ] + + # row where the max is in the middle is a turn point + v <- max.col(z, ties.method = ties.method) == 2 + + # alt <- diff(sign(diff(y))) == -2 + return(v) +} + +msExtrema <- function(y) { + index1 <- find_local_maxima(y, ties.method = "first") + index2 <- find_local_maxima(-y, ties.method = "last") + + # this is some sort of safety mechanism to protect against numerical issues + index.max <- index1 & !index2 + index.min <- index2 & !index1 + + list(index.max = index.max, index.min = index.min) +} + #' Find peaks and valleys of a curve. -#' +#' +#' @description #' This is an internal function which finds the peaks and valleys of a smooth curve. -#' +#' #' @param y The y values of a curve in x-y plane. 
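# Editor's walk-through of the embed() trick in find_local_maxima above,
# on a toy vector: after padding with -Inf, each row of the 3-wide
# embedding holds a point and its two neighbours, and a row whose maximum
# sits in the middle column marks a local maximum of y.
y <- c(1, 3, 2, 5, 4)
z <- embed(rev(as.vector(c(-Inf, y, -Inf))), dim = 3)
z <- z[rev(seq(nrow(z))), ]
max.col(z, ties.method = "first") == 2
# -> FALSE TRUE FALSE TRUE FALSE: peaks at indices 2 and 4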
#' @return A list object: #' \itemize{ @@ -11,45 +41,48 @@ #' @export #' @examples #' find.turn.point(y) -find.turn.point <- -function(y) -{ - peaks2<-function (x, ties.method) - { - z <- embed(rev(as.vector(c(-Inf, x, -Inf))), dim = 3) - z <- z[rev(seq(nrow(z))), ] - v <- max.col(z,ties.method=ties.method) == 2 - v - } - msExtrema<-function (x) - { - l<-length(x) - index1 <- peaks2(x, ties.method="first") - index2 <- peaks2(-x, ties.method="last") - index.max <- index1 & !index2 - index.min <- index2 & !index1 - list(index.max = index.max, index.min = index.min) - } - - y <- y[!is.na(y)] - if (length(unique(y)) == 1) { - pks <- round(length(y)/2) - vlys <- c(1, length(y)) +find.turn.point <- function(y) { + y <- y[!is.na(y)] # filter NA values + + if (length(unique(y)) == 1) { # if exactly one distinct value + middle_index <- round(length(y) / 2) # get mid index + start_and_end <- c(1, length(y)) # get first and last index + return(list(pks = middle_index, vlys = start_and_end)) + } else { + b <- msExtrema(y) + + pks <- which(b$index.max) + vlys <- which(b$index.min) + + # pks_mask <- diff(sign(diff(y))) + # vlys_mask <- diff(sign(diff(c(-Inf, -y, -Inf)))) + + # if(anyNA(pks_mask) || anyNA(vlys_mask)) { + # browser() + # } + + # pks_v2 <- which(pks_mask == -2) + 1 + # vlys_v2 <- which(vlys_mask == -2) + + # if(pks != pks_v2 || vlys != vlys_v2) { + # browser() + # } + + if (pks[1] != 1) { + vlys <- c(1, vlys) + } + + if (pks[length(pks)] != length(y)) { + vlys <- c(vlys, length(y)) + } + + if (length(pks) == 1) { + vlys <- c(1, length(y)) + } + x <- new("list") x$pks <- pks x$vlys <- vlys return(x) } - - b<-msExtrema(y) - pks<-which(b$index.max) - vlys<-which(b$index.min) - if(pks[1] != 1) vlys<-c(1, vlys) - if(pks[length(pks)] != length(y)) vlys<-c(vlys, length(y)) - - if(length(pks) == 1) vlys<-c(1,length(y)) - x <- new("list") - x$pks <- pks - x$vlys <- vlys - return(x) } From fbdd3037d875a070411f671ad56c60be223b2a6e Mon Sep 17 00:00:00 2001 From: hechth Date: Fri, 5 Aug 2022 15:50:27 +0200 Subject: [PATCH 19/53] Adapted to new combine.seq implementation --- R/combine.seq.3.R | 7 ++- R/recover.weaker.R | 105 ++++++++++++++++++++++++--------------------- 2 files changed, 61 insertions(+), 51 deletions(-) diff --git a/R/combine.seq.3.R b/R/combine.seq.3.R index 067c4e3c..f9d109a5 100644 --- a/R/combine.seq.3.R +++ b/R/combine.seq.3.R @@ -15,11 +15,16 @@ combine.seq.3 <- function(features) { l <- nrow(features) breaks <- c(0, which(features$labels[1:(l - 1)] != features$labels[2:l]), l) - new_table <- data.frame(mz = rep(0, length(breaks) - 1), labels = unique(features$labels), intensities = rep(0, length(breaks) - 1)) + new_table <- tibble::tibble( + mz = rep(0, length(breaks) - 1), + labels = unique(features$labels), + intensities = rep(0, length(breaks) - 1) + ) for (i in 1:(length(breaks) - 1)) { start <- breaks[i] + 1 end <- breaks[i + 1] + mz <- features$mz[start:end] ints <- features$intensities[start:end] diff --git a/R/recover.weaker.R b/R/recover.weaker.R index a72fa684..87a8027c 100644 --- a/R/recover.weaker.R +++ b/R/recover.weaker.R @@ -267,10 +267,10 @@ compute_EIC_area <- function(thee.sel, that.prof, times, delta_rt, aver.diff) { } get_features_in_rt_range <- function(this, times, bw) { - this.span <- range(this[, 1]) - this.curve <- times[times >= this.span[1] & times <= this.span[2]] + this.curve <- times[times >= min(this$labels) & times <= max(this$labels)] + this.curve <- cbind(this.curve, this.curve * 0) - this.curve[this.curve[, 1] %in% this[, 1], 2] <- this[, 2] + 
this.curve[this.curve[, 1] %in% this$labels, 2] <- this$intensities this.smooth <- ksmooth( this.curve[, 1], @@ -282,12 +282,8 @@ get_features_in_rt_range <- function(this, times, bw) { return(compute_peaks_and_valleys(this.smooth)) } -compute_pks_vlys_rt <- function(that.prof, times, bandwidth, min.bw, max.bw, target_rt, recover.min.count) { - # extract rt labels and intensities - this <- that.prof[, 2:3] - this <- this[order(this[, 1]), ] - - bw <- min(max(bandwidth * (max(this[, 1]) - min(this[, 1])), min.bw), max.bw) +compute_pks_vlys_rt <- function(this, times, bandwidth, min.bw, max.bw, target_rt, recover.min.count) { + bw <- min(max(bandwidth * (max(this[, "labels"]) - min(this[, "labels"])), min.bw), max.bw) roi <- get_features_in_rt_range( this, @@ -300,8 +296,8 @@ compute_pks_vlys_rt <- function(that.prof, times, bandwidth, min.bw, max.bw, tar pks.n <- pks for (m in 1:length(pks)) { - boundaries <- compute_mass_boundaries(vlys, pks[m]) - pks.n[m] <- sum(this[, 1] >= boundaries$lower & this[, 1] <= boundaries$upper) + boundaries <- compute_boundaries(vlys, pks[m]) + pks.n[m] <- sum(this$labels >= boundaries$lower & this$labels <= boundaries$upper) } if (!is.na(target_rt)) { @@ -311,40 +307,46 @@ compute_pks_vlys_rt <- function(that.prof, times, bandwidth, min.bw, max.bw, tar } else { pks <- pks[pks.n > recover.min.count] } - return(list(pks = pks, vlys = vlys, this = this)) + return(list(pks = pks, vlys = vlys)) } -compute_curr_rec_with_enough_peaks <- function(that.mass, pks, all, aver.diff, times, delta_rt) { +compute_mu_sc <- function(this, aver.diff, times, delta_rt) { + x <- this$labels + y <- this$intensities + + if (nrow(this) >= 10) { + miu <- sum(x * y) / sum(y) + sigma <- sqrt(sum(y * (x - miu)^2) / sum(y)) + if (sigma == 0) { + sc <- sum(y) * aver.diff + miu <- miu + } else { + fitted <- dnorm(x, mean = miu, sd = sigma) + this.sel <- y > 0 & fitted / dnorm(miu, mean = miu, sd = sigma) > 1e-2 + sc <- exp(sum(fitted[this.sel]^2 * log(y[this.sel] / fitted[this.sel]) / sum(fitted[this.sel]^2))) + } + } else { + sc <- interpol.area(x, y, times, delta_rt) + miu <- median(x) + } + return(list(sc = sc, miu = miu)) +} + +compute_curr_rec_with_enough_peaks <- function(that.mass, peak, valleys, labels_intensities, aver.diff, times, delta_rt) { curr.rec <- c(that.mass, NA, NA) # same filtering of peaks as in compute_pks_vlyws and as above - boundaries <- compute_mass_boundaries(all$vlys, pks) - this <- all$this[which(all$this[, 1] >= boundaries$lower & all$this[, 1] <= boundaries$upper), ] + boundaries <- compute_boundaries(valleys, peak) + + this <- labels_intensities |> dplyr::filter(labels >= boundaries$lower & labels <= boundaries$upper) - if (is.null(nrow(this))) { - curr.rec[3] <- this[2] * aver.diff - curr.rec[2] <- this[1] + if (nrow(this) == 1) { + curr.rec[3] <- this$intensities * aver.diff + curr.rec[2] <- this$labels } else { - x <- this[, 1] - y <- this[, 2] - - if (nrow(this) >= 10) { - miu <- sum(x * y) / sum(y) - sigma <- sqrt(sum(y * (x - miu)^2) / sum(y)) - if (sigma == 0) { - curr.rec[3] <- sum(y) * aver.diff - curr.rec[2] <- miu - } else { - fitted <- dnorm(x, mean = miu, sd = sigma) - this.sel <- y > 0 & fitted / dnorm(miu, mean = miu, sd = sigma) > 1e-2 - sc <- exp(sum(fitted[this.sel]^2 * log(y[this.sel] / fitted[this.sel]) / sum(fitted[this.sel]^2))) - } - } else { - sc <- interpol.area(x, y, times, delta_rt) - miu <- median(x) - } - curr.rec[3] <- sc - curr.rec[2] <- miu + res <- compute_mu_sc(this, aver.diff, times, delta_rt) + curr.rec[3] <- res$sc + 
curr.rec[2] <- res$miu } return(curr.rec) } @@ -354,16 +356,16 @@ compute_curr_rec_with_enough_peaks <- function(that.mass, pks, all, aver.diff, t #' The lower bound is the mass of the valley the closest but smaller than peak #' and the upper bound is the mass of the valley the closest but higher than #' the peak. -#' @param mz_valley_points vector Mz values of valley points defining mz clusters. -#' @param peak_mz double Value of the peak mz for which to get the valley bounds. +#' @param valley_points vector values of valley points defining clusters. +#' @param peak double Value of the peak for which to get the valley bounds. #' @return Returns a list object with the following objects in it: #' \itemize{ -#' \item lower - double - The mz value of the lower bound valley point. -#' \item upper - double - The mz value of the upper bound valley point. +#' \item lower - double - The value of the lower bound valley point. +#' \item upper - double - The value of the upper bound valley point. #' } -compute_mass_boundaries <- function(mz_valley_points, peak_mz) { - lower <- max(mz_valley_points[mz_valley_points < peak_mz]) - upper <- min(mz_valley_points[mz_valley_points > peak_mz]) +compute_boundaries <- function(valley_points, peak) { + lower <- max(valley_points[valley_points < peak]) + upper <- min(valley_points[valley_points > peak]) return(list(lower = lower, upper = upper)) } @@ -430,7 +432,7 @@ compute_rectangle <- function(data_table, this.rec <- matrix(c(Inf, Inf, Inf), nrow = 1) for (peak in mass_range$pks) { # get mass values of valleys the closest to the peak - mass <- compute_mass_boundaries(mass_range$vlys, peak) + mass <- compute_boundaries(mass_range$vlys, peak) that <- features |> dplyr::filter(mz > mass$lower & mz <= mass$upper) |> @@ -461,8 +463,10 @@ compute_rectangle <- function(data_table, this.rec <- rbind(this.rec, curr.rec) } } else { + labels_intensities <- dplyr::select(that.prof, c("labels", "intensities")) |> dplyr::arrange_at("labels") + all <- compute_pks_vlys_rt( - that.prof, + labels_intensities, times, bandwidth, min.bw, @@ -471,11 +475,12 @@ compute_rectangle <- function(data_table, recover.min.count ) - for (pks in all$pks) { + for (peak in all$pks) { curr.rec <- compute_curr_rec_with_enough_peaks( that.mass, - pks, - all, + peak, + all$vlys, + labels_intensities, aver.diff, times, delta_rt From e93a9d3b84b2f0ca949ac3aba3349907dea3ad5d Mon Sep 17 00:00:00 2001 From: wverastegui Date: Fri, 5 Aug 2022 16:15:08 +0200 Subject: [PATCH 20/53] Refactored find.turn.point.R --- R/find.turn.point.R | 44 +++++++++++++++++--------------------------- 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/R/find.turn.point.R b/R/find.turn.point.R index a3bc5471..289aa47f 100644 --- a/R/find.turn.point.R +++ b/R/find.turn.point.R @@ -4,7 +4,7 @@ find_local_maxima <- function(y, ties.method) { # each row is 3 consecutive values in descending order # rows are sorted in ascending order z <- embed(padded_y, dim = 3) - + # reverse the row ordering # first column is equal y z <- z[rev(seq(nrow(z))), ] @@ -41,48 +41,38 @@ msExtrema <- function(y) { #' @export #' @examples #' find.turn.point(y) +library(pastecs) find.turn.point <- function(y) { + y <- y[!is.na(y)] # filter NA values + m <- turnpoints(y) # It gives the index.max peak if (length(unique(y)) == 1) { # if exactly one distinct value middle_index <- round(length(y) / 2) # get mid index start_and_end <- c(1, length(y)) # get first and last index return(list(pks = middle_index, vlys = start_and_end)) } else { - b <- 
msExtrema(y) - - pks <- which(b$index.max) - vlys <- which(b$index.min) - - # pks_mask <- diff(sign(diff(y))) - # vlys_mask <- diff(sign(diff(c(-Inf, -y, -Inf)))) - - # if(anyNA(pks_mask) || anyNA(vlys_mask)) { - # browser() + b <- msExtrema(y) # Boolean list, TRUE for the extreme and peak values + + pks <- which(m$peaks) #which(b$index.max) # gives 1 index -> 258 + #vlys <- which(b$index.min) # gives 2 indices -> 1, 512 + + # These If redefine the vlys, not sure it is correct ! + # if (pks[1] != 1) { # this is TRUE + # vlys <- c(1, vlys) # } - # pks_v2 <- which(pks_mask == -2) + 1 - # vlys_v2 <- which(vlys_mask == -2) - - # if(pks != pks_v2 || vlys != vlys_v2) { - # browser() + # if (pks[length(pks)] != length(y)) { # This is TRUE + # vlys <- c(vlys, length(y)) # } - if (pks[1] != 1) { - vlys <- c(1, vlys) - } - - if (pks[length(pks)] != length(y)) { - vlys <- c(vlys, length(y)) - } - - if (length(pks) == 1) { - vlys <- c(1, length(y)) + if (length(pks) == 1) { #This is TRUE -> final vlys values are assigned here + vlys <- c(1, m$n)#c(1, length(y)) } x <- new("list") x$pks <- pks x$vlys <- vlys - return(x) + return(x) } } From 91547dd4a8294ebb19dc56005f307bd2a3faecec Mon Sep 17 00:00:00 2001 From: hechth Date: Fri, 5 Aug 2022 17:27:16 +0200 Subject: [PATCH 21/53] post merge cleanup --- NAMESPACE | 16 ---------------- R/adaptive.bin.R | 16 ++-------------- R/prof.to.features.R | 4 +++- R/recover.weaker.R | 3 ++- 4 files changed, 7 insertions(+), 32 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 6013872a..ba4395b3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,19 +1,5 @@ # Generated by roxygen2: do not edit by hand -import(doParallel) -import(e1071) -import(foreach) -import(gbm) -import(iterators) -import(MASS) -import(mzR) -import(randomForest) -import(Rcpp) -import(rgl) -import(ROCR) -import(ROCS) -import(snow) -import(splines) S3method(solve,sigma) export(adaptive.bin) export(adjust.time) @@ -22,8 +8,6 @@ export(bigauss.esti) export(bigauss.esti.EM) export(bigauss.mix) export(combine.seq.3) -export(compute_all_times) -export(compute_base_curve) export(compute_boundaries) export(compute_bounds) export(compute_breaks) diff --git a/R/adaptive.bin.R b/R/adaptive.bin.R index 7b33e5f6..99751313 100644 --- a/R/adaptive.bin.R +++ b/R/adaptive.bin.R @@ -32,13 +32,6 @@ compute_breaks <- function(tol, masses, intensi, weighted) { return(breaks) } -#' @export -compute_boundaries <- function(mass.vlys, mass.pks, j){ - mass.lower <- max(mass.vlys[mass.vlys < mass.pks[j]]) - mass.upper <- min(mass.vlys[mass.vlys > mass.pks[j]]) - - return(list(lower = mass.lower, upper = mass.upper)) -} #' @export increment_counter <- function(pointers, that.n){ @@ -134,7 +127,7 @@ adaptive.bin <- function(x, for (j in 1:length(mass.pks)) { # compute boundaries - boundaries <- compute_boundaries(mass.vlys, mass.pks, j) + boundaries <- compute_boundaries(mass.vlys, mass.pks[j]) if (length(mass.pks) == 1){ boundaries$lower <- boundaries$lower - 1 @@ -144,12 +137,7 @@ adaptive.bin <- function(x, that <- this_table |> dplyr::filter(mz > boundaries$lower & mz <= boundaries$upper) if (nrow(that) > 0) { - that <- combine.seq.3(that) - - if (nrow(that) != 1) { - that <- that[order(that[, 1]), ] - } - + that <- combine.seq.3(that) |> dplyr::arrange_at("mz") that.range <- diff(range(that$labels)) if (that.range > 0.5 * time_range & length(that$labels) > that.range * min.pres & length(that$labels) / (that.range / aver.time.range) > min.pres) { diff --git a/R/prof.to.features.R b/R/prof.to.features.R index 3ebaac6c..da3603a6 
100644 --- a/R/prof.to.features.R +++ b/R/prof.to.features.R @@ -642,7 +642,9 @@ prof.to.features <- function(a, if (min.bw >= max.bw) min.bw <- max.bw / 4 base.curve <- sort(unique(a[, 2])) - all.times <- compute_delta_rt(base.curve) + base.curve <- cbind(base.curve, base.curve * 0) + + all.times <- compute_delta_rt(base.curve[, 1]) this.features <- matrix(0, nrow = 1, ncol = 5) colnames(this.features) <- c("mz", "pos", "sd1", "sd2", "area") diff --git a/R/recover.weaker.R b/R/recover.weaker.R index c646e357..441951ee 100644 --- a/R/recover.weaker.R +++ b/R/recover.weaker.R @@ -256,7 +256,7 @@ get_rt_region_indices <- function(retention_time, profile_data, chr_tol) { if (!is.na(retention_time)) { selection <- which(abs(profile_data[, 2] - retention_time) < chr_tol) } else { - selection <- seq_len(profile_data) + selection <- 1 } return(selection) } @@ -368,6 +368,7 @@ compute_curr_rec_with_enough_peaks <- function(that.mass, peak, valleys, labels_ #' \item lower - double - The value of the lower bound valley point. #' \item upper - double - The value of the upper bound valley point. #' } +#' @export compute_boundaries <- function(valley_points, peak) { lower <- max(valley_points[valley_points < peak]) upper <- min(valley_points[valley_points > peak]) From a5f4c211711873cc0bc6ee085b7ae5d1a40c3d5a Mon Sep 17 00:00:00 2001 From: wverastegui Date: Fri, 5 Aug 2022 17:45:55 +0200 Subject: [PATCH 22/53] Refactored find.turn.point.R --- R/find.turn.point.R | 62 +++++++-------------------------------------- 1 file changed, 9 insertions(+), 53 deletions(-) diff --git a/R/find.turn.point.R b/R/find.turn.point.R index 289aa47f..d8bc7641 100644 --- a/R/find.turn.point.R +++ b/R/find.turn.point.R @@ -1,32 +1,3 @@ -find_local_maxima <- function(y, ties.method) { - padded_y <- rev(as.vector(c(-Inf, y, -Inf))) - - # each row is 3 consecutive values in descending order - # rows are sorted in ascending order - z <- embed(padded_y, dim = 3) - - # reverse the row ordering - # first column is equal y - z <- z[rev(seq(nrow(z))), ] - - # row where the max is in the middle is a turn point - v <- max.col(z, ties.method = ties.method) == 2 - - # alt <- diff(sign(diff(y))) == -2 - return(v) -} - -msExtrema <- function(y) { - index1 <- find_local_maxima(y, ties.method = "first") - index2 <- find_local_maxima(-y, ties.method = "last") - - # this is some sort of safety mechanism to protect against numerical issues - index.max <- index1 & !index2 - index.min <- index2 & !index1 - - list(index.max = index.max, index.min = index.min) -} - #' Find peaks and valleys of a curve. #' #' @description @@ -43,33 +14,18 @@ msExtrema <- function(y) { #' find.turn.point(y) library(pastecs) find.turn.point <- function(y) { - - y <- y[!is.na(y)] # filter NA values - m <- turnpoints(y) # It gives the index.max peak - - if (length(unique(y)) == 1) { # if exactly one distinct value - middle_index <- round(length(y) / 2) # get mid index - start_and_end <- c(1, length(y)) # get first and last index + y <- y[!is.na(y)] # filter NA values + if (length(unique(y)) == 1) { # if exactly one distinct value + middle_index <- round(length(y) / 2) # get mid index + start_and_end <- c(1, length(y)) # get first and last index return(list(pks = middle_index, vlys = start_and_end)) } else { - b <- msExtrema(y) # Boolean list, TRUE for the extreme and peak values - - pks <- which(m$peaks) #which(b$index.max) # gives 1 index -> 258 - #vlys <- which(b$index.min) # gives 2 indices -> 1, 512 - - # These If redefine the vlys, not sure it is correct ! 
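# Editor's aside (toy input): pastecs::turnpoints, used above, returns
# logical masks for peaks and pits over the series.
tp <- pastecs::turnpoints(c(1, 3, 2, 5, 4))
which(tp$peaks) # -> 2 4
which(tp$pits)  # -> 3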
- # if (pks[1] != 1) { # this is TRUE - # vlys <- c(1, vlys) - # } - - # if (pks[length(pks)] != length(y)) { # This is TRUE - # vlys <- c(vlys, length(y)) - # } - - if (length(pks) == 1) { #This is TRUE -> final vlys values are assigned here - vlys <- c(1, m$n)#c(1, length(y)) + list_tp <- turnpoints(y) + pks <- which(list_tp$peaks) + vlys <- which(list_tp$pits) + if (length(pks) == 1) { + vlys <- c(1, list_tp$n) } - x <- new("list") x$pks <- pks x$vlys <- vlys From 75b39a1cd280305f0e597ee42a2a6ead217b69bd Mon Sep 17 00:00:00 2001 From: wverastegui Date: Mon, 8 Aug 2022 10:50:48 +0200 Subject: [PATCH 23/53] refactored find turn points with pastecs library --- DESCRIPTION | 4 ++-- R/find.turn.point.R | 22 ++++++++++++---------- conda/environment-dev.yaml | 1 + conda/meta.yaml | 1 + 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2c3a32a3..7a2804fd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -9,7 +9,7 @@ Description: This is a customized fork of the original work from Tianwei Yu. It takes the adaptive processing of LC/MS metabolomics data further with focus on high resolution MS for both LC and GC applications. Depends: R (>= 2.10), MASS, rgl, mzR, splines, doParallel, foreach, - iterators, snow, gbm, e1071, randomForest, ROCR, ROCS, Rcpp, dplyr, tidyr, stringr, tibble + iterators, snow, gbm, e1071, randomForest, ROCR, ROCS, Rcpp, dplyr, tidyr, stringr, tibble, pastecs biocViews: Technology, MassSpectrometry License: GPL(>=2) + file LICENSE LazyLoad: yes @@ -17,4 +17,4 @@ NeedsCompilation: no Suggests: arrow, dataCompareR, testthat (>= 3.0.0) Config/testthat/edition: 3 -RoxygenNote: 7.2.0 +RoxygenNote: 7.2.1 diff --git a/R/find.turn.point.R b/R/find.turn.point.R index d8bc7641..24216e4f 100644 --- a/R/find.turn.point.R +++ b/R/find.turn.point.R @@ -1,3 +1,7 @@ +#' @import pastecs +NULL +#> NULL + #' Find peaks and valleys of a curve. 
#' #' @description @@ -12,7 +16,6 @@ #' @export #' @examples #' find.turn.point(y) -library(pastecs) find.turn.point <- function(y) { y <- y[!is.na(y)] # filter NA values if (length(unique(y)) == 1) { # if exactly one distinct value @@ -20,15 +23,14 @@ find.turn.point <- function(y) { start_and_end <- c(1, length(y)) # get first and last index return(list(pks = middle_index, vlys = start_and_end)) } else { - list_tp <- turnpoints(y) - pks <- which(list_tp$peaks) - vlys <- which(list_tp$pits) - if (length(pks) == 1) { - vlys <- c(1, list_tp$n) + list_tp <- pastecs::turnpoints(y) + peaks <- which(list_tp$peaks) + pits <- which(list_tp$pits) + + if (length(peaks) == 1) { + pits <- c(1, list_tp$n) } - x <- new("list") - x$pks <- pks - x$vlys <- vlys - return(x) + + return(list(pks = peaks, vlys = pits)) } } diff --git a/conda/environment-dev.yaml b/conda/environment-dev.yaml index b82d92a7..714a5645 100644 --- a/conda/environment-dev.yaml +++ b/conda/environment-dev.yaml @@ -33,3 +33,4 @@ dependencies: - r-datacomparer - r-patrick - radian + - r-pastecs diff --git a/conda/meta.yaml b/conda/meta.yaml index 6a57bb75..7304eb81 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -62,6 +62,7 @@ requirements: - r-tidyr - r-stringr - r-tibble + - r-pastecs test: commands: From 0bdeaf3218ef21b4b7f5e2c3d9dc622e7f9c4a10 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 8 Aug 2022 11:09:13 +0200 Subject: [PATCH 24/53] final refactorings --- R/recover.weaker.R | 59 +++++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/R/recover.weaker.R b/R/recover.weaker.R index 441951ee..640e1412 100644 --- a/R/recover.weaker.R +++ b/R/recover.weaker.R @@ -13,16 +13,16 @@ NULL #' @param tolerance Tolerance to use for numeric comparisons. #' @return Returns the same table with duplicate rows removed. 
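# Editor's sketch (hypothetical rows) for the refactored
# duplicate.row.remove below: rows 1 and 2 agree on mz, pos and area
# within the tolerance and collapse into one.
feats <- tibble::tibble(
  mz = c(100.1, 100.1, 200.2),
  pos = c(50.0, 50.0, 60.0),
  sd1 = NA, sd2 = NA,
  area = c(1e5, 1e5, 2e5)
)
duplicate.row.remove(feats)
# -> two rows remain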
#' @export -duplicate.row.remove <- function(new.table, tolerance = 1e-10) { - new.table <- new.table[order(new.table[, 1], new.table[, 2], new.table[, 5]), ] +duplicate.row.remove <- function(features, tolerance = 1e-10) { + new.table <- features |> dplyr::arrange_at(c("mz", "pos", "area")) n <- 1 m <- 2 to.remove <- rep(0, nrow(new.table)) while (m <= nrow(new.table)) { - if (abs(new.table[m, 1] - new.table[n, 1]) < tolerance & - abs(new.table[m, 2] - new.table[n, 2]) < tolerance & - abs(new.table[m, 5] - new.table[n, 5]) < tolerance) { + if (abs(new.table$mz[m] - new.table$mz[n]) < tolerance & + abs(new.table$pos[m] - new.table$pos[n]) < tolerance & + abs(new.table$area[m] - new.table$area[n]) < tolerance) { to.remove[m] <- 1 m <- m + 1 } else { @@ -254,7 +254,7 @@ get_mzrange_bound_indices <- function(aligned_feature_mass, #' @export get_rt_region_indices <- function(retention_time, profile_data, chr_tol) { if (!is.na(retention_time)) { - selection <- which(abs(profile_data[, 2] - retention_time) < chr_tol) + selection <- which(abs(profile_data$labels - retention_time) < chr_tol) } else { selection <- 1 } @@ -435,7 +435,7 @@ compute_rectangle <- function(data_table, mass_range <- compute_peaks_and_valleys(mass.den) mass_range$pks <- mass_range$pks[abs(mass_range$pks - aligned_feature_mass) < custom_mz_tol / 1.5] - this.rec <- matrix(c(Inf, Inf, Inf), nrow = 1) + this.rec <- tibble::tibble(mz = Inf, labels = Inf, intensities = Inf) for (peak in mass_range$pks) { # get mass values of valleys the closest to the peak mass <- compute_boundaries(mass_range$vlys, peak) @@ -466,7 +466,7 @@ compute_rectangle <- function(data_table, aver.diff ) curr.rec[2] <- median(that.prof$labels[thee.sel]) - this.rec <- rbind(this.rec, curr.rec) + this.rec <- tibble::add_row(this.rec, tibble::tibble_row(mz = curr.rec[1], labels = curr.rec[2], intensities = curr.rec[3])) } } else { labels_intensities <- dplyr::select(that.prof, c("labels", "intensities")) |> dplyr::arrange_at("labels") @@ -491,7 +491,8 @@ compute_rectangle <- function(data_table, times, delta_rt ) - this.rec <- rbind(this.rec, curr.rec) + + this.rec <- tibble::add_row(this.rec, tibble::tibble_row(mz = curr.rec[1], labels = curr.rec[2], intensities = curr.rec[3])) } } } @@ -596,7 +597,7 @@ recover.weaker <- function(filename, aligned.ftrs ) - # rounding is used to create a histogram of retention time values + # # rounding is used to create a histogram of retention time values target_times <- compute_target_times( aligned.ftrs[, "rt"], round(extracted_features[, "pos"], 5), @@ -664,25 +665,25 @@ recover.weaker <- function(filename, custom.mz.tol[i] ) - this.pos.diff <- abs(extracted_features[, 2] - this.rec[this.sel, 2]) + this.pos.diff <- abs(extracted_features$pos - this.rec$labels[this.sel]) this.pos.diff <- which(this.pos.diff == min(this.pos.diff))[1] - this.f1 <- rbind(extracted_features, c(this.rec[this.sel, 1], this.rec[this.sel, 2], NA, NA, this.rec[this.sel, 3])) - this.time.adjust <- (-this.f1[this.pos.diff, 2] + adjusted_features[this.pos.diff, 2]) - this.f2 <- rbind( - adjusted_features, - c( - this.rec[this.sel, 1], - this.rec[this.sel, 2] + this.time.adjust, - NA, - NA, - this.rec[this.sel, 3], - grep(sample_name, colnames(aligned.ftrs)), - NA - ) + this.f1 <- extracted_features |> tibble::add_row( + mz = this.rec$mz[this.sel], + pos = this.rec$labels[this.sel], + area = this.rec$intensities[this.sel] ) - sample_intensities[i] <- this.rec[this.sel, 3] - sample_times[i] <- this.rec[this.sel, 2] + this.time.adjust - this.mz[i] <- 
this.rec[this.sel, 1] + this.time.adjust <- (-this.f1$pos[this.pos.diff] + adjusted_features$pos[this.pos.diff]) + + this.f2 <- adjusted_features |> tibble::add_row( + mz = this.rec$mz[this.sel], + pos = this.rec$labels[this.sel] + this.time.adjust, + area = this.rec$intensities[this.sel], + V6 = grep(sample_name, colnames(aligned.ftrs)) + ) + + sample_intensities[i] <- this.rec$intensities[this.sel] + sample_times[i] <- this.rec$labels[this.sel] + this.time.adjust + this.mz[i] <- this.rec$mz[this.sel] } } } @@ -690,8 +691,8 @@ recover.weaker <- function(filename, to.return$this.mz <- this.mz to.return$this.ftrs <- sample_intensities to.return$this.times <- sample_times - to.return$this.f1 <- duplicate.row.remove(extracted_features) - to.return$this.f2 <- duplicate.row.remove(adjusted_features) + to.return$this.f1 <- duplicate.row.remove(this.f1) + to.return$this.f2 <- duplicate.row.remove(this.f2) return(to.return) } From 824fcae30f2180cfc6943c3e64a3550d98384292 Mon Sep 17 00:00:00 2001 From: wverastegui Date: Mon, 8 Aug 2022 11:16:28 +0200 Subject: [PATCH 25/53] Updated changelog file --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e93352f3..f079f73c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,9 +10,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - added tests with realistic testdata for `extract_features.R` [#42](https://github.com/RECETOX/recetox-aplcms/pull/42), [#54](https://github.com/RECETOX/recetox-aplcms/pull/54) - added tests for `feature.align.R` ([#40](https://github.com/RECETOX/recetox-aplcms/pull/40)), and `adjust.time.R` ([#39](https://github.com/RECETOX/recetox-aplcms/pull/40)) - added CI to repository's GitHub Actions [#45](https://github.com/RECETOX/recetox-aplcms/pull/45),[#49](https://github.com/RECETOX/recetox-aplcms/pull/49) +- added pastecs library to dependencies [#91](https://github.com/RECETOX/recetox-aplcms/pull/91) ### Changed - refactored `feature.align.R` [#63](https://github.com/RECETOX/recetox-aplcms/pull/63) - refactored `adjust.time.R` [#64](https://github.com/RECETOX/recetox-aplcms/pull/64) +- refactored `find.tol.time.R` [#91](https://github.com/RECETOX/recetox-aplcms/pull/91) +- refactored `find.turn.point.R` [#91](https://github.com/RECETOX/recetox-aplcms/pull/91) ### Removed ## [0.9.4] - 2022-05-10 From 821bb9a17bb75cf849a0289433ffc0d953f64ddf Mon Sep 17 00:00:00 2001 From: hechth Date: Mon, 8 Aug 2022 11:33:31 +0200 Subject: [PATCH 26/53] updated variable name --- R/recover.weaker.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/recover.weaker.R b/R/recover.weaker.R index 640e1412..d928b85b 100644 --- a/R/recover.weaker.R +++ b/R/recover.weaker.R @@ -500,13 +500,13 @@ compute_rectangle <- function(data_table, return(this.rec) } -refine_selection <- function(this.sel, target_rt, rectangle, aligned_rt, chr_tol, mz_tol) { +refine_selection <- function(this.sel, target_rt, rectangle, aligned_mz, chr_tol, mz_tol) { if (length(this.sel) > 1) { if (!is.na(target_rt)) { - this.d <- (rectangle[, 2] - target_rt)^2 / chr_tol^2 + (rectangle[, 1] - aligned_rt)^2 / mz_tol^2 + this.d <- (rectangle$labels - target_rt)^2 / chr_tol^2 + (rectangle$mz - aligned_mz)^2 / mz_tol^2 this.sel <- which(this.d == min(this.d))[1] } else { - this.d <- abs(rectangle[, 1] - aligned_rt) + this.d <- abs(rectangle$mz - aligned_mz) this.sel <- which(this.d == min(this.d))[1] } } From 8e1f381f49a99ee063128dda3747f814e26bdc7f Mon Sep 17 00:00:00 2001 From: 
hechth Date: Mon, 8 Aug 2022 15:15:52 +0200 Subject: [PATCH 27/53] Finalized documentation --- R/recover.weaker.R | 244 ++++++++++++++++++++++++++++----------------- R/utils.R | 4 + 2 files changed, 156 insertions(+), 92 deletions(-) diff --git a/R/recover.weaker.R b/R/recover.weaker.R index d928b85b..f8754e3d 100644 --- a/R/recover.weaker.R +++ b/R/recover.weaker.R @@ -250,32 +250,37 @@ get_mzrange_bound_indices <- function(aligned_feature_mass, return(list(start = min(all_indices), end = max(all_indices))) } - +#' Get indices where rt in `features` is within `chr_tol` of `target_time`. +#' @param target_time float Target retention time region. +#' @param features tibble Feature table including `labels` column. +#' @param chr_tol float Retention time tolerance. +#' @return vector Indices which are within `chr_tol` from `target_time` or +#' 1 if `target_time` is NA. #' @export -get_rt_region_indices <- function(retention_time, profile_data, chr_tol) { - if (!is.na(retention_time)) { - selection <- which(abs(profile_data$labels - retention_time) < chr_tol) +get_rt_region_indices <- function(target_time, features, chr_tol) { + if (!is.na(target_time)) { + selection <- which(abs(features$labels - target_time) < chr_tol) } else { selection <- 1 } return(selection) } +#' Get peaks and valleys of smoothed rt values in range. +#' +#' @param features tibble Data table with `labels` and `intensities` columns. +#' @param times vector Raw retention time data from raw data file. +#' @param bw float Bandwidth to use for kernel smoothing. +#' @return Returns a list object with the following objects in it: +#' \itemize{ +#' \item pks - vector - The data points at which the density peaks. +#' \item vlys - vector - The points in the data where the density is low +#' (forming a valley in the function). +get_features_in_rt_range <- function(features, times, bw) { + time_curve <- times[times >= min(features$labels) & times <= max(features$labels)] -compute_EIC_area <- function(thee.sel, that.prof, times, delta_rt, aver.diff) { - if (length(thee.sel) > 1) { - that.inte <- interpol.area(that.prof$labels[thee.sel], that.prof$intensities[thee.sel], times, delta_rt) - } else { - that.inte <- that.prof$intensities[thee.sel] * aver.diff - } - return(that.inte) -} - -get_features_in_rt_range <- function(this, times, bw) { - this.curve <- times[times >= min(this$labels) & times <= max(this$labels)] - - this.curve <- cbind(this.curve, this.curve * 0) - this.curve[this.curve[, 1] %in% this$labels, 2] <- this$intensities + this.curve <- cbind(time_curve, time_curve * 0) + this.curve[this.curve[, 1] %in% features$labels, 2] <- features$intensities this.smooth <- ksmooth( this.curve[, 1], @@ -287,73 +292,126 @@ get_features_in_rt_range <- function(this, times, bw) { return(compute_peaks_and_valleys(this.smooth)) } -compute_pks_vlys_rt <- function(this, times, bandwidth, min.bw, max.bw, target_rt, recover.min.count) { - bw <- min(max(bandwidth * (max(this[, "labels"]) - min(this[, "labels"])), min.bw), max.bw) +#' Count the number of peaks in all valleys +#' @description +#' For each peak in ROI, count the peaks between the valley points. +#' @param roi list Named list with vectors `pks` and `vlys`. +#' @param times vector Retention time values +#' @return vector Numbers of peaks within each region defined by a peak and the two valley points. 
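# A small worked sketch (made-up numbers) of the contract documented above,
# assuming compute_boundaries() returns the nearest valleys around each peak:
# roi   <- list(pks = c(5.0, 9.0), vlys = c(-Inf, 7.0, Inf))
# times <- c(4.2, 4.8, 5.1, 8.7, 9.2)
# count_peaks(roi, times)
# # expected c(3, 2): three points fall below the valley at 7.0, two above it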
+count_peaks <- function(roi, times) { + num_peaks <- rep(0, length(roi$pks)) + + for (m in seq_along(roi$pks)) { + boundaries <- compute_boundaries(roi$vlys, roi$pks[m]) + num_peaks[m] <- sum(times >= boundaries$lower & times <= boundaries$upper) + } + return(num_peaks) +} +#' Compute peaks and valleys which have at least `recover_min_count` peaks. +#' +#' @param features tibble Features with `mz`, `labels` and `intensities`. +#' @param times vector Retention time values from the raw data file. +#' @param bandwith flot Bandwidth to use in smoothing. +#' @param target_rt float Retention time at which to recover the intensity. +#' @param recover_min_count int Minimum number of peaks required in the area to recover the signal. +#' @return Returns a list object with the following objects in it: +#' \itemize{ +#' \item pks - vector - The data points at which the density peaks with at least `recover_min_count` peaks between the valley points. +#' \item vlys - vector - The points in the data where the density is low +#' (forming a valley in the function). +compute_pks_vlys_rt <- function(features, times, bandwidth, target_rt, recover_min_count) { roi <- get_features_in_rt_range( - this, + features, times, - bw + bandwidth ) + pks <- roi$pks vlys <- roi$vlys - pks.n <- pks - for (m in 1:length(pks)) - { - boundaries <- compute_boundaries(vlys, pks[m]) - pks.n[m] <- sum(this$labels >= boundaries$lower & this$labels <= boundaries$upper) - } + num_peaks <- count_peaks(roi, features$labels) if (!is.na(target_rt)) { pks.d <- abs(pks - target_rt) # distance from the target peak location - pks.d[pks.n == 0] <- Inf - pks <- pks[which(pks.d == min(pks.d))[1]] + pks.d[num_peaks == 0] <- Inf + pks <- pks[which.min(pks.d)] } else { - pks <- pks[pks.n > recover.min.count] + pks <- pks[num_peaks > recover_min_count] } + return(list(pks = pks, vlys = vlys)) } -compute_mu_sc <- function(this, aver.diff, times, delta_rt) { - x <- this$labels - y <- this$intensities - - if (nrow(this) >= 10) { - miu <- sum(x * y) / sum(y) - sigma <- sqrt(sum(y * (x - miu)^2) / sum(y)) - if (sigma == 0) { - sc <- sum(y) * aver.diff - miu <- miu - } else { - fitted <- dnorm(x, mean = miu, sd = sigma) - this.sel <- y > 0 & fitted / dnorm(miu, mean = miu, sd = sigma) > 1e-2 - sc <- exp(sum(fitted[this.sel]^2 * log(y[this.sel] / fitted[this.sel]) / sum(fitted[this.sel]^2))) - } +#' Compute interpolated retention time and intensity values. +#' +#' @param features tibble Features with `labels` and `intensities` columns. +#' @param aver_diff float Average retention time difference. +#' @return Returns a list object with the following objects in it: +#' \itemize{ +#' \item intensity - float - Interpolated intensity value. +#' \item label - float - Interpolated retention time value. 
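# The estimate documented above, written out on toy data (illustration only):
# x <- c(10, 11, 12, 13, 14)    # retention time labels
# y <- c(1, 4, 9, 4, 1)         # intensities
# miu   <- sum(x * y) / sum(y)  # 12, the intensity-weighted centre
# sigma <- sqrt(sum(y * (x - miu)^2) / sum(y))
# # when sigma > 0, intensities are compared against dnorm(x, miu, sigma) and
# # the area is recovered on the log scale from the well-fitted points only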
+compute_mu_sc <- function(features, aver_diff) { + x <- features$labels + y <- features$intensities + + sum_y <- sum(y) + miu <- sum(x * y) / sum_y # weighted retention time values + sigma <- sqrt(sum(y * (x - miu)^2) / sum_y) + if (sigma == 0) { + sc <- sum_y * aver_diff + miu <- miu } else { - sc <- interpol.area(x, y, times, delta_rt) - miu <- median(x) + fitted <- dnorm(x, mean = miu, sd = sigma) + selection <- y > 0 & fitted / dnorm(miu, mean = miu, sd = sigma) > 1e-2 + sc <- exp(sum(fitted[selection]^2 * log(y[selection] / fitted[selection]) / sum(fitted[selection]^2))) } - return(list(sc = sc, miu = miu)) -} -compute_curr_rec_with_enough_peaks <- function(that.mass, peak, valleys, labels_intensities, aver.diff, times, delta_rt) { - curr.rec <- c(that.mass, NA, NA) + return(list(intensity = sc, label = miu)) +} +#' Compute the rectangle around recovered features given that enough peaks are present. +#' @description +#' +#' @param mz Mz value of the feature. +#' @param peak Peak around which to detect the new feature. +#' @param valleys Valley points to compute the boundary region. +#' @param features tibble Tibble with `labels` and `intensities` column. +#' @param aver_diff float Average retention time difference. +#' @param times vector Raw retention time values from raw data file. +#' @param delta_rt vector Differences between consecutive retention time values (diff(times)). +#' @return list Triplet of mz, label and intensity for the feature. +compute_curr_rec_with_enough_peaks <- function(mz, + peak, + valleys, + features, + aver_diff, + times, + delta_rt) { # same filtering of peaks as in compute_pks_vlyws and as above boundaries <- compute_boundaries(valleys, peak) - this <- labels_intensities |> dplyr::filter(labels >= boundaries$lower & labels <= boundaries$upper) + subset <- features |> + dplyr::filter(labels >= boundaries$lower & labels <= boundaries$upper) - if (nrow(this) == 1) { - curr.rec[3] <- this$intensities * aver.diff - curr.rec[2] <- this$labels + if (nrow(subset) == 1) { + intensity <- subset$intensities * aver_diff + label <- subset$labels + } else if (nrow(subset) >= 10) { + res <- compute_mu_sc(subset, aver_diff) + intensity <- res$intensity + label <- res$label } else { - res <- compute_mu_sc(this, aver.diff, times, delta_rt) - curr.rec[3] <- res$sc - curr.rec[2] <- res$miu + intensity <- interpol.area( + subset$labels, + subset$intensities, + times, + delta_rt + ) + label <- median(subset$labels) } - return(curr.rec) + + return(c(mz, label, intensity)) } #' Compute bounds of area using given peak and mass valley points. @@ -385,7 +443,7 @@ compute_boundaries <- function(valley_points, peak) { #' @return Returns a list object with the following objects in it: #' \itemize{ #' \item pks - vector - The data points at which the density peaks. -#' \item vlys - vector - The points in the data where the density is low +#' \item vlys - vector - The points in the data where the density is low #' (forming a valley in the function). 
#' }
 compute_peaks_and_valleys <- function(dens) {
@@ -412,7 +470,6 @@ compute_rectangle <- function(data_table,
                               bandwidth,
                               min.bw,
                               max.bw) {
-
   bounds <- get_mzrange_bound_indices(
     aligned_feature_mass,
     data_table$mz,
@@ -458,25 +515,22 @@ compute_rectangle <- function(data_table,
     )
 
     if (length(thee.sel) > recover.min.count) {
-      curr.rec[3] <- compute_EIC_area(
-        thee.sel,
-        that.prof,
-        times,
-        delta_rt,
-        aver.diff
-      )
+      if (length(thee.sel) > 1) {
+        curr.rec[3] <- interpol.area(that.prof$labels[thee.sel], that.prof$intensities[thee.sel], times, delta_rt)
+      } else {
+        curr.rec[3] <- that.prof$intensities[thee.sel] * aver.diff
+      }
       curr.rec[2] <- median(that.prof$labels[thee.sel])
       this.rec <- tibble::add_row(this.rec, tibble::tibble_row(mz = curr.rec[1], labels = curr.rec[2], intensities = curr.rec[3]))
     }
   } else {
     labels_intensities <- dplyr::select(that.prof, c("labels", "intensities")) |> dplyr::arrange_at("labels")
+    bw <- min(max(bandwidth * (span(labels_intensities$labels)), min.bw), max.bw)
     all <- compute_pks_vlys_rt(
       labels_intensities,
       times,
-      bandwidth,
-      min.bw,
-      max.bw,
+      bw,
       target_rt,
       recover.min.count
     )
@@ -500,16 +554,21 @@ compute_rectangle <- function(data_table,
   return(this.rec)
 }
 
-refine_selection <- function(this.sel, target_rt, rectangle, aligned_mz, chr_tol, mz_tol) {
-  if (length(this.sel) > 1) {
-    if (!is.na(target_rt)) {
-      this.d <- (rectangle$labels - target_rt)^2 / chr_tol^2 + (rectangle$mz - aligned_mz)^2 / mz_tol^2
-      this.sel <- which(this.d == min(this.d))[1]
-    } else {
-      this.d <- abs(rectangle$mz - aligned_mz)
-      this.sel <- which(this.d == min(this.d))[1]
-    }
-  }
+#' Refine the selection based on mz and rt differences.
+#' @param target_rt float Target retention time value.
+#' @param rectangle tibble Features with columns `labels` and `mz`.
+#' @param aligned_mz float Mz value in the aligned feature table of the
+#' feature to be recovered.
+#' @param chr_tol float Retention time tolerance.
+#' @param mz_tol float Mz tolerance to use.
+#' @return int Index of value in rectangle closest to `target_rt` and `aligned_mz`.
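# A numeric sketch (invented tolerances) of the selection rule defined next:
# rectangle <- tibble::tibble(mz = c(100.001, 100.003), labels = c(30.1, 29.5))
# (rectangle$labels - 30.0)^2 / 1.5^2 + (rectangle$mz - 100.0)^2 / 0.005^2
# # row 1 scores ~0.044, row 2 ~0.47, so which.min() picks row 1;
# # without a target rt, only the absolute mz distance is used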
+refine_selection <- function(target_rt, rectangle, aligned_mz, chr_tol, mz_tol) { + if (!is.na(target_rt)) { + this.d <- (rectangle$labels - target_rt)^2 / chr_tol^2 + (rectangle$mz - aligned_mz)^2 / mz_tol^2 + } else { + this.d <- abs(rectangle$mz - aligned_mz) } + this.sel <- which.min(this.d) return(this.sel) } @@ -577,8 +636,8 @@ recover.weaker <- function(filename, # Initialize parameters with default values if (is.na(mz.range)) mz.range <- 1.5 * align.mz.tol if (is.na(chr.range)) chr.range <- align.chr.tol / 2 - if (is.na(min.bw)) min.bw <- diff(range(times, na.rm = TRUE)) / 60 - if (is.na(max.bw)) max.bw <- diff(range(times, na.rm = TRUE)) / 15 + if (is.na(min.bw)) min.bw <- span(times) / 60 + if (is.na(max.bw)) max.bw <- span(times) / 15 if (min.bw >= max.bw) min.bw <- max.bw / 4 @@ -656,14 +715,15 @@ recover.weaker <- function(filename, this.sel <- this.sel[this.sel != 1] if (length(this.sel) > 0) { - this.sel <- refine_selection( - this.sel, - target_times[i], - this.rec, - aligned.ftrs[i, 1], - custom.chr.tol[i], - custom.mz.tol[i] - ) + if (length(this.sel) > 1) { + this.sel <- refine_selection( + target_times[i], + this.rec, + aligned.ftrs[i, 1], + custom.chr.tol[i], + custom.mz.tol[i] + ) + } this.pos.diff <- abs(extracted_features$pos - this.rec$labels[this.sel]) this.pos.diff <- which(this.pos.diff == min(this.pos.diff))[1] @@ -673,7 +733,7 @@ recover.weaker <- function(filename, area = this.rec$intensities[this.sel] ) this.time.adjust <- (-this.f1$pos[this.pos.diff] + adjusted_features$pos[this.pos.diff]) - + this.f2 <- adjusted_features |> tibble::add_row( mz = this.rec$mz[this.sel], pos = this.rec$labels[this.sel] + this.time.adjust, diff --git a/R/utils.R b/R/utils.R index 5a62f226..f329cbe9 100644 --- a/R/utils.R +++ b/R/utils.R @@ -89,3 +89,7 @@ create_feature_sample_table <- function(features) { ) return(table) } + +span <- function(x) { + diff(range(x, na.rm = TRUE)) +} \ No newline at end of file From e5cde4255fab3ec7e0e60ba2cb27ba9b6fb0912c Mon Sep 17 00:00:00 2001 From: root Date: Mon, 8 Aug 2022 16:03:43 +0200 Subject: [PATCH 28/53] Added final documentation sections. --- R/recover.weaker.R | 51 ++++++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/R/recover.weaker.R b/R/recover.weaker.R index f8754e3d..fbcdc54d 100644 --- a/R/recover.weaker.R +++ b/R/recover.weaker.R @@ -312,7 +312,7 @@ count_peaks <- function(roi, times) { #' #' @param features tibble Features with `mz`, `labels` and `intensities`. #' @param times vector Retention time values from the raw data file. -#' @param bandwith flot Bandwidth to use in smoothing. +#' @param bandwidth float Bandwidth to use in smoothing. #' @param target_rt float Retention time at which to recover the intensity. #' @param recover_min_count int Minimum number of peaks required in the area to recover the signal. #' @return Returns a list object with the following objects in it: @@ -454,27 +454,44 @@ compute_peaks_and_valleys <- function(dens) { return(list(pks = pks, vlys = vlys)) } - +#' Compute rectangle around feature with `aligned_feature_mz` and `target_rt` for recovery. +#' +#' @param data_table tibble Feature table with `mz`, `labels` and `intensities` column. +#' @param aligned_feature_mz float Mz value of feature in aligned feature table. +#' @param breaks vector Integer boundaries of clusters in mz values. +#' @param custom_mz_tol float Custom mz tolerance for the feature. +#' @param orig_mz_tol float Flat original mz tolerance to use. 
+#' @param use_intensity_weighting bool Whether to use intensity weighting. +#' @param recover_min_count int Minimum number of peaks required in the area to recover the signal. +#' @param target_rt float Target retention time value. +#' @param custom_chr_tol float Custom chromatographic tolerance to use. +#' @param times vector Raw retention time values from raw data file. +#' @param delta_rt vector Differences between consecutive retention time values (diff(times)). +#' @param aver_diff float Average retention time difference. +#' @param bandwidth float Bandwidth to use in smoothing. +#' @param min.bw float Minimum bandwidth to use. +#' @param max.bw float Maximum bandwidth to use. +#' @return tibble Tibble with `mz`, `labels` and `intensities` columns. compute_rectangle <- function(data_table, - aligned_feature_mass, + aligned_feature_mz, breaks, custom_mz_tol, - orig.tol, - intensity.weighted, - recover.min.count, + orig_mz_tol, + use_intensity_weighting, + recover_min_count, target_rt, custom_chr_tol, times, delta_rt, - aver.diff, + aver_diff, bandwidth, min.bw, max.bw) { bounds <- get_mzrange_bound_indices( - aligned_feature_mass, + aligned_feature_mz, data_table$mz, breaks, - orig.tol + orig_mz_tol ) features <- dplyr::slice( @@ -484,13 +501,13 @@ compute_rectangle <- function(data_table, mass.den <- compute_mass_density( features, - bandwidth = 0.5 * orig.tol * aligned_feature_mass, - intensity_weighted = intensity.weighted + bandwidth = 0.5 * orig_mz_tol * aligned_feature_mz, + intensity_weighted = use_intensity_weighting ) # find peaks in mz range in raw data mass_range <- compute_peaks_and_valleys(mass.den) - mass_range$pks <- mass_range$pks[abs(mass_range$pks - aligned_feature_mass) < custom_mz_tol / 1.5] + mass_range$pks <- mass_range$pks[abs(mass_range$pks - aligned_feature_mz) < custom_mz_tol / 1.5] this.rec <- tibble::tibble(mz = Inf, labels = Inf, intensities = Inf) for (peak in mass_range$pks) { @@ -502,7 +519,7 @@ compute_rectangle <- function(data_table, dplyr::arrange_at("labels") # get values in RT region of interest? 
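# (only mz clusters with more than `recover_min_count` raw points are kept;
# they are summarised by combine.seq.3 before the rt window is inspected)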
- if (nrow(that) > recover.min.count) { + if (nrow(that) > recover_min_count) { that.prof <- combine.seq.3(that) that.mass <- sum(that.prof$mz * that.prof$intensities) / sum(that.prof$intensities) curr.rec <- c(that.mass, NA, NA) @@ -514,11 +531,11 @@ compute_rectangle <- function(data_table, custom_chr_tol ) - if (length(thee.sel) > recover.min.count) { + if (length(thee.sel) > recover_min_count) { if (length(thee.sel) > 1) { curr.rec[3] <- interpol.area(that.prof$labels[thee.sel], that.prof$intensities[thee.sel], times, delta_rt) } else { - curr.rec[3] <- that.prof$intensities[thee.sel] * aver.diff + curr.rec[3] <- that.prof$intensities[thee.sel] * aver_diff } curr.rec[2] <- median(that.prof$labels[thee.sel]) this.rec <- tibble::add_row(this.rec, tibble::tibble_row(mz = curr.rec[1], labels = curr.rec[2], intensities = curr.rec[3])) @@ -532,7 +549,7 @@ compute_rectangle <- function(data_table, times, bw, target_rt, - recover.min.count + recover_min_count ) for (peak in all$pks) { @@ -541,7 +558,7 @@ compute_rectangle <- function(data_table, peak, all$vlys, labels_intensities, - aver.diff, + aver_diff, times, delta_rt ) From 3788061f73fb533c295c5c6792abd25ecf304d1b Mon Sep 17 00:00:00 2001 From: hechth Date: Mon, 8 Aug 2022 16:27:06 +0200 Subject: [PATCH 29/53] tiny refactor --- R/recover.weaker.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/recover.weaker.R b/R/recover.weaker.R index fbcdc54d..f7b1c679 100644 --- a/R/recover.weaker.R +++ b/R/recover.weaker.R @@ -742,8 +742,7 @@ recover.weaker <- function(filename, ) } - this.pos.diff <- abs(extracted_features$pos - this.rec$labels[this.sel]) - this.pos.diff <- which(this.pos.diff == min(this.pos.diff))[1] + this.pos.diff <- which.min(abs(extracted_features$pos - this.rec$labels[this.sel])) this.f1 <- extracted_features |> tibble::add_row( mz = this.rec$mz[this.sel], pos = this.rec$labels[this.sel], From 9c3df649fe4ee31d5fb062e4ccd9c44594c4030a Mon Sep 17 00:00:00 2001 From: hechth Date: Tue, 9 Aug 2022 09:09:39 +0200 Subject: [PATCH 30/53] reformatting --- R/recover.weaker.R | 56 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 13 deletions(-) diff --git a/R/recover.weaker.R b/R/recover.weaker.R index f7b1c679..5714bc46 100644 --- a/R/recover.weaker.R +++ b/R/recover.weaker.R @@ -344,7 +344,7 @@ compute_pks_vlys_rt <- function(features, times, bandwidth, target_rt, recover_m } #' Compute interpolated retention time and intensity values. -#' +#' #' @param features tibble Features with `labels` and `intensities` columns. #' @param aver_diff float Average retention time difference. #' @return Returns a list object with the following objects in it: @@ -455,7 +455,7 @@ compute_peaks_and_valleys <- function(dens) { } #' Compute rectangle around feature with `aligned_feature_mz` and `target_rt` for recovery. -#' +#' #' @param data_table tibble Feature table with `mz`, `labels` and `intensities` column. #' @param aligned_feature_mz float Mz value of feature in aligned feature table. #' @param breaks vector Integer boundaries of clusters in mz values. 
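# A hedged call sketch for compute_rectangle() with invented argument values
# (the real call site is in recover.weaker(), shown further below):
# rec <- compute_rectangle(
#   data_table, aligned_feature_mz = 100.05, breaks = breaks,
#   custom_mz_tol = 0.01, orig_mz_tol = 1e-05,
#   use_intensity_weighting = FALSE, recover_min_count = 3,
#   target_rt = 30.0, custom_chr_tol = 1.5,
#   times = times, delta_rt = diff(times), aver_diff = mean(diff(times)),
#   bandwidth = 0.5, min.bw = 1.0, max.bw = 5.0
# )
# # rec: tibble of candidate (mz, labels, intensities) rows, seeded with Inf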
@@ -507,7 +507,8 @@ compute_rectangle <- function(data_table, # find peaks in mz range in raw data mass_range <- compute_peaks_and_valleys(mass.den) - mass_range$pks <- mass_range$pks[abs(mass_range$pks - aligned_feature_mz) < custom_mz_tol / 1.5] + pks_in_tol <- abs(mass_range$pks - aligned_feature_mz) < custom_mz_tol / 1.5 + mass_range$pks <- mass_range$pks[pks_in_tol] this.rec <- tibble::tibble(mz = Inf, labels = Inf, intensities = Inf) for (peak in mass_range$pks) { @@ -533,15 +534,30 @@ compute_rectangle <- function(data_table, if (length(thee.sel) > recover_min_count) { if (length(thee.sel) > 1) { - curr.rec[3] <- interpol.area(that.prof$labels[thee.sel], that.prof$intensities[thee.sel], times, delta_rt) + curr.rec[3] <- interpol.area( + that.prof$labels[thee.sel], + that.prof$intensities[thee.sel], + times, + delta_rt + ) } else { curr.rec[3] <- that.prof$intensities[thee.sel] * aver_diff } curr.rec[2] <- median(that.prof$labels[thee.sel]) - this.rec <- tibble::add_row(this.rec, tibble::tibble_row(mz = curr.rec[1], labels = curr.rec[2], intensities = curr.rec[3])) + this.rec <- tibble::add_row( + this.rec, + tibble::tibble_row( + mz = curr.rec[1], + labels = curr.rec[2], + intensities = curr.rec[3] + ) + ) } } else { - labels_intensities <- dplyr::select(that.prof, c("labels", "intensities")) |> dplyr::arrange_at("labels") + labels_intensities <- dplyr::select( + that.prof, + c("labels", "intensities") + ) |> dplyr::arrange_at("labels") bw <- min(max(bandwidth * (span(labels_intensities$labels)), min.bw), max.bw) all <- compute_pks_vlys_rt( @@ -563,7 +579,14 @@ compute_rectangle <- function(data_table, delta_rt ) - this.rec <- tibble::add_row(this.rec, tibble::tibble_row(mz = curr.rec[1], labels = curr.rec[2], intensities = curr.rec[3])) + this.rec <- tibble::add_row( + this.rec, + tibble::tibble_row( + mz = curr.rec[1], + labels = curr.rec[2], + intensities = curr.rec[3] + ) + ) } } } @@ -581,7 +604,9 @@ compute_rectangle <- function(data_table, #' @return int Index of value in rectable closest to `target_rt` and `aligned_mz`. refine_selection <- function(target_rt, rectangle, aligned_mz, chr_tol, mz_tol) { if (!is.na(target_rt)) { - this.d <- (rectangle$labels - target_rt)^2 / chr_tol^2 + (rectangle$mz - aligned_mz)^2 / mz_tol^2 + rt_term <- (rectangle$labels - target_rt)^2 / chr_tol^2 + mz_term <- (rectangle$mz - aligned_mz)^2 / mz_tol^2 + this.d <- rt_term + mz_term } else { this.d <- abs(rectangle$mz - aligned_mz) } @@ -602,14 +627,19 @@ refine_selection <- function(target_rt, rectangle, aligned_mz, chr_tol, mz_tol) #' @param align.mz.tol the m/z tolerance used in the alignment. #' @param align.chr.tol the elution time tolerance in the alignment. #' @param extracted_features The matrix which is the output from proc.to.feature(). -#' @param adjusted_features The matrix which is the output from proc.to.feature(). The retention time in this object have been adjusted by the function adjust.time(). -#' @param mz.range The m/z around the feature m/z to search for observations. The default value is NA, in which case 1.5 times the m/z tolerance in the aligned object will be used. -#' @param chr.range The retention time around the feature retention time to search for observations. The default value is NA, in which case 0.5 times the retention time tolerance in the aligned object will be used. -#' @param use.observed.range If the value is TRUE, the actual range of the observed locations of the feature in all the spectra will be used. 
+#' @param adjusted_features The matrix which is the output from proc.to.feature(). +#' The retention time in this object have been adjusted by the function adjust.time(). +#' @param mz.range The m/z around the feature m/z to search for observations. +#' The default value is NA, in which case 1.5 times the m/z tolerance in the aligned object will be used. +#' @param chr.range The retention time around the feature retention time to search for observations. +#' The default value is NA, in which case 0.5 times the retention time tolerance in the aligned object will be used. +#' @param use.observed.range If the value is TRUE, the actual range of the observed locations +#' of the feature in all the spectra will be used. #' @param orig.tol The mz.tol parameter provided to the proc.cdf() function. This helps retrieve the intermediate file. #' @param min.bw The minimum bandwidth to use in the kernel smoother. #' @param max.bw The maximum bandwidth to use in the kernel smoother. -#' @param bandwidth A value between zero and one. Multiplying this value to the length of the signal along the time axis helps determine the bandwidth in the kernel smoother used for peak identification. +#' @param bandwidth A value between zero and one. Multiplying this value to the length of the signal along the +#' time axis helps determine the bandwidth in the kernel smoother used for peak identification. #' @param recover.min.count minimum number of raw data points to support a recovery. #' @param intensity.weighted Whether to use intensity to weight mass density estimation. #' @return Returns a list object with the following objects in it: From 01ed54ac9e9e66efd26344f1d6d7253e05eade13 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 10 Aug 2022 10:39:55 +0200 Subject: [PATCH 31/53] Changed test case to actually check the recovered tables --- tests/testthat/test-recover-weaker.R | 47 +++++++--------------------- 1 file changed, 12 insertions(+), 35 deletions(-) diff --git a/tests/testthat/test-recover-weaker.R b/tests/testthat/test-recover-weaker.R index 8c557fa6..016d9e36 100644 --- a/tests/testthat/test-recover-weaker.R +++ b/tests/testthat/test-recover-weaker.R @@ -90,20 +90,20 @@ patrick::with_parameters_test_that( corrected_recovered_expected <- lapply(filenames, arrow::read_parquet) # preprocess dataframes - keys <- c("mz", "pos", "sd1", "sd2") + keys <- c("mz", "pos", "sd1", "sd2", "area") extracted_recovered_actual <- lapply(extracted_recovered_actual, function(x) { - as.data.frame(x) |> dplyr::arrange_at(keys) + x |> dplyr::arrange_at(keys) }) corrected_recovered_actual <- lapply(corrected_recovered_actual, function(x) { - as.data.frame(x) |> dplyr::arrange_at(keys) + x |> dplyr::arrange_at(keys) }) extracted_recovered_expected <- lapply(extracted_recovered_expected, function(x) { - as.data.frame(x) |> dplyr::arrange_at(keys) + x |> dplyr::arrange_at(keys) }) corrected_recovered_expected <- lapply(corrected_recovered_expected, function(x) { - as.data.frame(x) |> dplyr::arrange_at(keys) + x |> dplyr::arrange_at(keys) }) # compare files @@ -112,33 +112,19 @@ patrick::with_parameters_test_that( actual_extracted_i <- extracted_recovered_actual[[i]] expected_extracted_i <- extracted_recovered_expected[[i]] - report <- dataCompareR::rCompare(actual_extracted_i, expected_extracted_i, keys = keys, roundDigits = 4, mismatches = 100000) - dataCompareR::saveReport(report, reportName = files[[i]], showInViewer = FALSE, HTMLReport = FALSE, mismatchCount = 10000) + expect_equal(actual_extracted_i, expected_extracted_i) - 
expect_true(nrow(report$rowMatching$inboth) >= 0.9 * nrow(expected_extracted_i)) - - incommon <- as.numeric(rownames(report$rowMatching$inboth)) - - subset_actual <- actual_extracted_i %>% dplyr::slice(incommon) - subset_expected <- expected_extracted_i %>% dplyr::slice(incommon) - - expect_equal(subset_actual$area, subset_expected$area, tolerance = 0.01 * max(subset_expected$area)) + # report <- dataCompareR::rCompare(actual_extracted_i, expected_extracted_i, keys = keys, roundDigits = 4, mismatches = 100000) + # dataCompareR::saveReport(report, reportName = paste0(files[[i]],"_extracted"), showInViewer = FALSE, HTMLReport = FALSE, mismatchCount = 10000) # corrected recovered actual_corrected_i <- corrected_recovered_actual[[i]] expected_corrected_i <- corrected_recovered_expected[[i]] - report <- dataCompareR::rCompare(actual_corrected_i, expected_corrected_i, keys = keys, roundDigits = 4, mismatches = 100000) - dataCompareR::saveReport(report, reportName = files[[i]], showInViewer = FALSE, HTMLReport = FALSE, mismatchCount = 10000) - - expect_true(nrow(report$rowMatching$inboth) >= 0.9 * nrow(expected_corrected_i)) - - incommon <- as.numeric(rownames(report$rowMatching$inboth)) - - subset_actual <- actual_corrected_i %>% dplyr::slice(incommon) - subset_expected <- expected_corrected_i %>% dplyr::slice(incommon) + expect_equal(actual_corrected_i, expected_corrected_i) - expect_equal(subset_actual$area, subset_expected$area, tolerance = 0.01 * max(subset_expected$area)) + # report <- dataCompareR::rCompare(actual_corrected_i, expected_corrected_i, keys = keys, roundDigits = 4, mismatches = 100000) + # dataCompareR::saveReport(report, reportName = paste0(files[[i]],"_adjusted"), showInViewer = FALSE, HTMLReport = FALSE, mismatchCount = 10000) } }, patrick::cases( @@ -153,13 +139,4 @@ patrick::with_parameters_test_that( recover_min_count = 3 ) ) -) - -files = c("RCX_06_shortened", "RCX_07_shortened", "RCX_08_shortened") -mz_tol = 1e-05 -recover_mz_range = NA -recover_chr_range = NA -use_observed_range = TRUE -min_bandwidth = NA -max_bandwidth = NA -recover_min_count = 3 \ No newline at end of file +) \ No newline at end of file From 8beafb401888c82b756cdf58e8de28bb78a89750 Mon Sep 17 00:00:00 2001 From: hechth Date: Wed, 10 Aug 2022 10:40:38 +0200 Subject: [PATCH 32/53] fixed typo --- R/hybrid.R | 2 +- R/unsupervised.R | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/hybrid.R b/R/hybrid.R index fd6de2fc..9be0aa3d 100644 --- a/R/hybrid.R +++ b/R/hybrid.R @@ -347,7 +347,7 @@ hybrid <- function( corrected_features = recovered_corrected, aligned_feature_sample_table = aligned_feature_sample_table, recovered_feature_sample_table = recovered_feature_sample_table, - aligned_mz_toletance = as.numeric(recovered_aligned$mz_tolerance), + aligned_mz_tolerance = as.numeric(recovered_aligned$mz_tolerance), aligned_rt_tolerance = as.numeric(recovered_aligned$rt_tolerance), updated_known_table = as.data.frame(augmented$known_table), features_known_table_pairing = as.data.frame(augmented$pairing) diff --git a/R/unsupervised.R b/R/unsupervised.R index e1e8ff7e..c28e6a26 100644 --- a/R/unsupervised.R +++ b/R/unsupervised.R @@ -89,8 +89,8 @@ recover_weaker_signals <- function( recover.weaker( sample_name = get_sample_name(filenames[i]), filename = filenames[[i]], - extracted_features = extracted_features[[i]], - adjusted_features = corrected_features[[i]], + extracted_features = as_tibble(extracted_features[[i]]), + adjusted_features = as_tibble(corrected_features[[i]]), pk.times = 
aligned_rt_crosstab, aligned.ftrs = aligned_int_crosstab, orig.tol = original_mz_tolerance, @@ -287,7 +287,7 @@ unsupervised <- function( corrected_features = recovered$corrected_features, aligned_feature_sample_table = aligned_feature_sample_table, recovered_feature_sample_table = recovered_feature_sample_table, - aligned_mz_toletance = as.numeric(aligned$mz_tolerance), + aligned_mz_tolerance = as.numeric(aligned$mz_tolerance), aligned_rt_tolerance = as.numeric(aligned$rt_tolerance) ) } From 4e91d2d27f60e6c1ab5463e274d7790d9878d2df Mon Sep 17 00:00:00 2001 From: hechth Date: Wed, 10 Aug 2022 10:41:00 +0200 Subject: [PATCH 33/53] Adapted to tibble and properly added rows to dataframes --- R/recover.weaker.R | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/R/recover.weaker.R b/R/recover.weaker.R index 5714bc46..2e954a67 100644 --- a/R/recover.weaker.R +++ b/R/recover.weaker.R @@ -258,7 +258,7 @@ get_mzrange_bound_indices <- function(aligned_feature_mass, #' 1 if `target_time` is NA. #' @export get_rt_region_indices <- function(target_time, features, chr_tol) { - if (!is.na(target_time)) { + if (!is.null(target_time) && !is.na(target_time)) { selection <- which(abs(features$labels - target_time) < chr_tol) } else { selection <- 1 @@ -332,7 +332,7 @@ compute_pks_vlys_rt <- function(features, times, bandwidth, target_rt, recover_m num_peaks <- count_peaks(roi, features$labels) - if (!is.na(target_rt)) { + if (!is.null (target_rt) && !is.na(target_rt)) { pks.d <- abs(pks - target_rt) # distance from the target peak location pks.d[num_peaks == 0] <- Inf pks <- pks[which.min(pks.d)] @@ -706,18 +706,19 @@ recover.weaker <- function(filename, # # rounding is used to create a histogram of retention time values target_times <- compute_target_times( aligned.ftrs[, "rt"], - round(extracted_features[, "pos"], 5), - round(adjusted_features[, "pos"], 5) + round(extracted_features$pos, 5), + round(adjusted_features$pos, 5) ) # IMPORTANT: THIS CODE SECTION COULD BE USED TO REPLACE COMPUTE_TARGET_TIMES FOR THE TEST CASES AND # IS A MASSIVE SIMPLIFICATION. 
# sp <- splines::interpSpline( - # unique(extracted_features[, "pos"]) ~ unique(adjusted_features[, "pos"]), + # unique(extracted_features$pos) ~ unique(adjusted_features$pos), # na.action = na.omit # ) # target_times <- predict(sp, aligned.ftrs[, "rt"])$y + breaks <- predict_mz_break_indices(data_table, orig.tol) this.mz <- rep(NA, length(sample_intensities)) @@ -773,18 +774,19 @@ recover.weaker <- function(filename, } this.pos.diff <- which.min(abs(extracted_features$pos - this.rec$labels[this.sel])) - this.f1 <- extracted_features |> tibble::add_row( + extracted_features <- extracted_features |> tibble::add_row( mz = this.rec$mz[this.sel], pos = this.rec$labels[this.sel], area = this.rec$intensities[this.sel] ) - this.time.adjust <- (-this.f1$pos[this.pos.diff] + adjusted_features$pos[this.pos.diff]) + + this.time.adjust <- (-extracted_features$pos[this.pos.diff] + adjusted_features$pos[this.pos.diff]) - this.f2 <- adjusted_features |> tibble::add_row( + adjusted_features <- adjusted_features |> tibble::add_row( mz = this.rec$mz[this.sel], pos = this.rec$labels[this.sel] + this.time.adjust, area = this.rec$intensities[this.sel], - V6 = grep(sample_name, colnames(aligned.ftrs)) + V6 = grep(sample_name, colnames(aligned.ftrs)) - 4 # offset for other columns `mz`, `rt` etc ) sample_intensities[i] <- this.rec$intensities[this.sel] @@ -797,8 +799,8 @@ recover.weaker <- function(filename, to.return$this.mz <- this.mz to.return$this.ftrs <- sample_intensities to.return$this.times <- sample_times - to.return$this.f1 <- duplicate.row.remove(this.f1) - to.return$this.f2 <- duplicate.row.remove(this.f2) + to.return$this.f1 <- duplicate.row.remove(extracted_features) + to.return$this.f2 <- duplicate.row.remove(adjusted_features) return(to.return) } From 5db5d2fccd676c6dc6735384d2f37a0cbdd09039 Mon Sep 17 00:00:00 2001 From: hechth Date: Wed, 10 Aug 2022 10:41:23 +0200 Subject: [PATCH 34/53] extracted function to compute cores and starting on fixing hybrid test case --- tests/testthat/test-hybrid.R | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/tests/testthat/test-hybrid.R b/tests/testthat/test-hybrid.R index d3049536..e68e93dd 100644 --- a/tests/testthat/test-hybrid.R +++ b/tests/testthat/test-hybrid.R @@ -1,11 +1,4 @@ -test_that("basic hybrid test", { - test_files <- c('../testdata/mbr_test0.mzml', - '../testdata/mbr_test1.mzml', - '../testdata/mbr_test2.mzml') - - expected <- arrow::read_parquet('../testdata/hybrid_recovered_feature_sample_table.parquet') - known_table <- arrow::read_parquet('../testdata/known_table.parquet') - +get_num_workers <- function() { # CRAN limits the number of cores available to packages to 2 # source https://stackoverflow.com/questions/50571325/r-cran-check-fail-when-using-parallel-functions#50571533 chk <- Sys.getenv("_R_CHECK_LIMIT_CORES_", "") @@ -17,8 +10,21 @@ test_that("basic hybrid test", { # use all cores in devtools::test() num_workers <- parallel::detectCores() } + return(num_workers) +} + +test_that("basic hybrid test", { + test_files <- c('../testdata/mbr_test0.mzml', + '../testdata/mbr_test1.mzml', + '../testdata/mbr_test2.mzml') + + expected <- arrow::read_parquet('../testdata/hybrid_recovered_feature_sample_table.parquet') + known_table <- arrow::read_parquet('../testdata/known_table.parquet') + + actual <- hybrid(test_files, known_table, cluster = get_num_workers()) - result <- hybrid(test_files, known_table, cluster = num_workers) + actual$recovered_feature_sample_table <- 
actual$recovered_feature_sample_table |> dplyr::arrange_all() + expected <- expected |> dplyr::arrange_all() - expect_equal(result$recovered_feature_sample_table, expected) + expect_equal(actual$recovered_feature_sample_table, expected) }) From d11743bf2cf748ada800c31ec99e7735abbc2baf Mon Sep 17 00:00:00 2001 From: hechth Date: Wed, 10 Aug 2022 10:41:59 +0200 Subject: [PATCH 35/53] temporarily disabled two step hybrid --- tests/testthat/test-two-step-hybrid.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-two-step-hybrid.R b/tests/testthat/test-two-step-hybrid.R index 1258c2a1..e6ea6e33 100644 --- a/tests/testthat/test-two-step-hybrid.R +++ b/tests/testthat/test-two-step-hybrid.R @@ -1,4 +1,5 @@ test_that("basic two-step hybrid test", { + skip("Disabled") skip_on_ci() test_names <- c( "mbr_test0.mzml", From 13928c81363acaeac11713f8d553256d5ac91b94 Mon Sep 17 00:00:00 2001 From: hechth Date: Thu, 11 Aug 2022 08:02:09 +0200 Subject: [PATCH 36/53] Reformatted file and adjusted variable name --- R/adjust.time.R | 369 ++++++++++++++++-------------- tests/testthat/test-adjust-time.R | 2 +- 2 files changed, 195 insertions(+), 176 deletions(-) diff --git a/R/adjust.time.R b/R/adjust.time.R index cd0e4478..b8fa05a3 100644 --- a/R/adjust.time.R +++ b/R/adjust.time.R @@ -2,99 +2,104 @@ NULL #> NULL -compute_comb <- function(candi, template, this.feature, j){ - this.comb <- dplyr::bind_rows(dplyr::bind_cols(candi, label = rep(template, nrow(candi))), - dplyr::bind_cols(this.feature[, 1:2], label = rep(j, nrow(this.feature)))) - this.comb <- dplyr::arrange(this.comb, this.comb[, 1]) - return(this.comb) +compute_comb <- function(candi, template, this.feature, j) { + this.comb <- dplyr::bind_rows( + dplyr::bind_cols(candi, label = rep(template, nrow(candi))), + dplyr::bind_cols(this.feature[, 1:2], label = rep(j, nrow(this.feature))) + ) + this.comb <- dplyr::arrange(this.comb, this.comb[, 1]) + return(this.comb) } -compute_sel <- function(this.comb, mz_tol_relative, rt_tol_relative){ - l <- nrow(this.comb) - sel <- which(this.comb[2:l, 1] - this.comb[1:(l-1), 1] < - mz_tol_relative * this.comb[1:(l-1), 1] * 2 & - abs(this.comb[2:l, 2] - this.comb[1:(l-1), 2]) < - rt_tol_relative & this.comb[2:l, 3] != this.comb[1:(l-1), 3]) - return(sel) +compute_sel <- function(this.comb, mz_tol_relative, rt_tol_relative) { + l <- nrow(this.comb) + sel <- which(this.comb[2:l, 1] - this.comb[1:(l - 1), 1] < + mz_tol_relative * this.comb[1:(l - 1), 1] * 2 & + abs(this.comb[2:l, 2] - this.comb[1:(l - 1), 2]) < + rt_tol_relative & this.comb[2:l, 3] != this.comb[1:(l - 1), 3]) + return(sel) } -compute_template_adjusted_rt <- function(this.comb, sel, j){ - all.ftr.table <- cbind(this.comb[sel, 2], this.comb[sel+1, 2]) - to.flip <- which(this.comb[sel, 3] == j) - temp <- all.ftr.table[to.flip, 2] - all.ftr.table[to.flip, 2] <- all.ftr.table[to.flip, 1] - all.ftr.table[to.flip, 1] <- temp - - # now the first column is the template retention time. 
- # the second column is the to-be-adjusted retention time - - cat(c("sample", j, "using", nrow(all.ftr.table), ", ")) - if(j %% 3 == 0) - cat("\n") - - all.ftr.table <- all.ftr.table[order(all.ftr.table[, 2]), ] - return(all.ftr.table) +compute_template_adjusted_rt <- function(this.comb, sel, j) { + all.ftr.table <- cbind(this.comb[sel, 2], this.comb[sel + 1, 2]) + to.flip <- which(this.comb[sel, 3] == j) + temp <- all.ftr.table[to.flip, 2] + all.ftr.table[to.flip, 2] <- all.ftr.table[to.flip, 1] + all.ftr.table[to.flip, 1] <- temp + + # now the first column is the template retention time. + # the second column is the to-be-adjusted retention time + + cat(c("sample", j, "using", nrow(all.ftr.table), ", ")) + if (j %% 3 == 0) { + cat("\n") + } + + all.ftr.table <- all.ftr.table[order(all.ftr.table[, 2]), ] + return(all.ftr.table) } -compute_corrected_features <- function(this.feature, this.diff, avg_time){ - this.feature <- this.feature[order(this.feature[, 2], this.feature[, 1]), ] - this.corrected <- this.old <- this.feature[, 2] - to.correct <- this.old[this.old >= min(this.diff) & - this.old <= max(this.diff)] - - this.smooth <- ksmooth(this.diff, avg_time, kernel="normal", - bandwidth = (max(this.diff) - min(this.diff)) / 5, - x.points = to.correct) - - this.corrected[this.old >= min(this.diff) & this.old <= max(this.diff)] <- - this.smooth$y + to.correct - this.corrected[this.old < min(this.diff)] <- this.corrected[this.old < min(this.diff)] + - mean(this.smooth$y[this.smooth$x == min(this.smooth$x)]) - this.corrected[this.old > max(this.diff)] <- this.corrected[this.old > max(this.diff)] + - mean(this.smooth$y[this.smooth$x == max(this.smooth$x)]) - this.feature[, 2] <- this.corrected - this.feature <- this.feature[order(this.feature[, 1], this.feature[, 2]), ] - return(this.feature) +compute_corrected_features <- function(this.feature, this.diff, avg_time) { + this.feature <- this.feature[order(this.feature[, 2], this.feature[, 1]), ] + this.corrected <- this.old <- this.feature[, 2] + to.correct <- this.old[this.old >= min(this.diff) & + this.old <= max(this.diff)] + + this.smooth <- ksmooth(this.diff, avg_time, + kernel = "normal", + bandwidth = (max(this.diff) - min(this.diff)) / 5, + x.points = to.correct + ) + + this.corrected[this.old >= min(this.diff) & this.old <= max(this.diff)] <- + this.smooth$y + to.correct + this.corrected[this.old < min(this.diff)] <- this.corrected[this.old < min(this.diff)] + + mean(this.smooth$y[this.smooth$x == min(this.smooth$x)]) + this.corrected[this.old > max(this.diff)] <- this.corrected[this.old > max(this.diff)] + + mean(this.smooth$y[this.smooth$x == max(this.smooth$x)]) + this.feature[, 2] <- this.corrected + this.feature <- this.feature[order(this.feature[, 1], this.feature[, 2]), ] + return(this.feature) } fill_missing_values <- function(orig.feature, this.feature) { - missing_values <- which(is.na(this.feature[, 2])) - for(i in missing_values) { - this.d <- abs(orig.feature[i, 2] - orig.feature[, 2]) - this.d[missing_values] <- Inf - this.s <- which(this.d == min(this.d))[1] - this.feature[i, 2] <- orig.feature[i, 2] + this.feature[this.s, 2] - - orig.feature[this.s, 2] - } - return(this.feature) + missing_values <- which(is.na(this.feature[, 2])) + for (i in missing_values) { + this.d <- abs(orig.feature[i, 2] - orig.feature[, 2]) + this.d[missing_values] <- Inf + this.s <- which(this.d == min(this.d))[1] + this.feature[i, 2] <- orig.feature[i, 2] + this.feature[this.s, 2] - + orig.feature[this.s, 2] + } + return(this.feature) } #' 
Adjust retention time across spectra. -#' +#' #' This function adjusts the retention time in each LC/MS profile to achieve better between-profile agreement. -#' -#' @param features A list object. Each component is a matrix which is the output from proc.to.feature() +#' +#' @param extracted_features A list object. Each component is a matrix which is the output from proc.to.feature() #' @param mz_tol_relative The m/z tolerance level for peak alignment. The default is NA, which allows the -#' program to search for the tolerance level based on the data. This value is expressed as the +#' program to search for the tolerance level based on the data. This value is expressed as the #' percentage of the m/z value. This value, multiplied by the m/z value, becomes the cutoff level. -#' @param rt_tol_relative The retention time tolerance level for peak alignment. The default is NA, which +#' @param rt_tol_relative The retention time tolerance level for peak alignment. The default is NA, which #' allows the program to search for the tolerance level based on the data. -#' @param colors The vector of colors to be used for the line plots of time adjustments. The default is NA, +#' @param colors The vector of colors to be used for the line plots of time adjustments. The default is NA, #' in which case the program uses a set of default color set. -#' @param mz_max_diff Argument passed to find.tol(). Consider only m/z diffs smaller than this value. +#' @param mz_max_diff Argument passed to find.tol(). Consider only m/z diffs smaller than this value. #' This is only used when the mz_tol_relative is NA. -#' @param mz_tol_absolute As the m/z tolerance is expressed in relative terms (ppm), it may not be suitable -#' when the m/z range is wide. This parameter limits the tolerance in absolute terms. It mostly +#' @param mz_tol_absolute As the m/z tolerance is expressed in relative terms (ppm), it may not be suitable +#' when the m/z range is wide. This parameter limits the tolerance in absolute terms. It mostly #' influences feature matching in higher m/z range. #' @param do.plot Indicates whether plot should be drawn. #' @param rt_colname contains the retention time information -#' @return A list object with the exact same structure as the input object features, i.e. one matrix per profile -#' being processed. The only difference this output object has with the input object is that the retention time +#' @return A list object with the exact same structure as the input object features, i.e. one matrix per profile +#' being processed. The only difference this output object has with the input object is that the retention time #' column in each of the matrices is changed to new adjusted values. 
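# A hedged sketch (synthetic data) of the kernel-smoothing step this function
# builds on: per-feature deviations from the template are smoothed with
# ksmooth() and added back to the observed retention times.
# template_rt <- sort(runif(50, 0, 100))
# sample_rt   <- sort(template_rt + 0.5 + rnorm(50, sd = 0.1))
# fit <- ksmooth(sample_rt, template_rt - sample_rt, kernel = "normal",
#                bandwidth = (max(sample_rt) - min(sample_rt)) / 5,
#                x.points = sample_rt)
# corrected <- sample_rt + fit$y   # close to template_rt again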
#' @export #' @examples #' adjust.time(extracted_features, mz_max_diff = 10 * 1e-05, do.plot = FALSE) -adjust.time <- function(features, +adjust.time <- function(extracted_features, mz_tol_relative = NA, rt_tol_relative = NA, colors = NA, @@ -102,116 +107,130 @@ adjust.time <- function(features, mz_tol_absolute = 0.01, do.plot = TRUE, rt_colname = "pos") { - - number_of_samples <- nrow(summary(features)) - if(number_of_samples > 1) { - if (do.plot) { - par(mfrow = c(2,2)) - draw_plot(label = "Retention time \n adjustment", - cex = 2) - } - - values <- get_feature_values(features, rt_colname) - mz <- values$mz - chr <- values$rt - lab <- values$sample_id - - if(is.na(mz_tol_relative)) { - mz_tol_relative <- find.tol(mz, mz_max_diff = mz_max_diff, do.plot = do.plot) - } else if(do.plot) { - draw_plot(main = "m/z tolerance level given", - label = mz_tol_relative) - } - - if(!is.na(rt_tol_relative) && do.plot) { - draw_plot(main = "retention time \n tolerance level given", - label = rt_tol_relative) - } - - all.ft <- find.tol.time(mz, - chr, - lab, - number_of_samples = number_of_samples, - mz_tol_relative = mz_tol_relative, - rt_tol_relative = rt_tol_relative, - mz_tol_absolute = mz_tol_absolute, - do.plot = do.plot) - rt_tol_relative <- all.ft$rt.tol - - message("**** performing time correction ****") - message(paste("m/z tolerance level: ", mz_tol_relative)) - message(paste("time tolerance level:", rt_tol_relative)) - - for(i in 1:number_of_samples) { - this <- features[[i]] - sel <- which(all.ft$lab == i) - that <- cbind(all.ft$mz[sel], all.ft$rt[sel], all.ft$grps[sel]) - this <- this[order(this[, 1], this[, 2]), ] - that <- that[order(that[, 1], that[, 2]), ] - this <- cbind(this, V6=rep(i, nrow(this)), V7=that[, 3]) - features[[i]] <- this - } - - num.ftrs <- as.vector(table(all.ft$lab)) - template <- which(num.ftrs == max(num.ftrs))[1] - message(paste("the template is sample", template)) - - candi <- features[[template]][, 1:2] - - corrected_features <- foreach::foreach(j = 1:number_of_samples,.export = c("compute_corrected_features", - "compute_template_adjusted_rt", "compute_comb", "compute_sel")) %dopar% { - this.feature <- features[[j]] - if(j != template) { - this.comb <- compute_comb(candi, template, this.feature, j) - - sel <- compute_sel(this.comb, mz_tol_relative, rt_tol_relative) - if(length(sel) < 20) { - cat("too few, aborted") - } else { - all.ftr.table <- compute_template_adjusted_rt(this.comb, sel, j) - - # the to be adjusted time - this.diff <- all.ftr.table[, 2] - - # the difference between the true time and the to-be-adjusted time - avg_time <- all.ftr.table[, 1] - this.diff - - this.feature <- compute_corrected_features(this.feature, this.diff, avg_time) - } - } - - if(sum(is.na(this.feature[, 2])) > 0) { - this.feature <- fill_missing_values( - features[[j]], - this.feature - ) - } - this.feature - } - } else { - message("Only one sample. No need to correct for time.") + number_of_samples <- nrow(summary(extracted_features)) + + if (number_of_samples <= 1) { + message("Only one sample. 
No need to correct for time.") + } + + if (do.plot) { + par(mfrow = c(2, 2)) + draw_plot(label = "Retention time \n adjustment", cex = 2) + } + + values <- concatenate_feature_tables(extracted_features, rt_colname) + mz <- values$mz + chr <- values$rt + lab <- values$sample_id + + if (is.na(mz_tol_relative)) { + mz_tol_relative <- find.tol(mz, mz_max_diff = mz_max_diff, do.plot = do.plot) + } else if (do.plot) { + draw_plot( + main = "m/z tolerance level given", + label = mz_tol_relative + ) + } + + if (!is.na(rt_tol_relative) && do.plot) { + draw_plot( + main = "retention time \n tolerance level given", + label = rt_tol_relative + ) + } + + all.ft <- find.tol.time(mz, + chr, + lab, + number_of_samples = number_of_samples, + mz_tol_relative = mz_tol_relative, + rt_tol_relative = rt_tol_relative, + mz_tol_absolute = mz_tol_absolute, + do.plot = do.plot + ) + rt_tol_relative <- all.ft$rt.tol + + message("**** performing time correction ****") + message(paste("m/z tolerance level: ", mz_tol_relative)) + message(paste("time tolerance level:", rt_tol_relative)) + + for (i in 1:number_of_samples) { + features <- extracted_features[[i]] + sel <- which(all.ft$lab == i) + + feature_subset <- cbind(all.ft$mz[sel], all.ft$rt[sel], all.ft$grps[sel]) + features <- features[order(features[, 1], features[, 2]), ] + feature_subset <- feature_subset[order(feature_subset[, 1], feature_subset[, 2]), ] + features <- cbind(features, V6 = rep(i, nrow(features)), V7 = feature_subset[, 3]) + extracted_features[[i]] <- features + } + + num.ftrs <- as.vector(table(all.ft$lab)) + template <- which(num.ftrs == max(num.ftrs))[1] + message(paste("the template is sample", template)) + + candi <- extracted_features[[template]][, 1:2] + + corrected_features <- foreach::foreach(j = 1:number_of_samples, .export = c( + "compute_corrected_features", + "compute_template_adjusted_rt", "compute_comb", "compute_sel" + )) %dopar% { + this.feature <- extracted_features[[j]] + if (j != template) { + this.comb <- compute_comb(candi, template, this.feature, j) + + sel <- compute_sel(this.comb, mz_tol_relative, rt_tol_relative) + if (length(sel) < 20) { + cat("too few, aborted") + } else { + all.ftr.table <- compute_template_adjusted_rt(this.comb, sel, j) + + # the to be adjusted time + this.diff <- all.ftr.table[, 2] + + # the difference between the true time and the to-be-adjusted time + avg_time <- all.ftr.table[, 1] - this.diff + + this.feature <- compute_corrected_features(this.feature, this.diff, avg_time) + } + } + + if (sum(is.na(this.feature[, 2])) > 0) { + this.feature <- fill_missing_values( + extracted_features[[j]], + this.feature + ) } + this.feature + } - if (do.plot) { - if(is.na(colors[1])) - colors <-c ("red", "blue", "dark blue", "orange", "green", "yellow", - "cyan", "pink", "violet", "bisque", "azure", "brown", - "chocolate", rep("grey", number_of_samples)) - - draw_plot(x = range(features[[1]][, 2]), - y = c(-rt_tol_relative, rt_tol_relative), - xlab = "Original Retention time", - ylab = "Retention time deviation", - axes = TRUE) - - for(i in 1:number_of_samples) { - features[[i]] <- features[[i]][order(features[[i]][, 1], features[[i]][, 2]), ] - points(features[[i]][, 2], corrected_features[[i]][, 2] - features[[i]][, 2], - col=colors[i], cex=.2) - } + + if (do.plot) { + if (is.na(colors[1])) { + colors <- c( + "red", "blue", "dark blue", "orange", "green", "yellow", + "cyan", "pink", "violet", "bisque", "azure", "brown", + "chocolate", rep("grey", number_of_samples) + ) } - if(exists("corrected_features")){ - 
return(corrected_features) + draw_plot( + x = range(extracted_features[[1]][, 2]), + y = c(-rt_tol_relative, rt_tol_relative), + xlab = "Original Retention time", + ylab = "Retention time deviation", + axes = TRUE + ) + + for (i in 1:number_of_samples) { + extracted_features[[i]] <- extracted_features[[i]][order(extracted_features[[i]][, 1], extracted_features[[i]][, 2]), ] + points(extracted_features[[i]][, 2], corrected_features[[i]][, 2] - extracted_features[[i]][, 2], + col = colors[i], cex = .2 + ) } + } + + if (exists("corrected_features")) { + return(corrected_features) + } } diff --git a/tests/testthat/test-adjust-time.R b/tests/testthat/test-adjust-time.R index df7d74ec..93fdb0e7 100644 --- a/tests/testthat/test-adjust-time.R +++ b/tests/testthat/test-adjust-time.R @@ -29,7 +29,7 @@ patrick::with_parameters_test_that( doParallel::registerDoParallel(cluster) corrected <- adjust.time( - features = extracted, + extracted_features = extracted, mz_tol_relative = mz_tol, rt_tol_relative = chr_tol, mz_max_diff = find_tol_max_d, From 7cd24d9df32cd3736982632375e0105c7418bf90 Mon Sep 17 00:00:00 2001 From: hechth Date: Thu, 11 Aug 2022 08:02:44 +0200 Subject: [PATCH 37/53] Fixed bug with vscDebugger --- conda/environment-dev.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conda/environment-dev.yaml b/conda/environment-dev.yaml index b82d92a7..5fb19b4a 100644 --- a/conda/environment-dev.yaml +++ b/conda/environment-dev.yaml @@ -5,6 +5,7 @@ channels: - defaults dependencies: - r-base >= 3.5 + - r-pkgload <= 1.2.4 - r-devtools - r-biocmanager - r-mass From 919297451a6dccb96f3f87b4e5393bdccc0923cb Mon Sep 17 00:00:00 2001 From: root Date: Thu, 11 Aug 2022 08:03:37 +0200 Subject: [PATCH 38/53] Adapted function name to better reflect functionality --- R/feature.align.R | 2 +- R/utils.R | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/R/feature.align.R b/R/feature.align.R index 9646562d..c0d6da90 100644 --- a/R/feature.align.R +++ b/R/feature.align.R @@ -131,7 +131,7 @@ feature.align <- function(features, number_of_samples <- nrow(summary(features)) if (number_of_samples > 1) { - values <- get_feature_values(features, rt_colname) + values <- concatenate_feature_tables(features, rt_colname) mz_values <- values$mz rt <- values$rt sample_id <- values$sample_id diff --git a/R/utils.R b/R/utils.R index e0a43ee6..df8a2aae 100644 --- a/R/utils.R +++ b/R/utils.R @@ -2,11 +2,15 @@ NULL #> NULL -get_feature_values <- function(features, rt_colname) { +#' Concatenate multiple feature lists and add the sample id (origin of feature) as additional column. +#' +#' @param features list List of tibbles containing extracted feature tables. +#' @param rt_colname string Name of retention time information column, usually "pos". 
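# A toy illustration of the concatenation below, assuming it returns the
# list(mz, rt, sample_id) consumed by adjust.time():
# f1 <- tibble::tibble(mz = c(100, 200), pos = c(10, 20))
# f2 <- tibble::tibble(mz = 150, pos = 15)
# concatenate_feature_tables(list(f1, f2), rt_colname = "pos")
# # $mz 100 200 150; $rt 10 20 15; $sample_id 1 1 2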
+concatenate_feature_tables <- function(features, rt_colname) { mz <- c() rt <- c() sample_id <- c() - for (i in 1:length(features)) { + for (i in seq_along(features)) { sample_features <- dplyr::as_tibble(features[[i]]) mz <- c(mz, sample_features$mz) rt <- c(rt, sample_features[[rt_colname]]) From 7c8c11454ca360b270339bb7651af5c7ddd10043 Mon Sep 17 00:00:00 2001 From: hechth Date: Thu, 11 Aug 2022 08:03:52 +0200 Subject: [PATCH 39/53] Adapted to new variable name --- R/hybrid.R | 4 ++-- R/unsupervised.R | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/hybrid.R b/R/hybrid.R index 9be0aa3d..007b3e13 100644 --- a/R/hybrid.R +++ b/R/hybrid.R @@ -257,7 +257,7 @@ hybrid <- function( message("**** time correction ****") corrected <- adjust.time( - features = extracted, + extracted_features = extracted, mz_tol_relative = align_mz_tol, rt_tol_relative = align_chr_tol, mz_max_diff = 10 * mz_tol, @@ -305,7 +305,7 @@ hybrid <- function( message("**** second round time correction ****") recovered_corrected <- adjust.time( - features = recovered$extracted_features, + extracted_features = recovered$extracted_features, mz_tol_relative = align_mz_tol, rt_tol_relative = align_chr_tol, mz_max_diff = 10 * mz_tol, diff --git a/R/unsupervised.R b/R/unsupervised.R index c28e6a26..b88a3a6d 100644 --- a/R/unsupervised.R +++ b/R/unsupervised.R @@ -234,7 +234,7 @@ unsupervised <- function( message("**** time correction ****") corrected <- adjust.time( - features = extracted, + extracted_features = extracted, mz_tol_relative = align_mz_tol, rt_tol_relative = align_chr_tol, mz_max_diff = 10 * mz_tol, From 4f3cec6e8991caaa73a6697b40a387985765d3de Mon Sep 17 00:00:00 2001 From: hechth Date: Thu, 11 Aug 2022 10:56:02 +0200 Subject: [PATCH 40/53] added r-httpgd package for plotting --- conda/environment-dev.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conda/environment-dev.yaml b/conda/environment-dev.yaml index 5fb19b4a..e72a0982 100644 --- a/conda/environment-dev.yaml +++ b/conda/environment-dev.yaml @@ -34,3 +34,4 @@ dependencies: - r-datacomparer - r-patrick - radian + - r-httpgd From 96c64aed9afc5aa6bbe68b4e8e3f03829548da1e Mon Sep 17 00:00:00 2001 From: wverastegui Date: Thu, 11 Aug 2022 11:30:16 +0200 Subject: [PATCH 41/53] Reverted to refactored version without turnpoint(). Updated changelog file. 
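This patch replaces pastecs::turnpoints() with a small base-R implementation
built on embed() and max.col(), so the pastecs dependency can later be
dropped (see patches 42-44). A minimal sketch of the idea, on an invented toy
curve (illustration only, not code from this patch): pad the series with -Inf
on both ends and flag position i as a peak exactly when y[i] is the maximum
of the window (y[i-1], y[i], y[i+1]).

y <- c(1, 3, 2, 5, 4)
padded <- rev(c(-Inf, y, -Inf))
windows <- embed(padded, 3)                   # rows: 3 consecutive values
windows <- windows[rev(seq(nrow(windows))), ] # restore ascending row order
max.col(windows, ties.method = "first") == 2  # TRUE where the middle wins
#> [1] FALSE  TRUE FALSE  TRUE FALSE
# i.e. peaks at positions 2 and 4, matching find_local_maxima(y, "first")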
--- CHANGELOG.md | 1 - R/find.turn.point.R | 72 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 59 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f079f73c..dec775d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - added tests with realistic testdata for `extract_features.R` [#42](https://github.com/RECETOX/recetox-aplcms/pull/42), [#54](https://github.com/RECETOX/recetox-aplcms/pull/54) - added tests for `feature.align.R` ([#40](https://github.com/RECETOX/recetox-aplcms/pull/40)), and `adjust.time.R` ([#39](https://github.com/RECETOX/recetox-aplcms/pull/40)) - added CI to repository's GitHub Actions [#45](https://github.com/RECETOX/recetox-aplcms/pull/45),[#49](https://github.com/RECETOX/recetox-aplcms/pull/49) -- added pastecs library to dependencies [#91](https://github.com/RECETOX/recetox-aplcms/pull/91) ### Changed - refactored `feature.align.R` [#63](https://github.com/RECETOX/recetox-aplcms/pull/63) - refactored `adjust.time.R` [#64](https://github.com/RECETOX/recetox-aplcms/pull/64) diff --git a/R/find.turn.point.R b/R/find.turn.point.R index 24216e4f..a7a6d076 100644 --- a/R/find.turn.point.R +++ b/R/find.turn.point.R @@ -1,6 +1,39 @@ -#' @import pastecs -NULL -#> NULL +#' @description +#' Compute local maxima turn points. +#' @param y The y values of a curve in x-y plane. +#' @param ties.method specifies the method rank uses to break ties. +#' @return boolean row with local maxima turn point. +find_local_maxima <- function(y, ties.method) { + padded_y <- rev(as.vector(c(-Inf, y, -Inf))) + + # each row is 3 consecutive values in descending order + # rows are sorted in ascending order + z <- embed(padded_y, dim = 3) + + # reverse the row ordering + # first column is equal y + z <- z[rev(seq(nrow(z))), ] + + # row where the max is in the middle is a turn point + v <- max.col(z, ties.method = ties.method) == 2 + + return(v) +} + +#' @description +#' Compute maxima and minima turn points. +#' @param y The y values of a curve in x-y plane. +#' @return boolean row with local maxima and minima turn points. +msExtrema <- function(y) { + index1 <- find_local_maxima(y, ties.method = "first") + index2 <- find_local_maxima(-y, ties.method = "last") + + # this is some sort of safety mechanism to protect against numerical issues + index.max <- index1 & !index2 + index.min <- index2 & !index1 + + list(index.max = index.max, index.min = index.min) +} #' Find peaks and valleys of a curve. 
#' @@ -17,20 +50,33 @@ NULL #' @examples #' find.turn.point(y) find.turn.point <- function(y) { - y <- y[!is.na(y)] # filter NA values - if (length(unique(y)) == 1) { # if exactly one distinct value - middle_index <- round(length(y) / 2) # get mid index - start_and_end <- c(1, length(y)) # get first and last index + y <- y[!is.na(y)] # filter NA values + + if (length(unique(y)) == 1) { # if exactly one distinct value + middle_index <- round(length(y) / 2) # get mid index + start_and_end <- c(1, length(y)) # get first and last index return(list(pks = middle_index, vlys = start_and_end)) } else { - list_tp <- pastecs::turnpoints(y) - peaks <- which(list_tp$peaks) - pits <- which(list_tp$pits) + b <- msExtrema(y) + + pks <- which(b$index.max) + vlys <- which(b$index.min) + + if (pks[1] != 1) { + vlys <- c(1, vlys) + } + + if (pks[length(pks)] != length(y)) { + vlys <- c(vlys, length(y)) + } - if (length(peaks) == 1) { - pits <- c(1, list_tp$n) + if (length(pks) == 1) { + vlys <- c(1, length(y)) } - return(list(pks = peaks, vlys = pits)) + x <- new("list") + x$pks <- pks + x$vlys <- vlys + return(x) } } From 9df570d112d217ac1855c772fadd662b8a6938b9 Mon Sep 17 00:00:00 2001 From: Helge Hecht Date: Thu, 11 Aug 2022 12:02:50 +0200 Subject: [PATCH 42/53] Update conda/environment-dev.yaml --- conda/environment-dev.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/conda/environment-dev.yaml b/conda/environment-dev.yaml index 714a5645..b82d92a7 100644 --- a/conda/environment-dev.yaml +++ b/conda/environment-dev.yaml @@ -33,4 +33,3 @@ dependencies: - r-datacomparer - r-patrick - radian - - r-pastecs From fa0622900dd1c415a95302a3fa99874246c489a4 Mon Sep 17 00:00:00 2001 From: Helge Hecht Date: Thu, 11 Aug 2022 12:02:56 +0200 Subject: [PATCH 43/53] Update conda/meta.yaml --- conda/meta.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/conda/meta.yaml b/conda/meta.yaml index 7304eb81..6a57bb75 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -62,7 +62,6 @@ requirements: - r-tidyr - r-stringr - r-tibble - - r-pastecs test: commands: From 852e2c9c4bccc074ee404b5c9b0f9af5127e58dc Mon Sep 17 00:00:00 2001 From: Helge Hecht Date: Fri, 12 Aug 2022 08:26:06 +0200 Subject: [PATCH 44/53] Update DESCRIPTION --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2b3ebef3..a564cc41 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -9,7 +9,7 @@ Description: This is a customized fork of the original work from Tianwei Yu. It takes the adaptive processing of LC/MS metabolomics data further with focus on high resolution MS for both LC and GC applications. 
Depends: R (>= 3.50), MASS, rgl, mzR, splines, doParallel, foreach, - iterators, snow, gbm, e1071, randomForest, ROCR, ROCS, Rcpp, dplyr, tidyr, stringr, tibble, pastecs, tools, arrow + iterators, snow, gbm, e1071, randomForest, ROCR, ROCS, Rcpp, dplyr, tidyr, stringr, tibble, tools, arrow biocViews: Technology, MassSpectrometry License: GPL-2 LazyLoad: yes From 79aa39416522c77c014a7ab5e5fcb2de1b707fdd Mon Sep 17 00:00:00 2001 From: hechth Date: Fri, 12 Aug 2022 10:43:00 +0200 Subject: [PATCH 45/53] fixed adjust.time, feature.align and extract_features test cases --- R/adjust.time.R | 7 +- R/extract_features.R | 2 + R/feature.align.R | 247 +++++++++++++++++++----------- R/find.tol.time.R | 106 +++++++++---- R/utils.R | 24 +-- tests/testthat/test-adjust-time.R | 7 +- 6 files changed, 256 insertions(+), 137 deletions(-) diff --git a/R/adjust.time.R b/R/adjust.time.R index cd0e4478..edc7f64f 100644 --- a/R/adjust.time.R +++ b/R/adjust.time.R @@ -111,7 +111,10 @@ adjust.time <- function(features, cex = 2) } - values <- get_feature_values(features, rt_colname) + features <- lapply(features, function(x) tibble::as_tibble(x) |> dplyr::rename(rt = pos)) + + + values <- concatenate_feature_tables(features, rt_colname) mz <- values$mz chr <- values$rt lab <- values$sample_id @@ -148,7 +151,7 @@ adjust.time <- function(features, that <- cbind(all.ft$mz[sel], all.ft$rt[sel], all.ft$grps[sel]) this <- this[order(this[, 1], this[, 2]), ] that <- that[order(that[, 1], that[, 2]), ] - this <- cbind(this, V6=rep(i, nrow(this)), V7=that[, 3]) + this <- cbind(this, sample_id=rep(i, nrow(this)), rt_cluster=that[, 3]) features[[i]] <- this } diff --git a/R/extract_features.R b/R/extract_features.R index ef02533a..bfe4696a 100644 --- a/R/extract_features.R +++ b/R/extract_features.R @@ -62,6 +62,8 @@ extract_features <- function( 'load.lcms', 'adaptive.bin', 'find.turn.point', + 'msExtrema', + 'find_local_maxima', 'combine.seq.3', 'cont.index', 'interpol.area', diff --git a/R/feature.align.R b/R/feature.align.R index 9646562d..3a824aa6 100644 --- a/R/feature.align.R +++ b/R/feature.align.R @@ -11,25 +11,29 @@ to_attach <- function(pick, number_of_samples, use = "sum") { for (i in seq_along(strengths)) { # select all areas/RTs from the same sample values <- pick[pick[, 6] == i, 5] - if (use == "sum") + if (use == "sum") { strengths[i] <- sum(values) - if (use == "median") + } + if (use == "median") { strengths[i] <- median(values) + } # can be NA if pick does not contain any data from a sample } # average of m/z, average of rt, min of m/z, max of m/z, sum/median of areas/RTs - return(c(mean(pick[, 1]), mean(pick[, 2]), min(pick[, 1]), - max(pick[, 1]), strengths)) + return(c( + mean(pick[, 1]), mean(pick[, 2]), min(pick[, 1]), + max(pick[, 1]), strengths + )) } } create_output <- function(sample_grouped, number_of_samples, deviation) { - return(c(to_attach(sample_grouped, number_of_samples, use = "sum"), - to_attach(sample_grouped[, c(1, 2, 3, 4, 2, 6)], number_of_samples, use = "median"), - deviation - ) - ) + return(c( + to_attach(sample_grouped, number_of_samples, use = "sum"), + to_attach(sample_grouped[, c(1, 2, 3, 4, 2, 6)], number_of_samples, use = "median"), + deviation + )) } @@ -81,27 +85,63 @@ select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence, for (i in seq_along(turns$peaks)) { sample_grouped <- filter_based_on_density(sample, turns, 1, i) if (validate_contents(sample_grouped, min_occurrence)) { - return(select_rt(sample_grouped, rt_tol_relative, min_occurrence, 
number_of_samples)) + return(select_rt(sample_grouped, rt_tol_relative, min_occurrence, number_of_samples)) } } } +create_rows <- function(i, + grps, + sel.labels, + mz_values, + rt, + area, + sample_id, + mz_tol_relative, + rt_tol_relative, + min_occurrence, + number_of_samples) { + if (i %% 100 == 0) { + gc() + } # call Garbage Collection for performance improvement? + # select a group + group_ids <- which(grps == sel.labels[i]) + if (length(group_ids) > 1) { + # select data from the group + sample <- cbind( + mz_values[group_ids], rt[group_ids], rt[group_ids], + rt[group_ids], area[group_ids], sample_id[group_ids] + ) + # continue if data is from at least 'min_occurrence' samples + if (validate_contents(sample, min_occurrence)) { + return(select_mz(sample, mz_tol_relative, rt_tol_relative, min_occurrence, number_of_samples)) + } + } else if (min_occurrence == 1) { + sample_grouped <- c( + mz_values[group_ids], rt[group_ids], rt[group_ids], + rt[group_ids], area[group_ids], sample_id[group_ids] + ) + return(create_output(sample_grouped, number_of_samples, NA)) + } + return(NULL) +} + #' Align peaks from spectra into a feature table. -#' +#' #' Identifies which of the peaks from the profiles correspond to the same feature. -#' +#' #' @param features A list object. Each component is a matrix which is the output from proc.to.feature(). #' @param min_occurrence A feature has to show up in at least this number of profiles to be included in the final result. #' @param mz_tol_relative The m/z tolerance level for peak alignment. The default is NA, which allows the -#' program to search for the tolerance level based on the data. This value is expressed as the +#' program to search for the tolerance level based on the data. This value is expressed as the #' percentage of the m/z value. This value, multiplied by the m/z value, becomes the cutoff level. -#' @param rt_tol_relative The retention time tolerance level for peak alignment. The default is NA, which +#' @param rt_tol_relative The retention time tolerance level for peak alignment. The default is NA, which #' allows the program to search for the tolerance level based on the data. -#' @param mz_max_diff Argument passed to find.tol(). Consider only m/z diffs smaller than this value. +#' @param mz_max_diff Argument passed to find.tol(). Consider only m/z diffs smaller than this value. #' This is only used when the mz_tol_relative is NA. -#' @param mz_tol_absolute As the m/z tolerance is expressed in relative terms (ppm), it may not be suitable -#' when the m/z range is wide. This parameter limits the tolerance in absolute terms. It mostly +#' @param mz_tol_absolute As the m/z tolerance is expressed in relative terms (ppm), it may not be suitable +#' when the m/z range is wide. This parameter limits the tolerance in absolute terms. It mostly #' influences feature matching in higher m/z range. #' @param do.plot Indicates whether plot should be drawn. #' @param rt_colname Name of the column containing the retention time information. 
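# Illustrative sketch, not part of the patch: a hypothetical call matching the
# documented signature. The two toy peak tables below are invented and carry
# only the columns this code path reads (mz, pos, area); real extracted
# feature tables contain additional columns. Arguments not listed are assumed
# to keep their package defaults.
s1 <- tibble::tibble(mz = c(100.0010, 250.0020), pos = c(120, 300), area = c(1.0e5, 2.0e5))
s2 <- tibble::tibble(mz = c(100.0012, 250.0021), pos = c(121, 302), area = c(0.9e5, 1.8e5))
aligned <- feature.align(
  features = list(s1, s2),
  min_occurrence = 2,     # keep features present in both profiles
  mz_tol_relative = 1e-5, # 10 ppm, given explicitly so nothing is estimated
  rt_tol_relative = 5,    # RT tolerance, also given explicitly
  mz_tol_absolute = 0.01,
  do.plot = FALSE,
  rt_colname = "pos"
)
aligned$aligned.ftrs # mz, time, mz.min, mz.max plus one summed-area column per sample
aligned$pk.times     # per-sample median retention times of the same features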
@@ -128,141 +168,174 @@ feature.align <- function(features, draw_plot(label = "Feature alignment", cex = 2) draw_plot() } - + + features <- lapply(features, function(x) { + x <- tibble::as_tibble(x) + if("pos" %in% colnames(x)) { + x <- x |> dplyr::rename(rt = pos) + } + return(x) + }) + number_of_samples <- nrow(summary(features)) if (number_of_samples > 1) { - values <- get_feature_values(features, rt_colname) + values <- concatenate_feature_tables(features, rt_colname) |> dplyr::arrange_at(c("mz", "rt")) + mz_values <- values$mz rt <- values$rt sample_id <- values$sample_id - + # sort all values by m/z, if equal by rt - ordering <- order(mz_values, rt) - mz_values <- mz_values[ordering] - rt <- rt[ordering] - sample_id <- sample_id[ordering] - + # ordering <- order(mz_values, rt) + # mz_values <- mz_values[ordering] + # rt <- rt[ordering] + # sample_id <- sample_id[ordering] + # find relative m/z tolerance level if (is.na(mz_tol_relative)) { mz_tol_relative <- find.tol(mz_values, mz_max_diff = mz_max_diff, do.plot = do.plot) if (length(mz_tol_relative) == 0) { mz_tol_relative <- 1e-5 - warning("Automatic tolerance finding failed, 10 ppm was assigned. + warning("Automatic tolerance finding failed, 10 ppm was assigned. May need to manually assign alignment mz tolerance level.") } } else if (do.plot) { - draw_plot(main = "alignment m/z tolerance level given", - label = mz_tol_relative, cex = 1.2) + draw_plot( + main = "alignment m/z tolerance level given", + label = mz_tol_relative, cex = 1.2 + ) } - + if (!is.na(rt_tol_relative) && do.plot) { - draw_plot(main = "retention time \n tolerance level given", - label = rt_tol_relative, cex = 1.2) + draw_plot( + main = "retention time \n tolerance level given", + label = rt_tol_relative, cex = 1.2 + ) } - + # find relative retention time tolerance level # also does some preprocessing grouping steps all_features <- find.tol.time(mz_values, - rt, - sample_id, - number_of_samples = number_of_samples, - mz_tol_relative = mz_tol_relative, - rt_tol_relative = rt_tol_relative, - mz_tol_absolute = mz_tol_absolute, - do.plot = do.plot) + rt, + sample_id, + number_of_samples = number_of_samples, + mz_tol_relative = mz_tol_relative, + rt_tol_relative = rt_tol_relative, + mz_tol_absolute = mz_tol_absolute, + do.plot = do.plot + ) rt_tol_relative <- all_features$rt.tol - + message("**** performing feature alignment ****") message(paste("m/z tolerance level: ", mz_tol_relative)) message(paste("time tolerance level:", rt_tol_relative)) - + # create zero vectors of length number_of_samples + 4 ? 
aligned_features <- pk.times <- rep(0, 4 + number_of_samples) mz_sd <- 0 - + labels <- unique(all_features$grps) area <- grps <- mz_values - + # grouping the features based on their m/z values (assuming the tolerance level) sizes <- c(0, cumsum(sapply(features, nrow))) for (i in 1:number_of_samples) { sample <- features[[i]] # order by m/z then by rt - sample <- sample[order(sample[, 1], sample[, 2]),] - + # sample <- sample[order(sample[, 1], sample[, 2]),] + sample <- features[[i]] |> dplyr::arrange_at(c("mz", "rt")) + # select preprocessed features belonging to current sample group_ids <- which(all_features$lab == i) # select m/z, rt and their group ID - sample_grouped <- cbind(all_features$mz[group_ids], all_features$rt[group_ids], all_features$grps[group_ids]) - sample_grouped <- sample_grouped[order(sample_grouped[, 1], sample_grouped[, 2]),] - + # sample_grouped <- cbind(all_features$mz[group_ids], all_features$rt[group_ids], all_features$grps[group_ids]) + sample_grouped <- tibble::tibble(mz = all_features$mz[group_ids], rt = all_features$rt[group_ids], grps = all_features$grps[group_ids]) + + # sample_grouped <- sample_grouped[order(sample_grouped[, 1], sample_grouped[, 2]),] + sample_grouped <- sample_grouped |> dplyr::arrange_at(c("mz", "rt")) + + # update m/z, rt, area values with ordered ones - mz_values[(sizes[i] + 1):sizes[i + 1]] <- sample[, 1] - rt[(sizes[i] + 1):sizes[i + 1]] <- sample[, 2] - area[(sizes[i] + 1):sizes[i + 1]] <- sample[, 5] + mz_values[(sizes[i] + 1):sizes[i + 1]] <- sample$mz + rt[(sizes[i] + 1):sizes[i + 1]] <- sample$rt + area[(sizes[i] + 1):sizes[i + 1]] <- sample$area # assign row identifier - grps[(sizes[i] + 1):sizes[i + 1]] <- sample_grouped[, 3] + grps[(sizes[i] + 1):sizes[i + 1]] <- sample_grouped$grps # assign batch identifier sample_id[(sizes[i] + 1):sizes[i + 1]] <- i } - + # table with number of values per group groups_cardinality <- table(all_features$grps) - # count those with minimal occurrence + # count those with minimal occurrence # (times 3 ? shouldn't be number of samples) !!! curr.row <- sum(groups_cardinality >= min_occurrence) * 3 mz_sd <- rep(0, curr.row) - + sel.labels <- as.numeric(names(groups_cardinality)[groups_cardinality >= min_occurrence]) - + # retention time alignment - aligned_features <- - foreach::foreach(i = seq_along(sel.labels), .combine = rbind) %do% { - if (i %% 100 == 0) - gc() # call Garbage Collection for performance improvement? 
- # select a group - group_ids <- which(grps == sel.labels[i]) - if (length(group_ids) > 1) { - # select data from the group - sample <- cbind(mz_values[group_ids], rt[group_ids], rt[group_ids], - rt[group_ids], area[group_ids], sample_id[group_ids]) - # continue if data is from at least 'min_occurrence' samples - if (validate_contents(sample, min_occurrence)) { - return(select_mz(sample, mz_tol_relative, rt_tol_relative, min_occurrence, number_of_samples)) - } - } else if (min_occurrence == 1) { - sample_grouped <- c(mz_values[group_ids], rt[group_ids], rt[group_ids], - rt[group_ids], area[group_ids], sample_id[group_ids]) - return(create_output(sample_grouped, number_of_samples, NA)) - } - return(NULL) - } - + # aligned_features <- + # foreach::foreach(i = seq_along(sel.labels), .combine = rbind) %do% + # create_rows( + # i, + # grps, + # sel.labels, + # mz_values, + # rt, + # area, + # sample_id, + # mz_tol_relative, + # rt_tol_relative, + # min_occurrence, + # number_of_samples + # ) + for(i in seq_along(sel.labels)) { + rows <- create_rows( + i, + grps, + sel.labels, + mz_values, + rt, + area, + sample_id, + mz_tol_relative, + rt_tol_relative, + min_occurrence, + number_of_samples + ) + aligned_features <- rbind(aligned_features, rows) + } + aligned_features <- aligned_features[-1, ] + # select columns: average of m/z, average of rt, min of m/z, max of m/z, median of rt per sample (the second to_attach call) pk.times <- aligned_features[, (5 + number_of_samples):(2 * (4 + number_of_samples))] mz_sd <- aligned_features[, ncol(aligned_features)] # select columns: average of m/z, average of rt, min of m/z, max of m/z, sum of areas per sample (the first to_attach call) aligned_features <- aligned_features[, 1:(4 + number_of_samples)] - + # rename columns on both tables, samples are called "exp_i" colnames(aligned_features) <- colnames(pk.times) <- c("mz", "time", "mz.min", "mz.max", paste("exp", 1:number_of_samples)) - + # return both tables and both computed tolerances rec <- new("list") rec$aligned.ftrs <- aligned_features rec$pk.times <- pk.times rec$mz.tol <- mz_tol_relative rec$chr.tol <- rt_tol_relative - + if (do.plot) { - hist(mz_sd, xlab = "m/z SD", ylab = "Frequency", - main = "m/z SD distribution") - hist(apply(pk.times[, -1:-4], 1, sd, na.rm = TRUE), - xlab = "Retention time SD", ylab = "Frequency", - main = "Retention time SD distribution") + hist(mz_sd, + xlab = "m/z SD", ylab = "Frequency", + main = "m/z SD distribution" + ) + hist(apply(pk.times[, -1:-4], 1, sd, na.rm = TRUE), + xlab = "Retention time SD", ylab = "Frequency", + main = "Retention time SD distribution" + ) } - + return(rec) } else { message("There is but one experiment. What are you trying to align?") diff --git a/R/find.tol.time.R b/R/find.tol.time.R index e927b385..6c976218 100644 --- a/R/find.tol.time.R +++ b/R/find.tol.time.R @@ -32,7 +32,7 @@ compute_breaks_3 <- function(mz, min_mz_tol) { #' Compute rt relative tolerance to use. #' @description #' Compute the elution time tolerance based on the kernel density estimation. -#' It plots the fitting function if set to TRUE. +#' It plots the fitting function if set to TRUE. #' @param max.num.segments the maximum number of segments. #' @param aver.bin.size The average bin size to determine the number of equally spaced points in the kernel density estimation. #' @param number_of_samples The number of spectra in this analysis. 
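# Illustrative sketch on synthetic data, not part of the patch: the cutoff
# logic implemented below, condensed. Pairwise RT distances mix a spike near
# zero (the same feature seen repeatedly) with a flat background (unrelated
# pairs); the tolerance is taken at the last point where the empirical
# density still exceeds 1.5x a linear fit to the background tail. The bin
# count and bandwidth here are invented for the demonstration.
set.seed(1)
distances <- c(abs(rnorm(300, 0, 0.05)), runif(300, 0, 2))
n <- 100
des <- density(distances,
  kernel = "gaussian", n = n,
  bw = max(distances) / n * 2, from = 0
)
points <- des$x[des$x > 0]
density_values <- des$y[des$x > 0]
tail_sel <- points > max(distances) / 4
background <- lm(density_values[tail_sel] ~ points[tail_sel])
estimated <- background$coef[1] + background$coef[2] * points
above <- cumsum(density_values > 1.5 * estimated)
rt_tol <- points[min(which(above < seq_along(above))) - 1] # roughly the spike width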
@@ -49,7 +49,7 @@ compute_rt_tol_relative <- function(breaks,
                                     min.bins,
                                     max.bins,
                                     do.plot = FALSE) {
-  da <- 0
+  distances <- 0
   #' This conditional makes sure that length(s) is <= max.num.segments
   #' If False, length(s) = max.num.segments, and s[i] is the largest
   #' integer no greater than the corresponding element. Otherwise
@@ -62,45 +62,83 @@ compute_rt_tol_relative <- function(breaks,
 
   #' This loop creates a vector with distances between rt peaks. Distances
   #' are stored in a triangular matrix and converted to a vector subsequently.
-  #' Vector length should be < 100, otherwise, vector is
-  #' constructed extracting only 100 samples.
+  #' Vector length should be < 100, otherwise, vector is
+  #' constructed extracting only 100 samples.
   for (i in s) {
-    subset_idx <- (breaks[i - 1] + 1):breaks[i]# create subset of indices
+    subset_idx <- (breaks[i - 1] + 1):breaks[i] # create subset of indices
     if (length(subset_idx) <= 3 * number_of_samples) {
-      rt_distances <- as.vector(dist(chr[subset_idx]))
+      rt_distances <- as.vector(dist(chr[subset_idx]))
       if (length(rt_distances) > 100) {
-        rt_distances <- sample(rt_distances, 100)
+        rt_distances <- sample(rt_distances, 100)
       }
-      da <- c(da, rt_distances)
+      distances <- c(distances, rt_distances)
     }
   }
-
-  #' Calculation of kernel density estimation to estimate the rt_tol_relative
-  da <- da[!is.na(da)]
-  uppermost <- max(da)
-  n <- min(max.bins, max(min.bins, round(length(da) / aver.bin.size)))
-  des <- density(da,
+
+  # #' Calculation of kernel density estimation to estimate the rt_tol_relative
+  # da <- da[!is.na(da)]
+  # uppermost <- max(da)
+  # n <- min(max.bins, max(min.bins, round(length(da) / aver.bin.size)))
+  # des <- density(da,
+  #   kernel = "gaussian", n = n,
+  #   bw = uppermost / n * 2, from = 0
+  # )
+  # y <- des$y[des$x > 0]
+  # x <- des$x[des$x > 0]
+
+  # this.l <- lm(y[x > uppermost / 4] ~ x[x > uppermost / 4])
+  # exp.y <- this.l$coef[1] + this.l$coef[2] * x
+  # y2 <- y[1:(length(y) - 1)]
+  # y3 <- y[2:(length(y))]
+  # y2[which(y2 < y3)] <- y3[which(y2 < y3)]
+  # y[1:(length(y) - 1)] <- y2
+
+  # yy <- cumsum(y > 1.5 * exp.y)
+  # yi <- seq_along(yy)
+  # sel <- min(which(yy < yi)) - 1
+  # rt_tol_relative <- x[sel]
+
+  # a long vector of distances between rt values (with no particular order)
+  distances <- distances[!is.na(distances)]
+  max_distance <- max(distances) # maximal distance
+  # number of equally spaced points at which the density is to be estimated
+  n <- min(max.bins, max(min.bins, round(length(distances) / aver.bin.size)))
+  # estimate probability density function of distances
+  des <- density(distances,
     kernel = "gaussian", n = n,
-    bw = uppermost / n * 2, from = 0
+    bw = max_distance / n * 2, from = 0
   )
-  y <- des$y[des$x > 0]
-  x <- des$x[des$x > 0]
-
-  this.l <- lm(y[x > uppermost / 4] ~ x[x > uppermost / 4])
-  exp.y <- this.l$coef[1] + this.l$coef[2] * x
-  y2 <- y[1:(length(y) - 1)]
-  y3 <- y[2:(length(y))]
-  y2[which(y2 < y3)] <- y3[which(y2 < y3)]
-  y[1:(length(y) - 1)] <- y2
-
-  yy <- cumsum(y > 1.5 * exp.y)
-  yi <- seq_along(yy)
-  sel <- min(which(yy < yi)) - 1
-  rt_tol_relative <- x[sel]
+  # the n (-1?) coordinates of the points where the density is estimated
+  points <- des$x[des$x > 0]
+  # the estimated density values
+  density_values <- des$y[des$x > 0]
+
+  linear_regression_model <- lm(density_values[points > max_distance / 4] ~ points[points > max_distance / 4])
+
+  # compute probability density values (y) using the linear function
+  estimated_density_values <- linear_regression_model$coef[1] + linear_regression_model$coef[2] * points
+
+  values_not_last <- density_values[1:(length(density_values) - 1)] # density values without the last one
+  values_not_first <- density_values[2:(length(density_values))] # density values without the first one
+  # pair-wise copy greater value to the left
+  values_not_last[which(values_not_last < values_not_first)] <- values_not_first[which(values_not_last < values_not_first)]
+  density_values[1:(length(density_values) - 1)] <- values_not_last
+
+  # cumulative sum - where density value is greater than estimated density value
+  # cutoff is selected where the density of the empirical distribution is >1.5 times the density of the distribution
+  cumulative <- cumsum(density_values > 1.5 * estimated_density_values)
+  cumulative_indices <- seq_along(cumulative)
+  # find last index where density value is greater than estimated density value
+  selected <- min(which(cumulative < cumulative_indices)) - 1
+  # corresponding coordinate is used as rt tolerance
+  rt_tol_relative <- points[selected]
 
   if (do.plot) {
-    tolerance_plot(x, y, exp.y,
-      sel,
+    tolerance_plot(
+      points,
+      density_values,
+      estimated_density_values,
+      selected,
       main = "find retention time tolerance"
     )
   }
@@ -143,7 +181,7 @@ find.tol.time <- function(mz,
                           mz_tol_absolute = 0.01,
                           max.num.segments = 10000,
                           do.plot = TRUE) {
-  features <- tibble::tibble(mz = mz, rt = chr, labels = lab)
+  features <- tibble::tibble(mz = mz, rt = rt, labels = sample_id)
   features <- dplyr::arrange_at(features, "mz")
 
   min_mz_tol <- compute_min_mz_tolerance(
@@ -188,10 +226,10 @@ find.tol.time <- function(mz,
 
   list(
     mz = features$mz,
-    chr = features$rt,
+    rt = features$rt,
     lab = features$labels,
     grps = features$grps,
-    chr.tol = rt_tol_relative,
+    rt.tol = rt_tol_relative,
     mz.tol = mz_tol_relative
   )
 }
diff --git a/R/utils.R b/R/utils.R
index d508aa59..1faa58cb 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -2,17 +2,19 @@
 NULL
 #> NULL
 
-get_feature_values <- function(features, rt_colname) {
-  mz <- c()
-  rt <- c()
-  sample_id <- c()
-  for (i in 1:length(features)) {
-    sample_features <- dplyr::as_tibble(features[[i]])
-    mz <- c(mz, sample_features$mz)
-    rt <- c(rt, sample_features[[rt_colname]])
-    sample_id <- c(sample_id, rep(i, nrow(sample_features)))
-  }
-  return(list(mz = mz, rt = rt, sample_id = sample_id))
+#' Concatenate multiple feature lists and add the sample id (origin of feature) as additional column.
+#'
+#' @param features list List of tibbles containing extracted feature tables.
+#' @param rt_colname string Name of retention time information column, usually "pos".
+concatenate_feature_tables <- function(features, rt_colname) { + for (i in seq_along(features)) { + if(!("sample_id" %in% colnames(features[[i]]))) { + features[[i]] <- tibble::add_column(features[[i]], sample_id = i) + } + } + + merged <- dplyr::bind_rows(features) + return(merged) } #' @export diff --git a/tests/testthat/test-adjust-time.R b/tests/testthat/test-adjust-time.R index df7d74ec..6fa0625b 100644 --- a/tests/testthat/test-adjust-time.R +++ b/tests/testthat/test-adjust-time.R @@ -41,10 +41,11 @@ patrick::with_parameters_test_that( file.path(testdata, "adjusted", paste0(x, ".parquet")) }) - expected <- lapply(expected_filenames, arrow::read_parquet) - expected <- lapply(expected, as.data.frame) + expected <- lapply(expected_filenames, function(x) { + tibble::as_tibble(arrow::read_parquet(x)) |> dplyr::rename( rt = pos, sample_id = V6, rt_cluster = V7) + }) - corrected <- lapply(corrected, as.data.frame) + corrected <- lapply(corrected, tibble::as_tibble) expect_equal(corrected, expected) }, From c354bdcb0af1d24d30fa63b87aa3c6f6549a9952 Mon Sep 17 00:00:00 2001 From: hechth Date: Fri, 12 Aug 2022 10:57:39 +0200 Subject: [PATCH 46/53] fixed unsupervised and hybrid test cases --- R/recover.weaker.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/recover.weaker.R b/R/recover.weaker.R index c15692a5..fba30756 100644 --- a/R/recover.weaker.R +++ b/R/recover.weaker.R @@ -231,7 +231,7 @@ recover.weaker <- function(filename, target.time <- compute_target_time( aligned.ftrs[, "rt"], round(this.f1[, "pos"], 5), - round(this.f2[, "pos"], 5) + round(this.f2[, "rt"], 5) ) breaks <- compute_breaks_2(data_table, orig.tol) From 054f62f2b88256a172dc72b79f5305d7f10fdfd2 Mon Sep 17 00:00:00 2001 From: hechth Date: Fri, 12 Aug 2022 11:38:33 +0200 Subject: [PATCH 47/53] Added missing function exports --- NAMESPACE | 2 ++ R/find.turn.point.R | 2 ++ 2 files changed, 4 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 483e2708..52c998f4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -24,6 +24,7 @@ export(extract_pattern_colnames) export(feature.align) export(find.tol) export(find.turn.point) +export(find_local_maxima) export(get_custom_chr_tol) export(get_mzrange_bound_indices) export(get_raw_features_in_mzrange) @@ -35,6 +36,7 @@ export(load.lcms) export(load_data) export(load_file) export(make.known.table) +export(msExtrema) export(normix) export(normix.bic) export(plot_raw_profile_histogram) diff --git a/R/find.turn.point.R b/R/find.turn.point.R index a7a6d076..29c39420 100644 --- a/R/find.turn.point.R +++ b/R/find.turn.point.R @@ -3,6 +3,7 @@ #' @param y The y values of a curve in x-y plane. #' @param ties.method specifies the method rank uses to break ties. #' @return boolean row with local maxima turn point. +#' @export find_local_maxima <- function(y, ties.method) { padded_y <- rev(as.vector(c(-Inf, y, -Inf))) @@ -24,6 +25,7 @@ find_local_maxima <- function(y, ties.method) { #' Compute maxima and minima turn points. #' @param y The y values of a curve in x-y plane. #' @return boolean row with local maxima and minima turn points. 
+#' @export msExtrema <- function(y) { index1 <- find_local_maxima(y, ties.method = "first") index2 <- find_local_maxima(-y, ties.method = "last") From 6c7cf2f32c10ee0b9bfb19f81202dc9d04b24f83 Mon Sep 17 00:00:00 2001 From: hechth Date: Tue, 16 Aug 2022 12:03:28 +0200 Subject: [PATCH 48/53] addressed comment --- R/recover.weaker.R | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/R/recover.weaker.R b/R/recover.weaker.R index 5771e251..21d378ae 100644 --- a/R/recover.weaker.R +++ b/R/recover.weaker.R @@ -16,23 +16,20 @@ NULL duplicate.row.remove <- function(features, tolerance = 1e-10) { new.table <- features |> dplyr::arrange_at(c("mz", "rt", "area")) n <- 1 - m <- 2 - to.remove <- rep(0, nrow(new.table)) + to.remove <- c() - while (m <= nrow(new.table)) { + for (m in 2:nrow(new.table)) { if (abs(new.table$mz[m] - new.table$mz[n]) < tolerance & abs(new.table$rt[m] - new.table$rt[n]) < tolerance & abs(new.table$area[m] - new.table$area[n]) < tolerance) { - to.remove[m] <- 1 - m <- m + 1 + to.remove <- c(to.remove, m) } else { n <- m - m <- m + 1 } } - if (sum(to.remove) > 0) { - new.table <- new.table[-which(to.remove == 1), ] + if(length(to.remove) > 0) { + new.table <- new.table[-to.remove, ] } new.table } @@ -261,7 +258,7 @@ get_rt_region_indices <- function(target_time, features, chr_tol) { if (!is.null(target_time) && !is.na(target_time)) { selection <- which(abs(features$labels - target_time) < chr_tol) } else { - selection <- 1:nrow(features) + selection <- seq_len(features) } return(selection) } From 05c213a072728137a2ac9b943e0e719cd173fa59 Mon Sep 17 00:00:00 2001 From: hechth Date: Tue, 16 Aug 2022 15:08:40 +0200 Subject: [PATCH 49/53] pinned r-arrow version as 9.0.0 fails --- conda/environment-dev.yaml | 2 +- conda/meta.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/environment-dev.yaml b/conda/environment-dev.yaml index e72a0982..da174ea7 100644 --- a/conda/environment-dev.yaml +++ b/conda/environment-dev.yaml @@ -22,7 +22,7 @@ dependencies: - r-rocr - r-rocs - r-rcpp - - r-arrow + - r-arrow <= 8.0.0 - r-dplyr - r-tidyr - r-stringr diff --git a/conda/meta.yaml b/conda/meta.yaml index 6a57bb75..6ce393b2 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -57,7 +57,7 @@ requirements: - r-rocr - r-rocs - r-rcpp - - r-arrow + - r-arrow <= 8.0.0 - r-dplyr - r-tidyr - r-stringr From 51fa9532cc453f869205a6f4af9825f6dfbf1667 Mon Sep 17 00:00:00 2001 From: hechth Date: Tue, 16 Aug 2022 15:08:56 +0200 Subject: [PATCH 50/53] moved arrange out of loop --- R/recover.weaker.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/R/recover.weaker.R b/R/recover.weaker.R index 21d378ae..8a2cd201 100644 --- a/R/recover.weaker.R +++ b/R/recover.weaker.R @@ -494,7 +494,7 @@ compute_rectangle <- function(data_table, features <- dplyr::slice( data_table, (breaks[bounds$start] + 1):breaks[bounds$end] - ) + ) |> dplyr::arrange_at("labels") mass.den <- compute_mass_density( features, @@ -513,8 +513,7 @@ compute_rectangle <- function(data_table, mass <- compute_boundaries(mass_range$vlys, peak) that <- features |> - dplyr::filter(mz > mass$lower & mz <= mass$upper) |> - dplyr::arrange_at("labels") + dplyr::filter(mz > mass$lower & mz <= mass$upper) # get values in RT region of interest? 
if (nrow(that) > recover_min_count) { From 804f16b9c13da628ab40239acb3953e883d3264a Mon Sep 17 00:00:00 2001 From: hechth Date: Tue, 16 Aug 2022 15:52:48 +0200 Subject: [PATCH 51/53] reverted change --- R/recover.weaker.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/recover.weaker.R b/R/recover.weaker.R index 8a2cd201..5d9bba62 100644 --- a/R/recover.weaker.R +++ b/R/recover.weaker.R @@ -258,7 +258,7 @@ get_rt_region_indices <- function(target_time, features, chr_tol) { if (!is.null(target_time) && !is.na(target_time)) { selection <- which(abs(features$labels - target_time) < chr_tol) } else { - selection <- seq_len(features) + selection <- 1:nrow(features) } return(selection) } From bbf3237c6b5d667fcf455f0ff7d4567ea0635586 Mon Sep 17 00:00:00 2001 From: hechth Date: Tue, 16 Aug 2022 17:49:42 +0200 Subject: [PATCH 52/53] Fixed documentation --- DESCRIPTION | 6 +++--- NAMESPACE | 2 ++ R/extract_features.R | 6 ++---- R/mass.match.R | 28 ++++++++++++++++------------ 4 files changed, 23 insertions(+), 19 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index a564cc41..56db4ee0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -8,13 +8,13 @@ Authors@R: c( Description: This is a customized fork of the original work from Tianwei Yu. It takes the adaptive processing of LC/MS metabolomics data further with focus on high resolution MS for both LC and GC applications. -Depends: R (>= 3.50), MASS, rgl, mzR, splines, doParallel, foreach, - iterators, snow, gbm, e1071, randomForest, ROCR, ROCS, Rcpp, dplyr, tidyr, stringr, tibble, tools, arrow +Depends: R (>= 3.50), MASS, mzR, splines, doParallel, foreach, + snow, dplyr, tidyr, stringr, tibble, tools, arrow biocViews: Technology, MassSpectrometry License: GPL-2 LazyLoad: yes NeedsCompilation: no Suggests: - arrow, dataCompareR, testthat (>= 3.0.0) + dataCompareR, testthat (>= 3.0.0) Config/testthat/edition: 3 RoxygenNote: 7.2.1 diff --git a/NAMESPACE b/NAMESPACE index f6cdb6f2..17dc1347 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -18,6 +18,7 @@ export(compute_mass_values) export(compute_target_times) export(cont.index) export(duplicate.row.remove) +export(extract_features) export(extract_pattern_colnames) export(feature.align) export(find.tol) @@ -34,6 +35,7 @@ export(load.lcms) export(load_data) export(load_file) export(make.known.table) +export(mass.match) export(msExtrema) export(normix) export(normix.bic) diff --git a/R/extract_features.R b/R/extract_features.R index 3d935de3..3c089e66 100644 --- a/R/extract_features.R +++ b/R/extract_features.R @@ -4,7 +4,7 @@ NULL #' feature extraction #' -#' extract feature +#' extract features #' #' @param cluster The number of CPU cores to be used #' @param filenames The CDF file names. @@ -35,8 +35,7 @@ NULL #' model in an EIC. #' @param BIC_factor The factor that is multiplied on the number of parameters to modify the BIC criterion. #' If larger than 1, models with more peaks are penalized more. 
-#' @examples -#' extract_features(cluster, filenames, min_pres, min_run, mz_tol, 0, 0.05, intensity_weighted, NA, NA, sd_cut, sigma_ratio_lim, "bi-Gaussian", "moment", 0.01, 1, 2.0) +#' @export extract_features <- function( cluster, filenames, @@ -76,7 +75,6 @@ extract_features <- function( 'compute_boundaries', 'increment_counter', 'rm.ridge', - #'compute_base_curve', 'compute_delta_rt', 'bigauss.mix', 'bigauss.esti', diff --git a/R/mass.match.R b/R/mass.match.R index 6a3aa3f8..b3dfe949 100644 --- a/R/mass.match.R +++ b/R/mass.match.R @@ -2,17 +2,21 @@ #' @param x The mz array for which to compute the matching. #' @param known.mz The mz value with which to match. #' @param match.tol.ppm Matching tolerance in ppm. -#' @return Binary vector, 1 indicating a match, 0 a mismatch. +#' @return Binary vector, 1 indicating a match, 0 a mismatch. +#' @export #' @examples -#' mass.match(aligned.ftrs, known.table, match.tol.ppm) -mass.match <- -function(x, known.mz, match.tol.ppm=5) -{ - mass.matched.pos<-rep(0, length(x)) - for(i in 1:length(x)) - { - this.d<-abs((x[i]-known.mz)/x[i]) - if(min(this.d) < match.tol.ppm/1e6) mass.matched.pos[i]<-1 - } - return(mass.matched.pos) +#' mass.match( +#' x = c(10, 20, 21), +#' known.mz = 20 +#' ) +mass.match <- function(x, known.mz, match.tol.ppm = 5) { + mass.matched.pos <- rep(0, length(x)) + for (i in seq_along(x)) + { + this.d <- abs((x[i] - known.mz) / x[i]) + if (min(this.d) < match.tol.ppm / 1e6) { + mass.matched.pos[i] <- 1 + } + } + return(mass.matched.pos) } From 668a6bfaa6861fa0b1f74151e2c4c3bc0689e267 Mon Sep 17 00:00:00 2001 From: Helge Hecht Date: Wed, 17 Aug 2022 09:06:24 +0200 Subject: [PATCH 53/53] Update R/find.tol.time.R MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Matej Troják --- R/find.tol.time.R | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/R/find.tol.time.R b/R/find.tol.time.R index 6c976218..738064b5 100644 --- a/R/find.tol.time.R +++ b/R/find.tol.time.R @@ -75,28 +75,6 @@ compute_rt_tol_relative <- function(breaks, } } - # #' Calculation of kernel density estimation to estimate the rt_tol_relative - # da <- da[!is.na(da)] - # uppermost <- max(da) - # n <- min(max.bins, max(min.bins, round(length(da) / aver.bin.size))) - # des <- density(da, - # kernel = "gaussian", n = n, - # bw = uppermost / n * 2, from = 0 - # ) - # y <- des$y[des$x > 0] - # x <- des$x[des$x > 0] - - # this.l <- lm(y[x > uppermost / 4] ~ x[x > uppermost / 4]) - # exp.y <- this.l$coef[1] + this.l$coef[2] * x - # y2 <- y[1:(length(y) - 1)] - # y3 <- y[2:(length(y))] - # y2[which(y2 < y3)] <- y3[which(y2 < y3)] - # y[1:(length(y) - 1)] <- y2 - - # yy <- cumsum(y > 1.5 * exp.y) - # yi <- seq_along(yy) - # sel <- min(which(yy < yi)) - 1 - # rt_tol_relative <- x[sel] # a long vector of distances between rt values (with no particular order) distances <- distances[!is.na(distances)]
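For reference, the roxygen example added to mass.match() above evaluates as
follows (assuming the default match.tol.ppm = 5):

mass.match(x = c(10, 20, 21), known.mz = 20)
#> [1] 0 1 0

Only x = 20 matches: its relative deviation from known.mz is 0, while
|10 - 20| / 10 and |21 - 20| / 21 both far exceed the 5e-6 (5 ppm) cutoff.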