Align features refactoring (vol. 2) #88

Merged 9 commits on Aug 10, 2022
8 changes: 4 additions & 4 deletions R/adjust.time.R
@@ -109,8 +109,8 @@ adjust.time <- function(features,

   values <- get_feature_values(features, rt_colname)
   mz <- values$mz
-  chr <- values$chr
-  lab <- values$lab
+  chr <- values$rt
+  lab <- values$sample_id

if(is.na(mz_tol_relative)) {
mz_tol_relative <- find.tol(mz, mz_max_diff = mz_max_diff, do.plot = do.plot)
@@ -132,7 +132,7 @@ adjust.time <- function(features,
                            rt_tol_relative = rt_tol_relative,
                            mz_tol_absolute = mz_tol_absolute,
                            do.plot = do.plot)
-  rt_tol_relative <- all.ft$chr.tol
+  rt_tol_relative <- all.ft$rt.tol

message("**** performing time correction ****")
message(paste("m/z tolerance level: ", mz_tol_relative))
@@ -141,7 +141,7 @@ adjust.time <- function(features,
   for(i in 1:number_of_samples) {
     this <- features[[i]]
     sel <- which(all.ft$lab == i)
-    that <- cbind(all.ft$mz[sel], all.ft$chr[sel], all.ft$grps[sel])
+    that <- cbind(all.ft$mz[sel], all.ft$rt[sel], all.ft$grps[sel])
     this <- this[order(this[, 1], this[, 2]), ]
     that <- that[order(that[, 1], that[, 2]), ]
     this <- cbind(this, V6=rep(i, nrow(this)), V7=that[, 3])
235 changes: 143 additions & 92 deletions R/feature.align.R
@@ -5,16 +5,84 @@ to_attach <- function(pick, number_of_samples, use = "sum") {
     return(c(pick[1], pick[2], pick[1], pick[1], strengths))
   } else {
     for (i in seq_along(strengths)) {
+      # select all areas/RTs from the same sample
+      values <- pick[pick[, 6] == i, 5]
       if (use == "sum")
-        strengths[i] <- sum(pick[pick[, 6] == i, 5])
+        strengths[i] <- sum(values)
       if (use == "median")
-        strengths[i] <- median(pick[pick[, 6] == i, 5])
+        strengths[i] <- median(values)
+      # can be NA if pick does not contain any data from a sample
     }
+    # average of m/z, average of rt, min of m/z, max of m/z, sum/median of areas/RTs
     return(c(mean(pick[, 1]), mean(pick[, 2]), min(pick[, 1]),
              max(pick[, 1]), strengths))
   }
 }


+create_output <- function(sample_grouped, number_of_samples, deviation) {
+  return(c(to_attach(sample_grouped, number_of_samples, use = "sum"),
+           to_attach(sample_grouped[, c(1, 2, 3, 4, 2, 6)], number_of_samples, use = "median"),
Member: What is the role of the vector with swapped indices here?

Author: I'm not sure if I understand the question. to_attach is used to compute the per-sample sum of areas (first call) and the per-sample median of retention times (second call) - that's why in the second call, column 2 (RTs) is passed in place of column 5 (areas). I want to simplify this final step, and also the outputs of the alignment step, as part of #87.

+           deviation
+    )
+  )
+}
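
A minimal sketch (not part of the PR) of the swapped-index call discussed above: re-indexing pick with c(1, 2, 3, 4, 2, 6) copies the RT column (2) into the area slot (5), so the same to_attach() aggregation yields per-sample RT medians instead of per-sample area sums.

# toy 6-column peak matrix: mz, rt, mz.min, mz.max, area, sample id
pick <- cbind(c(100.1, 100.1), c(55, 57), c(100.1, 100.1),
              c(100.1, 100.1), c(2e4, 3e4), c(1, 1))
pick[, 5]                          # areas: 20000 30000
pick[, c(1, 2, 3, 4, 2, 6)][, 5]   # RTs now sit in column 5: 55 57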


+validate_contents <- function(samples, min_occurrence) {
+  # validate whether data is still from at least 'min_occurrence' number of samples
+  if (!is.null(nrow(samples))) {
+    if (length(unique(samples[, 6])) >= min_occurrence) {
Member: Indexing should rather be done via column name than numeric index.

Author: At this point, samples is just a list of vectors with no column names.

Member: Oh - okay.
+      return(TRUE)
+    }
+    return(FALSE)
+  }
+  return(FALSE)
+}
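
A usage sketch (not part of the PR) of validate_contents(): column 6 holds the sample id, so a group passes only if it spans at least min_occurrence distinct samples; a single-row group drops to a plain vector, nrow() returns NULL, and the outer guard returns FALSE.

group <- cbind(c(100.1, 100.2), c(55, 57), c(100.1, 100.2),
               c(100.1, 100.2), c(2e4, 3e4), c(1, 2))   # column 6: sample ids
validate_contents(group, min_occurrence = 2)       # TRUE: rows from 2 samples
validate_contents(group, min_occurrence = 3)       # FALSE: only 2 samples present
validate_contents(group[1, ], min_occurrence = 1)  # FALSE: vector, nrow() is NULL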


+find_optima <- function(data, bandwidth) {
+  # Kernel Density Estimation
+  den <- density(data, bw = bandwidth)
+  # select statistically significant points
+  turns <- find.turn.point(den$y)
+  return(list(peaks = den$x[turns$pks], valleys = den$x[turns$vlys]))
+}
Comment on lines +44 to +50

Member: This function was also extracted in many other refactorings.

Author: Let's address this together in #65.
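
For intuition, a self-contained sketch of what find_optima() computes (base R only; the real implementation delegates peak/valley detection to the package's find.turn.point()):

set.seed(1)
rt <- c(rnorm(50, 60, 0.5), rnorm(50, 80, 0.5))  # two RT clusters
den <- density(rt, bw = 1)
# local maxima of the estimated density, analogous to turns$pks
peak_idx <- which(diff(sign(diff(den$y))) == -2) + 1
den$x[peak_idx]  # approximately 60 and 80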



+filter_based_on_density <- function(sample, turns, index, i) {
+  # select data within lower and upper bound from density estimation
+  lower_bound <- max(turns$valleys[turns$valleys < turns$peaks[i]])
+  upper_bound <- min(turns$valleys[turns$valleys > turns$peaks[i]])
+  selected <- which(sample[, index] > lower_bound & sample[, index] <= upper_bound)
+  return(sample[selected, ])
+}
Comment on lines +53 to +59

Member: Same - this function should be re-used in adaptive.bin and recover.weaker.

Author: Again, should be properly fixed as part of #65.
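
A usage sketch (illustrative values, not from the PR): the bounds around peak i are the nearest valleys on either side, and only rows whose value in the given column falls into (lower_bound, upper_bound] survive.

turns <- list(peaks = c(60, 80), valleys = c(-Inf, 70, Inf))
sample <- cbind(c(100.1, 100.1, 100.1, 100.1), c(58, 61, 79, 82))
filter_based_on_density(sample, turns, index = 2, i = 1)  # keeps the rows with rt 58 and 61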



+select_rt <- function(sample, rt_tol_relative, min_occurrence, number_of_samples) {
+  # turns for rt
+  turns <- find_optima(sample[, 2], bandwidth = rt_tol_relative / 1.414)
+  for (i in seq_along(turns$peaks)) {
+    sample_grouped <- filter_based_on_density(sample, turns, 2, i)
+    if (validate_contents(sample_grouped, min_occurrence)) {
+      return(create_output(sample_grouped, number_of_samples, sd(sample_grouped[, 1], na.rm = TRUE)))
+    }
+  }
+}


+select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence, number_of_samples) {
+  # turns for m/z
+  turns <- find_optima(sample[, 1], bandwidth = mz_tol_relative * median(sample[, 1]))
+  for (i in seq_along(turns$peaks)) {
+    sample_grouped <- filter_based_on_density(sample, turns, 1, i)
+    if (validate_contents(sample_grouped, min_occurrence)) {
+      return(select_rt(sample_grouped, rt_tol_relative, min_occurrence, number_of_samples))
+    }
+  }
+}
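
The two helpers nest: select_mz() splits a group at m/z density valleys and hands each m/z cluster to select_rt(), which splits again at RT valleys and emits an output row for the first sub-group that still spans min_occurrence samples. A usage sketch (synthetic data; assumes the package is loaded so find.turn.point() is available):

set.seed(2)
mz <- 100 + rnorm(6, sd = 5e-5)
rt <- 60 + rnorm(6, sd = 0.3)
sample <- cbind(mz, rt, rt, rt, runif(6, 1e4, 1e5), c(1, 1, 2, 2, 3, 3))
row <- select_mz(sample, mz_tol_relative = 1e-5, rt_tol_relative = 2,
                 min_occurrence = 2, number_of_samples = 3)
# row: m/z stats + per-sample area sums, rt stats + per-sample rt medians, m/z sd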


#' Align peaks from spectra into a feature table.
#'
#' Identifies which of the peaks from the profiles correspond to the same feature.
@@ -61,22 +129,22 @@ feature.align <- function(features,
   if (number_of_samples > 1) {
     values <- get_feature_values(features, rt_colname)
     mz_values <- values$mz
-    chr <- values$chr
-    lab <- values$lab
+    rt <- values$rt
+    sample_id <- values$sample_id

-    o <- order(mz_values, chr)
-    mz_values <- mz_values[o]
-    chr <- chr[o]
-    lab <- lab[o]
+    # sort all values by m/z, if equal by rt
+    ordering <- order(mz_values, rt)
+    mz_values <- mz_values[ordering]
+    rt <- rt[ordering]
+    sample_id <- sample_id[ordering]
Comment on lines +135 to +139

Member: Data could be arranged in a tibble or data.frame.

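A sketch of the reviewer's suggestion (not applied in this PR): keeping the parallel vectors in one data frame lets a single sort keep them in sync.

peaks <- data.frame(mz = mz_values, rt = rt, sample_id = sample_id)
peaks <- peaks[order(peaks$mz, peaks$rt), ]
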

     # find relative m/z tolerance level
     if (is.na(mz_tol_relative)) {
       mz_tol_relative <- find.tol(mz_values, mz_max_diff = mz_max_diff, do.plot = do.plot)
       if (length(mz_tol_relative) == 0) {
         mz_tol_relative <- 1e-5
-        warning(
-          "Automatic tolerance finding failed, 10 ppm was assigned. May need to manually assign alignment mz tolerance level."
-        )
+        warning("Automatic tolerance finding failed, 10 ppm was assigned.
+                May need to manually assign alignment mz tolerance level.")
Comment on lines +146 to +147

Member (@maximskorik, Aug 9, 2022): This is something we should maybe address: I doubt anyone will see this warning, given that most of our test cases produce dozens of warnings.

Author: Do you think the function should fail with an error? Or we could parametrise the whole function to either use the default value or fail with an error.

Member: No, I don't think an error would be an appropriate solution. I just wanted to point out that apLCMS may produce a meaningful warning, so we should make sure that the warning doesn't get lost in a bunch of deprecation warnings, etc. I think we just have to reduce the number of other warnings (deprecation, imports, and so on). Should be done with #93.
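
One option along these lines (a sketch, not part of this PR): raising the fallback as a classed condition would let callers surface it even among many other warnings; the class name below is hypothetical.

w <- warningCondition(
  "Automatic tolerance finding failed, 10 ppm was assigned.",
  class = "aplcms_tolerance_fallback"  # hypothetical class name
)
f <- function() warning(w)
withCallingHandlers(
  f(),
  aplcms_tolerance_fallback = function(cond) {
    message("NOTE: ", conditionMessage(cond))
    invokeRestart("muffleWarning")
  }
)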

       }
     } else if (do.plot) {
       draw_plot(main = "alignment m/z tolerance level given",
@@ -89,119 +157,102 @@
     }

     # find relative retention time tolerance level
-    all.ft <- find.tol.time(mz_values,
-                            chr,
-                            lab,
-                            number_of_samples = number_of_samples,
-                            mz_tol_relative = mz_tol_relative,
-                            rt_tol_relative = rt_tol_relative,
-                            mz_tol_absolute = mz_tol_absolute,
-                            do.plot = do.plot)
-    rt_tol_relative <- all.ft$chr.tol
+    # also does some preprocessing grouping steps
+    all_features <- find.tol.time(mz_values,
+                                  rt,
+                                  sample_id,
+                                  number_of_samples = number_of_samples,
+                                  mz_tol_relative = mz_tol_relative,
+                                  rt_tol_relative = rt_tol_relative,
+                                  mz_tol_absolute = mz_tol_absolute,
+                                  do.plot = do.plot)
+    rt_tol_relative <- all_features$rt.tol

     message("**** performing feature alignment ****")
     message(paste("m/z tolerance level: ", mz_tol_relative))
     message(paste("time tolerance level:", rt_tol_relative))

-    aligned.ftrs <- pk.times <- rep(0, 4 + number_of_samples)
-    mz.sd.rec <- 0
-
-    labels <- unique(all.ft$grps)
+    # create zero vectors of length number_of_samples + 4 ?
+    aligned_features <- pk.times <- rep(0, 4 + number_of_samples)
+    mz_sd <- 0
+
+    labels <- unique(all_features$grps)
     area <- grps <- mz_values

+    # grouping the features based on their m/z values (assuming the tolerance level)
     sizes <- c(0, cumsum(sapply(features, nrow)))
     for (i in 1:number_of_samples) {
-      this <- features[[i]]
-      sel <- which(all.ft$lab == i)
-      that <- cbind(all.ft$mz[sel], all.ft$chr[sel], all.ft$grps[sel])
-      this <- this[order(this[, 1], this[, 2]),]
-      that <- that[order(that[, 1], that[, 2]),]
+      sample <- features[[i]]
+      # order by m/z then by rt
+      sample <- sample[order(sample[, 1], sample[, 2]),]
+
+      # select preprocessed features belonging to current sample
+      group_ids <- which(all_features$lab == i)
+      # select m/z, rt and their group ID
+      sample_grouped <- cbind(all_features$mz[group_ids], all_features$rt[group_ids], all_features$grps[group_ids])
+      sample_grouped <- sample_grouped[order(sample_grouped[, 1], sample_grouped[, 2]),]

-      mz_values[(sizes[i] + 1):sizes[i + 1]] <- this[, 1]
-      chr[(sizes[i] + 1):sizes[i + 1]] <- this[, 2]
-      area[(sizes[i] + 1):sizes[i + 1]] <- this[, 5]
-      grps[(sizes[i] + 1):sizes[i + 1]] <- that[, 3]
-      lab[(sizes[i] + 1):sizes[i + 1]] <- i
+      # update m/z, rt, area values with ordered ones
+      mz_values[(sizes[i] + 1):sizes[i + 1]] <- sample[, 1]
+      rt[(sizes[i] + 1):sizes[i + 1]] <- sample[, 2]
+      area[(sizes[i] + 1):sizes[i + 1]] <- sample[, 5]
+      # assign row identifier
+      grps[(sizes[i] + 1):sizes[i + 1]] <- sample_grouped[, 3]
+      # assign batch identifier
+      sample_id[(sizes[i] + 1):sizes[i + 1]] <- i
     }

-    ttt <- table(all.ft$grps)
-    curr.row <- sum(ttt >= min_occurrence) * 3
-    mz.sd.rec <- rep(0, curr.row)
+    # table with the number of values per group
+    groups_cardinality <- table(all_features$grps)
+    # count the groups with minimal occurrence
+    # (times 3 ? shouldn't it be number_of_samples) !!!
+    curr.row <- sum(groups_cardinality >= min_occurrence) * 3
+    mz_sd <- rep(0, curr.row)

-    sel.labels <- as.numeric(names(ttt)[ttt >= min_occurrence])
+    sel.labels <- as.numeric(names(groups_cardinality)[groups_cardinality >= min_occurrence])

     # retention time alignment
-    aligned.ftrs <-
+    aligned_features <-
       foreach::foreach(i = seq_along(sel.labels), .combine = rbind) %do% {
         if (i %% 100 == 0)
-          gc()
-        this.return <- NULL
-        sel <- which(grps == sel.labels[i])
-        if (length(sel) > 1) {
-          this <- cbind(mz_values[sel], chr[sel], chr[sel], chr[sel], area[sel], lab[sel])
-          if (length(unique(this[, 6])) >= min_occurrence) {
-            this.den <- density(this[, 1], bw = mz_tol_relative * median(this[, 1]))
-            turns <- find.turn.point(this.den$y)
-            pks <- this.den$x[turns$pks]
-            vlys <- this.den$x[turns$vlys]
-            for (j in seq_along(pks)) {
-              this.lower <- max(vlys[vlys < pks[j]])
-              this.upper <- min(vlys[vlys > pks[j]])
-              this.sel <- which(this[, 1] > this.lower & this[, 1] <= this.upper)
-              that <- this[this.sel, ]
-              if (!is.null(nrow(that))) {
-                if (length(unique(that[, 6])) >= min_occurrence) {
-                  that.den <- density(that[, 2], bw = rt_tol_relative / 1.414)
-                  that.turns <- find.turn.point(that.den$y)
-                  that.pks <- that.den$x[that.turns$pks]
-                  that.vlys <- that.den$x[that.turns$vlys]
-                  for (k in seq_along(that.pks)) {
-                    that.lower <- max(that.vlys[that.vlys < that.pks[k]])
-                    that.upper <- min(that.vlys[that.vlys > that.pks[k]])
-                    thee <- that[that[, 2] > that.lower & that[, 2] <= that.upper, ]
-                    if (!is.null(nrow(thee))) {
-                      if (length(unique(thee[, 6])) >= min_occurrence) {
-                        this.return <-
-                          c(to_attach(thee, number_of_samples, use = "sum"),
-                            to_attach(thee[, c(1, 2, 3, 4, 2, 6)], number_of_samples, use = "median"),
-                            sd(thee[, 1], na.rm = TRUE)
-                          )
-                      }
-                    }
-                  }
-                }
-              }
-            }
-          }
-        } else {
-          if (min_occurrence == 1) {
-            thee <- c(mz_values[sel], chr[sel], chr[sel], chr[sel], area[sel], lab[sel])
-            this.return <- c(to_attach(thee, number_of_samples, use = "sum"),
-                             to_attach(thee[c(1, 2, 3, 4, 2, 6)], number_of_samples, use = "median"),
-                             NA
-            )
+          gc() # call Garbage Collection for performance improvement?
+        # select a group
+        group_ids <- which(grps == sel.labels[i])
+        if (length(group_ids) > 1) {
+          # select data from the group
+          sample <- cbind(mz_values[group_ids], rt[group_ids], rt[group_ids],
+                          rt[group_ids], area[group_ids], sample_id[group_ids])
+          # continue if data is from at least 'min_occurrence' samples
+          if (validate_contents(sample, min_occurrence)) {
+            return(select_mz(sample, mz_tol_relative, rt_tol_relative, min_occurrence, number_of_samples))
+          }
+        } else if (min_occurrence == 1) {
+          sample_grouped <- c(mz_values[group_ids], rt[group_ids], rt[group_ids],
+                              rt[group_ids], area[group_ids], sample_id[group_ids])
+          return(create_output(sample_grouped, number_of_samples, NA))
        }
-        this.return
+        return(NULL)
       }

-    pk.times <- aligned.ftrs[, (5 + number_of_samples):(2 * (4 + number_of_samples))]
-    mz.sd.rec <- aligned.ftrs[, ncol(aligned.ftrs)]
-    aligned.ftrs <- aligned.ftrs[, 1:(4 + number_of_samples)]
+    # select columns: average of m/z, average of rt, min of m/z, max of m/z, median of rt per sample (the second to_attach call)
+    pk.times <- aligned_features[, (5 + number_of_samples):(2 * (4 + number_of_samples))]
+    mz_sd <- aligned_features[, ncol(aligned_features)]
+    # select columns: average of m/z, average of rt, min of m/z, max of m/z, sum of areas per sample (the first to_attach call)
+    aligned_features <- aligned_features[, 1:(4 + number_of_samples)]
Comment on lines +238 to +241

Member: Computations for the numbers of columns could be extracted to variables to clearly indicate what is being used and why.

Author: This will be changed in #87 anyway.

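A sketch of the reviewer's suggestion (variable names are illustrative, not from the PR): naming the column ranges makes the slicing self-explanatory.

n_annotation_cols <- 4  # mz, time, mz.min, mz.max
n_cols <- n_annotation_cols + number_of_samples
pk.times <- aligned_features[, (n_cols + 1):(2 * n_cols)]
mz_sd <- aligned_features[, ncol(aligned_features)]
aligned_features <- aligned_features[, 1:n_cols]
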

-    colnames(aligned.ftrs) <-
+    # rename columns on both tables, samples are called "exp_i"
+    colnames(aligned_features) <-
       colnames(pk.times) <- c("mz", "time", "mz.min", "mz.max", paste("exp", 1:number_of_samples))

+    # return both tables and both computed tolerances
     rec <- new("list")
-    rec$aligned.ftrs <- aligned.ftrs
+    rec$aligned.ftrs <- aligned_features
     rec$pk.times <- pk.times
     rec$mz.tol <- mz_tol_relative
     rec$chr.tol <- rt_tol_relative

     if (do.plot) {
-      hist(mz.sd.rec, xlab = "m/z SD", ylab = "Frequency",
+      hist(mz_sd, xlab = "m/z SD", ylab = "Frequency",
            main = "m/z SD distribution")
       hist(apply(pk.times[, -1:-4], 1, sd, na.rm = TRUE),
            xlab = "Retention time SD", ylab = "Frequency",
Comment on lines 254 to 258

Member: This could be moved to the plotting.

Author: Same as in @zargham-ahmad's comment - do you have a suggestion? I think we would just create an artificial function which takes the same arguments as hist.

Comment on lines +255 to 258

Member: Can be moved to plot.R.

Author: Do you have any suggestions? I think we would just create an artificial function which takes the same arguments as hist.

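A sketch of the wrapper discussed in both threads (hypothetical; it could live in plot.R): a thin function taking the same arguments as hist(), so feature.align() no longer plots directly.

plot_histogram <- function(values, xlab, main) {
  hist(values, xlab = xlab, ylab = "Frequency", main = main)
}
# plot_histogram(mz_sd, "m/z SD", "m/z SD distribution")
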
44 changes: 29 additions & 15 deletions R/find.tol.R
@@ -20,25 +20,39 @@ find.tol <- function(mz_values,
                      do.plot = TRUE) {
   mz_values <- mz_values[order(mz_values)]
   l <- length(mz_values)
-  da <- (mz_values[2:l] - mz_values[1:(l - 1)]) / ((mz_values[2:l] + mz_values[1:(l - 1)]) / 2)
-  da <- da[da < mz_max_diff]
-  n <- min(max.bins, max(round(length(da) / aver.bin.size), min.bins))
-  des <- density(da, kernel = "gaussian", n = n, bw = mz_max_diff / n * 2, from = 0)
-  y <- des$y[des$x > 0]
-  x <- des$x[des$x > 0]
+  # pairwise m/z difference divided by their average, filtered outside of tolerance limit
+  distances <- (mz_values[2:l] - mz_values[1:(l - 1)]) /
+    ((mz_values[2:l] + mz_values[1:(l - 1)]) / 2)
+  distances <- distances[distances < mz_max_diff]

-  to.use <- da[da > max(da) / 4] - max(da) / 4
-  this.rate <- MASS::fitdistr(to.use, "exponential")$estimate
-  exp.y <- dexp(x, rate = this.rate)
-  exp.y <- exp.y * sum(y[x > max(da) / 4]) / sum(exp.y[x > max(da) / 4])
+  # number of equally spaced points at which the density is to be estimated
+  n <- min(max.bins, max(round(length(distances) / aver.bin.size), min.bins))
+  # estimate the probability density function of the distances
+  des <- density(distances, kernel = "gaussian", n = n,
+                 bw = mz_max_diff / n * 2, from = 0)
+  # the estimated density values
+  points <- des$y[des$x > 0]
+  # the n coordinates (distance values) at which the density is estimated
+  density_values <- des$x[des$x > 0]

-  yy <- cumsum(y > 1.5 * exp.y)
-  yi <- seq_along(yy)
-  sel <- min(which(yy < yi)) - 1
+  # keep distances above a quarter of the maximum, shifted towards zero
+  top_data <- distances[distances > max(distances) / 4] - max(distances) / 4
+  # estimate the rate parameter of the exponential distribution from this tail
+  lambda <- MASS::fitdistr(top_data, "exponential")$estimate
+  # evaluate the exponential density at the equally spaced points
+  estimated_density_values <- dexp(density_values, rate = lambda)
+  # rescale it to match the empirical density on the tail
+  estimated_density_values <- estimated_density_values * sum(points[density_values > max(distances) / 4]) /
+    sum(estimated_density_values[density_values > max(distances) / 4])
+
+  # the cutoff is selected where the empirical density stops exceeding 1.5 times the density of the exponential distribution
+  cumulative <- cumsum(points > 1.5 * estimated_density_values)
+  cumulative_indices <- seq_along(cumulative)
+  selected <- min(which(cumulative < cumulative_indices)) - 1

   if (do.plot) {
-    tolerance_plot(x, y, exp.y, sel, main = "find m/z tolerance")
+    tolerance_plot(density_values, points, estimated_density_values, selected, main = "find m/z tolerance")
   }

-  return(x[sel])
+  return(density_values[selected])
 }
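
A self-contained sketch of the cutoff logic above on synthetic data (requires MASS; values are illustrative): true matches form a spike near zero, random pairs follow a roughly exponential tail, and the tolerance is the point where the empirical density stops exceeding 1.5 times the fitted exponential.

set.seed(3)
distances <- c(abs(rnorm(300, 0, 2e-6)), rexp(700, rate = 5e4))
des <- density(distances, kernel = "gaussian", n = 256, from = 0)
x <- des$x[des$x > 0]
y <- des$y[des$x > 0]
tail_data <- distances[distances > max(distances) / 4] - max(distances) / 4
lambda <- MASS::fitdistr(tail_data, "exponential")$estimate
exp_y <- dexp(x, rate = lambda)
exp_y <- exp_y * sum(y[x > max(distances) / 4]) / sum(exp_y[x > max(distances) / 4])
cutoff <- x[min(which(cumsum(y > 1.5 * exp_y) < seq_along(x))) - 1]
cutoff  # estimated relative m/z tolerance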