Merge pull request #88 from xtrojak/37-alignment-refactor-final

Align features refactoring (vol. 2)

hechth authored Aug 10, 2022
2 parents 38391f1 + bcb3b59 commit cf207d7
Showing 6 changed files with 309 additions and 211 deletions.
8 changes: 4 additions & 4 deletions R/adjust.time.R
@@ -113,8 +113,8 @@ adjust.time <- function(features,

values <- get_feature_values(features, rt_colname)
mz <- values$mz
- chr <- values$chr
- lab <- values$lab
+ chr <- values$rt
+ lab <- values$sample_id

if(is.na(mz_tol_relative)) {
mz_tol_relative <- find.tol(mz, mz_max_diff = mz_max_diff, do.plot = do.plot)
@@ -136,7 +136,7 @@ adjust.time <- function(features,
rt_tol_relative = rt_tol_relative,
mz_tol_absolute = mz_tol_absolute,
do.plot = do.plot)
- rt_tol_relative <- all.ft$chr.tol
+ rt_tol_relative <- all.ft$rt.tol

message("**** performing time correction ****")
message(paste("m/z tolerance level: ", mz_tol_relative))
@@ -145,7 +145,7 @@ adjust.time <- function(features,
for(i in 1:number_of_samples) {
this <- features[[i]]
sel <- which(all.ft$lab == i)
- that <- cbind(all.ft$mz[sel], all.ft$chr[sel], all.ft$grps[sel])
+ that <- cbind(all.ft$mz[sel], all.ft$rt[sel], all.ft$grps[sel])
this <- this[order(this[, 1], this[, 2]), ]
that <- that[order(that[, 1], that[, 2]), ]
this <- cbind(this, V6=rep(i, nrow(this)), V7=that[, 3])
235 changes: 143 additions & 92 deletions R/feature.align.R
@@ -9,16 +9,84 @@ to_attach <- function(pick, number_of_samples, use = "sum") {
return(c(pick[1], pick[2], pick[1], pick[1], strengths))
} else {
for (i in seq_along(strengths)) {
+ # select all areas/RTs from the same sample
+ values <- pick[pick[, 6] == i, 5]
if (use == "sum")
-   strengths[i] <- sum(pick[pick[, 6] == i, 5])
+   strengths[i] <- sum(values)
if (use == "median")
-   strengths[i] <- median(pick[pick[, 6] == i, 5])
+   strengths[i] <- median(values)
+ # can be NA if pick does not contain any data from a sample
}
+ # average of m/z, average of rt, min of m/z, max of m/z, sum/median of areas/RTs
return(c(mean(pick[, 1]), mean(pick[, 2]), min(pick[, 1]),
max(pick[, 1]), strengths))
}
}
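A minimal sketch of what the refactored to_attach computes, using hypothetical values; the column layout (mz, rt, rt, rt, area, sample_id) is the one the callers below construct, and the initialization of strengths (assumed to hold one slot per sample) sits above this hunk:

pick <- rbind(
  c(100.001, 120.0, 120.0, 120.0, 5e4, 1),
  c(100.003, 120.4, 120.4, 120.4, 6e4, 1),
  c(100.002, 120.2, 120.2, 120.2, 7e4, 2)
)
to_attach(pick, number_of_samples = 2, use = "sum")
# -> c(100.002, 120.2, 100.001, 100.003, 110000, 70000)
#    (mean mz, mean rt, min mz, max mz, summed area per sample)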


+ create_output <- function(sample_grouped, number_of_samples, deviation) {
+   return(c(to_attach(sample_grouped, number_of_samples, use = "sum"),
+            to_attach(sample_grouped[, c(1, 2, 3, 4, 2, 6)], number_of_samples, use = "median"),
+            deviation))
+ }
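The column permutation in the second to_attach call is the subtle part: c(1, 2, 3, 4, 2, 6) drops the area column (5) and puts a second copy of the rt column (2) in its place, so the per-sample "strengths" become median retention times rather than summed areas. A small illustration with hypothetical values:

m <- rbind(c(100.0, 120.0, 120.0, 120.0, 5e4, 1),
           c(100.0, 121.0, 121.0, 121.0, 6e4, 1))
m[, c(1, 2, 3, 4, 2, 6)][, 5]  # -> c(120, 121): rt now sits where area was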


+ validate_contents <- function(samples, min_occurrence) {
+   # validate whether data is still from at least 'min_occurrence' number of samples
+   if (!is.null(nrow(samples))) {
+     if (length(unique(samples[, 6])) >= min_occurrence) {
+       return(TRUE)
+     }
+     return(FALSE)
+   }
+   return(FALSE)
+ }


+ find_optima <- function(data, bandwidth) {
+   # kernel density estimation
+   den <- density(data, bw = bandwidth)
+   # find the local maxima (peaks) and minima (valleys) of the density curve
+   turns <- find.turn.point(den$y)
+   return(list(peaks = den$x[turns$pks], valleys = den$x[turns$vlys]))
+ }
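find.turn.point is internal to this package; assuming it returns the indices of local maxima (pks) and minima (vlys) of the density curve, and presumably also treats the curve ends as valleys so that bounds always exist, a rough stand-in based on sign changes of the first difference would look like this sketch:

find_turns_sketch <- function(y) {
  s <- diff(sign(diff(y)))
  list(pks = which(s < 0) + 1, vlys = which(s > 0) + 1)
}
d <- density(c(rnorm(50, 10), rnorm(50, 20)), bw = 1)
str(find_turns_sketch(d$y))  # typically two peak indices, one valley between the modes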


+ filter_based_on_density <- function(sample, turns, index, i) {
+   # select data between the nearest valleys around the i-th density peak
+   lower_bound <- max(turns$valleys[turns$valleys < turns$peaks[i]])
+   upper_bound <- min(turns$valleys[turns$valleys > turns$peaks[i]])
+   selected <- which(sample[, index] > lower_bound & sample[, index] <= upper_bound)
+   return(sample[selected, ])
+ }


+ select_rt <- function(sample, rt_tol_relative, min_occurrence, number_of_samples) {
+   # split the group at valleys of the rt density
+   turns <- find_optima(sample[, 2], bandwidth = rt_tol_relative / 1.414)
+   for (i in seq_along(turns$peaks)) {
+     sample_grouped <- filter_based_on_density(sample, turns, 2, i)
+     if (validate_contents(sample_grouped, min_occurrence)) {
+       return(create_output(sample_grouped, number_of_samples, sd(sample_grouped[, 1], na.rm = TRUE)))
+     }
+   }
+ }


+ select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence, number_of_samples) {
+   # split the group at valleys of the m/z density
+   turns <- find_optima(sample[, 1], bandwidth = mz_tol_relative * median(sample[, 1]))
+   for (i in seq_along(turns$peaks)) {
+     sample_grouped <- filter_based_on_density(sample, turns, 1, i)
+     if (validate_contents(sample_grouped, min_occurrence)) {
+       return(select_rt(sample_grouped, rt_tol_relative, min_occurrence, number_of_samples))
+     }
+   }
+ }
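Taken together, select_mz and select_rt implement a two-stage split of one preliminary group: first cut at valleys of the m/z density, then cut each m/z sub-peak at valleys of the rt density, emitting the first sub-cluster that still covers min_occurrence samples. A sketch of the intended call, with hypothetical values and the same six-column layout as above:

sample <- cbind(mz = c(100.0000, 100.0008, 100.2000),
                rt = c(120, 121, 300),
                rt2 = c(120, 121, 300),
                rt3 = c(120, 121, 300),
                area = c(1e4, 2e4, 3e4),
                sample_id = c(1, 2, 1))
# the 100.2 row should fall beyond a valley of the m/z density and be cut
# away before the rt stage:
select_mz(sample, mz_tol_relative = 1e-5, rt_tol_relative = 2,
          min_occurrence = 2, number_of_samples = 2)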


#' Align peaks from spectra into a feature table.
#'
#' Identifies which of the peaks from the profiles correspond to the same feature.
@@ -65,22 +133,22 @@ feature.align <- function(features,
if (number_of_samples > 1) {
values <- get_feature_values(features, rt_colname)
mz_values <- values$mz
- chr <- values$chr
- lab <- values$lab
+ rt <- values$rt
+ sample_id <- values$sample_id

- o <- order(mz_values, chr)
- mz_values <- mz_values[o]
- chr <- chr[o]
- lab <- lab[o]
+ # sort all values by m/z, breaking ties by rt
+ ordering <- order(mz_values, rt)
+ mz_values <- mz_values[ordering]
+ rt <- rt[ordering]
+ sample_id <- sample_id[ordering]

+ # find relative m/z tolerance level
if (is.na(mz_tol_relative)) {
mz_tol_relative <- find.tol(mz_values, mz_max_diff = mz_max_diff, do.plot = do.plot)
if (length(mz_tol_relative) == 0) {
mz_tol_relative <- 1e-5
- warning(
-   "Automatic tolerance finding failed, 10 ppm was assigned. May need to manually assign alignment mz tolerance level."
- )
+ warning("Automatic tolerance finding failed, 10 ppm was assigned.
+   May need to manually assign alignment mz tolerance level.")
}
} else if (do.plot) {
draw_plot(main = "alignment m/z tolerance level given",
@@ -93,119 +161,102 @@ }
}

# find relative retention time tolerance level
all.ft <- find.tol.time(mz_values,
chr,
lab,
number_of_samples = number_of_samples,
mz_tol_relative = mz_tol_relative,
rt_tol_relative = rt_tol_relative,
mz_tol_absolute = mz_tol_absolute,
do.plot = do.plot)
rt_tol_relative <- all.ft$chr.tol
# also does some preprocessing grouping steps
all_features <- find.tol.time(mz_values,
rt,
sample_id,
number_of_samples = number_of_samples,
mz_tol_relative = mz_tol_relative,
rt_tol_relative = rt_tol_relative,
mz_tol_absolute = mz_tol_absolute,
do.plot = do.plot)
rt_tol_relative <- all_features$rt.tol

message("**** performing feature alignment ****")
message(paste("m/z tolerance level: ", mz_tol_relative))
message(paste("time tolerance level:", rt_tol_relative))

- aligned.ftrs <- pk.times <- rep(0, 4 + number_of_samples)
- mz.sd.rec <- 0
- labels <- unique(all.ft$grps)
+ # initialize zero vectors of length 4 + number_of_samples
+ # (four descriptor columns plus one column per sample)
+ aligned_features <- pk.times <- rep(0, 4 + number_of_samples)
+ mz_sd <- 0
+ labels <- unique(all_features$grps)
area <- grps <- mz_values

# grouping the features based on their m/z values (assuming the tolerance level)
sizes <- c(0, cumsum(sapply(features, nrow)))
for (i in 1:number_of_samples) {
- this <- features[[i]]
- sel <- which(all.ft$lab == i)
- that <- cbind(all.ft$mz[sel], all.ft$chr[sel], all.ft$grps[sel])
- this <- this[order(this[, 1], this[, 2]),]
- that <- that[order(that[, 1], that[, 2]),]
+ sample <- features[[i]]
+ # order by m/z, then by rt
+ sample <- sample[order(sample[, 1], sample[, 2]),]

+ # select preprocessed features belonging to the current sample
+ group_ids <- which(all_features$lab == i)
+ # select m/z, rt and their group ID
+ sample_grouped <- cbind(all_features$mz[group_ids], all_features$rt[group_ids], all_features$grps[group_ids])
+ sample_grouped <- sample_grouped[order(sample_grouped[, 1], sample_grouped[, 2]),]

- mz_values[(sizes[i] + 1):sizes[i + 1]] <- this[, 1]
- chr[(sizes[i] + 1):sizes[i + 1]] <- this[, 2]
- area[(sizes[i] + 1):sizes[i + 1]] <- this[, 5]
- grps[(sizes[i] + 1):sizes[i + 1]] <- that[, 3]
- lab[(sizes[i] + 1):sizes[i + 1]] <- i
+ # update m/z, rt, area values with the ordered ones
+ mz_values[(sizes[i] + 1):sizes[i + 1]] <- sample[, 1]
+ rt[(sizes[i] + 1):sizes[i + 1]] <- sample[, 2]
+ area[(sizes[i] + 1):sizes[i + 1]] <- sample[, 5]
+ # assign group identifier
+ grps[(sizes[i] + 1):sizes[i + 1]] <- sample_grouped[, 3]
+ # assign sample identifier
+ sample_id[(sizes[i] + 1):sizes[i + 1]] <- i
}
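The sizes vector holds cumulative row offsets, so (sizes[i] + 1):sizes[i + 1] addresses the slice of the flat vectors that belongs to sample i; for example, with hypothetical per-sample feature counts:

features_nrow <- c(3, 2, 4)
sizes <- c(0, cumsum(features_nrow))  # -> 0 3 5 9
(sizes[2] + 1):sizes[3]               # -> 4 5, the rows of sample 2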

- ttt <- table(all.ft$grps)
- curr.row <- sum(ttt >= min_occurrence) * 3
- mz.sd.rec <- rep(0, curr.row)
+ # table with the number of features per group
+ groups_cardinality <- table(all_features$grps)
+ # count groups with at least the minimal occurrence
+ # TODO: why times 3? shouldn't this be number_of_samples? !!!
+ curr.row <- sum(groups_cardinality >= min_occurrence) * 3
+ mz_sd <- rep(0, curr.row)

- sel.labels <- as.numeric(names(ttt)[ttt >= min_occurrence])
+ sel.labels <- as.numeric(names(groups_cardinality)[groups_cardinality >= min_occurrence])

# retention time alignment
- aligned.ftrs <-
+ aligned_features <-
foreach::foreach(i = seq_along(sel.labels), .combine = rbind) %do% {
if (i %% 100 == 0)
-   gc()
- this.return <- NULL
- sel <- which(grps == sel.labels[i])
- if (length(sel) > 1) {
-   this <- cbind(mz_values[sel], chr[sel], chr[sel], chr[sel], area[sel], lab[sel])
-   if (length(unique(this[, 6])) >= min_occurrence) {
-     this.den <- density(this[, 1], bw = mz_tol_relative * median(this[, 1]))
-     turns <- find.turn.point(this.den$y)
-     pks <- this.den$x[turns$pks]
-     vlys <- this.den$x[turns$vlys]
-     for (j in seq_along(pks)) {
-       this.lower <- max(vlys[vlys < pks[j]])
-       this.upper <- min(vlys[vlys > pks[j]])
-       this.sel <- which(this[, 1] > this.lower & this[, 1] <= this.upper)
-       that <- this[this.sel, ]
-       if (!is.null(nrow(that))) {
-         if (length(unique(that[, 6])) >= min_occurrence) {
-           that.den <- density(that[, 2], bw = rt_tol_relative / 1.414)
-           that.turns <- find.turn.point(that.den$y)
-           that.pks <- that.den$x[that.turns$pks]
-           that.vlys <- that.den$x[that.turns$vlys]
-           for (k in seq_along(that.pks)) {
-             that.lower <- max(that.vlys[that.vlys < that.pks[k]])
-             that.upper <- min(that.vlys[that.vlys > that.pks[k]])
-             thee <- that[that[, 2] > that.lower & that[, 2] <= that.upper, ]
-             if (!is.null(nrow(thee))) {
-               if (length(unique(thee[, 6])) >= min_occurrence) {
-                 this.return <-
-                   c(to_attach(thee, number_of_samples, use = "sum"),
-                     to_attach(thee[, c(1, 2, 3, 4, 2, 6)], number_of_samples, use = "median"),
-                     sd(thee[, 1], na.rm = TRUE)
-                   )
-               }
-             }
-           }
-         }
-       }
-     }
-   }
- } else {
-   if (min_occurrence == 1) {
-     thee <- c(mz_values[sel], chr[sel], chr[sel], chr[sel], area[sel], lab[sel])
-     this.return <- c(to_attach(thee, number_of_samples, use = "sum"),
-                      to_attach(thee[c(1, 2, 3, 4, 2, 6)], number_of_samples, use = "median"),
-                      NA
-     )
-   }
- }
+   gc() # periodically trigger garbage collection to limit memory use
+ # select a group
+ group_ids <- which(grps == sel.labels[i])
+ if (length(group_ids) > 1) {
+   # select data from the group
+   sample <- cbind(mz_values[group_ids], rt[group_ids], rt[group_ids],
+                   rt[group_ids], area[group_ids], sample_id[group_ids])
+   # continue only if the data comes from at least 'min_occurrence' samples
+   if (validate_contents(sample, min_occurrence)) {
+     return(select_mz(sample, mz_tol_relative, rt_tol_relative, min_occurrence, number_of_samples))
+   }
+ } else if (min_occurrence == 1) {
+   sample_grouped <- c(mz_values[group_ids], rt[group_ids], rt[group_ids],
+                       rt[group_ids], area[group_ids], sample_id[group_ids])
+   return(create_output(sample_grouped, number_of_samples, NA))
+ }
}
- this.return
+ return(NULL)
}

- pk.times <- aligned.ftrs[, (5 + number_of_samples):(2 * (4 + number_of_samples))]
- mz.sd.rec <- aligned.ftrs[, ncol(aligned.ftrs)]
- aligned.ftrs <- aligned.ftrs[, 1:(4 + number_of_samples)]
+ # select columns: average m/z, average rt, min m/z, max m/z, median rt per sample (the second to_attach call)
+ pk.times <- aligned_features[, (5 + number_of_samples):(2 * (4 + number_of_samples))]
+ mz_sd <- aligned_features[, ncol(aligned_features)]
+ # select columns: average m/z, average rt, min m/z, max m/z, sum of areas per sample (the first to_attach call)
+ aligned_features <- aligned_features[, 1:(4 + number_of_samples)]
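Each row produced by create_output has length 2 * (4 + number_of_samples) + 1, and the two slices above simply undo that concatenation; for example, with a hypothetical number_of_samples = 2:

n <- 2
2 * (4 + n) + 1        # -> 13, the full row length
1:(4 + n)              # -> columns 1..6: the "sum" block (aligned_features)
(5 + n):(2 * (4 + n))  # -> columns 7..12: the "median" block (pk.times)
# column 13 holds the m/z standard deviation (mz_sd)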

- colnames(aligned.ftrs) <-
+ # rename columns in both tables; sample columns are named "exp 1" ... "exp n"
+ colnames(aligned_features) <-
colnames(pk.times) <- c("mz", "time", "mz.min", "mz.max", paste("exp", 1:number_of_samples))

+ # return both tables and both computed tolerances
rec <- new("list")
- rec$aligned.ftrs <- aligned.ftrs
+ rec$aligned.ftrs <- aligned_features
rec$pk.times <- pk.times
rec$mz.tol <- mz_tol_relative
rec$chr.tol <- rt_tol_relative

if (do.plot) {
- hist(mz.sd.rec, xlab = "m/z SD", ylab = "Frequency",
+ hist(mz_sd, xlab = "m/z SD", ylab = "Frequency",
main = "m/z SD distribution")
hist(apply(pk.times[, -1:-4], 1, sd, na.rm = TRUE),
xlab = "Retention time SD", ylab = "Frequency",
44 changes: 29 additions & 15 deletions R/find.tol.R
@@ -24,25 +24,39 @@ find.tol <- function(mz_values,
do.plot = TRUE) {
mz_values <- mz_values[order(mz_values)]
l <- length(mz_values)
- da <- (mz_values[2:l] - mz_values[1:(l - 1)]) / ((mz_values[2:l] + mz_values[1:(l - 1)]) / 2)
- da <- da[da < mz_max_diff]
- n <- min(max.bins, max(round(length(da) / aver.bin.size), min.bins))
- des <- density(da, kernel = "gaussian", n = n, bw = mz_max_diff / n * 2, from = 0)
- y <- des$y[des$x > 0]
- x <- des$x[des$x > 0]
+ # pairwise m/z differences of adjacent sorted values, divided by their
+ # average, and filtered to those below the tolerance limit
+ distances <- (mz_values[2:l] - mz_values[1:(l - 1)]) /
+   ((mz_values[2:l] + mz_values[1:(l - 1)]) / 2)
+ distances <- distances[distances < mz_max_diff]

- to.use <- da[da > max(da) / 4] - max(da) / 4
- this.rate <- MASS::fitdistr(to.use, "exponential")$estimate
- exp.y <- dexp(x, rate = this.rate)
- exp.y <- exp.y * sum(y[x > max(da) / 4]) / sum(exp.y[x > max(da) / 4])
+ # number of equally spaced points at which the density is to be estimated
+ n <- min(max.bins, max(round(length(distances) / aver.bin.size), min.bins))
+ # estimate the probability density function of the distances
+ des <- density(distances, kernel = "gaussian", n = n,
+                bw = mz_max_diff / n * 2, from = 0)
+ # the estimated density values (des$y)
+ points <- des$y[des$x > 0]
+ # the coordinates at which the density was estimated (des$x)
+ density_values <- des$x[des$x > 0]

- yy <- cumsum(y > 1.5 * exp.y)
- yi <- seq_along(yy)
- sel <- min(which(yy < yi)) - 1
+ # fit an exponential to the upper tail: distances above a quarter of the
+ # maximum, shifted to start at zero
+ top_data <- distances[distances > max(distances) / 4] - max(distances) / 4
+ # estimate the rate parameter of the exponential distribution
+ lambda <- MASS::fitdistr(top_data, "exponential")$estimate
+ # evaluate the exponential density at the equally spaced points
+ estimated_density_values <- dexp(density_values, rate = lambda)
+ # rescale the exponential density so its mass matches the empirical density
+ # over the tail region
+ estimated_density_values <- estimated_density_values * sum(points[density_values > max(distances) / 4]) /
+   sum(estimated_density_values[density_values > max(distances) / 4])

+ # the cutoff is placed where the empirical density stops exceeding 1.5 times
+ # the fitted exponential density
+ cumulative <- cumsum(points > 1.5 * estimated_density_values)
+ cumulative_indices <- seq_along(cumulative)
+ selected <- min(which(cumulative < cumulative_indices)) - 1

if (do.plot) {
- tolerance_plot(x, y, exp.y, sel, main = "find m/z tolerance")
+ tolerance_plot(density_values, points, estimated_density_values, selected, main = "find m/z tolerance")
}

- return(x[sel])
+ return(density_values[selected])
}
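The cutoff arithmetic is easier to see on a toy vector: cumulative counts how many density points so far exceed 1.5 times the fitted exponential, and the first position where that running count falls behind its own index marks where the empirical density stops dominating the noise model (values are hypothetical):

exceeds <- c(TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, FALSE)
cumulative <- cumsum(exceeds)     # -> 1 2 3 3 4 4 4
idx <- seq_along(cumulative)      # -> 1 2 3 4 5 6 7
min(which(cumulative < idx)) - 1  # -> 3, so density_values[3] is the tolerance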