-
Notifications
You must be signed in to change notification settings - Fork 8
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Align features refactoring (vol. 2) #88
Changes from all commits
5d3df6e
4610739
30c2d70
e57c27a
9772986
34e23c9
cdd72d4
753875a
bcb3b59
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,16 +5,84 @@ to_attach <- function(pick, number_of_samples, use = "sum") { | |
return(c(pick[1], pick[2], pick[1], pick[1], strengths)) | ||
} else { | ||
for (i in seq_along(strengths)) { | ||
# select all areas/RTs from the same sample | ||
values <- pick[pick[, 6] == i, 5] | ||
if (use == "sum") | ||
strengths[i] <- sum(pick[pick[, 6] == i, 5]) | ||
strengths[i] <- sum(values) | ||
if (use == "median") | ||
strengths[i] <- median(pick[pick[, 6] == i, 5]) | ||
strengths[i] <- median(values) | ||
# can be NA if pick does not contain any data from a sample | ||
} | ||
# average of m/z, average of rt, min of m/z, max of m/z, sum/median of areas/RTs | ||
return(c(mean(pick[, 1]), mean(pick[, 2]), min(pick[, 1]), | ||
max(pick[, 1]), strengths)) | ||
} | ||
} | ||
|
||
|
||
create_output <- function(sample_grouped, number_of_samples, deviation) { | ||
return(c(to_attach(sample_grouped, number_of_samples, use = "sum"), | ||
to_attach(sample_grouped[, c(1, 2, 3, 4, 2, 6)], number_of_samples, use = "median"), | ||
deviation | ||
) | ||
) | ||
} | ||
|
||
|
||
validate_contents <- function(samples, min_occurrence) { | ||
# validate whether data is still from at least 'min_occurrence' number of samples | ||
if (!is.null(nrow(samples))) { | ||
if (length(unique(samples[, 6])) >= min_occurrence) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Indexing should rather be done via the column name than via a numeric index. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. At this point, There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh - okay. |
||
return(TRUE) | ||
} | ||
return(FALSE) | ||
} | ||
return(FALSE) | ||
} | ||
|
||
|
||
find_optima <- function(data, bandwidth) { | ||
# Kernel Density Estimation | ||
den <- density(data, bw = bandwidth) | ||
# select statistically significant points | ||
turns <- find.turn.point(den$y) | ||
return(list(peaks = den$x[turns$pks], valleys = den$x[turns$vlys])) | ||
} | ||
Comment on lines
+44
to
+50
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function was also extracted in many other refactorings. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's address this together in #65. |
||
|
||
|
||
filter_based_on_density <- function(sample, turns, index, i) { | ||
# select data within lower and upper bound from density estimation | ||
lower_bound <- max(turns$valleys[turns$valleys < turns$peaks[i]]) | ||
upper_bound <- min(turns$valleys[turns$valleys > turns$peaks[i]]) | ||
selected <- which(sample[, index] > lower_bound & sample[, index] <= upper_bound) | ||
return(sample[selected, ]) | ||
} | ||
Comment on lines
+53
to
+59
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same - this function should be re-used in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again, should be properly fixed as part of #65. |
||
|
||
|
||
select_rt <- function(sample, rt_tol_relative, min_occurrence, number_of_samples) { | ||
# turns for rt | ||
turns <- find_optima(sample[, 2], bandwidth = rt_tol_relative / 1.414) | ||
for (i in seq_along(turns$peaks)) { | ||
sample_grouped <- filter_based_on_density(sample, turns, 2, i) | ||
if (validate_contents(sample_grouped, min_occurrence)) { | ||
return(create_output(sample_grouped, number_of_samples, sd(sample_grouped[, 1], na.rm = TRUE))) | ||
} | ||
} | ||
} | ||
|
||
|
||
select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence, number_of_samples) { | ||
# turns for m/z | ||
turns <- find_optima(sample[, 1], bandwidth = mz_tol_relative * median(sample[, 1])) | ||
for (i in seq_along(turns$peaks)) { | ||
sample_grouped <- filter_based_on_density(sample, turns, 1, i) | ||
if (validate_contents(sample_grouped, min_occurrence)) { | ||
return(select_rt(sample_grouped, rt_tol_relative, min_occurrence, number_of_samples)) | ||
} | ||
} | ||
} | ||
|
||
|
||
#' Align peaks from spectra into a feature table. | ||
#' | ||
#' Identifies which of the peaks from the profiles correspond to the same feature. | ||
|
@@ -61,22 +129,22 @@ feature.align <- function(features, | |
if (number_of_samples > 1) { | ||
values <- get_feature_values(features, rt_colname) | ||
mz_values <- values$mz | ||
chr <- values$chr | ||
lab <- values$lab | ||
rt <- values$rt | ||
sample_id <- values$sample_id | ||
|
||
o <- order(mz_values, chr) | ||
mz_values <- mz_values[o] | ||
chr <- chr[o] | ||
lab <- lab[o] | ||
# sort all values by m/z, if equal by rt | ||
ordering <- order(mz_values, rt) | ||
mz_values <- mz_values[ordering] | ||
rt <- rt[ordering] | ||
sample_id <- sample_id[ordering] | ||
Comment on lines
+135
to
+139
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Data could be arranged in a |
||
|
||
# find relative m/z tolerance level | ||
if (is.na(mz_tol_relative)) { | ||
mz_tol_relative <- find.tol(mz_values, mz_max_diff = mz_max_diff, do.plot = do.plot) | ||
if (length(mz_tol_relative) == 0) { | ||
mz_tol_relative <- 1e-5 | ||
warning( | ||
"Automatic tolerance finding failed, 10 ppm was assigned. May need to manually assign alignment mz tolerance level." | ||
) | ||
warning("Automatic tolerance finding failed, 10 ppm was assigned. | ||
May need to manually assign alignment mz tolerance level.") | ||
Comment on lines
+146
to
+147
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is something that we should perhaps address: I doubt someone will see this warning given that most of our test cases produce dozens of warnings. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you think the function should fail with an error? Or we could parametrise the whole function to either use a default value or fail with an error. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No, I don't think an error would be an appropriate solution. I just wanted to point out that apLCMS may produce a meaningful warning, so we should make sure that the warning doesn't get lost in a bunch of deprecation warnings, etc. I think we just have to reduce the number of other warnings, like deprecation, imports, and so on. This should be done with #93. |
||
} | ||
} else if (do.plot) { | ||
draw_plot(main = "alignment m/z tolerance level given", | ||
|
@@ -89,119 +157,102 @@ feature.align <- function(features, | |
} | ||
|
||
# find relative retention time tolerance level | ||
all.ft <- find.tol.time(mz_values, | ||
chr, | ||
lab, | ||
number_of_samples = number_of_samples, | ||
mz_tol_relative = mz_tol_relative, | ||
rt_tol_relative = rt_tol_relative, | ||
mz_tol_absolute = mz_tol_absolute, | ||
do.plot = do.plot) | ||
rt_tol_relative <- all.ft$chr.tol | ||
# also does some preprocessing grouping steps | ||
all_features <- find.tol.time(mz_values, | ||
rt, | ||
sample_id, | ||
number_of_samples = number_of_samples, | ||
mz_tol_relative = mz_tol_relative, | ||
rt_tol_relative = rt_tol_relative, | ||
mz_tol_absolute = mz_tol_absolute, | ||
do.plot = do.plot) | ||
rt_tol_relative <- all_features$rt.tol | ||
|
||
message("**** performing feature alignment ****") | ||
message(paste("m/z tolerance level: ", mz_tol_relative)) | ||
message(paste("time tolerance level:", rt_tol_relative)) | ||
|
||
aligned.ftrs <- pk.times <- rep(0, 4 + number_of_samples) | ||
mz.sd.rec <- 0 | ||
|
||
labels <- unique(all.ft$grps) | ||
# create zero vectors of length number_of_samples + 4 ? | ||
aligned_features <- pk.times <- rep(0, 4 + number_of_samples) | ||
mz_sd <- 0 | ||
|
||
labels <- unique(all_features$grps) | ||
area <- grps <- mz_values | ||
|
||
# grouping the features based on their m/z values (assuming the tolerance level) | ||
sizes <- c(0, cumsum(sapply(features, nrow))) | ||
for (i in 1:number_of_samples) { | ||
this <- features[[i]] | ||
sel <- which(all.ft$lab == i) | ||
that <- cbind(all.ft$mz[sel], all.ft$chr[sel], all.ft$grps[sel]) | ||
this <- this[order(this[, 1], this[, 2]),] | ||
that <- that[order(that[, 1], that[, 2]),] | ||
sample <- features[[i]] | ||
# order by m/z then by rt | ||
sample <- sample[order(sample[, 1], sample[, 2]),] | ||
|
||
# select preprocessed features belonging to current sample | ||
group_ids <- which(all_features$lab == i) | ||
# select m/z, rt and their group ID | ||
sample_grouped <- cbind(all_features$mz[group_ids], all_features$rt[group_ids], all_features$grps[group_ids]) | ||
sample_grouped <- sample_grouped[order(sample_grouped[, 1], sample_grouped[, 2]),] | ||
|
||
mz_values[(sizes[i] + 1):sizes[i + 1]] <- this[, 1] | ||
chr[(sizes[i] + 1):sizes[i + 1]] <- this[, 2] | ||
area[(sizes[i] + 1):sizes[i + 1]] <- this[, 5] | ||
grps[(sizes[i] + 1):sizes[i + 1]] <- that[, 3] | ||
lab[(sizes[i] + 1):sizes[i + 1]] <- i | ||
# update m/z, rt, area values with ordered ones | ||
mz_values[(sizes[i] + 1):sizes[i + 1]] <- sample[, 1] | ||
rt[(sizes[i] + 1):sizes[i + 1]] <- sample[, 2] | ||
area[(sizes[i] + 1):sizes[i + 1]] <- sample[, 5] | ||
# assign row identifier | ||
grps[(sizes[i] + 1):sizes[i + 1]] <- sample_grouped[, 3] | ||
# assign batch identifier | ||
sample_id[(sizes[i] + 1):sizes[i + 1]] <- i | ||
} | ||
|
||
ttt <- table(all.ft$grps) | ||
curr.row <- sum(ttt >= min_occurrence) * 3 | ||
mz.sd.rec <- rep(0, curr.row) | ||
# table with number of values per group | ||
groups_cardinality <- table(all_features$grps) | ||
# count those with minimal occurrence | ||
# (times 3 ? shouldn't be number of samples) !!! | ||
curr.row <- sum(groups_cardinality >= min_occurrence) * 3 | ||
mz_sd <- rep(0, curr.row) | ||
|
||
sel.labels <- as.numeric(names(ttt)[ttt >= min_occurrence]) | ||
sel.labels <- as.numeric(names(groups_cardinality)[groups_cardinality >= min_occurrence]) | ||
|
||
# retention time alignment | ||
aligned.ftrs <- | ||
aligned_features <- | ||
foreach::foreach(i = seq_along(sel.labels), .combine = rbind) %do% { | ||
if (i %% 100 == 0) | ||
gc() | ||
this.return <- NULL | ||
sel <- which(grps == sel.labels[i]) | ||
if (length(sel) > 1) { | ||
this <- cbind(mz_values[sel], chr[sel], chr[sel], chr[sel], area[sel], lab[sel]) | ||
if (length(unique(this[, 6])) >= min_occurrence) { | ||
this.den <- density(this[, 1], bw = mz_tol_relative * median(this[, 1])) | ||
turns <- find.turn.point(this.den$y) | ||
pks <- this.den$x[turns$pks] | ||
vlys <- this.den$x[turns$vlys] | ||
for (j in seq_along(pks)) { | ||
this.lower <- max(vlys[vlys < pks[j]]) | ||
this.upper <- min(vlys[vlys > pks[j]]) | ||
this.sel <- which(this[, 1] > this.lower & this[, 1] <= this.upper) | ||
that <- this[this.sel, ] | ||
if (!is.null(nrow(that))) { | ||
if (length(unique(that[, 6])) >= min_occurrence) { | ||
that.den <- density(that[, 2], bw = rt_tol_relative / 1.414) | ||
that.turns <- find.turn.point(that.den$y) | ||
that.pks <- that.den$x[that.turns$pks] | ||
that.vlys <- that.den$x[that.turns$vlys] | ||
for (k in seq_along(that.pks)) { | ||
that.lower <- max(that.vlys[that.vlys < that.pks[k]]) | ||
that.upper <- min(that.vlys[that.vlys > that.pks[k]]) | ||
thee <- that[that[, 2] > that.lower & that[, 2] <= that.upper, ] | ||
if (!is.null(nrow(thee))) { | ||
if (length(unique(thee[, 6])) >= min_occurrence) { | ||
this.return <- | ||
c(to_attach(thee, number_of_samples, use = "sum"), | ||
to_attach(thee[, c(1, 2, 3, 4, 2, 6)], number_of_samples, use = "median"), | ||
sd(thee[, 1], na.rm = TRUE) | ||
) | ||
} | ||
} | ||
} | ||
} | ||
} | ||
} | ||
} | ||
} else { | ||
if (min_occurrence == 1) { | ||
thee <- c(mz_values[sel], chr[sel], chr[sel], chr[sel], area[sel], lab[sel]) | ||
this.return <- c(to_attach(thee, number_of_samples, use = "sum"), | ||
to_attach(thee[c(1, 2, 3, 4, 2, 6)], number_of_samples, use = "median"), | ||
NA | ||
) | ||
gc() # call Garbage Collection for performance improvement? | ||
# select a group | ||
group_ids <- which(grps == sel.labels[i]) | ||
if (length(group_ids) > 1) { | ||
# select data from the group | ||
sample <- cbind(mz_values[group_ids], rt[group_ids], rt[group_ids], | ||
rt[group_ids], area[group_ids], sample_id[group_ids]) | ||
# continue if data is from at least 'min_occurrence' samples | ||
if (validate_contents(sample, min_occurrence)) { | ||
return(select_mz(sample, mz_tol_relative, rt_tol_relative, min_occurrence, number_of_samples)) | ||
} | ||
} else if (min_occurrence == 1) { | ||
sample_grouped <- c(mz_values[group_ids], rt[group_ids], rt[group_ids], | ||
rt[group_ids], area[group_ids], sample_id[group_ids]) | ||
return(create_output(sample_grouped, number_of_samples, NA)) | ||
} | ||
this.return | ||
return(NULL) | ||
} | ||
|
||
pk.times <- aligned.ftrs[, (5 + number_of_samples):(2 * (4 + number_of_samples))] | ||
mz.sd.rec <- aligned.ftrs[, ncol(aligned.ftrs)] | ||
aligned.ftrs <- aligned.ftrs[, 1:(4 + number_of_samples)] | ||
# select columns: average of m/z, average of rt, min of m/z, max of m/z, median of rt per sample (the second to_attach call) | ||
pk.times <- aligned_features[, (5 + number_of_samples):(2 * (4 + number_of_samples))] | ||
mz_sd <- aligned_features[, ncol(aligned_features)] | ||
# select columns: average of m/z, average of rt, min of m/z, max of m/z, sum of areas per sample (the first to_attach call) | ||
aligned_features <- aligned_features[, 1:(4 + number_of_samples)] | ||
Comment on lines
+238
to
+241
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Computations for the numbers of columns could be extracted to variables to clearly indicate what is being used and why. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This will be changed in #87 anyway. |
||
|
||
colnames(aligned.ftrs) <- | ||
# rename columns on both tables, samples are called "exp_i" | ||
colnames(aligned_features) <- | ||
colnames(pk.times) <- c("mz", "time", "mz.min", "mz.max", paste("exp", 1:number_of_samples)) | ||
|
||
# return both tables and both computed tolerances | ||
rec <- new("list") | ||
rec$aligned.ftrs <- aligned.ftrs | ||
rec$aligned.ftrs <- aligned_features | ||
rec$pk.times <- pk.times | ||
rec$mz.tol <- mz_tol_relative | ||
rec$chr.tol <- rt_tol_relative | ||
|
||
if (do.plot) { | ||
hist(mz.sd.rec, xlab = "m/z SD", ylab = "Frequency", | ||
hist(mz_sd, xlab = "m/z SD", ylab = "Frequency", | ||
main = "m/z SD distribution") | ||
hist(apply(pk.times[, -1:-4], 1, sd, na.rm = TRUE), | ||
xlab = "Retention time SD", ylab = "Frequency", | ||
Comment on lines
254
to
258
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This could be moved to the plotting. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same as in @zargham-ahmad's comment, do you have a suggestion? I think we would just create an artificial function which takes the same arguments as
Comment on lines
+255
to
258
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can be moved to plot.R There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you have any suggestions? I think we would just create an artificial function which takes the same arguments as |
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is the role of the vector with swapped indices here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure if I understand the question.
`to_attach` is used to compute the sum of areas per sample (first call) and the retention times (second call) — that's why, in the second call, column 2 (RTs) is passed instead of column 5 (areas). I want to simplify this final step, and also the outputs of the alignment step, as part of #87.