Skip to content

Commit

Permalink
perf optimization
Browse files Browse the repository at this point in the history
  • Loading branch information
traversc committed Feb 22, 2024
1 parent e5f5ace commit 203a0ac
Show file tree
Hide file tree
Showing 30 changed files with 2,305 additions and 234 deletions.
Empty file modified R/RadixTree_search_helpers.R
100644 → 100755
Empty file.
Empty file modified R/pairwise.R
100644 → 100755
Empty file.
Empty file modified R/utils.R
100644 → 100755
Empty file.
Empty file modified R/zzz.R
100644 → 100755
Empty file.
65 changes: 36 additions & 29 deletions inst/extra_tests/benchmark.r
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,12 @@ run_og <- function(query, target, max_distance, show_progress = F) {
results %>% arrange(query, target)
}

run_dnatree <- function(query, target, max_distance=NULL, max_fraction=NULL, mode = "levenshtein", show_progress = FALSE, nthreads = 8) {
x <- treedist::DNATree$new()
x$insert(target)
x$search(query, max_distance = max_distance, max_fraction = max_fraction, mode = mode, show_progress=show_progress, nthreads=nthreads) %>%
arrange(query, target)
}
# run_dnatree <- function(query, target, max_distance=NULL, max_fraction=NULL, mode = "levenshtein", show_progress = FALSE, nthreads = 8) {
# x <- treedist::DNATree$new()
# x$insert(target)
# x$search(query, max_distance = max_distance, max_fraction = max_fraction, mode = mode, show_progress=show_progress, nthreads=nthreads) %>%
# arrange(query, target)
# }

run_radixtree <- function(query, target, max_distance=NULL, max_fraction=NULL, mode = "levenshtein", show_progress = FALSE, nthreads = 8) {
x <- seqtrie::RadixTree$new()
Expand All @@ -81,12 +81,12 @@ run_radixforest <- function(query, target, max_distance=NULL, max_fraction=NULL,
arrange(query, target)
}

run_prefixtree <- function(query, target, max_distance=NULL, max_fraction=NULL, mode = "levenshtein", show_progress = FALSE, nthreads = 8) {
x <- treedist::PrefixTree$new()
x$insert(target)
x$search(query, max_distance = max_distance, max_fraction = max_fraction, mode = mode, show_progress=show_progress, nthreads=nthreads) %>%
arrange(query, target)
}
# run_prefixtree <- function(query, target, max_distance=NULL, max_fraction=NULL, mode = "levenshtein", show_progress = FALSE, nthreads = 8) {
# x <- treedist::PrefixTree$new()
# x$insert(target)
# x$search(query, max_distance = max_distance, max_fraction = max_fraction, mode = mode, show_progress=show_progress, nthreads=nthreads) %>%
# arrange(query, target)
# }

run_stringdist <- function(query, target, max_distance=NULL, max_fraction=NULL, nthreads = 8, show_progress = F) {
results <- stringdist::stringdistmatrix(query, target, method = "lv", nthread=nthreads)
Expand All @@ -103,29 +103,32 @@ run_stringdist <- function(query, target, max_distance=NULL, max_fraction=NULL,
dplyr::arrange(results, query, target)
}

methods <- list(run_dnatree, run_radixtree, run_radixforest, run_prefixtree, run_stringdist, run_og)
names(methods) <- c("DNATree", "RadixTree", "RadixForest", "PrefixTree", "stringdist", "OG")
# methods <- list(run_dnatree, run_radixtree, run_radixforest, run_prefixtree, run_stringdist, run_og)
# names(methods) <- c("DNATree", "RadixTree", "RadixForest", "PrefixTree", "stringdist", "OG")

methods <- list(run_radixtree, run_radixforest, run_og)
names(methods) <- c("RadixTree", "RadixForest", "OG")

data("covid_cdr3")
# data("covid_cdr3")
cc3_subset <- sample(covid_cdr3, size = 1000)

sd_results <- run_stringdist(cc3_subset, cc3_subset, 2)
og_results <- run_og(cc3_subset, cc3_subset, 2)
# sd_results <- run_stringdist(cc3_subset, cc3_subset, 2)
# og_results <- run_og(cc3_subset, cc3_subset, 2)
# dt_results <- run_dnatree(cc3_subset, cc3_subset, max_distance = 2)
rt_results <- run_radixtree(cc3_subset, cc3_subset, max_distance = 2)
rf_results <- run_radixforest(cc3_subset, cc3_subset, max_distance = 2)
# rt_results <- run_radixtree(cc3_subset, cc3_subset, max_distance = 2)
# rf_results <- run_radixforest(cc3_subset, cc3_subset, max_distance = 2)
# pt_results <- run_prefixtree(cc3_subset, cc3_subset, max_distance = 2)

stopifnot(identical(sd_results, og_results))
# stopifnot(identical(sd_results, og_results))
# stopifnot(identical(sd_results, dt_results))
stopifnot(identical(sd_results, rt_results))
stopifnot(identical(sd_results, rf_results))
# stopifnot(identical(sd_results, rt_results))
# stopifnot(identical(sd_results, rf_results))
# stopifnot(identical(sd_results, pt_results))

################################################################################

grid <- expand.grid(nseqs = c(10000), maxdist = c(2,3), iter = 1:NITER, method = names(methods)) %>% sample_n(nrow(.))
grid <- filter(grid, nseqs <= 1000 | method %in% c("DNATree", "RadixTree", "RadixForest", "PrefixTree"))
grid <- expand.grid(nseqs = c(100,300,1000,3000,10000), maxdist = c(2,3), iter = 1:NITER, method = names(methods)) %>% sample_n(nrow(.))
# grid <- filter(grid, nseqs <= 1000 | method %in% c("DNATree", "RadixTree", "RadixForest", "PrefixTree"))
grid$time <- rep(0, nrow(grid))
for(i in 1:nrow(grid)) {
print(grid[i,])
Expand All @@ -138,7 +141,7 @@ for(i in 1:nrow(grid)) {
}
maxdist_results <- grid

grid <- expand.grid(nseqs = c(100,300,1000,3000,10000,30000), maxfrac = c(0.035,0.15), iter = 1:NITER, method = c("DNATree", "RadixTree", "RadixForest", "PrefixTree")) %>% sample_n(nrow(.))
grid <- expand.grid(nseqs = c(100,300,1000,3000,10000,30000), maxfrac = c(0.035,0.15), iter = 1:NITER, method = c("RadixTree", "RadixForest")) %>% sample_n(nrow(.))
grid$time <- rep(0, nrow(grid))
for(i in 1:nrow(grid)) {
print(grid[i,])
Expand All @@ -154,9 +157,13 @@ maxfrac_results <- grid
maxdist_results %>% group_by(nseqs, method, maxdist) %>% summarize(time = mean(time)) %>% as.data.frame %>% print
maxfrac_results %>% group_by(nseqs, method, maxfrac) %>% summarize(time = mean(time)) %>% as.data.frame %>% print

g <- ggplot(grid, aes(x = nseqs, y = time, color = method)) + geom_point() + geom_smooth(fill = NA) +
ggplot(maxfrac_results, aes(x = nseqs, y = time, color = method)) + geom_point() + geom_smooth(fill = NA) +
scale_x_log10() +
facet_wrap(~maxfrac, scales = "free") +
theme_bw(base_size = 16)

ggplot(maxdist_results, aes(x = nseqs, y = time, color = method)) + geom_point() + geom_smooth(fill = NA) +
scale_x_log10() +
# scale_y_log10() +
facet_wrap(~maxfrac) +
facet_wrap(~maxfrac, scales = "free") +
theme_bw(base_size = 16)
ggsave(g, file = "benchmark_plot.png", width = 6, height = 4)
# ggsave(g, file = "benchmark_plot.png", width = 6, height = 4)
Empty file modified inst/extra_tests/benchmark_plot.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
21 changes: 21 additions & 0 deletions inst/extra_tests/simple_benchmark.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
suppressPackageStartupMessages({
library(seqtrie)
library(dplyr)
})
data(covid_cdr3)
set.seed(314156)
NITER = 5
NT = 4

grid <- expand.grid(nseqs = c(30000), maxfrac = c(0.05), iter = 1:NITER, method = c("RadixForest")) %>% sample_n(nrow(.))
grid$time <- rep(0, nrow(grid))
for(i in 1:nrow(grid)) {
x <- sample(covid_cdr3, size = grid$nseqs[i])
time <- Sys.time()
r <- seqtrie::dist_search(x, x, max_fraction = grid$maxfrac[i], show_progres = FALSE, tree_class = grid$method[i], nthread=NT)
grid$time[i] <- as.numeric(Sys.time() - time, units = "secs")
rm(x, r)
gc(full=TRUE)
}

cat(mean(grid$time), "\n")
40 changes: 40 additions & 0 deletions inst/extra_tests/small_array_size_bench.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
library(dplyr)
library(data.table)
library(ggplot2)
library(patchwork)
library(this.path)
setwd(dirname(this.path())) # ...seqtrie/inst/extra_tests

array_sizes <- c(seq(0,96,by = 8),1000) %>% rep(each = 3) %>% sample
lapply(array_sizes, function(AS) {
Sys.setenv("SEQTRIE_SMALL_ARRAY_SIZE"=AS)
system("cd ../../ && make install")
res <- system2("/usr/bin/time", args=c("-v", "Rscript simple_benchmark.R"), stdout=T, stderr=T)
mem <- grep("Maximum resident set size", res, value=T) %>%
gsub(".+:", "", .) %>%
as.numeric
data.frame(array_size=AS, time = as.numeric(res[1]), mem_usage = mem)
}) %>% rbindlist -> results

results <- filter(results, array_size != 1000)

results2 <- arrange(results, array_size) %>%
group_by(array_size) %>%
summarize(mem_usage = mean(mem_usage), time = mean(time))

g1 <- ggplot(results, aes(x = array_size, y = time)) +
geom_point(pch=21, color = "chartreuse") +
geom_line(data=results2, color = "chartreuse") +
scale_x_continuous(breaks = array_sizes) +
theme_bw(base_size=14)

g2 <- ggplot(results, aes(x = array_size, y = mem_usage)) +
geom_point(pch=21, color = "darkorange") +
geom_line(data=results2, color = "darkorange") +
scale_x_continuous(breaks = array_sizes) +
theme_bw(base_size=14)

g1 + g2 + plot_layout(ncol=1)



Loading

0 comments on commit 203a0ac

Please sign in to comment.